# -*- coding: utf-8 -*-
import argparse
import csv
import os
import operator
import pystache
import re
import subprocess
import sys
import tempfile
import urllib
import ujson as json
import yaml
import HTMLParser

from collections import defaultdict, OrderedDict
from lxml import etree
from itertools import ifilter, chain

this_dir = os.path.realpath(os.path.dirname(__file__))

sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))

from address_normalizer.text.tokenize import *
from geodata.language_id.disambiguation import street_types_gazetteer, disambiguate_language, WELL_REPRESENTED_LANGUAGES, UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE
from geodata.language_id.polygon_lookup import country_and_languages
from geodata.i18n.languages import *
from geodata.polygons.language_polys import *
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.csv_utils import *
from geodata.file_utils import *

FORMATTER_GIT_REPO = 'https://github.com/openvenues/address-formatting'

WAY_OFFSET = 10 ** 15
RELATION_OFFSET = 2 * 10 ** 15
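# These offsets appear to follow osmconvert's --all-to-nodes convention:
# a way keeps its original ID plus 10 ** 15, a relation its ID plus
# 2 * 10 ** 15, which lets parse_osm recover the element type below.
# For example, way 123 shows up as node 1000000000000123:
#
#   >>> WAY_OFFSET + 123
#   1000000000000123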

# Input files
PLANET_ADDRESSES_INPUT_FILE = 'planet-addresses.osm'
PLANET_WAYS_INPUT_FILE = 'planet-ways.osm'
PLANET_VENUES_INPUT_FILE = 'planet-venues.osm'
PLANET_BORDERS_INPUT_FILE = 'planet-borders.osm'

ALL_OSM_TAGS = set(['node', 'way', 'relation'])
WAYS_RELATIONS = set(['way', 'relation'])

# Output files
WAYS_LANGUAGE_DATA_FILENAME = 'streets_by_language.tsv'
ADDRESS_LANGUAGE_DATA_FILENAME = 'address_streets_by_language.tsv'
ADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'formatted_addresses_tagged.tsv'
ADDRESS_FORMAT_DATA_FILENAME = 'formatted_addresses.tsv'
ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv'
TOPONYM_LANGUAGE_DATA_FILENAME = 'toponyms_by_language.tsv'


class OSMField(object):
    def __init__(self, name, c_constant, alternates=None):
        self.name = name
        self.c_constant = c_constant
        self.alternates = alternates


osm_fields = [
    # Field if alternate_names present, default field name if not, C header constant
    OSMField('addr:housename', 'OSM_HOUSE_NAME'),
    OSMField('addr:housenumber', 'OSM_HOUSE_NUMBER'),
    OSMField('addr:block', 'OSM_BLOCK'),
    OSMField('addr:street', 'OSM_STREET_ADDRESS'),
    OSMField('addr:place', 'OSM_PLACE'),
    OSMField('addr:city', 'OSM_CITY', alternates=['addr:locality', 'addr:municipality', 'addr:hamlet']),
    OSMField('addr:suburb', 'OSM_SUBURB'),
    OSMField('addr:neighborhood', 'OSM_NEIGHBORHOOD', alternates=['addr:neighbourhood']),
    OSMField('addr:district', 'OSM_DISTRICT'),
    OSMField('addr:subdistrict', 'OSM_SUBDISTRICT'),
    OSMField('addr:ward', 'OSM_WARD'),
    OSMField('addr:state', 'OSM_STATE'),
    OSMField('addr:province', 'OSM_PROVINCE'),
    OSMField('addr:postcode', 'OSM_POSTAL_CODE', alternates=['addr:postal_code']),
    OSMField('addr:country', 'OSM_COUNTRY'),
]


# Currently, all our data sets are converted to nodes with osmconvert before parsing
def parse_osm(filename, allowed_types=ALL_OSM_TAGS):
    f = open(filename)
    parser = etree.iterparse(f)

    single_type = len(allowed_types) == 1

    for (_, elem) in parser:
        elem_id = long(elem.attrib.pop('id', 0))
        item_type = elem.tag
        if elem_id >= WAY_OFFSET and elem_id < RELATION_OFFSET:
            elem_id -= WAY_OFFSET
            item_type = 'way'
        elif elem_id >= RELATION_OFFSET:
            elem_id -= RELATION_OFFSET
            item_type = 'relation'

        if item_type in allowed_types:
            attrs = OrderedDict(elem.attrib)
            attrs.update(OrderedDict([(e.attrib['k'], e.attrib['v'])
                                      for e in elem.getchildren() if e.tag == 'tag']))
            key = elem_id if single_type else '{}:{}'.format(item_type, elem_id)
            yield key, attrs

        # Free memory as we stream: clear each processed element and drop
        # already-seen siblings so the planet file never lives in RAM at once
        if elem.tag != 'tag':
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
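# Illustrative usage of parse_osm (the file path is hypothetical):
#
#   for key, attrs in parse_osm('planet-addresses.osm', allowed_types=set(['node'])):
#       street = attrs.get('addr:street')
#
# Each yielded pair is (element id, OrderedDict of XML attributes plus tag
# key/values); when more than one element type is allowed, the key is
# prefixed with the type, e.g. 'way:123'.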


def write_osm_json(filename, out_filename):
    out = open(out_filename, 'w')
    writer = csv.writer(out, 'tsv_no_quote')
    for key, attrs in parse_osm(filename):
        writer.writerow((key, json.dumps(attrs)))
    out.close()


def read_osm_json(filename):
    reader = csv.reader(open(filename), delimiter='\t')
    for key, attrs in reader:
        yield key, json.loads(attrs)
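# write_osm_json and read_osm_json are a round trip: each element is dumped
# as "<key>\t<JSON dict>" and parsed back the same way. A sketch, with
# hypothetical paths:
#
#   write_osm_json('planet-venues.osm', 'venues.tsv')
#   for key, attrs in read_osm_json('venues.tsv'):
#       name = attrs.get('name')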


class AddressFormatter(object):
    ''' Approximate Python port of lokku's Geo::Address::Formatter '''

    MINIMAL_COMPONENT_KEYS = [
        ('road', 'house_number'),
        ('road', 'house'),
        ('road', 'postcode')
    ]

    whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')

    splitter = ' | '

    aliases = OrderedDict([
        ('name', 'house'),
        ('addr:housename', 'house'),
        ('addr:housenumber', 'house_number'),
        ('addr:street', 'road'),
        ('addr:city', 'city'),
        ('addr:locality', 'city'),
        ('addr:municipality', 'city'),
        ('addr:hamlet', 'village'),
        ('addr:suburb', 'suburb'),
        ('addr:neighbourhood', 'suburb'),
        ('addr:neighborhood', 'suburb'),
        ('addr:district', 'suburb'),
        ('addr:state', 'state'),
        ('addr:province', 'state'),
        ('addr:region', 'state'),
        ('addr:postal_code', 'postcode'),
        ('addr:postcode', 'postcode'),
        ('addr:country', 'country'),
        ('street', 'road'),
        ('street_name', 'road'),
        ('residential', 'road'),
        ('hamlet', 'village'),
        ('neighborhood', 'suburb'),
        ('neighbourhood', 'suburb'),
        ('city_district', 'suburb'),
        ('state_code', 'state'),
        ('country_name', 'country'),
    ])

    def __init__(self, scratch_dir='/tmp', splitter=None):
        if splitter is not None:
            self.splitter = splitter

        self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
        self.clone_repo()
        self.load_config()

    def clone_repo(self):
        subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
        subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])

    def load_config(self):
        self.config = yaml.load(open(os.path.join(self.formatter_repo_path,
                                                  'conf/countries/worldwide.yaml')))

    def component_aliases(self):
        # Rebuild the alias map: start from the class-level OSM aliases above,
        # then add the aliases defined in the formatting repo's components.yaml
        self.aliases = OrderedDict(AddressFormatter.aliases)
        components = yaml.load_all(open(os.path.join(self.formatter_repo_path,
                                                     'conf', 'components.yaml')))
        for c in components:
            name = c['name']
            for a in c.get('aliases', []):
                self.aliases[a] = name

    def replace_aliases(self, components):
        for k in components.keys():
            new_key = self.aliases.get(k)
            if new_key and new_key not in components:
                components[new_key] = components.pop(k)
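    # Example of replace_aliases (values here are made up): given
    #
    #   components = {'addr:street': u'Main St', 'addr:housenumber': u'123'}
    #
    # the dict is mutated in place to
    #
    #   {'road': u'Main St', 'house_number': u'123'}
    #
    # An aliased key is only moved when its canonical key is absent, so
    # existing canonical values are never overwritten.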

    def country_template(self, c):
        return self.config.get(c, self.config['default'])

    def render_template(self, template, components, tagged=False):
        def render_first(text):
            text = pystache.render(text, **components)
            splits = (e.strip() for e in text.split('||'))
            selected = next(ifilter(bool, splits), '')
            return selected

        output = pystache.render(template, first=render_first,
                                 **components).strip()

        values = self.whitespace_component_regex.split(output)

        output = self.splitter.join([
            self.strip_component(val, tagged=tagged)
            for val in values
        ])

        return output
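    # The "first" lambda follows the address-formatting templates' convention
    # of separating alternatives with "||" and keeping the first non-empty
    # one. For example, a template fragment like
    #
    #   {{#first}}{{{city}}} || {{{town}}} || {{{village}}}{{/first}}
    #
    # renders the city if present, otherwise the town, otherwise the village.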

    def minimal_components(self, components):
        for component_list in self.MINIMAL_COMPONENT_KEYS:
            if all((c in components for c in component_list)):
                return True
        return False
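    # Illustrative inputs: {'road': u'Main St', 'house_number': u'123'}
    # satisfies the ('road', 'house_number') rule above, so this returns True;
    # {'city': u'Berlin'} alone matches no rule and returns False.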

    def apply_replacements(self, template, components):
        if not template.get('replace'):
            return
        for key in components.keys():
            value = components[key]
            for regex, replacement in template['replace']:
                value = re.sub(regex, replacement, value)
            components[key] = value

    def post_replacements(self, template, text):
        components = []
        seen = set()
        for component in text.split(self.splitter):
            component = component.strip()
            if component not in seen:
                components.append(component)
                seen.add(component)
        text = self.splitter.join(components)
        post_format_replacements = template.get('postformat_replace')
        if post_format_replacements:
            for regex, replacement in post_format_replacements:
                text = re.sub(regex, replacement, text)
        return text

    def strip_component(self, value, tagged=False):
        if not tagged:
            tokens = tokenize(value)
            if not tokens:
                return u''
            for i, (c, t) in enumerate(tokens):
                if c.value < token_types.PERIOD.value:
                    break

            for j, (c, t) in enumerate(reversed(tokens)):
                if c.value < token_types.PERIOD.value:
                    break
            tokens = [t for c, t in tokens]
        else:
            tokens = value.split()
            if not tokens:
                return u''
            for i, t in enumerate(tokens):
                if '/' in t:
                    break

            for j, t in enumerate(reversed(tokens)):
                if '/' in t:
                    break
        if j == 0:
            j = None
        else:
            j = -j
        return u' '.join(tokens[i:j])
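    # strip_component trims leading and trailing separator tokens. In the
    # tagged case, tokens carry "value/label" suffixes and anything without a
    # '/' at either end is stripped, e.g. (made-up input):
    #
    #   strip_component(u', Main/road St/road ,', tagged=True)
    #   # -> u'Main/road St/road'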

    def format_address(self, country, components, minimal_only=True, tag_components=True):
        template = self.config.get(country.upper())
        if not template:
            return None
        template_text = template['address_template']
        self.replace_aliases(components)

        if not self.minimal_components(components):
            if minimal_only:
                return None
            if 'fallback_template' in template:
                template_text = template['fallback_template']
            else:
                template_text = self.config['default']['fallback_template']

        self.apply_replacements(template, components)

        if tag_components:
            components = {k: u' '.join([u'{}/{}'.format(t, k.replace(' ', '_'))
                                        for c, t in tokenize(v)])
                          for k, v in components.iteritems()}
        else:
            components = {k: u' '.join([t for c, t in tokenize(v)])
                          for k, v in components.iteritems()}

        text = self.render_template(template_text, components, tagged=tag_components)

        text = self.post_replacements(template, text)
        return text
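    # Illustrative call (components are made up; the country code selects a
    # template from the address-formatting repo's worldwide.yaml):
    #
    #   formatter = AddressFormatter()
    #   formatter.format_address('us', {'addr:street': u'Main St',
    #                                   'addr:housenumber': u'123'},
    #                            tag_components=False)
    #
    # Returns None if the country has no template, or if the components fail
    # the MINIMAL_COMPONENT_KEYS check while minimal_only=True.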


def normalize_osm_name_tag(tag, script=False):
    norm = tag.rsplit(':', 1)[-1]
    if not script:
        return norm
    return norm.split('_', 1)[0]
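# Examples:
#   normalize_osm_name_tag('name:en') == 'en'
#   normalize_osm_name_tag('name:zh_pinyin', script=True) == 'zh'
# With script=True, the script/variant suffix after '_' is dropped as well.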


beginning_re = re.compile('^[^0-9\-]+', re.UNICODE)
end_re = re.compile('[^0-9]+$', re.UNICODE)

latitude_dms_regex = re.compile(ur'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$', re.I | re.UNICODE)
longitude_dms_regex = re.compile(ur'^(-?1[0-8][0-9]|0?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$', re.I | re.UNICODE)

latitude_decimal_with_direction_regex = re.compile('^(-?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I)
longitude_decimal_with_direction_regex = re.compile('^(-?1[0-8][0-9]|0?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(E|e|W|w)$', re.I)


def latlon_to_floats(latitude, longitude):
    have_lat = False
    have_lon = False

    latitude = safe_decode(latitude).strip(u' ,;|')
    longitude = safe_decode(longitude).strip(u' ,;|')

    latitude = latitude.replace(u',', u'.')
    longitude = longitude.replace(u',', u'.')

    lat_dms = latitude_dms_regex.match(latitude)
    lat_dir = latitude_decimal_with_direction_regex.match(latitude)

    if lat_dms:
        d, m, s, c = lat_dms.groups()
        # Southern latitudes are negative
        sign = direction_sign(c)
        latitude = degrees_to_decimal(d or 0, m or 0, s or 0) * sign
        have_lat = True
    elif lat_dir:
        d, c = lat_dir.groups()
        sign = direction_sign(c)
        latitude = float(d) * sign
        have_lat = True
    else:
        latitude = re.sub(beginning_re, u'', latitude)
        latitude = re.sub(end_re, u'', latitude)

    lon_dms = longitude_dms_regex.match(longitude)
    lon_dir = longitude_decimal_with_direction_regex.match(longitude)

    if lon_dms:
        d, m, s, c = lon_dms.groups()
        # Western longitudes are negative
        sign = direction_sign(c)
        longitude = degrees_to_decimal(d or 0, m or 0, s or 0) * sign
        have_lon = True
    elif lon_dir:
        d, c = lon_dir.groups()
        sign = direction_sign(c)
        longitude = float(d) * sign
        have_lon = True
    else:
        longitude = re.sub(beginning_re, u'', longitude)
        longitude = re.sub(end_re, u'', longitude)

    return float(latitude), float(longitude)
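# Illustrative inputs (direction_sign and degrees_to_decimal are assumed to
# come from the geodata/address_normalizer star imports above):
#
#   latlon_to_floats(u'41.8919', u'12.5113')       # -> (41.8919, 12.5113)
#   latlon_to_floats(u'41,8919', u'12,5113')       # comma decimal separators
#   latlon_to_floats(u'41 53 31 N', u'12 29 39 E') # DMS, converted to decimal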


def get_language_names(language_rtree, key, value, tag_prefix='name'):
    if not ('lat' in value and 'lon' in value):
        return None, None

    has_colon = ':' in tag_prefix
    tag_first_component = tag_prefix.split(':')[0]
    tag_last_component = tag_prefix.split(':')[-1]

    try:
        latitude, longitude = latlon_to_floats(value['lat'], value['lon'])
    except Exception:
        return None, None

    country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
    if not (country and candidate_languages):
        return None, None

    num_langs = len(candidate_languages)
    default_langs = set([l['lang'] for l in candidate_languages if l.get('default')])
    num_defaults = len(default_langs)
    name_language = defaultdict(list)

    alternate_langs = []

    equivalent_alternatives = defaultdict(list)
    for k, v in value.iteritems():
        if k.startswith(tag_prefix + ':') and normalize_osm_name_tag(k, script=True) in languages:
            lang = k.rsplit(':', 1)[-1]
            alternate_langs.append((lang, v))
            equivalent_alternatives[v].append(lang)

    has_alternate_names = len(alternate_langs)
    # Some countries like Lebanon list things like name:en == name:fr == "Rue Abdel Hamid Karame"
    # Those names should be disambiguated rather than taken at face value
    ambiguous_alternatives = set([k for k, v in equivalent_alternatives.iteritems() if len(v) > 1])

    regional_defaults = 0
    country_defaults = 0
    regional_langs = set()
    country_langs = set()
    for p in language_props:
        if p['admin_level'] > 0:
            regional_defaults += sum((1 for lang in p['languages'] if lang.get('default')))
            regional_langs |= set([l['lang'] for l in p['languages']])
        else:
            country_defaults += sum((1 for lang in p['languages'] if lang.get('default')))
            country_langs |= set([l['lang'] for l in p['languages']])

    ambiguous_already_seen = set()

    for k, v in value.iteritems():
        if k.startswith(tag_prefix + ':'):
            if v not in ambiguous_alternatives:
                norm = normalize_osm_name_tag(k)
                norm_sans_script = normalize_osm_name_tag(k, script=True)
                if norm in languages or norm_sans_script in languages:
                    name_language[norm].append(v)
            elif v not in ambiguous_already_seen:
                langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
                lang = disambiguate_language(v, langs)

                if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
                    name_language[lang].append(v)

                ambiguous_already_seen.add(v)
        elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component:
            if num_langs == 1:
                name_language[candidate_languages[0]['lang']].append(v)
            else:
                lang = disambiguate_language(v, [(l['lang'], l['default']) for l in candidate_languages])
                default_lang = candidate_languages[0]['lang']

                if lang == AMBIGUOUS_LANGUAGE:
                    return None, None
                elif lang == UNKNOWN_LANGUAGE and num_defaults == 1:
                    name_language[default_lang].append(v)
                elif lang != UNKNOWN_LANGUAGE:
                    if lang != default_lang and lang in country_langs and country_defaults > 1 and regional_defaults > 0 and lang in WELL_REPRESENTED_LANGUAGES:
                        return None, None
                    name_language[lang].append(v)
                else:
                    return None, None

    return country, name_language
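# Return shape: (ISO country code, defaultdict mapping language code -> list
# of name strings), or (None, None) when the point can't be resolved or the
# names are too ambiguous to trust. A made-up example:
#
#   country, names = get_language_names(language_rtree, key, value)
#   # country == 'fr', names == {'fr': [u'Rue de Rivoli']}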


def build_ways_training_data(language_rtree, infile, out_dir):
    i = 0
    f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value in parse_osm(infile, allowed_types=WAYS_RELATIONS):
        country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name')
        if not name_language:
            continue

        for k, v in name_language.iteritems():
            for s in v:
                if k in languages:
                    writer.writerow((k, country, tsv_string(s)))
            if i % 1000 == 0 and i > 0:
                print 'did', i, 'ways'
            i += 1
    f.close()


def build_address_format_training_data(language_rtree, infile, out_dir):
    i = 0

    formatter = AddressFormatter()

    formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w')
    formatted_writer = csv.writer(formatted_file, 'tsv_no_quote')

    formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w')
    formatted_tagged_writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')

    for key, value in parse_osm(infile):
        try:
            latitude, longitude = latlon_to_floats(value['lat'], value['lon'])
        except Exception:
            continue

        country, default_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
        if not (country and default_languages):
            continue

        formatted_address_tagged = formatter.format_address(country, value)
        formatted_address_untagged = formatter.format_address(country, value, tag_components=False)
        if formatted_address_tagged is not None:
            formatted_address_tagged = tsv_string(formatted_address_tagged)
            formatted_tagged_writer.writerow((default_languages[0]['lang'], country, formatted_address_tagged))

        if formatted_address_untagged is not None:
            formatted_address_untagged = tsv_string(formatted_address_untagged)
            formatted_writer.writerow((default_languages[0]['lang'], country, formatted_address_untagged))

        if formatted_address_tagged is not None or formatted_address_untagged is not None:
            i += 1
            if i % 1000 == 0 and i > 0:
                print 'did', i, 'formatted addresses'

    formatted_file.close()
    formatted_tagged_file.close()


NAME_KEYS = (
    'name',
    'addr:housename',
)
COUNTRY_KEYS = (
    'country',
    'country_name',
    'addr:country',
)
POSTAL_KEYS = (
    'postcode',
    'postal_code',
    'addr:postcode',
    'addr:postal_code',
)


def build_address_format_training_data_limited(language_rtree, infile, out_dir):
    i = 0

    formatter = AddressFormatter()

    f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    remove_keys = NAME_KEYS + COUNTRY_KEYS + POSTAL_KEYS

    for key, value in parse_osm(infile):
        try:
            latitude, longitude = latlon_to_floats(value['lat'], value['lon'])
        except Exception:
            continue

        for k in remove_keys:
            _ = value.pop(k, None)

        if not value:
            continue

        country, name_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street')
        if not name_language:
            continue

        single_language = len(name_language) == 1
        for lang, val in name_language.iteritems():
            if lang not in languages:
                continue

            address_dict = value.copy()
            for k in address_dict.keys():
                namespaced_val = u'{}:{}'.format(k, lang)
                if namespaced_val in address_dict:
                    address_dict[k] = address_dict[namespaced_val]
                elif not single_language:
                    address_dict.pop(k)

            if not address_dict:
                continue

            formatted_address_untagged = formatter.format_address(country, address_dict, tag_components=False)
            if formatted_address_untagged is not None:
                formatted_address_untagged = tsv_string(formatted_address_untagged)

                writer.writerow((lang, country, formatted_address_untagged))

                i += 1
                if i % 1000 == 0 and i > 0:
                    print 'did', i, 'formatted addresses'

    f.close()


apposition_regex = re.compile('(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)

html_parser = HTMLParser.HTMLParser()


def normalize_wikipedia_title(title):
    match = apposition_regex.match(title)
    if match:
        title = match.group(1)

    title = safe_decode(title)
    title = html_parser.unescape(title)
    title = urllib.unquote_plus(title)

    return title.replace(u'_', u' ').strip()
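# Example: normalize_wikipedia_title(u'Rome_(city)') == u'Rome'
# The trailing parenthetical is dropped by apposition_regex, HTML entities
# are unescaped, URL-encoding is undone, and underscores become spaces.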


def build_toponym_training_data(language_rtree, infile, out_dir):
    i = 0
    f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value in parse_osm(infile):
        if not sum((1 for k, v in value.iteritems() if k.startswith('name:'))) > 0:
            continue

        try:
            latitude, longitude = latlon_to_floats(value['lat'], value['lon'])
        except Exception:
            continue

        country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
        if not (country and candidate_languages):
            continue

        name_language = defaultdict(list)

        official = official_languages[country]

        num_langs = len(candidate_languages)
        default_langs = set([l for l, default in official.iteritems() if default])
        num_defaults = len(default_langs)

        top_lang = None
        if len(official) > 0:
            top_lang = official.iterkeys().next()

        if top_lang is not None and top_lang not in WELL_REPRESENTED_LANGUAGES and len(default_langs) > 1:
            default_langs -= WELL_REPRESENTED_LANGUAGES
        elif len(default_langs & WELL_REPRESENTED_LANGUAGES) > 1:
            continue

        valid_languages = (set([l['lang'] for l in candidate_languages]) - WELL_REPRESENTED_LANGUAGES) | default_langs

        if not valid_languages:
            continue

        for k, v in value.iteritems():
            if not k.startswith('name:'):
                continue

            norm = normalize_osm_name_tag(k)
            norm_sans_script = normalize_osm_name_tag(k, script=True)

            if norm in languages:
                lang = norm
            elif norm_sans_script in languages:
                lang = norm_sans_script
            else:
                continue

            if lang in valid_languages:
                have_alternate_names = True
                name_language[lang].append(v)

        for k, v in name_language.iteritems():
            for s in v:
                s = s.strip()
                if not s:
                    continue
                writer.writerow((k, country, tsv_string(s)))
            if i % 1000 == 0 and i > 0:
                print 'did', i, 'toponyms'
            i += 1

    f.close()


def build_address_training_data(language_rtree, infile, out_dir, format=False):
    i = 0
    f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value in parse_osm(infile):
        country, street_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street')
        if not street_language:
            continue

        for k, v in street_language.iteritems():
            for s in v:
                s = s.strip()
                if not s:
                    continue
                if k in languages:
                    writer.writerow((k, country, tsv_string(s)))
            if i % 1000 == 0 and i > 0:
                print 'did', i, 'streets'
            i += 1

    f.close()


VENUE_LANGUAGE_DATA_FILENAME = 'names_by_language.tsv'


def build_venue_training_data(language_rtree, infile, out_dir):
    i = 0

    f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value in parse_osm(infile):
        country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name')
        if not name_language:
            continue

        venue_type = None
        for tag_key in (u'amenity', u'building'):
            amenity = value.get(tag_key, u'').strip()
            # Bare building=yes / building=y tags carry no venue type
            if amenity in ('yes', 'y'):
                continue

            if amenity:
                venue_type = u':'.join([tag_key, amenity])
                break

        if venue_type is None:
            continue

        for k, v in name_language.iteritems():
            for s in v:
                s = s.strip()
                if k in languages:
                    writer.writerow((k, country, safe_encode(venue_type), tsv_string(s)))
            if i % 1000 == 0 and i > 0:
                print 'did', i, 'venues'
            i += 1

    f.close()


if __name__ == '__main__':
    # Handle argument parsing here
    parser = argparse.ArgumentParser()

    parser.add_argument('-s', '--streets-file',
                        help='Path to planet-ways.osm')

    parser.add_argument('-a', '--address-file',
                        help='Path to planet-addresses.osm')

    parser.add_argument('-v', '--venues-file',
                        help='Path to planet-venues.osm')

    parser.add_argument('-b', '--borders-file',
                        help='Path to planet-borders.osm')

    parser.add_argument('-f', '--format-only',
                        action='store_true',
                        default=False,
                        help='Save formatted addresses (slow)')

    parser.add_argument('-l', '--limited-addresses',
                        action='store_true',
                        default=False,
                        help='Save formatted addresses without house names or country (slow)')

    parser.add_argument('-t', '--temp-dir',
                        default=tempfile.gettempdir(),
                        help='Temp directory to use')

    parser.add_argument('-r', '--rtree-dir',
                        required=True,
                        help='Language RTree directory')

    parser.add_argument('-o', '--out-dir',
                        default=os.getcwd(),
                        help='Output directory')

    args = parser.parse_args()

    init_languages()

    language_rtree = LanguagePolygonIndex.load(args.rtree_dir)

    street_types_gazetteer.configure()

    # These builds are independent of one another and could be parallelized
    if args.streets_file:
        build_ways_training_data(language_rtree, args.streets_file, args.out_dir)
    if args.borders_file:
        build_toponym_training_data(language_rtree, args.borders_file, args.out_dir)
    if args.address_file and not args.format_only and not args.limited_addresses:
        build_address_training_data(language_rtree, args.address_file, args.out_dir)
    if args.address_file and args.format_only:
        build_address_format_training_data(language_rtree, args.address_file, args.out_dir)
    if args.address_file and args.limited_addresses:
        build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir)
    if args.venues_file:
        build_venue_training_data(language_rtree, args.venues_file, args.out_dir)
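# Example invocation (script name and paths are hypothetical):
#
#   python osm_training_data.py -a planet-addresses.osm \
#       -r /data/language_rtree -o /data/training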