diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index ce12df21..6f09d4c5 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -131,6 +131,16 @@ def num_deps(c): return len(c.dependencies) +RANDOM_VALUE_REPLACEMENTS = { + # Key: address component + AddressFormatter.COUNTRY: { + # value: (replacement, probability) + 'GB': ('UK', 0.3), + 'United Kingdom': ('UK', 0.3), + } +} + + OSM_ADDRESS_COMPONENTS_SORTED = sorted(OSM_ADDRESS_COMPONENTS, key=num_deps) OSM_ADDRESS_COMPONENT_COMBINATIONS = [] @@ -309,6 +319,155 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): return country, name_language +ALL_LANGUAGES = 'all' + +LOWER, UPPER, TITLE, MIXED = range(4) + + +def token_capitalization(s): + if s.istitle(): + return TITLE + elif s.islower(): + return LOWER + elif s.isupper(): + return UPPER + else: + return MIXED + + +def recase_abbreviation(expansion, tokens): + expansion_tokens = expansion.split() + if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)): + return expansion.upper() + elif len(tokens) == len(expansion_tokens): + strings = [] + for (t, c), e in zip(tokens, expansion_tokens): + cap = token_capitalization(t) + if cap == LOWER: + strings.append(e.lower()) + elif cap == UPPER: + strings.append(e.upper()) + elif cap == TITLE: + strings.append(e.title()) + elif t.lower() == e.lower(): + strings.append(t) + else: + strings.append(e.title()) + return u' '.join(strings) + else: + return u' '.join([t.title() for t in expansion_tokens]) + + +def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2): + ''' + Abbreviations + ------------- + + OSM discourages abbreviations, but to make our training data map better + to real-world input, we can safely replace the canonical phrase with an + abbreviated version and retain the meaning of the words + 
''' + tokens = tokenize(s) + norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens] + + abbreviated = [] + + i = 0 + + for t, c, length, data in gazetteer.filter(norm_tokens): + if c is PHRASE: + valid = [] + data = [d.split('|') for d in data] + + added = False + + for lang, dictionary, is_canonical, canonical in data: + if lang not in (language, 'all'): + continue + + is_canonical = int(is_canonical) + is_stopword = dictionary == 'stopword' + is_prefix = dictionary.startswith('concatenated_prefixes') + is_suffix = dictionary.startswith('concatenated_suffixes') + is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length + + suffix = None + prefix = None + + if not is_canonical or random.random() > abbreviate_prob: + continue + + if not is_prefix and not is_suffix: + abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary)) + token = random.choice(abbreviations) if abbreviations else canonical + token = recase_abbreviation(token, tokens[i:i + len(t)]) + abbreviated.append(token) + if t[-1][1] != token_types.IDEOGRAPHIC_CHAR: + abbreviated.append(u' ') + added = True + elif is_prefix: + token = tokens[i][0] + prefix, token = token[:length], token[length:] + abbreviated.append(prefix) + if random.random() < separate_prob: + abbreviated.append(u' ') + if token.islower(): + abbreviated.append(token.title()) + else: + abbreviated.append(token) + abbreviated.append(u' ') + added = True + elif is_suffix: + token = tokens[i][0] + + token, suffix = token[:-length], token[-length:] + + concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), []) + + separated_abbreviations = [] + phrase = gazetteer.trie.get(suffix.rstrip('.')) + suffix_data = [safe_decode(d).split(u'|') for d in (phrase or [])] + for l, d, _, c in suffix_data: + if l == lang and c == canonical: + separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d))) + + 
separate = random.random() < separate_prob + + if concatenated_abbreviations and not separate: + abbreviation = random.choice(concatenated_abbreviations) + elif separated_abbreviations: + abbreviation = random.choice(separated_abbreviations) + else: + abbreviation = canonical + + abbreviated.append(token) + if separate: + abbreviated.append(u' ') + if suffix.isupper(): + abbreviated.append(abbreviation.upper()) + elif separate: + abbreviated.append(abbreviation.title()) + else: + abbreviated.append(abbreviation) + abbreviated.append(u' ') + added = True + + if not added: + for j, (t_i, c_i) in enumerate(t): + abbreviated.append(tokens[i + j][0]) + if c_i != token_types.IDEOGRAPHIC_CHAR: + abbreviated.append(u' ') + i += len(t) + + else: + abbreviated.append(tokens[i][0]) + if (c != token_types.IDEOGRAPHIC_CHAR): + abbreviated.append(u' ') + i += 1 + + return u''.join(abbreviated).strip() + + def build_ways_training_data(language_rtree, infile, out_dir): ''' Creates a training set for language classification using most OSM ways @@ -334,7 +493,7 @@ def build_ways_training_data(language_rtree, infile, out_dir): if k in languages: writer.writerow((k, country, tsv_string(s))) if i % 1000 == 0 and i > 0: - print 'did', i, 'ways' + print('did {} ways'.format(i)) i += 1 f.close() @@ -362,124 +521,111 @@ def osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude): return ret -def osm_pick_random_name_key(suffix=''): - name_key = ''.join(('name', suffix)) - raw_name_key = 'name' - short_name_key = ''.join(('short_name', suffix)) - raw_short_name_key = 'short_name' - alt_name_key = ''.join(('alt_name', suffix)) - raw_alt_name_key = 'alt_name' - official_name_key = ''.join(('official_name', suffix)) - raw_official_name_key = 'official_name' - - # Choose which name to use with given probabilities - r = random.random() - if r < 0.7: - # 70% of the time use the name tag - key = name_key - raw_key = raw_name_key - elif r < 0.8: - # 10% of the time use the short 
name - key = short_name_key - raw_key = raw_short_name_key - elif r < 0.9: - # 10% of the time use the official name - key = official_name_key - raw_key = raw_official_name_key - else: - # 10% of the time use the official name - key = alt_name_key - raw_key = raw_alt_name_key - - return key, raw_key - - -def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, infile, out_dir, tag_components=True): - ''' - Creates formatted address training data for supervised sequence labeling (or potentially - for unsupervised learning e.g. for word vectors) using addr:* tags in OSM. - - Example: - - cs cz Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country - - The field structure is similar to other training data created by this script i.e. - {language, country, data}. The data field here is a sequence of labeled tokens similar - to what we might see in part-of-speech tagging. - - - This format uses a special character "|" to denote possible breaks in the input (comma, newline). - - Note that for the address parser, we'd like it to be robust to many different types - of input, so we may selectively eleminate components - - This information can potentially be used downstream by the sequence model as these - breaks may be present at prediction time. - - Example: - - sr rs Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic - - This may be useful in learning word representations, statistical phrases, morphology - or other models requiring only the sequence of words. 
- ''' - i = 0 - - formatter = AddressFormatter() - osm_address_components.configure() - - if tag_components: - formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w') - writer = csv.writer(formatted_tagged_file, 'tsv_no_quote') - else: - formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w') - writer = csv.writer(formatted_file, 'tsv_no_quote') - - remove_keys = OSM_IGNORE_KEYS - +class OSMAddressFormatter(object): alpha3_codes = {c.alpha2: c.alpha3 for c in pycountry.countries} - for node_id, value, deps in parse_osm(infile): - try: - latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) - except Exception: - continue - - country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude) - if not (country and candidate_languages): - continue - - for key in remove_keys: - _ = value.pop(key, None) + def __init__(self, admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, splitter=None): + self.admin_rtree = admin_rtree + self.language_rtree = language_rtree + self.neighborhoods_rtree = neighborhoods_rtree + self.quattroshapes_rtree = quattroshapes_rtree + self.geonames = geonames + self.formatter = AddressFormatter(splitter=splitter) + osm_address_components.configure() + def pick_language(self, value, candidate_languages, pick_namespaced_language_prob=0.6): language = None - more_than_one_official_language = len(candidate_languages) > 1 + if len(candidate_languages) == 1: + language = candidate_languages[0]['lang'] + else: + street = value.get('addr:street', None) - if tag_components: - if len(candidate_languages) == 1: - language = candidate_languages[0]['lang'] + namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value] + + if street is not None and not namespaced: + language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages]) + elif 
namespaced and random.random() < pick_namespaced_language_prob: + language = random.choice(namespaced) + lang_suffix = ':{}'.format(language) + for k in value: + if k.startswith('addr:') and k.endswith(lang_suffix): + value[k.rstrip(lang_suffix)] = value[k] else: - street = value.get('addr:street', None) + language = UNKNOWN_LANGUAGE - namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value] + return language - if street is not None and not namespaced: - language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages]) - elif namespaced and random.random() < 0.6: - language = random.choice(namespaced) - lang_suffix = ':{}'.format(language) - for k in value: - if k.startswith('addr:') and k.endswith(lang_suffix): - value[k.rstrip(lang_suffix)] = value[k] - else: - language = UNKNOWN_LANGUAGE + def pick_random_name_key(self, suffix=''): + name_key = ''.join(('name', suffix)) + raw_name_key = 'name' + short_name_key = ''.join(('short_name', suffix)) + raw_short_name_key = 'short_name' + alt_name_key = ''.join(('alt_name', suffix)) + raw_alt_name_key = 'alt_name' + official_name_key = ''.join(('official_name', suffix)) + raw_official_name_key = 'official_name' - address_components = {k: v for k, v in value.iteritems() if k in formatter.aliases} - formatter.replace_aliases(address_components) + # Choose which name to use with given probabilities + r = random.random() + if r < 0.7: + # 70% of the time use the name tag + key = name_key + raw_key = raw_name_key + elif r < 0.8: + # 10% of the time use the short name + key = short_name_key + raw_key = raw_short_name_key + elif r < 0.9: + # 10% of the time use the official name + key = official_name_key + raw_key = raw_official_name_key + else: + # 10% of the time use the official name + key = alt_name_key + raw_key = raw_alt_name_key - address_country = address_components.get(AddressFormatter.COUNTRY) + return key, raw_key + def 
normalize_address_components(self, value): + address_components = {k: v for k, v in value.iteritems() if k in self.formatter.aliases} + self.formatter.replace_aliases(address_components) + return address_components + + def abbreviated_street(self, street, language, abbreviate_prob=0.3, separate_prob=0.2): + ''' + Street abbreviations + -------------------- + + Use street and unit type dictionaries to probabilistically abbreviate + phrases. Because the abbreviation is picked at random, this should + help bridge the gap between OSM addresses and user input, in addition + to capturing some non-standard abbreviations/surface forms which may be + missing or sparse in OSM. + ''' + return osm_abbreviate(street_and_unit_types_gazetteer, street, language, + abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) + + def abbreviated_venue_name(self, name, language, abbreviate_prob=0.2, separate_prob=0.0): + ''' + Venue abbreviations + ------------------- + + Use street and unit type dictionaries to probabilistically abbreviate + phrases. Because the abbreviation is picked at random, this should + help bridge the gap between OSM addresses and user input, in addition + to capturing some non-standard abbreviations/surface forms which may be + missing or sparse in OSM. 
+ ''' + return osm_abbreviate(names_gazetteer, name, language, + abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) + + def country_name(self, address_components, country_code, language, + use_country_code_prob=0.3, + local_language_name_prob=0.6, + random_language_name_prob=0.1, + alpha_3_iso_code_prob=0.1, + ): ''' Country names ------------- @@ -506,14 +652,16 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood non_local_language = None - if random.random() < 0.3: + address_country = address_components.get(AddressFormatter.COUNTRY) + + if random.random() < use_country_code_prob: # 30% of the time: add Quattroshapes country - address_country = country.upper() + address_country = country_code.upper() r = random.random() # 1. 60% of the time: use the country name in the current language or the country's local language - if address_country and r < 0.6: + if address_country and r < local_language_name_prob: localized = None if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): localized = language_country_names.get(language, {}).get(address_country.upper()) @@ -522,25 +670,29 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood localized = country_localized_display_name(address_country.lower()) if localized: - address_components[AddressFormatter.COUNTRY] = localized + address_country = localized # 2. 10% of the time: country's name in a language samples from the distribution of languages on the Internet - elif address_country and r < 0.7: + elif address_country and r < local_language_name_prob + random_language_name_prob: non_local_language = sample_random_language() lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper()) if lang_country: - address_components[AddressFormatter.COUNTRY] = lang_country + address_country = lang_country # 3. 
10% of the time: use the country's alpha-3 ISO code - elif address_country and r < 0.8: - iso_code_alpha3 = alpha3_codes.get(address_country) + elif address_country and r < local_language_name_prob + random_language_name_prob + alpha_3_iso_code_prob: + iso_code_alpha3 = self.alpha3_codes.get(address_country) if iso_code_alpha3: - address_components[AddressFormatter.COUNTRY] = iso_code_alpha3 + address_country = iso_code_alpha3 # 4. Implicit: the rest of the time keep the alpha-2 country code + return address_country, non_local_language + + def venue_names(self, value): ''' Venue names ----------- Some venues have multiple names listed in OSM, grab them all + With a certain probability, add None to the list so we drop the name ''' venue_names = [] @@ -548,7 +700,9 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood venue_name = value.get(key) if venue_name: venue_names.append(venue_name) + return venue_names + def state_name(self, address_components, country, language, non_local_language=None, state_full_name_prob=0.3): ''' States ------ @@ -559,14 +713,38 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood ''' address_state = address_components.get(AddressFormatter.STATE) - if address_state and not non_local_language: + if address_state and country and not non_local_language: state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language) - if state_full_name and random.random() < 0.3: - address_components[AddressFormatter.STATE] = state_full_name + if state_full_name and random.random() < state_full_name_prob: + address_state = state_full_name elif address_state and non_local_language: _ = address_components.pop(AddressFormatter.STATE, None) + address_state = None + return address_state + def tag_suffix(self, language, non_local_language, more_than_one_official_language=False): + if non_local_language is not None: + osm_suffix = 
':{}'.format(non_local_language) + elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): + osm_suffix = ':{}'.format(language) + else: + osm_suffix = '' + return osm_suffix + + def add_osm_boundaries(self, address_components, + country, language, + latitude, longitude, + osm_suffix='', + non_local_language=None, + random_key=True, + alpha_3_iso_code_prob=0.1, + alpha_2_iso_code_prob=0.2, + simple_country_key_prob=0.4, + replace_with_non_local_prob=0.4, + join_state_district_prob=0.5, + expand_state_prob=0.7 + ): ''' OSM boundaries -------------- @@ -583,14 +761,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood include these qualifiers in the training data. ''' - osm_components = osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude) - - if non_local_language is not None: - osm_suffix = ':{}'.format(non_local_language) - elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): - osm_suffix = ':{}'.format(language) - else: - osm_suffix = '' + osm_components = osm_reverse_geocoded_components(self.admin_rtree, country, latitude, longitude) name_key = ''.join(('name', osm_suffix)) raw_name_key = 'name' @@ -608,24 +779,29 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood for component, components_values in osm_components.iteritems(): seen = set() - key, raw_key = osm_pick_random_name_key(suffix=osm_suffix) + if random_key: + key, raw_key = self.pick_random_name_key(suffix=osm_suffix) + else: + key, raw_key = name_key, raw_name_key for component_value in components_values: r = random.random() name = None - if iso_code3_key in component_value and r < 0.1: - name = component_value[iso_code3_key] - elif iso_code_key in component_value and r < 0.3: - name = component_value[iso_code_key] - elif language == 'en' and not non_local_language and r < 0.7: - # Particularly to address the US (prefer United States, 
- # not United States of America) but may capture variations - # in other English-speaking countries as well. - if simple_name_key in component_value: - name = component_value[simple_name_key] - elif international_name_key in component_value: - name = component_value[international_name_key] + if component == AddressFormatter.COUNTRY: + if iso_code3_key in component_value and r < alpha_3_iso_code_prob: + name = component_value[iso_code3_key] + elif iso_code_key in component_value and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob: + name = component_value[iso_code_key] + elif language == 'en' and not non_local_language and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob + simple_country_key_prob: + # Particularly to address the US (prefer United States, + # not United States of America) but may capture variations + # in other English-speaking countries as well. + if simple_name_key in component_value: + name = component_value[simple_name_key] + elif international_name_key in component_value: + name = component_value[international_name_key] + if not name: name = component_value.get(key, component_value.get(raw_key)) @@ -640,17 +816,22 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood seen.add((component, name)) for component, vals in poly_components.iteritems(): - if component not in address_components or (non_local_language and random.random() < 0.4): - if component == AddressFormatter.STATE_DISTRICT and random.random() < 0.5: + if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob): + if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob: num = random.randrange(1, len(vals) + 1) val = u', '.join(vals[:num]) else: val = random.choice(vals) - if component == AddressFormatter.STATE and random.random() < 0.7: - val = STATE_EXPANSIONS.get(address_country, {}).get(val, val) + if component == AddressFormatter.STATE and random.random() < 
expand_state_prob: + val = STATE_EXPANSIONS.get(country.upper(), {}).get(val, val) address_components[component] = val + def quattroshapes_city(self, address_components, + latitude, longitude, + language, non_local_language=None, + qs_add_city_prob=0.2, + abbreviated_name_prob=0.1): ''' Quattroshapes/GeoNames cities ----------------------------- @@ -661,13 +842,15 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood reliably use local names, which we'll want for consistency ''' - if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < 0.2): + city = None + + if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < qs_add_city_prob): lang = non_local_language or language - quattroshapes_cities = quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True) + quattroshapes_cities = self.quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True) for result in quattroshapes_cities: - if result.get(quattroshapes_rtree.LEVEL) == quattroshapes_rtree.LOCALITY and quattroshapes_rtree.GEONAMES_ID in result: - geonames_id = int(result[quattroshapes_rtree.GEONAMES_ID].split(',')[0]) - names = geonames.get_alternate_names(geonames_id) + if result.get(self.quattroshapes_rtree.LEVEL) == self.quattroshapes_rtree.LOCALITY and self.quattroshapes_rtree.GEONAMES_ID in result: + geonames_id = int(result[self.quattroshapes_rtree.GEONAMES_ID].split(',')[0]) + names = self.geonames.get_alternate_names(geonames_id) if not names or lang not in names: continue @@ -676,20 +859,27 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if 'abbr' not in names or non_local_language: # Use the common city name in the target language city = names[lang][0][0] - elif random.random() < 0.1: + elif random.random() < abbreviated_name_prob: # Use an abbreviation: NYC, BK, SF, etc. 
city = random.choice(names['abbr'])[0] if not city or not city.strip(): continue - address_components[AddressFormatter.CITY] = city + return city break else: if non_local_language and AddressFormatter.CITY in address_components and ( - AddressFormatter.CITY_DISTRICT in osm_components or - AddressFormatter.SUBURB in osm_components): + AddressFormatter.CITY_DISTRICT in address_components or + AddressFormatter.SUBURB in address_components): address_components.pop(AddressFormatter.CITY) + return city + + def add_neighborhoods(self, address_components, + latitude, longitude, + osm_suffix='', + add_prefix_prob=0.5, + add_neighborhood_prob=0.5): ''' Neighborhoods ------------- @@ -702,13 +892,17 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood on the whole of better quality). ''' - neighborhoods = neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True) + neighborhoods = self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True) neighborhood_levels = defaultdict(list) + + name_key = ''.join(('name', osm_suffix)) + raw_name_key = 'name' + for neighborhood in neighborhoods: place_type = neighborhood.get('place') polygon_type = neighborhood.get('polygon_type') - key, raw_key = osm_pick_random_name_key(suffix=osm_suffix) + key, raw_key = self.pick_random_name_key(suffix=osm_suffix) name = neighborhood.get(key, neighborhood.get(raw_key)) if not name: @@ -716,7 +910,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood name_prefix = neighborhood.get('name:prefix') - if name_prefix and random.random() < 0.5: + if name_prefix and random.random() < add_prefix_prob: name = u' '.join([name_prefix, name]) if not name: @@ -737,9 +931,10 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood neighborhood_levels[neighborhood_level].append(name) for component, neighborhoods in neighborhood_levels.iteritems(): - if component not in address_components and 
random.random() < 0.5: + if component not in address_components and random.random() < add_neighborhood_prob: address_components[component] = neighborhoods[0] + def normalize_names(self, address_components, replacement_prob=0.6): ''' Name normalization ------------------ @@ -751,9 +946,22 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if not name: continue replacement = replace_name_prefixes(replace_name_suffixes(name)) - if replacement != name and random.random() < 0.6: + if replacement != name and random.random() < replacement_prob: address_components[component] = replacement + def replace_names(self, address_components): + ''' + Name replacements + ----------------- + + Make a few special replacements (like UK instead of GB) + ''' + for component, value in address_components.iteritems(): + replacement, prob = RANDOM_VALUE_REPLACEMENTS.get(component, {}).get(value, (None, 0.0)) + if replacement is not None and random.random() < prob: + address_components[component] = replacement + + def prune_duplicate_names(self, address_components): ''' Name deduping ------------- @@ -764,7 +972,8 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood name_components = defaultdict(list) - for component in (AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB): + for component in (AddressFormatter.CITY, AddressFormatter.STATE_DISTRICT, + AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB): name = address_components.get(component) if name: name_components[name].append(component) @@ -774,7 +983,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood for component in components[1:]: address_components.pop(component, None) - + def cleanup_house_number(self, address_components): ''' House number cleanup -------------------- @@ -796,8 +1005,161 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood 
else: address_components.pop(AddressFormatter.HOUSE_NUMBER, None) + def expanded_address_components(self, value): + try: + latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) + except Exception: + return None, None, None + + country, candidate_languages, language_props = country_and_languages(self.language_rtree, latitude, longitude) + if not (country and candidate_languages): + return None, None, None + + for key in OSM_IGNORE_KEYS: + _ = value.pop(key, None) + + language = None + + more_than_one_official_language = len(candidate_languages) > 1 + + language = self.pick_language(value, candidate_languages) + + address_components = self.normalize_address_components(value) + + address_country, non_local_language = self.country_name(address_components, country, language) + if address_country: + address_components[AddressFormatter.COUNTRY] = address_country + + address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, state_full_name_prob=1.0) + if address_state: + address_components[AddressFormatter.STATE] = address_state + + osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language) + + self.add_osm_boundaries(address_components, country, language, latitude, longitude, + non_local_language=non_local_language, + osm_suffix=osm_suffix) + + city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language) + if city: + address_components[AddressFormatter.CITY] = city + + self.add_neighborhoods(address_components, latitude, longitude, + osm_suffix=osm_suffix) + + street = address_components.get(AddressFormatter.ROAD) + if street: + address_components[AddressFormatter.ROAD] = self.abbreviated_street(street, language) + + self.normalize_names(address_components) + + self.replace_names(address_components) + + self.prune_duplicate_names(address_components) + + self.cleanup_house_number(address_components) + + return 
address_components, country, language + + def limited_address_components(self, value): + try: + latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) + except Exception: + return None, None, None + + country, candidate_languages, language_props = country_and_languages(self.language_rtree, latitude, longitude) + if not (country and candidate_languages): + return None, None, None + + remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS + + for key in remove_keys: + _ = value.pop(key, None) + + language = None + + more_than_one_official_language = len(candidate_languages) > 1 + + language = self.pick_language(value, candidate_languages) + + address_components = self.normalize_address_components(value) + + address_country, non_local_language = self.country_name(address_components, country, language, + use_country_code_prob=0.0, + local_language_name_prob=1.0, + random_language_name_prob=0.0, + alpha_3_iso_code_prob=0.0) + if address_country: + address_components[AddressFormatter.COUNTRY] = address_country + + address_state = self.state_name(address_components, non_local_language) + if address_state: + address_components[AddressFormatter.STATE] = address_state + + street = address_components.get(AddressFormatter.ROAD) + if street: + address_components[AddressFormatter.ROAD] = self.abbreviated_street(street, language) + + osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language) + + self.add_osm_boundaries(address_components, country, language, latitude, longitude, + osm_suffix=osm_suffix, + non_local_language=non_local_language, + random_key=False, + alpha_3_iso_code_prob=0.0, + alpha_2_iso_code_prob=0.0, + replace_with_non_local_prob=0.0, + expand_state_prob=1.0) + + city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language) + + if city: + address_components[AddressFormatter.CITY] = city + + self.add_neighborhoods(address_components, 
latitude, longitude, + osm_suffix=osm_suffix) + + self.normalize_names(address_components) + + self.prune_duplicate_names(address_components) + + return address_components, country, language + + def formatted_addresses(self, value, dropout_prob=0.5, tag_components=True): + ''' + Formatted addresses + ------------------- + + Produces one or more formatted addresses (tagged/untagged) + from the given dictionary of OSM tags and values. + + Here we also apply component dropout meaning we produce several + different addresses with various components removed at random. + That way the parser will have many examples of queries that are + just city/state or just house_number/street. The selected + components still have to make sense i.e. a lone house_number will + not be used without a street name. The dependencies are listed + above, see: OSM_ADDRESS_COMPONENTS. + + If there is more than one venue name (say name and alt_name), + addresses using both names and the selected components are + returned. 
+ ''' + + venue_names = self.venue_names(value) or [] + + address_components, country, language = self.expanded_address_components(value) + + if not address_components: + return None, None, None + + for venue_name in venue_names: + abbreviated_venue = self.abbreviated_venue_name(venue_name, language) + if abbreviated_venue != venue_name and abbreviated_venue not in set(venue_names): + venue_names.append(abbreviated_venue) + # Version with all components - formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components) + formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components) if tag_components: formatted_addresses = [] @@ -807,7 +1169,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES} if not address_components: - continue + return [] current_components = address_components.keys() random.shuffle(current_components) @@ -815,36 +1177,159 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood component_set = component_bitset(address_components.keys()) for component in current_components: - if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5: + if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < dropout_prob: address_components.pop(component) component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component] if not address_components: - break + return [] # Since venue names are 1-per-record, we must use them all for venue_name in (venue_names or [None]): if venue_name and AddressFormatter.HOUSE in address_components: address_components[AddressFormatter.HOUSE] = venue_name - formatted_address = formatter.format_address(country, 
address_components, tag_components=tag_components, minimal_only=False) - if formatted_address not in seen: + formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False) + if formatted_address and formatted_address not in seen: formatted_addresses.append(formatted_address) seen.add(formatted_address) + return formatted_addresses, country, language + else: + formatted_addresses = [] + seen = set() + # Since venue names are 1-per-record, we must use them all + for venue_name in (venue_names or [None]): + if venue_name: + address_components[AddressFormatter.HOUSE] = venue_name + formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False) + if formatted_address and formatted_address not in seen: + formatted_addresses.append(formatted_address) + seen.add(formatted_address) + return formatted_addresses, country, language + + def formatted_address_limited(self, value, admin_dropout_prob=0.7): + address_components, country, language = self.limited_address_components(value) + + if not address_components: + return None, None, None + + formatted_addresses = [] + + address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES} + if not address_components: + return None, None, None + + current_components = address_components.keys() + random.shuffle(current_components) + + for component in (AddressFormatter.COUNTRY, AddressFormatter.STATE, + AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, + AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB): + if random.random() < admin_dropout_prob: + _ = address_components.pop(component, None) + + if not address_components: + return None, None, None + + # Version with all components + formatted_address = self.formatter.format_address(country, address_components, tag_components=False, minimal_only=False) + + return formatted_address, country, language + + def 
build_training_data(self, infile, out_dir, tag_components=True): + ''' + Creates formatted address training data for supervised sequence labeling (or potentially + for unsupervised learning e.g. for word vectors) using addr:* tags in OSM. + + Example: + + cs cz Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country + + The field structure is similar to other training data created by this script i.e. + {language, country, data}. The data field here is a sequence of labeled tokens similar + to what we might see in part-of-speech tagging. + + + This format uses a special character "|" to denote possible breaks in the input (comma, newline). + + Note that for the address parser, we'd like it to be robust to many different types + of input, so we may selectively eliminate components. + + This information can potentially be used downstream by the sequence model as these + breaks may be present at prediction time. + + Example: + + sr rs Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic + + This may be useful in learning word representations, statistical phrases, morphology + or other models requiring only the sequence of words. 
+ ''' + i = 0 + + if tag_components: + formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w') + writer = csv.writer(formatted_tagged_file, 'tsv_no_quote') + else: + formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w') + writer = csv.writer(formatted_file, 'tsv_no_quote') + + for node_id, value, deps in parse_osm(infile): + formatted_addresses, country, language = self.formatted_addresses(value, tag_components=tag_components) + if not formatted_addresses: + continue + for formatted_address in formatted_addresses: if formatted_address and formatted_address.strip(): formatted_address = tsv_string(formatted_address) if not formatted_address or not formatted_address.strip(): continue - row = (language, country, formatted_address) + + if tag_components: + row = (language, country, formatted_address) + else: + row = formatted_address writer.writerow(row) - elif formatted_address and formatted_address.strip(): - formatted_address = tsv_string(formatted_address) - writer.writerow([formatted_address]) - i += 1 - if i % 1000 == 0 and i > 0: - print 'did', i, 'formatted addresses' + i += 1 + if i % 1000 == 0 and i > 0: + print('did {} formatted addresses'.format(i)) + + def build_limited_training_data(self, infile, out_dir): + ''' + Creates a special kind of formatted address training data from OSM's addr:* tags + but are designed for use in language classification. These records are similar + to the untagged formatted records but include the language and country + (suitable for concatenation with the rest of the language training data), + and remove several fields like country which usually do not contain helpful + information for classifying the language. 
+ + Example: + + nb no Olaf Ryes Plass Oslo + ''' + i = 0 + + f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w') + writer = csv.writer(f, 'tsv_no_quote') + + for node_id, value, deps in parse_osm(infile): + formatted_address, country, language = self.formatted_address_limited(value) + if not formatted_address: + continue + + if formatted_address.strip(): + formatted_address = tsv_string(formatted_address.strip()) + if not formatted_address or not formatted_address.strip(): + continue + + row = (language, country, formatted_address) + writer.writerow(row) + + i += 1 + if i % 1000 == 0 and i > 0: + print('did {} formatted addresses'.format(i)) + NAME_KEYS = ( 'name', @@ -875,93 +1360,6 @@ POSTAL_KEYS = ( ) -def build_address_format_training_data_limited(language_rtree, infile, out_dir): - ''' - Creates a special kind of formatted address training data from OSM's addr:* tags - but are designed for use in language classification. These records are similar - to the untagged formatted records but include the language and country - (suitable for concatenation with the rest of the language training data), - and remove several fields like country which usually do not contain helpful - information for classifying the language. 
- - Example: - - nb no Olaf Ryes Plass Oslo - ''' - i = 0 - - # Simple whitespace splitter is all that's necessary - formatter = AddressFormatter(splitter=u' ') - - f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w') - writer = csv.writer(f, 'tsv_no_quote') - - remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + COUNTRY_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS - - country_keys_set = set(COUNTRY_KEYS) - - for key, value, deps in parse_osm(infile): - try: - latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) - except Exception: - continue - - have_country = False - - for k in remove_keys: - if k in country_keys_set and k in value: - have_country = True - _ = value.pop(k, None) - - if have_country and random.random() < 0.4: - have_country = False - elif not have_country and random.random() < 0.2: - have_country = True - - if not value: - continue - - country, name_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street') - if not name_language: - continue - - if have_country: - value['addr:country'] = u'' - - single_language = len(name_language) == 1 - - for lang, val in name_language.iteritems(): - if lang not in languages: - continue - - if have_country: - localized = language_country_names.get(lang, {}).get(country.upper()) - - if localized: - value['addr:country:{}'.format(lang)] = localized - - address_dict = value.copy() - for k in address_dict.keys(): - namespaced_val = u'{}:{}'.format(k, lang) - if namespaced_val in address_dict: - address_dict[k] = address_dict[namespaced_val] - elif not single_language: - address_dict.pop(k) - - if not address_dict: - continue - - formatted_address_untagged = formatter.format_address(country, address_dict, minimal_only=False, tag_components=False) - if formatted_address_untagged is not None: - formatted_address_untagged = tsv_string(formatted_address_untagged) - - writer.writerow((lang, country, formatted_address_untagged)) - - i += 1 - if i % 1000 == 0 and i > 0: - 
print 'did', i, 'formatted addresses' - - def build_toponym_training_data(language_rtree, infile, out_dir): ''' Data set of toponyms by language and country which should assist @@ -1054,7 +1452,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir): continue writer.writerow((k, country, tsv_string(s))) if i % 1000 == 0 and i > 0: - print 'did', i, 'toponyms' + print('did {} toponyms'.format(i)) i += 1 f.close() @@ -1086,7 +1484,7 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False): if k in languages: writer.writerow((k, country, tsv_string(s))) if i % 1000 == 0 and i > 0: - print 'did', i, 'streets' + print('did {} streets'.format(i)) i += 1 f.close() @@ -1124,7 +1522,7 @@ def build_venue_training_data(language_rtree, infile, out_dir): if k in languages: writer.writerow((k, country, safe_encode(venue_type), tsv_string(s))) if i % 1000 == 0 and i > 0: - print 'did', i, 'venues' + print('did {} venues'.format(i)) i += 1 f.close() @@ -1192,6 +1590,7 @@ if __name__ == '__main__': init_country_names() init_languages() + init_disambiguation() language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir) osm_rtree = None @@ -1221,7 +1620,7 @@ if __name__ == '__main__': if args.address_file and not args.format_only and not args.limited_addresses: build_address_training_data(language_rtree, args.address_file, args.out_dir) - elif args.address_file and not args.limited_addresses: + elif args.address_file: if osm_rtree is None: parser.error('--rtree-dir required for formatted addresses') elif neighborhoods_rtree is None: @@ -1232,8 +1631,10 @@ if __name__ == '__main__': parser.error('--geonames-db required for formatted addresses') if args.address_file and args.format_only: - build_address_format_training_data(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, args.address_file, args.out_dir, tag_components=not args.untagged) + osm_formatter = OSMAddressFormatter(osm_rtree, language_rtree, 
neighborhoods_rtree, quattroshapes_rtree, geonames) + osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged) if args.address_file and args.limited_addresses: - build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir) + osm_formatter = OSMAddressFormatter(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + osm_formatter.build_limited_training_data(args.address_file, args.out_dir) if args.venues_file: build_venue_training_data(language_rtree, args.venues_file, args.out_dir)