From 6ac72576bc281217fb036431e284ffdfe36da50b Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 22 Jan 2016 02:56:31 -0500 Subject: [PATCH] [osm/formatting] Randomly abbreviating street names and venue names using all the available libpostal dictionaries. Refactoring OSM formatting into separate methods which can be individually tested. Adding override for special phrases like UK --- .../geodata/osm/osm_address_training_data.py | 933 +++++++++++++----- 1 file changed, 667 insertions(+), 266 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index ce12df21..6f09d4c5 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -131,6 +131,16 @@ def num_deps(c): return len(c.dependencies) +RANDOM_VALUE_REPLACEMENTS = { + # Key: address component + AddressFormatter.COUNTRY: { + # value: (replacement, probability) + 'GB': ('UK', 0.3), + 'United Kingdom': ('UK', 0.3), + } +} + + OSM_ADDRESS_COMPONENTS_SORTED = sorted(OSM_ADDRESS_COMPONENTS, key=num_deps) OSM_ADDRESS_COMPONENT_COMBINATIONS = [] @@ -309,6 +319,155 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): return country, name_language +ALL_LANGUAGES = 'all' + +LOWER, UPPER, TITLE, MIXED = range(4) + + +def token_capitalization(s): + if s.istitle(): + return TITLE + elif s.islower(): + return LOWER + elif s.isupper(): + return UPPER + else: + return MIXED + + +def recase_abbreviation(expansion, tokens): + expansion_tokens = expansion.split() + if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)): + return expansion.upper() + elif len(tokens) == len(expansion_tokens): + strings = [] + for (t, c), e in zip(tokens, expansion_tokens): + cap = token_capitalization(t) + if cap == LOWER: + strings.append(e.lower()) + elif cap == UPPER: + strings.append(e.upper()) + elif cap == TITLE: + strings.append(e.title()) + elif 
t.lower() == e.lower(): + strings.append(t) + else: + strings.append(e.title()) + return u' '.join(strings) + else: + return u' '.join([t.title() for t in expansion_tokens]) + + +def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2): + ''' + Abbreviations + ------------- + + OSM discourages abbreviations, but to make our training data map better + to real-world input, we can safely replace the canonical phrase with an + abbreviated version and retain the meaning of the words + ''' + tokens = tokenize(s) + norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens] + + abbreviated = [] + + i = 0 + + for t, c, length, data in gazetteer.filter(norm_tokens): + if c is PHRASE: + valid = [] + data = [d.split('|') for d in data] + + added = False + + for lang, dictionary, is_canonical, canonical in data: + if lang not in (language, 'all'): + continue + + is_canonical = int(is_canonical) + is_stopword = dictionary == 'stopword' + is_prefix = dictionary.startswith('concatenated_prefixes') + is_suffix = dictionary.startswith('concatenated_suffixes') + is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length + + suffix = None + prefix = None + + if not is_canonical or random.random() > abbreviate_prob: + continue + + if not is_prefix and not is_suffix: + abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary)) + token = random.choice(abbreviations) if abbreviations else canonical + token = recase_abbreviation(token, tokens[i:i + len(t)]) + abbreviated.append(token) + if t[-1][1] != token_types.IDEOGRAPHIC_CHAR: + abbreviated.append(u' ') + added = True + elif is_prefix: + token = tokens[i][0] + prefix, token = token[:length], token[length:] + abbreviated.append(prefix) + if random.random() < separate_prob: + abbreviated.append(u' ') + if token.islower(): + abbreviated.append(token.title()) + else: + abbreviated.append(token) + abbreviated.append(u' ') + 
added = True + elif is_suffix: + token = tokens[i][0] + + token, suffix = token[:-length], token[-length:] + + concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), []) + + separated_abbreviations = [] + phrase = gazetteer.trie.get(suffix.rstrip('.')) + suffix_data = [safe_decode(d).split(u'|') for d in (phrase or [])] + for l, d, _, c in suffix_data: + if l == lang and c == canonical: + separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d))) + + separate = random.random() < separate_prob + + if concatenated_abbreviations and not separate: + abbreviation = random.choice(concatenated_abbreviations) + elif separated_abbreviations: + abbreviation = random.choice(separated_abbreviations) + else: + abbreviation = canonical + + abbreviated.append(token) + if separate: + abbreviated.append(u' ') + if suffix.isupper(): + abbreviated.append(abbreviation.upper()) + elif separate: + abbreviated.append(abbreviation.title()) + else: + abbreviated.append(abbreviation) + abbreviated.append(u' ') + added = True + + if not added: + for j, (t_i, c_i) in enumerate(t): + abbreviated.append(tokens[i + j][0]) + if c_i != token_types.IDEOGRAPHIC_CHAR: + abbreviated.append(u' ') + i += len(t) + + else: + abbreviated.append(tokens[i][0]) + if (c != token_types.IDEOGRAPHIC_CHAR): + abbreviated.append(u' ') + i += 1 + + return u''.join(abbreviated).strip() + + def build_ways_training_data(language_rtree, infile, out_dir): ''' Creates a training set for language classification using most OSM ways @@ -334,7 +493,7 @@ def build_ways_training_data(language_rtree, infile, out_dir): if k in languages: writer.writerow((k, country, tsv_string(s))) if i % 1000 == 0 and i > 0: - print 'did', i, 'ways' + print('did {} ways'.format(i)) i += 1 f.close() @@ -362,124 +521,111 @@ def osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude): return ret -def osm_pick_random_name_key(suffix=''): - name_key = ''.join(('name', suffix)) - 
raw_name_key = 'name' - short_name_key = ''.join(('short_name', suffix)) - raw_short_name_key = 'short_name' - alt_name_key = ''.join(('alt_name', suffix)) - raw_alt_name_key = 'alt_name' - official_name_key = ''.join(('official_name', suffix)) - raw_official_name_key = 'official_name' - - # Choose which name to use with given probabilities - r = random.random() - if r < 0.7: - # 70% of the time use the name tag - key = name_key - raw_key = raw_name_key - elif r < 0.8: - # 10% of the time use the short name - key = short_name_key - raw_key = raw_short_name_key - elif r < 0.9: - # 10% of the time use the official name - key = official_name_key - raw_key = raw_official_name_key - else: - # 10% of the time use the official name - key = alt_name_key - raw_key = raw_alt_name_key - - return key, raw_key - - -def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, infile, out_dir, tag_components=True): - ''' - Creates formatted address training data for supervised sequence labeling (or potentially - for unsupervised learning e.g. for word vectors) using addr:* tags in OSM. - - Example: - - cs cz Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country - - The field structure is similar to other training data created by this script i.e. - {language, country, data}. The data field here is a sequence of labeled tokens similar - to what we might see in part-of-speech tagging. - - - This format uses a special character "|" to denote possible breaks in the input (comma, newline). - - Note that for the address parser, we'd like it to be robust to many different types - of input, so we may selectively eleminate components - - This information can potentially be used downstream by the sequence model as these - breaks may be present at prediction time. 
- - Example: - - sr rs Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic - - This may be useful in learning word representations, statistical phrases, morphology - or other models requiring only the sequence of words. - ''' - i = 0 - - formatter = AddressFormatter() - osm_address_components.configure() - - if tag_components: - formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w') - writer = csv.writer(formatted_tagged_file, 'tsv_no_quote') - else: - formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w') - writer = csv.writer(formatted_file, 'tsv_no_quote') - - remove_keys = OSM_IGNORE_KEYS - +class OSMAddressFormatter(object): alpha3_codes = {c.alpha2: c.alpha3 for c in pycountry.countries} - for node_id, value, deps in parse_osm(infile): - try: - latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) - except Exception: - continue - - country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude) - if not (country and candidate_languages): - continue - - for key in remove_keys: - _ = value.pop(key, None) + def __init__(self, admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, splitter=None): + self.admin_rtree = admin_rtree + self.language_rtree = language_rtree + self.neighborhoods_rtree = neighborhoods_rtree + self.quattroshapes_rtree = quattroshapes_rtree + self.geonames = geonames + self.formatter = AddressFormatter(splitter=splitter) + osm_address_components.configure() + def pick_language(self, value, candidate_languages, pick_namespaced_language_prob=0.6): language = None - more_than_one_official_language = len(candidate_languages) > 1 + if len(candidate_languages) == 1: + language = candidate_languages[0]['lang'] + else: + street = value.get('addr:street', None) - if tag_components: - if len(candidate_languages) == 1: - language = candidate_languages[0]['lang'] + namespaced = [l['lang'] for l in 
candidate_languages if 'addr:street:{}'.format(l['lang']) in value] + + if street is not None and not namespaced: + language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages]) + elif namespaced and random.random() < pick_namespaced_language_prob: + language = random.choice(namespaced) + lang_suffix = ':{}'.format(language) + for k in value: + if k.startswith('addr:') and k.endswith(lang_suffix): + value[k.rstrip(lang_suffix)] = value[k] else: - street = value.get('addr:street', None) + language = UNKNOWN_LANGUAGE - namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value] + return language - if street is not None and not namespaced: - language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages]) - elif namespaced and random.random() < 0.6: - language = random.choice(namespaced) - lang_suffix = ':{}'.format(language) - for k in value: - if k.startswith('addr:') and k.endswith(lang_suffix): - value[k.rstrip(lang_suffix)] = value[k] - else: - language = UNKNOWN_LANGUAGE + def pick_random_name_key(self, suffix=''): + name_key = ''.join(('name', suffix)) + raw_name_key = 'name' + short_name_key = ''.join(('short_name', suffix)) + raw_short_name_key = 'short_name' + alt_name_key = ''.join(('alt_name', suffix)) + raw_alt_name_key = 'alt_name' + official_name_key = ''.join(('official_name', suffix)) + raw_official_name_key = 'official_name' - address_components = {k: v for k, v in value.iteritems() if k in formatter.aliases} - formatter.replace_aliases(address_components) + # Choose which name to use with given probabilities + r = random.random() + if r < 0.7: + # 70% of the time use the name tag + key = name_key + raw_key = raw_name_key + elif r < 0.8: + # 10% of the time use the short name + key = short_name_key + raw_key = raw_short_name_key + elif r < 0.9: + # 10% of the time use the official name + key = official_name_key + raw_key = 
raw_official_name_key + else: + # 10% of the time use the official name + key = alt_name_key + raw_key = raw_alt_name_key - address_country = address_components.get(AddressFormatter.COUNTRY) + return key, raw_key + def normalize_address_components(self, value): + address_components = {k: v for k, v in value.iteritems() if k in self.formatter.aliases} + self.formatter.replace_aliases(address_components) + return address_components + + def abbreviated_street(self, street, language, abbreviate_prob=0.3, separate_prob=0.2): + ''' + Street abbreviations + -------------------- + + Use street and unit type dictionaries to probabilistically abbreviate + phrases. Because the abbreviation is picked at random, this should + help bridge the gap between OSM addresses and user input, in addition + to capturing some non-standard abbreviations/surface forms which may be + missing or sparse in OSM. + ''' + return osm_abbreviate(street_and_unit_types_gazetteer, street, language, + abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) + + def abbreviated_venue_name(self, name, language, abbreviate_prob=0.2, separate_prob=0.0): + ''' + Venue abbreviations + ------------------- + + Use street and unit type dictionaries to probabilistically abbreviate + phrases. Because the abbreviation is picked at random, this should + help bridge the gap between OSM addresses and user input, in addition + to capturing some non-standard abbreviations/surface forms which may be + missing or sparse in OSM. 
+ ''' + return osm_abbreviate(names_gazetteer, name, language, + abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) + + def country_name(self, address_components, country_code, language, + use_country_code_prob=0.3, + local_language_name_prob=0.6, + random_language_name_prob=0.1, + alpha_3_iso_code_prob=0.1, + ): ''' Country names ------------- @@ -506,14 +652,16 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood non_local_language = None - if random.random() < 0.3: + address_country = address_components.get(AddressFormatter.COUNTRY) + + if random.random() < use_country_code_prob: # 30% of the time: add Quattroshapes country - address_country = country.upper() + address_country = country_code.upper() r = random.random() # 1. 60% of the time: use the country name in the current language or the country's local language - if address_country and r < 0.6: + if address_country and r < local_language_name_prob: localized = None if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): localized = language_country_names.get(language, {}).get(address_country.upper()) @@ -522,25 +670,29 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood localized = country_localized_display_name(address_country.lower()) if localized: - address_components[AddressFormatter.COUNTRY] = localized + address_country = localized # 2. 10% of the time: country's name in a language samples from the distribution of languages on the Internet - elif address_country and r < 0.7: + elif address_country and r < local_language_name_prob + random_language_name_prob: non_local_language = sample_random_language() lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper()) if lang_country: - address_components[AddressFormatter.COUNTRY] = lang_country + address_country = lang_country # 3. 
10% of the time: use the country's alpha-3 ISO code - elif address_country and r < 0.8: - iso_code_alpha3 = alpha3_codes.get(address_country) + elif address_country and r < local_language_name_prob + random_language_name_prob + alpha_3_iso_code_prob: + iso_code_alpha3 = self.alpha3_codes.get(address_country) if iso_code_alpha3: - address_components[AddressFormatter.COUNTRY] = iso_code_alpha3 + address_country = iso_code_alpha3 # 4. Implicit: the rest of the time keep the alpha-2 country code + return address_country, non_local_language + + def venue_names(self, value): ''' Venue names ----------- Some venues have multiple names listed in OSM, grab them all + With a certain probability, add None to the list so we drop the name ''' venue_names = [] @@ -548,7 +700,9 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood venue_name = value.get(key) if venue_name: venue_names.append(venue_name) + return venue_names + def state_name(self, address_components, country, language, non_local_language=None, state_full_name_prob=0.3): ''' States ------ @@ -559,14 +713,38 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood ''' address_state = address_components.get(AddressFormatter.STATE) - if address_state and not non_local_language: + if address_state and country and not non_local_language: state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language) - if state_full_name and random.random() < 0.3: - address_components[AddressFormatter.STATE] = state_full_name + if state_full_name and random.random() < state_full_name_prob: + address_state = state_full_name elif address_state and non_local_language: _ = address_components.pop(AddressFormatter.STATE, None) + address_state = None + return address_state + def tag_suffix(self, language, non_local_language, more_than_one_official_language=False): + if non_local_language is not None: + osm_suffix = 
':{}'.format(non_local_language) + elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): + osm_suffix = ':{}'.format(language) + else: + osm_suffix = '' + return osm_suffix + + def add_osm_boundaries(self, address_components, + country, language, + latitude, longitude, + osm_suffix='', + non_local_language=None, + random_key=True, + alpha_3_iso_code_prob=0.1, + alpha_2_iso_code_prob=0.2, + simple_country_key_prob=0.4, + replace_with_non_local_prob=0.4, + join_state_district_prob=0.5, + expand_state_prob=0.7 + ): ''' OSM boundaries -------------- @@ -583,14 +761,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood include these qualifiers in the training data. ''' - osm_components = osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude) - - if non_local_language is not None: - osm_suffix = ':{}'.format(non_local_language) - elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): - osm_suffix = ':{}'.format(language) - else: - osm_suffix = '' + osm_components = osm_reverse_geocoded_components(self.admin_rtree, country, latitude, longitude) name_key = ''.join(('name', osm_suffix)) raw_name_key = 'name' @@ -608,24 +779,29 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood for component, components_values in osm_components.iteritems(): seen = set() - key, raw_key = osm_pick_random_name_key(suffix=osm_suffix) + if random_key: + key, raw_key = self.pick_random_name_key(suffix=osm_suffix) + else: + key, raw_key = name_key, raw_name_key for component_value in components_values: r = random.random() name = None - if iso_code3_key in component_value and r < 0.1: - name = component_value[iso_code3_key] - elif iso_code_key in component_value and r < 0.3: - name = component_value[iso_code_key] - elif language == 'en' and not non_local_language and r < 0.7: - # Particularly to address the US (prefer United States, 
- # not United States of America) but may capture variations - # in other English-speaking countries as well. - if simple_name_key in component_value: - name = component_value[simple_name_key] - elif international_name_key in component_value: - name = component_value[international_name_key] + if component == AddressFormatter.COUNTRY: + if iso_code3_key in component_value and r < alpha_3_iso_code_prob: + name = component_value[iso_code3_key] + elif iso_code_key in component_value and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob: + name = component_value[iso_code_key] + elif language == 'en' and not non_local_language and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob + simple_country_key_prob: + # Particularly to address the US (prefer United States, + # not United States of America) but may capture variations + # in other English-speaking countries as well. + if simple_name_key in component_value: + name = component_value[simple_name_key] + elif international_name_key in component_value: + name = component_value[international_name_key] + if not name: name = component_value.get(key, component_value.get(raw_key)) @@ -640,17 +816,22 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood seen.add((component, name)) for component, vals in poly_components.iteritems(): - if component not in address_components or (non_local_language and random.random() < 0.4): - if component == AddressFormatter.STATE_DISTRICT and random.random() < 0.5: + if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob): + if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob: num = random.randrange(1, len(vals) + 1) val = u', '.join(vals[:num]) else: val = random.choice(vals) - if component == AddressFormatter.STATE and random.random() < 0.7: - val = STATE_EXPANSIONS.get(address_country, {}).get(val, val) + if component == AddressFormatter.STATE and random.random() < 
expand_state_prob: + val = STATE_EXPANSIONS.get(country.upper(), {}).get(val, val) address_components[component] = val + def quattroshapes_city(self, address_components, + latitude, longitude, + language, non_local_language=None, + qs_add_city_prob=0.2, + abbreviated_name_prob=0.1): ''' Quattroshapes/GeoNames cities ----------------------------- @@ -661,13 +842,15 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood reliably use local names, which we'll want for consistency ''' - if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < 0.2): + city = None + + if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < qs_add_city_prob): lang = non_local_language or language - quattroshapes_cities = quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True) + quattroshapes_cities = self.quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True) for result in quattroshapes_cities: - if result.get(quattroshapes_rtree.LEVEL) == quattroshapes_rtree.LOCALITY and quattroshapes_rtree.GEONAMES_ID in result: - geonames_id = int(result[quattroshapes_rtree.GEONAMES_ID].split(',')[0]) - names = geonames.get_alternate_names(geonames_id) + if result.get(self.quattroshapes_rtree.LEVEL) == self.quattroshapes_rtree.LOCALITY and self.quattroshapes_rtree.GEONAMES_ID in result: + geonames_id = int(result[self.quattroshapes_rtree.GEONAMES_ID].split(',')[0]) + names = self.geonames.get_alternate_names(geonames_id) if not names or lang not in names: continue @@ -676,20 +859,27 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if 'abbr' not in names or non_local_language: # Use the common city name in the target language city = names[lang][0][0] - elif random.random() < 0.1: + elif random.random() < abbreviated_name_prob: # Use an abbreviation: NYC, BK, SF, etc. 
city = random.choice(names['abbr'])[0] if not city or not city.strip(): continue - address_components[AddressFormatter.CITY] = city + return city break else: if non_local_language and AddressFormatter.CITY in address_components and ( - AddressFormatter.CITY_DISTRICT in osm_components or - AddressFormatter.SUBURB in osm_components): + AddressFormatter.CITY_DISTRICT in address_components or + AddressFormatter.SUBURB in address_components): address_components.pop(AddressFormatter.CITY) + return city + + def add_neighborhoods(self, address_components, + latitude, longitude, + osm_suffix='', + add_prefix_prob=0.5, + add_neighborhood_prob=0.5): ''' Neighborhoods ------------- @@ -702,13 +892,17 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood on the whole of better quality). ''' - neighborhoods = neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True) + neighborhoods = self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True) neighborhood_levels = defaultdict(list) + + name_key = ''.join(('name', osm_suffix)) + raw_name_key = 'name' + for neighborhood in neighborhoods: place_type = neighborhood.get('place') polygon_type = neighborhood.get('polygon_type') - key, raw_key = osm_pick_random_name_key(suffix=osm_suffix) + key, raw_key = self.pick_random_name_key(suffix=osm_suffix) name = neighborhood.get(key, neighborhood.get(raw_key)) if not name: @@ -716,7 +910,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood name_prefix = neighborhood.get('name:prefix') - if name_prefix and random.random() < 0.5: + if name_prefix and random.random() < add_prefix_prob: name = u' '.join([name_prefix, name]) if not name: @@ -737,9 +931,10 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood neighborhood_levels[neighborhood_level].append(name) for component, neighborhoods in neighborhood_levels.iteritems(): - if component not in address_components and 
random.random() < 0.5: + if component not in address_components and random.random() < add_neighborhood_prob: address_components[component] = neighborhoods[0] + def normalize_names(self, address_components, replacement_prob=0.6): ''' Name normalization ------------------ @@ -751,9 +946,22 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if not name: continue replacement = replace_name_prefixes(replace_name_suffixes(name)) - if replacement != name and random.random() < 0.6: + if replacement != name and random.random() < replacement_prob: address_components[component] = replacement + def replace_names(self, address_components): + ''' + Name replacements + ----------------- + + Make a few special replacements (like UK instead of GB) + ''' + for component, value in address_components.iteritems(): + replacement, prob = RANDOM_VALUE_REPLACEMENTS.get(component, {}).get(value, (None, 0.0)) + if replacement is not None and random.random() < prob: + address_components[component] = replacement + + def prune_duplicate_names(self, address_components): ''' Name deduping ------------- @@ -764,7 +972,8 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood name_components = defaultdict(list) - for component in (AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB): + for component in (AddressFormatter.CITY, AddressFormatter.STATE_DISTRICT, + AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB): name = address_components.get(component) if name: name_components[name].append(component) @@ -774,7 +983,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood for component in components[1:]: address_components.pop(component, None) - + def cleanup_house_number(self, address_components): ''' House number cleanup -------------------- @@ -796,8 +1005,161 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood 
else: address_components.pop(AddressFormatter.HOUSE_NUMBER, None) + def expanded_address_components(self, value): + try: + latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) + except Exception: + return None, None, None + + country, candidate_languages, language_props = country_and_languages(self.language_rtree, latitude, longitude) + if not (country and candidate_languages): + return None, None, None + + for key in OSM_IGNORE_KEYS: + _ = value.pop(key, None) + + language = None + + more_than_one_official_language = len(candidate_languages) > 1 + + language = self.pick_language(value, candidate_languages) + + address_components = self.normalize_address_components(value) + + address_country, non_local_language = self.country_name(address_components, country, language) + if address_country: + address_components[AddressFormatter.COUNTRY] = address_country + + address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, state_full_name_prob=1.0) + if address_state: + address_components[AddressFormatter.STATE] = address_state + + osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language) + + self.add_osm_boundaries(address_components, country, language, latitude, longitude, + non_local_language=non_local_language, + osm_suffix=osm_suffix) + + city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language) + if city: + address_components[AddressFormatter.CITY] = city + + self.add_neighborhoods(address_components, latitude, longitude, + osm_suffix=osm_suffix) + + street = address_components.get(AddressFormatter.ROAD) + if street: + address_components[AddressFormatter.ROAD] = self.abbreviated_street(street, language) + + self.normalize_names(address_components) + + self.replace_names(address_components) + + self.prune_duplicate_names(address_components) + + self.cleanup_house_number(address_components) + + return 
address_components, country, language + + def limited_address_components(self, value): + try: + latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) + except Exception: + return None, None, None + + country, candidate_languages, language_props = country_and_languages(self.language_rtree, latitude, longitude) + if not (country and candidate_languages): + return None, None, None + + remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS + + for key in remove_keys: + _ = value.pop(key, None) + + language = None + + more_than_one_official_language = len(candidate_languages) > 1 + + language = self.pick_language(value, candidate_languages) + + address_components = self.normalize_address_components(value) + + address_country, non_local_language = self.country_name(address_components, country, language, + use_country_code_prob=0.0, + local_language_name_prob=1.0, + random_language_name_prob=0.0, + alpha_3_iso_code_prob=0.0) + if address_country: + address_components[AddressFormatter.COUNTRY] = address_country + + address_state = self.state_name(address_components, non_local_language) + if address_state: + address_components[AddressFormatter.STATE] = address_state + + street = address_components.get(AddressFormatter.ROAD) + if street: + address_components[AddressFormatter.ROAD] = self.abbreviated_street(street, language) + + osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language) + + self.add_osm_boundaries(address_components, country, language, latitude, longitude, + osm_suffix=osm_suffix, + non_local_language=non_local_language, + random_key=False, + alpha_3_iso_code_prob=0.0, + alpha_2_iso_code_prob=0.0, + replace_with_non_local_prob=0.0, + expand_state_prob=1.0) + + city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language) + + if city: + address_components[AddressFormatter.CITY] = city + + self.add_neighborhoods(address_components, 
latitude, longitude, + osm_suffix=osm_suffix) + + self.normalize_names(address_components) + + self.prune_duplicate_names(address_components) + + return address_components, country, language + + def formatted_addresses(self, value, dropout_prob=0.5, tag_components=True): + ''' + Formatted addresses + ------------------- + + Produces one or more formatted addresses (tagged/untagged) + from the given dictionary of OSM tags and values. + + Here we also apply component dropout meaning we produce several + different addresses with various components removed at random. + That way the parser will have many examples of queries that are + just city/state or just house_number/street. The selected + components still have to make sense i.e. a lone house_number will + not be used without a street name. The dependencies are listed + above, see: OSM_ADDRESS_COMPONENTS. + + If there is more than one venue name (say name and alt_name), + addresses using both names and the selected components are + returned. 
+        '''
+
+        venue_names = self.venue_names(value) or []
+
+        address_components, country, language = self.expanded_address_components(value)
+
+        if not address_components:
+            return None, None, None
+
+        for venue_name in venue_names:
+            abbreviated_venue = self.abbreviated_venue_name(venue_name, language)
+            if abbreviated_venue != venue_name and abbreviated_venue not in set(venue_names):
+                venue_names.append(abbreviated_venue)
+
         # Version with all components
-        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)
+        formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)
 
         if tag_components:
             formatted_addresses = []
@@ -807,7 +1169,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
 
             address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES}
             if not address_components:
-                continue
+                return None, None, None  # no usable component: same 3-tuple as the other "nothing" exit, so callers can unpack it
 
             current_components = address_components.keys()
             random.shuffle(current_components)
@@ -815,36 +1177,159 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
             component_set = component_bitset(address_components.keys())
 
             for component in current_components:
-                if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
+                if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < dropout_prob:
                     address_components.pop(component)
                     component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                     if not address_components:
-                        break
+                        return formatted_addresses, country, language  # every component was dropped: stop and return what has been collected
 
                     # Since venue names are 1-per-record, we must use them all
                     for venue_name in (venue_names or [None]):
                         if venue_name and AddressFormatter.HOUSE in address_components:
                             address_components[AddressFormatter.HOUSE] = venue_name
-                        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
-                        if formatted_address not in seen:
+                        formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
+                        if formatted_address and formatted_address not in seen:
                             formatted_addresses.append(formatted_address)
                             seen.add(formatted_address)
+            return formatted_addresses, country, language
+        else:
+            formatted_addresses = []
+            seen = set()
+            # Since venue names are 1-per-record, we must use them all
+            for venue_name in (venue_names or [None]):
+                if venue_name:
+                    address_components[AddressFormatter.HOUSE] = venue_name
+                formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
+                if formatted_address and formatted_address not in seen:
+                    formatted_addresses.append(formatted_address)
+                    seen.add(formatted_address)
+            return formatted_addresses, country, language
+
+    def formatted_address_limited(self, value, admin_dropout_prob=0.7):  # -> (formatted_address, country, language) or (None, None, None)
+        address_components, country, language = self.limited_address_components(value)
+
+        if not address_components:
+            return None, None, None
+
+        formatted_addresses = []  # NOTE(review): never read in this method
+
+        address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES}
+        if not address_components:
+            return None, None, None  # keep the 3-tuple shape that build_limited_training_data unpacks
+
+        current_components = address_components.keys()
+        random.shuffle(current_components)  # NOTE(review): shuffled list is unused below; this only advances the RNG state
+
+        for component in (AddressFormatter.COUNTRY, AddressFormatter.STATE,
+                          AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY,
+                          AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB):
+            if random.random() < admin_dropout_prob:  # each listed component is dropped independently with this probability
+                _ = address_components.pop(component, None)
+
+        if not address_components:
+            return None, None, None
+
+        # Version with all components
+        formatted_address = self.formatter.format_address(country, address_components, tag_components=False, minimal_only=False)
+
+        return formatted_address, country, language
+
+    def 
build_training_data(self, infile, out_dir, tag_components=True):
+        '''
+        Creates formatted address training data for supervised sequence labeling (or potentially
+        for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.
+
+        Example:
+
+        cs cz Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country
+
+        The field structure is similar to other training data created by this script i.e.
+        {language, country, data}. The data field here is a sequence of labeled tokens similar
+        to what we might see in part-of-speech tagging.
+
+
+        This format uses a special character "|" to denote possible breaks in the input (comma, newline).
+
+        Note that for the address parser, we'd like it to be robust to many different types
+        of input, so we may selectively eliminate components
+
+        This information can potentially be used downstream by the sequence model as these
+        breaks may be present at prediction time.
+
+        Example:
+
+        sr rs Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic
+
+        This may be useful in learning word representations, statistical phrases, morphology
+        or other models requiring only the sequence of words.
+        '''
+        i = 0
+
+        if tag_components:
+            formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w')  # NOTE(review): handle is never closed in this method
+            writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
+        else:
+            formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w')  # NOTE(review): handle is never closed in this method
+            writer = csv.writer(formatted_file, 'tsv_no_quote')
+
+        for node_id, value, deps in parse_osm(infile):
+            formatted_addresses, country, language = self.formatted_addresses(value, tag_components=tag_components)
+            if not formatted_addresses:
+                continue
+
             for formatted_address in formatted_addresses:
                 if formatted_address and formatted_address.strip():
                     formatted_address = tsv_string(formatted_address)
                     if not formatted_address or not formatted_address.strip():
                         continue
-                    row = (language, country, formatted_address)
+
+                    if tag_components:
+                        row = (language, country, formatted_address)
+                    else:
+                        row = (formatted_address,)  # one-field row; writerow() on a bare str would emit one column per character
 
                     writer.writerow(row)
-        elif formatted_address and formatted_address.strip():
-            formatted_address = tsv_string(formatted_address)
-            writer.writerow([formatted_address])
-        i += 1
-        if i % 1000 == 0 and i > 0:
-            print 'did', i, 'formatted addresses'
+            i += 1
+            if i % 1000 == 0 and i > 0:
+                print('did {} formatted addresses'.format(i))
+
+    def build_limited_training_data(self, infile, out_dir):
+        '''
+        Creates a special kind of formatted address training data from OSM's addr:* tags
+        but are designed for use in language classification. These records are similar
+        to the untagged formatted records but include the language and country
+        (suitable for concatenation with the rest of the language training data),
+        and remove several fields like country which usually do not contain helpful
+        information for classifying the language.
+
+        Example:
+
+        nb no Olaf Ryes Plass Oslo
+        '''
+        i = 0
+
+        f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w')  # NOTE(review): handle is never closed in this method
+        writer = csv.writer(f, 'tsv_no_quote')
+
+        for node_id, value, deps in parse_osm(infile):
+            formatted_address, country, language = self.formatted_address_limited(value)
+            if not formatted_address:  # None when no address could be built for this record
+                continue
+
+            if formatted_address.strip():
+                formatted_address = tsv_string(formatted_address.strip())
+                if not formatted_address or not formatted_address.strip():
+                    continue
+
+                row = (language, country, formatted_address)
+                writer.writerow(row)
+
+            i += 1
+            if i % 1000 == 0 and i > 0:
+                print('did {} formatted addresses'.format(i))
+
 
 NAME_KEYS = (
     'name',
@@ -875,93 +1360,6 @@ POSTAL_KEYS = (
 )
 
 
-def build_address_format_training_data_limited(language_rtree, infile, out_dir):
-    '''
-    Creates a special kind of formatted address training data from OSM's addr:* tags
-    but are designed for use in language classification. These records are similar
-    to the untagged formatted records but include the language and country
-    (suitable for concatenation with the rest of the language training data),
-    and remove several fields like country which usually do not contain helpful
-    information for classifying the language.
- - Example: - - nb no Olaf Ryes Plass Oslo - ''' - i = 0 - - # Simple whitespace splitter is all that's necessary - formatter = AddressFormatter(splitter=u' ') - - f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w') - writer = csv.writer(f, 'tsv_no_quote') - - remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + COUNTRY_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS - - country_keys_set = set(COUNTRY_KEYS) - - for key, value, deps in parse_osm(infile): - try: - latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) - except Exception: - continue - - have_country = False - - for k in remove_keys: - if k in country_keys_set and k in value: - have_country = True - _ = value.pop(k, None) - - if have_country and random.random() < 0.4: - have_country = False - elif not have_country and random.random() < 0.2: - have_country = True - - if not value: - continue - - country, name_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street') - if not name_language: - continue - - if have_country: - value['addr:country'] = u'' - - single_language = len(name_language) == 1 - - for lang, val in name_language.iteritems(): - if lang not in languages: - continue - - if have_country: - localized = language_country_names.get(lang, {}).get(country.upper()) - - if localized: - value['addr:country:{}'.format(lang)] = localized - - address_dict = value.copy() - for k in address_dict.keys(): - namespaced_val = u'{}:{}'.format(k, lang) - if namespaced_val in address_dict: - address_dict[k] = address_dict[namespaced_val] - elif not single_language: - address_dict.pop(k) - - if not address_dict: - continue - - formatted_address_untagged = formatter.format_address(country, address_dict, minimal_only=False, tag_components=False) - if formatted_address_untagged is not None: - formatted_address_untagged = tsv_string(formatted_address_untagged) - - writer.writerow((lang, country, formatted_address_untagged)) - - i += 1 - if i % 1000 == 0 and i > 0: - 
print 'did', i, 'formatted addresses'
-
-
-
 def build_toponym_training_data(language_rtree, infile, out_dir):
     '''
     Data set of toponyms by language and country which should assist
@@ -1054,7 +1452,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
                         continue
                     writer.writerow((k, country, tsv_string(s)))
             if i % 1000 == 0 and i > 0:
-                print 'did', i, 'toponyms'
+                print('did {} toponyms'.format(i))
             i += 1
 
     f.close()
@@ -1086,7 +1484,7 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
                 if k in languages:
                     writer.writerow((k, country, tsv_string(s)))
             if i % 1000 == 0 and i > 0:
-                print 'did', i, 'streets'
+                print('did {} streets'.format(i))
             i += 1
 
     f.close()
@@ -1124,7 +1522,7 @@ def build_venue_training_data(language_rtree, infile, out_dir):
                 if k in languages:
                     writer.writerow((k, country, safe_encode(venue_type), tsv_string(s)))
             if i % 1000 == 0 and i > 0:
-                print 'did', i, 'venues'
+                print('did {} venues'.format(i))
             i += 1
 
     f.close()
@@ -1192,6 +1590,7 @@ if __name__ == '__main__':
 
     init_country_names()
     init_languages()
+    init_disambiguation()
 
     language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir)
     osm_rtree = None
@@ -1221,7 +1620,7 @@ if __name__ == '__main__':
 
     if args.address_file and not args.format_only and not args.limited_addresses:
         build_address_training_data(language_rtree, args.address_file, args.out_dir)
-    elif args.address_file and not args.limited_addresses:
+    elif args.address_file:
         if osm_rtree is None:
             parser.error('--rtree-dir required for formatted addresses')
         elif neighborhoods_rtree is None:
@@ -1232,8 +1631,10 @@ if __name__ == '__main__':
             parser.error('--geonames-db required for formatted addresses')
 
     if args.address_file and args.format_only:
-        build_address_format_training_data(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, args.address_file, args.out_dir, tag_components=not args.untagged)
+        osm_formatter = OSMAddressFormatter(osm_rtree, language_rtree, 
neighborhoods_rtree, quattroshapes_rtree, geonames) + osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged) if args.address_file and args.limited_addresses: - build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir) + osm_formatter = OSMAddressFormatter(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + osm_formatter.build_limited_training_data(args.address_file, args.out_dir) if args.venues_file: build_venue_training_data(language_rtree, args.venues_file, args.out_dir)