From 22efce73374b2189fca1c0041dba87fc56616048 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 3 Oct 2015 14:31:51 -0400 Subject: [PATCH] [osm/parsing] Randomly replacing country codes with local and foreign language expansions as well as randomly expanding state abbreviations to make parser more robust to different input --- .../geodata/osm/osm_address_training_data.py | 65 ++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 2e9ff504..a5b2feee 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -55,6 +55,8 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python'))) from geodata.language_id.disambiguation import * +from geodata.language_id.sample import sample_random_language +from geodata.states.state_abbreviatins import STATE_ABBREVIATIONS from geodata.language_id.polygon_lookup import country_and_languages from geodata.i18n.languages import * from geodata.address_formatting.formatter import AddressFormatter @@ -489,6 +491,67 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp # Version with all components formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components) + address_country = address_components.get(AddressFormatter.COUNTRY) + + ''' + Country names + ------------- + + In OSM, addr:country is almost always an ISO-3166 alpha-2 country code. + However, we'd like to expand these to include natural language forms + of the country names we might be likely to encounter in a geocoder or + handwritten address. + + These splits are somewhat arbitrary but could potentially be fit to data + from OpenVenues or other sources on the usage of country name forms. + + If the address includes a country, the selection procedure proceeds as follows: + + 1. With probability a, select the country name in the language of the address + (determined above), or with the localized country name if the language is + undtermined or ambiguous. + + 2. With probability b(1-a), sample a language from the distribution of + languages on the Internet and use the country's name in that language. + + 3. This is implicit, but with probability (1-b)(1-a), keep the country code + ''' + + # 1. use the country name in the current language or the country's local language + if address_country and random.random() < 0.7: + localized = None + if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): + localized = language_country_names.get(language, {}).get(address_country.upper()) + + if not localized: + localized = country_localized_display_name(address_country.lower()) + + if localized: + address_components[AddressFormatter.COUNTRY] = localized + # 2. country's name in a language samples from the distribution of languages on the Internet + elif address_country and random.random() < 0.7: + lang = sample_random_language() + lang_country = language_country_names.get(lang, {}).get(address_country.upper()) + if lang_country: + address_components[AddressFormatter.COUNTRY] = lang_country + # 3. Implicit: the rest of the time keep the country code + + ''' + States + ------ + + Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name + whereas we'd like to include both forms, so wtih some probability, replace the abbreviated + name with the unabbreviated one e.g. CA => California + ''' + address_state = address_components.get(AddressFormatter.STATE) + + if address_state: + state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language) + + if state_full_name and random.random() < 0.3: + address_components[AddressFormatter.STATE] = state_full_name + if tag_components: formatted_addresses = [] formatted_addresses.append(formatted_address) @@ -500,7 +563,7 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp current_components = component_bitset(address_components.keys()) for component in address_components.keys(): - if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() >= 0.5: + if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5: address_components.pop(component) current_components ^= OSM_ADDRESS_COMPONENT_VALUES[component] if not address_components: