[osm/parsing] Randomly replacing country codes with local and foreign language expansions as well as randomly expanding state abbreviations to make parser more robust to different input

2015-10-03 14:31:51 -04:00
parent 8920812055
commit 22efce7337
1 changed files with 64 additions and 1 deletions
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -55,6 +55,8 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))

 from geodata.language_id.disambiguation import *
+from geodata.language_id.sample import sample_random_language
+from geodata.states.state_abbreviatins import STATE_ABBREVIATIONS
 from geodata.language_id.polygon_lookup import country_and_languages
 from geodata.i18n.languages import *
 from geodata.address_formatting.formatter import AddressFormatter
@@ -489,6 +491,67 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
        # Version with all components
        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)

+        address_country = address_components.get(AddressFormatter.COUNTRY)
+
+        '''
+        Country names
+        -------------
+
+        In OSM, addr:country is almost always an ISO-3166 alpha-2 country code.
+        However, we'd like to expand these to include natural language forms
+        of the country names we might be likely to encounter in a geocoder or
+        handwritten address.
+
+        These splits are somewhat arbitrary but could potentially be fit to data
+        from OpenVenues or other sources on the usage of country name forms.
+
+        If the address includes a country, the selection procedure proceeds as follows:
+
+        1. With probability a, select the country name in the language of the address
+           (determined above), or the localized country name if the language is
+           undetermined or ambiguous.
+
+        2. With probability b(1-a), sample a language from the distribution of
+           languages on the Internet and use the country's name in that language.
+
+        3. This is implicit, but with probability (1-b)(1-a), keep the country code
+        '''
+
+        # 1. use the country name in the current language or the country's local language
+        if address_country and random.random() < 0.7:
+            localized = None
+            if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
+                localized = language_country_names.get(language, {}).get(address_country.upper())
+
+            if not localized:
+                localized = country_localized_display_name(address_country.lower())
+
+            if localized:
+                address_components[AddressFormatter.COUNTRY] = localized
+        # 2. country's name in a language sampled from the distribution of languages on the Internet
+        elif address_country and random.random() < 0.7:
+            lang = sample_random_language()
+            lang_country = language_country_names.get(lang, {}).get(address_country.upper())
+            if lang_country:
+                address_components[AddressFormatter.COUNTRY] = lang_country
+        # 3. Implicit: the rest of the time keep the country code
+
+        '''
+        States
+        ------
+
+        Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name
+        whereas we'd like to include both forms, so with some probability, replace the abbreviated
+        name with the unabbreviated one, e.g. CA => California
+        '''
+        address_state = address_components.get(AddressFormatter.STATE)
+
+        if address_state:
+            state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language)
+
+            if state_full_name and random.random() < 0.3:
+                address_components[AddressFormatter.STATE] = state_full_name
+
        if tag_components:
            formatted_addresses = []
            formatted_addresses.append(formatted_address)
@@ -500,7 +563,7 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
            current_components = component_bitset(address_components.keys())

            for component in address_components.keys():
-                if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() >= 0.5:
+                if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
                    address_components.pop(component)
                    current_components ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                    if not address_components: