[osm/parsing] Randomly replacing country codes with local and foreign language expansions as well as randomly expanding state abbreviations to make parser more robust to different input

2015-10-03 14:31:51 -04:00
parent 8920812055
commit 22efce7337
1 changed files with 64 additions and 1 deletions
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -55,6 +55,8 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 from geodata.language_id.disambiguation import *
 from geodata.language_id.sample import sample_random_language
 from geodata.states.state_abbreviatins import STATE_ABBREVIATIONS
 from geodata.language_id.polygon_lookup import country_and_languages
 from geodata.i18n.languages import *
 from geodata.address_formatting.formatter import AddressFormatter
@@ -489,6 +491,67 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
        # Version with all components
        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)
        address_country = address_components.get(AddressFormatter.COUNTRY)
        '''
        Country names
        -------------
        In OSM, addr:country is almost always an ISO-3166 alpha-2 country code.
        However, we'd like to expand these to include natural language forms
        of the country names we might be likely to encounter in a geocoder or
        handwritten address.
        These splits are somewhat arbitrary but could potentially be fit to data
        from OpenVenues or other sources on the usage of country name forms.
        If the address includes a country, the selection procedure proceeds as follows:
        1. With probability a, select the country name in the language of the address
           (determined above), or the localized country name if the language is
           undetermined or ambiguous.
        2. With probability b(1-a), sample a language from the distribution of
           languages on the Internet and use the country's name in that language.
        3. This is implicit, but with probability (1-b)(1-a), keep the country code
        '''
        # 1. use the country name in the current language or the country's local language
        if address_country and random.random() < 0.7:
            localized = None
            if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
                localized = language_country_names.get(language, {}).get(address_country.upper())
            if not localized:
                localized = country_localized_display_name(address_country.lower())
            if localized:
                address_components[AddressFormatter.COUNTRY] = localized
        # 2. country's name in a language sampled from the distribution of languages on the Internet
        elif address_country and random.random() < 0.7:
            lang = sample_random_language()
            lang_country = language_country_names.get(lang, {}).get(address_country.upper())
            if lang_country:
                address_components[AddressFormatter.COUNTRY] = lang_country
        # 3. Implicit: the rest of the time keep the country code
        '''
        States
        ------
        Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name
        whereas we'd like to include both forms, so with some probability, replace the abbreviated
        name with the unabbreviated one e.g. CA => California
        '''
        address_state = address_components.get(AddressFormatter.STATE)
        if address_state:
            state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language)
            if state_full_name and random.random() < 0.3:
                address_components[AddressFormatter.STATE] = state_full_name
        if tag_components:
            formatted_addresses = []
            formatted_addresses.append(formatted_address)
@@ -500,7 +563,7 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
            current_components = component_bitset(address_components.keys())
            for component in address_components.keys():
-                if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() >= 0.5:
+                if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
                    address_components.pop(component)
                    current_components ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                    if not address_components: