[addresses] wrapping up some of the functionality from OSM formatter to be used in on an arbitrary address component dictionary

2016-04-28 23:47:18 -04:00
parent a94debc4ed
commit fa99b4ce77
1 changed files with 662 additions and 0 deletions
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -0,0 +1,662 @@
+import pycountry
+import random
+
+from collections import defaultdict
+
+from geodata.address_formatting.formatter import AddressFormatter
+
+from geodata.addresses.floors import Floor
+from geodata.addresses.units import Unit
+from geodata.countries.country_names import *
+from geodata.language_id.disambiguation import *
+from geodata.language_id.sample import sample_random_language
+from geodata.names.normalization import replace_name_prefixes, replace_name_suffixes
+from geodata.osm.extract import osm_address_components
+from geodata.states.state_abbreviations import STATE_ABBREVIATIONS, STATE_EXPANSIONS
+
+
+class AddressExpander(object):
+    '''
+    This class, while it has a few dependencies, exposes a simple method
+    for transforming geocoded input addresses (usually a lat/lon with either
+    a name or house number + street name) into the sorts of examples used by
+    libpostal's address parser. The dictionaries produced here can be fed
+    directly to AddressFormatter.format_address to produce training examples.
+
+    There are several steps in expanding an address including reverse geocoding
+    to polygons, disambiguating which language the address uses, stripping standard 
+    prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen".
+
+    Usage:
+    >>> expander = AddressExpander(osm_admin_rtree, language_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames)
+    >>> expander.expanded_address_components({'name': 'Hackney Empire'}, 51.54559, -0.05567)
+
+    Returns (results vary because of randomness):
+
+    ({'city': u'London',
+      'city_district': u'London Borough of Hackney',
+      'country': 'UK',
+      'name': 'Hackney Empire',
+      'state': u'England',
+      'state_district': u'Greater London'},
+     u'gb',
+     u'en')
+
+    '''
+    alpha3_codes = {c.alpha2: c.alpha3 for c in pycountry.countries}
+
+    rare_components = {
+        AddressFormatter.SUBURB,
+        AddressFormatter.CITY_DISTRICT,
+        AddressFormatter.STATE_DISTRICT,
+        AddressFormatter.STATE,
+    }
+
+    BOUNDARY_COMPONENTS = (
+        AddressFormatter.SUBURB,
+        AddressFormatter.CITY_DISTRICT,
+        AddressFormatter.CITY,
+        AddressFormatter.STATE_DISTRICT,
+        AddressFormatter.STATE
+    )
+
+    # List of places where it's much more common to use city, state than city, country
+    state_important = {
+        'US',
+        'CA',
+    }
+
+    RANDOM_VALUE_REPLACEMENTS = {
+        # Key: address component
+        AddressFormatter.COUNTRY: {
+            # value: (replacement, probability)
+            'GB': ('UK', 0.3),
+            'United Kingdom': ('UK', 0.3),
+        }
+    }
+
+    def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames, formatter=None):
+        '''
+        Store the indices and lookups used by the expansion methods.
+
+        osm_admin_rtree: index queried with point_in_poly() for admin polygons
+        language_rtree: index queried with country_and_languages()
+        neighborhoods_rtree: index queried with point_in_poly() for neighborhoods
+        buildings_rtree: stored as-is (not read by the methods in this class)
+        subdivisions_rtree: stored as-is (not read by the methods in this class)
+        quattroshapes_rtree: index queried with point_in_poly(); also provides
+                             the LEVEL, LOCALITY and GEONAMES_ID constants
+        geonames: lookup queried with get_alternate_names()
+        formatter: optional object exposing `aliases` and `replace_aliases()`;
+                   normalize_address_components reads it as self.formatter, so
+                   it must be supplied before that method (or
+                   limited_address_components) is used
+        '''
+        self.osm_admin_rtree = osm_admin_rtree
+        self.language_rtree = language_rtree
+        self.neighborhoods_rtree = neighborhoods_rtree
+        self.subdivisions_rtree = subdivisions_rtree
+        self.buildings_rtree = buildings_rtree
+        self.quattroshapes_rtree = quattroshapes_rtree
+        self.geonames = geonames
+        # Previously never assigned, so any access to self.formatter failed
+        self.formatter = formatter
+
+    def strip_keys(self, value, ignore_keys):
+        '''
+        Delete every key listed in ignore_keys from the dict `value`, in place.
+        Keys that are not present are silently skipped. Returns None.
+        '''
+        for unwanted in ignore_keys:
+            if unwanted in value:
+                del value[unwanted]
+
+    def osm_reverse_geocoded_components(self, country, latitude, longitude):
+        '''
+        Group the properties of the OSM admin polygons returned for a point
+        by address component.
+
+        Polygons without a 'name' are ignored. For each remaining polygon,
+        every (tag, value) pair is passed to osm_address_components.get_component;
+        whenever that returns a component, the polygon's full property dict is
+        appended to that component's list (so a polygon can appear more than
+        once if several of its tags resolve to a component).
+
+        Returns a defaultdict(list) of component => [properties, ...]
+        '''
+        grouped = defaultdict(list)
+        containing = self.osm_admin_rtree.point_in_poly(latitude, longitude, return_all=True)
+
+        for props in containing:
+            if props.get('name'):
+                for tag, tag_value in props.iteritems():
+                    component = osm_address_components.get_component(country, tag, tag_value)
+                    if component:
+                        grouped[component].append(props)
+
+        return grouped
+
+    def address_language(self, components, candidate_languages):
+        '''
+        Language
+        --------
+
+        With exactly one candidate language, that language is returned
+        without looking at the address at all.
+
+        With any other number of candidates (Belgium, Hong Kong, Wales, the US
+        in Spanish-speaking regions, etc.) the road name is needed: it is
+        handed to disambiguate_language together with the (lang, default)
+        pair of every candidate, and whatever that returns is the result.
+
+        If there is no road component to disambiguate with, the result is
+        UNKNOWN_LANGUAGE.
+        '''
+        if len(candidate_languages) == 1:
+            return candidate_languages[0]['lang']
+
+        road = components.get(AddressFormatter.ROAD, None)
+        if road is None:
+            return UNKNOWN_LANGUAGE
+
+        lang_defaults = [(candidate['lang'], candidate['default']) for candidate in candidate_languages]
+        return disambiguate_language(road, lang_defaults)
+
+    def pick_random_name_key(self, suffix=''):
+        '''
+        Randomly choose which OSM name tag to read.
+
+        One random draw selects the base tag:
+            name           70% of the time
+            short_name     10% of the time
+            official_name  10% of the time
+            alt_name       10% of the time
+
+        Returns (key, raw_key) where raw_key is the base tag and key is the
+        base tag with `suffix` appended (e.g. 'name:fr' for suffix=':fr').
+        '''
+        draw = random.random()
+
+        if draw < 0.7:
+            raw_key = 'name'
+        elif draw < 0.8:
+            raw_key = 'short_name'
+        elif draw < 0.9:
+            raw_key = 'official_name'
+        else:
+            raw_key = 'alt_name'
+
+        return raw_key + suffix, raw_key
+
+    def contains_multiple_place_names(self):
+        '''
+        Placeholder: this method was declared without a body (which made the
+        whole module fail to parse). It is kept as an explicit stub until the
+        intended check is implemented.
+        '''
+        raise NotImplementedError('contains_multiple_place_names is not implemented')
+
+    def normalize_address_components(self, value):
+        '''
+        Build a new dict holding only the entries of `value` whose key is
+        in self.formatter.aliases, run self.formatter.replace_aliases on that
+        new dict, and return it. `value` itself is not modified here.
+        '''
+        kept = {}
+        for key, val in value.iteritems():
+            if key in self.formatter.aliases:
+                kept[key] = val
+
+        self.formatter.replace_aliases(kept)
+        return kept
+
+    def country_name(self, address_components, country_code, language,
+                     use_country_code_prob=0.3,
+                     local_language_name_prob=0.6,
+                     random_language_name_prob=0.1,
+                     alpha_3_iso_code_prob=0.1,
+                     ):
+        '''
+        Country names
+        -------------
+
+        In OSM, addr:country is almost always an ISO-3166 alpha-2 country code.
+        However, we'd like to expand these to include natural language forms
+        of the country names we might be likely to encounter in a geocoder or
+        handwritten address.
+
+        These splits are somewhat arbitrary but could potentially be fit to data
+        from OpenVenues or other sources on the usage of country name forms.
+
+        The selection procedure proceeds as follows:
+
+        0. Start from the address's existing country component, if any. With
+           probability use_country_code_prob, replace it with the upper-cased
+           country_code argument.
+
+        If there is a country value at that point, one further random draw
+        picks exactly one of the following:
+
+        1. With probability local_language_name_prob, use the country name in the
+           language of the address, or the localized display name if the language
+           is undetermined/ambiguous or has no entry for this country.
+
+        2. With probability random_language_name_prob, sample a language with
+           sample_random_language() and use the country's name in that language
+           if one is available. The sampled language is returned as
+           non_local_language even when no name was found for it.
+
+        3. With probability alpha_3_iso_code_prob, use the alpha-3 code looked
+           up in self.alpha3_codes, if there is one for the current value.
+
+        4. Otherwise keep the value unchanged.
+
+        Returns a tuple (address_country, non_local_language). address_country
+        may be None; non_local_language is None unless step 2 was selected.
+        address_components is only read, never modified.
+        '''
+
+        non_local_language = None
+
+        address_country = address_components.get(AddressFormatter.COUNTRY)
+
+        if random.random() < use_country_code_prob:
+            # With probability use_country_code_prob (default 30%): use the
+            # reverse-geocoded country code, overriding any existing value
+            address_country = country_code.upper()
+
+        # A single draw shared by the branches below, so they are mutually exclusive
+        r = random.random()
+
+        # 1. 60% of the time: use the country name in the current language or the country's local language
+        if address_country and r < local_language_name_prob:
+            localized = None
+            if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
+                localized = language_country_names.get(language, {}).get(address_country.upper())
+
+            if not localized:
+                localized = country_localized_display_name(address_country.lower())
+
+            if localized:
+                address_country = localized
+        # 2. 10% of the time: country's name in a language sampled from the distribution of languages on the Internet
+        elif address_country and r < local_language_name_prob + random_language_name_prob:
+            non_local_language = sample_random_language()
+            lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper())
+            if lang_country:
+                address_country = lang_country
+        # 3. 10% of the time: use the country's alpha-3 ISO code
+        elif address_country and r < local_language_name_prob + random_language_name_prob + alpha_3_iso_code_prob:
+            # NOTE(review): the lookup uses address_country as-is; alpha3_codes is
+            # keyed by pycountry's alpha2 values - confirm the case always matches
+            iso_code_alpha3 = self.alpha3_codes.get(address_country)
+            if iso_code_alpha3:
+                address_country = iso_code_alpha3
+        # 4. Implicit: the rest of the time keep the value as it is
+
+        return address_country, non_local_language
+
+    def state_name(self, address_components, country, language, non_local_language=None, state_full_name_prob=0.4):
+        '''
+        States
+        ------
+
+        Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name
+        whereas we'd like to include both forms, so with some probability, replace the abbreviated
+        name with the unabbreviated one e.g. CA => California
+
+        When a non-local language is in use, the state is instead removed from
+        address_components and None is returned.
+
+        Returns the state value to use (possibly unchanged, possibly None).
+        '''
+        state = address_components.get(AddressFormatter.STATE)
+        if not state:
+            return state
+
+        if non_local_language:
+            # Drop the state rather than mix it with a foreign-language country name
+            address_components.pop(AddressFormatter.STATE, None)
+            return None
+
+        if country:
+            by_language = STATE_ABBREVIATIONS.get(country.upper(), {}).get(state.upper(), {})
+            full_name = by_language.get(language)
+            if full_name and random.random() < state_full_name_prob:
+                return full_name
+
+        return state
+
+    def tag_suffix(self, language, non_local_language, more_than_one_official_language=False):
+        '''
+        Build the language suffix appended to OSM name tags (e.g. ':fr').
+
+        A non-local language always wins. Otherwise the address language is
+        used only when there is more than one official language and the
+        language is neither ambiguous nor unknown. In every other case the
+        suffix is the empty string.
+        '''
+        if non_local_language is not None:
+            chosen = non_local_language
+        elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
+            chosen = language
+        else:
+            return ''
+
+        return ':{}'.format(chosen)
+
+    def add_admin_boundaries(self, address_components,
+                             country, language,
+                             latitude, longitude,
+                             osm_suffix='',
+                             non_local_language=None,
+                             random_key=True,
+                             alpha_3_iso_code_prob=0.1,
+                             alpha_2_iso_code_prob=0.2,
+                             simple_country_key_prob=0.4,
+                             replace_with_non_local_prob=0.4,
+                             join_state_district_prob=0.5,
+                             expand_state_prob=0.7
+                             ):
+        '''
+        OSM boundaries
+        --------------
+
+        For many addresses, the city, district, region, etc. are all implicitly
+        generated by the reverse geocoder e.g. we do not need an addr:city tag
+        to identify that 40.74, -74.00 is in New York City as well as its parent
+        geographies (New York county, New York state, etc.)
+
+        Where possible we augment the addr:* tags with some of the reverse-geocoded
+        relations from OSM.
+
+        Since addresses found on the web may have the same properties, we
+        include these qualifiers in the training data.
+
+        address_components is modified in place; nothing is returned.
+
+        osm_suffix: language suffix appended to 'name' when reading polygon names
+        non_local_language: when set, existing components may be overwritten
+                            (see replace_with_non_local_prob)
+        random_key: if True, pick_random_name_key chooses the name tag once per
+                    component; otherwise the suffixed 'name' tag is used
+        alpha_3_iso_code_prob / alpha_2_iso_code_prob / simple_country_key_prob:
+                    cumulative thresholds for the alternative country forms
+        replace_with_non_local_prob: chance of overwriting an existing component
+                                     when non_local_language is set
+        join_state_district_prob: chance of joining several state districts with ', '
+        expand_state_prob: chance of passing the state through STATE_EXPANSIONS
+        '''
+
+        osm_components = self.osm_reverse_geocoded_components(country, latitude, longitude)
+
+        name_key = ''.join(('name', osm_suffix))
+        raw_name_key = 'name'
+        simple_name_key = 'name:simple'
+        international_name_key = 'int_name'
+
+        iso_code_key = 'ISO3166-1:alpha2'
+        iso_code3_key = 'ISO3166-1:alpha3'
+
+        if osm_components:
+            # component => candidate names gathered from the containing polygons
+            poly_components = defaultdict(list)
+
+            existing_city_name = address_components.get(AddressFormatter.CITY)
+
+            for component, components_values in osm_components.iteritems():
+                # Dedupes names within this component only (reset per component)
+                seen = set()
+
+                # The name tag is chosen once per component, not once per polygon
+                if random_key:
+                    key, raw_key = self.pick_random_name_key(suffix=osm_suffix)
+                else:
+                    key, raw_key = name_key, raw_name_key
+
+                for component_value in components_values:
+                    # Drawn for every polygon, though only the country branch reads it
+                    r = random.random()
+                    name = None
+
+                    if component == AddressFormatter.COUNTRY:
+                        if iso_code3_key in component_value and r < alpha_3_iso_code_prob:
+                            name = component_value[iso_code3_key]
+                        elif iso_code_key in component_value and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob:
+                            name = component_value[iso_code_key]
+                        elif language == 'en' and not non_local_language and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob + simple_country_key_prob:
+                            # Particularly to address the US (prefer United States,
+                            # not United States of America) but may capture variations
+                            # in other English-speaking countries as well.
+                            if simple_name_key in component_value:
+                                name = component_value[simple_name_key]
+                            elif international_name_key in component_value:
+                                name = component_value[international_name_key]
+
+                    if not name:
+                        name = component_value.get(key, component_value.get(raw_key))
+
+                    # Fall back to the plain (suffixed) name if the chosen tag is missing
+                    # or merely repeats the address's existing city in a non-city component
+                    if not name or (component != AddressFormatter.CITY and name == existing_city_name):
+                        name = component_value.get(name_key, component_value.get(raw_name_key))
+
+                    # Still nothing usable, or still a duplicate of the city: skip this polygon
+                    if not name or (component != AddressFormatter.CITY and name == existing_city_name):
+                        continue
+
+                    if (component, name) not in seen:
+                        poly_components[component].append(name)
+                        seen.add((component, name))
+
+            for component, vals in poly_components.iteritems():
+                # Existing components are kept unless a non-local language is in
+                # use and the replacement draw succeeds
+                if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
+                    if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob:
+                        # Join a random-length prefix (at least one) of the collected names
+                        num = random.randrange(1, len(vals) + 1)
+                        val = u', '.join(vals[:num])
+                    else:
+                        val = random.choice(vals)
+
+                    if component == AddressFormatter.STATE and random.random() < expand_state_prob:
+                        # Values with no entry in STATE_EXPANSIONS are left unchanged
+                        val = STATE_EXPANSIONS.get(country.upper(), {}).get(val, val)
+                    address_components[component] = val
+
+    def quattroshapes_city(self, address_components,
+                           latitude, longitude,
+                           language, non_local_language=None,
+                           qs_add_city_prob=0.2,
+                           abbreviated_name_prob=0.1):
+        '''
+        Quattroshapes/GeoNames cities
+        -----------------------------
+
+        Quattroshapes isn't great for everything, but it has decent city boundaries
+        in places where OSM sometimes does not (or at least in places where we aren't
+        currently able to create valid polygons). While Quattroshapes itself doesn't
+        reliably use local names, which we'll want for consistency, Quattroshapes cities
+        are linked with GeoNames, which has per-language localized names for most places.
+
+        The lookup is attempted when a non-local language is in use, or, with
+        probability qs_add_city_prob, when the address has no city yet.
+
+        Returns the first usable (non-blank) city name found, or None. When no
+        name is found for a non-local language and the address already has a
+        city plus a city district or suburb, the existing city is removed from
+        address_components.
+        '''
+
+        if not (non_local_language or (AddressFormatter.CITY not in address_components and random.random() < qs_add_city_prob)):
+            return None
+
+        lang = non_local_language or language
+        rtree = self.quattroshapes_rtree
+
+        for result in rtree.point_in_poly(latitude, longitude, return_all=True):
+            if result.get(rtree.LEVEL) != rtree.LOCALITY or rtree.GEONAMES_ID not in result:
+                continue
+
+            # The id field may hold a comma-separated list; only the first id is used
+            geonames_id = int(result[rtree.GEONAMES_ID].split(',')[0])
+            names = self.geonames.get_alternate_names(geonames_id)
+
+            if not names or lang not in names:
+                continue
+
+            candidate = None
+            if 'abbr' not in names or non_local_language:
+                # Use the common city name in the target language
+                candidate = names[lang][0][0]
+            elif random.random() < abbreviated_name_prob:
+                # Use an abbreviation: NYC, BK, SF, etc.
+                candidate = random.choice(names['abbr'])[0]
+
+            if candidate and candidate.strip():
+                return candidate
+
+        # No polygon produced a usable name. (This used to be a for/else whose
+        # `break` was unreachable after `return`, so it always ran here.)
+        if non_local_language and AddressFormatter.CITY in address_components and (
+                AddressFormatter.CITY_DISTRICT in address_components or
+                AddressFormatter.SUBURB in address_components):
+            address_components.pop(AddressFormatter.CITY)
+
+        # Never hand back a rejected (blank) candidate from the last polygon
+        return None
+
+    def add_neighborhoods(self, address_components,
+                          latitude, longitude,
+                          osm_suffix='',
+                          add_prefix_prob=0.5,
+                          add_neighborhood_prob=0.5):
+        '''
+        Neighborhoods
+        -------------
+
+        In some cities, neighborhoods may be included in a free-text address.
+
+        OSM includes many neighborhoods but only as points, rather than the polygons
+        needed to perform reverse-geocoding. We use a hybrid index containing
+        Quattroshapes/Zetashapes polygons matched fuzzily with OSM names (which are
+        on the whole of better quality).
+
+        Each neighborhood containing the point is filed under CITY_DISTRICT
+        (place == 'borough' or polygon_type == 'local_admin') or SUBURB
+        (everything else). For each level not already present in
+        address_components, the first collected name is added with
+        probability add_neighborhood_prob.
+
+        address_components is modified in place; nothing is returned.
+        '''
+
+        neighborhoods = self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
+        neighborhood_levels = defaultdict(list)
+
+        name_key = ''.join(('name', osm_suffix))
+        raw_name_key = 'name'
+
+        for neighborhood in neighborhoods:
+            place_type = neighborhood.get('place')
+            polygon_type = neighborhood.get('polygon_type')
+
+            key, raw_key = self.pick_random_name_key(suffix=osm_suffix)
+            name = neighborhood.get(key, neighborhood.get(raw_key))
+
+            if not name:
+                # The randomly chosen tag is missing: fall back to the plain name
+                name = neighborhood.get(name_key, neighborhood.get(raw_name_key))
+
+                name_prefix = neighborhood.get('name:prefix')
+
+                # Only prefix a name that exists; joining a prefix with a
+                # missing name used to raise TypeError
+                if name and name_prefix and random.random() < add_prefix_prob:
+                    name = u' '.join([name_prefix, name])
+
+            if not name:
+                continue
+
+            neighborhood_level = AddressFormatter.SUBURB
+
+            if place_type == 'borough' or polygon_type == 'local_admin':
+                neighborhood_level = AddressFormatter.CITY_DISTRICT
+
+                # Optimization so we don't use e.g. Brooklyn multiple times
+                city_name = address_components.get(AddressFormatter.CITY)
+                if name == city_name:
+                    name = neighborhood.get(name_key, neighborhood.get(raw_name_key))
+                    if not name or name == city_name:
+                        continue
+
+            neighborhood_levels[neighborhood_level].append(name)
+
+        for component, level_names in neighborhood_levels.items():
+            if component not in address_components and random.random() < add_neighborhood_prob:
+                address_components[component] = level_names[0]
+
+    def normalize_names(self, address_components, replacement_prob=0.6):
+        '''
+        Name normalization
+        ------------------
+
+        For each boundary component present in the address, compute the name
+        with standard suffixes and then prefixes replaced (e.g. "London Borough of")
+        and, if that differs from the current name, store it with probability
+        replacement_prob. Modifies address_components in place.
+        '''
+        for component in self.BOUNDARY_COMPONENTS:
+            original = address_components.get(component)
+            if original:
+                without_suffix = replace_name_suffixes(original)
+                stripped = replace_name_prefixes(without_suffix)
+                if stripped != original and random.random() < replacement_prob:
+                    address_components[component] = stripped
+
+    def replace_names(self, address_components):
+        '''
+        Name replacements
+        -----------------
+
+        Apply the special-case substitutions listed in RANDOM_VALUE_REPLACEMENTS
+        (like UK instead of GB), each with its own configured probability.
+        Only values of existing keys are changed, in place.
+        '''
+        for component, current in address_components.iteritems():
+            candidates = self.RANDOM_VALUE_REPLACEMENTS.get(component, {})
+            substitute, chance = candidates.get(current, (None, 0.0))
+            if substitute is None:
+                continue
+            if random.random() < chance:
+                address_components[component] = substitute
+
+    def prune_duplicate_names(self, address_components):
+        '''
+        Name deduping
+        -------------
+
+        For some cases like "Antwerpen, Antwerpen, Antwerpen"
+        that are very unlikely to occur in real life.
+
+        Components are visited in the order city, state district, city
+        district, suburb; the first one carrying a given name keeps it and any
+        later component with the same name is removed (so the city wins).
+        Modifies address_components in place.
+        '''
+        priority = (AddressFormatter.CITY, AddressFormatter.STATE_DISTRICT,
+                    AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB)
+
+        names_kept = set()
+
+        for component in priority:
+            name = address_components.get(component)
+            if not name:
+                continue
+            if name in names_kept:
+                address_components.pop(component, None)
+            else:
+                names_kept.add(name)
+
+    def cleanup_house_number(self, address_components):
+        '''
+        House number cleanup
+        --------------------
+
+        This method was originally used for OSM nodes because in some places,
+        particularly Uruguay, we see house numbers that are actually a comma-separated
+        list. It seemed prudent to retain this cleanup in the generalized version
+        in case we see similar issues with other data sets.
+
+        Semicolons are first rewritten as commas. A house number with at most
+        one comma is then left alone as it might be legitimate. With 2 or more
+        commas, the entries are shuffled and the first non-blank one (stripped)
+        becomes the house number; if every entry is blank the house number is
+        removed. Modifies address_components in place.
+        '''
+
+        house_number = address_components.get(AddressFormatter.HOUSE_NUMBER)
+        if not house_number:
+            return
+
+        if ';' in house_number:
+            house_number = house_number.replace(';', ',')
+            address_components[AddressFormatter.HOUSE_NUMBER] = house_number
+
+        if house_number.count(',') < 2:
+            return
+
+        entries = house_number.split(',')
+        random.shuffle(entries)
+
+        chosen = None
+        for entry in entries:
+            entry = entry.strip()
+            if entry:
+                chosen = entry
+                break
+
+        if chosen is None:
+            address_components.pop(AddressFormatter.HOUSE_NUMBER, None)
+        else:
+            address_components[AddressFormatter.HOUSE_NUMBER] = chosen
+
+    def expanded_address_components(self, address_components, latitude, longitude):
+        '''
+        Run the full randomized expansion pipeline on an address.
+
+        Steps, in order: convert the coordinates, look up country and candidate
+        languages, pick the address language, choose country and state forms,
+        add OSM admin boundaries, optionally add a Quattroshapes city, add
+        neighborhoods, then normalize names, apply special replacements, prune
+        duplicate names and clean up the house number.
+
+        address_components is modified in place and also returned.
+
+        Returns (address_components, country, language), or (None, None, None)
+        if the coordinates cannot be converted or no country/languages are
+        found for the point.
+        '''
+        # NOTE(review): latlon_to_decimal is not explicitly imported in this
+        # module - confirm one of the star imports provides it
+        failure = (None, None, None)
+
+        try:
+            latitude, longitude = latlon_to_decimal(latitude, longitude)
+        except Exception:
+            return failure
+
+        country, candidate_languages, _ = self.language_rtree.country_and_languages(latitude, longitude)
+        if not country or not candidate_languages:
+            return failure
+
+        multilingual = len(candidate_languages) > 1
+        language = self.address_language(address_components, candidate_languages)
+
+        address_country, non_local_language = self.country_name(address_components, country, language)
+        if address_country:
+            address_components[AddressFormatter.COUNTRY] = address_country
+
+        address_state = self.state_name(address_components, country, language,
+                                        non_local_language=non_local_language)
+        if address_state:
+            address_components[AddressFormatter.STATE] = address_state
+
+        osm_suffix = self.tag_suffix(language, non_local_language, multilingual)
+
+        self.add_admin_boundaries(address_components, country, language, latitude, longitude,
+                                  non_local_language=non_local_language,
+                                  osm_suffix=osm_suffix)
+
+        city = self.quattroshapes_city(address_components, latitude, longitude, language,
+                                       non_local_language=non_local_language)
+        if city:
+            address_components[AddressFormatter.CITY] = city
+
+        self.add_neighborhoods(address_components, latitude, longitude,
+                               osm_suffix=osm_suffix)
+
+        # Post-processing passes; each one edits address_components in place
+        self.normalize_names(address_components)
+        self.replace_names(address_components)
+        self.prune_duplicate_names(address_components)
+        self.cleanup_house_number(address_components)
+
+        return address_components, country, language
+
+    def limited_address_components(self, address_components, latitude, longitude):
+        '''
+        Deterministic-leaning variant of expanded_address_components that keeps
+        only boundary-level information.
+
+        Name, house number, postal and ignored keys are stripped from the input
+        (in place), the remainder is normalized through
+        normalize_address_components into a new dict, and then country, state,
+        admin boundaries, city and neighborhoods are filled in. The country and
+        boundary steps are called with their random alternatives disabled
+        (probabilities of 0.0 or 1.0, random_key=False).
+
+        Returns (address_components, country, language) where address_components
+        is the new normalized dict, or (None, None, None) if the coordinates
+        cannot be converted or no country/languages are found for the point.
+        '''
+        try:
+            latitude, longitude = latlon_to_decimal(latitude, longitude)
+        except Exception:
+            return None, None, None
+
+        country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
+        if not (country and candidate_languages):
+            return None, None, None
+
+        # NOTE(review): these key lists are not defined or explicitly imported
+        # in this module - confirm that one of the star imports provides them
+        remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS
+
+        # Previously operated on an undefined name `value` (NameError); the
+        # dict being stripped is the address_components argument
+        self.strip_keys(address_components, remove_keys)
+
+        more_than_one_official_language = len(candidate_languages) > 1
+
+        # NOTE(review): language detection runs before alias normalization, as
+        # in the original ordering - confirm the road key is present at this point
+        language = self.address_language(address_components, candidate_languages)
+
+        address_components = self.normalize_address_components(address_components)
+
+        address_country, non_local_language = self.country_name(address_components, country, language,
+                                                                use_country_code_prob=0.0,
+                                                                local_language_name_prob=1.0,
+                                                                random_language_name_prob=0.0,
+                                                                alpha_3_iso_code_prob=0.0)
+        if address_country:
+            address_components[AddressFormatter.COUNTRY] = address_country
+
+        address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, state_full_name_prob=1.0)
+        if address_state:
+            address_components[AddressFormatter.STATE] = address_state
+
+        osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
+
+        # Previously called self.add_osm_boundaries, which does not exist on
+        # this class; add_admin_boundaries accepts exactly these keywords
+        self.add_admin_boundaries(address_components, country, language, latitude, longitude,
+                                  osm_suffix=osm_suffix,
+                                  non_local_language=non_local_language,
+                                  random_key=False,
+                                  alpha_3_iso_code_prob=0.0,
+                                  alpha_2_iso_code_prob=0.0,
+                                  replace_with_non_local_prob=0.0,
+                                  expand_state_prob=1.0)
+
+        city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
+
+        if city:
+            address_components[AddressFormatter.CITY] = city
+
+        self.add_neighborhoods(address_components, latitude, longitude,
+                               osm_suffix=osm_suffix)
+
+        self.normalize_names(address_components)
+
+        self.prune_duplicate_names(address_components)
+
+        return address_components, country, language