[addresses] Using the new parser config for address component expansions
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
import os
|
||||
import pycountry
|
||||
import random
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
@@ -9,14 +11,22 @@ from geodata.address_formatting.aliases import Aliases
|
||||
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.units import Unit
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.coordinates.conversion import latlon_to_decimal
|
||||
from geodata.countries.country_names import *
|
||||
from geodata.language_id.disambiguation import *
|
||||
from geodata.language_id.sample import sample_random_language
|
||||
from geodata.math.sampling import cdf, weighted_choice
|
||||
from geodata.names.normalization import name_affixes
|
||||
from geodata.boundaries.names import boundary_names
|
||||
from geodata.osm.components import osm_address_components
|
||||
from geodata.states.state_abbreviations import state_abbreviations
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
|
||||
PARSER_DEFAULT_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'parser', 'default.yaml')
|
||||
|
||||
|
||||
class AddressExpander(object):
|
||||
'''
|
||||
@@ -46,7 +56,8 @@ class AddressExpander(object):
|
||||
u'en')
|
||||
|
||||
'''
|
||||
alpha3_codes = {c.alpha2: c.alpha3 for c in pycountry.countries}
|
||||
iso_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries])
|
||||
iso_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries])
|
||||
|
||||
rare_components = {
|
||||
AddressFormatter.SUBURB,
|
||||
@@ -65,26 +76,13 @@ class AddressExpander(object):
|
||||
AddressFormatter.STATE
|
||||
)
|
||||
|
||||
# List of places where it's much more common to use city, state than city, country
|
||||
state_important = {
|
||||
'US',
|
||||
'CA',
|
||||
}
|
||||
|
||||
RANDOM_VALUE_REPLACEMENTS = {
|
||||
# Key: address component
|
||||
AddressFormatter.COUNTRY: {
|
||||
# value: (replacement, probability)
|
||||
'GB': ('UK', 0.3),
|
||||
'United Kingdom': ('UK', 0.3),
|
||||
}
|
||||
}
|
||||
|
||||
ALL_OSM_NAME_KEYS = set(['name', 'name:simple',
|
||||
'ISO3166-1:alpha2', 'ISO3166-1:alpha3',
|
||||
'short_name', 'alt_name', 'official_name'])
|
||||
|
||||
def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames):
|
||||
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
||||
|
||||
self.osm_admin_rtree = osm_admin_rtree
|
||||
self.language_rtree = language_rtree
|
||||
self.neighborhoods_rtree = neighborhoods_rtree
|
||||
@@ -140,42 +138,15 @@ class AddressExpander(object):
|
||||
|
||||
return language
|
||||
|
||||
def pick_random_name_key(self, suffix=''):
|
||||
def pick_random_name_key(self, props, component, suffix=''):
|
||||
'''
|
||||
Random name
|
||||
-----------
|
||||
|
||||
Pick a name key from OSM
|
||||
'''
|
||||
name_key = ''.join(('name', suffix))
|
||||
raw_name_key = 'name'
|
||||
short_name_key = ''.join(('short_name', suffix))
|
||||
raw_short_name_key = 'short_name'
|
||||
alt_name_key = ''.join(('alt_name', suffix))
|
||||
raw_alt_name_key = 'alt_name'
|
||||
official_name_key = ''.join(('official_name', suffix))
|
||||
raw_official_name_key = 'official_name'
|
||||
|
||||
# Choose which name to use with given probabilities
|
||||
r = random.random()
|
||||
if r < 0.7:
|
||||
# 70% of the time use the name tag
|
||||
key = name_key
|
||||
raw_key = raw_name_key
|
||||
elif r < 0.8:
|
||||
# 10% of the time use the short name
|
||||
key = short_name_key
|
||||
raw_key = raw_short_name_key
|
||||
elif r < 0.9:
|
||||
# 10% of the time use the official name
|
||||
key = official_name_key
|
||||
raw_key = raw_official_name_key
|
||||
else:
|
||||
# 10% of the time use the alt name
|
||||
key = alt_name_key
|
||||
raw_key = raw_alt_name_key
|
||||
|
||||
return key, raw_key
|
||||
name_key = boundary_names.name_key(props, component)
|
||||
return name_key, ''.join((name_key, suffix)) if ':' not in name_key else name_key
|
||||
|
||||
def all_names(self, props, languages=None):
|
||||
names = set()
|
||||
@@ -217,7 +188,7 @@ class AddressExpander(object):
|
||||
if is_state:
|
||||
for state in component_names:
|
||||
for language in languages:
|
||||
state_code = state_abbreviations.get_abbreviation(country, language, state)
|
||||
state_code = state_abbreviations.get_abbreviation(country, language, state, default=None)
|
||||
if state_code:
|
||||
names.add(state_code.upper())
|
||||
|
||||
@@ -289,12 +260,7 @@ class AddressExpander(object):
|
||||
self.formatter.aliases.replace(address_components)
|
||||
return address_components
|
||||
|
||||
def country_name(self, address_components, country_code, language,
|
||||
use_country_code_prob=0.3,
|
||||
local_language_name_prob=0.6,
|
||||
random_language_name_prob=0.1,
|
||||
alpha_3_iso_code_prob=0.1,
|
||||
):
|
||||
def cldr_country_name(self, country_code, language):
|
||||
'''
|
||||
Country names
|
||||
-------------
|
||||
@@ -319,41 +285,53 @@ class AddressExpander(object):
|
||||
3. This is implicit, but with probability (1-b)(1-a), keep the country code
|
||||
'''
|
||||
|
||||
non_local_language = None
|
||||
cldr_config = nested_get(self.config, ('country', 'cldr'))
|
||||
|
||||
alpha_2_iso_code_prob = float(cldr_config['iso_alpha_2_code_probability'])
|
||||
localized_name_prob = float(cldr_config['localized_name_probability'])
|
||||
alpha_3_iso_code_prob = float(cldr_config['iso_alpha_3_code_probability'])
|
||||
|
||||
values = ('localized', 'alpha3', 'alpha2')
|
||||
probs = cdf([localized_name_prob, alpha_3_iso_code_prob, alpha_2_iso_code_prob])
|
||||
value = weighted_choice(values, probs)
|
||||
|
||||
country_name = country_code.upper()
|
||||
|
||||
if language in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
|
||||
language = None
|
||||
|
||||
if value == 'localized':
|
||||
country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
|
||||
elif value == 'alpha3':
|
||||
country_name = country_names.alpha3_code(country_code) or country_name
|
||||
|
||||
return country_name
|
||||
|
||||
def is_country_iso_code(self, country):
|
||||
country = country.lower()
|
||||
return country in self.iso_alpha2_codes or country in self.iso_alpha3_codes
|
||||
|
||||
def replace_country_name(self, address_components, country, language):
|
||||
address_country = address_components.get(AddressFormatter.COUNTRY)
|
||||
|
||||
if random.random() < use_country_code_prob:
|
||||
# 30% of the time: add Quattroshapes country
|
||||
address_country = country_code.upper()
|
||||
cldr_country_prob = float(nested_get(self.config, ('country', 'cldr_country_probability')))
|
||||
replace_with_cldr_country_prob = float(nested_get(self.config, ('country', 'replace_with_cldr_country_probability')))
|
||||
remove_iso_code_prob = float(nested_get(self.config, ('country', 'remove_iso_code_probability')))
|
||||
|
||||
r = random.random()
|
||||
is_iso_code = address_country and self.is_country_iso_code(address_country)
|
||||
|
||||
# 1. 60% of the time: use the country name in the current language or the country's local language
|
||||
if address_country and r < local_language_name_prob:
|
||||
localized = None
|
||||
if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
|
||||
localized = language_country_names.get(language, {}).get(address_country.upper())
|
||||
if (is_iso_code and random.random() < replace_with_cldr_country_prob) or random.random() < cldr_country_prob:
|
||||
address_country = self.cldr_country_name(country, language)
|
||||
if address_country:
|
||||
address_components[AddressFormatter.COUNTRY] = address_country
|
||||
elif is_iso_code and random.random() < remove_iso_code_prob:
|
||||
address_components.pop(AddressFormatter.COUNTRY)
|
||||
|
||||
if not localized:
|
||||
localized = country_localized_display_name(address_country.lower())
|
||||
|
||||
if localized:
|
||||
address_country = localized
|
||||
# 2. 10% of the time: country's name in a language samples from the distribution of languages on the Internet
|
||||
elif address_country and r < local_language_name_prob + random_language_name_prob:
|
||||
non_local_language = sample_random_language()
|
||||
lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper())
|
||||
if lang_country:
|
||||
address_country = lang_country
|
||||
# 3. 10% of the time: use the country's alpha-3 ISO code
|
||||
elif address_country and r < local_language_name_prob + random_language_name_prob + alpha_3_iso_code_prob:
|
||||
iso_code_alpha3 = self.alpha3_codes.get(address_country)
|
||||
if iso_code_alpha3:
|
||||
address_country = iso_code_alpha3
|
||||
# 4. Implicit: the rest of the time keep the alpha-2 country code
|
||||
|
||||
return address_country, non_local_language
|
||||
def non_local_language(self):
|
||||
non_local_language_prob = float(nested_get(self.config, ('languages', 'non_local_language_probability')))
|
||||
if random.random() < non_local_language_prob:
|
||||
return sample_random_language()
|
||||
return None
|
||||
|
||||
def state_name(self, address_components, country, language, non_local_language=None, state_full_name_prob=0.4):
|
||||
'''
|
||||
@@ -391,12 +369,6 @@ class AddressExpander(object):
|
||||
osm_suffix='',
|
||||
non_local_language=None,
|
||||
random_key=True,
|
||||
alpha_3_iso_code_prob=0.1,
|
||||
alpha_2_iso_code_prob=0.2,
|
||||
simple_country_key_prob=0.4,
|
||||
replace_with_non_local_prob=0.4,
|
||||
join_state_district_prob=0.5,
|
||||
abbreviate_state_prob=0.7
|
||||
):
|
||||
'''
|
||||
OSM boundaries
|
||||
@@ -414,14 +386,11 @@ class AddressExpander(object):
|
||||
include these qualifiers in the training data.
|
||||
'''
|
||||
|
||||
name_key = ''.join(('name', osm_suffix))
|
||||
raw_name_key = 'name'
|
||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, osm_suffix))
|
||||
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
||||
simple_name_key = 'name:simple'
|
||||
international_name_key = 'int_name'
|
||||
|
||||
iso_code_key = 'ISO3166-1:alpha2'
|
||||
iso_code3_key = 'ISO3166-1:alpha3'
|
||||
|
||||
if osm_components:
|
||||
osm_components = self.categorized_osm_components(country, osm_components)
|
||||
poly_components = defaultdict(list)
|
||||
@@ -431,31 +400,13 @@ class AddressExpander(object):
|
||||
for component, components_values in osm_components.iteritems():
|
||||
seen = set()
|
||||
|
||||
if random_key:
|
||||
key, raw_key = self.pick_random_name_key(suffix=osm_suffix)
|
||||
else:
|
||||
key, raw_key = name_key, raw_name_key
|
||||
|
||||
for component_value in components_values:
|
||||
r = random.random()
|
||||
name = None
|
||||
if random_key:
|
||||
key, raw_key = self.pick_random_name_key(component_value, component, suffix=osm_suffix)
|
||||
else:
|
||||
key, raw_key = name_key, raw_name_key
|
||||
|
||||
if component == AddressFormatter.COUNTRY:
|
||||
if iso_code3_key in component_value and r < alpha_3_iso_code_prob:
|
||||
name = component_value[iso_code3_key]
|
||||
elif iso_code_key in component_value and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob:
|
||||
name = component_value[iso_code_key]
|
||||
elif language == 'en' and not non_local_language and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob + simple_country_key_prob:
|
||||
# Particularly to address the US (prefer United States,
|
||||
# not United States of America) but may capture variations
|
||||
# in other English-speaking countries as well.
|
||||
if simple_name_key in component_value:
|
||||
name = component_value[simple_name_key]
|
||||
elif international_name_key in component_value:
|
||||
name = component_value[international_name_key]
|
||||
|
||||
if not name:
|
||||
name = component_value.get(key, component_value.get(raw_key))
|
||||
name = component_value.get(key, component_value.get(raw_key))
|
||||
|
||||
if not name or (component != AddressFormatter.CITY and name == existing_city_name):
|
||||
name = component_value.get(name_key, component_value.get(raw_name_key))
|
||||
@@ -467,6 +418,10 @@ class AddressExpander(object):
|
||||
poly_components[component].append(name)
|
||||
seen.add((component, name))
|
||||
|
||||
abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability')))
|
||||
join_state_district_prob = float(nested_get(self.config, ('state_district', 'join_probability')))
|
||||
replace_with_non_local_prob = float(nested_get(self.config, ('languages', 'replace_non_local_probability')))
|
||||
|
||||
for component, vals in poly_components.iteritems():
|
||||
if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
|
||||
if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob:
|
||||
@@ -483,7 +438,6 @@ class AddressExpander(object):
|
||||
def quattroshapes_city(self, address_components,
|
||||
latitude, longitude,
|
||||
language, non_local_language=None,
|
||||
qs_add_city_prob=0.2,
|
||||
abbreviated_name_prob=0.1):
|
||||
'''
|
||||
Quattroshapes/GeoNames cities
|
||||
@@ -498,6 +452,9 @@ class AddressExpander(object):
|
||||
|
||||
city = None
|
||||
|
||||
qs_add_city_prob = float(nested_get(self.config, ('city', 'quattroshapes_geonames_backup_city_probability')))
|
||||
abbreviated_name_prob = float(nested_get(self.config, ('city', 'quattroshapes_geonames_abbreviated_probability')))
|
||||
|
||||
if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < qs_add_city_prob):
|
||||
lang = non_local_language or language
|
||||
quattroshapes_cities = self.quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True)
|
||||
@@ -534,9 +491,7 @@ class AddressExpander(object):
|
||||
|
||||
def add_neighborhoods(self, address_components,
|
||||
neighborhoods,
|
||||
osm_suffix='',
|
||||
add_prefix_prob=0.5,
|
||||
add_neighborhood_prob=0.5):
|
||||
osm_suffix=''):
|
||||
'''
|
||||
Neighborhoods
|
||||
-------------
|
||||
@@ -551,27 +506,16 @@ class AddressExpander(object):
|
||||
|
||||
neighborhood_levels = defaultdict(list)
|
||||
|
||||
name_key = ''.join(('name', osm_suffix))
|
||||
raw_name_key = 'name'
|
||||
add_prefix_prob = float(nested_get(self.config, ('neighborhood', 'add_prefix_probability')))
|
||||
add_neighborhood_prob = float(nested_get(self.config, ('neighborhood', 'add_neighborhood_probability')))
|
||||
|
||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, osm_suffix))
|
||||
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
||||
|
||||
for neighborhood in neighborhoods:
|
||||
place_type = neighborhood.get('place')
|
||||
polygon_type = neighborhood.get('polygon_type')
|
||||
|
||||
key, raw_key = self.pick_random_name_key(suffix=osm_suffix)
|
||||
name = neighborhood.get(key, neighborhood.get(raw_key))
|
||||
|
||||
if not name:
|
||||
name = neighborhood.get(name_key, neighborhood.get(raw_name_key))
|
||||
|
||||
name_prefix = neighborhood.get('name:prefix')
|
||||
|
||||
if name_prefix and random.random() < add_prefix_prob:
|
||||
name = six.u(' ').join([name_prefix, name])
|
||||
|
||||
if not name:
|
||||
continue
|
||||
|
||||
neighborhood_level = AddressFormatter.SUBURB
|
||||
|
||||
if place_type == 'borough' or polygon_type == 'local_admin':
|
||||
@@ -584,6 +528,20 @@ class AddressExpander(object):
|
||||
if not name or name == city_name:
|
||||
continue
|
||||
|
||||
key, raw_key = self.pick_random_name_key(neighborhood, neighborhood_level, suffix=osm_suffix)
|
||||
name = neighborhood.get(key, neighborhood.get(raw_key))
|
||||
|
||||
if not name:
|
||||
name = neighborhood.get(name_key, neighborhood.get(raw_name_key))
|
||||
|
||||
name_prefix = neighborhood.get('name:prefix')
|
||||
|
||||
if name and name_prefix and random.random() < add_prefix_prob:
|
||||
name = six.u(' ').join([name_prefix, name])
|
||||
|
||||
if not name:
|
||||
continue
|
||||
|
||||
neighborhood_levels[neighborhood_level].append(name)
|
||||
|
||||
for component, neighborhoods in neighborhood_levels.iteritems():
|
||||
@@ -597,14 +555,17 @@ class AddressExpander(object):
|
||||
|
||||
Probabilistically strip standard prefixes/suffixes e.g. "London Borough of"
|
||||
'''
|
||||
|
||||
replacement_prob = float(nested_get(self.config, ('names', 'replace_affix_probability')))
|
||||
|
||||
for component in list(address_components):
|
||||
if component not in self.BOUNDARY_COMPONENTS:
|
||||
continue
|
||||
name = address_components[component]
|
||||
if not name:
|
||||
continue
|
||||
replacement = name_affixes.replace_name_suffixes(name, language)
|
||||
replacement = name_affixes.replace_name_prefixes(replacement, language)
|
||||
replacement = name_affixes.replace_suffixes(name, language)
|
||||
replacement = name_affixes.replace_prefixes(replacement, language)
|
||||
if replacement != name and random.random() < replacement_prob:
|
||||
address_components[component] = replacement
|
||||
|
||||
@@ -615,10 +576,14 @@ class AddressExpander(object):
|
||||
|
||||
Make a few special replacements (like UK instead of GB)
|
||||
'''
|
||||
|
||||
for component, value in address_components.iteritems():
|
||||
replacement, prob = self.RANDOM_VALUE_REPLACEMENTS.get(component, {}).get(value, (None, 0.0))
|
||||
if replacement is not None and random.random() < prob:
|
||||
address_components[component] = replacement
|
||||
replacement = nested_get(self.config, ('value_replacements', component, value), default=None)
|
||||
if replacement is not None:
|
||||
new_value = repl['replacement']
|
||||
prob = repl['probability']
|
||||
if random.random() < prob:
|
||||
address_components[component] = new_value
|
||||
|
||||
def prune_duplicate_names(self, address_components):
|
||||
'''
|
||||
@@ -691,9 +656,9 @@ class AddressExpander(object):
|
||||
|
||||
language = self.address_language(address_components, candidate_languages)
|
||||
|
||||
address_country, non_local_language = self.country_name(address_components, country, language)
|
||||
if address_country:
|
||||
address_components[AddressFormatter.COUNTRY] = address_country
|
||||
non_local_language = self.non_local_language()
|
||||
# If a country already was specified
|
||||
self.replace_country_name(address_components, country, non_local_language or language)
|
||||
|
||||
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language)
|
||||
if address_state:
|
||||
@@ -755,13 +720,8 @@ class AddressExpander(object):
|
||||
|
||||
address_components = self.normalize_address_components(value)
|
||||
|
||||
address_country, non_local_language = self.country_name(address_components, country, language,
|
||||
use_country_code_prob=0.0,
|
||||
local_language_name_prob=1.0,
|
||||
random_language_name_prob=0.0,
|
||||
alpha_3_iso_code_prob=0.0)
|
||||
if address_country:
|
||||
address_components[AddressFormatter.COUNTRY] = address_country
|
||||
non_local_language = self.non_local_language()
|
||||
self.replace_country_name(address_components, country, non_local_language or language)
|
||||
|
||||
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, state_full_name_prob=1.0)
|
||||
if address_state:
|
||||
@@ -798,7 +758,7 @@ class AddressExpander(object):
|
||||
self.add_neighborhoods(address_components, neighborhoods,
|
||||
osm_suffix=osm_suffix)
|
||||
|
||||
self.replace_name_affixes(address_components, non_local_language or language)
|
||||
self.replace_name_affixes(address_components, non_local_language or language)
|
||||
|
||||
self.replace_names(address_components)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user