[addresses] Adding support for the combined fields portion of the address config (Canadian apartment numbers, etc.). Also adding a new language suffix chooser which can pick script-specific suffixes like ja_rm or ru_Latn, in addition to standard language codes, for admin boundary names, etc.
This commit is contained in:
@@ -162,7 +162,7 @@ class AddressComponents(object):
|
|||||||
name_key = boundary_names.name_key(props, component)
|
name_key = boundary_names.name_key(props, component)
|
||||||
return name_key, ''.join((name_key, suffix)) if ':' not in name_key else name_key
|
return name_key, ''.join((name_key, suffix)) if ':' not in name_key else name_key
|
||||||
|
|
||||||
def all_names(self, props, languages=None):
|
def all_names(self, props, languages):
|
||||||
names = set()
|
names = set()
|
||||||
for k, v in six.iteritems(props):
|
for k, v in six.iteritems(props):
|
||||||
if k in self.ALL_OSM_NAME_KEYS:
|
if k in self.ALL_OSM_NAME_KEYS:
|
||||||
@@ -185,7 +185,7 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
components = defaultdict(set)
|
components = defaultdict(set)
|
||||||
for props in osm_components:
|
for props in osm_components:
|
||||||
component_names = self.all_names(props, languages=languages)
|
component_names = self.all_names(props, languages or set())
|
||||||
names |= component_names
|
names |= component_names
|
||||||
|
|
||||||
is_state = False
|
is_state = False
|
||||||
@@ -274,6 +274,39 @@ class AddressComponents(object):
|
|||||||
self.formatter.aliases.replace(address_components)
|
self.formatter.aliases.replace(address_components)
|
||||||
return address_components
|
return address_components
|
||||||
|
|
||||||
|
def combine_fields(self, address_components, language, country=None):
    '''
    Combined fields
    ---------------

    Randomly pick one of the combinations configured under
    "components.combinations" for this language/country and, if every
    component it names is present, merge those components into a single
    field. The merged components are removed from address_components and
    the joined value is stored under the combination's label.

    address_components is modified in place; nothing is returned.
    '''
    combo_config = address_config.get_property('components.combinations', language, country=country, default={})

    candidates = [combo for _, combo in six.iteritems(combo_config)]
    weights = [combo['probability'] for combo in candidates]

    # Whatever probability mass the config leaves unassigned goes to
    # "no combination" (represented by None).
    if not isclose(sum(weights), 1.0):
        candidates.append(None)
        weights.append(1.0 - sum(weights))

    combo = weighted_choice(candidates, cdf(weights))
    if combo is None:
        return

    components = OrderedDict(combo['components']).keys()
    # Only combine when every constituent component is available.
    if any(c not in address_components for c in components):
        return

    separator_choices = [(s['separator'], s['probability']) for s in combo['separators']]
    separators = [sep for sep, _ in separator_choices]
    separator_probs = cdf([prob for _, prob in separator_choices])
    separator = weighted_choice(separators, separator_probs)

    new_label = combo['label']
    parts = [address_components.pop(c) for c in components]
    address_components[new_label] = separator.join(parts)
|
||||||
|
|
||||||
def generated_type(self, component, existing_components, language, country=None):
|
def generated_type(self, component, existing_components, language, country=None):
|
||||||
component_config = address_config.get_property('components.{}'.format(component), language, country=country)
|
component_config = address_config.get_property('components.{}'.format(component), language, country=country)
|
||||||
if not component_config:
|
if not component_config:
|
||||||
@@ -418,20 +451,59 @@ class AddressComponents(object):
|
|||||||
address_state = None
|
address_state = None
|
||||||
return address_state
|
return address_state
|
||||||
|
|
||||||
def tag_suffix(self, language, non_local_language, more_than_one_official_language=False):
|
def use_language(self, language, non_local_language, more_than_one_official_language):
|
||||||
if non_local_language is not None:
|
if non_local_language is not None:
|
||||||
osm_suffix = ':{}'.format(non_local_language)
|
return non_local_language
|
||||||
elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
|
elif language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
|
||||||
osm_suffix = ':{}'.format(language)
|
return language
|
||||||
else:
|
else:
|
||||||
osm_suffix = ''
|
return None
|
||||||
return osm_suffix
|
|
||||||
|
def pick_language_suffix(self, osm_components, language, non_local_language, more_than_one_official_language):
    '''
    Language suffix
    ---------------

    This captures some variations in languages written with different scripts
    e.g. language=ja_rm is for Japanese Romaji.

    Pick a language suffix with probability proportional to how often the name is used
    in the reverse geocoded components. So if only 2/5 components have name:ja_rm listed
    but 5/5 have either name:ja or just plain name, we would pick standard Japanese (Kanji)
    with probability .7143 (5/7) and Romaji with probability .2857 (2/7).

    osm_components is an iterable of tag dictionaries. The language matched
    against the tags is non_local_language if set, otherwise language.

    Returns '' when no script variant was picked, there is no non-local
    language and the country has a single official language; otherwise
    returns ':<suffix>' where suffix is the picked script variant, the
    non-local language or the language, in that order of preference.
    '''
    # Maps a script variant such as "ja_rm" to the number of tags using it.
    # The None key counts the standard form (plain "name" or "name:<lang>").
    language_scripts = defaultdict(int)
    target_language = non_local_language or language

    for c in osm_components:
        for k in c:
            # Plain "name" has no colon, so it must be counted before the
            # "name:<suffix>" handling below or it would never be seen.
            if k == 'name':
                language_scripts[None] += 1
                continue

            splits = k.split(':')
            if len(splits) < 2 or splits[0] != 'name':
                continue

            tag = splits[-1]
            if '_' in tag and tag.split('_', 1)[0] == target_language:
                language_scripts[tag] += 1
            elif tag == target_language:
                language_scripts[None] += 1

    language_script = None

    # Only sample when there is more than one form to choose between.
    if len(language_scripts) > 1:
        scripts = list(language_scripts)
        total = float(sum(language_scripts.values()))
        probs = cdf([language_scripts[s] / total for s in scripts])
        language_script = weighted_choice(scripts, probs)

    if not language_script and not non_local_language and not more_than_one_official_language:
        return ''
    return ':{}'.format(language_script or non_local_language or language)
|
||||||
|
|
||||||
def add_admin_boundaries(self, address_components,
|
def add_admin_boundaries(self, address_components,
|
||||||
osm_components,
|
osm_components,
|
||||||
country, language,
|
country, language,
|
||||||
osm_suffix='',
|
|
||||||
non_local_language=None,
|
non_local_language=None,
|
||||||
|
language_suffix='',
|
||||||
random_key=True,
|
random_key=True,
|
||||||
always_use_full_names=False,
|
always_use_full_names=False,
|
||||||
):
|
):
|
||||||
@@ -451,12 +523,13 @@ class AddressComponents(object):
|
|||||||
include these qualifiers in the training data.
|
include these qualifiers in the training data.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, osm_suffix))
|
|
||||||
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
|
||||||
simple_name_key = 'name:simple'
|
simple_name_key = 'name:simple'
|
||||||
international_name_key = 'int_name'
|
international_name_key = 'int_name'
|
||||||
|
|
||||||
if osm_components:
|
if osm_components:
|
||||||
|
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
|
||||||
|
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
||||||
|
|
||||||
osm_components = self.categorized_osm_components(country, osm_components)
|
osm_components = self.categorized_osm_components(country, osm_components)
|
||||||
poly_components = defaultdict(list)
|
poly_components = defaultdict(list)
|
||||||
|
|
||||||
@@ -467,7 +540,7 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
for component_value in components_values:
|
for component_value in components_values:
|
||||||
if random_key:
|
if random_key:
|
||||||
key, raw_key = self.pick_random_name_key(component_value, component, suffix=osm_suffix)
|
key, raw_key = self.pick_random_name_key(component_value, component, suffix=language_suffix)
|
||||||
else:
|
else:
|
||||||
key, raw_key = name_key, raw_name_key
|
key, raw_key = name_key, raw_name_key
|
||||||
|
|
||||||
@@ -499,7 +572,7 @@ class AddressComponents(object):
|
|||||||
val = random.choice(vals)
|
val = random.choice(vals)
|
||||||
|
|
||||||
if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob:
|
if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob:
|
||||||
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
||||||
|
|
||||||
address_components[component] = val
|
address_components[component] = val
|
||||||
|
|
||||||
@@ -558,8 +631,7 @@ class AddressComponents(object):
|
|||||||
return self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
|
return self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
|
||||||
|
|
||||||
def add_neighborhoods(self, address_components,
|
def add_neighborhoods(self, address_components,
|
||||||
neighborhoods,
|
neighborhoods, language_suffix=''):
|
||||||
osm_suffix=''):
|
|
||||||
'''
|
'''
|
||||||
Neighborhoods
|
Neighborhoods
|
||||||
-------------
|
-------------
|
||||||
@@ -577,7 +649,7 @@ class AddressComponents(object):
|
|||||||
add_prefix_prob = float(nested_get(self.config, ('neighborhood', 'add_prefix_probability')))
|
add_prefix_prob = float(nested_get(self.config, ('neighborhood', 'add_prefix_probability')))
|
||||||
add_neighborhood_prob = float(nested_get(self.config, ('neighborhood', 'add_neighborhood_probability')))
|
add_neighborhood_prob = float(nested_get(self.config, ('neighborhood', 'add_neighborhood_probability')))
|
||||||
|
|
||||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, osm_suffix))
|
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
|
||||||
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
||||||
|
|
||||||
for neighborhood in neighborhoods:
|
for neighborhood in neighborhoods:
|
||||||
@@ -596,7 +668,7 @@ class AddressComponents(object):
|
|||||||
if not name or name == city_name:
|
if not name or name == city_name:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
key, raw_key = self.pick_random_name_key(neighborhood, neighborhood_level, suffix=osm_suffix)
|
key, raw_key = self.pick_random_name_key(neighborhood, neighborhood_level, suffix=language_suffix)
|
||||||
name = neighborhood.get(key, neighborhood.get(raw_key))
|
name = neighborhood.get(key, neighborhood.get(raw_key))
|
||||||
|
|
||||||
if not name:
|
if not name:
|
||||||
@@ -757,6 +829,18 @@ class AddressComponents(object):
|
|||||||
address_components[AddressFormatter.POSTCODE] = phrase
|
address_components[AddressFormatter.POSTCODE] = phrase
|
||||||
|
|
||||||
def expanded(self, address_components, latitude, longitude, num_floors=None, num_basements=None, zone=None):
|
def expanded(self, address_components, latitude, longitude, num_floors=None, num_basements=None, zone=None):
|
||||||
|
'''
|
||||||
|
Expanded components
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Many times in geocoded address data sets, we get only a few components
|
||||||
|
(say street name and house number) plus a lat/lon. There's a lot of information
|
||||||
|
in a lat/lon though, so this method "fills in the blanks" as it were.
|
||||||
|
|
||||||
|
Namely, it calls all the methods above to reverse geocode to a few of the
|
||||||
|
R-tree + point-in-polygon indices passed in at initialization and adds things
|
||||||
|
like admin boundaries, neighborhoods,
|
||||||
|
'''
|
||||||
try:
|
try:
|
||||||
latitude, longitude = latlon_to_decimal(latitude, longitude)
|
latitude, longitude = latlon_to_decimal(latitude, longitude)
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -773,33 +857,33 @@ class AddressComponents(object):
|
|||||||
language = self.address_language(address_components, candidate_languages)
|
language = self.address_language(address_components, candidate_languages)
|
||||||
|
|
||||||
non_local_language = self.non_local_language()
|
non_local_language = self.non_local_language()
|
||||||
# If a country already was specified
|
# If a country was already specified
|
||||||
self.replace_country_name(address_components, country, non_local_language or language)
|
self.replace_country_name(address_components, country, non_local_language or language)
|
||||||
|
|
||||||
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language)
|
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language)
|
||||||
if address_state:
|
if address_state:
|
||||||
address_components[AddressFormatter.STATE] = address_state
|
address_components[AddressFormatter.STATE] = address_state
|
||||||
|
|
||||||
osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
|
|
||||||
|
|
||||||
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
|
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
|
||||||
neighborhoods = self.neighborhood_components(latitude, longitude)
|
neighborhoods = self.neighborhood_components(latitude, longitude)
|
||||||
|
|
||||||
all_languages = set([l['lang'] for l in candidate_languages])
|
all_languages = set([l['lang'] for l in candidate_languages])
|
||||||
|
|
||||||
all_osm_components = osm_components + neighborhoods
|
all_osm_components = osm_components + neighborhoods
|
||||||
|
language_suffix = self.pick_language_suffix(all_osm_components, language, non_local_language, more_than_one_official_language)
|
||||||
|
|
||||||
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
|
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
|
||||||
|
|
||||||
self.add_admin_boundaries(address_components, osm_components, country, language,
|
self.add_admin_boundaries(address_components, osm_components, country, language,
|
||||||
non_local_language=non_local_language,
|
non_local_language=non_local_language,
|
||||||
osm_suffix=osm_suffix)
|
language_suffix=language_suffix)
|
||||||
|
|
||||||
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
|
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
|
||||||
if city:
|
if city:
|
||||||
address_components[AddressFormatter.CITY] = city
|
address_components[AddressFormatter.CITY] = city
|
||||||
|
|
||||||
self.add_neighborhoods(address_components, neighborhoods,
|
self.add_neighborhoods(address_components, neighborhoods, language, non_local_language=non_local_language,
|
||||||
osm_suffix=osm_suffix)
|
language_suffix=language_suffix)
|
||||||
|
|
||||||
street = address_components.get(AddressFormatter.ROAD)
|
street = address_components.get(AddressFormatter.ROAD)
|
||||||
|
|
||||||
@@ -816,6 +900,8 @@ class AddressComponents(object):
|
|||||||
self.add_sub_building_components(address_components, language, country=country,
|
self.add_sub_building_components(address_components, language, country=country,
|
||||||
num_floors=num_floors, num_basements=num_basements, zone=zone)
|
num_floors=num_floors, num_basements=num_basements, zone=zone)
|
||||||
|
|
||||||
|
self.combine_fields(address_components, language, country=country)
|
||||||
|
|
||||||
return address_components, country, language
|
return address_components, country, language
|
||||||
|
|
||||||
def limited(self, address_components, latitude, longitude):
|
def limited(self, address_components, latitude, longitude):
|
||||||
@@ -850,18 +936,18 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
street = address_components.get(AddressFormatter.ROAD)
|
street = address_components.get(AddressFormatter.ROAD)
|
||||||
|
|
||||||
osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
|
|
||||||
|
|
||||||
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
|
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
|
||||||
neighborhoods = self.neighborhood_components(latitude, longitude)
|
neighborhoods = self.neighborhood_components(latitude, longitude)
|
||||||
|
|
||||||
all_languages = set([l['lang'] for l in candidate_languages])
|
all_languages = set([l['lang'] for l in candidate_languages])
|
||||||
|
|
||||||
all_osm_components = osm_components + neighborhoods
|
all_osm_components = osm_components + neighborhoods
|
||||||
|
language_suffix = self.pick_language_suffix(all_osm_components, language, non_local_language, more_than_one_official_language)
|
||||||
|
|
||||||
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
|
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
|
||||||
|
|
||||||
self.add_admin_boundaries(address_components, osm_components, country, language,
|
self.add_admin_boundaries(address_components, osm_components, country, language,
|
||||||
osm_suffix=osm_suffix,
|
language_suffix=language_suffix,
|
||||||
non_local_language=non_local_language,
|
non_local_language=non_local_language,
|
||||||
random_key=False,
|
random_key=False,
|
||||||
always_use_full_names=True)
|
always_use_full_names=True)
|
||||||
@@ -874,8 +960,8 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
neighborhoods = self.neighborhood_components(latitude, longitude)
|
neighborhoods = self.neighborhood_components(latitude, longitude)
|
||||||
|
|
||||||
self.add_neighborhoods(address_components, neighborhoods,
|
self.add_neighborhoods(address_components, neighborhoods, language, non_local_language=non_local_language,
|
||||||
osm_suffix=osm_suffix)
|
language_suffix=language_suffix)
|
||||||
|
|
||||||
self.replace_name_affixes(address_components, non_local_language or language)
|
self.replace_name_affixes(address_components, non_local_language or language)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user