[addresses] Adding support for the combined fields portion of the address config (Canadian apartment numbers, etc.). Adding a new language suffix chooser which can pick things like ja_rm or ru_Latn in addition to standard language codes for admin boundary names, etc.
This commit is contained in:
@@ -162,7 +162,7 @@ class AddressComponents(object):
|
||||
name_key = boundary_names.name_key(props, component)
|
||||
return name_key, ''.join((name_key, suffix)) if ':' not in name_key else name_key
|
||||
|
||||
def all_names(self, props, languages=None):
|
||||
def all_names(self, props, languages):
|
||||
names = set()
|
||||
for k, v in six.iteritems(props):
|
||||
if k in self.ALL_OSM_NAME_KEYS:
|
||||
@@ -185,7 +185,7 @@ class AddressComponents(object):
|
||||
|
||||
components = defaultdict(set)
|
||||
for props in osm_components:
|
||||
component_names = self.all_names(props, languages=languages)
|
||||
component_names = self.all_names(props, languages or set())
|
||||
names |= component_names
|
||||
|
||||
is_state = False
|
||||
@@ -274,6 +274,39 @@ class AddressComponents(object):
|
||||
self.formatter.aliases.replace(address_components)
|
||||
return address_components
|
||||
|
||||
def combine_fields(self, address_components, language, country=None):
    '''
    Combined fields
    ---------------

    Randomly merge several address components into a single field, as
    described by the 'components.combinations' property of the address
    config for this language/country.

    Each configured combination carries a 'probability'. If those do not
    sum to 1.0, the remaining mass is assigned to "no combination". When a
    combination is chosen and every one of its components is present in
    address_components, those components are removed, joined with a
    separator drawn from the combination's weighted 'separators' list,
    and stored under the combination's 'label'.

    Mutates address_components in place and returns None.
    '''
    combo_config = address_config.get_property('components.combinations', language, country=country, default={})

    combos = [combo for _, combo in six.iteritems(combo_config)]
    combo_probs = [combo['probability'] for combo in combos]

    total = sum(combo_probs)
    if not isclose(total, 1.0):
        # Leftover probability mass means "leave the fields separate"
        combos.append(None)
        combo_probs.append(1.0 - total)

    combo = weighted_choice(combos, cdf(combo_probs))
    if combo is None:
        return

    components = OrderedDict(combo['components']).keys()
    # Only combine when every constituent field is actually available
    if any(c not in address_components for c in components):
        return

    weighted_separators = [(s['separator'], s['probability']) for s in combo['separators']]
    separator_values = [sep for sep, _ in weighted_separators]
    separator_probs = cdf([prob for _, prob in weighted_separators])
    separator = weighted_choice(separator_values, separator_probs)

    # Look up the label before popping so a bad config cannot half-mutate the dict
    new_label = combo['label']
    new_value = separator.join([address_components.pop(c) for c in components])
    address_components[new_label] = new_value
|
||||
|
||||
def generated_type(self, component, existing_components, language, country=None):
|
||||
component_config = address_config.get_property('components.{}'.format(component), language, country=country)
|
||||
if not component_config:
|
||||
@@ -418,20 +451,59 @@ class AddressComponents(object):
|
||||
address_state = None
|
||||
return address_state
|
||||
|
||||
def tag_suffix(self, language, non_local_language, more_than_one_official_language=False):
|
||||
def use_language(self, language, non_local_language, more_than_one_official_language):
|
||||
if non_local_language is not None:
|
||||
osm_suffix = ':{}'.format(non_local_language)
|
||||
elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
|
||||
osm_suffix = ':{}'.format(language)
|
||||
return non_local_language
|
||||
elif language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
|
||||
return language
|
||||
else:
|
||||
osm_suffix = ''
|
||||
return osm_suffix
|
||||
return None
|
||||
|
||||
def pick_language_suffix(self, osm_components, language, non_local_language, more_than_one_official_language):
    '''
    Language suffix
    ---------------

    This captures some variations in languages written with different scripts
    e.g. language=ja_rm is for Japanese Romaji.

    Pick a language suffix with probability proportional to how often the name is used
    in the reverse geocoded components. So if only 2/5 components have name:ja_rm listed
    but 5/5 have either name:ja or just plain name, we would pick standard Japanese (Kanji)
    with probability .7143 (5/7) and Romaji with probability .2857 (2/7).

    Each component contributes at most one count per script variant, so a
    component carrying both "name" and "name:ja" counts once toward the
    default script, matching the example above.

    osm_components is an iterable of mappings keyed by OSM tag name. The
    language matched against the tags is non_local_language if given,
    otherwise language.

    Returns '' when no script variant was picked, there is no non-local
    language and the country has a single official language; otherwise
    returns ':<suffix>' where suffix is the picked variant, the non-local
    language, or the language, in that order of preference.
    '''
    # This captures name variations like "ja_rm" for Japanese Romaji, etc.
    # The None key stands for the default script (plain name / name:<lang>).
    language_scripts = defaultdict(int)
    use_language = (non_local_language or language)

    for c in osm_components:
        scripts = set()
        for k in c:
            if k == 'name':
                # Plain name has no colon, so it must be handled before the
                # colon check or it would never be counted
                scripts.add(None)
                continue
            if ':' not in k:
                continue

            splits = k.split(':')
            if splits[0] != 'name':
                continue

            variant = splits[-1]
            if '_' in variant and variant.split('_', 1)[0] == use_language:
                scripts.add(variant)
            elif variant == use_language:
                scripts.add(None)

        for script in scripts:
            language_scripts[script] += 1

    language_script = None

    # With zero or one candidate there is nothing to choose between
    if len(language_scripts) > 1:
        cumulative = float(sum(language_scripts.values()))
        values = list(language_scripts)
        probs = cdf([language_scripts[value] / cumulative for value in values])
        language_script = weighted_choice(values, probs)

    if not language_script and not non_local_language and not more_than_one_official_language:
        return ''

    # NOTE(review): with several official languages this still formats an
    # unknown/ambiguous language code into the suffix - confirm callers
    # fall back to the raw name key in that case.
    return ':{}'.format(language_script or non_local_language or language)
|
||||
|
||||
def add_admin_boundaries(self, address_components,
|
||||
osm_components,
|
||||
country, language,
|
||||
osm_suffix='',
|
||||
non_local_language=None,
|
||||
language_suffix='',
|
||||
random_key=True,
|
||||
always_use_full_names=False,
|
||||
):
|
||||
@@ -451,12 +523,13 @@ class AddressComponents(object):
|
||||
include these qualifiers in the training data.
|
||||
'''
|
||||
|
||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, osm_suffix))
|
||||
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
||||
simple_name_key = 'name:simple'
|
||||
international_name_key = 'int_name'
|
||||
|
||||
if osm_components:
|
||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
|
||||
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
||||
|
||||
osm_components = self.categorized_osm_components(country, osm_components)
|
||||
poly_components = defaultdict(list)
|
||||
|
||||
@@ -467,7 +540,7 @@ class AddressComponents(object):
|
||||
|
||||
for component_value in components_values:
|
||||
if random_key:
|
||||
key, raw_key = self.pick_random_name_key(component_value, component, suffix=osm_suffix)
|
||||
key, raw_key = self.pick_random_name_key(component_value, component, suffix=language_suffix)
|
||||
else:
|
||||
key, raw_key = name_key, raw_name_key
|
||||
|
||||
@@ -499,7 +572,7 @@ class AddressComponents(object):
|
||||
val = random.choice(vals)
|
||||
|
||||
if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob:
|
||||
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
||||
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
||||
|
||||
address_components[component] = val
|
||||
|
||||
@@ -558,8 +631,7 @@ class AddressComponents(object):
|
||||
return self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
|
||||
|
||||
def add_neighborhoods(self, address_components,
|
||||
neighborhoods,
|
||||
osm_suffix=''):
|
||||
neighborhoods, language_suffix=''):
|
||||
'''
|
||||
Neighborhoods
|
||||
-------------
|
||||
@@ -577,7 +649,7 @@ class AddressComponents(object):
|
||||
add_prefix_prob = float(nested_get(self.config, ('neighborhood', 'add_prefix_probability')))
|
||||
add_neighborhood_prob = float(nested_get(self.config, ('neighborhood', 'add_neighborhood_probability')))
|
||||
|
||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, osm_suffix))
|
||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
|
||||
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
||||
|
||||
for neighborhood in neighborhoods:
|
||||
@@ -596,7 +668,7 @@ class AddressComponents(object):
|
||||
if not name or name == city_name:
|
||||
continue
|
||||
|
||||
key, raw_key = self.pick_random_name_key(neighborhood, neighborhood_level, suffix=osm_suffix)
|
||||
key, raw_key = self.pick_random_name_key(neighborhood, neighborhood_level, suffix=language_suffix)
|
||||
name = neighborhood.get(key, neighborhood.get(raw_key))
|
||||
|
||||
if not name:
|
||||
@@ -757,6 +829,18 @@ class AddressComponents(object):
|
||||
address_components[AddressFormatter.POSTCODE] = phrase
|
||||
|
||||
def expanded(self, address_components, latitude, longitude, num_floors=None, num_basements=None, zone=None):
|
||||
'''
|
||||
Expanded components
|
||||
-------------------
|
||||
|
||||
Many times in geocoded address data sets, we get only a few components
|
||||
(say street name and house number) plus a lat/lon. There's a lot of information
|
||||
in a lat/lon though, so this method "fills in the blanks" as it were.
|
||||
|
||||
Namely, it calls all the methods above to reverse geocode to a few of the
|
||||
R-tree + point-in-polygon indices passed in at initialization and adds things
|
||||
like admin boundaries, neighborhoods,
|
||||
'''
|
||||
try:
|
||||
latitude, longitude = latlon_to_decimal(latitude, longitude)
|
||||
except Exception:
|
||||
@@ -773,33 +857,33 @@ class AddressComponents(object):
|
||||
language = self.address_language(address_components, candidate_languages)
|
||||
|
||||
non_local_language = self.non_local_language()
|
||||
# If a country already was specified
|
||||
# If a country was already specified
|
||||
self.replace_country_name(address_components, country, non_local_language or language)
|
||||
|
||||
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language)
|
||||
if address_state:
|
||||
address_components[AddressFormatter.STATE] = address_state
|
||||
|
||||
osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
|
||||
|
||||
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
|
||||
neighborhoods = self.neighborhood_components(latitude, longitude)
|
||||
|
||||
all_languages = set([l['lang'] for l in candidate_languages])
|
||||
|
||||
all_osm_components = osm_components + neighborhoods
|
||||
language_suffix = self.pick_language_suffix(all_osm_components, language, non_local_language, more_than_one_official_language)
|
||||
|
||||
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
|
||||
|
||||
self.add_admin_boundaries(address_components, osm_components, country, language,
|
||||
non_local_language=non_local_language,
|
||||
osm_suffix=osm_suffix)
|
||||
language_suffix=language_suffix)
|
||||
|
||||
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
|
||||
if city:
|
||||
address_components[AddressFormatter.CITY] = city
|
||||
|
||||
self.add_neighborhoods(address_components, neighborhoods,
|
||||
osm_suffix=osm_suffix)
|
||||
self.add_neighborhoods(address_components, neighborhoods, language, non_local_language=non_local_language,
|
||||
language_suffix=language_suffix)
|
||||
|
||||
street = address_components.get(AddressFormatter.ROAD)
|
||||
|
||||
@@ -816,6 +900,8 @@ class AddressComponents(object):
|
||||
self.add_sub_building_components(address_components, language, country=country,
|
||||
num_floors=num_floors, num_basements=num_basements, zone=zone)
|
||||
|
||||
self.combine_fields(address_components, language, country=country)
|
||||
|
||||
return address_components, country, language
|
||||
|
||||
def limited(self, address_components, latitude, longitude):
|
||||
@@ -850,18 +936,18 @@ class AddressComponents(object):
|
||||
|
||||
street = address_components.get(AddressFormatter.ROAD)
|
||||
|
||||
osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
|
||||
|
||||
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
|
||||
neighborhoods = self.neighborhood_components(latitude, longitude)
|
||||
|
||||
all_languages = set([l['lang'] for l in candidate_languages])
|
||||
|
||||
all_osm_components = osm_components + neighborhoods
|
||||
language_suffix = self.pick_language_suffix(all_osm_components, language, non_local_language, more_than_one_official_language)
|
||||
|
||||
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
|
||||
|
||||
self.add_admin_boundaries(address_components, osm_components, country, language,
|
||||
osm_suffix=osm_suffix,
|
||||
language_suffix=language_suffix,
|
||||
non_local_language=non_local_language,
|
||||
random_key=False,
|
||||
always_use_full_names=True)
|
||||
@@ -874,8 +960,8 @@ class AddressComponents(object):
|
||||
|
||||
neighborhoods = self.neighborhood_components(latitude, longitude)
|
||||
|
||||
self.add_neighborhoods(address_components, neighborhoods,
|
||||
osm_suffix=osm_suffix)
|
||||
self.add_neighborhoods(address_components, neighborhoods, language, non_local_language=non_local_language,
|
||||
language_suffix=language_suffix)
|
||||
|
||||
self.replace_name_affixes(address_components, non_local_language or language)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user