[addresses] Add toponym abbreviation to the input admin components as well as to those obtained through reverse geocoding. Previously, two random tests were performed before abbreviating toponyms, which reduced the frequency of abbreviated toponyms in the training data; a single test is now correctly used.
This commit is contained in:
@@ -733,6 +733,22 @@ class AddressComponents(object):
|
||||
return True
|
||||
return False
|
||||
|
||||
def abbreviate_admin_components(self, address_components, country, language, hyphenation=True):
    """Abbreviate the boundary (admin) values of ``address_components`` in place.

    Two probabilities are read from ``self.config``:
    ``state.abbreviated_probability`` and
    ``boundaries.abbreviate_toponym_probability``.

    Only keys found in ``AddressFormatter.BOUNDARY_COMPONENTS`` are touched.
    A state value is replaced through ``state_abbreviations.get_abbreviation``
    when a single random draw falls below the state probability; every other
    boundary value (including a state that failed the draw) is handed to
    ``abbreviate`` with the toponym gazetteer, which receives the toponym
    probability as ``abbreviate_prob``.

    :param address_components: dict of component name -> value, updated in place
    :param country: passed through to the state abbreviation lookup
    :param language: passed to both the state lookup and ``abbreviate``
    :param hyphenation: when true, the value is also run through
        ``self.name_hyphens``
    :return: None
    """
    # NOTE(review): the indentation of this method was lost in SOURCE. The
    # hyphenation step is assumed to belong to the toponym (else) branch,
    # mirroring the code this method replaced -- confirm against the repo.
    config = self.config
    state_prob = float(nested_get(config, ('state', 'abbreviated_probability')))
    toponym_prob = float(nested_get(config, ('boundaries', 'abbreviate_toponym_probability')))

    # Only existing keys are reassigned below, so the dict never changes
    # size while it is being iterated.
    for key, name in six.iteritems(address_components):
        if key in AddressFormatter.BOUNDARY_COMPONENTS:
            # The random draw only happens for state components (short-circuit).
            use_state_abbreviation = key == AddressFormatter.STATE and random.random() < state_prob

            if not use_state_abbreviation:
                name = abbreviate(toponym_abbreviations_gazetteer, name, language, abbreviate_prob=toponym_prob)
                if hyphenation:
                    name = self.name_hyphens(name)
            else:
                # Fall back to the unabbreviated value when no abbreviation exists.
                name = state_abbreviations.get_abbreviation(country, language, name, default=name)

            address_components[key] = name
|
||||
|
||||
def add_admin_boundaries(self, address_components,
|
||||
osm_components,
|
||||
country, language,
|
||||
@@ -815,10 +831,10 @@ class AddressComponents(object):
|
||||
poly_components[component].append(name)
|
||||
seen.add((component, name))
|
||||
|
||||
abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability')))
|
||||
join_state_district_prob = float(nested_get(self.config, ('state_district', 'join_probability')))
|
||||
replace_with_non_local_prob = float(nested_get(self.config, ('languages', 'replace_non_local_probability')))
|
||||
abbreviate_toponym_prob = float(nested_get(self.config, ('boundaries', 'abbreviate_toponym_probability')))
|
||||
|
||||
new_admin_components = {}
|
||||
|
||||
for component, vals in poly_components.iteritems():
|
||||
if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
|
||||
@@ -831,13 +847,11 @@ class AddressComponents(object):
|
||||
else:
|
||||
val = random.choice(vals)
|
||||
|
||||
if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob:
|
||||
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
||||
elif random.random() < abbreviate_toponym_prob:
|
||||
val = abbreviate(toponym_abbreviations_gazetteer, val, language, abbreviate_prob=abbreviate_toponym_prob)
|
||||
else:
|
||||
val = self.name_hyphens(val)
|
||||
address_components[component] = val
|
||||
new_admin_components[component] = val
|
||||
|
||||
self.abbreviate_admin_components(new_admin_components)
|
||||
|
||||
address_components.update(new_admin_components)
|
||||
|
||||
def quattroshapes_city(self, address_components,
|
||||
latitude, longitude,
|
||||
@@ -1387,14 +1401,6 @@ class AddressComponents(object):
|
||||
except Exception:
|
||||
return None, None, None
|
||||
|
||||
if hyphenation:
|
||||
for component in address_components:
|
||||
if component in place_config.ADMIN_COMPONENTS:
|
||||
value = address_components[component]
|
||||
value_hyphens = self.name_hyphens(value)
|
||||
if value_hyphens != value:
|
||||
address_components[component] = value_hyphens
|
||||
|
||||
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
|
||||
country, candidate_languages = self.osm_country_and_languages(osm_components)
|
||||
if not (country and candidate_languages):
|
||||
@@ -1416,6 +1422,8 @@ class AddressComponents(object):
|
||||
else:
|
||||
language_suffix = ':{}'.format(language)
|
||||
|
||||
self.abbreviate_admin_components(address_components, country, language, hyphenation=hyphenation)
|
||||
|
||||
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language)
|
||||
if address_state:
|
||||
address_components[AddressFormatter.STATE] = address_state
|
||||
|
||||
Reference in New Issue
Block a user