[addresses] refactoring place component cleanup into a method that can be reused with the place and ways training data

This commit is contained in:
Al
2017-01-16 20:43:55 -05:00
parent 024a6a40b1
commit 8566cb4054

View File

@@ -610,6 +610,31 @@ class OSMAddressFormatter(object):
return postal_codes return postal_codes
def cleanup_place_components(self, address_components, osm_components, country, language, containing_ids, population=None, keep_component=None, population_from_city=False):
    """Drop out and clean up the place components of an address.

    Shared by the place and way training-data paths so both apply the
    same sequence of cleanup passes.

    :param address_components: mapping of component name -> value to clean.
    :param osm_components: reverse-geocoded OSM boundaries, forwarded to
        ``self.components.dropout_places``.
    :param country: country identifier forwarded to the country-aware
        cleanup passes and the CLDR lookup.
    :param language: language forwarded to the affix replacement pass and
        the CLDR lookup.
    :param containing_ids: forwarded to ``place_config.include_component``
        when deciding whether a country component may be added.
    :param population: optional population forwarded to ``dropout_places``.
    :param keep_component: optional component name whose original value is
        written back after dropout, so it is always present in the result.
        Must be a key of ``address_components`` when given.
    :param population_from_city: flag forwarded to ``dropout_places``.
    :return: the revised components mapping returned by ``dropout_places``,
        after the cleanup passes have been applied to it.
    """
    revised_address_components = self.components.dropout_places(
        address_components, osm_components, country, language,
        population=population, population_from_city=population_from_city)

    # The original tested the undefined name `keep_components`, which raised
    # NameError on every call; the parameter is `keep_component`.
    if keep_component is not None:
        revised_address_components[keep_component] = address_components[keep_component]

    # Cleanup passes; each is called for its effect on the mapping, in the
    # same order the inlined code used before this method was extracted.
    self.components.cleanup_boundary_names(revised_address_components)
    self.components.country_specific_cleanup(revised_address_components, country)
    self.components.drop_invalid_components(revised_address_components, country)
    self.components.replace_name_affixes(revised_address_components, language)
    self.components.replace_names(revised_address_components)
    self.components.remove_numeric_boundary_names(revised_address_components)

    # Probability (from config, default 0.0) of swapping in the CLDR country name.
    cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0))

    # Only draw a random number when a country is already present or the
    # place config allows one to be included (short-circuit order preserved).
    if (AddressFormatter.COUNTRY in revised_address_components or place_config.include_component(AddressFormatter.COUNTRY, containing_ids, country=country, check_population=False)) and random.random() < cldr_country_prob:
        address_country = self.components.cldr_country_name(country, language)
        if address_country:
            revised_address_components[AddressFormatter.COUNTRY] = address_country

    return revised_address_components
def node_place_tags(self, tags, city_or_below=False): def node_place_tags(self, tags, city_or_below=False):
try: try:
latitude, longitude = latlon_to_decimal(tags['lat'], tags['lon']) latitude, longitude = latlon_to_decimal(tags['lat'], tags['lon'])
@@ -735,8 +760,6 @@ class OSMAddressFormatter(object):
max_references = 1000 # Cap the number of references e.g. for India and China country nodes max_references = 1000 # Cap the number of references e.g. for India and China country nodes
num_references = min(population / population_divisor + min_references, max_references) num_references = min(population / population_divisor + min_references, max_references)
cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0))
component_order = AddressFormatter.component_order[component_name] component_order = AddressFormatter.component_order[component_name]
sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY] sub_city = component_order < AddressFormatter.component_order[AddressFormatter.CITY]
@@ -884,23 +907,7 @@ class OSMAddressFormatter(object):
revised_place_tags = [] revised_place_tags = []
for address_components, language, is_default in place_tags: for address_components, language, is_default in place_tags:
revised_address_components = place_config.dropout_components(address_components, osm_components, country=country, population=population) revised_address_components = self.cleanup_place_components(address_components, osm_components, country, language, containing_ids, population=population, keep_component=component_name)
revised_address_components[component_name] = address_components[component_name]
self.components.cleanup_boundary_names(revised_address_components)
self.components.country_specific_cleanup(revised_address_components, country)
self.components.drop_invalid_components(revised_address_components, country)
self.components.replace_name_affixes(revised_address_components, language)
self.components.replace_names(revised_address_components)
self.components.remove_numeric_boundary_names(revised_address_components)
if (AddressFormatter.COUNTRY in revised_address_components or place_config.include_component(AddressFormatter.COUNTRY, containing_ids, country=country, check_population=False)) and random.random() < cldr_country_prob:
address_country = self.components.cldr_country_name(country, language)
if address_country:
revised_address_components[AddressFormatter.COUNTRY] = address_country
if revised_address_components: if revised_address_components:
revised_place_tags.append((revised_address_components, language, is_default)) revised_place_tags.append((revised_address_components, language, is_default))
@@ -1580,6 +1587,8 @@ class OSMAddressFormatter(object):
osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude) osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
containing_ids = [(b['type'], b['id']) for b in osm_components]
for lang, vals in six.iteritems(names): for lang, vals in six.iteritems(names):
way_tags = [] way_tags = []
for v, is_base in vals: for v, is_base in vals:
@@ -1592,12 +1601,15 @@ class OSMAddressFormatter(object):
country, lang, country, lang,
latitude, longitude) latitude, longitude)
way_tags.append(address_components) revised_address_components = self.cleanup_place_components(address_components, osm_components, country, lang, containing_ids, population_from_city=True)
way_tags.append(revised_address_components)
normalized = self.abbreviated_street(street_name, lang) normalized = self.abbreviated_street(street_name, lang)
if normalized and normalized != street_name: if normalized and normalized != street_name:
address_components = address_components.copy() revisd_address_components = revised_address_components.copy()
address_components[AddressFormatter.ROAD] = normalized revised_address_components[AddressFormatter.ROAD] = normalized
way_tags.append(revised_address_components)
for address_components in way_tags: for address_components in way_tags:
formatted = self.formatter.format_address(address_components, country, language=lang, formatted = self.formatter.format_address(address_components, country, language=lang,