[osm] Doing more deduping in the OSM training data to avoid confusing the parser when city, state, district all have the same name
This commit is contained in:
@@ -193,7 +193,7 @@ osm_fields = [
|
||||
]
|
||||
|
||||
|
||||
REPLACE_COMPONENTS = (
|
||||
BOUNDARY_COMPONENTS = (
|
||||
AddressFormatter.SUBURB,
|
||||
AddressFormatter.CITY_DISTRICT,
|
||||
AddressFormatter.CITY,
|
||||
@@ -614,10 +614,12 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
|
||||
if not name:
|
||||
name = component_value.get(key, component_value.get(raw_key))
|
||||
|
||||
if not name:
|
||||
existing_city_name = address_components.get(AddressFormatter.CITY)
|
||||
|
||||
if not name or (component != AddressFormatter.CITY and name == existing_city_name):
|
||||
name = component_value.get(name_key, component_value.get(raw_name_key))
|
||||
|
||||
if not name:
|
||||
if not name or (component != AddressFormatter.CITY and name == existing_city_name):
|
||||
continue
|
||||
|
||||
if (component, name) not in seen:
|
||||
@@ -706,7 +708,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
|
||||
if place_type == 'borough' or polygon_type == 'local_admin':
|
||||
neighborhood_level = AddressFormatter.CITY_DISTRICT
|
||||
|
||||
# Optimization so we don't use Brooklyn for Kings County
|
||||
# Optimization so we don't use e.g. Brooklyn multiple times
|
||||
city_name = address_components.get(AddressFormatter.CITY)
|
||||
if name == city_name:
|
||||
name = neighborhood.get(name_key, neighborhood.get(raw_name_key))
|
||||
@@ -725,7 +727,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
|
||||
|
||||
Probabilistically strip standard prefixes/suffixes e.g. "London Borough of"
|
||||
'''
|
||||
for component in REPLACE_COMPONENTS:
|
||||
for component in BOUNDARY_COMPONENTS:
|
||||
name = address_components.get(component)
|
||||
if not name:
|
||||
continue
|
||||
@@ -733,6 +735,26 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
|
||||
if replacement != name and random.random() < 0.6:
|
||||
address_components[component] = replacement
|
||||
|
||||
'''
|
||||
Name deduping
|
||||
-------------
|
||||
|
||||
For some cases like "Antwerpen, Antwerpen, Antwerpen"
|
||||
that are very unlikely to occur in real life.
|
||||
'''
|
||||
|
||||
name_components = defaultdict(list)
|
||||
|
||||
for component in (AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB):
|
||||
name = address_components.get(component)
|
||||
if name:
|
||||
name_components[name].append(component)
|
||||
|
||||
for name, components in name_components.iteritems():
|
||||
if len(components) > 1:
|
||||
for component in components[1:]:
|
||||
address_components.pop(component, None)
|
||||
|
||||
# Version with all components
|
||||
formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user