From b25a7380003849214100bb8a54f22a18e67e62f5 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 6 Dec 2015 16:14:02 -0500 Subject: [PATCH] [osm] Doing more deduping in the OSM training data to avoid confusing the parser when city, state, district all have the same name --- .../geodata/osm/osm_address_training_data.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index ae86e597..6c21fbaa 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -193,7 +193,7 @@ osm_fields = [ ] -REPLACE_COMPONENTS = ( +BOUNDARY_COMPONENTS = ( AddressFormatter.SUBURB, AddressFormatter.CITY_DISTRICT, AddressFormatter.CITY, @@ -614,10 +614,12 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if not name: name = component_value.get(key, component_value.get(raw_key)) - if not name: + existing_city_name = address_components.get(AddressFormatter.CITY) + + if not name or (component != AddressFormatter.CITY and name == existing_city_name): name = component_value.get(name_key, component_value.get(raw_name_key)) - if not name: + if not name or (component != AddressFormatter.CITY and name == existing_city_name): continue if (component, name) not in seen: @@ -706,7 +708,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if place_type == 'borough' or polygon_type == 'local_admin': neighborhood_level = AddressFormatter.CITY_DISTRICT - # Optimization so we don't use Brooklyn for Kings County + # Optimization so we don't use e.g. Brooklyn multiple times city_name = address_components.get(AddressFormatter.CITY) if name == city_name: name = neighborhood.get(name_key, neighborhood.get(raw_name_key)) @@ -725,7 +727,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood Probabilistically strip standard prefixes/suffixes e.g. "London Borough of" ''' - for component in REPLACE_COMPONENTS: + for component in BOUNDARY_COMPONENTS: name = address_components.get(component) if not name: continue @@ -733,6 +735,26 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if replacement != name and random.random() < 0.6: address_components[component] = replacement + ''' + Name deduping + ------------- + + For some cases like "Antwerpen, Antwerpen, Antwerpen" + that are very unlikely to occur in real life. + ''' + + name_components = defaultdict(list) + + for component in (AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB): + name = address_components.get(component) + if name: + name_components[name].append(component) + + for name, components in name_components.iteritems(): + if len(components) > 1: + for component in components[1:]: + address_components.pop(component, None) + # Version with all components formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)