From af170de0195d9eac2b9235084d1c1c2ebf6bffe3 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 30 Nov 2015 18:35:31 -0500 Subject: [PATCH] [fix] Smaller probabilities on adding neighborhoods and admin polygons, eliminating duplicates on the row level --- scripts/geodata/osm/osm_address_training_data.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 7c4303e8..7709feea 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -615,7 +615,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood seen.add((component, name)) for component, vals in poly_components.iteritems(): - if component not in address_components or non_local_language: + if component not in address_components or non_local_language and random.random() < 0.4: val = u', '.join(vals) if component == AddressFormatter.STATE and random.random() < 0.7: val = STATE_EXPANSIONS.get(address_country, {}).get(val, val) @@ -691,7 +691,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood neighborhood_levels[neighborhood_level].append(name) for component, neighborhoods in neighborhood_levels.iteritems(): - if component not in address_components: + if component not in address_components and random.random() < 0.4: address_components[component] = neighborhoods[0] # Version with all components @@ -701,6 +701,8 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood formatted_addresses = [] formatted_addresses.append(formatted_address) + seen = set([formatted_address]) + address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES} if not address_components: continue @@ -722,10 +724,16 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if venue_name: 
address_components[AddressFormatter.HOUSE] = venue_name formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False) - formatted_addresses.append(formatted_address) + if formatted_address not in seen: + formatted_addresses.append(formatted_address) + seen.add(formatted_address) for formatted_address in formatted_addresses: - if formatted_address and formatted_address.strip(): + if not formatted_address: + continue + + formatted_address = formatted_address.strip() + if formatted_address: formatted_address = tsv_string(formatted_address) if not formatted_address or not formatted_address.strip(): continue