[fix] Smaller probabilities on adding neighborhoods and admin polygons, eliminating duplicates on the row level

This commit is contained in:
Al
2015-11-30 18:35:31 -05:00
parent 621fd79002
commit af170de019

View File

@@ -615,7 +615,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
seen.add((component, name))
for component, vals in poly_components.iteritems():
if component not in address_components or non_local_language:
if component not in address_components or non_local_language and random.random() < 0.4:
val = u', '.join(vals)
if component == AddressFormatter.STATE and random.random() < 0.7:
val = STATE_EXPANSIONS.get(address_country, {}).get(val, val)
@@ -691,7 +691,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
neighborhood_levels[neighborhood_level].append(name)
for component, neighborhoods in neighborhood_levels.iteritems():
if component not in address_components:
if component not in address_components and random.random() < 0.4:
address_components[component] = neighborhoods[0]
# Version with all components
@@ -701,6 +701,8 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
formatted_addresses = []
formatted_addresses.append(formatted_address)
seen = set([formatted_address])
address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES}
if not address_components:
continue
@@ -722,10 +724,16 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
if venue_name:
address_components[AddressFormatter.HOUSE] = venue_name
formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
formatted_addresses.append(formatted_address)
if formatted_address not in seen:
formatted_addresses.append(formatted_address)
seen.add(formatted_address)
for formatted_address in formatted_addresses:
if formatted_address and formatted_address.strip():
if not formatted_address:
continue
formatted_address = formatted_address.strip()
if formatted_address and formatted_add not in seen:
formatted_address = tsv_string(formatted_address)
if not formatted_address or not formatted_address.strip():
continue