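[fix] venue names should be removed probabilistically in the training data (a venue name is now only substituted when a house component is already present in the address components), and the neighborhood inclusion probability is raised from 0.4 to 0.5, giving neighborhoods a slightly better chance of being included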

This commit is contained in:
Al
2015-11-30 23:28:12 -05:00
parent 6ef40c1769
commit 8484d4fffd

View File

@@ -691,7 +691,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
neighborhood_levels[neighborhood_level].append(name) neighborhood_levels[neighborhood_level].append(name)
for component, neighborhoods in neighborhood_levels.iteritems(): for component, neighborhoods in neighborhood_levels.iteritems():
if component not in address_components and random.random() < 0.4: if component not in address_components and random.random() < 0.5:
address_components[component] = neighborhoods[0] address_components[component] = neighborhoods[0]
# Version with all components # Version with all components
@@ -721,7 +721,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
# Since venue names are 1-per-record, we must use them all # Since venue names are 1-per-record, we must use them all
for venue_name in (venue_names or [None]): for venue_name in (venue_names or [None]):
if venue_name: if venue_name and AddressFormatter.HOUSE in address_components:
address_components[AddressFormatter.HOUSE] = venue_name address_components[AddressFormatter.HOUSE] = venue_name
formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False) formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
if formatted_address not in seen: if formatted_address not in seen: