[fix] venue names should be removed probabilistically in the training data, giving neighborhoods a slightly better chance of being included

2015-11-30 23:28:12 -05:00
parent 6ef40c1769
commit 8484d4fffd
1 changed files with 2 additions and 2 deletions
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -691,7 +691,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
            neighborhood_levels[neighborhood_level].append(name)

        for component, neighborhoods in neighborhood_levels.iteritems():
-            if component not in address_components and random.random() < 0.4:
+            if component not in address_components and random.random() < 0.5:
                address_components[component] = neighborhoods[0]

        # Version with all components
@@ -721,7 +721,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood

                    # Since venue names are 1-per-record, we must use them all
                    for venue_name in (venue_names or [None]):
-                        if venue_name:
+                        if venue_name and AddressFormatter.HOUSE in address_components:
                            address_components[AddressFormatter.HOUSE] = venue_name
                        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
                        if formatted_address not in seen: