[osm/formatting] Adding per-field drop probabilities to OSM training data so that some fields are more likely to be dropped than others, although this might create more training data

2015-11-30 11:10:07 -05:00
parent c8e4602d4c
commit 9a8ba14887
1 changed files with 15 additions and 1 deletions
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -350,6 +350,20 @@ def osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude):
    return ret


+DROP_PROBABILITIES = {
+    AddressFormatter.HOUSE: 0.8,
+    AddressFormatter.HOUSE_NUMBER: 0.5,
+    AddressFormatter.ROAD: 0.5,
+    AddressFormatter.SUBURB: 1.0,
+    AddressFormatter.CITY_DISTRICT: 1.0,
+    AddressFormatter.CITY: 0.6,
+    AddressFormatter.STATE_DISTRICT: 1.0,
+    AddressFormatter.STATE: 0.8,
+    AddressFormatter.POSTCODE: 0.7,
+    AddressFormatter.COUNTRY: 0.8
+}
+
+
 def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, infile, out_dir, tag_components=True):
    '''
    Creates formatted address training data for supervised sequence labeling (or potentially 
@@ -655,7 +669,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
            current_components = component_bitset(address_components.keys())

            for component in address_components.keys():
-                if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
+                if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < DROP_PROBABILITIES[component]:
                    address_components.pop(component)
                    current_components ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                    if not address_components: