From aeb72d7d26c45c6a9fa3edc330a9033b8f0cf8ef Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 9 Dec 2015 00:20:20 -0500 Subject: [PATCH] [osm] Randomly select up to n components for state_district OSM boundaries. For all other fields select one name at random --- scripts/geodata/osm/osm_address_training_data.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 5fe9f7c0..88dac22a 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -635,8 +635,13 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood seen.add((component, name)) for component, vals in poly_components.iteritems(): - if component not in address_components or non_local_language and random.random() < 0.4: - val = u', '.join(vals) + if component not in address_components or (non_local_language and random.random() < 0.4): + if component == AddressFormatter.STATE_DISTRICT and random.random() < 0.5: + num = random.randrange(1, len(vals) + 1) + val = u', '.join(vals[:num]) + else: + val = random.choice(vals) + if component == AddressFormatter.STATE and random.random() < 0.7: val = STATE_EXPANSIONS.get(address_country, {}).get(val, val) address_components[component] = val