diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 5fe9f7c0..88dac22a 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -635,8 +635,13 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood seen.add((component, name)) for component, vals in poly_components.iteritems(): - if component not in address_components or non_local_language and random.random() < 0.4: - val = u', '.join(vals) + if component not in address_components or (non_local_language and random.random() < 0.4): + if component == AddressFormatter.STATE_DISTRICT and random.random() < 0.5: + num = random.randrange(1, len(vals) + 1) + val = u', '.join(vals[:num]) + else: + val = random.choice(vals) + if component == AddressFormatter.STATE and random.random() < 0.7: val = STATE_EXPANSIONS.get(address_country, {}).get(val, val) address_components[component] = val