[fix] ISO code and simple/international name checks should be on the polygons

This commit is contained in:
Al
2015-11-23 14:30:38 -05:00
parent eb7488ab55
commit e46e1a93a0

View File

@@ -392,7 +392,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
remove_keys = OSM_IGNORE_KEYS
for key, value, deps in parse_osm(infile):
for node_id, value, deps in parse_osm(infile):
try:
latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
except Exception:
@@ -526,49 +526,50 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
poly_components = defaultdict(list)
for component, values in osm_components.iteritems():
for component, components_values in osm_components.iteritems():
seen = set()
# Choose which name to use with given probabilities
r = random.random()
if iso_code_key in value and r < 0.3:
if r < 0.1 and iso_code3_key in value:
key = iso_code3_key
else:
key = iso_code_key
elif language == 'en' and not non_local_language and r < 0.7:
# Particularly to address the US (prefer United States,
# not United States of America) but may capture variations
# in other English-speaking countries as well.
if simple_name_key in value:
key = raw_key = simple_name_key
elif international_name_key in value:
key = raw_key = international_name_key
else:
key = name_key
raw_key = raw_name_key
elif r < 0.1:
# 10% of the time use the short name
key = short_name_key
raw_key = raw_short_name_key
elif r < 0.2:
# 10% of the time use the official name
key = official_name_key
raw_key = raw_official_name_key
elif r < 0.3:
# 10% of the time use the alt name
key = alt_name_key
raw_key = raw_alt_name_key
else:
if r < 0.7:
# 70% of the time use the name tag
key = name_key
raw_key = raw_name_key
elif r < 0.8:
# 10% of the time use the short name
key = short_name_key
raw_key = raw_short_name_key
elif r < 0.9:
# 10% of the time use the official name
key = official_name_key
raw_key = raw_official_name_key
else:
# 10% of the time use the alt name
key = alt_name_key
raw_key = raw_alt_name_key
for value in values:
name = value.get(key, value.get(raw_key))
for component_value in components_values:
r = random.random()
name = None
if iso_code3_key in component_value and r < 0.1:
name = component_value[iso_code3_key]
elif iso_code_key in component_value and r < 0.3:
name = component_value[iso_code_key]
elif language == 'en' and not non_local_language and r < 0.7:
# Particularly to address the US (prefer United States,
# not United States of America) but may capture variations
# in other English-speaking countries as well.
if simple_name_key in component_value:
name = component_value[simple_name_key]
elif international_name_key in component_value:
name = component_value[international_name_key]
if not name:
name = value.get(name_key, value.get(raw_name_key))
name = component_value.get(key, component_value.get(raw_key))
if not name:
name = component_value.get(name_key, component_value.get(raw_name_key))
if not name:
continue