From 5af95ee613a2b3c193e4b8ec1f44d08e4b65777e Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 3 Dec 2015 18:00:05 -0500 Subject: [PATCH] [osm] Adding GeoNames abbreviated city names in a small percentage of cases to get variations like NYC, BK, SF, etc. in the training data --- scripts/geodata/osm/osm_address_training_data.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 774c05fe..9cccc2e8 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -642,7 +642,13 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood if not names or lang not in names: continue - city = names[lang][0][0] + if 'abbr' not in names or non_local_language: + # Use the common city name in the target language + city = names[lang][0][0] + elif random.random() < 0.1: + # Use an abbreviation: NYC, BK, SF, etc. + city = random.choice(names['abbr'])[0] + if not city or not city.strip(): continue address_components[AddressFormatter.CITY] = city