From 69a469d9d35b49d5e3a53282ea0821fc135d0765 Mon Sep 17 00:00:00 2001
From: Al
Date: Tue, 8 Dec 2015 20:38:32 -0500
Subject: [PATCH] [osm] Choosing a language at random in countries with multilingual addresses for the parser training data so we get some monolingual examples

---
Review notes (ignored by git am):
 * The language suffix is now removed by slicing. str.rstrip() strips a set of
   trailing characters, so e.g. 'addr:housename:en'.rstrip(':en') produced
   'addr:housenam' instead of 'addr:housename'.
 * The loop now iterates over a snapshot of the keys, because it inserts new
   keys into the mapping it is walking.
 * Whitespace in this patch was reconstructed from a flattened copy; verify the
   indentation against the target file before applying.

 scripts/geodata/osm/osm_address_training_data.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index 3972bdec..5fe9f7c0 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -458,8 +458,19 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
                 language = candidate_languages[0]['lang']
             else:
                 street = value.get('addr:street', None)
-                if street is not None:
+
+                namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value]
+
+                if street is not None and not namespaced:
                     language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
+                elif namespaced and random.random() < 0.6:
+                    language = random.choice(namespaced)
+                    lang_suffix = ':{}'.format(language)
+                    # Snapshot the keys: entries are added to value inside the loop
+                    for k in list(value):
+                        if k.startswith('addr:') and k.endswith(lang_suffix):
+                            # Slice the suffix off; rstrip() strips a character set, not a suffix
+                            value[k[:-len(lang_suffix)]] = value[k]
                 else:
                     language = UNKNOWN_LANGUAGE
 