[osm] Choosing a language at random in countries with multilingual addresses for the parser training data so we get some monolingual examples
This commit is contained in:
@@ -458,8 +458,17 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
|
|||||||
language = candidate_languages[0]['lang']
|
language = candidate_languages[0]['lang']
|
||||||
else:
|
else:
|
||||||
street = value.get('addr:street', None)
|
street = value.get('addr:street', None)
|
||||||
if street is not None:
|
|
||||||
|
namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value]
|
||||||
|
|
||||||
|
if street is not None and not namespaced:
|
||||||
language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
|
language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
|
||||||
|
elif namespaced and random.random() < 0.6:
|
||||||
|
language = random.choice(namespaced)
|
||||||
|
lang_suffix = ':{}'.format(language)
|
||||||
|
for k in value:
|
||||||
|
if k.startswith('addr:') and k.endswith(lang_suffix):
|
||||||
|
value[k.rstrip(lang_suffix)] = value[k]
|
||||||
else:
|
else:
|
||||||
language = UNKNOWN_LANGUAGE
|
language = UNKNOWN_LANGUAGE
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user