From 49ac3dc553c29c197ec333dd1f358a54bcf9343c Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 28 Apr 2016 17:23:00 -0400
Subject: [PATCH] [disambiguation] Adding best_country_and_language

---
 scripts/geodata/language_id/polygon_lookup.py | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/scripts/geodata/language_id/polygon_lookup.py b/scripts/geodata/language_id/polygon_lookup.py
index 351fd549..32281d3a 100644
--- a/scripts/geodata/language_id/polygon_lookup.py
+++ b/scripts/geodata/language_id/polygon_lookup.py
@@ -1,5 +1,7 @@
 import operator
 
+from geodata.language_id.disambiguation import disambiguate_language, UNKNOWN_LANGUAGE, WELL_REPRESENTED_LANGUAGES
+
 
 def country_and_languages(language_rtree, latitude, longitude):
     props = language_rtree.point_in_poly(latitude, longitude, return_all=True)
@@ -28,3 +30,62 @@ def country_and_languages(language_rtree, latitude, longitude):
     # it means the default language of the region overrides the country default
     default_languages = sorted(languages, key=operator.itemgetter('default'), reverse=True)
     return country, default_languages, props
+
+
+def best_country_and_language(language_rtree, latitude, longitude, name):
+    """Return the country and single best-guess language for a named place.
+
+    Candidate languages for the point come from country_and_languages(). A
+    lone candidate is returned directly; otherwise disambiguate_language()
+    is asked to choose between the candidates based on the name.
+
+    :param language_rtree: polygon index, passed to country_and_languages()
+    :param latitude: latitude of the point to look up
+    :param longitude: longitude of the point to look up
+    :param name: text passed to disambiguate_language()
+    :return: (country, language) tuple. The language is UNKNOWN_LANGUAGE when
+        the disambiguated language is rejected by the final check in the body.
+        (None, None) is returned when the lookup yields no country or no
+        languages, or when the name cannot be disambiguated and there is not
+        exactly one default language.
+    """
+    country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
+    if not (country and candidate_languages):
+        return None, None
+
+    # country_and_languages() sorts default languages to the front
+    default_lang = candidate_languages[0]['lang']
+
+    # A single candidate needs no disambiguation (and no polygon tallies)
+    if len(candidate_languages) == 1:
+        return country, default_lang
+
+    num_defaults = len(set(l['lang'] for l in candidate_languages if l.get('default')))
+
+    # Tally default languages by polygon level; admin_level > 0 is counted
+    # as regional, anything else as country-level
+    regional_defaults = 0
+    country_defaults = 0
+    country_langs = set()
+    for p in language_props:
+        p_defaults = sum(1 for l in p['languages'] if l.get('default'))
+        if p['admin_level'] > 0:
+            regional_defaults += p_defaults
+        else:
+            country_defaults += p_defaults
+            country_langs.update(l['lang'] for l in p['languages'])
+
+    lang = disambiguate_language(name, [(l['lang'], l['default']) for l in candidate_languages])
+
+    if lang == UNKNOWN_LANGUAGE:
+        # Fall back to the top default only when it is the sole default
+        if num_defaults == 1:
+            return country, default_lang
+        return None, None
+
+    # NOTE(review): presumably a well-represented country-level language is
+    # not trusted over the top default inside such a region - confirm intent
+    if (lang != default_lang and lang in country_langs and country_defaults > 1 and
+            regional_defaults > 0 and lang in WELL_REPRESENTED_LANGUAGES):
+        return country, UNKNOWN_LANGUAGE
+    return country, lang