From f8a0463aa093eda56e60c5484b739bec16dcb4a2 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Jan 2016 15:09:51 -0500 Subject: [PATCH] [languages] Language disambiguation treats the national languages as non-default --- scripts/geodata/language_id/disambiguation.py | 2 +- scripts/geodata/language_id/polygon_lookup.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index 6023c82c..425b7dbc 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -18,7 +18,7 @@ from geodata.text.tokenize import tokenize WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt']) # For toponyms, we want to limit the countries we consider to those where -# we the place names can themselves be considered training examples of the language +# the place names can themselves be considered training examples of the language WELL_REPRESENTED_LANGUAGE_COUNTRIES = { 'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']), 'fr': set(['fr']), diff --git a/scripts/geodata/language_id/polygon_lookup.py b/scripts/geodata/language_id/polygon_lookup.py index f2642d6a..eb3879cf 100644 --- a/scripts/geodata/language_id/polygon_lookup.py +++ b/scripts/geodata/language_id/polygon_lookup.py @@ -10,11 +10,17 @@ def country_and_languages(language_rtree, latitude, longitude): languages = [] language_set = set() + have_regional = False + for p in props: for l in p['languages']: lang = l['lang'] if lang not in language_set: language_set.add(lang) + if p['admin_level'] > 0: + have_regional = True + elif have_regional: + l = {'lang': l['lang'], 'default': 0} languages.append(l) # Python's builtin sort is stable, so if there are two defaults, the first remains first