[languages] Language disambiguation treats national languages as non-default when regional languages are present
This commit is contained in:
@@ -18,7 +18,7 @@ from geodata.text.tokenize import tokenize
|
|||||||
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
|
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
|
||||||
|
|
||||||
# For toponyms, we want to limit the countries we consider to those where
|
# For toponyms, we want to limit the countries we consider to those where
|
||||||
# we the place names can themselves be considered training examples of the language
|
# the place names can themselves be considered training examples of the language
|
||||||
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
|
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
|
||||||
'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
|
'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
|
||||||
'fr': set(['fr']),
|
'fr': set(['fr']),
|
||||||
|
|||||||
@@ -10,11 +10,17 @@ def country_and_languages(language_rtree, latitude, longitude):
|
|||||||
languages = []
|
languages = []
|
||||||
language_set = set()
|
language_set = set()
|
||||||
|
|
||||||
|
have_regional = False
|
||||||
|
|
||||||
for p in props:
|
for p in props:
|
||||||
for l in p['languages']:
|
for l in p['languages']:
|
||||||
lang = l['lang']
|
lang = l['lang']
|
||||||
if lang not in language_set:
|
if lang not in language_set:
|
||||||
language_set.add(lang)
|
language_set.add(lang)
|
||||||
|
if p['admin_level'] > 0:
|
||||||
|
have_regional = True
|
||||||
|
elif have_regional:
|
||||||
|
l = {'lang': l['lang'], 'default': 0}
|
||||||
languages.append(l)
|
languages.append(l)
|
||||||
|
|
||||||
# Python's builtin sort is stable, so if there are two defaults, the first remains first
|
# Python's builtin sort is stable, so if there are two defaults, the first remains first
|
||||||
|
|||||||
Reference in New Issue
Block a user