diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index d47aa679..bce352a7 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -184,10 +184,12 @@ def disambiguate_language(text, languages): continue is_default = valid_languages[lang] - if canonical or (is_default and len(potentials) == 1): + if (canonical and not stopword) or (is_default and len(potentials) == 1): valid.append(lang) elif is_default and num_defaults > 1 and current_lang != lang: return AMBIGUOUS_LANGUAGE + elif stopword and canonical and not is_default and lang in seen_languages: + valid.append(lang) elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1: possible_lang = lang if possible_lang is None or possible_lang == lang else None diff --git a/scripts/geodata/tests/test_disambiguation.py b/scripts/geodata/tests/test_disambiguation.py index 587106f2..6bea6d0f 100644 --- a/scripts/geodata/tests/test_disambiguation.py +++ b/scripts/geodata/tests/test_disambiguation.py @@ -27,6 +27,7 @@ country_test_cases = [ ('El Camino', 'us', 'es'), ('Rue Louis Phillippe', 'us', 'fr'), ('Calle Street', 'us', AMBIGUOUS_LANGUAGE), + ('Del Rio Avenue', 'us', 'en'), # Avenue + stopword ('Avenue du Bourget-du-Lac', 'je', 'fr'),