diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index c6ed5467..8fbc1ae1 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -195,7 +195,8 @@ def disambiguate_language(text, languages): elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1: possible_lang = lang if possible_lang is None or possible_lang == lang else None - if seen_languages and valid and not any((l in seen_languages for l in valid)): + if seen_languages and valid and not any((l in seen_languages for l in valid)) and \ + (not any((valid_languages.get(l) for l in valid)) or any((valid_languages.get(l) for l in seen_languages))): return AMBIGUOUS_LANGUAGE if len(valid) == 1: diff --git a/scripts/geodata/tests/test_disambiguation.py b/scripts/geodata/tests/test_disambiguation.py index 68897bf4..d9e90657 100644 --- a/scripts/geodata/tests/test_disambiguation.py +++ b/scripts/geodata/tests/test_disambiguation.py @@ -29,13 +29,20 @@ country_test_cases = [ ('No 2 School House', 'us', UNKNOWN_LANGUAGE), ('E Thetford Rd', 'us', 'en'), ('El Camino', 'us', 'es'), + ('The El Camino', 'us', 'en'), + ('Via Antiqua Street', 'us', 'en'), + ('Salt Evaporator Plan Road', 'us', 'en'), + ('Calle Las Brisas North', 'us', 'en'), + ('Chateau Estates', 'us', 'en'), + ('Grand Boulevard', 'us', 'en'), ('Rue Louis Phillippe', 'us', 'fr'), - ('Calle Street', 'us', AMBIGUOUS_LANGUAGE), + ('Calle Street', 'us', 'en'), ('Del Rio Avenue', 'us', 'en'), ('South Signal Butte Road', 'us', 'en'), ('Chief All Over', 'us', UNKNOWN_LANGUAGE), ('South Alameda Street', 'us', 'en'), ('The Alameda', 'us', 'en'), + ('Rincon Road', 'us', 'en'), # Avenue + stopword ('Avenue du Bourget-du-Lac', 'je', 'fr'),