[languages] Non-default language canonicals, more test cases
This commit is contained in:
@@ -195,7 +195,8 @@ def disambiguate_language(text, languages):
|
|||||||
elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1:
|
elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1:
|
||||||
possible_lang = lang if possible_lang is None or possible_lang == lang else None
|
possible_lang = lang if possible_lang is None or possible_lang == lang else None
|
||||||
|
|
||||||
if seen_languages and valid and not any((l in seen_languages for l in valid)):
|
if seen_languages and valid and not any((l in seen_languages for l in valid)) and \
|
||||||
|
(not any((valid_languages.get(l) for l in valid)) or any((valid_languages.get(l) for l in seen_languages))):
|
||||||
return AMBIGUOUS_LANGUAGE
|
return AMBIGUOUS_LANGUAGE
|
||||||
|
|
||||||
if len(valid) == 1:
|
if len(valid) == 1:
|
||||||
|
|||||||
@@ -29,13 +29,20 @@ country_test_cases = [
|
|||||||
('No 2 School House', 'us', UNKNOWN_LANGUAGE),
|
('No 2 School House', 'us', UNKNOWN_LANGUAGE),
|
||||||
('E Thetford Rd', 'us', 'en'),
|
('E Thetford Rd', 'us', 'en'),
|
||||||
('El Camino', 'us', 'es'),
|
('El Camino', 'us', 'es'),
|
||||||
|
('The El Camino', 'us', 'en'),
|
||||||
|
('Via Antiqua Street', 'us', 'en'),
|
||||||
|
('Salt Evaporator Plan Road', 'us', 'en'),
|
||||||
|
('Calle Las Brisas North', 'us', 'en'),
|
||||||
|
('Chateau Estates', 'us', 'en'),
|
||||||
|
('Grand Boulevard', 'us', 'en'),
|
||||||
('Rue Louis Phillippe', 'us', 'fr'),
|
('Rue Louis Phillippe', 'us', 'fr'),
|
||||||
('Calle Street', 'us', AMBIGUOUS_LANGUAGE),
|
('Calle Street', 'us', 'en'),
|
||||||
('Del Rio Avenue', 'us', 'en'),
|
('Del Rio Avenue', 'us', 'en'),
|
||||||
('South Signal Butte Road', 'us', 'en'),
|
('South Signal Butte Road', 'us', 'en'),
|
||||||
('Chief All Over', 'us', UNKNOWN_LANGUAGE),
|
('Chief All Over', 'us', UNKNOWN_LANGUAGE),
|
||||||
('South Alameda Street', 'us', 'en'),
|
('South Alameda Street', 'us', 'en'),
|
||||||
('The Alameda', 'us', 'en'),
|
('The Alameda', 'us', 'en'),
|
||||||
|
('Rincon Road', 'us', 'en'),
|
||||||
|
|
||||||
# Avenue + stopword
|
# Avenue + stopword
|
||||||
('Avenue du Bourget-du-Lac', 'je', 'fr'),
|
('Avenue du Bourget-du-Lac', 'je', 'fr'),
|
||||||
|
|||||||
Reference in New Issue
Block a user