[languages] Non-default language canonicals, more test cases

This commit is contained in:
Al
2015-08-24 02:21:53 -04:00
parent c1ce91abbf
commit e1d336716c
2 changed files with 10 additions and 2 deletions

View File

@@ -195,7 +195,8 @@ def disambiguate_language(text, languages):
elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1:
possible_lang = lang if possible_lang is None or possible_lang == lang else None
if seen_languages and valid and not any((l in seen_languages for l in valid)):
if seen_languages and valid and not any((l in seen_languages for l in valid)) and \
(not any((valid_languages.get(l) for l in valid)) or any((valid_languages.get(l) for l in seen_languages))):
return AMBIGUOUS_LANGUAGE
if len(valid) == 1:

View File

@@ -29,13 +29,20 @@ country_test_cases = [
('No 2 School House', 'us', UNKNOWN_LANGUAGE),
('E Thetford Rd', 'us', 'en'),
('El Camino', 'us', 'es'),
('The El Camino', 'us', 'en'),
('Via Antiqua Street', 'us', 'en'),
('Salt Evaporator Plan Road', 'us', 'en'),
('Calle Las Brisas North', 'us', 'en'),
('Chateau Estates', 'us', 'en'),
('Grand Boulevard', 'us', 'en'),
('Rue Louis Phillippe', 'us', 'fr'),
('Calle Street', 'us', AMBIGUOUS_LANGUAGE),
('Calle Street', 'us', 'en'),
('Del Rio Avenue', 'us', 'en'),
('South Signal Butte Road', 'us', 'en'),
('Chief All Over', 'us', UNKNOWN_LANGUAGE),
('South Alameda Street', 'us', 'en'),
('The Alameda', 'us', 'en'),
('Rincon Road', 'us', 'en'),
# Avenue + stopword
('Avenue du Bourget-du-Lac', 'je', 'fr'),