[languages] Better handling of non-default language canonicals in default language text

This commit is contained in:
Al
2015-08-24 01:26:01 -04:00
parent 96d7b990b5
commit c1ce91abbf
5 changed files with 20 additions and 5 deletions

View File

@@ -7,6 +7,7 @@ carrera|cra|carra
carreró|cró|carrero|cro|carro|carr
carretera|ctra
cinturó|cint|cinturo
diagonal|diag
drecera|drec
eix
eix diagonal

View File

@@ -1,3 +1,4 @@
all
and
between|betw|btwn|btw|btween|b/t
by

View File

@@ -38,6 +38,7 @@ broadway|bdwy|bway|bwy|brdway
brook|brk
brow|brw
burg|bg
butte|btte|bte
bypass|bypa|byps|bps|byp
byway|bywy
camp|cp
@@ -77,6 +78,7 @@ court|ct|crt
courts|crts|cts
courtyard|cyd|ctyd
cove|cov|ce|cv
creek|cr|crk
crescent|cr|cres|crs|crecent
crest|crst|cst
crief|crf
@@ -147,6 +149,7 @@ gates|gtes
gateway|gwy|gway|gtwy|gtway
glade|gl|gld|glde
glen|gln
grand boulevard|gbd|grbd|grdbd|gdbd|g bd|gr bd|grd bd|gd bd|g blvd|gr blvd|grd blvd|gd blvd|g bde|gr bde|grd bde|gd bde|g blvrd|gr blvrd|grd blvrd|gd blvrd|g boul|gr boul|grd boul|gd boul|g bvd|gr bvd|grd bvd|gd bvd|g bld|gr bld|grd bld|gd bld
grange|gra
green|grn|gn
ground|grnd

View File

@@ -184,9 +184,11 @@ def disambiguate_language(text, languages):
continue
is_default = valid_languages[lang]
if (canonical and not stopword) or (is_default and len(potentials) == 1):
lang_valid = is_default or not seen_languages or lang in seen_languages
if lang_valid and ((canonical and not stopword) or (is_default and len(potentials) == 1)):
valid.append(lang)
elif is_default and num_defaults > 1 and current_lang != lang:
elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
return AMBIGUOUS_LANGUAGE
elif stopword and canonical and not is_default and lang in seen_languages:
valid.append(lang)

View File

@@ -16,10 +16,14 @@ from geodata.language_id.disambiguation import disambiguate_language, street_typ
country_test_cases = [
# String, country, expected language
('Division Street', 'us', 'en'),
('Kalfarveien', 'no', 'nb'),
('Upper Glenburn Road', 'gb', 'en'),
('Zafer Caddesi', 'cy', 'tr'),
# US has some Spanish and French street names
('Avenue P', 'us', 'en'),
('Avenue du', 'us', 'fr'),
('Avenue du Champs', 'us', 'fr'),
('Avenida de la Plata', 'us', 'es'),
('Pl', 'us', UNKNOWN_LANGUAGE),
('No 2 School House', 'us', UNKNOWN_LANGUAGE),
@@ -28,6 +32,10 @@ country_test_cases = [
('Rue Louis Phillippe', 'us', 'fr'),
('Calle Street', 'us', AMBIGUOUS_LANGUAGE),
('Del Rio Avenue', 'us', 'en'),
('South Signal Butte Road', 'us', 'en'),
('Chief All Over', 'us', UNKNOWN_LANGUAGE),
('South Alameda Street', 'us', 'en'),
('The Alameda', 'us', 'en'),
# Avenue + stopword
('Avenue du Bourget-du-Lac', 'je', 'fr'),
@@ -39,7 +47,6 @@ country_test_cases = [
# English / Arabic street address
('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE),
# Random script
('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE),
@@ -63,6 +70,8 @@ country_test_cases = [
regional_test_cases = [
# Spain
('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
('Avinguda Diagonal', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
('Avinguda de Filipines - Avenida de Filipinas', 'es', 'qs_a1r', 'Cataluña/Catalunya', AMBIGUOUS_LANGUAGE),
('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'),
('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'),
('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'),
@@ -72,7 +81,6 @@ regional_test_cases = [
('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'),
('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'),
# France / Occitan
('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),