[languages] Better handling of non-default language canonicals in default language text
This commit is contained in:
@@ -7,6 +7,7 @@ carrera|cra|carra
|
|||||||
carreró|cró|carrero|cro|carro|carr
|
carreró|cró|carrero|cro|carro|carr
|
||||||
carretera|ctra
|
carretera|ctra
|
||||||
cinturó|cint|cinturo
|
cinturó|cint|cinturo
|
||||||
|
diagonal|diag
|
||||||
drecera|drec
|
drecera|drec
|
||||||
eix
|
eix
|
||||||
eix diagonal
|
eix diagonal
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
all
|
||||||
and
|
and
|
||||||
between|betw|btwn|btw|btween|b/t
|
between|betw|btwn|btw|btween|b/t
|
||||||
by
|
by
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ broadway|bdwy|bway|bwy|brdway
|
|||||||
brook|brk
|
brook|brk
|
||||||
brow|brw
|
brow|brw
|
||||||
burg|bg
|
burg|bg
|
||||||
|
butte|btte|bte
|
||||||
bypass|bypa|byps|bps|byp
|
bypass|bypa|byps|bps|byp
|
||||||
byway|bywy
|
byway|bywy
|
||||||
camp|cp
|
camp|cp
|
||||||
@@ -77,6 +78,7 @@ court|ct|crt
|
|||||||
courts|crts|cts
|
courts|crts|cts
|
||||||
courtyard|cyd|ctyd
|
courtyard|cyd|ctyd
|
||||||
cove|cov|ce|cv
|
cove|cov|ce|cv
|
||||||
|
creek|cr|crk
|
||||||
crescent|cr|cres|crs|crecent
|
crescent|cr|cres|crs|crecent
|
||||||
crest|crst|cst
|
crest|crst|cst
|
||||||
crief|crf
|
crief|crf
|
||||||
@@ -147,6 +149,7 @@ gates|gtes
|
|||||||
gateway|gwy|gway|gtwy|gtway
|
gateway|gwy|gway|gtwy|gtway
|
||||||
glade|gl|gld|glde
|
glade|gl|gld|glde
|
||||||
glen|gln
|
glen|gln
|
||||||
|
grand boulevard|gbd|grbd|grdbd|gdbd|g bd|gr bd|grd bd|gd bd|g blvd|gr blvd|grd blvd|gd blvd|g bde|gr bde|grd bde|gd bde|g blvrd|gr blvrd|grd blvrd|gd blvrd|g boul|gr boul|grd boul|gd boul|g bvd|gr bvd|grd bvd|gd bvd|g bld|gr bld|grd bld|gd bld
|
||||||
grange|gra
|
grange|gra
|
||||||
green|grn|gn
|
green|grn|gn
|
||||||
ground|grnd
|
ground|grnd
|
||||||
|
|||||||
@@ -184,9 +184,11 @@ def disambiguate_language(text, languages):
|
|||||||
continue
|
continue
|
||||||
is_default = valid_languages[lang]
|
is_default = valid_languages[lang]
|
||||||
|
|
||||||
if (canonical and not stopword) or (is_default and len(potentials) == 1):
|
lang_valid = is_default or not seen_languages or lang in seen_languages
|
||||||
|
|
||||||
|
if lang_valid and ((canonical and not stopword) or (is_default and len(potentials) == 1)):
|
||||||
valid.append(lang)
|
valid.append(lang)
|
||||||
elif is_default and num_defaults > 1 and current_lang != lang:
|
elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
|
||||||
return AMBIGUOUS_LANGUAGE
|
return AMBIGUOUS_LANGUAGE
|
||||||
elif stopword and canonical and not is_default and lang in seen_languages:
|
elif stopword and canonical and not is_default and lang in seen_languages:
|
||||||
valid.append(lang)
|
valid.append(lang)
|
||||||
|
|||||||
@@ -16,10 +16,14 @@ from geodata.language_id.disambiguation import disambiguate_language, street_typ
|
|||||||
|
|
||||||
country_test_cases = [
|
country_test_cases = [
|
||||||
# String, country, expected language
|
# String, country, expected language
|
||||||
|
('Division Street', 'us', 'en'),
|
||||||
|
('Kalfarveien', 'no', 'nb'),
|
||||||
|
('Upper Glenburn Road', 'gb', 'en'),
|
||||||
|
('Zafer Caddesi', 'cy', 'tr'),
|
||||||
|
|
||||||
# US has some Spanish and French street names
|
# US has some Spanish and French street names
|
||||||
('Avenue P', 'us', 'en'),
|
('Avenue P', 'us', 'en'),
|
||||||
('Avenue du', 'us', 'fr'),
|
('Avenue du Champs', 'us', 'fr'),
|
||||||
('Avenida de la Plata', 'us', 'es'),
|
('Avenida de la Plata', 'us', 'es'),
|
||||||
('Pl', 'us', UNKNOWN_LANGUAGE),
|
('Pl', 'us', UNKNOWN_LANGUAGE),
|
||||||
('No 2 School House', 'us', UNKNOWN_LANGUAGE),
|
('No 2 School House', 'us', UNKNOWN_LANGUAGE),
|
||||||
@@ -28,6 +32,10 @@ country_test_cases = [
|
|||||||
('Rue Louis Phillippe', 'us', 'fr'),
|
('Rue Louis Phillippe', 'us', 'fr'),
|
||||||
('Calle Street', 'us', AMBIGUOUS_LANGUAGE),
|
('Calle Street', 'us', AMBIGUOUS_LANGUAGE),
|
||||||
('Del Rio Avenue', 'us', 'en'),
|
('Del Rio Avenue', 'us', 'en'),
|
||||||
|
('South Signal Butte Road', 'us', 'en'),
|
||||||
|
('Chief All Over', 'us', UNKNOWN_LANGUAGE),
|
||||||
|
('South Alameda Street', 'us', 'en'),
|
||||||
|
('The Alameda', 'us', 'en'),
|
||||||
|
|
||||||
# Avenue + stopword
|
# Avenue + stopword
|
||||||
('Avenue du Bourget-du-Lac', 'je', 'fr'),
|
('Avenue du Bourget-du-Lac', 'je', 'fr'),
|
||||||
@@ -39,7 +47,6 @@ country_test_cases = [
|
|||||||
# English / Arabic street address
|
# English / Arabic street address
|
||||||
('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE),
|
('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE),
|
||||||
|
|
||||||
|
|
||||||
# Random script
|
# Random script
|
||||||
('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE),
|
('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE),
|
||||||
|
|
||||||
@@ -63,6 +70,8 @@ country_test_cases = [
|
|||||||
regional_test_cases = [
|
regional_test_cases = [
|
||||||
# Spain
|
# Spain
|
||||||
('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
|
('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
|
||||||
|
('Avinguda Diagonal', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
|
||||||
|
('Avinguda de Filipines - Avenida de Filipinas', 'es', 'qs_a1r', 'Cataluña/Catalunya', AMBIGUOUS_LANGUAGE),
|
||||||
('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'),
|
('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'),
|
||||||
('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'),
|
('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'),
|
||||||
('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'),
|
('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'),
|
||||||
@@ -72,7 +81,6 @@ regional_test_cases = [
|
|||||||
('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'),
|
('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'),
|
||||||
('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'),
|
('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'),
|
||||||
|
|
||||||
|
|
||||||
# France / Occitan
|
# France / Occitan
|
||||||
('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),
|
('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user