[languages] Better handling of non-default language canonicals in default language text
This commit is contained in:
@@ -7,6 +7,7 @@ carrera|cra|carra
|
||||
carreró|cró|carrero|cro|carro|carr
|
||||
carretera|ctra
|
||||
cinturó|cint|cinturo
|
||||
diagonal|diag
|
||||
drecera|drec
|
||||
eix
|
||||
eix diagonal
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
all
|
||||
and
|
||||
between|betw|btwn|btw|btween|b/t
|
||||
by
|
||||
|
||||
@@ -38,6 +38,7 @@ broadway|bdwy|bway|bwy|brdway
|
||||
brook|brk
|
||||
brow|brw
|
||||
burg|bg
|
||||
butte|btte|bte
|
||||
bypass|bypa|byps|bps|byp
|
||||
byway|bywy
|
||||
camp|cp
|
||||
@@ -77,6 +78,7 @@ court|ct|crt
|
||||
courts|crts|cts
|
||||
courtyard|cyd|ctyd
|
||||
cove|cov|ce|cv
|
||||
creek|cr|crk
|
||||
crescent|cr|cres|crs|crecent
|
||||
crest|crst|cst
|
||||
crief|crf
|
||||
@@ -147,6 +149,7 @@ gates|gtes
|
||||
gateway|gwy|gway|gtwy|gtway
|
||||
glade|gl|gld|glde
|
||||
glen|gln
|
||||
grand boulevard|gbd|grbd|grdbd|gdbd|g bd|gr bd|grd bd|gd bd|g blvd|gr blvd|grd blvd|gd blvd|g bde|gr bde|grd bde|gd bde|g blvrd|gr blvrd|grd blvrd|gd blvrd|g boul|gr boul|grd boul|gd boul|g bvd|gr bvd|grd bvd|gd bvd|g bld|gr bld|grd bld|gd bld
|
||||
grange|gra
|
||||
green|grn|gn
|
||||
ground|grnd
|
||||
|
||||
@@ -184,9 +184,11 @@ def disambiguate_language(text, languages):
|
||||
continue
|
||||
is_default = valid_languages[lang]
|
||||
|
||||
if (canonical and not stopword) or (is_default and len(potentials) == 1):
|
||||
lang_valid = is_default or not seen_languages or lang in seen_languages
|
||||
|
||||
if lang_valid and ((canonical and not stopword) or (is_default and len(potentials) == 1)):
|
||||
valid.append(lang)
|
||||
elif is_default and num_defaults > 1 and current_lang != lang:
|
||||
elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
|
||||
return AMBIGUOUS_LANGUAGE
|
||||
elif stopword and canonical and not is_default and lang in seen_languages:
|
||||
valid.append(lang)
|
||||
|
||||
@@ -16,10 +16,14 @@ from geodata.language_id.disambiguation import disambiguate_language, street_typ
|
||||
|
||||
country_test_cases = [
|
||||
# String, country, expected language
|
||||
('Division Street', 'us', 'en'),
|
||||
('Kalfarveien', 'no', 'nb'),
|
||||
('Upper Glenburn Road', 'gb', 'en'),
|
||||
('Zafer Caddesi', 'cy', 'tr'),
|
||||
|
||||
# US has some Spanish and French street names
|
||||
('Avenue P', 'us', 'en'),
|
||||
('Avenue du', 'us', 'fr'),
|
||||
('Avenue du Champs', 'us', 'fr'),
|
||||
('Avenida de la Plata', 'us', 'es'),
|
||||
('Pl', 'us', UNKNOWN_LANGUAGE),
|
||||
('No 2 School House', 'us', UNKNOWN_LANGUAGE),
|
||||
@@ -28,6 +32,10 @@ country_test_cases = [
|
||||
('Rue Louis Phillippe', 'us', 'fr'),
|
||||
('Calle Street', 'us', AMBIGUOUS_LANGUAGE),
|
||||
('Del Rio Avenue', 'us', 'en'),
|
||||
('South Signal Butte Road', 'us', 'en'),
|
||||
('Chief All Over', 'us', UNKNOWN_LANGUAGE),
|
||||
('South Alameda Street', 'us', 'en'),
|
||||
('The Alameda', 'us', 'en'),
|
||||
|
||||
# Avenue + stopword
|
||||
('Avenue du Bourget-du-Lac', 'je', 'fr'),
|
||||
@@ -39,7 +47,6 @@ country_test_cases = [
|
||||
# English / Arabic street address
|
||||
('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE),
|
||||
|
||||
|
||||
# Random script
|
||||
('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE),
|
||||
|
||||
@@ -63,6 +70,8 @@ country_test_cases = [
|
||||
regional_test_cases = [
|
||||
# Spain
|
||||
('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
|
||||
('Avinguda Diagonal', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
|
||||
('Avinguda de Filipines - Avenida de Filipinas', 'es', 'qs_a1r', 'Cataluña/Catalunya', AMBIGUOUS_LANGUAGE),
|
||||
('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'),
|
||||
('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'),
|
||||
('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'),
|
||||
@@ -72,7 +81,6 @@ regional_test_cases = [
|
||||
('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'),
|
||||
('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'),
|
||||
|
||||
|
||||
# France / Occitan
|
||||
('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),
|
||||
|
||||
|
||||
Reference in New Issue
Block a user