[languages] Better handling of non-default language canonicals in default language text

This commit is contained in:
Al
2015-08-24 01:26:01 -04:00
parent 96d7b990b5
commit c1ce91abbf
5 changed files with 20 additions and 5 deletions

View File

@@ -7,6 +7,7 @@ carrera|cra|carra
carreró|cró|carrero|cro|carro|carr carreró|cró|carrero|cro|carro|carr
carretera|ctra carretera|ctra
cinturó|cint|cinturo cinturó|cint|cinturo
diagonal|diag
drecera|drec drecera|drec
eix eix
eix diagonal eix diagonal

View File

@@ -1,3 +1,4 @@
all
and and
between|betw|btwn|btw|btween|b/t between|betw|btwn|btw|btween|b/t
by by

View File

@@ -38,6 +38,7 @@ broadway|bdwy|bway|bwy|brdway
brook|brk brook|brk
brow|brw brow|brw
burg|bg burg|bg
butte|btte|bte
bypass|bypa|byps|bps|byp bypass|bypa|byps|bps|byp
byway|bywy byway|bywy
camp|cp camp|cp
@@ -77,6 +78,7 @@ court|ct|crt
courts|crts|cts courts|crts|cts
courtyard|cyd|ctyd courtyard|cyd|ctyd
cove|cov|ce|cv cove|cov|ce|cv
creek|cr|crk
crescent|cr|cres|crs|crecent crescent|cr|cres|crs|crecent
crest|crst|cst crest|crst|cst
crief|crf crief|crf
@@ -147,6 +149,7 @@ gates|gtes
gateway|gwy|gway|gtwy|gtway gateway|gwy|gway|gtwy|gtway
glade|gl|gld|glde glade|gl|gld|glde
glen|gln glen|gln
grand boulevard|gbd|grbd|grdbd|gdbd|g bd|gr bd|grd bd|gd bd|g blvd|gr blvd|grd blvd|gd blvd|g bde|gr bde|grd bde|gd bde|g blvrd|gr blvrd|grd blvrd|gd blvrd|g boul|gr boul|grd boul|gd boul|g bvd|gr bvd|grd bvd|gd bvd|g bld|gr bld|grd bld|gd bld
grange|gra grange|gra
green|grn|gn green|grn|gn
ground|grnd ground|grnd

View File

@@ -184,9 +184,11 @@ def disambiguate_language(text, languages):
continue continue
is_default = valid_languages[lang] is_default = valid_languages[lang]
if (canonical and not stopword) or (is_default and len(potentials) == 1): lang_valid = is_default or not seen_languages or lang in seen_languages
if lang_valid and ((canonical and not stopword) or (is_default and len(potentials) == 1)):
valid.append(lang) valid.append(lang)
elif is_default and num_defaults > 1 and current_lang != lang: elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
return AMBIGUOUS_LANGUAGE return AMBIGUOUS_LANGUAGE
elif stopword and canonical and not is_default and lang in seen_languages: elif stopword and canonical and not is_default and lang in seen_languages:
valid.append(lang) valid.append(lang)

View File

@@ -16,10 +16,14 @@ from geodata.language_id.disambiguation import disambiguate_language, street_typ
country_test_cases = [ country_test_cases = [
# String, country, expected language # String, country, expected language
('Division Street', 'us', 'en'),
('Kalfarveien', 'no', 'nb'),
('Upper Glenburn Road', 'gb', 'en'),
('Zafer Caddesi', 'cy', 'tr'),
# US has some Spanish and French street names # US has some Spanish and French street names
('Avenue P', 'us', 'en'), ('Avenue P', 'us', 'en'),
('Avenue du', 'us', 'fr'), ('Avenue du Champs', 'us', 'fr'),
('Avenida de la Plata', 'us', 'es'), ('Avenida de la Plata', 'us', 'es'),
('Pl', 'us', UNKNOWN_LANGUAGE), ('Pl', 'us', UNKNOWN_LANGUAGE),
('No 2 School House', 'us', UNKNOWN_LANGUAGE), ('No 2 School House', 'us', UNKNOWN_LANGUAGE),
@@ -28,6 +32,10 @@ country_test_cases = [
('Rue Louis Phillippe', 'us', 'fr'), ('Rue Louis Phillippe', 'us', 'fr'),
('Calle Street', 'us', AMBIGUOUS_LANGUAGE), ('Calle Street', 'us', AMBIGUOUS_LANGUAGE),
('Del Rio Avenue', 'us', 'en'), ('Del Rio Avenue', 'us', 'en'),
('South Signal Butte Road', 'us', 'en'),
('Chief All Over', 'us', UNKNOWN_LANGUAGE),
('South Alameda Street', 'us', 'en'),
('The Alameda', 'us', 'en'),
# Avenue + stopword # Avenue + stopword
('Avenue du Bourget-du-Lac', 'je', 'fr'), ('Avenue du Bourget-du-Lac', 'je', 'fr'),
@@ -39,7 +47,6 @@ country_test_cases = [
# English / Arabic street address # English / Arabic street address
('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE), ('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE),
# Random script # Random script
('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE), ('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE),
@@ -63,6 +70,8 @@ country_test_cases = [
regional_test_cases = [ regional_test_cases = [
# Spain # Spain
('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'), ('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
('Avinguda Diagonal', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
('Avinguda de Filipines - Avenida de Filipinas', 'es', 'qs_a1r', 'Cataluña/Catalunya', AMBIGUOUS_LANGUAGE),
('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'), ('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'),
('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'), ('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'),
('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'), ('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'),
@@ -72,7 +81,6 @@ regional_test_cases = [
('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'), ('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'),
('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'), ('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'),
# France / Occitan # France / Occitan
('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'), ('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),