diff --git a/resources/dictionaries/ca/street_types.txt b/resources/dictionaries/ca/street_types.txt index 00513454..78040908 100644 --- a/resources/dictionaries/ca/street_types.txt +++ b/resources/dictionaries/ca/street_types.txt @@ -7,6 +7,7 @@ carrera|cra|carra carreró|cró|carrero|cro|carro|carr carretera|ctra cinturó|cint|cinturo +diagonal|diag drecera|drec eix eix diagonal diff --git a/resources/dictionaries/en/stopwords.txt b/resources/dictionaries/en/stopwords.txt index 80d8e611..ade90bdd 100644 --- a/resources/dictionaries/en/stopwords.txt +++ b/resources/dictionaries/en/stopwords.txt @@ -1,3 +1,4 @@ +all and between|betw|btwn|btw|btween|b/t by diff --git a/resources/dictionaries/en/street_types.txt b/resources/dictionaries/en/street_types.txt index fd36c877..621d53aa 100644 --- a/resources/dictionaries/en/street_types.txt +++ b/resources/dictionaries/en/street_types.txt @@ -38,6 +38,7 @@ broadway|bdwy|bway|bwy|brdway brook|brk brow|brw burg|bg +butte|btte|bte bypass|bypa|byps|bps|byp byway|bywy camp|cp @@ -77,6 +78,7 @@ court|ct|crt courts|crts|cts courtyard|cyd|ctyd cove|cov|ce|cv +creek|cr|crk crescent|cr|cres|crs|crecent crest|crst|cst crief|crf @@ -147,6 +149,7 @@ gates|gtes gateway|gwy|gway|gtwy|gtway glade|gl|gld|glde glen|gln +grand boulevard|gbd|grbd|grdbd|gdbd|g bd|gr bd|grd bd|gd bd|g blvd|gr blvd|grd blvd|gd blvd|g bde|gr bde|grd bde|gd bde|g blvrd|gr blvrd|grd blvrd|gd blvrd|g boul|gr boul|grd boul|gd boul|g bvd|gr bvd|grd bvd|gd bvd|g bld|gr bld|grd bld|gd bld grange|gra green|grn|gn ground|grnd diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index bce352a7..c6ed5467 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -184,9 +184,11 @@ def disambiguate_language(text, languages): continue is_default = valid_languages[lang] - if (canonical and not stopword) or (is_default and len(potentials) == 1): + lang_valid = is_default or not seen_languages or lang in seen_languages + + if lang_valid and ((canonical and not stopword) or (is_default and len(potentials) == 1)): valid.append(lang) - elif is_default and num_defaults > 1 and current_lang != lang: + elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang: return AMBIGUOUS_LANGUAGE elif stopword and canonical and not is_default and lang in seen_languages: valid.append(lang) diff --git a/scripts/geodata/tests/test_disambiguation.py b/scripts/geodata/tests/test_disambiguation.py index 6bea6d0f..68897bf4 100644 --- a/scripts/geodata/tests/test_disambiguation.py +++ b/scripts/geodata/tests/test_disambiguation.py @@ -16,10 +16,14 @@ from geodata.language_id.disambiguation import disambiguate_language, street_typ country_test_cases = [ # String, country, expected language + ('Division Street', 'us', 'en'), + ('Kalfarveien', 'no', 'nb'), + ('Upper Glenburn Road', 'gb', 'en'), + ('Zafer Caddesi', 'cy', 'tr'), # US has some Spanish and French street names ('Avenue P', 'us', 'en'), - ('Avenue du', 'us', 'fr'), + ('Avenue du Champs', 'us', 'fr'), ('Avenida de la Plata', 'us', 'es'), ('Pl', 'us', UNKNOWN_LANGUAGE), ('No 2 School House', 'us', UNKNOWN_LANGUAGE), @@ -28,6 +32,10 @@ country_test_cases = [ ('Rue Louis Phillippe', 'us', 'fr'), ('Calle Street', 'us', AMBIGUOUS_LANGUAGE), ('Del Rio Avenue', 'us', 'en'), + ('South Signal Butte Road', 'us', 'en'), + ('Chief All Over', 'us', UNKNOWN_LANGUAGE), + ('South Alameda Street', 'us', 'en'), + ('The Alameda', 'us', 'en'), # Avenue + stopword ('Avenue du Bourget-du-Lac', 'je', 'fr'), @@ -39,7 +47,6 @@ country_test_cases = [ # English / Arabic street address ('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE), - # Random script ('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE), @@ -63,6 +70,8 @@ country_test_cases = [ regional_test_cases = [ # Spain ('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'), + ('Avinguda Diagonal', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'), + ('Avinguda de Filipines - Avenida de Filipinas', 'es', 'qs_a1r', 'Cataluña/Catalunya', AMBIGUOUS_LANGUAGE), ('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'), ('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'), ('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'), @@ -72,7 +81,6 @@ regional_test_cases = [ ('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'), ('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'), - # France / Occitan ('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),