125 lines
4.1 KiB
Python
125 lines
4.1 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
import os
|
|
import sys
|
|
import unittest
|
|
|
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
|
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
|
|
|
from geodata.i18n.languages import init_languages, get_country_languages, get_regional_languages
|
|
from geodata.language_id.disambiguation import disambiguate_language, street_types_gazetteer, UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE
|
|
|
|
|
|
country_test_cases = [
|
|
# String, country, expected language
|
|
('Division Street', 'us', 'en'),
|
|
('Kalfarveien', 'no', 'nb'),
|
|
('Upper Glenburn Road', 'gb', 'en'),
|
|
('Zafer Caddesi', 'cy', 'tr'),
|
|
|
|
# US has some Spanish and French street names
|
|
('Avenue P', 'us', 'en'),
|
|
('Avenue du Champs', 'us', 'fr'),
|
|
('Avenida de la Plata', 'us', 'es'),
|
|
('Pl', 'us', UNKNOWN_LANGUAGE),
|
|
('No 2 School House', 'us', UNKNOWN_LANGUAGE),
|
|
('E Thetford Rd', 'us', 'en'),
|
|
('El Camino', 'us', 'es'),
|
|
('The El Camino', 'us', 'en'),
|
|
('Via Antiqua Street', 'us', 'en'),
|
|
('Salt Evaporator Plan Road', 'us', 'en'),
|
|
('Calle Las Brisas North', 'us', 'en'),
|
|
('Chateau Estates', 'us', 'en'),
|
|
('Grand Boulevard', 'us', 'en'),
|
|
('Rue Louis Phillippe', 'us', 'fr'),
|
|
('Calle Street', 'us', 'en'),
|
|
('Del Rio Avenue', 'us', 'en'),
|
|
('South Signal Butte Road', 'us', 'en'),
|
|
('Chief All Over', 'us', UNKNOWN_LANGUAGE),
|
|
('South Alameda Street', 'us', 'en'),
|
|
('The Alameda', 'us', 'en'),
|
|
('Rincon Road', 'us', 'en'),
|
|
|
|
# Avenue + stopword
|
|
('Avenue du Bourget-du-Lac', 'je', 'fr'),
|
|
|
|
# UAE, English is non-default, has abbreviation
|
|
('128 A St', 'ae', 'en'),
|
|
('128 A St.', 'ae', 'en'),
|
|
|
|
# English / Arabic street address
|
|
('Omar Street شارع عمر', 'iq', AMBIGUOUS_LANGUAGE),
|
|
|
|
# Random script
|
|
('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE),
|
|
|
|
|
|
# Brussels address
|
|
('Avenue Paul Héger - Paul Hégerlaan', 'be', AMBIGUOUS_LANGUAGE),
|
|
('Smaragdstraat', 'be', 'nl'),
|
|
|
|
|
|
# India
|
|
('Kidwai nagar', 'in', 'hi'),
|
|
('Mavoor Rd.', 'in', 'en'),
|
|
|
|
# Sri Lanka
|
|
('Sri Sadathissa Mawatha', 'lk', 'si'),
|
|
|
|
# Russian
|
|
('Фрунзе улица', 'kg', 'ru'),
|
|
]
|
|
|
|
regional_test_cases = [
|
|
# Spain
|
|
('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
|
|
('Avinguda Diagonal', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
|
|
('Avinguda de Filipines - Avenida de Filipinas', 'es', 'qs_a1r', 'Cataluña/Catalunya', AMBIGUOUS_LANGUAGE),
|
|
('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'),
|
|
('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'),
|
|
('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'),
|
|
('Txurruka', 'es', 'qs_a1r', 'País Vasco/Euskadi', UNKNOWN_LANGUAGE),
|
|
|
|
# Belgium
|
|
('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'),
|
|
('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'),
|
|
|
|
# France / Occitan
|
|
('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),
|
|
|
|
]
|
|
|
|
|
|
class TestNormalization(unittest.TestCase):
|
|
@classmethod
|
|
def setUpClass(cls):
|
|
init_languages()
|
|
street_types_gazetteer.configure()
|
|
|
|
def test_countries(self):
|
|
for s, country, expected in country_test_cases:
|
|
languages = get_country_languages(country)
|
|
self.assertTrue(bool(languages))
|
|
|
|
lang = disambiguate_language(s, languages.items())
|
|
self.assertEqual(lang, expected, '{} != {} for {}, langs={}'.format(lang, expected, s, languages.items()))
|
|
|
|
def test_regional(self):
|
|
for s, country, k, v, expected in regional_test_cases:
|
|
languages = get_country_languages(country)
|
|
self.assertTrue(bool(languages))
|
|
regional = get_regional_languages(country, k, v)
|
|
self.assertTrue(bool(regional))
|
|
regional.update(languages)
|
|
|
|
lang = disambiguate_language(s, regional.items())
|
|
|
|
self.assertEqual(lang, expected, '{} != {} for {}, langs={}'.format(lang, expected, s, regional.items()))
|
|
|
|
if __name__ == '__main__':
|
|
unittest.main()
|