#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import unicode_literals import os import sys import unittest this_dir = os.path.realpath(os.path.dirname(__file__)) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) from geodata.i18n.languages import init_languages, get_country_languages, get_regional_languages from geodata.language_id.disambiguation import disambiguate_language, street_types_gazetteer, UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE country_test_cases = [ # String, country, expected language # US has some Spanish and French street names ('Avenue P', 'us', 'en'), ('Avenue du', 'us', 'fr'), ('Avenida de la Plata', 'us', 'es'), ('Pl', 'us', UNKNOWN_LANGUAGE), ('No 2 School House', 'us', UNKNOWN_LANGUAGE), ('E Thetford Rd', 'us', 'en'), ('El Camino', 'us', 'es'), ('Rue Louis Phillippe', 'us', 'fr'), ('Calle Street', 'us', AMBIGUOUS_LANGUAGE), # Avenue + stopword ('Avenue du Bourget-du-Lac', 'je', 'fr'), # UAE, English is non-default, has abbreviation ('128 A St', 'ae', 'en'), ('128 A St.', 'ae', 'en'), # English / Arabic street address ('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE), # Random script ('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE), # Brussels address ('Avenue Paul Héger - Paul Hégerlaan', 'be', AMBIGUOUS_LANGUAGE), ('Smaragdstraat', 'be', 'nl'), # India ('Kidwai nagar', 'in', 'hi'), ('Mavoor Rd.', 'in', 'en'), # Sri Lanka ('Sri Sadathissa Mawatha', 'lk', 'si'), # Russian ('Фрунзе улица', 'kg', 'ru'), ] regional_test_cases = [ # Spain ('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'), ('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'), ('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'), ('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'), ('Txurruka', 'es', 'qs_a1r', 'País Vasco/Euskadi', UNKNOWN_LANGUAGE), # Belgium ('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'), ('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'), # France / Occitan ('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'), ] class TestNormalization(unittest.TestCase): @classmethod def setUpClass(cls): init_languages() street_types_gazetteer.configure() def test_countries(self): for s, country, expected in country_test_cases: languages = get_country_languages(country) self.assertTrue(bool(languages)) lang = disambiguate_language(s, languages.items()) self.assertEqual(lang, expected, '{} != {} for {}, langs={}'.format(lang, expected, s, languages.items())) def test_regional(self): for s, country, k, v, expected in regional_test_cases: languages = get_country_languages(country) self.assertTrue(bool(languages)) regional = get_regional_languages(country, k, v) self.assertTrue(bool(regional)) regional.update(languages) lang = disambiguate_language(s, regional.items()) self.assertEqual(lang, expected, '{} != {} for {}, langs={}'.format(lang, expected, s, regional.items())) if __name__ == '__main__': unittest.main()