diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index 3ade348a..085ad67b 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -124,7 +124,7 @@ street_types_gazetteer = DictionaryPhraseFilter('street_types.txt', 'stopwords.txt',) char_scripts = get_chars_by_script() -script_languages = get_script_languages() +script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()} UNKNOWN_SCRIPT = 'Unknown' COMMON_SCRIPT = 'Common' @@ -148,15 +148,23 @@ def get_string_script(s): last_script = script return (last_script, script_len, is_ascii) - +LATIN_SCRIPT = 'Latin' UNKNOWN_LANGUAGE = 'unk' AMBIGUOUS_LANGUAGE = 'xxx' def disambiguate_language(text, languages): - num_defaults = sum((1 for lang, default in languages if default)) valid_languages = OrderedDict(languages) - tokens = tokenize(safe_decode(text).replace(u'-', u' ').lower()) + script_langs = {} + read_len = 0 + while read_len < len(text): + script, script_len, is_ascii = get_string_script(text[read_len:]) + if script != LATIN_SCRIPT: + script_langs[script] = set([l for l, d in languages if l in script_languages[script]]) + read_len += script_len + + num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default)) + tokens = [(c, t.rstrip('.')) for c, t in tokenize(safe_decode(text).replace(u'-', u' ').lower())] current_lang = None possible_lang = None @@ -172,7 +180,7 @@ def disambiguate_language(text, languages): for lang, canonical, stopword in data: canonical = int(canonical) stopword = int(stopword) - if lang not in valid_languages or stopword: + if lang not in valid_languages or (stopword and len(potentials) > 1): continue is_default = valid_languages[lang] @@ -180,7 +188,7 @@ def disambiguate_language(text, languages): valid.append(lang) elif is_default and num_defaults > 1 and current_lang != lang: return AMBIGUOUS_LANGUAGE - elif not seen_languages and len(t[0][1]) > 1: + elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1: possible_lang = lang if possible_lang is None or possible_lang == lang else None if seen_languages and valid and not any((l in seen_languages for l in valid)): @@ -189,16 +197,20 @@ def disambiguate_language(text, languages): if len(valid) == 1: current_lang = valid[0] else: - valid = [l for l in valid if valid_languages.get(l)] - if len(valid) == 1 and current_lang is not None and valid[0] != current_lang: + valid_default = [l for l in valid if valid_languages.get(l)] + if len(valid_default) == 1 and current_lang is not None and valid_default[0] != current_lang: return AMBIGUOUS_LANGUAGE - elif len(valid) == 1: - current_lang = valid[0] - - seen_languages.update(valid) + elif len(valid_default) == 1: + current_lang = valid_default[0] if current_lang is not None: - return current_lang + if not any((current_lang not in langs for script, langs in script_langs.iteritems())): + return current_lang + else: + return AMBIGUOUS_LANGUAGE elif possible_lang is not None: - return possible_lang + if not any((possible_lang not in langs for script, langs in script_langs.iteritems())): + return possible_lang + else: + return AMBIGUOUS_LANGUAGE return UNKNOWN_LANGUAGE diff --git a/scripts/geodata/tests/__init__.py b/scripts/geodata/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/geodata/tests/test_disambiguation.py b/scripts/geodata/tests/test_disambiguation.py new file mode 100644 index 00000000..587106f2 --- /dev/null +++ b/scripts/geodata/tests/test_disambiguation.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +import os +import sys +import unittest + +this_dir = os.path.realpath(os.path.dirname(__file__)) +sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) + +from geodata.i18n.languages import init_languages, get_country_languages, get_regional_languages +from geodata.language_id.disambiguation import disambiguate_language, street_types_gazetteer, UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE + + +country_test_cases = [ + # String, country, expected language + + # US has some Spanish and French street names + ('Avenue P', 'us', 'en'), + ('Avenue du', 'us', 'fr'), + ('Avenida de la Plata', 'us', 'es'), + ('Pl', 'us', UNKNOWN_LANGUAGE), + ('No 2 School House', 'us', UNKNOWN_LANGUAGE), + ('E Thetford Rd', 'us', 'en'), + ('El Camino', 'us', 'es'), + ('Rue Louis Phillippe', 'us', 'fr'), + ('Calle Street', 'us', AMBIGUOUS_LANGUAGE), + + # Avenue + stopword + ('Avenue du Bourget-du-Lac', 'je', 'fr'), + + # UAE, English is non-default, has abbreviation + ('128 A St', 'ae', 'en'), + ('128 A St.', 'ae', 'en'), + + # English / Arabic street address + ('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE), + + + # Random script + ('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE), + + + # Brussels address + ('Avenue Paul Héger - Paul Hégerlaan', 'be', AMBIGUOUS_LANGUAGE), + ('Smaragdstraat', 'be', 'nl'), + + + # India + ('Kidwai nagar', 'in', 'hi'), + ('Mavoor Rd.', 'in', 'en'), + + # Sri Lanka + ('Sri Sadathissa Mawatha', 'lk', 'si'), + + # Russian + ('Фрунзе улица', 'kg', 'ru'), +] + +regional_test_cases = [ + # Spain + ('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'), + ('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'), + ('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'), + ('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'), + ('Txurruka', 'es', 'qs_a1r', 'País Vasco/Euskadi', UNKNOWN_LANGUAGE), + + # Belgium + ('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'), + ('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'), + + + # France / Occitan + ('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'), + +] + + +class TestNormalization(unittest.TestCase): + @classmethod + def setUpClass(cls): + init_languages() + street_types_gazetteer.configure() + + def test_countries(self): + for s, country, expected in country_test_cases: + languages = get_country_languages(country) + self.assertTrue(bool(languages)) + + lang = disambiguate_language(s, languages.items()) + self.assertEqual(lang, expected, '{} != {} for {}, langs={}'.format(lang, expected, s, languages.items())) + + def test_regional(self): + for s, country, k, v, expected in regional_test_cases: + languages = get_country_languages(country) + self.assertTrue(bool(languages)) + regional = get_regional_languages(country, k, v) + self.assertTrue(bool(regional)) + regional.update(languages) + + lang = disambiguate_language(s, regional.items()) + + self.assertEqual(lang, expected, '{} != {} for {}, langs={}'.format(lang, expected, s, regional.items())) + +if __name__ == '__main__': + unittest.main()