[languages] If a non-Latin script in a string would prohibit the found language, return ambiguous. Also adds some test cases for sanity-checking the labeling.
This commit is contained in:
@@ -124,7 +124,7 @@ street_types_gazetteer = DictionaryPhraseFilter('street_types.txt',
|
|||||||
'stopwords.txt',)
|
'stopwords.txt',)
|
||||||
|
|
||||||
char_scripts = get_chars_by_script()
|
char_scripts = get_chars_by_script()
|
||||||
script_languages = get_script_languages()
|
script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
|
||||||
|
|
||||||
UNKNOWN_SCRIPT = 'Unknown'
|
UNKNOWN_SCRIPT = 'Unknown'
|
||||||
COMMON_SCRIPT = 'Common'
|
COMMON_SCRIPT = 'Common'
|
||||||
@@ -148,15 +148,23 @@ def get_string_script(s):
|
|||||||
last_script = script
|
last_script = script
|
||||||
return (last_script, script_len, is_ascii)
|
return (last_script, script_len, is_ascii)
|
||||||
|
|
||||||
|
LATIN_SCRIPT = 'Latin'
|
||||||
UNKNOWN_LANGUAGE = 'unk'
|
UNKNOWN_LANGUAGE = 'unk'
|
||||||
AMBIGUOUS_LANGUAGE = 'xxx'
|
AMBIGUOUS_LANGUAGE = 'xxx'
|
||||||
|
|
||||||
|
|
||||||
def disambiguate_language(text, languages):
|
def disambiguate_language(text, languages):
|
||||||
num_defaults = sum((1 for lang, default in languages if default))
|
|
||||||
valid_languages = OrderedDict(languages)
|
valid_languages = OrderedDict(languages)
|
||||||
tokens = tokenize(safe_decode(text).replace(u'-', u' ').lower())
|
script_langs = {}
|
||||||
|
read_len = 0
|
||||||
|
while read_len < len(text):
|
||||||
|
script, script_len, is_ascii = get_string_script(text[read_len:])
|
||||||
|
if script != LATIN_SCRIPT:
|
||||||
|
script_langs[script] = set([l for l, d in languages if l in script_languages[script]])
|
||||||
|
read_len += script_len
|
||||||
|
|
||||||
|
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
|
||||||
|
tokens = [(c, t.rstrip('.')) for c, t in tokenize(safe_decode(text).replace(u'-', u' ').lower())]
|
||||||
|
|
||||||
current_lang = None
|
current_lang = None
|
||||||
possible_lang = None
|
possible_lang = None
|
||||||
@@ -172,7 +180,7 @@ def disambiguate_language(text, languages):
|
|||||||
for lang, canonical, stopword in data:
|
for lang, canonical, stopword in data:
|
||||||
canonical = int(canonical)
|
canonical = int(canonical)
|
||||||
stopword = int(stopword)
|
stopword = int(stopword)
|
||||||
if lang not in valid_languages or stopword:
|
if lang not in valid_languages or (stopword and len(potentials) > 1):
|
||||||
continue
|
continue
|
||||||
is_default = valid_languages[lang]
|
is_default = valid_languages[lang]
|
||||||
|
|
||||||
@@ -180,7 +188,7 @@ def disambiguate_language(text, languages):
|
|||||||
valid.append(lang)
|
valid.append(lang)
|
||||||
elif is_default and num_defaults > 1 and current_lang != lang:
|
elif is_default and num_defaults > 1 and current_lang != lang:
|
||||||
return AMBIGUOUS_LANGUAGE
|
return AMBIGUOUS_LANGUAGE
|
||||||
elif not seen_languages and len(t[0][1]) > 1:
|
elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1:
|
||||||
possible_lang = lang if possible_lang is None or possible_lang == lang else None
|
possible_lang = lang if possible_lang is None or possible_lang == lang else None
|
||||||
|
|
||||||
if seen_languages and valid and not any((l in seen_languages for l in valid)):
|
if seen_languages and valid and not any((l in seen_languages for l in valid)):
|
||||||
@@ -189,16 +197,20 @@ def disambiguate_language(text, languages):
|
|||||||
if len(valid) == 1:
|
if len(valid) == 1:
|
||||||
current_lang = valid[0]
|
current_lang = valid[0]
|
||||||
else:
|
else:
|
||||||
valid = [l for l in valid if valid_languages.get(l)]
|
valid_default = [l for l in valid if valid_languages.get(l)]
|
||||||
if len(valid) == 1 and current_lang is not None and valid[0] != current_lang:
|
if len(valid_default) == 1 and current_lang is not None and valid_default[0] != current_lang:
|
||||||
return AMBIGUOUS_LANGUAGE
|
return AMBIGUOUS_LANGUAGE
|
||||||
elif len(valid) == 1:
|
elif len(valid_default) == 1:
|
||||||
current_lang = valid[0]
|
current_lang = valid_default[0]
|
||||||
|
|
||||||
seen_languages.update(valid)
|
|
||||||
|
|
||||||
if current_lang is not None:
|
if current_lang is not None:
|
||||||
return current_lang
|
if not any((current_lang not in langs for script, langs in script_langs.iteritems())):
|
||||||
|
return current_lang
|
||||||
|
else:
|
||||||
|
return AMBIGUOUS_LANGUAGE
|
||||||
elif possible_lang is not None:
|
elif possible_lang is not None:
|
||||||
return possible_lang
|
if not any((possible_lang not in langs for script, langs in script_langs.iteritems())):
|
||||||
|
return possible_lang
|
||||||
|
else:
|
||||||
|
return AMBIGUOUS_LANGUAGE
|
||||||
return UNKNOWN_LANGUAGE
|
return UNKNOWN_LANGUAGE
|
||||||
|
|||||||
0
scripts/geodata/tests/__init__.py
Normal file
0
scripts/geodata/tests/__init__.py
Normal file
108
scripts/geodata/tests/test_disambiguation.py
Normal file
108
scripts/geodata/tests/test_disambiguation.py
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
# Directory containing this test module, resolved to an absolute path.
this_dir = os.path.realpath(os.path.dirname(__file__))
# Put the directory two levels above this file on sys.path so the `geodata`
# imports below resolve. Anchoring on this_dir (rather than a bare
# os.pardir/os.pardir, which is relative to the current working directory)
# lets the tests run from any directory.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||||
|
|
||||||
|
from geodata.i18n.languages import init_languages, get_country_languages, get_regional_languages
|
||||||
|
from geodata.language_id.disambiguation import disambiguate_language, street_types_gazetteer, UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE
|
||||||
|
|
||||||
|
|
||||||
|
# Country-level fixtures. Each entry is (input string, country code, expected
# label), where the label is a language code or one of the
# UNKNOWN_LANGUAGE / AMBIGUOUS_LANGUAGE sentinels.
country_test_cases = [
    # United States: mostly English, with some Spanish and French street names
    ('Avenue P', 'us', 'en'),
    ('Avenue du', 'us', 'fr'),
    ('Avenida de la Plata', 'us', 'es'),
    ('Pl', 'us', UNKNOWN_LANGUAGE),
    ('No 2 School House', 'us', UNKNOWN_LANGUAGE),
    ('E Thetford Rd', 'us', 'en'),
    ('El Camino', 'us', 'es'),
    ('Rue Louis Phillippe', 'us', 'fr'),
    ('Calle Street', 'us', AMBIGUOUS_LANGUAGE),

    # "Avenue" followed by a stopword
    ('Avenue du Bourget-du-Lac', 'je', 'fr'),

    # UAE: English is a non-default language; input uses an abbreviation
    ('128 A St', 'ae', 'en'),
    ('128 A St.', 'ae', 'en'),

    # Mixed English / Arabic street address
    ('Omar Street ﺵﺍﺮﻋ ﻊﻣﺭ', 'iq', AMBIGUOUS_LANGUAGE),

    # Latin text combined with an unrelated script
    ('Bayard Street - 擺也街', 'us', AMBIGUOUS_LANGUAGE),

    # Belgium (Brussels-style bilingual address, then a Dutch name)
    ('Avenue Paul Héger - Paul Hégerlaan', 'be', AMBIGUOUS_LANGUAGE),
    ('Smaragdstraat', 'be', 'nl'),

    # India
    ('Kidwai nagar', 'in', 'hi'),
    ('Mavoor Rd.', 'in', 'en'),

    # Sri Lanka
    ('Sri Sadathissa Mawatha', 'lk', 'si'),

    # Russian text, Kyrgyzstan country code
    ('Фрунзе улица', 'kg', 'ru'),
]
|
||||||
|
|
||||||
|
# Region-level fixtures. Each entry is
# (input string, country code, region key, region value, expected label);
# the key and value are forwarded unchanged to get_regional_languages().
regional_test_cases = [
    # Spain
    ('Carrer de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'ca'),
    ('Calle de la Morella', 'es', 'qs_a1r', 'Cataluña/Catalunya', 'es'),
    ('autobidea', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'eu'),
    ('Calle', 'es', 'qs_a1r', 'Comunidad Foral de Navarra', 'es'),
    ('Txurruka', 'es', 'qs_a1r', 'País Vasco/Euskadi', UNKNOWN_LANGUAGE),

    # Belgium
    ('Lutticherstrasse', 'be', 'qs_a1', 'Liège', 'de'),
    ('Chaussée de Charleroi', 'be', 'qs_a1', 'Namur', 'fr'),

    # France / Occitan
    ('Carriera de Brasinvert', 'fr', 'qs_a1r', 'Rhône-Alpes', 'oc'),
]
|
||||||
|
|
||||||
|
|
||||||
|
class TestNormalization(unittest.TestCase):
    """Sanity checks for the labels returned by disambiguate_language()."""

    @classmethod
    def setUpClass(cls):
        # One-time setup shared by every test in this class.
        init_languages()
        street_types_gazetteer.configure()

    def test_countries(self):
        # Country-level cases: candidates come from the country code alone.
        for text, country_code, want in country_test_cases:
            country_langs = get_country_languages(country_code)
            self.assertTrue(bool(country_langs))

            got = disambiguate_language(text, country_langs.items())
            failure = '{} != {} for {}, langs={}'.format(got, want, text, country_langs.items())
            self.assertEqual(got, want, failure)

    def test_regional(self):
        # Regional cases: the regional languages are looked up first, then the
        # country-level languages are merged into that same mapping.
        for text, country_code, region_key, region_value, want in regional_test_cases:
            country_langs = get_country_languages(country_code)
            self.assertTrue(bool(country_langs))

            region_langs = get_regional_languages(country_code, region_key, region_value)
            self.assertTrue(bool(region_langs))
            # In-place merge; on key collisions the country-level value wins.
            region_langs.update(country_langs)

            got = disambiguate_language(text, region_langs.items())

            failure = '{} != {} for {}, langs={}'.format(got, want, text, region_langs.items())
            self.assertEqual(got, want, failure)
|
||||||
|
|
||||||
|
# Allow running this file directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||||
Reference in New Issue
Block a user