[languages] Adding script-only disambiguation

This commit is contained in:
Al
2016-05-23 11:17:59 -04:00
parent e6157915af
commit bd341417a3
2 changed files with 42 additions and 6 deletions

View File

@@ -240,10 +240,20 @@ class AddressComponents(object):
language = candidate_languages[0]['lang']
else:
street = components.get(AddressFormatter.ROAD, None)
lang_tuples = [(l['lang'], l['default']) for l in candidate_languages]
if street is not None:
language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
language = disambiguate_language(street, lang_tuples)
else:
language = UNKNOWN_LANGUAGE
if has_non_latin_script(lang_tuples):
for component, value in six.iteritems(components):
language = disambiguate_language_script(value, lang_tuples)
if language is not UNKNOWN_LANGUAGE:
break
else:
language = UNKNOWN_LANGUAGE
else:
language = UNKNOWN_LANGUAGE
return language

View File

@@ -1,4 +1,5 @@
import os
import six
import sys
from collections import defaultdict, OrderedDict
@@ -33,8 +34,14 @@ WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
}
# NOTE(review): presumably a per-character script lookup table -- confirm
# against get_chars_by_script().
char_scripts = get_chars_by_script()

# script name -> set of languages written in that script.
# six.iteritems keeps this working on both Python 2 and Python 3; the older
# direct ``.iteritems()`` call was Python-2-only and has been dropped.
script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())}

# Inverted index: language -> set of scripts it can be written in.
lang_scripts = defaultdict(set)
for script, langs in six.iteritems(script_languages):
    for lang in langs:
        lang_scripts[lang].add(script)
# Convert to a plain dict so that looking up an unknown language later does
# not silently insert an empty entry.
lang_scripts = dict(lang_scripts)

UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
@@ -69,9 +76,7 @@ UNKNOWN_LANGUAGE = 'unk'
AMBIGUOUS_LANGUAGE = 'xxx'
def disambiguate_language(text, languages):
text = safe_decode(text)
valid_languages = OrderedDict(languages)
def disambiguate_language_script(text, languages):
script_langs = {}
read_len = 0
while read_len < len(text):
@@ -85,6 +90,27 @@ def disambiguate_language(text, languages):
read_len += script_len
return UNKNOWN_LANGUAGE
# NOTE(review): presumably scripts whose languages are also commonly written
# in Latin transliteration -- confirm the intended membership.
LATIN_TRANSLITERATED_SCRIPTS = {'Arabic', 'Cyrillic'}


def has_non_latin_script(languages):
    """Return True if any candidate language uses a non-Latin script.

    A language qualifies when its known scripts do not include
    ``LATIN_SCRIPT``, or when they include any script listed in
    ``LATIN_TRANSLITERATED_SCRIPTS``.

    :param languages: iterable of ``(lang, is_default)`` pairs; only the
        language code is consulted.
    :return: True if at least one language qualifies, otherwise False
        (including for an empty iterable).
    """
    for lang, _is_default in languages:
        # Look the language up in the language -> scripts index. The
        # script -> languages mapping is keyed by script name, so querying it
        # with a language code would always come back empty.
        scripts = lang_scripts.get(lang, set())
        if LATIN_SCRIPT not in scripts or scripts & LATIN_TRANSLITERATED_SCRIPTS:
            return True
    return False
def disambiguate_language(text, languages, scripts_only=False):
text = safe_decode(text)
valid_languages = OrderedDict(languages)
language_script = disambiguate_language_script(text, languages)
if language_script is not UNKNOWN_LANGUAGE:
return language_script
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
tokens = normalized_tokens(text)