[languages] Adding script-only disambiguation
This commit is contained in:
@@ -240,10 +240,20 @@ class AddressComponents(object):
|
|||||||
language = candidate_languages[0]['lang']
|
language = candidate_languages[0]['lang']
|
||||||
else:
|
else:
|
||||||
street = components.get(AddressFormatter.ROAD, None)
|
street = components.get(AddressFormatter.ROAD, None)
|
||||||
|
|
||||||
|
lang_tuples = [(l['lang'], l['default']) for l in candidate_languages]
|
||||||
if street is not None:
|
if street is not None:
|
||||||
language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
|
language = disambiguate_language(street, lang_tuples)
|
||||||
else:
|
else:
|
||||||
language = UNKNOWN_LANGUAGE
|
if has_non_latin_script(lang_tuples):
|
||||||
|
for component, value in six.iteritems(components):
|
||||||
|
language = disambiguate_language_script(value, lang_tuples)
|
||||||
|
if language is not UNKNOWN_LANGUAGE:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
language = UNKNOWN_LANGUAGE
|
||||||
|
else:
|
||||||
|
language = UNKNOWN_LANGUAGE
|
||||||
|
|
||||||
return language
|
return language
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import six
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from collections import defaultdict, OrderedDict
|
from collections import defaultdict, OrderedDict
|
||||||
@@ -33,8 +34,14 @@ WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
char_scripts = get_chars_by_script()
|
char_scripts = get_chars_by_script()
|
||||||
script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
|
script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())}
|
||||||
|
# Inverse of ``script_languages``: maps each language code to the set of
# scripts that list that language.
lang_scripts = {}

for script, langs in six.iteritems(script_languages):
    for lang in langs:
        lang_scripts.setdefault(lang, set()).add(script)
|
||||||
|
|
||||||
UNKNOWN_SCRIPT = 'Unknown'
|
UNKNOWN_SCRIPT = 'Unknown'
|
||||||
COMMON_SCRIPT = 'Common'
|
COMMON_SCRIPT = 'Common'
|
||||||
@@ -69,9 +76,7 @@ UNKNOWN_LANGUAGE = 'unk'
|
|||||||
AMBIGUOUS_LANGUAGE = 'xxx'
|
AMBIGUOUS_LANGUAGE = 'xxx'
|
||||||
|
|
||||||
|
|
||||||
def disambiguate_language(text, languages):
|
def disambiguate_language_script(text, languages):
|
||||||
text = safe_decode(text)
|
|
||||||
valid_languages = OrderedDict(languages)
|
|
||||||
script_langs = {}
|
script_langs = {}
|
||||||
read_len = 0
|
read_len = 0
|
||||||
while read_len < len(text):
|
while read_len < len(text):
|
||||||
@@ -85,6 +90,27 @@ def disambiguate_language(text, languages):
|
|||||||
|
|
||||||
read_len += script_len
|
read_len += script_len
|
||||||
|
|
||||||
|
return UNKNOWN_LANGUAGE
|
||||||
|
|
||||||
|
# Scripts that, when associated with a candidate language, make
# has_non_latin_script() report True even if that language also lists Latin.
LATIN_TRANSLITERATED_SCRIPTS = {'Arabic', 'Cyrillic'}


def has_non_latin_script(languages):
    """Return True if any candidate language may be written in a non-Latin script.

    :param languages: iterable of ``(lang, is_default)`` tuples; only the
        language code is consulted here.
    :return: True as soon as one language either does not list the Latin
        script among its scripts, or lists one of
        ``LATIN_TRANSLITERATED_SCRIPTS``; False otherwise (including for an
        empty iterable).

    A language with no entry in ``lang_scripts`` has an empty script set and
    therefore counts as non-Latin.
    """
    for lang, _is_default in languages:
        # Look up scripts by *language*. ``script_languages`` is keyed by
        # script name, so querying it with a language code always yielded an
        # empty set and made this function return True unconditionally.
        scripts = lang_scripts.get(lang, set())
        # NOTE(review): LATIN_SCRIPT is defined outside the visible hunk;
        # presumably the script name 'Latin' -- confirm.
        if LATIN_SCRIPT not in scripts or scripts & LATIN_TRANSLITERATED_SCRIPTS:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def disambiguate_language(text, languages, scripts_only=False):
|
||||||
|
text = safe_decode(text)
|
||||||
|
valid_languages = OrderedDict(languages)
|
||||||
|
|
||||||
|
language_script = disambiguate_language_script(text, languages)
|
||||||
|
if language_script is not UNKNOWN_LANGUAGE:
|
||||||
|
return language_script
|
||||||
|
|
||||||
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
|
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
|
||||||
|
|
||||||
tokens = normalized_tokens(text)
|
tokens = normalized_tokens(text)
|
||||||
|
|||||||
Reference in New Issue
Block a user