[languages] encoding in language disambiguation

This commit is contained in:
Al
2016-01-19 03:22:03 -05:00
parent 3262d2ccd3
commit 8b94a018e6

View File

@@ -192,8 +192,8 @@ script_languages = {}
def init_disambiguation():
global char_scripts, script_languages
char_scripts.extend(get_chars_by_script())
script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})
char_scripts = get_chars_by_script()
script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
@@ -229,6 +229,7 @@ AMBIGUOUS_LANGUAGE = 'xxx'
def disambiguate_language(text, languages):
text = safe_decode(text)
valid_languages = OrderedDict(languages)
script_langs = {}
read_len = 0
@@ -245,7 +246,7 @@ def disambiguate_language(text, languages):
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
tokens = normalized_tokens((safe_decode(text)))
tokens = normalized_tokens(text)
current_lang = None
possible_lang = None