[languages] Fix encoding handling in language disambiguation (decode the input text once, up front)
This commit is contained in:
@@ -192,8 +192,8 @@ script_languages = {}
|
||||
|
||||
def init_disambiguation():
|
||||
global char_scripts, script_languages
|
||||
char_scripts.extend(get_chars_by_script())
|
||||
script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})
|
||||
char_scripts = get_chars_by_script()
|
||||
script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
|
||||
|
||||
UNKNOWN_SCRIPT = 'Unknown'
|
||||
COMMON_SCRIPT = 'Common'
|
||||
@@ -229,6 +229,7 @@ AMBIGUOUS_LANGUAGE = 'xxx'
|
||||
|
||||
|
||||
def disambiguate_language(text, languages):
|
||||
text = safe_decode(text)
|
||||
valid_languages = OrderedDict(languages)
|
||||
script_langs = {}
|
||||
read_len = 0
|
||||
@@ -245,7 +246,7 @@ def disambiguate_language(text, languages):
|
||||
|
||||
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
|
||||
|
||||
tokens = normalized_tokens((safe_decode(text)))
|
||||
tokens = normalized_tokens(text)
|
||||
|
||||
current_lang = None
|
||||
possible_lang = None
|
||||
|
||||
Reference in New Issue
Block a user