[languages] fix encoding handling in language disambiguation: decode the input text once at the start of disambiguate_language
This commit is contained in:
@@ -192,8 +192,8 @@ script_languages = {}
|
|||||||
|
|
||||||
def init_disambiguation():
|
def init_disambiguation():
|
||||||
global char_scripts, script_languages
|
global char_scripts, script_languages
|
||||||
char_scripts.extend(get_chars_by_script())
|
char_scripts = get_chars_by_script()
|
||||||
script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})
|
script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
|
||||||
|
|
||||||
UNKNOWN_SCRIPT = 'Unknown'
|
UNKNOWN_SCRIPT = 'Unknown'
|
||||||
COMMON_SCRIPT = 'Common'
|
COMMON_SCRIPT = 'Common'
|
||||||
@@ -229,6 +229,7 @@ AMBIGUOUS_LANGUAGE = 'xxx'
|
|||||||
|
|
||||||
|
|
||||||
def disambiguate_language(text, languages):
|
def disambiguate_language(text, languages):
|
||||||
|
text = safe_decode(text)
|
||||||
valid_languages = OrderedDict(languages)
|
valid_languages = OrderedDict(languages)
|
||||||
script_langs = {}
|
script_langs = {}
|
||||||
read_len = 0
|
read_len = 0
|
||||||
@@ -245,7 +246,7 @@ def disambiguate_language(text, languages):
|
|||||||
|
|
||||||
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
|
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
|
||||||
|
|
||||||
tokens = normalized_tokens((safe_decode(text)))
|
tokens = normalized_tokens(text)
|
||||||
|
|
||||||
current_lang = None
|
current_lang = None
|
||||||
possible_lang = None
|
possible_lang = None
|
||||||
|
|||||||
Reference in New Issue
Block a user