From 8b94a018e638c982376a8f27f2d7303f67af1ed1 Mon Sep 17 00:00:00 2001
From: Al
Date: Tue, 19 Jan 2016 03:22:03 -0500
Subject: [PATCH] [languages] encoding in language disambiguation

---
 scripts/geodata/language_id/disambiguation.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py
index 4997bdcd..d0438d8f 100644
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -192,8 +192,8 @@ script_languages = {}
 
 def init_disambiguation():
     global char_scripts, script_languages
-    char_scripts.extend(get_chars_by_script())
-    script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})
+    char_scripts = get_chars_by_script()
+    script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
 
 UNKNOWN_SCRIPT = 'Unknown'
 COMMON_SCRIPT = 'Common'
@@ -229,6 +229,7 @@ AMBIGUOUS_LANGUAGE = 'xxx'
 
 
 def disambiguate_language(text, languages):
+    text = safe_decode(text)
     valid_languages = OrderedDict(languages)
     script_langs = {}
     read_len = 0
@@ -245,7 +246,7 @@ def disambiguate_language(text, languages):
 
     num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
 
-    tokens = normalized_tokens((safe_decode(text)))
+    tokens = normalized_tokens(text)
 
     current_lang = None
     possible_lang = None