diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py
index 4997bdcd..d0438d8f 100644
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -192,8 +192,8 @@ script_languages = {}
 
 def init_disambiguation():
     global char_scripts, script_languages
-    char_scripts.extend(get_chars_by_script())
-    script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})
+    char_scripts = get_chars_by_script()
+    script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
 
 UNKNOWN_SCRIPT = 'Unknown'
 COMMON_SCRIPT = 'Common'
@@ -229,6 +229,7 @@ AMBIGUOUS_LANGUAGE = 'xxx'
 
 
 def disambiguate_language(text, languages):
+    text = safe_decode(text)
     valid_languages = OrderedDict(languages)
     script_langs = {}
     read_len = 0
@@ -245,7 +246,7 @@ def disambiguate_language(text, languages):
 
     num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
 
-    tokens = normalized_tokens((safe_decode(text)))
+    tokens = normalized_tokens(text)
 
     current_lang = None
     possible_lang = None