diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index adb9e548..029484a6 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -240,10 +240,21 @@ class AddressComponents(object):
             language = candidate_languages[0]['lang']
         else:
             street = components.get(AddressFormatter.ROAD, None)
+
+            lang_tuples = [(l['lang'], l['default']) for l in candidate_languages]
             if street is not None:
-                language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
+                language = disambiguate_language(street, lang_tuples)
             else:
-                language = UNKNOWN_LANGUAGE
+                # No street name: try script-based detection on each component value
+                if has_non_latin_script(lang_tuples):
+                    for value in six.itervalues(components):
+                        language = disambiguate_language_script(value, lang_tuples)
+                        if language != UNKNOWN_LANGUAGE:
+                            break
+                    else:
+                        language = UNKNOWN_LANGUAGE
+                else:
+                    language = UNKNOWN_LANGUAGE
 
         return language
 
diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py
index 3fec1e93..6980bddd 100644
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -1,4 +1,5 @@
 import os
+import six
 import sys
 
 from collections import defaultdict, OrderedDict
@@ -33,8 +34,14 @@ WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
 }
 
 char_scripts = get_chars_by_script()
-script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
+script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())}
 
+lang_scripts = defaultdict(set)
+for script, langs in six.iteritems(script_languages):
+    for lang in langs:
+        lang_scripts[lang].add(script)
+
+lang_scripts = dict(lang_scripts)
 
 UNKNOWN_SCRIPT = 'Unknown'
 COMMON_SCRIPT = 'Common'
@@ -69,9 +76,7 @@ UNKNOWN_LANGUAGE = 'unk'
 AMBIGUOUS_LANGUAGE = 'xxx'
 
 
-def disambiguate_language(text, languages):
-    text = safe_decode(text)
-    valid_languages = OrderedDict(languages)
+def disambiguate_language_script(text, languages):
     script_langs = {}
     read_len = 0
     while read_len < len(text):
@@ -85,6 +90,35 @@ def disambiguate_language(text, languages):
 
         read_len += script_len
 
+    return UNKNOWN_LANGUAGE
+
+# Scripts that mark a language as non-Latin even when the language also lists
+# the Latin script (presumably because they are often romanized -- confirm).
+LATIN_TRANSLITERATED_SCRIPTS = {'Arabic', 'Cyrillic'}
+
+
+def has_non_latin_script(languages):
+    """Return True if any candidate language can use a non-Latin script.
+
+    languages: iterable of (lang, is_default) tuples.
+    """
+    for lang, is_default in languages:
+        # lang_scripts is keyed by language; script_languages is keyed by script
+        scripts = lang_scripts.get(lang, set())
+        if LATIN_SCRIPT not in scripts or scripts & LATIN_TRANSLITERATED_SCRIPTS:
+            return True
+    return False
+
+
+def disambiguate_language(text, languages, scripts_only=False):
+    text = safe_decode(text)
+    valid_languages = OrderedDict(languages)
+
+    # Return early when the script alone identifies the language
+    language_script = disambiguate_language_script(text, languages)
+    if language_script != UNKNOWN_LANGUAGE:
+        return language_script
+
     num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
     tokens = normalized_tokens(text)
 