[languages] Non-default languages can still be labeled from abbreviations longer than one character if there is no evidence of other languages in the string. Also adds a Python version of get_string_script, ported from the C library.

2015-08-23 02:24:32 -04:00
parent a419dad630
commit 122a81b610
4 changed files with 48 additions and 14 deletions
--- a/scripts/geodata/i18n/unicode_properties.py
+++ b/scripts/geodata/i18n/unicode_properties.py
@@ -332,11 +332,15 @@ def extract_language_scripts(xml):
    return language_scripts


-def get_script_languages(script_codes):
+def get_script_languages():
    # For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
    # to identify the language. We keep track of those single language scripts to inform
    # the language classifier

+    chars = get_chars_by_script()
+    all_scripts = build_master_scripts_list(chars)
+    script_codes = get_script_codes(all_scripts)
+
    cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
    cldr_xml = etree.parse(cldr_supplemental_data)
    language_scripts = extract_language_scripts(cldr_xml)
@@ -364,6 +368,9 @@ def get_script_languages(script_codes):
        langs = script_code_languages.get(script_code, [])
        script_languages[script_name].extend(langs)

+    for name in all_scripts.iterkeys():
+        script_languages.setdefault(name, [])
+
    return script_languages


@@ -383,11 +390,7 @@ def main(out_dir):
    if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
        download_cldr()

-    chars = get_chars_by_script()
-    all_scripts = build_master_scripts_list(chars)
-    script_codes = get_script_codes(all_scripts)
-
-    script_languages = get_script_languages(script_codes)
+    script_languages = get_script_languages()

    max_langs = 0

@@ -396,9 +399,6 @@ def main(out_dir):
        if num_langs > max_langs:
            max_langs = num_langs

-    for name in all_scripts.iterkeys():
-        script_languages.setdefault(name, [])
-
    # Generate C header and constants

    script_enum = u'''