[languages] Non-default languages can still be labeled from abbreviations longer than one character if there's no evidence of other languages in the string. Also adds a Python version of get_string_script from the C library.

This commit is contained in:
Al
2015-08-23 02:24:32 -04:00
parent a419dad630
commit 122a81b610
4 changed files with 48 additions and 14 deletions

View File

@@ -332,11 +332,15 @@ def extract_language_scripts(xml):
return language_scripts
def get_script_languages(script_codes):
def get_script_languages():
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
# to identify the language. We keep track of those single-language scripts to inform
# the language classifier.
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
cldr_xml = etree.parse(cldr_supplemental_data)
language_scripts = extract_language_scripts(cldr_xml)
@@ -364,6 +368,9 @@ def get_script_languages(script_codes):
langs = script_code_languages.get(script_code, [])
script_languages[script_name].extend(langs)
for name in all_scripts.iterkeys():
script_languages.setdefault(name, [])
return script_languages
@@ -383,11 +390,7 @@ def main(out_dir):
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
download_cldr()
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
script_languages = get_script_languages(script_codes)
script_languages = get_script_languages()
max_langs = 0
@@ -396,9 +399,6 @@ def main(out_dir):
if num_langs > max_langs:
max_langs = num_langs
for name in all_scripts.iterkeys():
script_languages.setdefault(name, [])
# Generate C header and constants
script_enum = u'''