From 03febc7e209420e9e7cb2829ff016b2d03029204 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 13 Aug 2015 18:25:55 -0400 Subject: [PATCH] [scripts] Better script code aliasing --- scripts/geodata/i18n/unicode_properties.py | 32 ++++++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/scripts/geodata/i18n/unicode_properties.py b/scripts/geodata/i18n/unicode_properties.py index afdefed5..796a2983 100644 --- a/scripts/geodata/i18n/unicode_properties.py +++ b/scripts/geodata/i18n/unicode_properties.py @@ -248,6 +248,7 @@ def get_word_break_properties(): return dict(props) + def build_master_scripts_list(chars): all_scripts = OrderedDict.fromkeys(filter(bool, chars)) @@ -260,6 +261,12 @@ def build_master_scripts_list(chars): return all_scripts +SCRIPT_ALIASES_SUPPLEMENTAL = { + 'Hant': 'Han', + 'Hans': 'Han' +} + + def get_script_codes(all_scripts): temp_dir = tempfile.gettempdir() script_codes_filename = os.path.join(temp_dir, ISO_15924_URL.rsplit('/')[-1]) @@ -289,9 +296,24 @@ def get_script_codes(all_scripts): script_codes[code] = normalized_name seen_scripts.add(normalized_name) + value_aliases = get_property_value_aliases() + script_aliases = value_aliases['sc'] + + for code, script in script_aliases.iteritems(): + if code not in script_codes and script in all_scripts: + script_codes[code] = script + + script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL) + return script_codes +SCRIPT_CODE_ALIASES = { + 'Jpan': ['Hani', 'Hira', 'Kana'], + 'Kore': ['Hang', 'Han'] +} + + def extract_language_scripts(xml): language_scripts = defaultdict(list) @@ -301,7 +323,11 @@ def extract_language_scripts(xml): if not scripts: continue for script in scripts.split(): - language_scripts[language_code].append(script) + script_aliases = SCRIPT_CODE_ALIASES.get(script) + if not script_aliases: + language_scripts[language_code].append(script) + else: + language_scripts[language_code].extend(script_aliases) return language_scripts @@ -332,11 +358,11 @@ def get_script_languages(script_codes): for script in scripts: script_code_languages[script].append(language) - script_languages = {} + script_languages = defaultdict(list) for script_code, script_name in script_codes.iteritems(): langs = script_code_languages.get(script_code, []) - script_languages[script_name] = langs + script_languages[script_name].extend(langs) return script_languages