[i18n] adding CLDR languages script to this repo

2015-03-18 08:01:36 -04:00
parent d2ceb5f418
commit 88554c1ef7
1 changed files with 136 additions and 0 deletions
--- a/scripts/geodata/i18n/cldr_languages.py
+++ b/scripts/geodata/i18n/cldr_languages.py
@@ -0,0 +1,136 @@
+import argparse
+import csv
+import os
+import requests
+
+from collections import Counter
+
+from cStringIO import StringIO
+from lxml import etree
+
+# Directory containing this script.  The default output directory is anchored
+# here so that it does not depend on the caller's current working directory.
+this_dir = os.path.realpath(os.path.dirname(__file__))
+DEFAULT_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                           'data', 'language', 'countries')
+
+# Remote data sets downloaded by main()
+CLDR_URL = 'http://unicode.org/repos/cldr/trunk/common'
+CLDR_SUPPLEMENTAL_DATA = CLDR_URL + '/supplemental/supplementalData.xml'
+
+ISO_639_3 = 'http://www-01.sil.org/iso639-3/iso-639-3.tab'
+ISO_MACROLANGUAGES = 'http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab'
+
+# Output file names (MACROLANGUAGES_FILENAME is not referenced by the
+# functions visible in this file)
+ISO_LANGUAGES_FILENAME = 'iso_languages.tsv'
+MACROLANGUAGES_FILENAME = 'iso_macrolanguages.tsv'
+COUNTRY_LANGUAGES_FILENAME = 'country_language.tsv'
+
+# officialStatus value that is tracked separately from other official statuses
+REGIONAL = 'official_regional'
+# Territory code that is skipped when reading territoryInfo
+UNKNOWN_COUNTRY = 'zz'
+# Language codes that are never written to the country/language file
+UNKNOWN_LANGUAGES = ('und', 'zxx')
+
+
+def _language_scripts(xml):
+    """Map (language, territory) keys to the scripts listed for them.
+
+    Reads every ``languageData/language`` element of *xml*.  The first
+    entry seen for a language is stored under ``(language, None)`` and acts
+    as the fallback; each territory named in an entry's ``territories``
+    attribute gets its own ``(language, territory)`` key.  Entries without
+    a ``scripts`` attribute are ignored.
+    """
+    lang_scripts = {}
+    for lang in xml.xpath('//languageData/language'):
+        language_code = lang.attrib['type'].lower()
+        script = lang.get('scripts')
+        if not script:
+            continue
+        # setdefault keeps the first script list seen as the language default
+        lang_scripts.setdefault((language_code, None), script)
+
+        territories = lang.get('territories')
+        if not territories:
+            continue
+        for territory in territories.strip().split():
+            lang_scripts[(language_code, territory.lower())] = script
+    return lang_scripts
+
+
+def write_country_official_languages_file(xml, out_dir):
+    """Write the per-country language table derived from *xml*.
+
+    One tab-separated row is written to ``COUNTRY_LANGUAGES_FILENAME`` in
+    *out_dir* for each (country, language) pair: country code, language
+    code, comma-separated scripts, summed population percent (capped at
+    100) and a 1/0 flag telling whether the language has a non-regional
+    official status in that country.
+
+    For a territory with at least one such official language, only its
+    official and regional languages are written; otherwise only the single
+    language with the highest population percent is written.
+
+    :param xml: parsed supplemental data exposing ``xpath()`` (an lxml
+                element when called from main())
+    :param out_dir: existing directory in which the file is created
+    """
+    lang_scripts = _language_scripts(xml)
+    out_path = os.path.join(out_dir, COUNTRY_LANGUAGES_FILENAME)
+
+    # The with-block guarantees the file is flushed and closed, even if a
+    # malformed element raises part-way through.
+    with open(out_path, 'w') as lang_file:
+        lang_writer = csv.writer(lang_file, delimiter='\t')
+
+        for territory in xml.xpath('//territoryInfo/territory'):
+            country_code = territory.attrib['type'].lower()
+            if country_code == UNKNOWN_COUNTRY:
+                continue
+
+            languages = Counter()
+            official = set()
+            regional = set()
+            for lang in territory.xpath('languagePopulation'):
+                # Drop any '_'-separated suffix so that variants are summed
+                # under the base language code
+                language = lang.attrib['type'].lower().split('_')[0]
+                official_status = lang.attrib.get('officialStatus')
+                languages[language] += float(lang.attrib['populationPercent'])
+                if official_status == REGIONAL:
+                    regional.add(language)
+                elif official_status:
+                    official.add(language)
+
+            if official:
+                languages = Counter({code: pct
+                                     for code, pct in languages.items()
+                                     if code in official or code in regional})
+            else:
+                # No official language: keep only the most widely spoken one
+                languages = Counter(dict(languages.most_common(1)))
+
+            for lang, pct in languages.most_common():
+                if lang in UNKNOWN_LANGUAGES:
+                    continue
+
+                # Prefer the territory-specific scripts, then the language
+                # default, then an empty field
+                script = lang_scripts.get((lang, country_code),
+                                          lang_scripts.get((lang, None), ''))
+
+                lang_writer.writerow((country_code, lang,
+                                      script.replace(' ', ','),
+                                      str(min(pct, 100.0)),
+                                      str(int(lang in official))))
+
+# Single-letter codes compared against fields of the downloaded ISO 639-3
+# tables in write_languages_file().
+# Values of the 'Scope' column that are kept
+INDIVIDUAL = 'I'
+MACRO = 'M'
+# Value of the 'Language_Type' column that is kept
+LIVING = 'L'
+# Status value of macrolanguage mappings that are dropped
+RETIRED = 'R'
+
+
+def write_languages_file(langs, macro, out_dir):
+    """Write the ISO 639 language table as a TSV file in *out_dir*.
+
+    Only living languages whose scope is individual or macrolanguage are
+    written.  Each row holds the ISO 639-3, 639-2B, 639-2T and 639-1 codes,
+    the scope (under the header ``type``) and the macrolanguage the code
+    maps to (empty when there is none).
+
+    :param langs: contents of the tab-separated ISO 639-3 code table,
+                  header row included
+    :param macro: contents of the tab-separated, three-column
+                  macrolanguage mapping table, header row included
+    :param out_dir: existing directory in which ``ISO_LANGUAGES_FILENAME``
+                    is created
+    :raises ValueError: if either table does not have the expected header
+    """
+    macro_reader = csv.reader(StringIO(macro), delimiter='\t')
+    # Built-in next() rather than the Python 2-only .next() method
+    headers = next(macro_reader)
+    if len(headers) != 3:
+        raise ValueError('Unexpected macrolanguage header: %r' % (headers,))
+
+    # Member language code -> macrolanguage code, ignoring retired mappings
+    macros = {}
+    for row in macro_reader:
+        if not row:
+            # csv.reader yields an empty list for a blank line
+            continue
+        macro_code, minor_code, status = row
+        if status != RETIRED:
+            macros[minor_code] = macro_code
+
+    lang_reader = csv.reader(StringIO(langs), delimiter='\t')
+    headers = next(lang_reader)
+    expected = ['Id', 'Part2B', 'Part2T', 'Part1', 'Scope', 'Language_Type']
+    if headers[:6] != expected:
+        raise ValueError('Unexpected ISO 639-3 header: %r' % (headers,))
+
+    # Inputs are validated before the file is created, and the with-block
+    # guarantees that it is flushed and closed.
+    with open(os.path.join(out_dir, ISO_LANGUAGES_FILENAME), 'w') as lang_file:
+        writer = csv.writer(lang_file, delimiter='\t')
+        writer.writerow(('ISO 639-3', 'ISO 639-2B', 'ISO 639-2T',
+                         'ISO 639-1', 'type', 'macro'))
+
+        for line in lang_reader:
+            if not line:
+                continue
+            iso639_3, iso639_2b, iso639_2t, iso639_1, scope, lang_type = line[:6]
+            # Only living languages that are either individual or macro
+            if scope in (INDIVIDUAL, MACRO) and lang_type == LIVING:
+                writer.writerow((iso639_3, iso639_2b, iso639_2t,
+                                 iso639_1, scope, macros.get(iso639_3, '')))
+
+
+def _download(url):
+    """Return the body of *url*, raising if the server reports an error."""
+    response = requests.get(url)
+    # Without this check an HTTP error page would be parsed as if it were data
+    response.raise_for_status()
+    return response.content
+
+
+def main(out_dir):
+    """Download the source data sets and write the TSV files to *out_dir*.
+
+    Fetches the ISO 639-3 code table and macrolanguage mapping and passes
+    them to write_languages_file(), then fetches and parses the CLDR
+    supplemental data and passes it to
+    write_country_official_languages_file().
+
+    :param out_dir: directory in which the output files are created
+    :raises requests.HTTPError: if a download returns an HTTP error status
+    """
+    langs = _download(ISO_639_3)
+    macro = _download(ISO_MACROLANGUAGES)
+    write_languages_file(langs, macro, out_dir)
+
+    xml = etree.fromstring(_download(CLDR_SUPPLEMENTAL_DATA))
+    write_country_official_languages_file(xml, out_dir)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: the only option is the output directory,
+    # which falls back to DEFAULT_DIR when not given.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('-o', '--out', default=DEFAULT_DIR,
+                            help='Out directory')
+    options = arg_parser.parse_args()
+    main(options.out)