libpostal/scripts/geodata/i18n/cldr_languages.py

import argparse
import csv
import os
import requests

from collections import Counter

from cStringIO import StringIO
from lxml import etree

from unicode_paths import CLDR_DIR

this_dir = os.path.realpath(os.path.dirname(__file__))
DEFAULT_LANGUAGES_DIR = os.path.join(os.pardir, os.pardir, os.pardir,
                                     'data', 'language', 'countries')

CLDR_SUPPLEMENTAL_DATA = os.path.join(CLDR_DIR, 'common', 'supplemental',
                                      'supplementalData.xml')

ISO_639_3 = 'http://www-01.sil.org/iso639-3/iso-639-3.tab'
ISO_MACROLANGUAGES = 'http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab'

ISO_LANGUAGES_FILENAME = 'iso_languages.tsv'
MACROLANGUAGES_FILENAME = 'iso_macrolanguages.tsv'
COUNTRY_LANGUAGES_FILENAME = 'country_language.tsv'
SCRIPT_LANGUAGES_FILENAME = 'script_languages.tsv'

REGIONAL = 'official_regional'
UNKNOWN_COUNTRY = 'zz'
UNKNOWN_LANGUAGES = ('und', 'zxx')


def write_country_official_languages_file(xml, out_dir):
    lang_file = open(os.path.join(out_dir, COUNTRY_LANGUAGES_FILENAME), 'w')
    lang_writer = csv.writer(lang_file, delimiter='\t')

    def get_population_pct(lang):
        return int(lang.attrib.get('populationPercent', 0))

    lang_scripts = {}
    for lang in xml.xpath('//languageData/language'):
        language_code = lang.attrib['type'].lower()
        scripts = lang.get('scripts')
        if not scripts:
            continue
        territories = lang.get('territories')
        if (language_code, None) not in lang_scripts:
            lang_scripts[(language_code, None)] = scripts

        if not territories:
            continue
        for territory in territories.strip().split():
            lang_scripts[(language_code, territory.lower())] = scripts

    for territory in xml.xpath('//territoryInfo/territory'):
        country_code = territory.attrib['type'].lower()
        if country_code == UNKNOWN_COUNTRY:
            continue
        langs = territory.xpath('languagePopulation')
        languages = Counter()
        official = set()
        regional = set()
        for lang in langs:
            language = lang.attrib['type'].lower().split('_')[0]
            official_status = lang.attrib.get('officialStatus')
            languages[language] += float(lang.attrib['populationPercent'])
            if official_status and official_status != REGIONAL:
                official.add(language)
            elif official_status == REGIONAL:
                regional.add(language)

        if official:
            languages = Counter({l: c for l, c in languages.iteritems()
                                 if l in official or l in regional})
        else:
            languages = Counter({l: c for l, c in languages.most_common(1)})

        for lang, pct in languages.most_common():
            if lang in UNKNOWN_LANGUAGES:
                continue

            script = lang_scripts.get((lang, country_code), lang_scripts.get((lang, None), ''))

            lang_writer.writerow((country_code, lang, script.replace(' ', ','),
                                  str(min(pct, 100.0)), str(int(lang in official))))

RETIRED = 'R'
INDIVIDUAL = 'I'
MACRO = 'M'
LIVING = 'L'


def write_languages_file(langs, macro, out_dir):
    lang_file = open(os.path.join(out_dir, 'iso_languages.tsv'), 'w')
    writer = csv.writer(lang_file, delimiter='\t')
    writer.writerow(('ISO 639-3', 'ISO 639-2B', 'ISO 639-2T',
                     'ISO 639-1', 'type', 'macro'))

    macro_reader = csv.reader(StringIO(macro), delimiter='\t')
    headers = macro_reader.next()
    assert len(headers) == 3
    macros = {minor_code: macro_code for (macro_code, minor_code, status)
              in macro_reader if status != RETIRED}

    lang_reader = csv.reader(StringIO(langs), delimiter='\t')
    headers = lang_reader.next()
    assert headers[:6] == ['Id', 'Part2B', 'Part2T',
                           'Part1', 'Scope', 'Language_Type']

    for line in lang_reader:
        iso639_3, iso639_2b, iso639_2t, iso639_1, scope, lang_type = line[:6]
        macro = macros.get(iso639_3, '')
        # Only living languages that are either individual or macro
        if scope in (INDIVIDUAL, MACRO) and lang_type == LIVING:
            writer.writerow((iso639_3, iso639_2b, iso639_2t,
                             iso639_1, scope, macro))


def fetch_cldr_languages(out_dir=DEFAULT_LANGUAGES_DIR):
    response = requests.get(ISO_639_3)
    langs = response.content

    response = requests.get(ISO_MACROLANGUAGES)
    macro = response.content
    write_languages_file(langs, macro, out_dir)

    supplemental = open(CLDR_SUPPLEMENTAL_DATA)
    xml = etree.parse(supplemental)
    write_country_official_languages_file(xml, out_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--out',
                        default=DEFAULT_LANGUAGES_DIR,
                        help='Out directory')
    args = parser.parse_args()

    fetch_cldr_languages(args.out)