diff --git a/scripts/geodata/countries/country_names.py b/scripts/geodata/countries/country_names.py
index 9c75db5e..dcffe3c6 100644
--- a/scripts/geodata/countries/country_names.py
+++ b/scripts/geodata/countries/country_names.py
@@ -1,3 +1,6 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
 import os
 import sys
 
@@ -12,7 +15,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 from geodata.i18n.unicode_paths import CLDR_DIR
 from geodata.i18n.languages import *
-from geodata.encoding import safe_encode
+from geodata.encoding import safe_decode
 
 CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
 
@@ -26,7 +29,21 @@ LANGUAGE_COUNTRY_OVERRIDES = {
     'en': {
         'CD': 'Democratic Republic of the Congo',
         'CG': 'Republic of the Congo',
+    },
+
+    # Countries that don't have their language in CLDR
+
+    # Tajik / Tajikistan
+    'tg': {
+        'TJ': 'Тоҷикистон',
+    },
+
+    # Maldivian / Maldives
+    'dv': {
+        'MV': 'ދިވެހިރާއްޖެ',
     }
+
+
 }
 
 
@@ -34,25 +51,33 @@ def cldr_country_names(language, base_dir=CLDR_MAIN_PATH):
     filename = os.path.join(base_dir, '{}.xml'.format(language))
     xml = etree.parse(open(filename))
 
-    country_names = {}
+    country_names = defaultdict(dict)
 
     for territory in xml.xpath('*//territories/*'):
         country_code = territory.attrib['type']
+
         if country_code in IGNORE_COUNTRIES or country_code.isdigit():
             continue
-        elif country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
-            country_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
-            continue
-        elif country_code in COUNTRY_USE_SHORT_NAME and territory.attrib.get('alt') != 'short':
-            continue
-        elif country_code in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt') != 'variant':
-            continue
-        elif country_code not in COUNTRY_USE_SHORT_NAME and country_code not in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt'):
+
+        country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
+
+    display_names = {}
+
+    for country_code, names in country_names.iteritems():
+        if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
+            display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
             continue
 
-        country_names[country_code] = safe_decode(territory.text)
+        default_name = names.get(None)
 
-    return country_names
+        if country_code in COUNTRY_USE_SHORT_NAME:
+            display_names[country_code] = names.get('short', default_name)
+        elif country_code in COUNTRY_USE_VARIANT_NAME:
+            display_names[country_code] = names.get('variant', default_name)
+        elif default_name is not None:
+            display_names[country_code] = default_name
+
+    return display_names
 
 country_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries])
 country_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries])
@@ -69,12 +94,13 @@ def init_country_names(base_dir=CLDR_MAIN_PATH):
     global language_country_names
     init_languages()
 
-    local_languages = {country: get_country_languages(country, official=False,
-                       overrides=False) for country in country_alpha2_codes}
+    local_languages = {country: get_country_languages(country, official=False)
+                       or OrderedDict([('en', 1)])
+                       for country in country_alpha2_codes}
 
     country_language_names = defaultdict(dict)
 
-    for filename in os.listdir(CLDR_MAIN_PATH):
+    for filename in os.listdir(base_dir):
         lang = filename.split('.xml')[0]
         if len(lang) > 3:
             continue
@@ -89,12 +115,39 @@ def init_country_names(base_dir=CLDR_MAIN_PATH):
             if lang in local_languages.get(country, {}):
                 country_language_names[country][lang] = name
 
+    for l, names in LANGUAGE_COUNTRY_OVERRIDES.iteritems():
+        if l not in language_country_names:
+            language_country_names[l.lower()] = names
+
+        for c, name in names.iteritems():
+            if c.lower() not in country_language_names:
+                country_language_names[c.lower()][l.lower()] = name
+
     for country, langs in local_languages.iteritems():
         names = country_language_names[country]
-        for lang, default in langs.iteritems():
+        num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
+        for i, (lang, default) in enumerate(langs.iteritems()):
             name = names.get(lang)
             if not name:
                 continue
-            if default:
+            if default or num_defaults == 0:
                 country_official_names[country][lang] = name
+                if num_defaults == 0:
+                    break
             country_local_names[country][lang] = name
+
+
+def country_official_name(country_code):
+    '''
+    Return the country's name as written in its own default language(s).
+
+    country_code is an ISO 3166-1 alpha-2 code in any letter case; it is
+    lowercased because the name tables are keyed by lowercase codes.
+    The tables are built on first use. Distinct names are joined with
+    ' / ' and hyphens in names are replaced with spaces.
+    '''
+    country_code = country_code.lower()
+    if not country_official_names:
+        init_country_names()
+    return ' / '.join(OrderedDict.fromkeys(n.replace('-', ' ')
+                      for n in country_official_names[country_code].values()).keys())