From 6491a5c3c49b6fd5c508772b7110a428b9f92be8 Mon Sep 17 00:00:00 2001
From: Al
Date: Tue, 10 May 2016 09:40:08 -0400
Subject: [PATCH] [countries] Wrapping CLDR country names/alpha3 codes

---
Review notes (ignored by git am):
 - Line breaks and indentation were restored from a whitespace-collapsed
   copy of this patch; the removed-file hunk is otherwise unchanged.
 - names.py was amended during review, so the commit hash above and the
   blob id on its index line describe the pre-review version.
 - names.py: import defaultdict explicitly instead of relying on the
   star import, resolve the sys.path entry against this_dir rather than
   the working directory, let lxml open/close the XML file, use
   six.iteritems throughout, lowercase the language in localized_name
   and stop it from inserting empty entries for unknown countries.

 scripts/geodata/countries/country_names.py | 181 --------------------
 scripts/geodata/countries/names.py         | 204 ++++++++++++++++++++++
 2 files changed, 204 insertions(+), 181 deletions(-)
 delete mode 100644 scripts/geodata/countries/country_names.py
 create mode 100644 scripts/geodata/countries/names.py

diff --git a/scripts/geodata/countries/country_names.py b/scripts/geodata/countries/country_names.py
deleted file mode 100644
index 2dab838d..00000000
--- a/scripts/geodata/countries/country_names.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-
-import os
-import sys
-
-import pycountry
-
-from collections import OrderedDict
-
-from lxml import etree
-
-this_dir = os.path.realpath(os.path.dirname(__file__))
-sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
-
-from geodata.i18n.unicode_paths import CLDR_DIR
-from geodata.i18n.languages import *
-from geodata.encoding import safe_decode
-
-CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
-
-
-IGNORE_COUNTRIES = set(['ZZ'])
-
-COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS'])
-COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL'])
-
-LANGUAGE_COUNTRY_OVERRIDES = {
-    'en': {
-        'CD': 'Democratic Republic of the Congo',
-        'CG': 'Republic of the Congo',
-    },
-
-    # Countries where the local language is absent from CLDR
-
-    # Tajik / Tajikistan
-    'tg': {
-        'TJ': 'Тоҷикистон',
-    },
-
-    # Maldivan / Maldives
-    'dv': {
-        'MV': 'ދިވެހިރާއްޖެ',
-    }
-
-
-}
-
-
-def cldr_country_names(language, base_dir=CLDR_MAIN_PATH):
-    '''
-    Country names are tricky as there can be several versions
-    and levels of verbosity e.g. United States of America
-    vs. the more commonly used United States. Most countries
-    have a similarly verbose form.
-
-    The CLDR repo (http://cldr.unicode.org/) has the most
-    comprehensive localized database of country names
-    (among other things), organized by language. This function
-    parses CLDR XML for a given language and returns a dictionary
-    of {country_code: name} for that language.
-    '''
-    filename = os.path.join(base_dir, '{}.xml'.format(language))
-    xml = etree.parse(open(filename))
-
-    country_names = defaultdict(dict)
-
-    for territory in xml.xpath('*//territories/*'):
-        country_code = territory.attrib['type']
-
-        if country_code in IGNORE_COUNTRIES or country_code.isdigit():
-            continue
-
-        country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
-
-    display_names = {}
-
-    for country_code, names in country_names.iteritems():
-        if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
-            display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
-            continue
-
-        default_name = names.get(None)
-
-        if country_code in COUNTRY_USE_SHORT_NAME:
-            display_names[country_code] = names.get('short', default_name)
-        elif country_code in COUNTRY_USE_VARIANT_NAME:
-            display_names[country_code] = names.get('variant', default_name)
-        elif default_name is not None:
-            display_names[country_code] = default_name
-
-    return display_names
-
-
-country_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries])
-country_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries])
-
-country_alpha3_map = {c.alpha3.lower(): c.alpha2.lower() for c in pycountry.countries}
-
-language_country_names = {}
-
-country_official_names = defaultdict(OrderedDict)
-country_local_names = defaultdict(OrderedDict)
-
-
-def init_country_names(base_dir=CLDR_MAIN_PATH):
-    '''
-    Call init_country_names to initialized the module. Sets up the above dictionaries.
-    '''
-    global language_country_names
-    init_languages()
-
-    local_languages = {}
-
-    country_language_names = defaultdict(dict)
-
-    for filename in os.listdir(base_dir):
-        lang = filename.split('.xml')[0]
-        if len(lang) > 3:
-            continue
-
-        names = cldr_country_names(lang, base_dir=base_dir)
-        lang = lang.lower()
-        language_country_names[lang] = names
-
-        for country, name in names.iteritems():
-            country = country.lower()
-
-            languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
-            local_languages[country] = languages
-
-            if lang in local_languages.get(country, {}):
-                country_language_names[country][lang] = name
-
-    for l, names in LANGUAGE_COUNTRY_OVERRIDES.iteritems():
-        if l not in language_country_names:
-            language_country_names[l.lower()] = names
-
-        for c, name in names.iteritems():
-            if c.lower() not in country_language_names:
-                country_language_names[c.lower()][l.lower()] = name
-
-    for country, langs in local_languages.iteritems():
-        names = country_language_names[country]
-        num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
-        for i, (lang, default) in enumerate(langs.iteritems()):
-            name = names.get(lang)
-            if not name:
-                continue
-            if default or num_defaults == 0:
-                country_official_names[country][lang] = name
-                if num_defaults == 0:
-                    break
-            country_local_names[country][lang] = name
-
-
-def country_localized_display_name(country_code):
-    '''
-    Get the display name for a country code in the local language
-    e.g. Россия for Russia, España for Spain, etc.
-
-    For most countries there is a single official name. For countries
-    with more than one official language, this will return a concatenated
-    version separated by a slash e.g. Maroc / المغرب for Morocco.
-
-    Note that all of the exceptions in road_sign_languages.tsv are also
-    taken into account here so India for example uses the English name
-    rather than concatenating all 27 toponyms.
-
-    This method should be roughly consistent with OSM's display names.
-
-    Usage:
-        >>> country_official_name('jp')     # returns '日本'
-        >>> country_official_name('be')     # returns 'België / Belgique / Belgien'
-    '''
-
-    country_code = country_code.lower()
-    if not country_official_names:
-        init_country_names()
-    return ' / '.join(OrderedDict.fromkeys(n.replace('-', ' ')
-                      for n in country_official_names[country_code].values()).keys())
diff --git a/scripts/geodata/countries/names.py b/scripts/geodata/countries/names.py
new file mode 100644
index 00000000..bff7090c
--- /dev/null
+++ b/scripts/geodata/countries/names.py
@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+import os
+import six
+import sys
+
+import pycountry
+
+from collections import OrderedDict, defaultdict
+
+from lxml import etree
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
+
+from geodata.i18n.unicode_paths import CLDR_DIR
+from geodata.i18n.languages import *
+from geodata.encoding import safe_decode
+
+CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
+
+COUNTRY_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                              'resources', 'countries', 'names.yaml')
+
+IGNORE_COUNTRIES = set([six.u('ZZ')])
+
+COUNTRY_USE_SHORT_NAME = set([six.u('HK'), six.u('MM'), six.u('MO'), six.u('PS')])
+COUNTRY_USE_VARIANT_NAME = set([six.u('CD'), six.u('CG'), six.u('CI'), six.u('TL')])
+
+LANGUAGE_COUNTRY_OVERRIDES = {
+    'en': {
+        'CD': safe_decode('Democratic Republic of the Congo'),
+        'CG': safe_decode('Republic of the Congo'),
+    },
+
+    # Countries where the local language is absent from CLDR
+
+    # Tajik / Tajikistan
+    'tg': {
+        'TJ': safe_decode('Тоҷикистон'),
+    },
+
+    # Maldivian / Maldives
+    'dv': {
+        'MV': safe_decode('ދިވެހިރާއްޖެ'),
+    }
+
+}
+
+
+class CountryNames(object):
+    '''
+    Country names parsed from the CLDR XML files in base_dir, along
+    with the alpha2 => alpha3 country code mapping from pycountry.
+
+    Everything is loaded eagerly in the constructor.
+    '''
+
+    def __init__(self, base_dir=CLDR_MAIN_PATH):
+        self.base_dir = base_dir
+
+        self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
+
+        # lowercased language => {country code: name}
+        self.language_country_names = {}
+        # lowercased country code => {lowercased language: name}
+        self.country_language_names = defaultdict(dict)
+
+        # country => {language: name} for the country's own languages only
+        self.country_official_names = defaultdict(OrderedDict)
+        self.country_local_names = defaultdict(OrderedDict)
+
+        local_languages = {}
+
+        country_local_language_names = defaultdict(dict)
+
+        for filename in os.listdir(base_dir):
+            lang = filename.split('.xml')[0]
+            # Only names of at most 3 characters are read (presumably bare language codes)
+            if len(lang) > 3:
+                continue
+
+            names = self.cldr_country_names(lang)
+            lang = lang.lower()
+            self.language_country_names[lang] = names
+
+            for country, name in six.iteritems(names):
+                country = country.lower()
+
+                languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
+                local_languages[country] = languages
+
+                self.country_language_names[country][lang] = name
+
+                if lang in local_languages.get(country, {}):
+                    country_local_language_names[country][lang] = name
+
+        for l, names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
+            if l not in self.language_country_names:
+                self.language_country_names[l.lower()] = names
+
+            for c, name in six.iteritems(names):
+                self.country_language_names[c.lower()][l.lower()] = name
+                if c.lower() not in country_local_language_names:
+                    country_local_language_names[c.lower()][l.lower()] = name
+
+        for country, langs in six.iteritems(local_languages):
+            names = country_local_language_names[country]
+            num_defaults = sum(1 for lang in names if langs.get(lang))
+            for lang, default in six.iteritems(langs):
+                name = names.get(lang)
+                if not name:
+                    continue
+                if default or num_defaults == 0:
+                    self.country_official_names[country][lang] = name
+                    if num_defaults == 0:
+                        break
+                self.country_local_names[country][lang] = name
+
+    def cldr_country_names(self, language):
+        '''
+        Country names are tricky as there can be several versions
+        and levels of verbosity e.g. United States of America
+        vs. the more commonly used United States. Most countries
+        have a similarly verbose form.
+
+        The CLDR repo (http://cldr.unicode.org/) has the most
+        comprehensive localized database of country names
+        (among other things), organized by language. This function
+        parses CLDR XML for a given language and returns a dictionary
+        of {country_code: name} for that language.
+        '''
+        filename = os.path.join(self.base_dir, '{}.xml'.format(language))
+        # Passing the path lets lxml open and close the file itself
+        xml = etree.parse(filename)
+
+        country_names = defaultdict(dict)
+
+        for territory in xml.xpath('*//territories/*'):
+            country_code = territory.attrib['type']
+
+            if country_code in IGNORE_COUNTRIES or country_code.isdigit():
+                continue
+
+            country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
+
+        display_names = {}
+
+        for country_code, names in six.iteritems(country_names):
+            if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
+                display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
+                continue
+
+            default_name = names.get(None)
+
+            if country_code in COUNTRY_USE_SHORT_NAME:
+                display_names[country_code] = names.get('short', default_name)
+            elif country_code in COUNTRY_USE_VARIANT_NAME:
+                display_names[country_code] = names.get('variant', default_name)
+            elif default_name is not None:
+                display_names[country_code] = default_name
+
+        return display_names
+
+    def localized_name(self, country_code, language=None):
+        '''
+        Get the display name for a country code in the local language
+        e.g. Россия for Russia, España for Spain, etc.
+
+        For most countries there is a single official name. For countries
+        with more than one official language, this will return a concatenated
+        version separated by a slash e.g. Maroc / المغرب for Morocco.
+
+        Note that all of the exceptions in road_sign_languages.tsv are also
+        taken into account here so India for example uses the English name
+        rather than concatenating all 27 toponyms.
+
+        This method should be roughly consistent with OSM's display names.
+
+        If language is given, the name of the country in that language is
+        returned instead, or None if there is no name for that combination.
+
+        Usage:
+            >>> country_names.localized_name('jp')     # returns '日本'
+            >>> country_names.localized_name('be')     # returns 'België / Belgique / Belgien'
+        '''
+
+        country_code = country_code.lower()
+        if language is None:
+            official_names = self.country_official_names.get(country_code, {})
+            return six.u(' / ').join(OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
+                                     for n in official_names.values()).keys())
+        else:
+            # Languages are stored lowercased by the constructor
+            return self.country_language_names.get(country_code, {}).get(language.lower())
+
+    def alpha3_code(self, alpha2_code):
+        '''
+        Get the uppercased alpha3 country code for an alpha2 code given in
+        any case, or None if pycountry has no country with that alpha2 code.
+        '''
+        alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
+        return alpha3.upper() if alpha3 else None
+
+country_names = CountryNames()