[countries] Wrapping CLDR country names/alpha3 codes

This commit is contained in:
Al
2016-05-10 09:40:08 -04:00
parent 0ba010b9aa
commit d17ff3678d
2 changed files with 183 additions and 181 deletions

View File

@@ -1,181 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import sys
import pycountry
from collections import OrderedDict
from lxml import etree
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.i18n.languages import *
from geodata.encoding import safe_decode
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
IGNORE_COUNTRIES = set(['ZZ'])
COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS'])
COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL'])
LANGUAGE_COUNTRY_OVERRIDES = {
'en': {
'CD': 'Democratic Republic of the Congo',
'CG': 'Republic of the Congo',
},
# Countries where the local language is absent from CLDR
# Tajik / Tajikistan
'tg': {
'TJ': 'Тоҷикистон',
},
# Maldivan / Maldives
'dv': {
'MV': 'ދިވެހިރާއްޖެ',
}
}
def cldr_country_names(language, base_dir=CLDR_MAIN_PATH):
'''
Country names are tricky as there can be several versions
and levels of verbosity e.g. United States of America
vs. the more commonly used United States. Most countries
have a similarly verbose form.
The CLDR repo (http://cldr.unicode.org/) has the most
comprehensive localized database of country names
(among other things), organized by language. This function
parses CLDR XML for a given language and returns a dictionary
of {country_code: name} for that language.
'''
filename = os.path.join(base_dir, '{}.xml'.format(language))
xml = etree.parse(open(filename))
country_names = defaultdict(dict)
for territory in xml.xpath('*//territories/*'):
country_code = territory.attrib['type']
if country_code in IGNORE_COUNTRIES or country_code.isdigit():
continue
country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
display_names = {}
for country_code, names in country_names.iteritems():
if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
continue
default_name = names.get(None)
if country_code in COUNTRY_USE_SHORT_NAME:
display_names[country_code] = names.get('short', default_name)
elif country_code in COUNTRY_USE_VARIANT_NAME:
display_names[country_code] = names.get('variant', default_name)
elif default_name is not None:
display_names[country_code] = default_name
return display_names
country_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries])
country_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries])
country_alpha3_map = {c.alpha3.lower(): c.alpha2.lower() for c in pycountry.countries}
language_country_names = {}
country_official_names = defaultdict(OrderedDict)
country_local_names = defaultdict(OrderedDict)
def init_country_names(base_dir=CLDR_MAIN_PATH):
'''
Call init_country_names to initialized the module. Sets up the above dictionaries.
'''
global language_country_names
init_languages()
local_languages = {}
country_language_names = defaultdict(dict)
for filename in os.listdir(base_dir):
lang = filename.split('.xml')[0]
if len(lang) > 3:
continue
names = cldr_country_names(lang, base_dir=base_dir)
lang = lang.lower()
language_country_names[lang] = names
for country, name in names.iteritems():
country = country.lower()
languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
local_languages[country] = languages
if lang in local_languages.get(country, {}):
country_language_names[country][lang] = name
for l, names in LANGUAGE_COUNTRY_OVERRIDES.iteritems():
if l not in language_country_names:
language_country_names[l.lower()] = names
for c, name in names.iteritems():
if c.lower() not in country_language_names:
country_language_names[c.lower()][l.lower()] = name
for country, langs in local_languages.iteritems():
names = country_language_names[country]
num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
for i, (lang, default) in enumerate(langs.iteritems()):
name = names.get(lang)
if not name:
continue
if default or num_defaults == 0:
country_official_names[country][lang] = name
if num_defaults == 0:
break
country_local_names[country][lang] = name
def country_localized_display_name(country_code):
'''
Get the display name for a country code in the local language
e.g. Россия for Russia, España for Spain, etc.
For most countries there is a single official name. For countries
with more than one official language, this will return a concatenated
version separated by a slash e.g. Maroc / المغرب for Morocco.
Note that all of the exceptions in road_sign_languages.tsv are also
taken into account here so India for example uses the English name
rather than concatenating all 27 toponyms.
This method should be roughly consistent with OSM's display names.
Usage:
>>> country_official_name('jp') # returns '日本'
>>> country_official_name('be') # returns 'België / Belgique / Belgien'
'''
country_code = country_code.lower()
if not country_official_names:
init_country_names()
return ' / '.join(OrderedDict.fromkeys(n.replace('-', ' ')
for n in country_official_names[country_code].values()).keys())

View File

@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-
import os
import six
import sys
import pycountry
from collections import OrderedDict
from lxml import etree
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.i18n.languages import *
from geodata.encoding import safe_decode
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
COUNTRY_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'countries', 'names.yaml')
IGNORE_COUNTRIES = set([six.u('ZZ')])
COUNTRY_USE_SHORT_NAME = set([six.u('HK'), six.u('MM'), six.u('MO'), six.u('PS')])
COUNTRY_USE_VARIANT_NAME = set([six.u('CD'), six.u('CG'), six.u('CI'), six.u('TL')])
LANGUAGE_COUNTRY_OVERRIDES = {
'en': {
'CD': safe_decode('Democratic Republic of the Congo'),
'CG': safe_decode('Republic of the Congo'),
},
# Countries where the local language is absent from CLDR
# Tajik / Tajikistan
'tg': {
'TJ': safe_decode('Тоҷикистон'),
},
# Maldivan / Maldives
'dv': {
'MV': safe_decode('ދިވެހިރާއްޖެ'),
}
}
class CountryNames(object):
def __init__(self, base_dir=CLDR_MAIN_PATH):
self.base_dir = base_dir
self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
self.language_country_names = {}
self.country_language_names = defaultdict(dict)
self.country_official_names = defaultdict(OrderedDict)
self.country_local_names = defaultdict(OrderedDict)
local_languages = {}
country_local_language_names = defaultdict(dict)
for filename in os.listdir(base_dir):
lang = filename.split('.xml')[0]
if len(lang) > 3:
continue
names = self.cldr_country_names(lang)
lang = lang.lower()
self.language_country_names[lang] = names
for country, name in names.iteritems():
country = country.lower()
languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
local_languages[country] = languages
self.country_language_names[country.lower()][lang.lower()] = name
if lang in local_languages.get(country, {}):
country_local_language_names[country][lang] = name
for l, names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
if l not in self.language_country_names:
self.language_country_names[l.lower()] = names
for c, name in six.iteritems(names):
self.country_language_names[c.lower()][l.lower()] = name
if c.lower() not in country_local_language_names:
country_local_language_names[c.lower()][l.lower()] = name
for country, langs in six.iteritems(local_languages):
names = country_local_language_names[country]
num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
for i, (lang, default) in enumerate(langs.iteritems()):
name = names.get(lang)
if not name:
continue
if default or num_defaults == 0:
self.country_official_names[country][lang] = name
if num_defaults == 0:
break
self.country_local_names[country][lang] = name
def cldr_country_names(self, language):
'''
Country names are tricky as there can be several versions
and levels of verbosity e.g. United States of America
vs. the more commonly used United States. Most countries
have a similarly verbose form.
The CLDR repo (http://cldr.unicode.org/) has the most
comprehensive localized database of country names
(among other things), organized by language. This function
parses CLDR XML for a given language and returns a dictionary
of {country_code: name} for that language.
'''
filename = os.path.join(self.base_dir, '{}.xml'.format(language))
xml = etree.parse(open(filename))
country_names = defaultdict(dict)
for territory in xml.xpath('*//territories/*'):
country_code = territory.attrib['type']
if country_code in IGNORE_COUNTRIES or country_code.isdigit():
continue
country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
display_names = {}
for country_code, names in country_names.iteritems():
if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
continue
default_name = names.get(None)
if country_code in COUNTRY_USE_SHORT_NAME:
display_names[country_code] = names.get('short', default_name)
elif country_code in COUNTRY_USE_VARIANT_NAME:
display_names[country_code] = names.get('variant', default_name)
elif default_name is not None:
display_names[country_code] = default_name
return display_names
def localized_name(self, country_code, language=None):
'''
Get the display name for a country code in the local language
e.g. Россия for Russia, España for Spain, etc.
For most countries there is a single official name. For countries
with more than one official language, this will return a concatenated
version separated by a slash e.g. Maroc / المغرب for Morocco.
Note that all of the exceptions in road_sign_languages.tsv are also
taken into account here so India for example uses the English name
rather than concatenating all 27 toponyms.
This method should be roughly consistent with OSM's display names.
Usage:
>>> country_names.localized_name('jp') # returns '日本'
>>> country_names.localized_name('be') # returns 'België / Belgique / Belgien'
'''
country_code = country_code.lower()
if language is None:
return six.u(' / ').join(OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
for n in self.country_official_names[country_code].values()).keys())
else:
return self.country_language_names.get(country_code, {}).get(language)
def alpha3_code(self, alpha2_code):
alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
return alpha3.upper() if alpha3 else None
country_names = CountryNames()