[countries] Wrapping CLDR country names/alpha3 codes
This commit is contained in:
@@ -1,181 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pycountry
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from lxml import etree
|
||||
|
||||
# Absolute directory containing this file.
this_dir = os.path.realpath(os.path.dirname(__file__))

# Extend the import path with the directory two levels above this file
# (presumably the one that holds the ``geodata`` package imported below --
# confirm against the repository layout). The path is anchored at this file
# rather than at the current working directory, so the imports resolve no
# matter where the script is launched from.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.i18n.unicode_paths import CLDR_DIR
|
||||
from geodata.i18n.languages import *
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
# Directory <CLDR_DIR>/common/main; the functions below list it and parse the
# per-language ``<language>.xml`` files found there.
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
|
||||
|
||||
|
||||
# Territory codes that are skipped while reading the CLDR territory list.
IGNORE_COUNTRIES = {'ZZ'}

# Countries for which the CLDR alt="short" / alt="variant" name is preferred
# over the default name (falling back to the default when it is missing).
COUNTRY_USE_SHORT_NAME = {'HK', 'MM', 'MO', 'PS'}
COUNTRY_USE_VARIANT_NAME = {'CD', 'CG', 'CI', 'TL'}

# Hand-maintained names: {language code: {country code: name}}.
# These take precedence over whatever CLDR provides for the same pair.
LANGUAGE_COUNTRY_OVERRIDES = {
    'en': {
        'CD': 'Democratic Republic of the Congo',
        'CG': 'Republic of the Congo',
    },

    # Countries where the local language is absent from CLDR

    # Tajik / Tajikistan
    'tg': {'TJ': 'Тоҷикистон'},

    # Maldivian / Maldives
    'dv': {'MV': 'ދިވެހިރާއްޖެ'},
}
|
||||
|
||||
|
||||
def cldr_country_names(language, base_dir=CLDR_MAIN_PATH):
    '''
    Country names are tricky as there can be several versions
    and levels of verbosity e.g. United States of America
    vs. the more commonly used United States. Most countries
    have a similarly verbose form.

    The CLDR repo (http://cldr.unicode.org/) has the most
    comprehensive localized database of country names
    (among other things), organized by language. This function
    parses CLDR XML for a given language and returns a dictionary
    of {country_code: name} for that language.

    :param language: language code; ``<base_dir>/<language>.xml`` is parsed.
    :param base_dir: directory containing the CLDR locale XML files.
    :return: dict mapping each territory code (as written in the XML) to the
        chosen display name. Entries in LANGUAGE_COUNTRY_OVERRIDES for this
        language win over CLDR. For codes in COUNTRY_USE_SHORT_NAME /
        COUNTRY_USE_VARIANT_NAME the value can be None when the file has
        neither the preferred alternate nor a default name.
    '''
    filename = os.path.join(base_dir, '{}.xml'.format(language))

    # Parse inside a context manager so the handle is closed as soon as the
    # tree is built instead of being leaked. Binary mode lets the XML parser
    # honour the encoding declared in the document.
    with open(filename, 'rb') as f:
        xml = etree.parse(f)

    # {country_code: {alt attribute or None: name}}
    country_names = defaultdict(dict)

    for territory in xml.xpath('*//territories/*'):
        country_code = territory.attrib['type']

        # Skip explicitly ignored codes and purely numeric codes.
        if country_code in IGNORE_COUNTRIES or country_code.isdigit():
            continue

        # The key None marks the default (no ``alt`` attribute) name.
        country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)

    # Looked up once; it does not change while iterating.
    overrides = LANGUAGE_COUNTRY_OVERRIDES.get(language, {})

    display_names = {}

    # .items() works on both Python 2 and Python 3 (unlike .iteritems()).
    for country_code, names in country_names.items():
        if country_code in overrides:
            display_names[country_code] = safe_decode(overrides[country_code])
            continue

        default_name = names.get(None)

        if country_code in COUNTRY_USE_SHORT_NAME:
            display_names[country_code] = names.get('short', default_name)
        elif country_code in COUNTRY_USE_VARIANT_NAME:
            display_names[country_code] = names.get('variant', default_name)
        elif default_name is not None:
            display_names[country_code] = default_name

    return display_names
|
||||
|
||||
|
||||
# Lowercased alpha-2 and alpha-3 codes of every country known to pycountry.
country_alpha2_codes = {country.alpha2.lower() for country in pycountry.countries}
country_alpha3_codes = {country.alpha3.lower() for country in pycountry.countries}

# Lowercased alpha-3 code -> lowercased alpha-2 code.
country_alpha3_map = dict(
    (country.alpha3.lower(), country.alpha2.lower())
    for country in pycountry.countries
)

# The three containers below start empty and are filled by init_country_names().

# {language: {country code: name}}
language_country_names = {}

# {country: OrderedDict(language -> name)}
country_official_names = defaultdict(OrderedDict)
country_local_names = defaultdict(OrderedDict)
|
||||
|
||||
|
||||
def init_country_names(base_dir=CLDR_MAIN_PATH):
    '''
    Call init_country_names to initialize the module. Fills the module-level
    dictionaries language_country_names, country_official_names and
    country_local_names in place.

    :param base_dir: directory containing the CLDR locale XML files.
    '''
    init_languages()

    # {country: languages mapping returned by get_country_languages}
    local_languages = {}

    # {country: {language: name}}, restricted to the country's own languages
    country_language_names = defaultdict(dict)

    for filename in os.listdir(base_dir):
        lang = filename.split('.xml')[0]
        # Only files whose stem is at most three characters long are used.
        if len(lang) > 3:
            continue

        names = cldr_country_names(lang, base_dir=base_dir)
        lang = lang.lower()
        language_country_names[lang] = names

        # .items() works on both Python 2 and Python 3 (unlike .iteritems()).
        for country, name in names.items():
            country = country.lower()

            # Fall back to English when no languages are known for the country.
            languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
            local_languages[country] = languages

            if lang in languages:
                country_language_names[country][lang] = name

    for override_lang, names in LANGUAGE_COUNTRY_OVERRIDES.items():
        # Normalise once so the membership test and the stored key agree.
        override_lang = override_lang.lower()
        if override_lang not in language_country_names:
            language_country_names[override_lang] = names

        for override_country, name in names.items():
            override_country = override_country.lower()
            # Only fill in countries that got no local-language name from CLDR.
            if override_country not in country_language_names:
                country_language_names[override_country][override_lang] = name

    for country, langs in local_languages.items():
        names = country_language_names[country]
        # Number of named languages whose flag in ``langs`` is truthy.
        num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))

        for lang, default in langs.items():
            name = names.get(lang)
            if not name:
                continue

            if default or num_defaults == 0:
                country_official_names[country][lang] = name
                # With no flagged language at all, the first language that
                # has a name becomes the single official name.
                if num_defaults == 0:
                    break

            country_local_names[country][lang] = name
|
||||
|
||||
|
||||
def country_localized_display_name(country_code):
    '''
    Get the display name for a country code in the local language
    e.g. Россия for Russia, España for Spain, etc.

    For most countries there is a single official name. For countries
    with more than one official language, this will return a concatenated
    version separated by a slash e.g. Maroc / المغرب for Morocco.

    Note that all of the exceptions in road_sign_languages.tsv are also
    taken into account here so India for example uses the English name
    rather than concatenating all 27 toponyms.

    This method should be roughly consistent with OSM's display names.

    The module is initialized on first use if init_country_names() has not
    been called yet. Hyphens in names are replaced by spaces and duplicate
    names are dropped, keeping the first occurrence. An unknown country code
    yields an empty string.

    Usage:
        >>> country_localized_display_name('jp') # returns '日本'
        >>> country_localized_display_name('be') # returns 'België / Belgique / Belgien'
    '''
    country_code = country_code.lower()

    if not country_official_names:
        init_country_names()

    # Use .get() rather than indexing: country_official_names is a
    # defaultdict, and indexing would insert an empty entry for every
    # unknown code that is looked up.
    official_names = country_official_names.get(country_code, {})

    # OrderedDict.fromkeys de-duplicates while preserving order.
    unique_names = OrderedDict.fromkeys(n.replace('-', ' ')
                                        for n in official_names.values())
    return ' / '.join(unique_names.keys())
|
||||
183
scripts/geodata/countries/names.py
Normal file
183
scripts/geodata/countries/names.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
|
||||
import pycountry
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from lxml import etree
|
||||
|
||||
# Absolute directory containing this file.
this_dir = os.path.realpath(os.path.dirname(__file__))

# Extend the import path with the directory two levels above this file so
# that the ``geodata`` imports below resolve. The path is anchored at this
# file rather than at the current working directory, so it is correct no
# matter where the script is launched from.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.i18n.unicode_paths import CLDR_DIR
|
||||
from geodata.i18n.languages import *
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
# Directory <CLDR_DIR>/common/main holding the per-language locale XML files.
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')

# resources/countries/names.yaml, three directory levels above this file.
COUNTRY_CONFIG = os.path.join(
    this_dir, os.pardir, os.pardir, os.pardir,
    'resources', 'countries', 'names.yaml'
)

# Territory codes that are skipped while reading the CLDR territory list.
IGNORE_COUNTRIES = {six.u('ZZ')}

# Countries for which the CLDR alt="short" / alt="variant" name is preferred
# over the default name (falling back to the default when it is missing).
COUNTRY_USE_SHORT_NAME = {six.u(code) for code in ('HK', 'MM', 'MO', 'PS')}
COUNTRY_USE_VARIANT_NAME = {six.u(code) for code in ('CD', 'CG', 'CI', 'TL')}

# Hand-maintained names: {language code: {country code: name}}.
# These take precedence over whatever CLDR provides for the same pair.
LANGUAGE_COUNTRY_OVERRIDES = {
    'en': {
        'CD': safe_decode('Democratic Republic of the Congo'),
        'CG': safe_decode('Republic of the Congo'),
    },

    # Countries where the local language is absent from CLDR

    # Tajik / Tajikistan
    'tg': {'TJ': safe_decode('Тоҷикистон')},

    # Maldivian / Maldives
    'dv': {'MV': safe_decode('ދިވެހިރާއްޖެ')},
}
|
||||
|
||||
|
||||
class CountryNames(object):
    '''
    Localized country names parsed from CLDR locale files, plus a lookup
    from alpha-2 to alpha-3 country codes taken from pycountry.

    Everything is loaded eagerly in the constructor. Attributes:

    base_dir: directory containing the CLDR locale XML files.
    country_alpha3_codes: {lowercased alpha-2: lowercased alpha-3}.
    language_country_names: {lowercased language: {country code: name}},
        where country codes are kept exactly as they appear in the source
        data (the CLDR XML or LANGUAGE_COUNTRY_OVERRIDES).
    country_language_names: {lowercased country: {lowercased language: name}}
        for every language that provides a name for the country.
    country_official_names / country_local_names:
        {lowercased country: OrderedDict(language -> name)} restricted to the
        languages returned by get_country_languages for that country;
        "official" keeps only the languages whose flag in that mapping is
        truthy (or the first named language when none is flagged).
    '''

    def __init__(self, base_dir=CLDR_MAIN_PATH):
        '''
        Parse every usable locale file under base_dir and build the lookup
        tables described on the class.

        :param base_dir: directory containing the CLDR locale XML files.
        '''
        self.base_dir = base_dir

        self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}

        self.language_country_names = {}
        self.country_language_names = defaultdict(dict)

        self.country_official_names = defaultdict(OrderedDict)
        self.country_local_names = defaultdict(OrderedDict)

        # {country: languages mapping returned by get_country_languages}
        local_languages = {}

        # {country: {language: name}}, restricted to the country's own languages
        country_local_language_names = defaultdict(dict)

        for filename in os.listdir(base_dir):
            lang = filename.split('.xml')[0]
            # Only files whose stem is at most three characters long are used.
            if len(lang) > 3:
                continue

            names = self.cldr_country_names(lang)
            lang = lang.lower()
            self.language_country_names[lang] = names

            # six.iteritems is used throughout so the loops run on both
            # Python 2 and Python 3.
            for country, name in six.iteritems(names):
                country = country.lower()

                # Fall back to English when no languages are known for the country.
                languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
                local_languages[country] = languages

                self.country_language_names[country][lang] = name

                if lang in languages:
                    country_local_language_names[country][lang] = name

        for override_lang, names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
            # Normalise once so the membership test and the stored key agree.
            override_lang = override_lang.lower()
            if override_lang not in self.language_country_names:
                self.language_country_names[override_lang] = names

            for override_country, name in six.iteritems(names):
                override_country = override_country.lower()
                self.country_language_names[override_country][override_lang] = name
                # Only fill in countries that got no local-language name from CLDR.
                if override_country not in country_local_language_names:
                    country_local_language_names[override_country][override_lang] = name

        for country, langs in six.iteritems(local_languages):
            names = country_local_language_names[country]
            # Number of named languages whose flag in ``langs`` is truthy.
            num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))

            for lang, default in six.iteritems(langs):
                name = names.get(lang)
                if not name:
                    continue

                if default or num_defaults == 0:
                    self.country_official_names[country][lang] = name
                    # With no flagged language at all, the first language
                    # that has a name becomes the single official name.
                    if num_defaults == 0:
                        break

                self.country_local_names[country][lang] = name

    def cldr_country_names(self, language):
        '''
        Country names are tricky as there can be several versions
        and levels of verbosity e.g. United States of America
        vs. the more commonly used United States. Most countries
        have a similarly verbose form.

        The CLDR repo (http://cldr.unicode.org/) has the most
        comprehensive localized database of country names
        (among other things), organized by language. This function
        parses CLDR XML for a given language and returns a dictionary
        of {country_code: name} for that language.

        :param language: language code; ``<base_dir>/<language>.xml`` is parsed.
        :return: dict mapping each territory code (as written in the XML) to
            the chosen display name. Entries in LANGUAGE_COUNTRY_OVERRIDES
            for this language win over CLDR. For codes in
            COUNTRY_USE_SHORT_NAME / COUNTRY_USE_VARIANT_NAME the value can
            be None when the file has neither the preferred alternate nor a
            default name.
        '''
        filename = os.path.join(self.base_dir, '{}.xml'.format(language))

        # Parse inside a context manager so the handle is closed as soon as
        # the tree is built instead of being leaked. Binary mode lets the
        # XML parser honour the encoding declared in the document.
        with open(filename, 'rb') as f:
            xml = etree.parse(f)

        # {country_code: {alt attribute or None: name}}
        country_names = defaultdict(dict)

        for territory in xml.xpath('*//territories/*'):
            country_code = territory.attrib['type']

            # Skip explicitly ignored codes and purely numeric codes.
            if country_code in IGNORE_COUNTRIES or country_code.isdigit():
                continue

            # The key None marks the default (no ``alt`` attribute) name.
            country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)

        # Looked up once; it does not change while iterating.
        overrides = LANGUAGE_COUNTRY_OVERRIDES.get(language, {})

        display_names = {}

        for country_code, names in six.iteritems(country_names):
            if country_code in overrides:
                display_names[country_code] = safe_decode(overrides[country_code])
                continue

            default_name = names.get(None)

            if country_code in COUNTRY_USE_SHORT_NAME:
                display_names[country_code] = names.get('short', default_name)
            elif country_code in COUNTRY_USE_VARIANT_NAME:
                display_names[country_code] = names.get('variant', default_name)
            elif default_name is not None:
                display_names[country_code] = default_name

        return display_names

    def localized_name(self, country_code, language=None):
        '''
        Get the display name for a country code in the local language
        e.g. Россия for Russia, España for Spain, etc.

        For most countries there is a single official name. For countries
        with more than one official language, this will return a concatenated
        version separated by a slash e.g. Maroc / المغرب for Morocco.

        Note that all of the exceptions in road_sign_languages.tsv are also
        taken into account here so India for example uses the English name
        rather than concatenating all 27 toponyms.

        This method should be roughly consistent with OSM's display names.

        :param country_code: country code, matched case-insensitively.
        :param language: optional language code, matched case-insensitively.
            When given, the country's name in that language is returned
            instead of the local-language name(s).
        :return: with language=None, the official names joined by ' / '
            (hyphens replaced by spaces, duplicates dropped; an empty string
            for an unknown country). Otherwise the name in the requested
            language, or None when there is none.

        Usage:
            >>> country_names.localized_name('jp') # returns '日本'
            >>> country_names.localized_name('be') # returns 'België / Belgique / Belgien'
        '''
        country_code = country_code.lower()

        if language is None:
            # Use .get() rather than indexing: country_official_names is a
            # defaultdict, and indexing would insert an empty entry for
            # every unknown code that is looked up.
            official_names = self.country_official_names.get(country_code, {})
            # OrderedDict.fromkeys de-duplicates while preserving order.
            unique_names = OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
                                                for n in official_names.values())
            return six.u(' / ').join(unique_names.keys())

        # Language keys are stored lowercased, so normalise the lookup key too.
        return self.country_language_names.get(country_code, {}).get(language.lower())

    def alpha3_code(self, alpha2_code):
        '''
        Return the uppercase alpha-3 code for an alpha-2 country code
        (matched case-insensitively), or None when the code is unknown.
        '''
        alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
        return alpha3.upper() if alpha3 else None
|
||||
|
||||
# Shared module-level instance. Constructing it lists and parses the CLDR
# locale files under CLDR_MAIN_PATH, so that work happens at import time.
country_names = CountryNames()
|
||||
Reference in New Issue
Block a user