[countries] Making country official names align better with OSM/Wikipedia, plugging holes
This commit is contained in:
@@ -1,3 +1,6 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
@@ -12,7 +15,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
|||||||
|
|
||||||
from geodata.i18n.unicode_paths import CLDR_DIR
|
from geodata.i18n.unicode_paths import CLDR_DIR
|
||||||
from geodata.i18n.languages import *
|
from geodata.i18n.languages import *
|
||||||
from geodata.encoding import safe_encode
|
from geodata.encoding import safe_decode
|
||||||
|
|
||||||
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
|
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
|
||||||
|
|
||||||
@@ -26,7 +29,21 @@ LANGUAGE_COUNTRY_OVERRIDES = {
|
|||||||
'en': {
|
'en': {
|
||||||
'CD': 'Democratic Republic of the Congo',
|
'CD': 'Democratic Republic of the Congo',
|
||||||
'CG': 'Republic of the Congo',
|
'CG': 'Republic of the Congo',
|
||||||
|
},
|
||||||
|
|
||||||
|
# Countries that don't have their language in CLDR
|
||||||
|
|
||||||
|
# Tajik / Tajikistan
|
||||||
|
'tg': {
|
||||||
|
'TJ': 'Тоҷикистон',
|
||||||
|
},
|
||||||
|
|
||||||
|
# Maldivian (Dhivehi) / Maldives
|
||||||
|
'dv': {
|
||||||
|
'MV': 'ދިވެހިރާއްޖެ',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -34,25 +51,33 @@ def cldr_country_names(language, base_dir=CLDR_MAIN_PATH):
|
|||||||
filename = os.path.join(base_dir, '{}.xml'.format(language))
|
filename = os.path.join(base_dir, '{}.xml'.format(language))
|
||||||
xml = etree.parse(open(filename))
|
xml = etree.parse(open(filename))
|
||||||
|
|
||||||
country_names = {}
|
country_names = defaultdict(dict)
|
||||||
|
|
||||||
for territory in xml.xpath('*//territories/*'):
|
for territory in xml.xpath('*//territories/*'):
|
||||||
country_code = territory.attrib['type']
|
country_code = territory.attrib['type']
|
||||||
|
|
||||||
if country_code in IGNORE_COUNTRIES or country_code.isdigit():
|
if country_code in IGNORE_COUNTRIES or country_code.isdigit():
|
||||||
continue
|
continue
|
||||||
elif country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
|
|
||||||
country_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
|
country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
|
||||||
continue
|
|
||||||
elif country_code in COUNTRY_USE_SHORT_NAME and territory.attrib.get('alt') != 'short':
|
display_names = {}
|
||||||
continue
|
|
||||||
elif country_code in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt') != 'variant':
|
for country_code, names in country_names.iteritems():
|
||||||
continue
|
if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
|
||||||
elif country_code not in COUNTRY_USE_SHORT_NAME and country_code not in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt'):
|
display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
|
||||||
continue
|
continue
|
||||||
|
|
||||||
country_names[country_code] = safe_decode(territory.text)
|
default_name = names.get(None)
|
||||||
|
|
||||||
return country_names
|
if country_code in COUNTRY_USE_SHORT_NAME:
|
||||||
|
display_names[country_code] = names.get('short', default_name)
|
||||||
|
elif country_code in COUNTRY_USE_VARIANT_NAME:
|
||||||
|
display_names[country_code] = names.get('variant', default_name)
|
||||||
|
elif default_name is not None:
|
||||||
|
display_names[country_code] = default_name
|
||||||
|
|
||||||
|
return display_names
|
||||||
|
|
||||||
country_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries])
|
country_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries])
|
||||||
country_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries])
|
country_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries])
|
||||||
@@ -69,12 +94,13 @@ def init_country_names(base_dir=CLDR_MAIN_PATH):
|
|||||||
global language_country_names
|
global language_country_names
|
||||||
init_languages()
|
init_languages()
|
||||||
|
|
||||||
local_languages = {country: get_country_languages(country, official=False,
|
local_languages = {country: get_country_languages(country, official=False)
|
||||||
overrides=False) for country in country_alpha2_codes}
|
or OrderedDict([('en', 1)])
|
||||||
|
for country in country_alpha2_codes}
|
||||||
|
|
||||||
country_language_names = defaultdict(dict)
|
country_language_names = defaultdict(dict)
|
||||||
|
|
||||||
for filename in os.listdir(CLDR_MAIN_PATH):
|
for filename in os.listdir(base_dir):
|
||||||
lang = filename.split('.xml')[0]
|
lang = filename.split('.xml')[0]
|
||||||
if len(lang) > 3:
|
if len(lang) > 3:
|
||||||
continue
|
continue
|
||||||
@@ -89,12 +115,30 @@ def init_country_names(base_dir=CLDR_MAIN_PATH):
|
|||||||
if lang in local_languages.get(country, {}):
|
if lang in local_languages.get(country, {}):
|
||||||
country_language_names[country][lang] = name
|
country_language_names[country][lang] = name
|
||||||
|
|
||||||
|
for l, names in LANGUAGE_COUNTRY_OVERRIDES.iteritems():
|
||||||
|
if l not in language_country_names:
|
||||||
|
language_country_names[l.lower()] = names
|
||||||
|
|
||||||
|
for c, name in names.iteritems():
|
||||||
|
if c.lower() not in country_language_names:
|
||||||
|
country_language_names[c.lower()][l.lower()] = name
|
||||||
|
|
||||||
for country, langs in local_languages.iteritems():
|
for country, langs in local_languages.iteritems():
|
||||||
names = country_language_names[country]
|
names = country_language_names[country]
|
||||||
for lang, default in langs.iteritems():
|
num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
|
||||||
|
for i, (lang, default) in enumerate(langs.iteritems()):
|
||||||
name = names.get(lang)
|
name = names.get(lang)
|
||||||
if not name:
|
if not name:
|
||||||
continue
|
continue
|
||||||
if default:
|
if default or num_defaults == 0:
|
||||||
country_official_names[country][lang] = name
|
country_official_names[country][lang] = name
|
||||||
|
if num_defaults == 0:
|
||||||
|
break
|
||||||
country_local_names[country][lang] = name
|
country_local_names[country][lang] = name
|
||||||
|
|
||||||
|
|
||||||
|
def country_official_name(country_code):
    """Return the official name(s) of a country as one display string.

    Calls init_country_names() first when country_official_names is still
    empty, then joins the official names stored for the country with
    ' / ', replacing hyphens with spaces and dropping duplicate names while
    keeping their original order.

    :param country_code: key into country_official_names. The table is
        filled from lower-cased alpha-2 codes, so callers presumably pass
        e.g. 'us' -- TODO confirm against callers.
    :return: the ' / '-joined official names
    """
    if not country_official_names:
        init_country_names()

    # Index by the function argument. ``c`` is not defined in this scope and
    # would, at best, pick up an unrelated module-level variable.
    names = country_official_names[country_code].values()

    # OrderedDict.fromkeys de-duplicates while preserving first-seen order;
    # iterating the resulting dict yields its keys.
    unique_names = OrderedDict.fromkeys(n.replace('-', ' ') for n in names)
    return ' / '.join(unique_names)
|
||||||
|
|||||||
Reference in New Issue
Block a user