Initial fork commit
This commit is contained in:
0
scripts/geodata/countries/__init__.py
Normal file
0
scripts/geodata/countries/__init__.py
Normal file
262
scripts/geodata/countries/constants.py
Normal file
262
scripts/geodata/countries/constants.py
Normal file
@@ -0,0 +1,262 @@
|
||||
import pycountry
|
||||
|
||||
|
||||
class Countries(object):
    '''
    Named constants for countries and territories.

    Every constant is a lowercase two-letter country code. The class also
    exposes a few groupings of those codes and a validity check backed by
    the pycountry database.
    '''

    AFGHANISTAN = 'af'
    ALAND_ISLANDS = 'ax'
    ALBANIA = 'al'
    ALGERIA = 'dz'
    AMERICAN_SAMOA = 'as'
    ANDORRA = 'ad'
    ANGOLA = 'ao'
    ANGUILLA = 'ai'
    ANTARCTICA = 'aq'
    ANTIGUA_AND_BARBUDA = 'ag'
    ARGENTINA = 'ar'
    ARMENIA = 'am'
    ARUBA = 'aw'
    AUSTRALIA = 'au'
    AUSTRIA = 'at'
    AZERBAIJAN = 'az'
    BAHAMAS = 'bs'
    BAHRAIN = 'bh'
    BANGLADESH = 'bd'
    BARBADOS = 'bb'
    BELARUS = 'by'
    BELGIUM = 'be'
    BELIZE = 'bz'
    BENIN = 'bj'
    BERMUDA = 'bm'
    BHUTAN = 'bt'
    BOLIVIA = 'bo'
    BONAIRE = 'bq'
    # 'ba' is the ISO 3166-1 alpha-2 code for Bosnia and Herzegovina; this
    # constant previously duplicated Bonaire's 'bq'.
    BOSNIA_AND_HERZEGOVINA = 'ba'
    BOTSWANA = 'bw'
    BOUVET_ISLAND = 'bv'
    BRAZIL = 'br'
    BRITISH_INDIAN_OCEAN_TERRITORY = 'io'
    BRITISH_VIRGIN_ISLANDS = 'vg'
    BRUNEI_DARUSSALAM = 'bn'
    BULGARIA = 'bg'
    BURKINA_FASO = 'bf'
    BURUNDI = 'bi'
    CAMBODIA = 'kh'
    CAMEROON = 'cm'
    CANADA = 'ca'
    CAPE_VERDE = 'cv'
    CAYMAN_ISLANDS = 'ky'
    CENTRAL_AFRICAN_REPUBLIC = 'cf'
    CHAD = 'td'
    CHILE = 'cl'
    CHINA = 'cn'
    CHRISTMAS_ISLAND = 'cx'
    COCOS_KEELING_ISLANDS = 'cc'
    COLOMBIA = 'co'
    COMOROS = 'km'
    COOK_ISLANDS = 'ck'
    COSTA_RICA = 'cr'
    COTE_DIVOIRE = 'ci'
    CROATIA = 'hr'
    CUBA = 'cu'
    CURACAO = 'cw'
    CYPRUS = 'cy'
    CZECH_REPUBLIC = 'cz'
    DENMARK = 'dk'
    DEMOCRATIC_REPUBLIC_OF_THE_CONGO = 'cd'
    DJIBOUTI = 'dj'
    DOMINICA = 'dm'
    DOMINICAN_REPUBLIC = 'do'
    ECUADOR = 'ec'
    EGYPT = 'eg'
    EL_SALVADOR = 'sv'
    EQUATORIAL_GUINEA = 'gq'
    ERITREA = 'er'
    ESTONIA = 'ee'
    ETHIOPIA = 'et'
    FALKLAND_ISLANDS_MALVINAS = 'fk'
    FAROE_ISLANDS = 'fo'
    FEDERATED_STATES_OF_MICRONESIA = 'fm'
    FIJI = 'fj'
    FINLAND = 'fi'
    FRANCE = 'fr'
    FRENCH_GUIANA = 'gf'
    FRENCH_POLYNESIA = 'pf'
    FRENCH_SOUTHERN_TERRITORIES = 'tf'
    GABON = 'ga'
    GAMBIA = 'gm'
    GEORGIA = 'ge'
    GERMANY = 'de'
    GHANA = 'gh'
    GIBRALTAR = 'gi'
    GREECE = 'gr'
    GREENLAND = 'gl'
    GRENADA = 'gd'
    GUADELOUPE = 'gp'
    GUAM = 'gu'
    GUATEMALA = 'gt'
    GUERNSEY = 'gg'
    GUINEA = 'gn'
    GUINEA_BISSAU = 'gw'
    GUYANA = 'gy'
    HAITI = 'ht'
    HEARD_ISLAND_AND_MCDONALD_ISLANDS = 'hm'
    HONDURAS = 'hn'
    HONG_KONG = 'hk'
    HUNGARY = 'hu'
    ICELAND = 'is'
    INDIA = 'in'
    INDONESIA = 'id'
    IRAN = 'ir'
    IRAQ = 'iq'
    IRELAND = 'ie'
    ISLE_OF_MAN = 'im'
    ISRAEL = 'il'
    ITALY = 'it'
    JAMAICA = 'jm'
    JAPAN = 'jp'
    JERSEY = 'je'
    JORDAN = 'jo'
    KAZAKHSTAN = 'kz'
    KENYA = 'ke'
    KIRIBATI = 'ki'
    KUWAIT = 'kw'
    KYRGYZSTAN = 'kg'
    LAOS = 'la'
    LATVIA = 'lv'
    LEBANON = 'lb'
    LESOTHO = 'ls'
    LIBERIA = 'lr'
    LIBYA = 'ly'
    LIECHTENSTEIN = 'li'
    LITHUANIA = 'lt'
    LUXEMBOURG = 'lu'
    MACAO = 'mo'
    MACEDONIA = 'mk'
    MADAGASCAR = 'mg'
    MALAWI = 'mw'
    MALAYSIA = 'my'
    MALDIVES = 'mv'
    MALI = 'ml'
    MALTA = 'mt'
    MARSHALL_ISLANDS = 'mh'
    MARTINIQUE = 'mq'
    MAURITANIA = 'mr'
    MAURITIUS = 'mu'
    MAYOTTE = 'yt'
    MEXICO = 'mx'
    MOLDOVA = 'md'
    MONACO = 'mc'
    MONGOLIA = 'mn'
    MONTENEGRO = 'me'
    MONTSERRAT = 'ms'
    MOROCCO = 'ma'
    MOZAMBIQUE = 'mz'
    MYANMAR = 'mm'
    NAMIBIA = 'na'
    NAURU = 'nr'
    NEPAL = 'np'
    NETHERLANDS = 'nl'
    NEW_CALEDONIA = 'nc'
    NEW_ZEALAND = 'nz'
    NICARAGUA = 'ni'
    NIGER = 'ne'
    NIGERIA = 'ng'
    NIUE = 'nu'
    NORFOLK_ISLAND = 'nf'
    NORTH_KOREA = 'kp'
    NORTHERN_MARIANA_ISLANDS = 'mp'
    NORWAY = 'no'
    OMAN = 'om'
    PAKISTAN = 'pk'
    PALAU = 'pw'
    PALESTINE = 'ps'
    PANAMA = 'pa'
    PAPUA_NEW_GUINEA = 'pg'
    PARAGUAY = 'py'
    PERU = 'pe'
    PHILIPPINES = 'ph'
    PITCAIRN_ISLANDS = 'pn'
    POLAND = 'pl'
    PORTUGAL = 'pt'
    PUERTO_RICO = 'pr'
    QATAR = 'qa'
    REPUBLIC_OF_CONGO = 'cg'
    REUNION = 're'
    ROMANIA = 'ro'
    RUSSIA = 'ru'
    RWANDA = 'rw'
    SAINT_BARTHELEMY = 'bl'
    SAINT_HELENA_ASCENSION_AND_TRISTAN_DA_CUNHA = 'sh'
    SAINT_KITTS_AND_NEVIS = 'kn'
    SAINT_LUCIA = 'lc'
    SAINT_MARTIN = 'mf'
    SAINT_PIERRE_AND_MIQUELON = 'pm'
    SAINT_VINCENT_AND_THE_GRENADINES = 'vc'
    SAMOA = 'ws'
    SAN_MARINO = 'sm'
    SAO_TOME_AND_PRINCIPE = 'st'
    SAUDI_ARABIA = 'sa'
    SENEGAL = 'sn'
    SERBIA = 'rs'
    SEYCHELLES = 'sc'
    SIERRA_LEONE = 'sl'
    SINGAPORE = 'sg'
    SINT_MAARTEN = 'sx'
    SLOVAKIA = 'sk'
    SLOVENIA = 'si'
    SOLOMON_ISLANDS = 'sb'
    SOMALIA = 'so'
    SOUTH_AFRICA = 'za'
    SOUTH_GEORGIA_AND_THE_SOUTH_SANDWICH_ISLANDS = 'gs'
    SOUTH_KOREA = 'kr'
    SOUTH_SUDAN = 'ss'
    SPAIN = 'es'
    SRI_LANKA = 'lk'
    SUDAN = 'sd'
    SURINAME = 'sr'
    SVALBARD_AND_JAN_MAYEN = 'sj'
    SWAZILAND = 'sz'
    SWEDEN = 'se'
    SWITZERLAND = 'ch'
    SYRIA = 'sy'
    TAIWAN = 'tw'
    TAJIKISTAN = 'tj'
    TANZANIA = 'tz'
    THAILAND = 'th'
    TIMOR_LESTE = 'tl'
    TOGO = 'tg'
    TOKELAU = 'tk'
    TONGA = 'to'
    TRINIDAD_AND_TOBAGO = 'tt'
    TUNISIA = 'tn'
    TURKEY = 'tr'
    TURKMENISTAN = 'tm'
    TURKS_AND_CAICOS_ISLANDS = 'tc'
    TUVALU = 'tv'
    UGANDA = 'ug'
    UKRAINE = 'ua'
    UNITED_ARAB_EMIRATES = 'ae'
    UNITED_KINGDOM = 'gb'
    UNITED_STATES = 'us'
    UNITED_STATES_MINOR_OUTLYING_ISLANDS = 'um'
    URUGUAY = 'uy'
    US_VIRGIN_ISLANDS = 'vi'
    UZBEKISTAN = 'uz'
    VANUATU = 'vu'
    VATICAN = 'va'
    VENEZUELA = 've'
    VIETNAM = 'vn'
    WALLIS_AND_FUTUNA = 'wf'
    WESTERN_SAHARA = 'eh'
    YEMEN = 'ye'
    ZAMBIA = 'zm'
    ZIMBABWE = 'zw'

    # Codes of the countries grouped as the former Soviet Union.
    FORMER_SOVIET_UNION_COUNTRIES = set([
        RUSSIA, UKRAINE, BELARUS, KAZAKHSTAN, AZERBAIJAN, KYRGYZSTAN,
        GEORGIA, UZBEKISTAN, ARMENIA, TAJIKISTAN, MOLDOVA, TURKMENISTAN,
        LATVIA, LITHUANIA, ESTONIA,
    ])

    # Codes of the countries/territories grouped as CJK.
    CJK_COUNTRIES = set([CHINA, JAPAN, SOUTH_KOREA, TAIWAN, HONG_KONG, MACAO])

    # Every alpha-2 code known to pycountry, lowercased. A generator is used
    # so the loop variable does not end up as a class attribute on Python 2.
    all_country_iso_codes = set(c.alpha2.lower() for c in pycountry.countries)

    @classmethod
    def is_valid_country_code(cls, alpha2_code):
        '''
        Return True if alpha2_code is a country code known to pycountry.

        The comparison is case-insensitive. None and the empty string yield
        False (a real bool, not the falsy argument itself).
        '''
        return bool(alpha2_code) and alpha2_code.lower() in cls.all_country_iso_codes
|
||||
187
scripts/geodata/countries/names.py
Normal file
187
scripts/geodata/countries/names.py
Normal file
@@ -0,0 +1,187 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
import sys
from collections import OrderedDict, defaultdict

import pycountry
import six
from lxml import etree
|
||||
|
||||
# Absolute directory that contains this module.
this_dir = os.path.realpath(os.path.dirname(__file__))
# Put the directory two levels above this module on the import path so the
# top-level ``geodata`` package can be imported. The path is anchored on
# this_dir rather than on bare os.pardir, which would be resolved against
# whatever the current working directory happens to be.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.i18n.unicode_paths import CLDR_DIR
|
||||
from geodata.i18n.languages import *
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
# Directory holding the per-language CLDR XML files.
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')

# Path of the country-name configuration file, three directories above this one.
COUNTRY_CONFIG = os.path.join(
    this_dir, os.pardir, os.pardir, os.pardir,
    'resources', 'countries', 'names.yaml',
)

# Territory codes skipped when reading CLDR territory lists.
IGNORE_COUNTRIES = {six.u('ZZ')}

# Territories whose CLDR "short" alternative is preferred when present.
COUNTRY_USE_SHORT_NAME = {six.u(code) for code in ('HK', 'MM', 'MO', 'PS')}
# Territories whose CLDR "variant" alternative is preferred when present.
COUNTRY_USE_VARIANT_NAME = {six.u(code) for code in ('CD', 'CG', 'CI', 'TL')}

# Hand-maintained names, keyed by language and then by territory code, that
# take precedence over (or fill gaps in) the CLDR data.
LANGUAGE_COUNTRY_OVERRIDES = {
    'en': {
        'CD': safe_decode('Democratic Republic of the Congo'),
        'CG': safe_decode('Republic of the Congo'),
    },

    # Countries where the local language is absent from CLDR

    # Tajik / Tajikistan
    'tg': {'TJ': safe_decode('Тоҷикистон')},

    # Maldivian / Maldives
    'dv': {'MV': safe_decode('ދިވެހިރާއްޖެ')},
}
|
||||
|
||||
|
||||
class CountryNames(object):
    '''
    Localized country-name lookup built from CLDR territory data.

    On construction, every CLDR XML file in base_dir whose language code is
    at most three characters long is parsed and indexed, and the entries of
    LANGUAGE_COUNTRY_OVERRIDES are layered on top.

    Attributes populated by __init__:
        country_alpha3_codes:   {lowercase alpha-2: lowercase alpha-3}
        iso_3166_names:         {lowercase alpha-2: pycountry name}
        language_country_names: {lowercase language: {territory code as it
                                 appears in the CLDR file: name}}
        country_language_names: {lowercase alpha-2: {lowercase language: name}}
        country_official_names: {lowercase alpha-2: OrderedDict(language -> name)}
                                restricted to the country's default languages
                                (or the first named language if none of the
                                named ones is flagged as default)
        country_local_names:    {lowercase alpha-2: OrderedDict(language -> name)}
                                for the country's languages that have a name
    '''

    def __init__(self, base_dir=CLDR_MAIN_PATH):
        '''
        Parse the CLDR files under base_dir and build the lookup tables.
        '''
        self.base_dir = base_dir

        self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
        self.iso_3166_names = {c.alpha2.lower(): c.name for c in pycountry.countries}

        self.language_country_names = {}
        self.country_language_names = defaultdict(dict)

        self.country_official_names = defaultdict(OrderedDict)
        self.country_local_names = defaultdict(OrderedDict)

        # country -> ordered {language: default flag}, falling back to English
        local_languages = {}
        # country -> {language: name}, limited to that country's own languages
        country_local_language_names = defaultdict(dict)

        for filename in os.listdir(base_dir):
            lang = filename.split('.xml')[0]
            # Skip anything that is not a bare language code (e.g. locale variants)
            if len(lang) > 3:
                continue

            names = self.cldr_country_names(lang)
            lang = lang.lower()
            self.language_country_names[lang] = names

            for country, name in six.iteritems(names):
                country = country.lower()

                # The language list only depends on the country, so look it
                # up once instead of once per (language file, country) pair.
                if country not in local_languages:
                    local_languages[country] = (get_country_languages(country, official=False)
                                                or OrderedDict([('en', 1)]))

                self.country_language_names[country][lang] = name

                if lang in local_languages[country]:
                    country_local_language_names[country][lang] = name

        for l, names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
            l = l.lower()
            if l not in self.language_country_names:
                self.language_country_names[l] = names

            for c, name in six.iteritems(names):
                c = c.lower()
                self.country_language_names[c][l] = name
                # Only use the override as a local name when CLDR supplied none
                if c not in country_local_language_names:
                    country_local_language_names[c][l] = name

        for country, langs in six.iteritems(local_languages):
            names = country_local_language_names[country]
            # Number of named languages that are flagged as default
            num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
            for lang, default in six.iteritems(langs):
                name = names.get(lang)
                if not name:
                    continue
                if default or num_defaults == 0:
                    self.country_official_names[country][lang] = name
                    # With no default language, the first named one stands in
                    if num_defaults == 0:
                        break
                self.country_local_names[country][lang] = name

    def cldr_country_names(self, language):
        '''
        Country names are tricky as there can be several versions
        and levels of verbosity e.g. United States of America
        vs. the more commonly used United States. Most countries
        have a similarly verbose form.

        The CLDR repo (http://cldr.unicode.org/) has the most
        comprehensive localized database of country names
        (among other things), organized by language. This function
        parses CLDR XML for a given language and returns a dictionary
        of {country_code: name} for that language.

        Entries in LANGUAGE_COUNTRY_OVERRIDES for the language win over
        the CLDR value. Codes in COUNTRY_USE_SHORT_NAME and
        COUNTRY_USE_VARIANT_NAME use the "short" / "variant" alternative
        when the file provides one, otherwise the default name.
        '''
        filename = os.path.join(self.base_dir, '{}.xml'.format(language))
        # Read as bytes and let the XML parser decode; the context manager
        # makes sure the handle is closed once the tree is built.
        with open(filename, 'rb') as f:
            xml = etree.parse(f)

        # country code -> {alt attribute (None for the default form): name}
        country_names = defaultdict(dict)

        for territory in xml.xpath('*//territories/*'):
            country_code = territory.attrib['type']

            # Purely numeric codes and explicitly ignored codes are skipped
            if country_code in IGNORE_COUNTRIES or country_code.isdigit():
                continue

            country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)

        overrides = LANGUAGE_COUNTRY_OVERRIDES.get(language, {})
        display_names = {}

        for country_code, names in six.iteritems(country_names):
            if country_code in overrides:
                display_names[country_code] = safe_decode(overrides[country_code])
                continue

            default_name = names.get(None)

            if country_code in COUNTRY_USE_SHORT_NAME:
                display_names[country_code] = names.get('short', default_name)
            elif country_code in COUNTRY_USE_VARIANT_NAME:
                display_names[country_code] = names.get('variant', default_name)
            elif default_name is not None:
                display_names[country_code] = default_name

        return display_names

    def localized_name(self, country_code, language=None):
        '''
        Get the display name for a country code in the local language
        e.g. Россия for Russia, España for Spain, etc.

        For most countries there is a single official name. For countries
        with more than one official language, this will return a concatenated
        version separated by a slash e.g. Maroc / المغرب for Morocco.

        Note that all of the exceptions in road_sign_languages.tsv are also
        taken into account here so India for example uses the English name
        rather than concatenating all 27 toponyms.

        This method should be roughly consistent with OSM's display names.

        If language is given, the name in that language is returned instead
        (matched case-insensitively), or None when no such name is known.
        An unknown country code with no language yields an empty string.

        Usage:
            >>> country_names.localized_name('jp')     # returns '日本'
            >>> country_names.localized_name('be')     # returns 'België / Belgique / Belgien'
        '''
        country_code = country_code.lower()
        if language is None:
            # .get() rather than indexing: indexing the defaultdict would
            # insert an empty entry for every unknown code that is queried.
            official = self.country_official_names.get(country_code, {})
            # OrderedDict.fromkeys de-duplicates while keeping first-seen order
            unique_names = OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
                                                for n in official.values())
            return six.u(' / ').join(unique_names.keys())

        # Language keys are stored lowercased, so normalize the query too
        return self.country_language_names.get(country_code, {}).get(language.lower())

    def alpha3_code(self, alpha2_code):
        '''
        Return the uppercase alpha-3 code for an alpha-2 code, or None if
        the alpha-2 code is unknown. The lookup is case-insensitive.
        '''
        alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
        return alpha3.upper() if alpha3 else None

    def iso_3166_name(self, alpha2_code):
        '''
        Return the pycountry name for an alpha-2 code, or None if the code
        is unknown. The lookup is case-insensitive.
        '''
        return self.iso_3166_names.get(alpha2_code.lower())
|
||||
|
||||
# Shared module-level instance. Constructing it lists CLDR_MAIN_PATH and
# parses the CLDR files found there, so that work happens when this module
# is imported.
country_names = CountryNames()
|
||||
Reference in New Issue
Block a user