[geodisambig] Canonical country names from CLDR, adding alpha-2 and alpha-3 surface forms, writing results to stdout or a file for streaming
This commit is contained in:
@@ -4,12 +4,19 @@ import os
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import pycountry
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||||
|
|
||||||
from geodata.file_utils import *
|
from geodata.file_utils import *
|
||||||
from geodata.encoding import safe_encode
|
from geodata.encoding import safe_encode
|
||||||
from geodata.geonames.geonames_sqlite import DEFAULT_GEONAMES_DB_PATH
|
from geodata.geonames.geonames_sqlite import DEFAULT_GEONAMES_DB_PATH
|
||||||
|
from geodata.i18n.unicode_paths import CLDR_DIR
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
|
DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
|
||||||
os.path.pardir, 'data', 'geonames')
|
os.path.pardir, 'data', 'geonames')
|
||||||
@@ -30,6 +37,9 @@ POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
|
|||||||
NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
|
NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
|
||||||
|
|
||||||
|
|
||||||
|
# Location of the English locale file (common/main/en.xml) under CLDR_DIR;
# used as the default input of cldr_country_names().
CLDR_ENGLISH_PATH = os.path.join(CLDR_DIR, 'common', 'main', 'en.xml')
|
||||||
|
|
||||||
|
|
||||||
class boundary_types:
|
class boundary_types:
|
||||||
COUNTRY = 0
|
COUNTRY = 0
|
||||||
ADMIN1 = 1
|
ADMIN1 = 1
|
||||||
@@ -88,6 +98,16 @@ geonames_fields = [
|
|||||||
DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields)
|
DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields)
|
||||||
if f.is_dummy][0]
|
if f.is_dummy][0]
|
||||||
|
|
||||||
|
# Position in geonames_fields of the field whose c_constant is
# 'GEONAMES_CANONICAL'; raises IndexError at import time if no field matches.
CANONICAL_NAME_INDEX = [i for i, f in enumerate(geonames_fields)
|
||||||
|
if f.c_constant == 'GEONAMES_CANONICAL'][0]
|
||||||
|
|
||||||
|
# Position in geonames_fields of the field whose c_constant is 'GEONAMES_NAME'.
NAME_INDEX = [i for i, f in enumerate(geonames_fields)
|
||||||
|
if f.c_constant == 'GEONAMES_NAME'][0]
|
||||||
|
|
||||||
|
# Position in geonames_fields of the field whose c_constant is
# 'GEONAMES_COUNTRY_CODE'.
COUNTRY_CODE_INDEX = [i for i, f in enumerate(geonames_fields)
|
||||||
|
if f.c_constant == 'GEONAMES_COUNTRY_CODE'][0]
|
||||||
|
|
||||||
|
|
||||||
geonames_admin_joins = '''
|
geonames_admin_joins = '''
|
||||||
left join admin1_codes a1
|
left join admin1_codes a1
|
||||||
on a1.code = gn.admin1_code
|
on a1.code = gn.admin1_code
|
||||||
@@ -185,13 +205,48 @@ group by postal_code, p.country_code
|
|||||||
BATCH_SIZE = 2000
|
BATCH_SIZE = 2000
|
||||||
|
|
||||||
|
|
||||||
def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
# Territory codes that cldr_country_names() skips outright.
IGNORE_COUNTRIES = set(['ZZ'])
|
||||||
|
|
||||||
|
# Codes for which cldr_country_names() keeps only the alt="short" entry.
COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS'])
|
||||||
|
# Codes for which cldr_country_names() keeps only the alt="variant" entry.
COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL'])
|
||||||
|
|
||||||
|
|
||||||
|
def cldr_country_names(filename=CLDR_ENGLISH_PATH):
    """Build a mapping of territory code -> display name from a CLDR file.

    Every child element of a ``territories`` element in the XML document is
    examined. Exactly one name is kept per code:

    * codes in COUNTRY_USE_SHORT_NAME keep the entry with ``alt="short"``
    * codes in COUNTRY_USE_VARIANT_NAME keep the entry with ``alt="variant"``
    * all other codes keep the entry that carries no ``alt`` attribute

    Codes listed in IGNORE_COUNTRIES and purely numeric codes (CLDR uses
    digits for non-country groupings, e.g. ``001``) are skipped.

    :param filename: path to a CLDR locale XML file (defaults to the
        English locale, CLDR_ENGLISH_PATH).
    :return: dict mapping the element's ``type`` attribute to
        ``safe_encode(element.text)``.
    :raises KeyError: if a territory element has no ``type`` attribute.
    """
    # Use a context manager so the handle is closed once parsing is done;
    # the original passed a bare open() to etree.parse and never closed it.
    with open(filename) as cldr_file:
        xml = etree.parse(cldr_file)

    country_names = {}

    for territory in xml.xpath('*//territories/*'):
        country_code = territory.attrib['type']

        # The original guard read "in IGNORE_COUNTRIES and not isdigit()",
        # which made the isdigit() test a no-op and let numeric codes
        # through. Both kinds of entry are meant to be excluded.
        if country_code in IGNORE_COUNTRIES or country_code.isdigit():
            continue

        alt = territory.attrib.get('alt')
        use_short = country_code in COUNTRY_USE_SHORT_NAME
        use_variant = country_code in COUNTRY_USE_VARIANT_NAME

        if use_short and alt != 'short':
            continue
        if use_variant and alt != 'variant':
            continue
        # Ordinary codes: take the primary name, ignore any alternates.
        if not use_short and not use_variant and alt:
            continue

        country_names[country_code] = safe_encode(territory.text)

    return country_names
|
||||||
|
|
||||||
|
|
||||||
|
def create_geonames_tsv(db_path, out_dir=None):
|
||||||
db = sqlite3.connect(db_path)
|
db = sqlite3.connect(db_path)
|
||||||
|
|
||||||
filename = 'geonames.tsv'
|
filename = 'geonames.tsv'
|
||||||
f = open(os.path.join(out_dir, filename), 'w')
|
if out_dir:
|
||||||
|
f = open(os.path.join(out_dir, filename), 'w')
|
||||||
|
else:
|
||||||
|
f = sys.stdout
|
||||||
writer = csv.writer(f, delimiter='\t')
|
writer = csv.writer(f, delimiter='\t')
|
||||||
|
|
||||||
|
country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries}
|
||||||
|
country_alpha2 = set([c.alpha2 for c in pycountry.countries])
|
||||||
|
|
||||||
|
country_names = cldr_country_names()
|
||||||
|
|
||||||
for boundary_type, codes in geonames_admin_dictionaries.iteritems():
|
for boundary_type, codes in geonames_admin_dictionaries.iteritems():
|
||||||
if boundary_type != boundary_types.COUNTRY:
|
if boundary_type != boundary_types.COUNTRY:
|
||||||
predicate = 'where gn.feature_code in ({codes})'.format(
|
predicate = 'where gn.feature_code in ({codes})'.format(
|
||||||
@@ -214,20 +269,42 @@ def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
|||||||
for row in batch:
|
for row in batch:
|
||||||
row = [safe_encode(val or '') for val in row]
|
row = [safe_encode(val or '') for val in row]
|
||||||
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
|
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
|
||||||
|
|
||||||
|
if boundary_type == boundary_types.COUNTRY:
|
||||||
|
alpha2_code = row[COUNTRY_CODE_INDEX]
|
||||||
|
|
||||||
|
is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
|
||||||
|
row[CANONICAL_NAME_INDEX] = country_names[row[COUNTRY_CODE_INDEX]]
|
||||||
|
|
||||||
|
if alpha2_code and is_orig_name:
|
||||||
|
alpha2_row = row[:]
|
||||||
|
alpha2_row[NAME_INDEX] = alpha2_code
|
||||||
|
rows.append(alpha2_row)
|
||||||
|
|
||||||
|
if alpha2_code in country_code_alpha3_map and is_orig_name:
|
||||||
|
alpha3_row = row[:]
|
||||||
|
alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code]
|
||||||
|
rows.append(alpha3_row)
|
||||||
|
|
||||||
rows.append(row)
|
rows.append(row)
|
||||||
|
|
||||||
writer.writerows(rows)
|
writer.writerows(rows)
|
||||||
cursor.close()
|
cursor.close()
|
||||||
f.flush()
|
f.flush()
|
||||||
f.close()
|
|
||||||
|
if out_dir:
|
||||||
|
f.close()
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
def create_postal_codes_tsv(db_path, out_dir=None):
|
||||||
db = sqlite3.connect(db_path)
|
db = sqlite3.connect(db_path)
|
||||||
|
|
||||||
filename = 'postal_codes.tsv'
|
filename = 'postal_codes.tsv'
|
||||||
f = open(os.path.join(out_dir, filename), 'w')
|
if out_dir:
|
||||||
|
f = open(os.path.join(out_dir, filename), 'w')
|
||||||
|
else:
|
||||||
|
f = sys.stdout
|
||||||
writer = csv.writer(f, delimiter='\t')
|
writer = csv.writer(f, delimiter='\t')
|
||||||
|
|
||||||
cursor = db.execute(postal_codes_query)
|
cursor = db.execute(postal_codes_query)
|
||||||
@@ -243,7 +320,8 @@ def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
|||||||
writer.writerows(rows)
|
writer.writerows(rows)
|
||||||
|
|
||||||
cursor.close()
|
cursor.close()
|
||||||
f.close()
|
if out_dir:
|
||||||
|
f.close()
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
# Generates a C header telling us the order of the fields as written
|
# Generates a C header telling us the order of the fields as written
|
||||||
@@ -285,7 +363,7 @@ if __name__ == '__main__':
|
|||||||
default=DEFAULT_GEONAMES_DB_PATH,
|
default=DEFAULT_GEONAMES_DB_PATH,
|
||||||
help='SQLite db file')
|
help='SQLite db file')
|
||||||
parser.add_argument('-o', '--out',
|
parser.add_argument('-o', '--out',
|
||||||
default=DEFAULT_DATA_DIR, help='output directory')
|
default=None, help='output directory')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
create_geonames_tsv(args.db, args.out)
|
create_geonames_tsv(args.db, args.out)
|
||||||
create_postal_codes_tsv(args.db, args.out)
|
create_postal_codes_tsv(args.db, args.out)
|
||||||
|
|||||||
Reference in New Issue
Block a user