From 7a4fa7d443c52bb498b3603bc409d1060ce628bb Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 15 Jun 2015 01:58:43 -0400 Subject: [PATCH] [geodisambig] Canonical country names from CLDR, adding alpha-2 and alpha-3 surface forms, writing results to stdout or a file for streaming --- .../geodata/geonames/create_geonames_tsv.py | 92 +++++++++++++++++-- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/scripts/geodata/geonames/create_geonames_tsv.py b/scripts/geodata/geonames/create_geonames_tsv.py index 11da9de4..95687bc4 100644 --- a/scripts/geodata/geonames/create_geonames_tsv.py +++ b/scripts/geodata/geonames/create_geonames_tsv.py @@ -4,12 +4,19 @@ import os import sqlite3 import sys +import requests +import pycountry + +from lxml import etree + this_dir = os.path.realpath(os.path.dirname(__file__)) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) from geodata.file_utils import * from geodata.encoding import safe_encode from geodata.geonames.geonames_sqlite import DEFAULT_GEONAMES_DB_PATH +from geodata.i18n.unicode_paths import CLDR_DIR + DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir, os.path.pardir, 'data', 'geonames') @@ -30,6 +37,9 @@ POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', NEIGHBORHOOD_FEATURE_CODES = ('PPLX', ) +CLDR_ENGLISH_PATH = os.path.join(CLDR_DIR, 'common', 'main', 'en.xml') + + class boundary_types: COUNTRY = 0 ADMIN1 = 1 @@ -88,6 +98,16 @@ geonames_fields = [ DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields) if f.is_dummy][0] +CANONICAL_NAME_INDEX = [i for i, f in enumerate(geonames_fields) + if f.c_constant == 'GEONAMES_CANONICAL'][0] + +NAME_INDEX = [i for i, f in enumerate(geonames_fields) + if f.c_constant == 'GEONAMES_NAME'][0] + +COUNTRY_CODE_INDEX = [i for i, f in enumerate(geonames_fields) + if f.c_constant == 'GEONAMES_COUNTRY_CODE'][0] + + geonames_admin_joins = ''' left join admin1_codes a1 on a1.code = gn.admin1_code @@ -185,13 +205,48 @@ group by postal_code, p.country_code BATCH_SIZE = 2000 -def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR): +IGNORE_COUNTRIES = set(['ZZ']) + +COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS']) +COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL']) + + +def cldr_country_names(filename=CLDR_ENGLISH_PATH): + xml = etree.parse(open(filename)) + + country_names = {} + + for territory in xml.xpath('*//territories/*'): + country_code = territory.attrib['type'] + if country_code in IGNORE_COUNTRIES and not country_code.isdigit(): + continue + elif country_code in COUNTRY_USE_SHORT_NAME and territory.attrib.get('alt') != 'short': + continue + elif country_code in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt') != 'variant': + continue + elif country_code not in COUNTRY_USE_SHORT_NAME and country_code not in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt'): + continue + + country_names[country_code] = safe_encode(territory.text) + + return country_names + + +def create_geonames_tsv(db_path, out_dir=None): db = sqlite3.connect(db_path) filename = 'geonames.tsv' - f = open(os.path.join(out_dir, filename), 'w') + if out_dir: + f = open(os.path.join(out_dir, filename), 'w') + else: + f = sys.stdout writer = csv.writer(f, delimiter='\t') + country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries} + country_alpha2 = set([c.alpha2 for c in pycountry.countries]) + + country_names = cldr_country_names() + for boundary_type, codes in geonames_admin_dictionaries.iteritems(): if boundary_type != boundary_types.COUNTRY: predicate = 'where gn.feature_code in ({codes})'.format( @@ -214,20 +269,42 @@ def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR): for row in batch: row = [safe_encode(val or '') for val in row] row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type + + if boundary_type == boundary_types.COUNTRY: + alpha2_code = row[COUNTRY_CODE_INDEX] + + is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] + row[CANONICAL_NAME_INDEX] = country_names[row[COUNTRY_CODE_INDEX]] + + if alpha2_code and is_orig_name: + alpha2_row = row[:] + alpha2_row[NAME_INDEX] = alpha2_code + rows.append(alpha2_row) + + if alpha2_code in country_code_alpha3_map and is_orig_name: + alpha3_row = row[:] + alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code] + rows.append(alpha3_row) + rows.append(row) writer.writerows(rows) cursor.close() f.flush() - f.close() + + if out_dir: + f.close() db.close() -def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR): +def create_postal_codes_tsv(db_path, out_dir=None): db = sqlite3.connect(db_path) filename = 'postal_codes.tsv' - f = open(os.path.join(out_dir, filename), 'w') + if out_dir: + f = open(os.path.join(out_dir, filename), 'w') + else: + f = sys.stdout writer = csv.writer(f, delimiter='\t') cursor = db.execute(postal_codes_query) @@ -243,7 +320,8 @@ def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR): writer.writerows(rows) cursor.close() - f.close() + if out_dir: + f.close() db.close() # Generates a C header telling us the order of the fields as written @@ -285,7 +363,7 @@ if __name__ == '__main__': default=DEFAULT_GEONAMES_DB_PATH, help='SQLite db file') parser.add_argument('-o', '--out', - default=DEFAULT_DATA_DIR, help='output directory') + default=None, help='output directory') args = parser.parse_args() create_geonames_tsv(args.db, args.out) create_postal_codes_tsv(args.db, args.out)