[geodisambig] Canonical country names from CLDR, adding alpha-2 and alpha-3 surface forms, writing results to stdout or a file for streaming

2015-06-15 01:58:43 -04:00
parent 43e023077c
commit 7a4fa7d443
1 changed files with 85 additions and 7 deletions
--- a/scripts/geodata/geonames/create_geonames_tsv.py
+++ b/scripts/geodata/geonames/create_geonames_tsv.py
@@ -4,12 +4,19 @@ import os
 import sqlite3
 import sys

+import requests
+import pycountry
+
+from lxml import etree
+
 this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

 from geodata.file_utils import *
 from geodata.encoding import safe_encode
 from geodata.geonames.geonames_sqlite import DEFAULT_GEONAMES_DB_PATH
+from geodata.i18n.unicode_paths import CLDR_DIR
+

 DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
                                os.path.pardir, 'data', 'geonames')
@@ -30,6 +37,9 @@ POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
 NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )


+CLDR_ENGLISH_PATH = os.path.join(CLDR_DIR, 'common', 'main', 'en.xml')
+
+
 class boundary_types:
    COUNTRY = 0
    ADMIN1 = 1
@@ -88,6 +98,16 @@ geonames_fields = [
 DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields)
                             if f.is_dummy][0]

+CANONICAL_NAME_INDEX = [i for i, f in enumerate(geonames_fields)
+                        if f.c_constant == 'GEONAMES_CANONICAL'][0]
+
+NAME_INDEX = [i for i, f in enumerate(geonames_fields)
+              if f.c_constant == 'GEONAMES_NAME'][0]
+
+COUNTRY_CODE_INDEX = [i for i, f in enumerate(geonames_fields)
+                      if f.c_constant == 'GEONAMES_COUNTRY_CODE'][0]
+
+
 geonames_admin_joins = '''
 left join admin1_codes a1
    on a1.code = gn.admin1_code
@@ -185,13 +205,48 @@ group by postal_code, p.country_code
 BATCH_SIZE = 2000


-def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
+IGNORE_COUNTRIES = set(['ZZ'])
+
+COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS'])
+COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL'])
+
+
+def cldr_country_names(filename=CLDR_ENGLISH_PATH):
+    xml = etree.parse(open(filename))
+
+    country_names = {}
+
+    for territory in xml.xpath('*//territories/*'):
+        country_code = territory.attrib['type']
+        if country_code in IGNORE_COUNTRIES and not country_code.isdigit():
+            continue
+        elif country_code in COUNTRY_USE_SHORT_NAME and territory.attrib.get('alt') != 'short':
+            continue
+        elif country_code in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt') != 'variant':
+            continue
+        elif country_code not in COUNTRY_USE_SHORT_NAME and country_code not in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt'):
+            continue
+
+        country_names[country_code] = safe_encode(territory.text)
+
+    return country_names
+
+
+def create_geonames_tsv(db_path, out_dir=None):
    db = sqlite3.connect(db_path)

    filename = 'geonames.tsv'
-    f = open(os.path.join(out_dir, filename), 'w')
+    if out_dir:
+        f = open(os.path.join(out_dir, filename), 'w')
+    else:
+        f = sys.stdout
    writer = csv.writer(f, delimiter='\t')

+    country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries}
+    country_alpha2 = set([c.alpha2 for c in pycountry.countries])
+
+    country_names = cldr_country_names()
+
    for boundary_type, codes in geonames_admin_dictionaries.iteritems():
        if boundary_type != boundary_types.COUNTRY:
            predicate = 'where gn.feature_code in ({codes})'.format(
@@ -214,20 +269,42 @@ def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
            for row in batch:
                row = [safe_encode(val or '') for val in row]
                row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
+
+                if boundary_type == boundary_types.COUNTRY:
+                    alpha2_code = row[COUNTRY_CODE_INDEX]
+
+                    is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
+                    row[CANONICAL_NAME_INDEX] = country_names[row[COUNTRY_CODE_INDEX]]
+
+                    if alpha2_code and is_orig_name:
+                        alpha2_row = row[:]
+                        alpha2_row[NAME_INDEX] = alpha2_code
+                        rows.append(alpha2_row)
+
+                    if alpha2_code in country_code_alpha3_map and is_orig_name:
+                        alpha3_row = row[:]
+                        alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code]
+                        rows.append(alpha3_row)
+
                rows.append(row)

            writer.writerows(rows)
        cursor.close()
        f.flush()
-    f.close()
+
+    if out_dir:
+        f.close()
    db.close()


-def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
+def create_postal_codes_tsv(db_path, out_dir=None):
    db = sqlite3.connect(db_path)

    filename = 'postal_codes.tsv'
-    f = open(os.path.join(out_dir, filename), 'w')
+    if out_dir:
+        f = open(os.path.join(out_dir, filename), 'w')
+    else:
+        f = sys.stdout
    writer = csv.writer(f, delimiter='\t')

    cursor = db.execute(postal_codes_query)
@@ -243,7 +320,8 @@ def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
        writer.writerows(rows)

    cursor.close()
-    f.close()
+    if out_dir:
+        f.close()
    db.close()

 # Generates a C header telling us the order of the fields as written
@@ -285,7 +363,7 @@ if __name__ == '__main__':
                        default=DEFAULT_GEONAMES_DB_PATH,
                        help='SQLite db file')
    parser.add_argument('-o', '--out',
-                        default=DEFAULT_DATA_DIR, help='output directory')
+                        default=None, help='output directory')
    args = parser.parse_args()
    create_geonames_tsv(args.db, args.out)
    create_postal_codes_tsv(args.db, args.out)