diff --git a/scripts/geodata/__init__.py b/scripts/geodata/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/encoding.py b/scripts/geodata/encoding.py
new file mode 100644
index 00000000..b4bcbd61
--- /dev/null
+++ b/scripts/geodata/encoding.py
@@ -0,0 +1,34 @@
+import six
+
+text_type = six.text_type
+string_types = six.string_types
+binary_type = six.binary_type
+
+
+def safe_decode(value, encoding='utf-8', errors='strict'):
+    if isinstance(value, text_type):
+        return value
+
+    if isinstance(value, (string_types, binary_type)):
+        return value.decode(encoding, errors)
+    else:
+        return binary_type(value).decode(encoding, errors)
+
+
+def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
+    if not isinstance(value, (string_types, binary_type)):
+        return binary_type(value)
+
+    if isinstance(value, text_type):
+        return value.encode(encoding, errors)
+    else:
+        if hasattr(incoming, 'lower'):
+            incoming = incoming.lower()
+        if hasattr(encoding, 'lower'):
+            encoding = encoding.lower()
+
+        if value and encoding != incoming:
+            value = safe_decode(value, incoming or encoding, errors)
+            return value.encode(encoding, errors)
+        else:
+            return value
diff --git a/scripts/geodata/file_utils.py b/scripts/geodata/file_utils.py
new file mode 100644
index 00000000..3cef7cb1
--- /dev/null
+++ b/scripts/geodata/file_utils.py
@@ -0,0 +1,15 @@
+import os
+import subprocess
+
+
+def unzip_file(filename, dest_dir):
+    subprocess.call(['unzip', '-o', filename, '-d', dest_dir])
+
+
+def remove_file(filename):
+    os.unlink(filename)
+
+
+def ensure_dir(d):
+    if not os.path.exists(d):
+        os.makedirs(d)
diff --git a/scripts/geodata/geonames/__init__.py b/scripts/geodata/geonames/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/geonames/create_geonames_tsv.py b/scripts/geodata/geonames/create_geonames_tsv.py
new file mode 100644
index 00000000..cdfec3a6
--- /dev/null
+++ b/scripts/geodata/geonames/create_geonames_tsv.py
@@ -0,0 +1,220 @@
+import argparse
+import csv
+import os
+import sqlite3
+import sys
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
+
+from geodata.file_utils import *
+from geodata.encoding import safe_encode
+from geodata.geonames.geonames_sqlite import DEFAULT_GEONAMES_DB_PATH
+
+DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
+                                os.path.pardir, 'data', 'geonames')
+
+COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
+CONTINENT_FEATURE_CODES = ('CONT',)
+
+ADMIN_1_FEATURE_CODES = ('ADM1',)
+ADMIN_2_FEATURE_CODES = ('ADM2',)
+ADMIN_3_FEATURE_CODES = ('ADM3',)
+ADMIN_4_FEATURE_CODES = ('ADM4',)
+OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
+ADMIN_OTHER_FEATURE_CODES = ('ADMD', )
+
+POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
+                                 'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
+                                 'PPLR', 'PPLS', 'STLMT')
+NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
+
+
+class boundary_types:
+    COUNTRY = 0
+    ADMIN1 = 1
+    ADMIN2 = 2
+    ADMIN3 = 3
+    ADMIN4 = 4
+    ADMIN_OTHER = 5
+    LOCALITY = 6
+    NEIGHBORHOOD = 7
+
+geonames_admin_dictionaries = {
+    boundary_types.COUNTRY: COUNTRY_FEATURE_CODES,
+    boundary_types.ADMIN1: ADMIN_1_FEATURE_CODES,
+    boundary_types.ADMIN2: ADMIN_2_FEATURE_CODES,
+    boundary_types.ADMIN3: ADMIN_3_FEATURE_CODES,
+    boundary_types.ADMIN4: ADMIN_4_FEATURE_CODES,
+    boundary_types.ADMIN_OTHER: ADMIN_OTHER_FEATURE_CODES,
+    boundary_types.LOCALITY: POPULATED_PLACE_FEATURE_CODES,
+    boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
+}
+
+# Append new fields to the end for compatibility
+
+geonames_fields = [
+    'ifnull(alternate_name, gn.name) as alternate_name',
+    'gn.geonames_id as geonames_id',
+    'gn.name as name',
+    'iso_language',
+    'is_preferred_name',
+    'population',
+    'latitude',
+    'longitude',
+    'feature_code',
+    'gn.country_code as country_code',
+    'gn.admin1_code as admin1_code',
+    'a1.geonames_id as a1_gn_id',
+    'gn.admin2_code as admin2_code',
+    'a2.geonames_id as a2_gn_id',
+    'gn.admin3_code as admin3_code',
+    'a3.geonames_id as a3_gn_id',
+    'gn.admin4_code as admin4_code',
+    'a4.geonames_id as a4_gn_id',
+]
+
+base_geonames_query = '''
+select {fields}
+from geonames gn
+left join alternate_names an
+    on gn.geonames_id = an.geonames_id
+    and iso_language not in ('doi','faac','iata',
+                             'icao','link','post','tcid')
+left join admin1_codes a1
+    on a1.code = gn.admin1_code
+    and a1.country_code = gn.country_code
+left join admin2_codes a2
+    on a2.code = gn.admin2_code
+    and a2.admin1_code = gn.admin1_code
+    and a2.country_code = gn.country_code
+left join admin3_codes a3
+    on a3.code = gn.admin3_code
+    and a3.admin1_code = gn.admin1_code
+    and a3.admin2_code = gn.admin2_code
+    and a3.country_code = gn.country_code
+left join admin4_codes a4
+    on a4.code = gn.admin4_code
+    and a4.admin1_code = gn.admin1_code
+    and a4.admin2_code = gn.admin2_code
+    and a4.admin3_code = gn.admin3_code
+    and a4.country_code = gn.country_code
+{predicate}'''
+
+IGNORE_COUNTRY_POSTAL_CODES = set([
+    'AR',  # GeoNames has pre-1999 postal codes
+])
+
+postal_codes_query = '''
+select
+postal_code,
+p.country_code as country_code,
+n.geonames_id is not null as have_containing_geoname,
+n.geonames_id as containing_geoname_id,
+group_concat(distinct a1.geonames_id) admin1_ids,
+group_concat(distinct a2.geonames_id) admin2_ids,
+group_concat(distinct a3.geonames_id) admin3_ids
+from postal_codes p
+left join (
+    select
+    gn.geonames_id,
+    alternate_name,
+    country_code,
+    gn.name
+    from alternate_names an
+    join geonames gn
+        on an.geonames_id = gn.geonames_id
+    where iso_language = 'post'
+) as n
+on p.postal_code = n.alternate_name
+and p.country_code = n.country_code
+left join admin1_codes a1
+    on a1.code = p.admin1_code
+    and p.country_code = a1.country_code
+left join admin2_codes a2
+    on a2.code = p.admin2_code
+    and a2.admin1_code = p.admin1_code
+    and a2.country_code = p.country_code
+left join admin3_codes a3
+    on a3.code = p.admin3_code
+    and a3.admin1_code = p.admin1_code
+    and a3.admin2_code = p.admin2_code
+    and a3.country_code = p.country_code
+where p.country_code not in ({codes})
+group by postal_code, p.country_code
+'''.format(codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
+
+BATCH_SIZE = 10000
+
+
+def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
+    db = sqlite3.connect(db_path)
+
+    filename = 'geonames.tsv'
+    f = open(os.path.join(out_dir, filename), 'w')
+    writer = csv.writer(f, delimiter='\t')
+
+    for boundary_type, codes in geonames_admin_dictionaries.iteritems():
+        if boundary_type != boundary_types.COUNTRY:
+            predicate = 'where gn.feature_code in ({codes})'.format(
+                codes=','.join(['"{}"'.format(c) for c in codes])
+            )
+        else:
+            # The query for countries in GeoNames is somewhat non-trivial
+            predicate = 'where gn.geonames_id in (select geonames_id from countries)'
+
+        query = base_geonames_query.format(
+            fields=','.join(geonames_fields),
+            predicate=predicate
+        )
+        cursor = db.execute(query)
+        while True:
+            batch = cursor.fetchmany(BATCH_SIZE)
+            if not batch:
+                break
+            rows = [
+                [str(boundary_type)] + [safe_encode(val or '') for val in row]
+                for row in batch
+            ]
+            writer.writerows(rows)
+        cursor.close()
+    f.flush()
+    f.close()
+    db.close()
+
+
+def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
+    db = sqlite3.connect(db_path)
+
+    filename = 'postal_codes.tsv'
+    f = open(os.path.join(out_dir, filename), 'w')
+    writer = csv.writer(f, delimiter='\t')
+
+    cursor = db.execute(postal_codes_query)
+
+    while True:
+        batch = cursor.fetchmany(BATCH_SIZE)
+        if not batch:
+            break
+        rows = [
+            [safe_encode(val or '') for val in row]
+            for row in batch
+        ]
+        writer.writerows(rows)
+
+    cursor.close()
+    f.close()
+    db.close()
+
+
+if __name__ == '__main__':
+    # Handle argument parsing here
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--db',
+                        default=DEFAULT_GEONAMES_DB_PATH,
+                        help='SQLite db file')
+    parser.add_argument('-o', '--out',
+                        default=DEFAULT_DATA_DIR, help='output directory')
+    args = parser.parse_args()
+    create_geonames_tsv(args.db, args.out)
+    create_postal_codes_tsv(args.db, args.out)
diff --git a/scripts/geodata/geonames/geonames_sqlite.py b/scripts/geodata/geonames/geonames_sqlite.py
new file mode 100644
index 00000000..f9564be5
--- /dev/null
+++ b/scripts/geodata/geonames/geonames_sqlite.py
@@ -0,0 +1,340 @@
+import os
+import shutil
+import sqlite3
+
+import tempfile
+import urlparse
+import urllib2
+import subprocess
+
+import logging
+
+import argparse
+
+import csv
+import sys
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
+
+from geodata.encoding import safe_decode
+
+from geodata.file_utils import *
+from geodata.log import *
+
+from itertools import islice, chain
+
+log_to_stdout()
+logger = logging.getLogger('geonames.sqlite')
+
+GEONAMES_DB_NAME = 'geonames.db'
+
+DEFAULT_GEONAMES_DB_PATH = os.path.join(this_dir, os.path.pardir,
+                                        os.path.pardir, os.path.pardir,
+                                        'data', 'geonames', GEONAMES_DB_NAME)
+
+BASE_URL = 'http://download.geonames.org/export/'
+
+DUMP_URL = urlparse.urljoin(BASE_URL, 'dump/')
+ALL_COUNTRIES_ZIP_FILE = 'allCountries.zip'
+HIERARCHY_ZIP_FILE = 'hierarchy.zip'
+ALTERNATE_NAMES_ZIP_FILE = 'alternateNames.zip'
+
+ZIP_URL = urlparse.urljoin(BASE_URL, 'zip/')
+
+GEONAMES_DUMP_FILES = (ALL_COUNTRIES_ZIP_FILE,
+                       HIERARCHY_ZIP_FILE,
+                       ALTERNATE_NAMES_ZIP_FILE)
+
+# base_url, local_dir, is_gzipped, local_filename
+
+
+GEONAMES_FILES = [(DUMP_URL, '', True, ALL_COUNTRIES_ZIP_FILE),
+                  (DUMP_URL, '', True, HIERARCHY_ZIP_FILE),
+                  (DUMP_URL, '', True, ALTERNATE_NAMES_ZIP_FILE),
+                  (ZIP_URL, 'zip', True, ALL_COUNTRIES_ZIP_FILE),
+                  ]
+
+
+def download_file(url, dest):
+    logger.info('Downloading file from {}'.format(url))
+    req = urllib2.urlopen(url)
+    with open(dest, 'wb') as fp:
+        shutil.copyfileobj(req, fp)
+
+
+def admin_ddl(admin_level):
+    # geonames_id goes last; order must match the SELECT in populate_admin_table
+    columns = ['admin{}_code TEXT'.format(i)
+               for i in xrange(1, admin_level)]
+    columns.append('geonames_id INT')
+    create = '''
+    CREATE TABLE admin{level}_codes (
+        code TEXT,
+        name TEXT,
+        country_code TEXT,
+        {fields}
+    )'''.format(level=admin_level,
+                fields=''',
+        '''.join(columns))
+
+    indices = (
+        '''CREATE INDEX admin{}_code_index ON
+           admin{}_codes (code)'''.format(admin_level, admin_level),
+        '''CREATE INDEX admin{}_gn_id_index ON
+           admin{}_codes (geonames_id)'''.format(admin_level, admin_level),
+    )
+
+    return (create, ) + indices
+
+geonames_ddl = {
+    'geonames': (
+        '''CREATE TABLE geonames (
+            geonames_id INT PRIMARY KEY,
+            name TEXT,
+            ascii_name TEXT,
+            alternate_names TEXT,
+            latitude DOUBLE,
+            longitude DOUBLE,
+            feature_class TEXT,
+            feature_code TEXT,
+            country_code TEXT,
+            cc2 TEXT,
+            admin1_code TEXT,
+            admin2_code TEXT,
+            admin3_code TEXT,
+            admin4_code TEXT,
+            population LONG,
+            elevation INT,
+            dem INT,
+            timezone TEXT,
+            modification_date TEXT);''',
+        '''CREATE INDEX feature_code ON
+           geonames (feature_code)''',
+        '''CREATE INDEX country_code ON
+           geonames (country_code)''',
+        '''CREATE INDEX admin1_code ON
+           geonames (admin1_code)''',
+        '''CREATE INDEX admin2_code ON
+           geonames (admin2_code)''',
+        '''CREATE INDEX admin3_code ON
+           geonames (admin3_code)''',
+        '''CREATE INDEX admin4_code ON
+           geonames (admin4_code)''',
+    ),
+
+    'alternate_names': (
+        '''CREATE TABLE alternate_names (
+            alternate_name_id INT PRIMARY KEY,
+            geonames_id INT,
+            iso_language TEXT,
+            alternate_name TEXT,
+            is_preferred_name BOOLEAN,
+            is_short_name BOOLEAN,
+            is_colloquial BOOLEAN,
+            is_historic BOOLEAN)''',
+        '''CREATE INDEX geonames_id_index ON
+           alternate_names (geonames_id)''',
+    ),
+
+    'hierarchy': (
+        '''CREATE TABLE hierarchy (
+            parent_id INT,
+            child_id INT,
+            type TEXT
+        );''',
+        '''CREATE INDEX parent_child_index ON
+           hierarchy (parent_id, child_id)''',
+        '''CREATE INDEX child_parent_index ON
+           hierarchy (child_id, parent_id)''',
+    ),
+
+    'postal_codes': (
+        '''CREATE TABLE postal_codes (
+            country_code TEXT,
+            postal_code TEXT,
+            place_name TEXT,
+            admin1 TEXT,
+            admin1_code TEXT,
+            admin2 TEXT,
+            admin2_code TEXT,
+            admin3 TEXT,
+            admin3_code TEXT,
+            latitude DOUBLE,
+            longitude DOUBLE,
+            accuracy INT
+        )''',
+        '''CREATE INDEX post_code_index ON
+           postal_codes (country_code, postal_code)''',
+    ),
+    'admin1_codes': admin_ddl(1),
+    'admin2_codes': admin_ddl(2),
+    'admin3_codes': admin_ddl(3),
+    'admin4_codes': admin_ddl(4),
+
+}
+
+geonames_file_table_map = {
+    ('', ALL_COUNTRIES_ZIP_FILE): 'geonames',
+    ('', ALTERNATE_NAMES_ZIP_FILE): 'alternate_names',
+    ('', HIERARCHY_ZIP_FILE): 'hierarchy',
+    ('zip', ALL_COUNTRIES_ZIP_FILE): 'postal_codes',
+}
+
+
+country_codes_create_table = (
+    'drop table if exists country_codes',
+    '''
+    create table country_codes as
+    select distinct country_code from geonames
+    where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS', 'TERR')
+    ''',
+)
+
+proper_countries_create_table = (
+    'drop table if exists proper_countries',
+    '''
+    create table proper_countries as
+    select * from geonames
+    where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
+    and country_code in (select country_code from country_codes)
+    ''',
+)
+
+territories_create_table = (
+    'drop table if exists territories',
+    '''
+    create table territories as
+    select * from geonames where feature_code = 'TERR'
+    and country_code not in (select country_code from proper_countries);
+    ''',
+)
+
+countries_create_table = (
+    'drop table if exists countries',
+    '''
+    create table countries as
+    select * from proper_countries
+    union
+    select * from territories;
+    ''',
+    'create index country_geonames_id on countries (geonames_id)',
+)
+
+country_aliases_create_table = (
+    'drop table if exists country_aliases',
+    '''
+    create table country_aliases as
+    select name, country_code
+    from countries
+    union
+    select alternate_name, country_code
+    from alternate_names an
+    join countries c
+        on c.geonames_id = an.geonames_id
+    where alternate_name != ''
+    and iso_language not in ('doi','faac','iata',
+                             'icao','link','post','tcid')
+    '''
+)
+
+country_table_create_statements = list(chain(country_codes_create_table,
+                                             proper_countries_create_table,
+                                             territories_create_table,
+                                             countries_create_table,
+                                             country_aliases_create_table))
+
+
+def create_table(conn, table):
+    cursor = conn.cursor()
+    create_statements = geonames_ddl[table]
+    cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
+    for statement in create_statements:
+        cursor.execute(statement)
+    conn.commit()
+
+
+def batch_iter(iterable, batch_size):
+    source_iter = iter(iterable)
+    while True:
+        batch = list(islice(source_iter, batch_size))
+        if len(batch) > 0:
+            yield batch
+        else:
+            return
+
+
+def populate_admin_table(conn, admin_level):
+    logging.info('Doing admin level {}'.format(admin_level))
+
+    columns = ['admin{}_code'.format(admin_level),
+               'name', 'country_code']
+    columns.extend(['admin{}_code'.format(i)
+                    for i in xrange(1, admin_level)])
+    columns.append('geonames_id')
+
+    admin_insert_statement = '''
+    insert into "admin{}_codes"
+    select {}
+    from geonames
+    where feature_code = "ADM{}"
+    '''.format(admin_level, ','.join(columns), admin_level)
+
+    conn.execute(admin_insert_statement)
+    conn.commit()
+
+    logging.info('Done with admin level {}'.format(admin_level))
+
+
+def import_geonames_table(conn, table, f, batch_size=2000):
+    # double the braces around the VALUES placeholders so format() keeps them for later
+    statement = 'INSERT INTO "{}" VALUES ({{}})'.format(table)
+    cursor = conn.cursor()
+    for i, batch in enumerate(batch_iter(f, batch_size)):
+        num_cols = len(batch[0])
+        cursor.executemany(statement.format(','.join(['?'] * num_cols)), batch)
+        conn.commit()
+        cursor = conn.cursor()
+        logging.info('imported {} batches ({} records)'.format(i + 1, (i + 1) * batch_size))
+    cursor.close()
+
+
+def create_geonames_sqlite_db(temp_dir, db_file=DEFAULT_GEONAMES_DB_PATH):
+    conn = sqlite3.connect(db_file)
+    logging.info('Created database at {}'.format(db_file))
+    for url, directory, is_gzipped, filename in GEONAMES_FILES:
+        table = geonames_file_table_map[(directory, filename)]
+        create_table(conn, table)
+        full_url = urlparse.urljoin(url, filename)
+        dest_dir = os.path.join(temp_dir, directory)
+        ensure_dir(dest_dir)
+        dest_file = os.path.join(dest_dir, filename)
+        download_file(full_url, dest_file)
+        if is_gzipped:
+            unzip_file(dest_file, dest_dir)
+            filename = dest_file.replace('.zip', '.txt')
+        reader = csv.reader(open(filename), delimiter='\t', quotechar=None)
+        lines = (map(safe_decode, line) for line in reader)
+        import_geonames_table(conn, table, lines)
+    logging.info('Creating countries tables')
+    for statement in country_table_create_statements:
+        conn.execute(statement)
+    conn.commit()
+    logging.info('Creating admin tables')
+    for admin_level in xrange(1, 5):
+        create_table(conn, 'admin{}_codes'.format(admin_level))
+        populate_admin_table(conn, admin_level)
+    conn.close()
+
+
+if __name__ == '__main__':
+    # Handle argument parsing here
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-t', '--temp-dir',
+                        default=tempfile.gettempdir(),
+                        help='Temporary work directory')
+    parser.add_argument('-o', '--out',
+                        default=DEFAULT_GEONAMES_DB_PATH,
+                        help='SQLite3 db filename')
+    args = parser.parse_args()
+    create_geonames_sqlite_db(args.temp_dir, args.out)
diff --git a/scripts/geodata/log.py b/scripts/geodata/log.py
new file mode 100644
index 00000000..43c702a5
--- /dev/null
+++ b/scripts/geodata/log.py
@@ -0,0 +1,10 @@
+import logging
+import sys
+
+
+def log_to_stdout(level=logging.INFO):
+    handler = logging.StreamHandler(sys.stdout)
+    formatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]: %(message)s')
+    handler.setFormatter(formatter)
+    logging.root.addHandler(handler)
+    logging.root.setLevel(level)