import argparse
import csv
import os
import sqlite3
import sys

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

from geodata.file_utils import *
from geodata.encoding import safe_encode
from geodata.geonames.geonames_sqlite import DEFAULT_GEONAMES_DB_PATH

DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
                                os.path.pardir, 'data', 'geonames')

COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
CONTINENT_FEATURE_CODES = ('CONT',)

ADMIN_1_FEATURE_CODES = ('ADM1',)
ADMIN_2_FEATURE_CODES = ('ADM2',)
ADMIN_3_FEATURE_CODES = ('ADM3',)
ADMIN_4_FEATURE_CODES = ('ADM4',)
OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
ADMIN_OTHER_FEATURE_CODES = ('ADMD',)

POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
                                 'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
                                 'PPLR', 'PPLS', 'STLMT')
NEIGHBORHOOD_FEATURE_CODES = ('PPLX',)


class boundary_types:
    COUNTRY = 0
    ADMIN1 = 1
    ADMIN2 = 2
    ADMIN3 = 3
    ADMIN4 = 4
    ADMIN_OTHER = 5
    LOCALITY = 6
    NEIGHBORHOOD = 7


geonames_admin_dictionaries = {
    boundary_types.COUNTRY: COUNTRY_FEATURE_CODES,
    boundary_types.ADMIN1: ADMIN_1_FEATURE_CODES,
    boundary_types.ADMIN2: ADMIN_2_FEATURE_CODES,
    boundary_types.ADMIN3: ADMIN_3_FEATURE_CODES,
    boundary_types.ADMIN4: ADMIN_4_FEATURE_CODES,
    boundary_types.ADMIN_OTHER: ADMIN_OTHER_FEATURE_CODES,
    boundary_types.LOCALITY: POPULATED_PLACE_FEATURE_CODES,
    boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
}

# Append new fields to the end for compatibility
geonames_fields = [
    'ifnull(alternate_name, gn.name) as alternate_name',
    'gn.geonames_id as geonames_id',
    'gn.name as name',
    'iso_language',
    'is_preferred_name',
    'population',
    'latitude',
    'longitude',
    'feature_code',
    'gn.country_code as country_code',
    'gn.admin1_code as admin1_code',
    'a1.geonames_id as a1_gn_id',
    'gn.admin2_code as admin2_code',
    'a2.geonames_id as a2_gn_id',
    'gn.admin3_code as admin3_code',
    'a3.geonames_id as a3_gn_id',
    'gn.admin4_code as admin4_code',
    'a4.geonames_id as a4_gn_id',
]

base_geonames_query = '''
select {fields} from geonames gn
left join alternate_names an
    on gn.geonames_id = an.geonames_id
    and iso_language not in ('doi', 'faac', 'iata', 'icao', 'link', 'post', 'tcid')
left join admin1_codes a1
    on a1.code = gn.admin1_code
    and a1.country_code = gn.country_code
left join admin2_codes a2
    on a2.code = gn.admin2_code
    and a2.admin1_code = gn.admin1_code
    and a2.country_code = gn.country_code
left join admin3_codes a3
    on a3.code = gn.admin3_code
    and a3.admin1_code = gn.admin1_code
    and a3.admin2_code = gn.admin2_code
    and a3.country_code = gn.country_code
left join admin4_codes a4
    on a4.code = gn.admin4_code
    and a4.admin1_code = gn.admin1_code
    and a4.admin2_code = gn.admin2_code
    and a4.admin3_code = gn.admin3_code
    and a4.country_code = gn.country_code
{predicate}'''

IGNORE_COUNTRY_POSTAL_CODES = set([
    'AR',  # GeoNames has pre-1999 postal codes
])

postal_codes_query = '''
select
    postal_code,
    p.country_code as country_code,
    n.geonames_id is not null as have_containing_geoname,
    n.geonames_id as containing_geoname_id,
    group_concat(distinct a1.geonames_id) admin1_ids,
    group_concat(distinct a2.geonames_id) admin2_ids,
    group_concat(distinct a3.geonames_id) admin3_ids
from postal_codes p
left join (
    select gn.geonames_id, alternate_name, country_code, gn.name
    from alternate_names an
    join geonames gn
        on an.geonames_id = gn.geonames_id
    where iso_language = 'post'
) as n
    on p.postal_code = n.alternate_name
    and p.country_code = n.country_code
left join admin1_codes a1
    on a1.code = p.admin1_code
    and p.country_code = a1.country_code
left join admin2_codes a2
    on a2.code = p.admin2_code
    and a2.admin1_code = p.admin1_code
    and a2.country_code = p.country_code
left join admin3_codes a3
    on a3.code = p.admin3_code
    and a3.admin1_code = p.admin1_code
    and a3.admin2_code = p.admin2_code
    and a3.country_code = p.country_code
where p.country_code not in ({codes})
group by postal_code, p.country_code
'''.format(codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))

BATCH_SIZE = 10000


def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
    db = sqlite3.connect(db_path)

    filename = 'geonames.tsv'

    f = open(os.path.join(out_dir, filename), 'w')
    writer = csv.writer(f, delimiter='\t')

    for boundary_type, codes in geonames_admin_dictionaries.iteritems():
        if boundary_type != boundary_types.COUNTRY:
            predicate = 'where gn.feature_code in ({codes})'.format(
                codes=','.join(['"{}"'.format(c) for c in codes])
            )
        else:
            # The query for countries in GeoNames is somewhat non-trivial
            predicate = 'where gn.geonames_id in (select geonames_id from countries)'

        query = base_geonames_query.format(
            fields=','.join(geonames_fields),
            predicate=predicate
        )

        cursor = db.execute(query)

        while True:
            batch = cursor.fetchmany(BATCH_SIZE)
            if not batch:
                break
            rows = [
                [str(boundary_type)] + [safe_encode(val or '') for val in row]
                for row in batch
            ]
            writer.writerows(rows)

        cursor.close()

    f.flush()
    f.close()
    db.close()


def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
    db = sqlite3.connect(db_path)

    filename = 'postal_codes.tsv'

    f = open(os.path.join(out_dir, filename), 'w')
    writer = csv.writer(f, delimiter='\t')

    cursor = db.execute(postal_codes_query)

    while True:
        batch = cursor.fetchmany(BATCH_SIZE)
        if not batch:
            break
        rows = [
            [safe_encode(val or '') for val in row]
            for row in batch
        ]
        writer.writerows(rows)

    cursor.close()
    f.close()
    db.close()


if __name__ == '__main__':
    # Handle argument parsing here
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '--db',
                        default=DEFAULT_GEONAMES_DB_PATH,
                        help='SQLite db file')

    parser.add_argument('-o', '--out',
                        default=DEFAULT_DATA_DIR,
                        help='output directory')

    args = parser.parse_args()

    create_geonames_tsv(args.db, args.out)
    create_postal_codes_tsv(args.db, args.out)