221 lines
6.6 KiB
Python
221 lines
6.6 KiB
Python
import argparse
|
|
import csv
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
|
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
|
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
|
|
|
from geodata.file_utils import *
|
|
from geodata.encoding import safe_encode
|
|
from geodata.geonames.geonames_sqlite import DEFAULT_GEONAMES_DB_PATH
|
|
|
|
DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
|
|
os.path.pardir, 'data', 'geonames')
|
|
|
|
COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
|
|
CONTINENT_FEATURE_CODES = ('CONT',)
|
|
|
|
ADMIN_1_FEATURE_CODES = ('ADM1',)
|
|
ADMIN_2_FEATURE_CODES = ('ADM2',)
|
|
ADMIN_3_FEATURE_CODES = ('ADM3',)
|
|
ADMIN_4_FEATURE_CODES = ('ADM4',)
|
|
OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
|
|
ADMIN_OTHER_FEATURE_CODES = ('ADMD', )
|
|
|
|
POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
|
|
'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
|
|
'PPLR', 'PPLS', 'STLMT')
|
|
NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
|
|
|
|
|
|
class boundary_types:
|
|
COUNTRY = 0
|
|
ADMIN1 = 1
|
|
ADMIN2 = 2
|
|
ADMIN3 = 3
|
|
ADMIN4 = 4
|
|
ADMIN_OTHER = 5
|
|
LOCALITY = 6
|
|
NEIGHBORHOOD = 7
|
|
|
|
geonames_admin_dictionaries = {
|
|
boundary_types.COUNTRY: COUNTRY_FEATURE_CODES,
|
|
boundary_types.ADMIN1: ADMIN_1_FEATURE_CODES,
|
|
boundary_types.ADMIN2: ADMIN_2_FEATURE_CODES,
|
|
boundary_types.ADMIN3: ADMIN_3_FEATURE_CODES,
|
|
boundary_types.ADMIN4: ADMIN_4_FEATURE_CODES,
|
|
boundary_types.ADMIN_OTHER: ADMIN_OTHER_FEATURE_CODES,
|
|
boundary_types.LOCALITY: POPULATED_PLACE_FEATURE_CODES,
|
|
boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
|
|
}
|
|
|
|
# Append new fields to the end for compatibility
|
|
|
|
geonames_fields = [
|
|
'ifnull(alternate_name, gn.name) as alternate_name',
|
|
'gn.geonames_id as geonames_id',
|
|
'gn.name as name',
|
|
'iso_language',
|
|
'is_preferred_name',
|
|
'population',
|
|
'latitude',
|
|
'longitude',
|
|
'feature_code',
|
|
'gn.country_code as country_code',
|
|
'gn.admin1_code as admin1_code',
|
|
'a1.geonames_id as a1_gn_id',
|
|
'gn.admin2_code as admin2_code',
|
|
'a2.geonames_id as a2_gn_id',
|
|
'gn.admin3_code as admin3_code',
|
|
'a3.geonames_id as a3_gn_id',
|
|
'gn.admin4_code as admin4_code',
|
|
'a4.geonames_id as a4_gn_id',
|
|
]
|
|
|
|
base_geonames_query = '''
|
|
select {fields}
|
|
from geonames gn
|
|
left join alternate_names an
|
|
on gn.geonames_id = an.geonames_id
|
|
and iso_language not in ('doi','faac','iata',
|
|
'icao','link','post','tcid')
|
|
left join admin1_codes a1
|
|
on a1.code = gn.admin1_code
|
|
and a1.country_code = gn.country_code
|
|
left join admin2_codes a2
|
|
on a2.code = gn.admin2_code
|
|
and a2.admin1_code = gn.admin1_code
|
|
and a2.country_code = gn.country_code
|
|
left join admin3_codes a3
|
|
on a3.code = gn.admin3_code
|
|
and a3.admin1_code = gn.admin1_code
|
|
and a3.admin2_code = gn.admin2_code
|
|
and a3.country_code = gn.country_code
|
|
left join admin4_codes a4
|
|
on a4.code = gn.admin4_code
|
|
and a4.admin1_code = gn.admin1_code
|
|
and a4.admin2_code = gn.admin2_code
|
|
and a4.admin3_code = gn.admin3_code
|
|
and a4.country_code = gn.country_code
|
|
{predicate}'''
|
|
|
|
IGNORE_COUNTRY_POSTAL_CODES = set([
|
|
'AR', # GeoNames has pre-1999 postal codes
|
|
])
|
|
|
|
postal_codes_query = '''
|
|
select
|
|
postal_code,
|
|
p.country_code as country_code,
|
|
n.geonames_id is not null as have_containing_geoname,
|
|
n.geonames_id as containing_geoname_id,
|
|
group_concat(distinct a1.geonames_id) admin1_ids,
|
|
group_concat(distinct a2.geonames_id) admin2_ids,
|
|
group_concat(distinct a3.geonames_id) admin3_ids
|
|
from postal_codes p
|
|
left join (
|
|
select
|
|
gn.geonames_id,
|
|
alternate_name,
|
|
country_code,
|
|
gn.name
|
|
from alternate_names an
|
|
join geonames gn
|
|
on an.geonames_id = gn.geonames_id
|
|
where iso_language = 'post'
|
|
) as n
|
|
on p.postal_code = n.alternate_name
|
|
and p.country_code = n.country_code
|
|
left join admin1_codes a1
|
|
on a1.code = p.admin1_code
|
|
and p.country_code = a1.country_code
|
|
left join admin2_codes a2
|
|
on a2.code = p.admin2_code
|
|
and a2.admin1_code = p.admin1_code
|
|
and a2.country_code = p.country_code
|
|
left join admin3_codes a3
|
|
on a3.code = p.admin3_code
|
|
and a3.admin1_code = p.admin1_code
|
|
and a3.admin2_code = p.admin2_code
|
|
and a3.country_code = p.country_code
|
|
where p.country_code not in ({codes})
|
|
group by postal_code, p.country_code
|
|
'''.format(codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
|
|
|
|
BATCH_SIZE = 10000
|
|
|
|
|
|
def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
|
db = sqlite3.connect(db_path)
|
|
|
|
filename = 'geonames.tsv'
|
|
f = open(os.path.join(out_dir, filename), 'w')
|
|
writer = csv.writer(f, delimiter='\t')
|
|
|
|
for boundary_type, codes in geonames_admin_dictionaries.iteritems():
|
|
if boundary_type != boundary_types.COUNTRY:
|
|
predicate = 'where gn.feature_code in ({codes})'.format(
|
|
codes=','.join(['"{}"'.format(c) for c in codes])
|
|
)
|
|
else:
|
|
# The query for countries in GeoNames is somewhat non-trivial
|
|
predicate = 'where gn.geonames_id in (select geonames_id from countries)'
|
|
|
|
query = base_geonames_query.format(
|
|
fields=','.join(geonames_fields),
|
|
predicate=predicate
|
|
)
|
|
cursor = db.execute(query)
|
|
while True:
|
|
batch = cursor.fetchmany(BATCH_SIZE)
|
|
if not batch:
|
|
break
|
|
rows = [
|
|
[str(boundary_type)] + [safe_encode(val or '') for val in row]
|
|
for row in batch
|
|
]
|
|
writer.writerows(rows)
|
|
cursor.close()
|
|
f.flush()
|
|
f.close()
|
|
db.close()
|
|
|
|
|
|
def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
|
db = sqlite3.connect(db_path)
|
|
|
|
filename = 'postal_codes.tsv'
|
|
f = open(os.path.join(out_dir, filename), 'w')
|
|
writer = csv.writer(f, delimiter='\t')
|
|
|
|
cursor = db.execute(postal_codes_query)
|
|
|
|
while True:
|
|
batch = cursor.fetchmany(BATCH_SIZE)
|
|
if not batch:
|
|
break
|
|
rows = [
|
|
[safe_encode(val or '') for val in row]
|
|
for row in batch
|
|
]
|
|
writer.writerows(rows)
|
|
|
|
cursor.close()
|
|
f.close()
|
|
db.close()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
# Handle argument parsing here
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument('-d', '--db',
|
|
default=DEFAULT_GEONAMES_DB_PATH,
|
|
help='SQLite db file')
|
|
parser.add_argument('-o', '--out',
|
|
default=DEFAULT_DATA_DIR, help='output directory')
|
|
args = parser.parse_args()
|
|
create_geonames_tsv(args.db, args.out)
|
|
create_postal_codes_tsv(args.db, args.out)
|