334 lines
9.9 KiB
Python
334 lines
9.9 KiB
Python
import os
|
|
import shutil
|
|
import sqlite3
|
|
|
|
import tempfile
|
|
import urlparse
|
|
import urllib2
|
|
import subprocess
|
|
|
|
import logging
|
|
|
|
import argparse
|
|
|
|
import csv
|
|
import sys
|
|
|
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
|
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
|
|
|
from geodata.encoding import safe_decode
|
|
from geodata.geonames.paths import *
|
|
|
|
from geodata.file_utils import *
|
|
from geodata.log import *
|
|
|
|
from itertools import islice, chain
|
|
|
|
log_to_file(sys.stderr)
|
|
logger = logging.getLogger('geonames.sqlite')
|
|
|
|
BASE_URL = 'http://download.geonames.org/export/'
|
|
|
|
DUMP_URL = urlparse.urljoin(BASE_URL, 'dump/')
|
|
ALL_COUNTRIES_ZIP_FILE = 'allCountries.zip'
|
|
HIERARCHY_ZIP_FILE = 'hierarchy.zip'
|
|
ALTERNATE_NAMES_ZIP_FILE = 'alternateNames.zip'
|
|
|
|
ZIP_URL = urlparse.urljoin(BASE_URL, 'zip/')
|
|
|
|
GEONAMES_DUMP_FILES = (ALL_COUNTRIES_ZIP_FILE,
|
|
HIERARCHY_ZIP_FILE,
|
|
ALTERNATE_NAMES_ZIP_FILE)
|
|
|
|
# base_url, local_dir, is_gzipped, local_filename
|
|
|
|
|
|
GEONAMES_FILES = [(DUMP_URL, '', True, ALL_COUNTRIES_ZIP_FILE),
|
|
(DUMP_URL, '', True, HIERARCHY_ZIP_FILE),
|
|
(DUMP_URL, '', True, ALTERNATE_NAMES_ZIP_FILE),
|
|
(ZIP_URL, 'zip', True, ALL_COUNTRIES_ZIP_FILE),
|
|
]
|
|
|
|
|
|
def download_file(url, dest):
|
|
logger.info('Downloading file from {}'.format(url))
|
|
subprocess.check_call(['wget', url, '-O', dest])
|
|
|
|
|
|
def admin_ddl(admin_level):
|
|
columns = ['country_code TEXT'] + \
|
|
['admin{}_code TEXT'.format(i)
|
|
for i in xrange(1, admin_level)]
|
|
|
|
create = '''
|
|
CREATE TABLE admin{level}_codes (
|
|
geonames_id INT,
|
|
code TEXT,
|
|
name TEXT,
|
|
{fields}
|
|
)'''.format(level=admin_level,
|
|
fields=''',
|
|
'''.join(columns))
|
|
|
|
indices = (
|
|
'''CREATE INDEX admin{}_code_index ON
|
|
admin{}_codes (code)'''.format(admin_level, admin_level),
|
|
'''CREATE INDEX admin{}_gn_id_index ON
|
|
admin{}_codes (geonames_id)'''.format(admin_level, admin_level),
|
|
)
|
|
|
|
return (create, ) + indices
|
|
|
|
geonames_ddl = {
|
|
'geonames': (
|
|
'''CREATE TABLE geonames (
|
|
geonames_id INT PRIMARY KEY,
|
|
name TEXT,
|
|
ascii_name TEXT,
|
|
alternate_names TEXT,
|
|
latitude DOUBLE,
|
|
longitude DOUBLE,
|
|
feature_class TEXT,
|
|
feature_code TEXT,
|
|
country_code TEXT,
|
|
cc2 TEXT,
|
|
admin1_code TEXT,
|
|
admin2_code TEXT,
|
|
admin3_code TEXT,
|
|
admin4_code TEXT,
|
|
population LONG DEFAULT 0,
|
|
elevation INT,
|
|
dem INT,
|
|
timezone TEXT,
|
|
modification_date TEXT)''',
|
|
'''CREATE INDEX feature_code ON
|
|
geonames (feature_code)''',
|
|
'''CREATE INDEX country_code ON
|
|
geonames (country_code)''',
|
|
'''CREATE INDEX admin_codes ON
|
|
geonames (country_code, admin1_code, admin2_code, admin3_code, admin4_code)''',
|
|
),
|
|
|
|
'alternate_names': (
|
|
'''CREATE TABLE alternate_names (
|
|
alternate_name_id INT PRIMARY KEY,
|
|
geonames_id INT,
|
|
iso_language TEXT,
|
|
alternate_name TEXT,
|
|
is_preferred_name BOOLEAN DEFAULT 0,
|
|
is_short_name BOOLEAN DEFAULT 0,
|
|
is_colloquial BOOLEAN DEFAULT 0,
|
|
is_historic BOOLEAN DEFAULT 0)''',
|
|
'''CREATE INDEX geonames_id_index ON
|
|
alternate_names (geonames_id)''',
|
|
'''CREATE INDEX geonames_id_alt_name_index ON
|
|
alternate_names(geonames_id, alternate_name)''',
|
|
),
|
|
|
|
'hierarchy': (
|
|
'''CREATE TABLE hierarchy (
|
|
parent_id INT,
|
|
child_id INT,
|
|
type TEXT
|
|
);''',
|
|
'''CREATE INDEX parent_child_index ON
|
|
hierarchy (parent_id, child_id)''',
|
|
'''CREATE INDEX child_parent_index ON
|
|
hierarchy (child_id, parent_id)''',
|
|
),
|
|
|
|
'postal_codes': (
|
|
'''CREATE TABLE postal_codes (
|
|
country_code TEXT,
|
|
postal_code TEXT,
|
|
place_name TEXT,
|
|
admin1 TEXT,
|
|
admin1_code TEXT,
|
|
admin2 TEXT,
|
|
admin2_code TEXT,
|
|
admin3 TEXT,
|
|
admin3_code TEXT,
|
|
latitude DOUBLE,
|
|
longitude DOUBLE,
|
|
accuracy INT
|
|
)''',
|
|
'''CREATE INDEX post_code_index ON
|
|
postal_codes (country_code, postal_code)''',
|
|
'''CREATE INDEX postal_code_admins ON
|
|
postal_codes (country_code, admin1_code, admin2_code, admin3_code)''',
|
|
),
|
|
'admin1_codes': admin_ddl(1),
|
|
'admin2_codes': admin_ddl(2),
|
|
'admin3_codes': admin_ddl(3),
|
|
'admin4_codes': admin_ddl(4),
|
|
|
|
}
|
|
|
|
geonames_file_table_map = {
|
|
('', ALL_COUNTRIES_ZIP_FILE): 'geonames',
|
|
('', ALTERNATE_NAMES_ZIP_FILE): 'alternate_names',
|
|
('', HIERARCHY_ZIP_FILE): 'hierarchy',
|
|
('zip', ALL_COUNTRIES_ZIP_FILE): 'postal_codes',
|
|
}
|
|
|
|
|
|
country_codes_create_table = (
|
|
'drop table if exists country_codes',
|
|
'''
|
|
create table country_codes as
|
|
select distinct country_code from geonames
|
|
where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS', 'TERR')
|
|
''',
|
|
)
|
|
|
|
proper_countries_create_table = (
|
|
'drop table if exists proper_countries',
|
|
'''
|
|
create table proper_countries as
|
|
select * from geonames
|
|
where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
|
|
and country_code in (select country_code from country_codes)
|
|
''',
|
|
)
|
|
|
|
territories_create_table = (
|
|
'drop table if exists territories',
|
|
'''
|
|
create table territories as
|
|
select * from geonames where feature_code = 'TERR'
|
|
and country_code not in (select country_code from proper_countries);
|
|
''',
|
|
)
|
|
|
|
countries_create_table = (
|
|
'drop table if exists countries',
|
|
'''
|
|
create table countries as
|
|
select * from proper_countries
|
|
union
|
|
select * from territories;
|
|
''',
|
|
'create index country_geonames_id on countries (geonames_id)',
|
|
'create index conntry_country_code on countries (country_code)',
|
|
)
|
|
|
|
country_alises_create_table = (
|
|
'drop table if exists country_aliases',
|
|
'''
|
|
create table country_aliases as
|
|
select name, country_code
|
|
from countries
|
|
union
|
|
select alternate_name, country_code
|
|
from alternate_names an
|
|
join countries c
|
|
on c.geonames_id = an.geonames_id
|
|
where alternate_name != ''
|
|
and iso_language not in ('doi','faac','iata',
|
|
'icao','link','post','tcid')
|
|
'''
|
|
)
|
|
|
|
country_table_create_statements = list(chain(country_codes_create_table,
|
|
proper_countries_create_table,
|
|
territories_create_table,
|
|
countries_create_table,
|
|
country_alises_create_table))
|
|
|
|
|
|
def create_table(conn, table):
|
|
cursor = conn.cursor()
|
|
create_statements = geonames_ddl[table]
|
|
cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
|
|
for statement in create_statements:
|
|
cursor.execute(statement)
|
|
conn.commit()
|
|
|
|
|
|
def batch_iter(iterable, batch_size):
|
|
source_iter = iter(iterable)
|
|
while True:
|
|
batch = list(islice(source_iter, batch_size))
|
|
if len(batch) > 0:
|
|
yield batch
|
|
else:
|
|
return
|
|
|
|
|
|
def populate_admin_table(conn, admin_level):
|
|
logging.info('Doing admin level {}'.format(admin_level))
|
|
|
|
columns = ['geonames_id',
|
|
'admin{}_code'.format(admin_level),
|
|
'name',
|
|
'country_code']
|
|
columns.extend(['admin{}_code'.format(i)
|
|
for i in xrange(1, admin_level)])
|
|
|
|
admin_insert_statement = '''
|
|
insert into "admin{}_codes"
|
|
select {}
|
|
from geonames
|
|
where feature_code = "ADM{}"
|
|
'''.format(admin_level, ','.join(columns), admin_level)
|
|
|
|
conn.execute(admin_insert_statement)
|
|
conn.commit()
|
|
|
|
logging.info('Done with admin level {}'.format(admin_level))
|
|
|
|
|
|
def import_geonames_table(conn, table, f, batch_size=2000):
|
|
# escape the brackets around the values format string so we can use later
|
|
statement = 'INSERT INTO "{}" VALUES ({{}})'.format(table)
|
|
cursor = conn.cursor()
|
|
for i, batch in enumerate(batch_iter(f, batch_size)):
|
|
num_cols = len(batch[0])
|
|
cursor.executemany(statement.format(','.join(['?'] * num_cols)), batch)
|
|
conn.commit()
|
|
cursor = conn.cursor()
|
|
logging.info('imported {} batches ({} records)'.format(i + 1, (i + 1) * batch_size))
|
|
cursor.close()
|
|
|
|
|
|
def create_geonames_sqlite_db(temp_dir, db_file=DEFAULT_GEONAMES_DB_PATH):
|
|
conn = sqlite3.connect(db_file)
|
|
logging.info('Created database at {}'.format(db_file))
|
|
for url, directory, is_gzipped, filename in GEONAMES_FILES:
|
|
table = geonames_file_table_map[(directory, filename)]
|
|
create_table(conn, table)
|
|
full_url = urlparse.urljoin(url, filename)
|
|
dest_dir = os.path.join(temp_dir, directory)
|
|
ensure_dir(dest_dir)
|
|
dest_file = os.path.join(dest_dir, filename)
|
|
download_file(full_url, dest_file)
|
|
if is_gzipped:
|
|
unzip_file(dest_file, dest_dir)
|
|
filename = dest_file.replace('.zip', '.txt')
|
|
reader = csv.reader(open(filename), delimiter='\t', quotechar=None)
|
|
lines = (map(safe_decode, line) for line in reader)
|
|
import_geonames_table(conn, table, lines)
|
|
logging.info('Creating countries tables')
|
|
for statement in country_table_create_statements:
|
|
conn.execute(statement)
|
|
conn.commit()
|
|
logging.info('Creating admin tables')
|
|
for admin_level in xrange(1, 5):
|
|
create_table(conn, 'admin{}_codes'.format(admin_level))
|
|
populate_admin_table(conn, admin_level)
|
|
conn.close()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
# Handle argument parsing here
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument('-t', '--temp-dir',
|
|
default=tempfile.gettempdir(),
|
|
help='Temporary work directory')
|
|
parser.add_argument('-o', '--out',
|
|
default=DEFAULT_GEONAMES_DB_PATH,
|
|
help='SQLite3 db filename')
|
|
args = parser.parse_args()
|
|
create_geonames_sqlite_db(args.temp_dir, args.out)
|