[geodisambig] Python scripts to prep GeoNames records for trie insertion
This commit is contained in:
0
scripts/geodata/__init__.py
Normal file
0
scripts/geodata/__init__.py
Normal file
34
scripts/geodata/encoding.py
Normal file
34
scripts/geodata/encoding.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import six
|
||||
|
||||
text_type = six.text_type
|
||||
string_types = six.string_types
|
||||
binary_type = six.binary_type
|
||||
|
||||
|
||||
def safe_decode(value, encoding='utf-8', errors='strict'):
    """Coerce *value* to a unicode (text) string.

    Text is returned untouched; byte strings are decoded with *encoding*;
    any other object is first coerced to the binary type, then decoded.
    """
    if isinstance(value, text_type):
        # Already text -- nothing to decode.
        return value

    if not isinstance(value, (string_types, binary_type)):
        # Arbitrary object: coerce to bytes first, then decode below.
        value = binary_type(value)
    return value.decode(encoding, errors)
|
||||
|
||||
|
||||
def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
    """Coerce *value* to a byte string encoded as *encoding*.

    :param value: object to encode; text is encoded directly, bytes are
        transcoded if necessary, anything else is coerced to bytes.
    :param incoming: encoding the input bytes are assumed to be in
        (defaults to *encoding* when not given).
    :param encoding: target encoding for the returned bytes.
    :param errors: error handler passed to encode/decode.
    """
    if not isinstance(value, (string_types, binary_type)):
        # Non-string object: rely on the binary type's constructor
        # (str() on Python 2) to produce a byte representation.
        return binary_type(value)

    if isinstance(value, text_type):
        return value.encode(encoding, errors)
    else:
        # Normalize encoding names for the comparison below.
        if hasattr(incoming, 'lower'):
            incoming = incoming.lower()
        if hasattr(encoding, 'lower'):
            encoding = encoding.lower()

        if value and encoding != incoming:
            # Bug fix: decode using the *source* encoding (incoming), not
            # the target encoding. Fall back to `encoding` when the caller
            # did not specify what the bytes are encoded as, which
            # preserves the original behavior for incoming=None.
            value = safe_decode(value, incoming or encoding, errors)
            return value.encode(encoding, errors)
        else:
            # Empty value, or already in the target encoding.
            return value
|
||||
15
scripts/geodata/file_utils.py
Normal file
15
scripts/geodata/file_utils.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
|
||||
def unzip_file(filename, dest_dir):
    """Extract a .zip archive into *dest_dir* via the system `unzip` tool."""
    # -o overwrites existing files without prompting; -d sets the target dir.
    command = ['unzip', '-o', filename, '-d', dest_dir]
    subprocess.call(command)
|
||||
|
||||
|
||||
def remove_file(filename):
    """Delete a single file from disk."""
    # os.remove is the same function as os.unlink.
    os.remove(filename)
|
||||
|
||||
|
||||
def ensure_dir(d):
    """Create directory *d* (including parents) if it does not exist.

    Fixes the TOCTOU race in the original exists()-then-makedirs form:
    if another process created the directory between the two calls,
    os.makedirs would raise. We attempt creation unconditionally and
    only re-raise when the directory genuinely could not be made.
    """
    try:
        os.makedirs(d)
    except OSError:
        # Either the directory already existed (fine), or creation truly
        # failed (permissions, a file in the way) -- re-raise in that case.
        if not os.path.isdir(d):
            raise
|
||||
0
scripts/geodata/geonames/__init__.py
Normal file
0
scripts/geodata/geonames/__init__.py
Normal file
220
scripts/geodata/geonames/create_geonames_tsv.py
Normal file
220
scripts/geodata/geonames/create_geonames_tsv.py
Normal file
@@ -0,0 +1,220 @@
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.file_utils import *
|
||||
from geodata.encoding import safe_encode
|
||||
from geodata.geonames.geonames_sqlite import DEFAULT_GEONAMES_DB_PATH
|
||||
|
||||
# Default output directory for the generated TSVs: <repo>/data/geonames
DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
                                os.path.pardir, 'data', 'geonames')

# GeoNames feature codes for country-level political entities
# (political entity, independent, section-of, dependent, freely
# associated, semi-independent).
COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
CONTINENT_FEATURE_CODES = ('CONT',)

# First- through fourth-order administrative divisions.
ADMIN_1_FEATURE_CODES = ('ADM1',)
ADMIN_2_FEATURE_CODES = ('ADM2',)
ADMIN_3_FEATURE_CODES = ('ADM3',)
ADMIN_4_FEATURE_CODES = ('ADM4',)
# NOTE(review): OTHER_ADMIN_FEATURE_CODES is not referenced by the
# boundary-type mapping below -- presumably reserved for later use.
OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
# ADMD = administrative division of unspecified level.
ADMIN_OTHER_FEATURE_CODES = ('ADMD', )

# Populated-place codes: cities/towns plus capitals (PPLC), seats of
# admin divisions (PPLA..PPLA4), farms, villages, etc.
POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
                                 'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
                                 'PPLR', 'PPLS', 'STLMT')
# PPLX = section of populated place, used here as "neighborhood".
NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
|
||||
|
||||
|
||||
class boundary_types:
    """Integer ids for each level of administrative boundary.

    The id is written as the first column of geonames.tsv (see
    create_geonames_tsv below), so these values are part of the output
    format -- do not renumber.
    """
    COUNTRY = 0
    ADMIN1 = 1
    ADMIN2 = 2
    ADMIN3 = 3
    ADMIN4 = 4
    ADMIN_OTHER = 5
    LOCALITY = 6
    NEIGHBORHOOD = 7
|
||||
|
||||
# Maps each boundary_types id to the GeoNames feature codes it covers.
# Iterated by create_geonames_tsv to emit one query per boundary type.
# Note: CONTINENT_FEATURE_CODES and OTHER_ADMIN_FEATURE_CODES are not
# included in this mapping.
geonames_admin_dictionaries = {
    boundary_types.COUNTRY: COUNTRY_FEATURE_CODES,
    boundary_types.ADMIN1: ADMIN_1_FEATURE_CODES,
    boundary_types.ADMIN2: ADMIN_2_FEATURE_CODES,
    boundary_types.ADMIN3: ADMIN_3_FEATURE_CODES,
    boundary_types.ADMIN4: ADMIN_4_FEATURE_CODES,
    boundary_types.ADMIN_OTHER: ADMIN_OTHER_FEATURE_CODES,
    boundary_types.LOCALITY: POPULATED_PLACE_FEATURE_CODES,
    boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
}
|
||||
|
||||
# Append new fields to the end for compatibility

# Column expressions for the main export query. Output column order is
# the TSV column order, so new fields must only ever be appended.
geonames_fields = [
    # Fall back to the canonical name when a row has no alternate name.
    'ifnull(alternate_name, gn.name) as alternate_name',
    'gn.geonames_id as geonames_id',
    'gn.name as name',
    'iso_language',
    'is_preferred_name',
    'population',
    'latitude',
    'longitude',
    'feature_code',
    'gn.country_code as country_code',
    'gn.admin1_code as admin1_code',
    'a1.geonames_id as a1_gn_id',
    'gn.admin2_code as admin2_code',
    'a2.geonames_id as a2_gn_id',
    'gn.admin3_code as admin3_code',
    'a3.geonames_id as a3_gn_id',
    'gn.admin4_code as admin4_code',
    'a4.geonames_id as a4_gn_id',
]

# One output row per (place, alternate name). The iso_language exclusions
# drop GeoNames pseudo-languages that hold non-name data (postal codes,
# links, airport codes, etc.). The admin joins resolve each admin code to
# its geonames_id; admin codes are only unique within their parent
# division, hence the compound join conditions at each level.
base_geonames_query = '''
select {fields}
from geonames gn
left join alternate_names an
    on gn.geonames_id = an.geonames_id
    and iso_language not in ('doi','faac','iata',
                             'icao','link','post','tcid')
left join admin1_codes a1
    on a1.code = gn.admin1_code
    and a1.country_code = gn.country_code
left join admin2_codes a2
    on a2.code = gn.admin2_code
    and a2.admin1_code = gn.admin1_code
    and a2.country_code = gn.country_code
left join admin3_codes a3
    on a3.code = gn.admin3_code
    and a3.admin1_code = gn.admin1_code
    and a3.admin2_code = gn.admin2_code
    and a3.country_code = gn.country_code
left join admin4_codes a4
    on a4.code = gn.admin4_code
    and a4.admin1_code = gn.admin1_code
    and a4.admin2_code = gn.admin2_code
    and a4.admin3_code = gn.admin3_code
    and a4.country_code = gn.country_code
{predicate}'''
|
||||
|
||||
# Countries whose GeoNames postal data is known-bad and must be skipped.
IGNORE_COUNTRY_POSTAL_CODES = set([
    'AR',  # GeoNames has pre-1999 postal codes
])

# One row per (postal_code, country). The subquery `n` matches places
# whose alternate name in the pseudo-language 'post' equals the postal
# code, i.e. a geoname considered to "contain" that code. The admin
# joins collect the distinct admin area ids a postal code maps to
# (comma-separated via group_concat).
postal_codes_query = '''
select
    postal_code,
    p.country_code as country_code,
    n.geonames_id is not null as have_containing_geoname,
    n.geonames_id as containing_geoname_id,
    group_concat(distinct a1.geonames_id) admin1_ids,
    group_concat(distinct a2.geonames_id) admin2_ids,
    group_concat(distinct a3.geonames_id) admin3_ids
from postal_codes p
left join (
    select
        gn.geonames_id,
        alternate_name,
        country_code,
        gn.name
    from alternate_names an
    join geonames gn
        on an.geonames_id = gn.geonames_id
    where iso_language = 'post'
) as n
    on p.postal_code = n.alternate_name
    and p.country_code = n.country_code
left join admin1_codes a1
    on a1.code = p.admin1_code
    and p.country_code = a1.country_code
left join admin2_codes a2
    on a2.code = p.admin2_code
    and a2.admin1_code = p.admin1_code
    and a2.country_code = p.country_code
left join admin3_codes a3
    on a3.code = p.admin3_code
    and a3.admin1_code = p.admin1_code
    and a3.admin2_code = p.admin2_code
    and a3.country_code = p.country_code
where p.country_code not in ({codes})
group by postal_code, p.country_code
'''.format(codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))

# Rows fetched per cursor.fetchmany() call when streaming query results.
BATCH_SIZE = 10000
|
||||
|
||||
|
||||
def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
    """Export admin boundaries and localities to <out_dir>/geonames.tsv.

    Each output row is the boundary_types id followed by the columns in
    geonames_fields, UTF-8 encoded, with SQL NULLs written as ''.

    :param db_path: path to the SQLite database built by geonames_sqlite.
    :param out_dir: directory the TSV is written into.
    """
    db = sqlite3.connect(db_path)

    filename = 'geonames.tsv'
    # `with` guarantees the file is flushed and closed even when a query
    # raises (the original leaked the handle on error).
    with open(os.path.join(out_dir, filename), 'w') as f:
        writer = csv.writer(f, delimiter='\t')

        # .items() behaves identically to .iteritems() here on Python 2
        # and keeps the script importable on Python 3.
        for boundary_type, codes in geonames_admin_dictionaries.items():
            if boundary_type != boundary_types.COUNTRY:
                predicate = 'where gn.feature_code in ({codes})'.format(
                    codes=','.join(['"{}"'.format(c) for c in codes])
                )
            else:
                # The query for countries in GeoNames is somewhat
                # non-trivial, so it is precomputed into the `countries`
                # table by geonames_sqlite.
                predicate = 'where gn.geonames_id in (select geonames_id from countries)'

            query = base_geonames_query.format(
                fields=','.join(geonames_fields),
                predicate=predicate
            )
            cursor = db.execute(query)
            # Stream results in batches to bound memory usage.
            while True:
                batch = cursor.fetchmany(BATCH_SIZE)
                if not batch:
                    break
                rows = [
                    [str(boundary_type)] + [safe_encode(val or '') for val in row]
                    for row in batch
                ]
                writer.writerows(rows)
            cursor.close()
    db.close()
|
||||
|
||||
|
||||
def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
    """Export postal codes and their admin areas to <out_dir>/postal_codes.tsv.

    Runs postal_codes_query and writes the result set UTF-8 encoded,
    with SQL NULLs written as ''.

    :param db_path: path to the SQLite database built by geonames_sqlite.
    :param out_dir: directory the TSV is written into.
    """
    db = sqlite3.connect(db_path)

    filename = 'postal_codes.tsv'
    # `with` guarantees the file is flushed and closed even when the
    # query raises (the original leaked the handle on error).
    with open(os.path.join(out_dir, filename), 'w') as f:
        writer = csv.writer(f, delimiter='\t')

        cursor = db.execute(postal_codes_query)

        # Stream results in batches to bound memory usage.
        while True:
            batch = cursor.fetchmany(BATCH_SIZE)
            if not batch:
                break
            rows = [
                [safe_encode(val or '') for val in row]
                for row in batch
            ]
            writer.writerows(rows)

        cursor.close()
    db.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Handle argument parsing here
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--db',
                        default=DEFAULT_GEONAMES_DB_PATH,
                        help='SQLite db file')
    parser.add_argument('-o', '--out',
                        default=DEFAULT_DATA_DIR, help='output directory')
    args = parser.parse_args()
    # Write both TSVs from the same database in one run.
    create_geonames_tsv(args.db, args.out)
    create_postal_codes_tsv(args.db, args.out)
|
||||
340
scripts/geodata/geonames/geonames_sqlite.py
Normal file
340
scripts/geodata/geonames/geonames_sqlite.py
Normal file
@@ -0,0 +1,340 @@
|
||||
import os
|
||||
import shutil
|
||||
import sqlite3
|
||||
|
||||
import tempfile
|
||||
import urlparse
|
||||
import urllib2
|
||||
import subprocess
|
||||
|
||||
import logging
|
||||
|
||||
import argparse
|
||||
|
||||
import csv
|
||||
import sys
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
from geodata.file_utils import *
|
||||
from geodata.log import *
|
||||
|
||||
from itertools import islice, chain
|
||||
|
||||
# Configure root logging once at import time so all module loggers print.
log_to_stdout()
logger = logging.getLogger('geonames.sqlite')

GEONAMES_DB_NAME = 'geonames.db'

# Default location of the SQLite db: <repo>/data/geonames/geonames.db
DEFAULT_GEONAMES_DB_PATH = os.path.join(this_dir, os.path.pardir,
                                        os.path.pardir, os.path.pardir,
                                        'data', 'geonames', GEONAMES_DB_NAME)

BASE_URL = 'http://download.geonames.org/export/'

# Full-planet gazetteer dumps live under export/dump/.
DUMP_URL = urlparse.urljoin(BASE_URL, 'dump/')
ALL_COUNTRIES_ZIP_FILE = 'allCountries.zip'
HIERARCHY_ZIP_FILE = 'hierarchy.zip'
ALTERNATE_NAMES_ZIP_FILE = 'alternateNames.zip'

# Postal-code data lives under export/zip/ (same allCountries.zip name).
ZIP_URL = urlparse.urljoin(BASE_URL, 'zip/')

GEONAMES_DUMP_FILES = (ALL_COUNTRIES_ZIP_FILE,
                       HIERARCHY_ZIP_FILE,
                       ALTERNATE_NAMES_ZIP_FILE)

# base_url, local_dir, is_gzipped, local_filename
# NOTE(review): despite the flag name, these are .zip archives (handled
# by unzip_file), not gzip files.
GEONAMES_FILES = [(DUMP_URL, '', True, ALL_COUNTRIES_ZIP_FILE),
                  (DUMP_URL, '', True, HIERARCHY_ZIP_FILE),
                  (DUMP_URL, '', True, ALTERNATE_NAMES_ZIP_FILE),
                  (ZIP_URL, 'zip', True, ALL_COUNTRIES_ZIP_FILE),
                  ]
|
||||
|
||||
|
||||
def download_file(url, dest):
    """Fetch *url* over HTTP and write the response body to path *dest*."""
    logger.info('Downloading file from {}'.format(url))
    response = urllib2.urlopen(url)
    with open(dest, 'wb') as out:
        # Stream in chunks rather than holding the whole body in memory.
        shutil.copyfileobj(response, out)
|
||||
|
||||
|
||||
def admin_ddl(admin_level):
    """Build the DDL for the admin{N}_codes lookup table.

    Returns a tuple of SQL statements: a CREATE TABLE plus two index
    creations (on code and on geonames_id). The table carries one
    admin{i}_code column per ancestor level 1..N-1, because admin codes
    are only unique within their parent division.

    Bug fix: for admin_level == 1 the ancestor list is empty and the
    original template left a dangling comma after `country_code TEXT,`,
    producing invalid SQL. The column list is now joined in one pass so
    the statement is valid at every level.
    """
    # range() matches Python 2's xrange() here and also works on Python 3.
    ancestor_columns = ['admin{}_code TEXT'.format(i)
                       for i in range(1, admin_level)]

    columns = ['geonames_id INT',
               'code TEXT',
               'name TEXT',
               'country_code TEXT'] + ancestor_columns

    create = '''
    CREATE TABLE admin{level}_codes (
        {fields}
    )'''.format(level=admin_level,
                fields=''',
        '''.join(columns))

    indices = (
        '''CREATE INDEX admin{}_code_index ON
        admin{}_codes (code)'''.format(admin_level, admin_level),
        '''CREATE INDEX admin{}_gn_id_index ON
        admin{}_codes (geonames_id)'''.format(admin_level, admin_level),
    )

    return (create, ) + indices
|
||||
|
||||
# DDL per table: first statement is the CREATE TABLE, the rest are
# indexes. create_table() drops the table and runs these in order.
# Column order mirrors the GeoNames dump file column order, since rows
# are inserted positionally by import_geonames_table().
geonames_ddl = {
    'geonames': (
        '''CREATE TABLE geonames (
        geonames_id INT PRIMARY KEY,
        name TEXT,
        ascii_name TEXT,
        alternate_names TEXT,
        latitude DOUBLE,
        longitude DOUBLE,
        feature_class TEXT,
        feature_code TEXT,
        country_code TEXT,
        cc2 TEXT,
        admin1_code TEXT,
        admin2_code TEXT,
        admin3_code TEXT,
        admin4_code TEXT,
        population LONG,
        elevation INT,
        dem INT,
        timezone TEXT,
        modification_date TEXT);''',
        '''CREATE INDEX feature_code ON
        geonames (feature_code)''',
        '''CREATE INDEX country_code ON
        geonames (country_code)''',
        '''CREATE INDEX admin1_code ON
        geonames (admin1_code)''',
        '''CREATE INDEX admin2_code ON
        geonames (admin2_code)''',
        '''CREATE INDEX admin3_code ON
        geonames (admin3_code)''',
        '''CREATE INDEX admin4_code ON
        geonames (admin4_code)''',
    ),

    'alternate_names': (
        '''CREATE TABLE alternate_names (
        alternate_name_id INT PRIMARY KEY,
        geonames_id INT,
        iso_language TEXT,
        alternate_name TEXT,
        is_preferred_name BOOLEAN,
        is_short_name BOOLEAN,
        is_colloquial BOOLEAN,
        is_historic BOOLEAN)''',
        '''CREATE INDEX geonames_id_index ON
        alternate_names (geonames_id)''',
    ),

    # parent/child links between geonames (hierarchy.zip dump).
    'hierarchy': (
        '''CREATE TABLE hierarchy (
        parent_id INT,
        child_id INT,
        type TEXT
        );''',
        '''CREATE INDEX parent_child_index ON
        hierarchy (parent_id, child_id)''',
        '''CREATE INDEX child_parent_index ON
        hierarchy (child_id, parent_id)''',
    ),

    # From the export/zip/ postal-code dump (different schema from dump/).
    'postal_codes': (
        '''CREATE TABLE postal_codes (
        country_code TEXT,
        postal_code TEXT,
        place_name TEXT,
        admin1 TEXT,
        admin1_code TEXT,
        admin2 TEXT,
        admin2_code TEXT,
        admin3 TEXT,
        admin3_code TEXT,
        latitude DOUBLE,
        longitude DOUBLE,
        accuracy INT
        )''',
        '''CREATE INDEX post_code_index ON
        postal_codes (country_code, postal_code)''',
    ),
    # Derived lookup tables, built from `geonames` by populate_admin_table.
    'admin1_codes': admin_ddl(1),
    'admin2_codes': admin_ddl(2),
    'admin3_codes': admin_ddl(3),
    'admin4_codes': admin_ddl(4),

}
|
||||
|
||||
# Maps (local_dir, zip filename) from GEONAMES_FILES to the destination
# table. The directory disambiguates the two different allCountries.zip.
geonames_file_table_map = {
    ('', ALL_COUNTRIES_ZIP_FILE): 'geonames',
    ('', ALTERNATE_NAMES_ZIP_FILE): 'alternate_names',
    ('', HIERARCHY_ZIP_FILE): 'hierarchy',
    ('zip', ALL_COUNTRIES_ZIP_FILE): 'postal_codes',
}


# Every country code that has a country-level record (including TERR).
country_codes_create_table = (
    'drop table if exists country_codes',
    '''
    create table country_codes as
    select distinct country_code from geonames
    where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS', 'TERR')
    ''',
)

# Full rows for "proper" countries (non-territory political entities).
proper_countries_create_table = (
    'drop table if exists proper_countries',
    '''
    create table proper_countries as
    select * from geonames
    where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
    and country_code in (select country_code from country_codes)
    ''',
)

# Territories whose country code has no proper-country record
# (e.g. dependencies that stand in for a country).
territories_create_table = (
    'drop table if exists territories',
    '''
    create table territories as
    select * from geonames where feature_code = 'TERR'
    and country_code not in (select country_code from proper_countries);
    ''',
)

# Final country set used by create_geonames_tsv: proper countries plus
# the stand-in territories.
countries_create_table = (
    'drop table if exists countries',
    '''
    create table countries as
    select * from proper_countries
    union
    select * from territories;
    ''',
    'create index country_geonames_id on countries (geonames_id)',
)

# NOTE(review): 'alises' is a typo for 'aliases'; the name is kept
# because it is referenced below.
country_alises_create_table = (
    'drop table if exists country_aliases',
    '''
    create table country_aliases as
    select name, country_code
    from countries
    union
    select alternate_name, country_code
    from alternate_names an
    join countries c
        on c.geonames_id = an.geonames_id
    where alternate_name != ''
    and iso_language not in ('doi','faac','iata',
                             'icao','link','post','tcid')
    '''
)

# All derived-table statements, in dependency order.
country_table_create_statements = list(chain(country_codes_create_table,
                                             proper_countries_create_table,
                                             territories_create_table,
                                             countries_create_table,
                                             country_alises_create_table))
|
||||
|
||||
|
||||
def create_table(conn, table):
    """Drop *table* if present and recreate it (plus its indexes) from
    the DDL registered in geonames_ddl."""
    statements = geonames_ddl[table]
    cur = conn.cursor()
    cur.execute('DROP TABLE IF EXISTS {}'.format(table))
    for ddl in statements:
        cur.execute(ddl)
    conn.commit()
|
||||
|
||||
|
||||
def batch_iter(iterable, batch_size):
    """Yield successive lists of up to *batch_size* items from *iterable*.

    The final batch may be shorter; an empty iterable yields nothing.
    """
    source = iter(iterable)
    # iter(callable, sentinel): keep slicing off batches until an empty
    # list comes back, which is exactly the exhaustion condition.
    return iter(lambda: list(islice(source, batch_size)), [])
|
||||
|
||||
|
||||
def populate_admin_table(conn, admin_level):
    """Populate admin{N}_codes from the ADM{N} rows of the geonames table.

    Bug fix: the original selected columns in the order
    (admin{N}_code, name, country_code, admin1.., geonames_id), but the
    table created by admin_ddl() is declared as
    (geonames_id, code, name, country_code, admin1..). Because the
    INSERT is positional, every value landed one column off. The select
    list now matches the DDL column order.
    """
    logging.info('Doing admin level {}'.format(admin_level))

    # Order must match admin_ddl(): geonames_id, code, name,
    # country_code, then one ancestor code per level 1..N-1.
    columns = ['geonames_id',
               'admin{}_code'.format(admin_level),
               'name', 'country_code']
    # range() matches Python 2's xrange() and also works on Python 3.
    columns.extend(['admin{}_code'.format(i)
                    for i in range(1, admin_level)])

    admin_insert_statement = '''
    insert into "admin{}_codes"
    select {}
    from geonames
    where feature_code = "ADM{}"
    '''.format(admin_level, ','.join(columns), admin_level)

    conn.execute(admin_insert_statement)
    conn.commit()

    logging.info('Done with admin level {}'.format(admin_level))
|
||||
|
||||
|
||||
def import_geonames_table(conn, table, f, batch_size=2000):
    """Bulk-insert rows from iterable *f* into *table* in batches.

    :param conn: open sqlite3 connection.
    :param table: destination table name (inserted positionally).
    :param f: iterable of row sequences; all rows are assumed to have
        the same number of columns as the first row of each batch.
    :param batch_size: rows per executemany/commit cycle.
    """
    # escape the brackets around the values format string so we can use later
    statement = 'INSERT INTO "{}" VALUES ({{}})'.format(table)
    cursor = conn.cursor()
    for i, batch in enumerate(batch_iter(f, batch_size)):
        # One '?' placeholder per column in this batch's rows.
        num_cols = len(batch[0])
        cursor.executemany(statement.format(','.join(['?'] * num_cols)), batch)
        conn.commit()
        # Commit does not invalidate the cursor, so reuse it instead of
        # recreating one per batch as the original did.
        logging.info('imported {} batches ({} records)'.format(i + 1, (i + 1) * batch_size))
    cursor.close()
|
||||
|
||||
|
||||
def create_geonames_sqlite_db(temp_dir, db_file=DEFAULT_GEONAMES_DB_PATH):
    """Download the GeoNames dumps and load them into a SQLite database.

    For each (url, subdir, zipped, filename) entry in GEONAMES_FILES:
    download into *temp_dir*, unzip, and bulk-import the extracted .txt
    into its table; then build the derived countries tables and the
    admin{1..4}_codes lookup tables.

    NOTE(review): written for Python 2 -- `map` must return a list here
    (import_geonames_table takes len(batch[0])) and xrange is used below.
    """
    conn = sqlite3.connect(db_file)
    logging.info('Created database at {}'.format(db_file))
    for url, directory, is_gzipped, filename in GEONAMES_FILES:
        table = geonames_file_table_map[(directory, filename)]
        # Drop and recreate the destination table before importing.
        create_table(conn, table)
        full_url = urlparse.urljoin(url, filename)
        dest_dir = os.path.join(temp_dir, directory)
        ensure_dir(dest_dir)
        dest_file = os.path.join(dest_dir, filename)
        download_file(full_url, dest_file)
        if is_gzipped:
            # The archives extract to the same basename with .txt.
            unzip_file(dest_file, dest_dir)
            filename = dest_file.replace('.zip', '.txt')
        # quotechar=None: GeoNames TSVs are unquoted; quotes are data.
        reader = csv.reader(open(filename), delimiter='\t', quotechar=None)
        # Decode every field to unicode lazily, one row at a time.
        lines = (map(safe_decode, line) for line in reader)
        import_geonames_table(conn, table, lines)
    logging.info('Creating countries tables')
    for statement in country_table_create_statements:
        conn.execute(statement)
    conn.commit()
    logging.info('Creating admin tables')
    for admin_level in xrange(1, 5):
        create_table(conn, 'admin{}_codes'.format(admin_level))
        populate_admin_table(conn, admin_level)
    conn.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Handle argument parsing here
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--temp-dir',
                        default=tempfile.gettempdir(),
                        help='Temporary work directory')
    parser.add_argument('-o', '--out',
                        default=DEFAULT_GEONAMES_DB_PATH,
                        help='SQLite3 db filename')
    args = parser.parse_args()
    # Downloads the full GeoNames dumps -- this takes a while.
    create_geonames_sqlite_db(args.temp_dir, args.out)
|
||||
10
scripts/geodata/log.py
Normal file
10
scripts/geodata/log.py
Normal file
@@ -0,0 +1,10 @@
|
||||
import logging
|
||||
import sys
|
||||
|
||||
|
||||
def log_to_stdout(level=logging.INFO):
    """Attach a stdout handler with a timestamped format to the root
    logger and set the root log level."""
    fmt = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]: %(message)s')
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(fmt)
    root = logging.root
    root.addHandler(stream_handler)
    root.setLevel(level)
|
||||
Reference in New Issue
Block a user