[fix] Capturing GeoNames canonical and alternate names with a UNION ALL query, creating C headers with the field orderings for parsing the TSV file downstream
This commit is contained in:
@@ -51,36 +51,44 @@ geonames_admin_dictionaries = {
|
|||||||
boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
|
boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Append new fields to the end for compatibility
|
# Inserted post-query
|
||||||
|
DUMMY_BOUNDARY_TYPE = '-1 as type'
|
||||||
|
|
||||||
|
|
||||||
|
class GeonamesField(object):
|
||||||
|
def __init__(self, name, c_constant, default=None, is_dummy=False):
|
||||||
|
self.name = name
|
||||||
|
self.c_constant = c_constant
|
||||||
|
self.default = default
|
||||||
|
self.is_dummy = is_dummy
|
||||||
|
|
||||||
geonames_fields = [
|
geonames_fields = [
|
||||||
'ifnull(alternate_name, gn.name) as alternate_name',
|
# Field if alternate_names present, default field name if not, C header constant
|
||||||
'gn.geonames_id as geonames_id',
|
GeonamesField('gn.geonames_id as geonames_id', 'GEONAMES_ID'),
|
||||||
'gn.name as name',
|
GeonamesField('gn.name as canonical', 'GEONAMES_CANONICAL'),
|
||||||
'iso_language',
|
GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True),
|
||||||
'is_preferred_name',
|
GeonamesField('alternate_name', 'GEONAMES_NAME', default='gn.name'),
|
||||||
'population',
|
GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"),
|
||||||
'latitude',
|
GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'),
|
||||||
'longitude',
|
GeonamesField('population', 'GEONAMES_POPULATION'),
|
||||||
'feature_code',
|
GeonamesField('latitude', 'GEONAMES_LATITUDE'),
|
||||||
'gn.country_code as country_code',
|
GeonamesField('longitude', 'GEONAMES_LONGITUDE'),
|
||||||
'gn.admin1_code as admin1_code',
|
GeonamesField('feature_code', 'GEONAMES_FEATURE_CODE'),
|
||||||
'a1.geonames_id as a1_gn_id',
|
GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'),
|
||||||
'gn.admin2_code as admin2_code',
|
GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'),
|
||||||
'a2.geonames_id as a2_gn_id',
|
GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'),
|
||||||
'gn.admin3_code as admin3_code',
|
GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'),
|
||||||
'a3.geonames_id as a3_gn_id',
|
GeonamesField('a2.geonames_id as a2_gn_id', 'GEONAMES_ADMIN2_ID'),
|
||||||
'gn.admin4_code as admin4_code',
|
GeonamesField('gn.admin3_code as admin3_code', 'GEONAMES_ADMIN3_CODE'),
|
||||||
'a4.geonames_id as a4_gn_id',
|
GeonamesField('a3.geonames_id as a3_gn_id', 'GEONAMES_ADMIN3_ID'),
|
||||||
|
GeonamesField('gn.admin4_code as admin4_code', 'GEONAMES_ADMIN4_CODE'),
|
||||||
|
GeonamesField('a4.geonames_id as a4_gn_id', 'GEONAMES_ADMIN4_ID'),
|
||||||
]
|
]
|
||||||
|
|
||||||
base_geonames_query = '''
|
DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields)
|
||||||
select {fields}
|
if f.is_dummy][0]
|
||||||
from geonames gn
|
|
||||||
left join alternate_names an
|
geonames_admin_joins = '''
|
||||||
on gn.geonames_id = an.geonames_id
|
|
||||||
and iso_language not in ('doi','faac','iata',
|
|
||||||
'icao','link','post','tcid')
|
|
||||||
left join admin1_codes a1
|
left join admin1_codes a1
|
||||||
on a1.code = gn.admin1_code
|
on a1.code = gn.admin1_code
|
||||||
and a1.country_code = gn.country_code
|
and a1.country_code = gn.country_code
|
||||||
@@ -99,21 +107,49 @@ left join admin4_codes a4
|
|||||||
and a4.admin2_code = gn.admin2_code
|
and a4.admin2_code = gn.admin2_code
|
||||||
and a4.admin3_code = gn.admin3_code
|
and a4.admin3_code = gn.admin3_code
|
||||||
and a4.country_code = gn.country_code
|
and a4.country_code = gn.country_code
|
||||||
{predicate}'''
|
'''
|
||||||
|
|
||||||
|
# Canonical names are stored in the geonames table with alternates
|
||||||
|
# stored in a separate table. UNION ALL query will capture them all.
|
||||||
|
|
||||||
|
base_geonames_query = '''
|
||||||
|
select {geonames_fields}
|
||||||
|
from geonames gn
|
||||||
|
{admin_joins}
|
||||||
|
{{predicate}}
|
||||||
|
union all
|
||||||
|
select {alt_name_fields}
|
||||||
|
from geonames gn
|
||||||
|
join alternate_names an
|
||||||
|
on an.geonames_id = gn.geonames_id
|
||||||
|
and iso_language not in ('doi','faac','iata',
|
||||||
|
'icao','link','post','tcid')
|
||||||
|
{admin_joins}
|
||||||
|
{{predicate}}
|
||||||
|
'''.format(
|
||||||
|
geonames_fields=', '.join((f.name if f.default is None else
|
||||||
|
'{} as {}'.format(f.default, f.name)
|
||||||
|
for f in geonames_fields)),
|
||||||
|
alt_name_fields=', '.join((f.name for f in geonames_fields)),
|
||||||
|
admin_joins=geonames_admin_joins
|
||||||
|
)
|
||||||
|
|
||||||
IGNORE_COUNTRY_POSTAL_CODES = set([
|
IGNORE_COUNTRY_POSTAL_CODES = set([
|
||||||
'AR', # GeoNames has pre-1999 postal codes
|
'AR', # GeoNames has pre-1999 postal codes
|
||||||
])
|
])
|
||||||
|
|
||||||
|
postal_code_fields = [
|
||||||
|
GeonamesField('postal_code', 'GN_POSTAL_CODE'),
|
||||||
|
GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'),
|
||||||
|
GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'),
|
||||||
|
GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'),
|
||||||
|
GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'),
|
||||||
|
GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'),
|
||||||
|
]
|
||||||
|
|
||||||
postal_codes_query = '''
|
postal_codes_query = '''
|
||||||
select
|
select
|
||||||
postal_code,
|
{fields}
|
||||||
p.country_code as country_code,
|
|
||||||
n.geonames_id is not null as have_containing_geoname,
|
|
||||||
n.geonames_id as containing_geoname_id,
|
|
||||||
group_concat(distinct a1.geonames_id) admin1_ids,
|
|
||||||
group_concat(distinct a2.geonames_id) admin2_ids,
|
|
||||||
group_concat(distinct a3.geonames_id) admin3_ids
|
|
||||||
from postal_codes p
|
from postal_codes p
|
||||||
left join (
|
left join (
|
||||||
select
|
select
|
||||||
@@ -140,11 +176,13 @@ left join admin3_codes a3
|
|||||||
and a3.admin1_code = p.admin1_code
|
and a3.admin1_code = p.admin1_code
|
||||||
and a3.admin2_code = p.admin2_code
|
and a3.admin2_code = p.admin2_code
|
||||||
and a3.country_code = p.country_code
|
and a3.country_code = p.country_code
|
||||||
where p.country_code not in ({codes})
|
where p.country_code not in ({exclude_country_codes})
|
||||||
group by postal_code, p.country_code
|
group by postal_code, p.country_code
|
||||||
'''.format(codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
|
'''.format(
|
||||||
|
fields=','.join([f.name for f in postal_code_fields]),
|
||||||
|
exclude_country_codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
|
||||||
|
|
||||||
BATCH_SIZE = 10000
|
BATCH_SIZE = 2000
|
||||||
|
|
||||||
|
|
||||||
def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
||||||
@@ -164,18 +202,20 @@ def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
|||||||
predicate = 'where gn.geonames_id in (select geonames_id from countries)'
|
predicate = 'where gn.geonames_id in (select geonames_id from countries)'
|
||||||
|
|
||||||
query = base_geonames_query.format(
|
query = base_geonames_query.format(
|
||||||
fields=','.join(geonames_fields),
|
|
||||||
predicate=predicate
|
predicate=predicate
|
||||||
)
|
)
|
||||||
|
|
||||||
cursor = db.execute(query)
|
cursor = db.execute(query)
|
||||||
while True:
|
while True:
|
||||||
batch = cursor.fetchmany(BATCH_SIZE)
|
batch = cursor.fetchmany(BATCH_SIZE)
|
||||||
if not batch:
|
if not batch:
|
||||||
break
|
break
|
||||||
rows = [
|
rows = []
|
||||||
[str(boundary_type)] + [safe_encode(val or '') for val in row]
|
for row in batch:
|
||||||
for row in batch
|
row = [safe_encode(val or '') for val in row]
|
||||||
]
|
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
|
||||||
|
rows.append(row)
|
||||||
|
|
||||||
writer.writerows(rows)
|
writer.writerows(rows)
|
||||||
cursor.close()
|
cursor.close()
|
||||||
f.flush()
|
f.flush()
|
||||||
@@ -206,6 +246,37 @@ def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR):
|
|||||||
f.close()
|
f.close()
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
# Generates a C header telling us the order of the fields as written
|
||||||
|
GEONAMES_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
|
'src', 'geonames_fields.h')
|
||||||
|
|
||||||
|
GEONAMES_FIELDS_HEADER_FILE = '''enum geonames_fields {{
|
||||||
|
{fields}
|
||||||
|
NUM_GEONAMES_FIELDS
|
||||||
|
}};
|
||||||
|
'''.format(fields=''',
|
||||||
|
'''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(geonames_fields)]))
|
||||||
|
|
||||||
|
|
||||||
|
def write_geonames_fields_header(filename=GEONAMES_FIELDS_HEADER):
|
||||||
|
with open(filename, 'w') as f:
|
||||||
|
f.write(GEONAMES_FIELDS_HEADER_FILE)
|
||||||
|
|
||||||
|
POSTAL_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
|
'src', 'postal_fields.h')
|
||||||
|
|
||||||
|
POSTAL_FIELDS_HEADER_FILE = '''enum gn_postal_fields {{
|
||||||
|
{fields}
|
||||||
|
NUM_POSTAL_FIELDS
|
||||||
|
}};
|
||||||
|
'''.format(fields=''',
|
||||||
|
'''.join(['{}={},'.format(f.c_constant, i) for i, f in enumerate(postal_code_fields)]))
|
||||||
|
|
||||||
|
|
||||||
|
def write_postal_fields_header(filename=POSTAL_FIELDS_HEADER):
|
||||||
|
with open(filename, 'w') as f:
|
||||||
|
f.write(POSTAL_FIELDS_HEADER_FILE)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# Handle argument parsing here
|
# Handle argument parsing here
|
||||||
@@ -218,3 +289,5 @@ if __name__ == '__main__':
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
create_geonames_tsv(args.db, args.out)
|
create_geonames_tsv(args.db, args.out)
|
||||||
create_postal_codes_tsv(args.db, args.out)
|
create_postal_codes_tsv(args.db, args.out)
|
||||||
|
write_geonames_fields_header()
|
||||||
|
write_postal_fields_header()
|
||||||
|
|||||||
Reference in New Issue
Block a user