diff --git a/scripts/geodata/geonames/create_geonames_tsv.py b/scripts/geodata/geonames/create_geonames_tsv.py index cdfec3a6..feee8f72 100644 --- a/scripts/geodata/geonames/create_geonames_tsv.py +++ b/scripts/geodata/geonames/create_geonames_tsv.py @@ -51,36 +51,44 @@ geonames_admin_dictionaries = { boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES, } -# Append new fields to the end for compatibility +# Inserted post-query +DUMMY_BOUNDARY_TYPE = '-1 as type' + + +class GeonamesField(object): + def __init__(self, name, c_constant, default=None, is_dummy=False): + self.name = name + self.c_constant = c_constant + self.default = default + self.is_dummy = is_dummy geonames_fields = [ - 'ifnull(alternate_name, gn.name) as alternate_name', - 'gn.geonames_id as geonames_id', - 'gn.name as name', - 'iso_language', - 'is_preferred_name', - 'population', - 'latitude', - 'longitude', - 'feature_code', - 'gn.country_code as country_code', - 'gn.admin1_code as admin1_code', - 'a1.geonames_id as a1_gn_id', - 'gn.admin2_code as admin2_code', - 'a2.geonames_id as a2_gn_id', - 'gn.admin3_code as admin3_code', - 'a3.geonames_id as a3_gn_id', - 'gn.admin4_code as admin4_code', - 'a4.geonames_id as a4_gn_id', + # Field if alternate_names present, default field name if not, C header constant + GeonamesField('gn.geonames_id as geonames_id', 'GEONAMES_ID'), + GeonamesField('gn.name as canonical', 'GEONAMES_CANONICAL'), + GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True), + GeonamesField('alternate_name', 'GEONAMES_NAME', default='gn.name'), + GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"), + GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'), + GeonamesField('population', 'GEONAMES_POPULATION'), + GeonamesField('latitude', 'GEONAMES_LATITUDE'), + GeonamesField('longitude', 'GEONAMES_LONGITUDE'), + GeonamesField('feature_code', 'GEONAMES_FEATURE_CODE'), + GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'), + GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'), + GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'), + GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'), + GeonamesField('a2.geonames_id as a2_gn_id', 'GEONAMES_ADMIN2_ID'), + GeonamesField('gn.admin3_code as admin3_code', 'GEONAMES_ADMIN3_CODE'), + GeonamesField('a3.geonames_id as a3_gn_id', 'GEONAMES_ADMIN3_ID'), + GeonamesField('gn.admin4_code as admin4_code', 'GEONAMES_ADMIN4_CODE'), + GeonamesField('a4.geonames_id as a4_gn_id', 'GEONAMES_ADMIN4_ID'), ] -base_geonames_query = ''' -select {fields} -from geonames gn -left join alternate_names an - on gn.geonames_id = an.geonames_id - and iso_language not in ('doi','faac','iata', - 'icao','link','post','tcid') +DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields) + if f.is_dummy][0] + +geonames_admin_joins = ''' left join admin1_codes a1 on a1.code = gn.admin1_code and a1.country_code = gn.country_code @@ -99,21 +107,49 @@ left join admin4_codes a4 and a4.admin2_code = gn.admin2_code and a4.admin3_code = gn.admin3_code and a4.country_code = gn.country_code -{predicate}''' +''' + +# Canonical names are stored in the geonames table with alternates +# stored in a separate table. UNION ALL query will capture them all. + +base_geonames_query = ''' +select {geonames_fields} +from geonames gn +{admin_joins} +{{predicate}} +union all +select {alt_name_fields} +from geonames gn +join alternate_names an + on an.geonames_id = gn.geonames_id + and iso_language not in ('doi','faac','iata', + 'icao','link','post','tcid') +{admin_joins} +{{predicate}} +'''.format( + geonames_fields=', '.join((f.name if f.default is None else + '{} as {}'.format(f.default, f.name) + for f in geonames_fields)), + alt_name_fields=', '.join((f.name for f in geonames_fields)), + admin_joins=geonames_admin_joins +) IGNORE_COUNTRY_POSTAL_CODES = set([ 'AR', # GeoNames has pre-1999 postal codes ]) +postal_code_fields = [ + GeonamesField('postal_code', 'GN_POSTAL_CODE'), + GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'), + GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'), + GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'), + GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'), + GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'), +] + postal_codes_query = ''' select -postal_code, -p.country_code as country_code, -n.geonames_id is not null as have_containing_geoname, -n.geonames_id as containing_geoname_id, -group_concat(distinct a1.geonames_id) admin1_ids, -group_concat(distinct a2.geonames_id) admin2_ids, -group_concat(distinct a3.geonames_id) admin3_ids +{fields} from postal_codes p left join ( select @@ -140,11 +176,13 @@ left join admin3_codes a3 and a3.admin1_code = p.admin1_code and a3.admin2_code = p.admin2_code and a3.country_code = p.country_code -where p.country_code not in ({codes}) +where p.country_code not in ({exclude_country_codes}) group by postal_code, p.country_code -'''.format(codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES)) +'''.format( + fields=','.join([f.name for f in postal_code_fields]), + exclude_country_codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES)) -BATCH_SIZE = 10000 +BATCH_SIZE = 2000 def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR): @@ -164,18 +202,20 @@ def create_geonames_tsv(db_path, out_dir=DEFAULT_DATA_DIR): predicate = 'where gn.geonames_id in (select geonames_id from countries)' query = base_geonames_query.format( - fields=','.join(geonames_fields), predicate=predicate ) + cursor = db.execute(query) while True: batch = cursor.fetchmany(BATCH_SIZE) if not batch: break - rows = [ - [str(boundary_type)] + [safe_encode(val or '') for val in row] - for row in batch - ] + rows = [] + for row in batch: + row = [safe_encode(val or '') for val in row] + row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type + rows.append(row) + writer.writerows(rows) cursor.close() f.flush() @@ -206,6 +246,37 @@ def create_postal_codes_tsv(db_path, out_dir=DEFAULT_DATA_DIR): f.close() db.close() +# Generates a C header telling us the order of the fields as written +GEONAMES_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, + 'src', 'geonames_fields.h') + +GEONAMES_FIELDS_HEADER_FILE = '''enum geonames_fields {{ + {fields} + NUM_GEONAMES_FIELDS +}}; +'''.format(fields=''', + '''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(geonames_fields)])) + + +def write_geonames_fields_header(filename=GEONAMES_FIELDS_HEADER): + with open(filename, 'w') as f: + f.write(GEONAMES_FIELDS_HEADER_FILE) + +POSTAL_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, + 'src', 'postal_fields.h') + +POSTAL_FIELDS_HEADER_FILE = '''enum gn_postal_fields {{ + {fields} + NUM_POSTAL_FIELDS +}}; +'''.format(fields=''', + '''.join(['{}={},'.format(f.c_constant, i) for i, f in enumerate(postal_code_fields)])) + + +def write_postal_fields_header(filename=POSTAL_FIELDS_HEADER): + with open(filename, 'w') as f: + f.write(POSTAL_FIELDS_HEADER_FILE) + if __name__ == '__main__': # Handle argument parsing here @@ -218,3 +289,5 @@ if __name__ == '__main__': args = parser.parse_args() create_geonames_tsv(args.db, args.out) create_postal_codes_tsv(args.db, args.out) + write_geonames_fields_header() + write_postal_fields_header()