From ef1ecb97f779d62a4c0784aebd4f65578bef0059 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 8 Jul 2015 13:25:03 -0400 Subject: [PATCH] [geonames] Adding geonames_id for countries in places/postal codes. For postal codes, sorting desc by country population (10013 is a postal code in Italy but will default to US with no other information) --- .../geodata/geonames/create_geonames_tsv.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/scripts/geodata/geonames/create_geonames_tsv.py b/scripts/geodata/geonames/create_geonames_tsv.py index 72e099bd..d19dde72 100644 --- a/scripts/geodata/geonames/create_geonames_tsv.py +++ b/scripts/geodata/geonames/create_geonames_tsv.py @@ -105,11 +105,12 @@ geonames_fields = [ GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'), GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'), GeonamesField('is_historic', 'GEONAMES_IS_HISTORICAL', default='0'), - GeonamesField('population', 'GEONAMES_POPULATION'), - GeonamesField('latitude', 'GEONAMES_LATITUDE'), - GeonamesField('longitude', 'GEONAMES_LONGITUDE'), - GeonamesField('feature_code', 'GEONAMES_FEATURE_CODE'), + GeonamesField('gn.population', 'GEONAMES_POPULATION'), + GeonamesField('gn.latitude', 'GEONAMES_LATITUDE'), + GeonamesField('gn.longitude', 'GEONAMES_LONGITUDE'), + GeonamesField('gn.feature_code', 'GEONAMES_FEATURE_CODE'), GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'), + GeonamesField('c.geonames_id as country_gn_id', 'GEONAMES_COUNTRY_ID'), GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'), GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'), GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'), @@ -178,11 +179,15 @@ left join admin4_codes a4 base_geonames_query = ''' select {geonames_fields} from geonames gn +join countries c + on gn.country_code = c.country_code {admin_joins} {{predicate}} union all select {alt_name_fields} from geonames gn +join countries c + on gn.country_code = c.country_code join alternate_names an on an.geonames_id = gn.geonames_id and iso_language not in ('doi','faac','iata', @@ -205,16 +210,26 @@ IGNORE_COUNTRY_POSTAL_CODES = set([ postal_code_fields = [ GeonamesField('postal_code', 'GN_POSTAL_CODE'), GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'), + GeonamesField('c.geonames_id as country_geonames_id', 'GN_POSTAL_COUNTRY_GEONAMES_ID'), + GeonamesField('c.population as country_population', 'GN_POSTAL_COUNTRY_POPULATION'), GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'), GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'), GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'), GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'), ] +POSTAL_CODE_INDEX = [i for i, f in enumerate(postal_code_fields) + if f.c_constant == 'GN_POSTAL_CODE'][0] + +POSTAL_CODE_POP_INDEX = [i for i, f in enumerate(postal_code_fields) + if f.c_constant == 'GN_POSTAL_COUNTRY_POPULATION'][0] + postal_codes_query = ''' select {fields} from postal_codes p +join countries c + on p.country_code = c.country_code left join ( select gn.geonames_id, @@ -555,7 +570,14 @@ def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR): f.close() logging.info('Sorting...') - subprocess.check_call(['sort', '-t\t', '-k1,1', '-k2,2', '-o', filename, temp_filename]) + + subprocess.check_call([ + 'sort', '-t\t', '--ignore-case', + '-k{0},{0}'.format(POSTAL_CODE_INDEX + 1), + '-k{0},{0}nr'.format(POSTAL_CODE_POP_INDEX + 1), + '-o', filename, + temp_filename + ]) os.unlink(temp_filename) # Generates a C header telling us the order of the fields as written