From ab1fb3669fb7854e06cca94f9fe6834f5c41754b Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 19 Jun 2015 14:21:20 -0500 Subject: [PATCH] [geonames] Only take alternative names that differ from the canonical name; sort by name, population desc, geonames_id --- scripts/geodata/geonames/create_geonames_tsv.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/scripts/geodata/geonames/create_geonames_tsv.py b/scripts/geodata/geonames/create_geonames_tsv.py index 9c905993..867d4d97 100644 --- a/scripts/geodata/geonames/create_geonames_tsv.py +++ b/scripts/geodata/geonames/create_geonames_tsv.py @@ -103,6 +103,9 @@ geonames_fields = [ DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields) if f.is_dummy][0] +GEONAMES_ID_INDEX = [i for i, f in enumerate(geonames_fields) + if f.c_constant == 'GEONAMES_ID'][0] + CANONICAL_NAME_INDEX = [i for i, f in enumerate(geonames_fields) if f.c_constant == 'GEONAMES_CANONICAL'][0] @@ -112,6 +115,9 @@ NAME_INDEX = [i for i, f in enumerate(geonames_fields) COUNTRY_CODE_INDEX = [i for i, f in enumerate(geonames_fields) if f.c_constant == 'GEONAMES_COUNTRY_CODE'][0] +POPULATION_INDEX = [i for i, f in enumerate(geonames_fields) + if f.c_constant == 'GEONAMES_POPULATION'][0] + geonames_admin_joins = ''' left join admin1_codes a1 @@ -149,6 +155,7 @@ join alternate_names an on an.geonames_id = gn.geonames_id and iso_language not in ('doi','faac','iata', 'icao','link','post','tcid') + and an.alternate_name != gn.name {admin_joins} {{predicate}} '''.format( @@ -301,7 +308,11 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR): f.close() logging.info('Sorting...') - subprocess.check_call(['sort', '-t\t', '-k1,1', '-k2,2', '-o', filename, temp_filename]) + subprocess.check_call(['sort', '-t\t', '-u', + '-k{0},{0}'.format(NAME_INDEX + 1), + '-k{0},{0}nr'.format(POPULATION_INDEX + 1), + '-k{0},{0}'.format(GEONAMES_ID_INDEX + 1), + '-o', filename, temp_filename]) os.unlink(temp_filename)