[geonames] Only take alternate names that differ from the canonical name; sort by name, then population (descending), then geonames_id

This commit is contained in:
Al
2015-06-19 14:21:20 -05:00
parent bc306fc6c8
commit ab1fb3669f

View File

@@ -103,6 +103,9 @@ geonames_fields = [
# Zero-based column positions looked up from geonames_fields at import time.
# Each lookup builds the list of matching positions and takes the first one,
# so a missing field raises IndexError here rather than later.

# Position of the first field flagged as a dummy (f.is_dummy is truthy).
DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields)
if f.is_dummy][0]
# Position of the field whose c_constant is 'GEONAMES_ID'.
# This is zero-based; the sort(1) call in this file adds 1 when building
# its key spec.
GEONAMES_ID_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_ID'][0]
# Position of the field whose c_constant is 'GEONAMES_CANONICAL'.
CANONICAL_NAME_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_CANONICAL'][0]
@@ -112,6 +115,9 @@ NAME_INDEX = [i for i, f in enumerate(geonames_fields)
# Position (zero-based) in geonames_fields of the field whose c_constant is
# 'GEONAMES_COUNTRY_CODE'; IndexError is raised if no field matches.
COUNTRY_CODE_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_COUNTRY_CODE'][0]
# Position (zero-based) of the field whose c_constant is
# 'GEONAMES_POPULATION'. The sort(1) call in this file uses this value + 1
# as a numeric, reversed key column.
POPULATION_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_POPULATION'][0]
geonames_admin_joins = '''
left join admin1_codes a1
@@ -149,6 +155,7 @@ join alternate_names an
on an.geonames_id = gn.geonames_id
and iso_language not in ('doi','faac','iata',
'icao','link','post','tcid')
and an.alternate_name != gn.name
{admin_joins}
{{predicate}}
'''.format(
@@ -301,7 +308,11 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
f.close()
logging.info('Sorting...')
subprocess.check_call(['sort', '-t\t', '-k1,1', '-k2,2', '-o', filename, temp_filename])
subprocess.check_call(['sort', '-t\t', '-u',
'-k{0},{0}'.format(NAME_INDEX + 1),
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
'-o', filename, temp_filename])
os.unlink(temp_filename)