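[geonames] Add the country's geonames_id to place and postal code records. Postal code rows are now sorted by postal code and then by country population in descending order, so that an ambiguous code resolves to the most populous country when no other information is available (e.g. 10013 is also a postal code in Italy, but will default to the US).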
This commit is contained in:
@@ -105,11 +105,12 @@ geonames_fields = [
|
|||||||
GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'),
|
GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'),
|
||||||
GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'),
|
GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'),
|
||||||
GeonamesField('is_historic', 'GEONAMES_IS_HISTORICAL', default='0'),
|
GeonamesField('is_historic', 'GEONAMES_IS_HISTORICAL', default='0'),
|
||||||
GeonamesField('population', 'GEONAMES_POPULATION'),
|
GeonamesField('gn.population', 'GEONAMES_POPULATION'),
|
||||||
GeonamesField('latitude', 'GEONAMES_LATITUDE'),
|
GeonamesField('gn.latitude', 'GEONAMES_LATITUDE'),
|
||||||
GeonamesField('longitude', 'GEONAMES_LONGITUDE'),
|
GeonamesField('gn.longitude', 'GEONAMES_LONGITUDE'),
|
||||||
GeonamesField('feature_code', 'GEONAMES_FEATURE_CODE'),
|
GeonamesField('gn.feature_code', 'GEONAMES_FEATURE_CODE'),
|
||||||
GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'),
|
GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'),
|
||||||
|
GeonamesField('c.geonames_id as country_gn_id', 'GEONAMES_COUNTRY_ID'),
|
||||||
GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'),
|
GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'),
|
||||||
GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'),
|
GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'),
|
||||||
GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'),
|
GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'),
|
||||||
@@ -178,11 +179,15 @@ left join admin4_codes a4
|
|||||||
base_geonames_query = '''
|
base_geonames_query = '''
|
||||||
select {geonames_fields}
|
select {geonames_fields}
|
||||||
from geonames gn
|
from geonames gn
|
||||||
|
join countries c
|
||||||
|
on gn.country_code = c.country_code
|
||||||
{admin_joins}
|
{admin_joins}
|
||||||
{{predicate}}
|
{{predicate}}
|
||||||
union all
|
union all
|
||||||
select {alt_name_fields}
|
select {alt_name_fields}
|
||||||
from geonames gn
|
from geonames gn
|
||||||
|
join countries c
|
||||||
|
on gn.country_code = c.country_code
|
||||||
join alternate_names an
|
join alternate_names an
|
||||||
on an.geonames_id = gn.geonames_id
|
on an.geonames_id = gn.geonames_id
|
||||||
and iso_language not in ('doi','faac','iata',
|
and iso_language not in ('doi','faac','iata',
|
||||||
@@ -205,16 +210,26 @@ IGNORE_COUNTRY_POSTAL_CODES = set([
|
|||||||
postal_code_fields = [
|
postal_code_fields = [
|
||||||
GeonamesField('postal_code', 'GN_POSTAL_CODE'),
|
GeonamesField('postal_code', 'GN_POSTAL_CODE'),
|
||||||
GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'),
|
GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'),
|
||||||
|
GeonamesField('c.geonames_id as country_geonames_id', 'GN_POSTAL_COUNTRY_GEONAMES_ID'),
|
||||||
|
GeonamesField('c.population as country_population', 'GN_POSTAL_COUNTRY_POPULATION'),
|
||||||
GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'),
|
GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'),
|
||||||
GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'),
|
GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'),
|
||||||
GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'),
|
GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'),
|
||||||
GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'),
|
GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
POSTAL_CODE_INDEX = [i for i, f in enumerate(postal_code_fields)
|
||||||
|
if f.c_constant == 'GN_POSTAL_CODE'][0]
|
||||||
|
|
||||||
|
POSTAL_CODE_POP_INDEX = [i for i, f in enumerate(postal_code_fields)
|
||||||
|
if f.c_constant == 'GN_POSTAL_COUNTRY_POPULATION'][0]
|
||||||
|
|
||||||
postal_codes_query = '''
|
postal_codes_query = '''
|
||||||
select
|
select
|
||||||
{fields}
|
{fields}
|
||||||
from postal_codes p
|
from postal_codes p
|
||||||
|
join countries c
|
||||||
|
on p.country_code = c.country_code
|
||||||
left join (
|
left join (
|
||||||
select
|
select
|
||||||
gn.geonames_id,
|
gn.geonames_id,
|
||||||
@@ -555,7 +570,14 @@ def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR):
|
|||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
logging.info('Sorting...')
|
logging.info('Sorting...')
|
||||||
subprocess.check_call(['sort', '-t\t', '-k1,1', '-k2,2', '-o', filename, temp_filename])
|
|
||||||
|
subprocess.check_call([
|
||||||
|
'sort', '-t\t', '--ignore-case',
|
||||||
|
'-k{0},{0}'.format(POSTAL_CODE_INDEX + 1),
|
||||||
|
'-k{0},{0}nr'.format(POSTAL_CODE_POP_INDEX + 1),
|
||||||
|
'-o', filename,
|
||||||
|
temp_filename
|
||||||
|
])
|
||||||
os.unlink(temp_filename)
|
os.unlink(temp_filename)
|
||||||
|
|
||||||
# Generates a C header telling us the order of the fields as written
|
# Generates a C header telling us the order of the fields as written
|
||||||
|
|||||||
Reference in New Issue
Block a user