[geonames] Set LC_ALL=C in the environment of the sort subprocess so UTF-8 names are sorted in locale-independent byte order
This commit is contained in:
@@ -500,21 +500,32 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
|
|||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
logging.info('Sorting...')
|
logging.info('Sorting...')
|
||||||
subprocess.check_call(['sort', '-t\t', '-u', '--ignore-case',
|
|
||||||
'-k{0},{0}'.format(NAME_INDEX + 1),
|
env = os.environ.copy()
|
||||||
# If there's a Wikipedia link to this name for the given id, sort first
|
env['LC_ALL'] = 'C'
|
||||||
'-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
|
|
||||||
# Historical entries should be sorted last
|
command = ['sort', '-t\t', '-u', '--ignore-case',
|
||||||
'-k{0},{0}n'.format(HISTORICAL_INDEX + 1),
|
'-k{0},{0}'.format(NAME_INDEX + 1),
|
||||||
# Sort descending by population (basic proxy for relevance)
|
# If there's a Wikipedia link to this name for the given id, sort first
|
||||||
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
|
'-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
|
||||||
# group rows for the same geonames ID together
|
# Historical entries should be sorted last
|
||||||
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
|
'-k{0},{0}n'.format(HISTORICAL_INDEX + 1),
|
||||||
# preferred names come first within that grouping
|
# Sort descending by population (basic proxy for relevance)
|
||||||
'-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
|
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
|
||||||
# since uniquing is done on the sort key, add language
|
# group rows for the same geonames ID together
|
||||||
'-k{0},{0}'.format(LANGUAGE_INDEX + 1),
|
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
|
||||||
'-o', filename, temp_filename])
|
# preferred names come first within that grouping
|
||||||
|
'-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
|
||||||
|
# since uniquing is done on the sort key, add language
|
||||||
|
'-k{0},{0}'.format(LANGUAGE_INDEX + 1),
|
||||||
|
'-o', filename, temp_filename]
|
||||||
|
|
||||||
|
p = subprocess.Popen(command, env=env)
|
||||||
|
|
||||||
|
return_code = p.wait()
|
||||||
|
if return_code != 0:
|
||||||
|
raise subprocess.CalledProcessError(return_code, command)
|
||||||
|
|
||||||
os.unlink(temp_filename)
|
os.unlink(temp_filename)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user