[geonames] Adding LC_ALL environment variable for utf8 sorting

This commit is contained in:
Al
2015-07-06 00:39:23 -04:00
parent 6ff91fef6b
commit 0c5e741bb6

View File

@@ -500,21 +500,32 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
f.close()
logging.info('Sorting...')
subprocess.check_call(['sort', '-t\t', '-u', '--ignore-case',
'-k{0},{0}'.format(NAME_INDEX + 1),
# If there's a Wikipedia link to this name for the given id, sort first
'-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
# Historical entries should be sorted last
'-k{0},{0}n'.format(HISTORICAL_INDEX + 1),
# Sort descending by population (basic proxy for relevance)
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
# group rows for the same geonames ID together
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
# preferred names come first within that grouping
'-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
# since uniquing is done on the sort key, add language
'-k{0},{0}'.format(LANGUAGE_INDEX + 1),
'-o', filename, temp_filename])
env = os.environ.copy()
env['LC_ALL'] = 'C'
command = ['sort', '-t\t', '-u', '--ignore-case',
'-k{0},{0}'.format(NAME_INDEX + 1),
# If there's a Wikipedia link to this name for the given id, sort first
'-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
# Historical entries should be sorted last
'-k{0},{0}n'.format(HISTORICAL_INDEX + 1),
# Sort descending by population (basic proxy for relevance)
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
# group rows for the same geonames ID together
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
# preferred names come first within that grouping
'-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
# since uniquing is done on the sort key, add language
'-k{0},{0}'.format(LANGUAGE_INDEX + 1),
'-o', filename, temp_filename]
p = subprocess.Popen(command, env=env)
return_code = p.wait()
if return_code != 0:
raise subprocess.CalledProcessError(return_code, command)
os.unlink(temp_filename)