[geonames] Setting LC_ALL=C in the environment of the sort subprocess so that UTF-8 data sorts consistently regardless of the caller's locale

This commit is contained in:
Al
2015-07-06 00:39:23 -04:00
parent 6ff91fef6b
commit 0c5e741bb6

View File

@@ -500,7 +500,11 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
 f.close()
 logging.info('Sorting...')
-subprocess.check_call(['sort', '-t\t', '-u', '--ignore-case',
+env = os.environ.copy()
+env['LC_ALL'] = 'C'
+command = ['sort', '-t\t', '-u', '--ignore-case',
 '-k{0},{0}'.format(NAME_INDEX + 1),
 # If there's a Wikipedia link to this name for the given id, sort first
 '-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
@@ -514,7 +518,14 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
 '-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
 # since uniquing is done on the sort key, add language
 '-k{0},{0}'.format(LANGUAGE_INDEX + 1),
-'-o', filename, temp_filename])
+'-o', filename, temp_filename]
+p = subprocess.Popen(command, env=env)
+return_code = p.wait()
+if return_code != 0:
+raise subprocess.CalledProcessError(return_code, command)
 os.unlink(temp_filename)