[geonames] Pre-escaping tabs, no quoting in geonames/postal code TSVs
This commit is contained in:
@@ -2,6 +2,7 @@ import argparse
|
|||||||
import csv
|
import csv
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
@@ -20,6 +21,13 @@ from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
|
|||||||
from geodata.i18n.unicode_paths import CLDR_DIR
|
from geodata.i18n.unicode_paths import CLDR_DIR
|
||||||
from geodata.log import log_to_file
|
from geodata.log import log_to_file
|
||||||
|
|
||||||
|
# Matches any run of whitespace (spaces, tabs, newlines, ...). A raw string is
# used so that ``\s`` reaches the regex engine verbatim instead of relying on
# Python passing an unrecognised string escape through unchanged.
multispace_regex = re.compile(r'[\s]+')
|
||||||
|
|
||||||
|
# Tab-separated dialect that never quotes fields. No escapechar is configured,
# so the csv writer raises csv.Error if a field contains a tab or a line
# terminator character; fields must be cleaned of those before being written.
# quotechar is None (not '') because newer Python versions reject an empty
# quotechar, while None is accepted whenever quoting is QUOTE_NONE.
csv.register_dialect('tsv_no_quote', delimiter='\t', quoting=csv.QUOTE_NONE, quotechar=None)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_field(value):
    """Encode a single column value for the unquoted TSV output.

    The value is passed through ``safe_encode`` and every run of whitespace
    matched by ``multispace_regex`` is replaced with one space.
    """
    # Any falsy value is replaced by the empty string before encoding.
    # NOTE(review): this also blanks numeric 0 / 0.0 -- confirm that is intended.
    encoded = safe_encode(value or '')
    return multispace_regex.sub(' ', encoded)
|
||||||
|
|
||||||
log_to_file(sys.stderr)
|
log_to_file(sys.stderr)
|
||||||
|
|
||||||
@@ -249,7 +257,8 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
|
|||||||
temp_filename = filename + '.tmp'
|
temp_filename = filename + '.tmp'
|
||||||
|
|
||||||
f = open(temp_filename, 'w')
|
f = open(temp_filename, 'w')
|
||||||
writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
|
|
||||||
|
writer = csv.writer(f, 'tsv_no_quote')
|
||||||
|
|
||||||
country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries}
|
country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries}
|
||||||
country_alpha2 = set([c.alpha2 for c in pycountry.countries])
|
country_alpha2 = set([c.alpha2 for c in pycountry.countries])
|
||||||
@@ -277,7 +286,7 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
|
|||||||
break
|
break
|
||||||
rows = []
|
rows = []
|
||||||
for row in batch:
|
for row in batch:
|
||||||
row = [safe_encode(val or '') for val in row]
|
row = map(encode_field, row)
|
||||||
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
|
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
|
||||||
|
|
||||||
if boundary_type == boundary_types.COUNTRY:
|
if boundary_type == boundary_types.COUNTRY:
|
||||||
@@ -321,7 +330,7 @@ def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR):
|
|||||||
temp_filename = filename + '.tmp'
|
temp_filename = filename + '.tmp'
|
||||||
f = open(temp_filename, 'w')
|
f = open(temp_filename, 'w')
|
||||||
|
|
||||||
writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
|
writer = csv.writer(f, 'tsv_no_quote')
|
||||||
|
|
||||||
cursor = db.execute(postal_codes_query)
|
cursor = db.execute(postal_codes_query)
|
||||||
|
|
||||||
@@ -331,7 +340,7 @@ def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR):
|
|||||||
if not batch:
|
if not batch:
|
||||||
break
|
break
|
||||||
rows = [
|
rows = [
|
||||||
[safe_encode(val or '') for val in row]
|
map(encode_field, row)
|
||||||
for row in batch
|
for row in batch
|
||||||
]
|
]
|
||||||
writer.writerows(rows)
|
writer.writerows(rows)
|
||||||
|
|||||||
Reference in New Issue
Block a user