[fix] carriage returns
This commit is contained in:
@@ -340,6 +340,12 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
|
||||
|
||||
return country, name_language
|
||||
|
||||
# Matches one line break in any of its forms: CRLF, lone CR, or lone LF.
# '\r\n' is listed first so a CRLF pair is consumed as a single match.
newline_regex = re.compile('\r\n|\r|\n')
|
||||
|
||||
|
||||
def tsv_string(s):
    """Flatten ``s`` so it can be written as a single TSV field.

    The value is passed through ``safe_decode``, stripped of leading and
    trailing whitespace, has every match of ``newline_regex`` replaced
    with ", " and every tab replaced with a single space, and is finally
    passed through ``safe_encode`` before being returned.
    """
    text = safe_decode(s).strip()
    # Line breaks would split the row; tabs would split the column.
    without_newlines = newline_regex.sub(u', ', text)
    without_tabs = without_newlines.replace(u'\t', u' ')
    return safe_encode(without_tabs)
|
||||
|
||||
|
||||
def build_ways_training_data(language_rtree, infile, out_dir):
|
||||
i = 0
|
||||
@@ -354,7 +360,7 @@ def build_ways_training_data(language_rtree, infile, out_dir):
|
||||
for k, v in name_language.iteritems():
|
||||
for s in v:
|
||||
if k in languages:
|
||||
writer.writerow((k, country, safe_encode(s).replace('\t', ' ').replace('\n', ', ')))
|
||||
writer.writerow((k, country, tsv_string(s)))
|
||||
if i % 1000 == 0 and i > 0:
|
||||
print 'did', i, 'ways'
|
||||
i += 1
|
||||
@@ -390,11 +396,11 @@ def build_address_format_training_data(language_rtree, infile, out_dir):
|
||||
formatted_address_tagged = formatter.format_address(country, value)
|
||||
formatted_address_untagged = formatter.format_address(country, value, tag_components=False)
|
||||
if formatted_address_tagged is not None:
|
||||
formatted_address_tagged = safe_encode(formatted_address_tagged.replace('\t', ' ').replace('\n', ', '))
|
||||
formatted_address_tagged = tsv_string(formatted_address_tagged)
|
||||
formatted_tagged_writer.writerow((default_languages[0]['lang'], country, formatted_address_tagged))
|
||||
|
||||
if formatted_address_untagged is not None:
|
||||
formatted_address_untagged = safe_encode(formatted_address_untagged.replace('\t', ' ').replace('\n', ', '))
|
||||
formatted_address_untagged = tsv_string(formatted_address_untagged)
|
||||
formatted_writer.writerow((default_languages[0]['lang'], country, formatted_address_untagged))
|
||||
|
||||
if formatted_address_tagged is not None or formatted_address_untagged is not None:
|
||||
@@ -419,7 +425,7 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
|
||||
if not s:
|
||||
continue
|
||||
if k in languages:
|
||||
writer.writerow((k, country, safe_encode(s).replace('\t', ' ').replace('\n', ', ')))
|
||||
writer.writerow((k, country, tsv_string(s)))
|
||||
if i % 1000 == 0 and i > 0:
|
||||
print 'did', i, 'streets'
|
||||
i += 1
|
||||
@@ -454,7 +460,7 @@ def build_venue_training_data(language_rtree, infile, out_dir):
|
||||
for s in v:
|
||||
s = s.strip()
|
||||
if k in languages:
|
||||
writer.writerow((k, country, safe_encode(venue_type), safe_encode(s).replace('\t', ' ').replace('\n', ', ')))
|
||||
writer.writerow((k, country, safe_encode(venue_type), tsv_string(s)))
|
||||
if i % 1000 == 0 and i > 0:
|
||||
print 'did', i, 'venues'
|
||||
i += 1
|
||||
|
||||
Reference in New Issue
Block a user