From c844d0484a296325d5cef5e70efd95a57836b82f Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 13 Aug 2015 21:07:12 -0400 Subject: [PATCH] [fix] carriage returns --- scripts/geodata/osm/osm_address_training_data.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 1c1dae8c..3e5b718d 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -340,6 +340,12 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): return country, name_language +newline_regex = re.compile('\r\n|\r|\n') + + +def tsv_string(s): + return safe_encode(newline_regex.sub(u', ', safe_decode(s).strip()).replace(u'\t', u' ')) + def build_ways_training_data(language_rtree, infile, out_dir): i = 0 @@ -354,7 +360,7 @@ def build_ways_training_data(language_rtree, infile, out_dir): for k, v in name_language.iteritems(): for s in v: if k in languages: - writer.writerow((k, country, safe_encode(s).replace('\t', ' ').replace('\n', ', '))) + writer.writerow((k, country, tsv_string(s))) if i % 1000 == 0 and i > 0: print 'did', i, 'ways' i += 1 @@ -390,11 +396,11 @@ def build_address_format_training_data(language_rtree, infile, out_dir): formatted_address_tagged = formatter.format_address(country, value) formatted_address_untagged = formatter.format_address(country, value, tag_components=False) if formatted_address_tagged is not None: - formatted_address_tagged = safe_encode(formatted_address_tagged.replace('\t', ' ').replace('\n', ', ')) + formatted_address_tagged = tsv_string(formatted_address_tagged) formatted_tagged_writer.writerow((default_languages[0]['lang'], country, formatted_address_tagged)) if formatted_address_untagged is not None: - formatted_address_untagged = safe_encode(formatted_address_untagged.replace('\t', ' ').replace('\n', ', ')) + formatted_address_untagged = tsv_string(formatted_address_untagged) formatted_writer.writerow((default_languages[0]['lang'], country, formatted_address_untagged)) if formatted_address_tagged is not None or formatted_address_untagged is not None: @@ -419,7 +425,7 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False): if not s: continue if k in languages: - writer.writerow((k, country, safe_encode(s).replace('\t', ' ').replace('\n', ', '))) + writer.writerow((k, country, tsv_string(s))) if i % 1000 == 0 and i > 0: print 'did', i, 'streets' i += 1 @@ -454,7 +460,7 @@ def build_venue_training_data(language_rtree, infile, out_dir): for s in v: s = s.strip() if k in languages: - writer.writerow((k, country, safe_encode(venue_type), safe_encode(s).replace('\t', ' ').replace('\n', ', '))) + writer.writerow((k, country, safe_encode(venue_type), tsv_string(s))) if i % 1000 == 0 and i > 0: print 'did', i, 'venues' i += 1