From 46f2c68a690c549de7ee0af9a78e6389ebf3ae61 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 13 Aug 2015 18:40:41 -0400 Subject: [PATCH] [osm] Using tsv_no_quote writers in all OSM training data files --- .../geodata/osm/osm_address_training_data.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 50d94826..b5c766ed 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -24,6 +24,7 @@ from address_normalizer.text.tokenize import * from geodata.i18n.languages import * from geodata.polygons.language_polys import * +from geodata.csv_utils import * from geodata.file_utils import * this_dir = os.path.realpath(os.path.dirname(__file__)) @@ -78,7 +79,7 @@ def parse_osm(filename, allowed_types=ALL_OSM_TAGS): def write_osm_json(filename, out_filename): out = open(out_filename, 'w') - writer = csv.writer(out, delimiter='\t') + writer = csv.writer(out, 'tsv_no_quote') for key, attrs in parse_osm(filename): writer.writerow((key, json.dumps(attrs))) out.close() @@ -343,7 +344,7 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): def build_ways_training_data(language_rtree, infile, out_dir): i = 0 f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w') - writer = csv.writer(f, delimiter='\t') + writer = csv.writer(f, 'tsv_no_quote') for key, value in parse_osm(infile, allowed_types=WAYS_RELATIONS): country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name') @@ -371,10 +372,10 @@ def build_address_format_training_data(language_rtree, infile, out_dir): formatter = AddressFormatter(splitter='\n') formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w') - formatted_writer = csv.writer(formatted_file, delimiter='\t') + formatted_writer = csv.writer(formatted_file, 'tsv_no_quote') formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w') - formatted_tagged_writer = csv.writer(formatted_tagged_file, delimiter='\t') + formatted_tagged_writer = csv.writer(formatted_tagged_file, 'tsv_no_quote') for key, value in parse_osm(infile): try: @@ -390,11 +391,11 @@ def build_address_format_training_data(language_rtree, infile, out_dir): formatted_address_untagged = formatter.format_address(country, value, tag_components=False) if formatted_address_tagged is not None: formatted_address_tagged = safe_encode(formatted_address_tagged.replace('\n', '\\n')) - formatted_tagged_writer.writerow((country, default_languages[0]['lang'], formatted_address_tagged)) + formatted_tagged_writer.writerow((default_languages[0]['lang'], country, formatted_address_tagged)) if formatted_address_untagged is not None: formatted_address_untagged = safe_encode(formatted_address_untagged.replace('\n', '\\n')) - formatted_writer.writerow((country, default_languages[0]['lang'], formatted_address_untagged)) + formatted_writer.writerow((default_languages[0]['lang'], country, formatted_address_untagged)) if formatted_address_tagged is not None or formatted_address_untagged is not None: i += 1 @@ -405,7 +406,7 @@ def build_address_format_training_data(language_rtree, infile, out_dir): def build_address_training_data(langauge_rtree, infile, out_dir, format=False): i = 0 f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w') - writer = csv.writer(f, delimiter='\t') + writer = csv.writer(f, 'tsv_no_quote') for key, value in parse_osm(infile): country, street_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street') @@ -432,7 +433,7 @@ def build_venue_training_data(language_rtree, infile, out_dir): i = 0 f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w') - writer = csv.writer(f, delimiter='\t') + writer = csv.writer(f, 'tsv_no_quote') for key, value in parse_osm(infile): country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name') @@ -444,8 +445,9 @@ def build_venue_training_data(language_rtree, infile, out_dir): continue for k, v in name_language.iteritems(): for s in v: + s = s.strip() if k in languages: - writer.writerow((k, country, safe_encode(venue_type), safe_encode(s))) + writer.writerow((k, country, safe_encode(venue_type), safe_encode(s.replace('\t', ' ')))) if i % 1000 == 0 and i > 0: print 'did', i, 'venues' i += 1