From e40ca0bb8933ab229c47d91795d5548c3bd54931 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 24 Nov 2015 21:15:22 -0500 Subject: [PATCH] [fix] Removing house numbers from formatted address language training data, using a simple whitespace splitter --- scripts/geodata/osm/osm_address_training_data.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 8e1d69a2..b1733407 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -644,6 +644,13 @@ NAME_KEYS = ( 'name', 'addr:housename', ) + +HOUSE_NUMBER_KEYS = ( + 'addr:house_number', + 'addr:housenumber', + 'house_number' +) + COUNTRY_KEYS = ( 'country', 'country_name', @@ -673,16 +680,17 @@ def build_address_format_training_data_limited(language_rtree, infile, out_dir): Example: - nb no Olaf Ryes Plass 8 | Oslo + nb no Olaf Ryes Plass Oslo ''' i = 0 - formatter = AddressFormatter() + # Simple whitespace splitter is all that's necessary + formatter = AddressFormatter(splitter=' ') f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w') writer = csv.writer(f, 'tsv_no_quote') - remove_keys = NAME_KEYS + COUNTRY_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS + remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + COUNTRY_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS for key, value, deps in parse_osm(infile): try: