diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 71f6d728..7017c966 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -25,7 +25,8 @@ OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv' OPENADDRESS_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv' -numeric_range_regex = re.compile(six.u('[\s]*\-[\s]*')) +multiple_spaces_regex = re.compile('[\s]{2,}') +numeric_range_regex = re.compile('[\s]*\-[\s]*') not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I) @@ -170,6 +171,8 @@ class OpenAddressesFormatter(object): if not_applicable_regex.match(value): continue + value = multiple_spaces_regex.sub(six.u(' '), value) + value = value.strip(', ') if value: components[key] = value