From a931c5ddc923b0d2e50c5739bafbde2278cbc7bd Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 19 Jan 2017 02:34:29 -0500
Subject: [PATCH] [osm] checking for valid street names in OSM street-only training data so e.g. the street name is not just a simple number like "831"

---
 Review notes (ignored by git am):
 - cleanup_street keeps its "street is not None" guard. Without it, an
   address with no road component reaches dict.pop() with no default and
   raises KeyError, which the pre-patch code never did.
 - street_name_is_valid() still accepts purely numeric names such as "831"
   (they contain alphanumeric characters); the example in the subject line
   is not implemented by this patch.

 scripts/geodata/addresses/components.py | 16 +++++++++++++++-
 scripts/geodata/osm/formatter.py        |  2 +-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index 1b4f0841..1e7e0c70 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -1502,9 +1502,23 @@ class AddressComponents(object):

     invalid_street_regex = re.compile('^\s*(?:none|null|not applicable|n\s*/\s*a)\s*$', re.I)

+    def street_name_is_valid(self, street):
+        """Return True when ``street`` is usable as a street name.
+
+        None, placeholder values matched by ``invalid_street_regex``
+        (none, null, not applicable, n/a) and strings without any
+        alphanumeric character are all rejected.
+        """
+        return (street is not None and
+                not self.invalid_street_regex.match(street) and
+                any(c.isalnum() for c in street))
+
     def cleanup_street(self, address_components):
+        """Remove the road component in place when its value is not a valid street name."""
         street = address_components.get(AddressFormatter.ROAD)
-        if street is not None and (self.invalid_street_regex.match(street) or not any((c.isalnum() for c in street))):
+        # Keep the None guard: with no road key present there is nothing to
+        # pop, and dict.pop() without a default would raise KeyError.
+        if street is not None and not self.street_name_is_valid(street):
             address_components.pop(AddressFormatter.ROAD)

     newline_regex = re.compile('[\n]+')
diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py
index f1c98caf..bcc653b4 100644
--- a/scripts/geodata/osm/formatter.py
+++ b/scripts/geodata/osm/formatter.py
@@ -1597,7 +1597,7 @@ class OSMAddressFormatter(object):
             for v, is_base in vals:
                 for street_name in v.split(';'):
                     street_name = street_name.strip()
-                    if street_name:
+                    if street_name and self.components.street_name_is_valid(street_name):
                         address_components = {AddressFormatter.ROAD: street_name}
                         self.components.add_admin_boundaries(address_components, osm_components,