From 242a5281cc4309c59e6589b1727036434a44464a Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 22 Dec 2016 03:36:21 -0500
Subject: [PATCH] [osm] throwing away street names that are None/NULL, and those that only contain punctuation

---
 scripts/geodata/addresses/components.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index 82e8ba97..d9defa8a 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -1274,6 +1274,15 @@ class AddressComponents(object):
         else:
             address_components.pop(AddressFormatter.HOUSE_NUMBER, None)
 
+    # Placeholder strings that stand in for a missing street name (matched case-insensitively)
+    invalid_street_regex = re.compile(r'^\s*(?:none|null|not applicable|n\s*/\s*a)\s*$', re.I)
+
+    def cleanup_street(self, address_components):
+        """Remove the road component in place if it is a null placeholder or has no alphanumeric characters."""
+        street = address_components.get(AddressFormatter.ROAD)
+        if street is not None and (self.invalid_street_regex.match(street) or not any(c.isalnum() for c in street)):
+            address_components.pop(AddressFormatter.ROAD, None)
+
     newline_regex = re.compile('[\n]+')
     name_regex = re.compile('^[\s\-]*(.*?)[\s\-]*$')
     whitespace_regex = re.compile('(?<=[\w])[\s]+(?=[\w])')
@@ -1512,6 +1521,7 @@ class AddressComponents(object):
                                             language_suffix=language_suffix)
 
         street = address_components.get(AddressFormatter.ROAD)
+        self.cleanup_street(address_components)
         self.cleanup_boundary_names(address_components)
         self.country_specific_cleanup(address_components, country)
 