From 2654683af4705a713ad8fc85e9059a88b4cf2004 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 26 Aug 2016 15:34:30 -0400 Subject: [PATCH] [openaddresses] Adding quick-and-dirty regex-based exclusion list for fields containing various patterns in OpenAddresses, to be used sparingly. Patterns are written as single-quoted YAML scalars so that \b reaches the regex engine as a word boundary (inside double quotes YAML decodes \b as a backspace character and the pattern would never match). --- resources/parser/data_sets/openaddresses.yaml | 3 +++ scripts/geodata/openaddresses/formatter.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/resources/parser/data_sets/openaddresses.yaml b/resources/parser/data_sets/openaddresses.yaml index 95bef4ef..78e01b82 100644 --- a/resources/parser/data_sets/openaddresses.yaml +++ b/resources/parser/data_sets/openaddresses.yaml @@ -93,6 +93,9 @@ countries: us: cldr_country_probability: 0.05 add_osm_neighborhoods: true + ignore_fields_containing: + city: + - '\bcounty\b' subdirs: al: add: diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 7017c966..9f2de7b1 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -128,6 +128,9 @@ class OpenAddressesFormatter(object): numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False) postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False) + ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE) + for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))} + language = self.get_property('language', *configs) add_components = self.get_property('add', *configs) @@ -174,6 +177,10 @@ class OpenAddressesFormatter(object): value = multiple_spaces_regex.sub(six.u(' '), value) value = value.strip(', ') + + if key in ignore_fields_containing and ignore_fields_containing[key].search(value): + continue + if value: components[key] = value