[openaddresses] Adding quick-and-dirty regex-based exclusion list for fields containing various patterns in OpenAddresses, to be used sparingly

This commit is contained in:
Al
2016-08-26 15:34:30 -04:00
parent 7bcddeff44
commit 2654683af4
2 changed files with 10 additions and 0 deletions

View File

@@ -93,6 +93,9 @@ countries:
us:
cldr_country_probability: 0.05
add_osm_neighborhoods: true
# Per-field regex exclusion list: a field whose value matches any listed
# pattern is dropped (patterns are OR-ed together and matched case-insensitively).
ignore_fields_containing:
city:
# Single-quoted on purpose: in a double-quoted YAML scalar "\b" is the
# backspace escape (U+0008), which would silently turn the regex word
# boundary into a literal control character that never matches.
- '\bcounty\b'
subdirs:
al:
add:

View File

@@ -128,6 +128,9 @@ class OpenAddressesFormatter(object):
numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)
ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE)
for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))}
language = self.get_property('language', *configs)
add_components = self.get_property('add', *configs)
@@ -174,6 +177,10 @@ class OpenAddressesFormatter(object):
value = multiple_spaces_regex.sub(six.u(' '), value)
value = value.strip(', ')
if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
continue
if value:
components[key] = value