[openaddresses] Handling validation after cleanup, adding per-field regex replacements

This commit is contained in:
Al
2016-08-28 11:47:30 -04:00
parent 544c93899e
commit b8b1ac1261

View File

@@ -30,10 +30,8 @@ OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir,
OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv' OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
OPENADDRESS_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv' OPENADDRESS_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
numeric_range_regex = re.compile('[\s]*\-[\s]*')
null_regex = re.compile('^\s*(?:null|none)\s*$', re.I) null_regex = re.compile('^\s*(?:null|none)\s*$', re.I)
unknown_regex = re.compile('^\s*(?:unknown)\s*$', re.I) unknown_regex = re.compile('^\s*(?:unknown)\s*$', re.I)
number_sign_regex = re.compile('^#', re.UNICODE)
not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I) not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I)
sin_numero_regex = re.compile('^\s*s\s\s*/\s*n\s*$') sin_numero_regex = re.compile('^\s*s\s\s*/\s*n\s*$')
@@ -41,10 +39,21 @@ SPANISH = 'es'
class OpenAddressesFormatter(object): class OpenAddressesFormatter(object):
all_field_regex_replacements = [ field_regex_replacements = {
(re.compile('<\s*null\s*>', re.I), six.u('')), # All fields
(re.compile('[\s]{2,}'), six.u(' ')) None:
] [
(re.compile('<\s*null\s*>', re.I), six.u('')),
(re.compile('[\s]{2,}'), six.u(' '))
],
AddressFormatter.HOUSE_NUMBER: [
# Most of the house numbers in Montreal start with "#"
(re.compile('^#', re.UNICODE), six.u('')),
# Some house number ranges are split up like "12 -14"
(re.compile('[\s]*\-[\s]*'), six.u('-')),
]
}
unit_type_regexes = {} unit_type_regexes = {}
@@ -102,7 +111,7 @@ class OpenAddressesFormatter(object):
house_number = int(house_number.strip()) house_number = int(house_number.strip())
return house_number > 0 return house_number > 0
except (ValueError, TypeError): except (ValueError, TypeError):
return house_number.strip('# ') and is_numeric(house_number) and not all((c == '0' for c in house_number if c.isdigit())) return house_number.strip() and is_numeric(house_number) and not all((c == '0' for c in house_number if c.isdigit()))
@classmethod @classmethod
def validate_house_number_spanish(cls, house_number): def validate_house_number_spanish(cls, house_number):
@@ -243,11 +252,6 @@ class OpenAddressesFormatter(object):
if key == AddressFormatter.ROAD and language == SPANISH: if key == AddressFormatter.ROAD and language == SPANISH:
value = self.spanish_street_name(value) value = self.spanish_street_name(value)
validator = self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None))
if validator is not None and not validator(value):
continue
if key in AddressFormatter.BOUNDARY_COMPONENTS: if key in AddressFormatter.BOUNDARY_COMPONENTS:
value = self.components.cleaned_name(value, first_comma_delimited_phrase=True) value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
if value and len(value) < 2 or is_numeric(value): if value and len(value) < 2 or is_numeric(value):
@@ -256,11 +260,19 @@ class OpenAddressesFormatter(object):
if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value): if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
continue continue
for exp, sub_val in self.all_field_regex_replacements: for exp, sub_val in self.field_regex_replacements.get(key, []):
value = exp.sub(sub_val, value)
for exp, sub_val in self.field_regex_replacements.get(None, []):
value = exp.sub(sub_val, value) value = exp.sub(sub_val, value)
value = value.strip(', -') value = value.strip(', -')
validator = self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None))
if validator is not None and not validator(value):
continue
if key in ignore_fields_containing and ignore_fields_containing[key].search(value): if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
continue continue
@@ -294,8 +306,6 @@ class OpenAddressesFormatter(object):
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None) house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
if house_number: if house_number:
house_number = numeric_range_regex.replace(six.u('-'), house_number).strip()
house_number = number_sign_regex.replace(six.u(''), house_number)
house_number = self.cleanup_number(house_number) house_number = self.cleanup_number(house_number)
postcode = components.get(AddressFormatter.POSTCODE, None) postcode = components.get(AddressFormatter.POSTCODE, None)