[openaddresses] Handling validation after cleanup, adding per-field regex replacements
This commit is contained in:
@@ -30,10 +30,8 @@ OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir,
|
|||||||
OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
|
OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
|
||||||
OPENADDRESS_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
|
OPENADDRESS_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
|
||||||
|
|
||||||
numeric_range_regex = re.compile('[\s]*\-[\s]*')
|
|
||||||
null_regex = re.compile('^\s*(?:null|none)\s*$', re.I)
|
null_regex = re.compile('^\s*(?:null|none)\s*$', re.I)
|
||||||
unknown_regex = re.compile('^\s*(?:unknown)\s*$', re.I)
|
unknown_regex = re.compile('^\s*(?:unknown)\s*$', re.I)
|
||||||
number_sign_regex = re.compile('^#', re.UNICODE)
|
|
||||||
not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I)
|
not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I)
|
||||||
sin_numero_regex = re.compile('^\s*s\s\s*/\s*n\s*$')
|
sin_numero_regex = re.compile('^\s*s\s\s*/\s*n\s*$')
|
||||||
|
|
||||||
@@ -41,10 +39,21 @@ SPANISH = 'es'
|
|||||||
|
|
||||||
|
|
||||||
class OpenAddressesFormatter(object):
|
class OpenAddressesFormatter(object):
|
||||||
all_field_regex_replacements = [
|
field_regex_replacements = {
|
||||||
(re.compile('<\s*null\s*>', re.I), six.u('')),
|
# All fields
|
||||||
(re.compile('[\s]{2,}'), six.u(' '))
|
None:
|
||||||
]
|
[
|
||||||
|
(re.compile('<\s*null\s*>', re.I), six.u('')),
|
||||||
|
(re.compile('[\s]{2,}'), six.u(' '))
|
||||||
|
]
|
||||||
|
},
|
||||||
|
AddressFormatter.HOUSE_NUMBER: [
|
||||||
|
# Most of the house numbers in Montreal start with "#"
|
||||||
|
(re.compile('^#', re.UNICODE), sixu('')),
|
||||||
|
# Some house number ranges are split up like "12 -14"
|
||||||
|
(re.compile('[\s]*\-[\s]*'), six.u('-')),
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
unit_type_regexes = {}
|
unit_type_regexes = {}
|
||||||
|
|
||||||
@@ -102,7 +111,7 @@ class OpenAddressesFormatter(object):
|
|||||||
house_number = int(house_number.strip())
|
house_number = int(house_number.strip())
|
||||||
return house_number > 0
|
return house_number > 0
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
return house_number.strip('# ') and is_numeric(house_number) and not all((c == '0' for c in house_number if c.isdigit()))
|
return house_number.strip() and is_numeric(house_number) and not all((c == '0' for c in house_number if c.isdigit()))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def validate_house_number_spanish(cls, house_number):
|
def validate_house_number_spanish(cls, house_number):
|
||||||
@@ -243,11 +252,6 @@ class OpenAddressesFormatter(object):
|
|||||||
if key == AddressFormatter.ROAD and language == SPANISH:
|
if key == AddressFormatter.ROAD and language == SPANISH:
|
||||||
value = self.spanish_street_name(value)
|
value = self.spanish_street_name(value)
|
||||||
|
|
||||||
validator = self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None))
|
|
||||||
|
|
||||||
if validator is not None and not validator(value):
|
|
||||||
continue
|
|
||||||
|
|
||||||
if key in AddressFormatter.BOUNDARY_COMPONENTS:
|
if key in AddressFormatter.BOUNDARY_COMPONENTS:
|
||||||
value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
|
value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
|
||||||
if value and len(value) < 2 or is_numeric(value):
|
if value and len(value) < 2 or is_numeric(value):
|
||||||
@@ -256,11 +260,19 @@ class OpenAddressesFormatter(object):
|
|||||||
if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
|
if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for exp, sub_val in self.all_field_regex_replacements:
|
for exp, sub_val in self.field_regex_replacements.get(key, []):
|
||||||
|
value = exp.sub(sub_val, value)
|
||||||
|
|
||||||
|
for exp, sub_val in self.field_regex_replacements.get(None, []):
|
||||||
value = exp.sub(sub_val, value)
|
value = exp.sub(sub_val, value)
|
||||||
|
|
||||||
value = value.strip(', -')
|
value = value.strip(', -')
|
||||||
|
|
||||||
|
validator = self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None))
|
||||||
|
|
||||||
|
if validator is not None and not validator(value):
|
||||||
|
continue
|
||||||
|
|
||||||
if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
|
if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -294,8 +306,6 @@ class OpenAddressesFormatter(object):
|
|||||||
|
|
||||||
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
|
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
|
||||||
if house_number:
|
if house_number:
|
||||||
house_number = numeric_range_regex.replace(six.u('-'), house_number).strip()
|
|
||||||
house_number = number_sign_regex.replace(six.u(''), house_number)
|
|
||||||
house_number = self.cleanup_number(house_number)
|
house_number = self.cleanup_number(house_number)
|
||||||
|
|
||||||
postcode = components.get(AddressFormatter.POSTCODE, None)
|
postcode = components.get(AddressFormatter.POSTCODE, None)
|
||||||
|
|||||||
Reference in New Issue
Block a user