From 23be122d2e962ba7d166d1184d21d969405e5eaf Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 22 Aug 2016 22:07:34 -0400 Subject: [PATCH] [openaddresses] Adding ability to use OSM boundaries for OpenAddresses (not turned on by default), cleaning up street names, requiring at least house number and street, validating house number to provide some assurance that it's not a badly-formatted NULL value, adding ability to strip letters from postcode for data sets like New York's statewide where there are some codes attached. --- scripts/geodata/openaddresses/formatter.py | 87 +++++++++++++++++----- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index cc32fdab..dad93fc6 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -9,6 +9,7 @@ from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_t from geodata.address_formatting.formatter import AddressFormatter from geodata.addresses.components import AddressComponents from geodata.countries.names import country_names +from geodata.encoding import safe_decode from geodata.math.sampling import cdf, weighted_choice from geodata.text.utils import is_numeric @@ -23,17 +24,10 @@ OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tag OPENADDRESS_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv' -def validate_postcode(postcode): - return not all((c == '0' for c in postcode)) - - class OpenAddressesFormatter(object): - openaddresses_validators = { - AddressFormatter.POSTCODE: validate_postcode - } - - def __init__(self, language_rtree): - self.language_rtree = language_rtree + def __init__(self, components): + self.components = components + self.language_rtree = components.language_rtree config = yaml.load(open(OPENADDRESSES_PARSER_DATA_CONFIG)) self.config = config['global'] @@ -41,6 +35,37 @@ class OpenAddressesFormatter(object): self.formatter = AddressFormatter() + class validators: + @classmethod + def validate_postcode(cls, postcode): + ''' + Postcodes that are all zeros are improperly-formatted NULL values + ''' + return not all((c == '0' for c in postcode)) + + @classmethod + def validate_house_number(cls, house_number): + ''' + House number doesn't necessarily have to be numeric, but in some of the + OpenAddresses data sets the house number field is equal to the capitalized + street name, so this at least provides protection against insane values + for house number at the cost of maybe missing a few houses numbered "A", etc. + + Also OpenAddresses primarily comes from county GIS servers, etc. which use + a variety of database schemas and don't always handle NULLs very well. Again, + while a single zero is a valid house number, in OpenAddresses it's more likely + an error + + While a single zero is a valid house number, more than one zero is not, or + at least not in OpenAddresses + ''' + return house_number.strip() and is_numeric(house_number) and not all((c == '0' for c in house_number)) + + component_validators = { + AddressFormatter.HOUSE_NUMBER: validators.validate_house_number, + AddressFormatter.POSTCODE: validators.validate_postcode, + } + def get_property(self, key, *configs): for config in configs: value = config.get(key, None) @@ -77,6 +102,9 @@ class OpenAddressesFormatter(object): abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs)) separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0) + add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False) + strip_alpha_from_postcode = bool(self.get_property('strip_alpha_from_postcode', *configs) or False) + add_components = self.get_property('add', *configs) field_map = self.get_property('field_map', *configs) @@ -106,7 +134,7 @@ class OpenAddressesFormatter(object): if not value: continue - validator = self.openaddresses_validators.get(key, None) + validator = self.component_validators.get(key, None) if validator is not None and not validator(value): continue @@ -121,14 +149,19 @@ class OpenAddressesFormatter(object): street = components.get(AddressFormatter.ROAD, None) if street is not None: + street = street.strip() + street = AddressComponents.cleaned_name(street) street = abbreviate(street_types_gazetteer, street, language, abbreviate_prob=abbreviate_street_prob, separate_prob=separate_street_prob) components[AddressFormatter.ROAD] = street house_number = components.get(AddressFormatter.HOUSE_NUMBER, None) - if house_number and not is_numeric(house_number): - components.pop(AddressFormatter.HOUSE_NUMBER) + if house_number: + house_number = house_number.strip() + + if not (street and house_number): + continue unit = components.get(AddressFormatter.UNIT, None) if unit is not None: @@ -137,6 +170,14 @@ class OpenAddressesFormatter(object): separate_prob=separate_unit_prob) components[AddressFormatter.UNIT] = unit + postcode = components.get(AddressFormatter.POSTCODE, None) + if postcode and postcode.strip() is not None and strip_alpha_from_postcode: + postcode = six.u('').join((c for c in safe_decode(postcode) if not c.isalpha())).strip() + if postcode: + components[AddressFormatter.POSTCODE] = postcode + else: + components.pop(AddressFormatter.POSTCODE) + country_name = self.cldr_country_name(country, language, configs) if country_name: components[AddressFormatter.COUNTRY] = country_name @@ -146,6 +187,14 @@ class OpenAddressesFormatter(object): if k not in components: components[k] = v + address_state = self.components.state_name(components, country, language) + if address_state: + components[AddressFormatter.STATE] = address_state + + if add_osm_boundaries: + osm_components = self.osm_reverse_geocoded_components(latitude, longitude) + self.components.add_admin_boundaries(components, osm_components, country, language) + formatted = self.formatter.format_address(components, country, language=language, tag_components=tag_components) yield (language, country, formatted) @@ -161,11 +210,11 @@ class OpenAddressesFormatter(object): i = 0 for country, config in six.iteritems(self.country_configs): - for file_props in config.get('files', []): - filename = file_props['filename'] + for file_config in config.get('files', []): + filename = file_config['filename'] path = os.path.join(base_dir, country, filename) - configs = (file_props, config, self.config) + configs = (file_config, config, self.config) for language, country, formatted_address in self.formatted_addresses(path, configs, tag_components=tag_components): if not formatted_address or not formatted_address.strip(): continue @@ -185,12 +234,12 @@ class OpenAddressesFormatter(object): print('did {} formatted addresses'.format(i)) for subdir, subdir_config in six.iteritems(config.get('subdirs', {})): - for file_props in subdir_config.get('files', []): - filename = file_props['filename'] + for file_config in subdir_config.get('files', []): + filename = file_config['filename'] path = os.path.join(base_dir, country, subdir, filename) - configs = (file_props, subdir_config, config, self.config) + configs = (file_config, subdir_config, config, self.config) for language, country, formatted_address in self.formatted_addresses(path, configs, tag_components=tag_components): if not formatted_address or not formatted_address.strip(): continue