[openaddresses] Adding ability to use OSM boundaries for OpenAddresses (not turned on by default), cleaning up street names, requiring at least house number and street, validating house number to provide some assurance that it's not a badly-formatted NULL value, adding ability to strip letters from postcode for data sets like New York's statewide where there are some codes attached.

This commit is contained in:
Al
2016-08-22 22:07:34 -04:00
parent 8b57a7acf2
commit 23be122d2e

View File

@@ -9,6 +9,7 @@ from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_t
from geodata.address_formatting.formatter import AddressFormatter
from geodata.addresses.components import AddressComponents
from geodata.countries.names import country_names
from geodata.encoding import safe_decode
from geodata.math.sampling import cdf, weighted_choice
from geodata.text.utils import is_numeric
@@ -23,17 +24,10 @@ OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tag
OPENADDRESS_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
def validate_postcode(postcode):
    '''
    Report whether a postcode looks like real data.

    A postcode made up solely of '0' characters is treated as an
    improperly-formatted NULL value and is rejected, as is an empty one;
    anything containing at least one other character is accepted.
    '''
    return any(ch != '0' for ch in postcode)
class OpenAddressesFormatter(object):
openaddresses_validators = {
AddressFormatter.POSTCODE: validate_postcode
}
def __init__(self, language_rtree):
self.language_rtree = language_rtree
def __init__(self, components):
self.components = components
self.language_rtree = components.language_rtree
config = yaml.load(open(OPENADDRESSES_PARSER_DATA_CONFIG))
self.config = config['global']
@@ -41,6 +35,37 @@ class OpenAddressesFormatter(object):
self.formatter = AddressFormatter()
class validators:
    '''
    Namespace for per-component sanity checks. Each check takes the raw
    string value of one address component and returns True when the value
    looks usable, False when it looks like a badly-formatted NULL.
    '''

    @classmethod
    def validate_postcode(cls, postcode):
        '''
        Postcodes that are all zeros are improperly-formatted NULL values
        '''
        return not all((c == '0' for c in postcode))

    @classmethod
    def validate_house_number(cls, house_number):
        '''
        House number doesn't necessarily have to be numeric, but in some of the
        OpenAddresses data sets the house number field is equal to the capitalized
        street name, so this at least provides protection against insane values
        for house number at the cost of maybe missing a few houses numbered "A", etc.

        Also OpenAddresses primarily comes from county GIS servers, etc. which use
        a variety of database schemas and don't always handle NULLs very well.
        A value made up only of zeros (including a single "0") is therefore
        rejected: zero can be a legitimate house number, but in OpenAddresses
        it's more likely an error.

        Surrounding whitespace is ignored, so a padded value such as " 00 " is
        rejected the same way as "00". Always returns a bool.
        '''
        # Validate the stripped value; otherwise padding hides an all-zero NULL
        house_number = house_number.strip()
        if not house_number:
            return False

        if all(c == '0' for c in house_number):
            return False

        return bool(is_numeric(house_number))
# Maps an address component key to the check its raw value must pass;
# a value whose check returns a falsy result is skipped by the caller.
component_validators = {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
AddressFormatter.POSTCODE: validators.validate_postcode,
}
def get_property(self, key, *configs):
for config in configs:
value = config.get(key, None)
@@ -77,6 +102,9 @@ class OpenAddressesFormatter(object):
abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
strip_alpha_from_postcode = bool(self.get_property('strip_alpha_from_postcode', *configs) or False)
add_components = self.get_property('add', *configs)
field_map = self.get_property('field_map', *configs)
@@ -106,7 +134,7 @@ class OpenAddressesFormatter(object):
if not value:
continue
validator = self.openaddresses_validators.get(key, None)
validator = self.component_validators.get(key, None)
if validator is not None and not validator(value):
continue
@@ -121,14 +149,19 @@ class OpenAddressesFormatter(object):
street = components.get(AddressFormatter.ROAD, None)
if street is not None:
street = street.strip()
street = AddressComponents.cleaned_name(street)
street = abbreviate(street_types_gazetteer, street, language,
abbreviate_prob=abbreviate_street_prob,
separate_prob=separate_street_prob)
components[AddressFormatter.ROAD] = street
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
if house_number and not is_numeric(house_number):
components.pop(AddressFormatter.HOUSE_NUMBER)
if house_number:
house_number = house_number.strip()
if not (street and house_number):
continue
unit = components.get(AddressFormatter.UNIT, None)
if unit is not None:
@@ -137,6 +170,14 @@ class OpenAddressesFormatter(object):
separate_prob=separate_unit_prob)
components[AddressFormatter.UNIT] = unit
postcode = components.get(AddressFormatter.POSTCODE, None)
if postcode and postcode.strip() is not None and strip_alpha_from_postcode:
postcode = six.u('').join((c for c in safe_decode(postcode) if not c.isalpha())).strip()
if postcode:
components[AddressFormatter.POSTCODE] = postcode
else:
components.pop(AddressFormatter.POSTCODE)
country_name = self.cldr_country_name(country, language, configs)
if country_name:
components[AddressFormatter.COUNTRY] = country_name
@@ -146,6 +187,14 @@ class OpenAddressesFormatter(object):
if k not in components:
components[k] = v
address_state = self.components.state_name(components, country, language)
if address_state:
components[AddressFormatter.STATE] = address_state
if add_osm_boundaries:
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
self.components.add_admin_boundaries(components, osm_components, country, language)
formatted = self.formatter.format_address(components, country,
language=language, tag_components=tag_components)
yield (language, country, formatted)
@@ -161,11 +210,11 @@ class OpenAddressesFormatter(object):
i = 0
for country, config in six.iteritems(self.country_configs):
for file_props in config.get('files', []):
filename = file_props['filename']
for file_config in config.get('files', []):
filename = file_config['filename']
path = os.path.join(base_dir, country, filename)
configs = (file_props, config, self.config)
configs = (file_config, config, self.config)
for language, country, formatted_address in self.formatted_addresses(path, configs, tag_components=tag_components):
if not formatted_address or not formatted_address.strip():
continue
@@ -185,12 +234,12 @@ class OpenAddressesFormatter(object):
print('did {} formatted addresses'.format(i))
for subdir, subdir_config in six.iteritems(config.get('subdirs', {})):
for file_props in subdir_config.get('files', []):
filename = file_props['filename']
for file_config in subdir_config.get('files', []):
filename = file_config['filename']
path = os.path.join(base_dir, country, subdir, filename)
configs = (file_props, subdir_config, config, self.config)
configs = (file_config, subdir_config, config, self.config)
for language, country, formatted_address in self.formatted_addresses(path, configs, tag_components=tag_components):
if not formatted_address or not formatted_address.strip():
continue