# File: scripts/geodata/openaddresses/formatter.py
# Origin: commit 1d80d8b6b8c366efecbe2b1a83b76e16b3720ac8
#         (Al, Tue, 31 May 2016 01:41:16 -0400)
#         "[openaddresses] OpenAddresses address formatter, using the config"
"""Build address-parser training data from OpenAddresses CSV files.

The per-country/per-file behavior (which CSV column maps to which address
component, abbreviation probabilities, ...) is read from the YAML config at
OPENADDRESSES_PARSER_DATA_CONFIG.
"""
import csv
import os
import six
import yaml

from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.gazeteers import street_types_gazetteer, unit_types_gazetteer
from geodata.address_formatting.formatter import AddressFormatter
from geodata.addresses.components import AddressComponents

# NOTE(review): unicode_csv_reader was called without ever being imported.
# It is assumed to live next to tsv_string in geodata.csv_utils -- confirm.
from geodata.csv_utils import tsv_string, unicode_csv_reader

this_dir = os.path.realpath(os.path.dirname(__file__))

OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                                'resources', 'parser', 'data_sets', 'openaddresses.yaml')

OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
OPENADDRESS_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'


class OpenAddressesFormatter(object):
    """Turns OpenAddresses CSV rows into formatted address strings."""

    def __init__(self, language_rtree):
        """Load the OpenAddresses config and set up the address formatter.

        :param language_rtree: object exposing
            ``country_and_languages(latitude, longitude)``, used to find the
            country and candidate languages for each row.
        """
        self.language_rtree = language_rtree

        # safe_load: the config is plain data, and a bare yaml.load(stream)
        # is rejected by PyYAML >= 6 (Loader is mandatory there). The
        # ``with`` block also closes the handle the original leaked.
        with open(OPENADDRESSES_PARSER_DATA_CONFIG) as config_file:
            config = yaml.safe_load(config_file)

        self.config = config['global']
        self.country_configs = config['countries']

        self.formatter = AddressFormatter()

    def get_property(self, key, *configs):
        """Return the first non-None value for ``key`` among ``configs``.

        Configs are consulted in the order given (most specific first);
        returns None when no config defines the key.
        """
        for config in configs:
            value = config.get(key, None)
            if value is not None:
                return value
        return None

    @staticmethod
    def validate_postcode(postcode):
        """Return False for postcodes made up only of '0' characters."""
        return not all((c == '0' for c in postcode))

    # Inside the class body ``validate_postcode`` is still the raw
    # staticmethod wrapper, which is not callable before Python 3.10.
    # Store the underlying function so ``validator(value)`` works everywhere.
    openaddresses_validators = {
        AddressFormatter.POSTCODE: validate_postcode.__func__
    }

    def formatted_addresses(self, path, configs, tag_components=True):
        """Yield one formatted address string per usable row of a CSV file.

        :param path: path of the OpenAddresses CSV file to read.
        :param configs: config dicts, most specific first, searched with
            :meth:`get_property`.
        :param tag_components: passed through to the address formatter.
        :raises ValueError: if the header row has no 'LAT' or 'LON' column.
        """
        for _language, formatted in self._formatted_address_rows(path, configs,
                                                                 tag_components=tag_components):
            yield formatted

    def _formatted_address_rows(self, path, configs, tag_components=True):
        """Yield ``(language, formatted_address)`` for each usable CSV row."""
        field_map = self.get_property('field_map', *configs)
        if not field_map:
            return

        field_map = {f['field_name']: f['component'] for f in field_map}

        # All four probabilities fall back to 0.0 so that a missing config
        # key never hands None to abbreviate().
        abbreviate_street_prob = self.get_property('abbreviate_street_probability', *configs) or 0.0
        separate_street_prob = self.get_property('separate_street_probability', *configs) or 0.0
        abbreviate_unit_prob = self.get_property('abbreviate_unit_probability', *configs) or 0.0
        separate_unit_prob = self.get_property('separate_unit_probability', *configs) or 0.0

        # NOTE(review): the original tested ``if not language`` without ever
        # assigning it; an optional 'language' config override is the
        # presumed intent -- confirm the key name against the YAML config.
        config_language = self.get_property('language', *configs)

        with open(path) as f:
            reader = unicode_csv_reader(f)
            # next(reader, None) works on Python 2 and 3 and tolerates an
            # empty file (reader.next() was Python 2 only).
            headers = next(reader, None)
            if not headers:
                return

            header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
            latitude_index = headers.index('LAT')
            longitude_index = headers.index('LON')

            for row in reader:
                try:
                    latitude = float(row[latitude_index])
                    longitude = float(row[longitude_index])
                except (IndexError, ValueError, TypeError):
                    # Truncated row or unparseable coordinates: skip it.
                    continue

                components = self._row_components(row, header_indices)
                if not components:
                    continue

                country, candidate_languages, _ = self.language_rtree.country_and_languages(latitude, longitude)
                if not (country and candidate_languages):
                    continue

                # Resolved per row, so a language detected for one row never
                # leaks into the rows that follow.
                language = config_language or AddressComponents.address_language(components,
                                                                                 candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    components[AddressFormatter.ROAD] = abbreviate(street_types_gazetteer, street, language,
                                                                   abbreviate_prob=abbreviate_street_prob,
                                                                   separate_prob=separate_street_prob)

                unit = components.get(AddressFormatter.UNIT, None)
                if unit is not None:
                    # The original computed the abbreviated unit but never
                    # stored it back into the components.
                    components[AddressFormatter.UNIT] = abbreviate(unit_types_gazetteer, unit, language,
                                                                   abbreviate_prob=abbreviate_unit_prob,
                                                                   separate_prob=separate_unit_prob)

                formatted = self.formatter.format_address(components, country,
                                                          language=language, tag_components=tag_components)
                yield language, formatted

    def _row_components(self, row, header_indices):
        """Map one CSV row to ``{component: value}``, dropping empty or invalid values."""
        components = {}
        for i, key in six.iteritems(header_indices):
            if i >= len(row):
                # Row is shorter than the header row.
                continue

            value = row[i].strip()
            if not value:
                continue

            validator = self.openaddresses_validators.get(key, None)
            if validator is not None and not validator(value):
                continue

            components[key] = value
        return components

    def _training_files(self, base_dir):
        """Yield ``(country, path, configs)`` for every configured input file."""
        for country, config in six.iteritems(self.country_configs):
            for file_props in config.get('files', []):
                path = os.path.join(base_dir, country, file_props['filename'])
                yield country, path, (file_props, config, self.config)

            for subdir, subdir_config in six.iteritems(config.get('subdirs', {})):
                # The original iterated config['files'] here and left
                # ``subdir`` unused.
                # NOTE(review): assumes subdir files live under
                # <base_dir>/<country>/<subdir>/ -- confirm the on-disk layout.
                for file_props in subdir_config.get('files', []):
                    path = os.path.join(base_dir, country, subdir, file_props['filename'])
                    yield country, path, (file_props, subdir_config, config, self.config)

    def build_training_data(self, base_dir, out_dir, tag_components=True):
        """Write formatted addresses for every configured file to a TSV in ``out_dir``.

        With ``tag_components`` each output row is
        ``(language, country, formatted_address)``; otherwise it is just
        ``(formatted_address,)``. ``country`` is the key of the country's
        entry in the config, which is also its directory under ``base_dir``.

        :param base_dir: root directory holding one directory per country.
        :param out_dir: directory in which the output TSV is created.
        :param tag_components: selects the tagged or untagged output format.
        """
        if tag_components:
            out_filename = OPENADDRESS_FORMAT_DATA_TAGGED_FILENAME
        else:
            out_filename = OPENADDRESS_FORMAT_DATA_FILENAME

        num_written = 0

        with open(os.path.join(out_dir, out_filename), 'w') as out_file:
            # NOTE(review): the 'tsv_no_quote' dialect is presumably
            # registered when geodata.csv_utils is imported -- confirm.
            writer = csv.writer(out_file, 'tsv_no_quote')

            for country, path, configs in self._training_files(base_dir):
                rows = self._formatted_address_rows(path, configs, tag_components=tag_components)
                for language, formatted_address in rows:
                    # Skip empty results (the original guard was inverted
                    # and skipped every non-empty address instead).
                    if not formatted_address or not formatted_address.strip():
                        continue

                    formatted_address = tsv_string(formatted_address)
                    if not formatted_address or not formatted_address.strip():
                        continue

                    if tag_components:
                        row = (language, country, formatted_address)
                    else:
                        row = (formatted_address,)

                    writer.writerow(row)

                    num_written += 1
                    if num_written % 1000 == 0:
                        print('did {} formatted addresses'.format(num_written))