From bff4fa27f59506ece048a6f2245c94b558ec0a67 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 16 May 2016 11:12:53 -0400 Subject: [PATCH] [formatting] New formatter config including random component order changes and default/per-country admin component ordering --- resources/formatting/global.yaml | 285 ++++++++++++++++ .../geodata/address_formatting/formatter.py | 305 ++++++++++++++---- 2 files changed, 522 insertions(+), 68 deletions(-) create mode 100644 resources/formatting/global.yaml diff --git a/resources/formatting/global.yaml b/resources/formatting/global.yaml new file mode 100644 index 00000000..655662ef --- /dev/null +++ b/resources/formatting/global.yaml @@ -0,0 +1,285 @@ +global: + # Add these components to templates that don't have them + admin_components: + subdivision: + after: + - road + before: + - suburb + - city_district + - city + - island + - state_district + - state + - postcode + - country + suburb: + after: + - road + - subdivision + before: + - city_district + - city + - island + - state_district + - state + - postcode + - country + city_district: + after: + - road + - suburb + before: + - city + - island + - state_district + - state + - postcode + - country + # This is added to all the templates but only makes it in + island: + after: + - road + - suburb + - city_district + - city + before: + - state_district + - state + - country + state_district: + after: + - suburb + - city_district + - city + - island + before: + - state + - country + state: + after: + - suburb + - city_district + - city + - island + - state_district + before: + - country + country: + after: + - suburb + - city_district + - city + - island + - state_district + - state + - postcode + + insertions: + # For each component, insertions are mutually exclusive + # They don't have to sum to 1 (especially for components + # likely to be found in most addresses) + postcode: + postcode_before_city: + before: city + probability: 0.0001 + + postcode_after_city: + after: 
city + probability: 0.0001 + + postcode_before_city_district: + before: city_district + probability: 0.0001 + + postcode_before_suburb: + before: suburb + probability: 0.0001 + + postcode_before_state_district: + before: state + probability: 0.0001 + + postcode_before_state: + before: state + probability: 0.0001 + + postcode_before_country: + before: country + probability: 0.05 + + postcode_after_country: + after: country + probability: 0.01 + + postcode_first: + first: true + probability: 0.001 + + postcode_last: + last: true + probability: 0.01 + + # PO Box should be the same in most countries + po_box: + po_box_before_city: + before: city + probability: 0.7 + + po_box_after_house: + after: house + probability: 0.2 + + po_box_first: + first: true + probability: 0.1 + +# Overrides for languages (better for e.g. covering all French-speaking countries) +languages: + en: + insertions: + level: + # e.g. 123 East 45th St, 6th Floor, NYC + level_after_road: + after: road + probability: 0.5 + # e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London + level_before_house: + before: house + probability: 0.25 + # e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London + level_before_road: + before: road + probability: 0.25 + + unit: + # e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London + unit_before_house: + before: house + probability: 0.2 + + # e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London + unit_before_house_number: + before: house_number + probability: 0.6 + + # e.g. Da Vinci House, 44 Saffron Hill, Flat 18, London (not as common in UK) + unit_after_road: + after: road + probability: 0.1 + + # e.g. Floor 5, Apt 6 + unit_after_level: + after: level + probability: 0.09 + + # e.g. Apt. 6, 5/F (less common) + unit_before_level: + before: level + probability: 0.01 + + es: + insertions: + level: + # e.g. Calle Ruiz de Alarcón 23 piso 3 + level_after_house_number: + after: house_number + probability: 0.8 + # e.g. 
Piso 3, Museo del Prado, Calle Ruiz de Alarcón 23 + level_before_house: + before: house + probability: 0.1 + # e.g. Museo del Prado, Bajos, Calle Ruiz de Alarcón 23 + level_before_road: + before: road + probability: 0.1 + + unit: + unit_before_house: + before: house + probability: 0.05 + unit_before_house_number: + before: house_number + probability: 0.05 + # e.g. Piso 3 Dpto 12 (most common) + unit_after_level: + after: level + probability: 0.8 + # e.g. Apto 6, 2o piso (less common) + unit_before_level: + before: level + probability: 0.1 + + fr: + # libpostal issue #27 + insertions: + city: + city_before_road: + before: road + probability: 0.001 + + +countries: + # Hungary, e.g. 1075, Budapest Kazinczy utca 14 + hu: + insertions: + postcode: + postcode_before_city: + probability: 0.5 + + # Malaysia (islands are bigger than states) + my: + admin_components: + island: + after: + - road + - suburb + - city_district + - city + - state_district + - state + before: + - country + + us: + insertions: + level: + # e.g. 123 East 45th St, 6th Floor, NYC + level_after_road: + after: road + probability: 0.75 + # e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London + level_before_house: + before: house + probability: 0.125 + # e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London + level_before_road: + before: road + probability: 0.125 + + unit: + # e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London + unit_before_house: + before: house + probability: 0.05 + + # e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London + unit_before_house_number: + before: house_number + probability: 0.05 + + # e.g. Da Vinci House, 44 Saffron Hill, Flat 18, London (not as common in UK) + unit_after_road: + after: road + probability: 0.8 + + # e.g. Floor 5, Apt 6 + unit_after_level: + after: level + probability: 0.09 + + # e.g. Apt. 
6, 5/F (less common) + unit_before_level: + before: level + probability: 0.01 + diff --git a/scripts/geodata/address_formatting/formatter.py b/scripts/geodata/address_formatting/formatter.py index 87366d73..0b85ea1a 100644 --- a/scripts/geodata/address_formatting/formatter.py +++ b/scripts/geodata/address_formatting/formatter.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import copy import os import pystache import re @@ -7,6 +8,9 @@ import subprocess import yaml from geodata.address_formatting.aliases import Aliases +from geodata.configs.utils import nested_get, recursive_merge +from geodata.math.floats import isclose +from geodata.math.sampling import weighted_choice, cdf from geodata.text.tokenize import tokenize, tokenize_raw, token_types from geodata.encoding import safe_decode from collections import OrderedDict @@ -14,6 +18,11 @@ from itertools import ifilter FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting' +this_dir = os.path.realpath(os.path.dirname(__file__)) + +FORMATTER_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, + 'resources', 'formatting', 'global.yaml') + class AddressFormatter(object): ''' @@ -50,6 +59,7 @@ class AddressFormatter(object): UNIT = 'unit' INTERSECTION = 'intersection' ROAD = 'road' + SUBDIVISION = 'subdivision' SUBURB = 'suburb' CITY_DISTRICT = 'city_district' CITY = 'city' @@ -73,6 +83,7 @@ class AddressFormatter(object): INTERSECTION, ROAD, SUBURB, + SUBDIVISION, CITY, CITY_DISTRICT, ISLAND, @@ -111,58 +122,139 @@ class AddressFormatter(object): (ROAD, POSTCODE) ] + FIRST, BEFORE, AFTER, LAST = range(4) + def __init__(self, scratch_dir='/tmp', splitter=None): if splitter is not None: self.splitter = splitter self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting') self.clone_repo() + self.load_config() + self.load_country_config() + + self.setup_insertion_probabilities() + + self.template_cache = {} def clone_repo(self): subprocess.check_call(['rm', '-rf', 
self.formatter_repo_path]) subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path]) - def load_config(self): + def load_country_config(self): config = yaml.load(open(os.path.join(self.formatter_repo_path, - 'conf/countries/worldwide.yaml'))) - for key, value in config.items(): + 'conf', 'countries', 'worldwide.yaml'))) + for key in list(config): + country = key + language = None + if '_' in key: + country, language = country.split('_', 1) + value = config[country] if hasattr(value, 'items'): address_template = value.get('address_template') + if not address_template and 'use_country' in value: + # Temporary fix for Norway territories (NO unquoted is a boolean) and recursive references + if value['use_country'] in (country, False): + continue + address_template = config[value['use_country']]['address_template'] + if address_template: - value['address_template'] = self.add_postprocessing_tags(address_template) + value['address_template'] = self.add_postprocessing_tags(address_template, country, language=language) post_format_replacements = value.get('postformat_replace') if post_format_replacements: value['postformat_replace'] = [[pattern, replacement.replace('$', '\\')] for pattern, replacement in post_format_replacements] else: address_template = value - config[key] = self.add_postprocessing_tags(value) - self.config = config + config[country] = self.add_postprocessing_tags(value, country, language=language) + self.country_formats = config + + def load_config(self): + config = yaml.load(open(FORMATTER_CONFIG)) + self.config = config.get('global', {}) + language_configs = config.get('languages', {}) + + self.language_configs = {} + for language in language_configs: + language_config = language_configs[language] + config_copy = copy.deepcopy(self.config) + self.language_configs[language] = recursive_merge(config_copy, language_config) + + country_configs = config.get('countries', {}) + + self.country_configs = {} + for country in 
country_configs: + country_config = country_configs[country] + config_copy = copy.deepcopy(self.config) + self.country_configs[country] = recursive_merge(config_copy, country_config) + + def get_property(self, keys, country, language=None, default=None): + if isinstance(keys, six.string_types): + keys = keys.split('.') + keys = tuple(keys) + value = nested_get(self.language_configs, (language,) + keys, default=default) + if not value: + value = nested_get(self.country_configs, (country,) + keys, default=default) + if not value: + value = nested_get(self.config, keys, default=default) + return value + + def get_admin_components(self, country, language=None): + admin_components = self.get_property('admin_components', country, language=language, default={}) + return [(key, value.get('after', ()), value.get('before', ())) for key, value in six.iteritems(admin_components)] + + def insertion_probs(self, config): + component_insertions = {} + for component, insertions in six.iteritems(config): + values = [] + probs = [] + for k, v in six.iteritems(insertions): + if 'before' in v: + val = (self.BEFORE, v['before']) + elif 'after' in v: + val = (self.AFTER, v['after']) + elif 'last' in v: + val = (self.LAST, None) + elif 'first' in v: + val = (self.FIRST, None) + else: + raise ValueError('Insertions must contain one of {first, before, after, last}') + + prob = v['probability'] + values.append(val) + probs.append(prob) + + # If the probabilities don't sum to 1, add a "do nothing" action + if not isclose(sum(probs), 1.0): + probs.append(1.0 - sum(probs)) + values.append((None, None)) + + component_insertions[component] = values, cdf(probs) + return component_insertions + + def setup_insertion_probabilities(self): + self.global_insertions = self.insertion_probs(self.config['insertions']) + + self.country_insertions = {} + + for country, config in six.iteritems(self.country_configs): + if 'insertions' in config: + self.country_insertions[country.lower()] = 
self.insertion_probs(config['insertions']) + + self.language_insertions = {} + + for language, config in six.iteritems(self.language_configs): + if 'insertions' in config: + self.language_insertions[language.lower()] = self.insertion_probs(config['insertions']) def country_template(self, c): - return self.config.get(c, self.config['default']) + return self.country_formats.get(c, self.country_formats['default']) - postprocessing_tags = [ - (SUBURB, (ROAD,), (CITY_DISTRICT, CITY, ISLAND, STATE_DISTRICT, STATE, POSTCODE, COUNTRY)), - (CITY_DISTRICT, (ROAD, SUBURB), (CITY, ISLAND, STATE_DISTRICT, STATE)), - (STATE_DISTRICT, (SUBURB, CITY_DISTRICT, CITY, ISLAND), (STATE,)), - (STATE, (SUBURB, CITY_DISTRICT, CITY, ISLAND, STATE_DISTRICT), (COUNTRY,)), - ] - - template_tag_replacements = [ - ('county', STATE_DISTRICT), - ] - - def is_reverse(self, key, template): + def is_reverse(self, template): address_parts_match = self.template_address_parts_re.search(template) admin_parts_match = list(self.template_admin_parts_re.finditer(template)) - if not address_parts_match: - raise ValueError('Template for {} does not contain any address parts'.format(key)) - elif not admin_parts_match: - raise ValueError('Template for {} does not contain any admin parts'.format(key)) - # last instance of city/state/country occurs before the first instance of house_number/road return admin_parts_match[-1].start() < address_parts_match.start() @@ -170,25 +262,24 @@ class AddressFormatter(object): """ For constructing """ return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys])) - def insert_component(self, template, tag, before=(), after=(), separate=True, is_reverse=False): - if not before and not after: + def insert_component(self, template, tag, before=None, after=None, first=False, last=False, separate=True, is_reverse=False): + if not before and not after and not first and not last: return + template = template.rstrip() + 
tag_match = re.compile('\{{{key}\}}'.format(key=tag)).search(template) if before: - before_match = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in before])).search(template) + before_match = re.compile('\{{{key}\}}'.format(key=before)).search(template) if before_match and tag_match and before_match.start() > tag_match.start(): return template if after: - after_match = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in after])).search(template) + after_match = re.compile('\{{{key}\}}'.format(key=after)).search(template) if after_match and tag_match and tag_match.start() > after_match.start(): return template - before = set(before) - after = set(after) - key_added = False skip_next_non_token = False new_components = [] @@ -201,7 +292,7 @@ class AddressFormatter(object): if hasattr(el, 'parsed'): keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key')] - if set(keys) & before and not key_added: + if (before in set(keys) or first) and not key_added: token = new_components[-1] if new_components and '{' not in new_components[-1] else '\n' new_components.extend([tag_token, token]) key_added = True @@ -214,7 +305,7 @@ class AddressFormatter(object): new_components.pop() continue - if set(keys) & after and not key_added: + if (after in set(keys) or i == num_tokens - 1) and not key_added: token = '\n' if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types): token = parsed._parse_tree[i + 1] @@ -226,7 +317,7 @@ class AddressFormatter(object): skip_next_non_token = True continue - if el.key in before and not key_added: + if (el.key == before or first) and not key_added: token = '\n' if new_components and '{' not in new_components[-1]: token = new_components[-1] @@ -235,7 +326,7 @@ class AddressFormatter(object): new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key)) - if el.key in after and not key_added: + if (el.key == after or i == num_tokens - 1) and not key_added: token = '\n' if i < num_tokens - 1 and 
isinstance(parsed._parse_tree[i + 1], six.string_types): token = parsed._parse_tree[i + 1] @@ -244,30 +335,44 @@ class AddressFormatter(object): elif not skip_next_non_token: new_components.append(el) + if i == num_tokens - 1 and not key_added: + key_added = True + new_components.append(tag_token) + skip_next_non_token = False return ''.join(new_components) - def add_postprocessing_tags(self, template): + def add_postprocessing_tags(self, template, country, language=None): is_reverse = self.is_reverse(template) - for key, pre_keys, post_keys in self.postprocessing_tags: - key_included = key in template + for key, pre_keys, post_keys in self.get_admin_components(country, language=language): + key_tag = six.u('{{{{{{{key}}}}}}}').format(key=key) + + key_included = key_tag in template new_components = [] if key_included: continue - for line in template.split('\n'): - pre_key = re.compile('|'.join(pre_keys)).search(line) - post_key = re.compile('|'.join(post_keys)).search(line) + pre_key_regex = re.compile('|'.join(['{{{}}}'.format(k) for k in pre_keys])) + post_key_regex = re.compile('|'.join(['{{{}}}'.format(k) for k in post_keys])) + + for line in template.split(six.u('\n')): + if not line.strip(): + continue + pre_key = pre_keys and pre_key_regex.search(line) + post_key = post_keys and post_key_regex.search(line) if post_key and not pre_key and not key_included: if not is_reverse: - new_components.append(u'{{{{{{{key}}}}}}}'.format(key=key)) + new_components.append(key_tag) key_included = True + new_components.append(line.rstrip('\n')) if post_key and not pre_key and not key_included and is_reverse: - new_components.append(u'{{{{{{{key}}}}}}}'.format(key=key)) + new_components.append(key_tag) key_included = True - template = u'\n'.join(new_components) + if not post_keys and not key_included: + new_components.append(key_tag) + template = six.u('\n').join(new_components) return template def render_template(self, template, components, tagged=False): @@ -298,15 +403,6 
@@ class AddressFormatter(object): return True return False - def apply_replacements(self, template, components): - if not template.get('replace'): - return - for key in components.keys(): - value = components[key] - for regex, replacement in template['replace']: - value = re.sub(regex, replacement, value) - components[key] = value - def post_replacements(self, template, text): components = [] seen = set() @@ -322,10 +418,64 @@ class AddressFormatter(object): text = re.sub(regex, replacement, text) return text + def revised_template(self, components, country, language=None): + template = self.get_template(country, language=language) + if not template or 'address_template' not in template: + return None + + country = country.lower() + + template = template['address_template'] + + cache_keys = [] + + for component in components: + scope = country + insertions = nested_get(self.country_insertions, (country, component), default=None) + + if insertions is None and language: + country_language = '{}_{}'.format(country, language) + insertions = nested_get(self.country_insertions, (country_language, component), default=None) + scope = country_language + + if insertions is None and language: + insertions = nested_get(self.language_insertions, (language, component), default=None) + scope = language + + if insertions is None: + insertions = nested_get(self.global_insertions, (component,), default=None) + scope = None + + if insertions is not None: + values, probs = insertions + order, other = weighted_choice(values, probs) + + insertion_id = (scope, component, order, other) + cache_keys.append(insertion_id) + + cache_key = tuple(sorted(cache_keys)) + + if cache_key in self.template_cache: + template = self.template_cache[cache_key] + continue + + if order == self.BEFORE and other in components: + template = self.insert_component(template, component, before=other) + elif order == self.AFTER and other in components: + template = self.insert_component(template, component, 
after=other) + elif order == self.LAST: + template = self.insert_component(template, component, last=True) + elif order == self.FIRST: + template = self.insert_component(template, component, first=True) + else: + continue + + self.template_cache[cache_key] = template + + return template + def tag_template_separators(self, template): - template = re.sub(r'},', '}} ,/{} '.format(self.separator_tag), template) - template = re.sub(r'}-', '}} -/{} '.format(self.separator_tag), template) - template = re.sub(r' - ', ' -/{} '.format(self.separator_tag), template) + template = re.sub(r'}\s*([,\-;])\s*', r'}} \1/{} '.format(self.separator_tag), template) return template def strip_component(self, value, tagged=False): @@ -374,28 +524,47 @@ class AddressFormatter(object): else: end = num_tokens - j - 1 - return u' '.join(tokens[start:end]) + return six.u(' ').join(tokens[start:end]) + + def get_template(self, country, language=None): + template = None + if language: + # For countries like China and Japan where the country format varies + # based on which language is being used + template = self.country_formats.get('{}_{}'.format(country.upper(), language.lower()), None) + + if not template: + template = self.country_formats.get(country.upper()) - def format_address(self, country, components, - minimal_only=True, tag_components=True, replace_aliases=True, - template_replacements=False): - template = self.config.get(country.upper()) if not template: return None - template_text = template['address_template'] + + use_country = template.get('use_country') + if use_country and use_country.upper() in self.country_formats: + template = self.country_formats[use_country.upper()] + + if 'address_template' not in template: + return None + + return template + + def format_address(self, country, components, language=None, + minimal_only=True, tag_components=True, replace_aliases=True): + template = self.get_template(country, language=language) + if not template: + return None + + 
template_text = self.revised_template(components, country, language=language) if replace_aliases: - self.replace_aliases(components) + self.aliases.replace(components) if minimal_only and not self.minimal_components(components): return None - if template_replacements: - self.apply_replacements(template, components) - if tag_components: template_text = self.tag_template_separators(template_text) - components = {k: u' '.join([u'{}/{}'.format(t.replace(' ', ''), k.replace(' ', '_')) - for t, c in tokenize(v)]) + components = {k: six.u(' ').join([six.u('{}/{}').format(t.replace(' ', ''), k.replace(' ', '_')) + for t, c in tokenize(v)]) for k, v in components.iteritems()} text = self.render_template(template_text, components, tagged=tag_components)