[formatting] New formatter config including random component order changes and default/per-country admin component ordering

2016-05-16 11:12:53 -04:00
parent 8b9e351961
commit 334f22a41c
2 changed files with 522 additions and 68 deletions
--- a/resources/formatting/global.yaml
+++ b/resources/formatting/global.yaml
@@ -0,0 +1,285 @@
+global:
+    # Add these components to templates that don't have them
+    admin_components:
+        subdivision:
+            after:
+                - road
+            before:
+                - suburb
+                - city_district
+                - city
+                - island
+                - state_district
+                - state
+                - postcode
+                - country
+        suburb:
+            after:
+                - road
+                - subdivision
+            before:
+                - city_district
+                - city
+                - island
+                - state_district
+                - state
+                - postcode
+                - country
+        city_district:
+            after:
+                - road
+                - suburb
+            before:
+                - city
+                - island
+                - state_district
+                - state
+                - postcode
+                - country
+        # This is added to all the templates but only makes it in
+        island:
+            after:
+                - road
+                - suburb
+                - city_district
+                - city
+            before:
+                - state_district
+                - state
+                - country
+        state_district:
+            after:
+                - suburb
+                - city_district
+                - city
+                - island
+            before:
+                - state
+                - country
+        state:
+            after:
+                - suburb
+                - city_district
+                - city
+                - island
+                - state_district
+            before:
+                - country
+        country:
+            after:
+                - suburb
+                - city_district
+                - city
+                - island
+                - state_district
+                - state
+                - postcode
+
+    insertions:
+        # For each component, insertions are mutually exclusive
+        # They don't have to sum to 1 (especially for components
+        # likely to be found in most addresses)
+        postcode:
+            postcode_before_city:
+                before: city
+                probability: 0.0001
+
+            postcode_after_city:
+                after: city
+                probability: 0.0001
+
+            postcode_before_city_district:
+                before: city_district
+                probability: 0.0001
+
+            postcode_before_suburb:
+                before: suburb
+                probability: 0.0001
+
+            postcode_before_state_district:
+                before: state_district
+                probability: 0.0001
+
+            postcode_before_state:
+                before: state
+                probability: 0.0001
+
+            postcode_before_country:
+                before: country
+                probability: 0.05
+
+            postcode_after_country:
+                after: country
+                probability: 0.01
+
+            postcode_first:
+                first: true
+                probability: 0.001
+
+            postcode_last:
+                last: true
+                probability: 0.01
+
+        # PO Box should be the same in most countries
+        po_box:
+            po_box_before_city:
+                before: city
+                probability: 0.7
+
+            po_box_after_house:
+                after: house
+                probability: 0.2
+
+            po_box_first:
+                first: true
+                probability: 0.1
+
+# Overrides for languages (better for e.g. covering all French-speaking countries)
+languages:
+    en:
+        insertions:
+            level:
+                # e.g. 123 East 45th St, 6th Floor, NYC
+                level_after_road:
+                    after: road
+                    probability: 0.5
+                # e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London
+                level_before_house:
+                    before: house
+                    probability: 0.25
+                # e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London
+                level_before_road:
+                    before: road
+                    probability: 0.25
+
+            unit:
+                # e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London
+                unit_before_house:
+                    before: house
+                    probability: 0.2
+
+                # e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London
+                unit_before_house_number:
+                    before: house_number
+                    probability: 0.6
+
+                # e.g. Da Vinci House, 44 Saffron Hill, Flat 18, London (not as common in UK)
+                unit_after_road:
+                    after: road
+                    probability: 0.1
+
+                # e.g. Floor 5, Apt 6
+                unit_after_level:
+                    after: level
+                    probability: 0.09
+
+                # e.g. Apt. 6, 5/F (less common)
+                unit_before_level:
+                    before: level
+                    probability: 0.01
+
+    es:
+        insertions:
+            level:
+                # e.g. Calle Ruiz de Alarcón 23 piso 3
+                level_after_house_number:
+                    after: house_number
+                    probability: 0.8
+                # e.g. Piso 3, Museo del Prado, Calle Ruiz de Alarcón 23
+                level_before_house:
+                    before: house
+                    probability: 0.1
+                # e.g. Museo del Prado, Bajos, Calle Ruiz de Alarcón 23
+                level_before_road:
+                    before: road
+                    probability: 0.1
+
+            unit:
+                unit_before_house:
+                    before: house
+                    probability: 0.05
+                unit_before_house_number:
+                    before: house_number
+                    probability: 0.05
+                # e.g. Piso 3 Dpto 12 (most common)
+                unit_after_level:
+                    after: level
+                    probability: 0.8
+                # e.g. Apto 6, 2o piso (less common)
+                unit_before_level:
+                    before: level
+                    probability: 0.1
+
+    fr:
+        # libpostal issue #27
+        insertions:
+            city:
+                city_before_road:
+                    before: road
+                    probability: 0.001
+
+
+countries:
+    # Hungary, e.g. 1075, Budapest Kazinczy utca 14
+    hu:
+        insertions:
+            postcode:
+                postcode_before_city:
+                    probability: 0.5
+
+    # Malaysia (islands are bigger than states)
+    my:
+        admin_components:
+            island:
+                after:
+                    - road
+                    - suburb
+                    - city_district
+                    - city
+                    - state_district
+                    - state
+                before:
+                    - country
+
+    us:
+        insertions:
+            level:
+                # e.g. 123 East 45th St, 6th Floor, NYC
+                level_after_road:
+                    after: road
+                    probability: 0.75
+                # e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London
+                level_before_house:
+                    before: house
+                    probability: 0.125
+                # e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London
+                level_before_road:
+                    before: road
+                    probability: 0.125
+
+            unit:
+                # e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London
+                unit_before_house:
+                    before: house
+                    probability: 0.05
+
+                # e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London
+                unit_before_house_number:
+                    before: house_number
+                    probability: 0.05
+
+                # e.g. Da Vinci House, 44 Saffron Hill, Flat 18, London (not as common in UK)
+                unit_after_road:
+                    after: road
+                    probability: 0.8
+
+                # e.g. Floor 5, Apt 6
+                unit_after_level:
+                    after: level
+                    probability: 0.09
+
+                # e.g. Apt. 6, 5/F (less common)
+                unit_before_level:
+                    before: level
+                    probability: 0.01
+
--- a/scripts/geodata/address_formatting/formatter.py
+++ b/scripts/geodata/address_formatting/formatter.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import copy
 import os
 import pystache
 import re
@@ -7,6 +8,9 @@ import subprocess
 import yaml

 from geodata.address_formatting.aliases import Aliases
+from geodata.configs.utils import nested_get, recursive_merge
+from geodata.math.floats import isclose
+from geodata.math.sampling import weighted_choice, cdf
 from geodata.text.tokenize import tokenize, tokenize_raw, token_types
 from geodata.encoding import safe_decode
 from collections import OrderedDict
@@ -14,6 +18,11 @@ from itertools import ifilter

 FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting'

+this_dir = os.path.realpath(os.path.dirname(__file__))
+
+FORMATTER_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                'resources', 'formatting', 'global.yaml')
+

 class AddressFormatter(object):
    '''
@@ -50,6 +59,7 @@ class AddressFormatter(object):
    UNIT = 'unit'
    INTERSECTION = 'intersection'
    ROAD = 'road'
+    SUBDIVISION = 'subdivision'
    SUBURB = 'suburb'
    CITY_DISTRICT = 'city_district'
    CITY = 'city'
@@ -73,6 +83,7 @@ class AddressFormatter(object):
        INTERSECTION,
        ROAD,
        SUBURB,
+        SUBDIVISION,
        CITY,
        CITY_DISTRICT,
        ISLAND,
@@ -111,58 +122,139 @@ class AddressFormatter(object):
        (ROAD, POSTCODE)
    ]

+    FIRST, BEFORE, AFTER, LAST = range(4)
+
    def __init__(self, scratch_dir='/tmp', splitter=None):
        if splitter is not None:
            self.splitter = splitter

        self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
        self.clone_repo()
+
        self.load_config()
+        self.load_country_config()
+
+        self.setup_insertion_probabilities()
+
+        self.template_cache = {}

    def clone_repo(self):
        subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
        subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])

-    def load_config(self):
+    def load_country_config(self):
         config = yaml.load(open(os.path.join(self.formatter_repo_path,
-                                'conf/countries/worldwide.yaml')))
-        for key, value in config.items():
+                                'conf', 'countries', 'worldwide.yaml')))
+        for key in list(config):
+            country = key
+            language = None
+            if '_' in key:
+                country, language = country.split('_', 1)
+            value = config[key]  # use the full key so COUNTRY_lang entries are processed themselves, not the bare country's entry again
             if hasattr(value, 'items'):
                 address_template = value.get('address_template')
+                if not address_template and 'use_country' in value:
+                    # Temporary fix for Norway territories (NO unquoted is a boolean) and recursive references
+                    if value['use_country'] in (country, False):
+                        continue
+                    address_template = config[value['use_country']]['address_template']
+
                 if address_template:
-                    value['address_template'] = self.add_postprocessing_tags(address_template)
+                    value['address_template'] = self.add_postprocessing_tags(address_template, country, language=language)

                 post_format_replacements = value.get('postformat_replace')
                 if post_format_replacements:
                     value['postformat_replace'] = [[pattern, replacement.replace('$', '\\')] for pattern, replacement in post_format_replacements]
             else:
                 address_template = value
-                config[key] = self.add_postprocessing_tags(value)
-        self.config = config
+                config[key] = self.add_postprocessing_tags(value, country, language=language)  # write back under the same key that was read
+        self.country_formats = config
+
+    def load_config(self):
+        config = yaml.load(open(FORMATTER_CONFIG))
+        self.config = config.get('global', {})
+        language_configs = config.get('languages', {})
+
+        self.language_configs = {}
+        for language in language_configs:
+            language_config = language_configs[language]
+            config_copy = copy.deepcopy(self.config)
+            self.language_configs[language] = recursive_merge(config_copy, language_config)
+
+        country_configs = config.get('countries', {})
+
+        self.country_configs = {}
+        for country in country_configs:
+            country_config = country_configs[country]
+            config_copy = copy.deepcopy(self.config)
+            self.country_configs[country] = recursive_merge(config_copy, country_config)
+
+    def get_property(self, keys, country, language=None, default=None):
+        if isinstance(keys, six.string_types):
+            keys = keys.split('.')
+        keys = tuple(keys)
+        value = nested_get(self.language_configs, (language,) + keys, default=default)
+        if not value:
+            value = nested_get(self.country_configs, (country,) + keys, default=default)
+        if not value:
+            value = nested_get(self.config, keys, default=default)
+        return value
+
+    def get_admin_components(self, country, language=None):
+        admin_components = self.get_property('admin_components', country, language=language, default={})
+        return [(key, value.get('after', ()), value.get('before', ())) for key, value in six.iteritems(admin_components)]
+
+    def insertion_probs(self, config):
+        component_insertions = {}
+        for component, insertions in six.iteritems(config):
+            values = []
+            probs = []
+            for k, v in six.iteritems(insertions):
+                if 'before' in v:
+                    val = (self.BEFORE, v['before'])
+                elif 'after' in v:
+                    val = (self.AFTER, v['after'])
+                elif 'last' in v:
+                    val = (self.LAST, None)
+                elif 'first' in v:
+                    val = (self.FIRST, None)
+                else:
+                    raise ValueError('Insertions must contain one of {first, before, after, last}')
+
+                prob = v['probability']
+                values.append(val)
+                probs.append(prob)
+
+            # If the probabilities don't sum to 1, add a "do nothing" action
+            if not isclose(sum(probs), 1.0):
+                probs.append(1.0 - sum(probs))
+                values.append((None, None))
+
+            component_insertions[component] = values, cdf(probs)
+        return component_insertions
+
+    def setup_insertion_probabilities(self):
+        self.global_insertions = self.insertion_probs(self.config['insertions'])
+
+        self.country_insertions = {}
+
+        for country, config in six.iteritems(self.country_configs):
+            if 'insertions' in config:
+                self.country_insertions[country.lower()] = self.insertion_probs(config['insertions'])
+
+        self.language_insertions = {}
+
+        for language, config in six.iteritems(self.language_configs):
+            if 'insertions' in config:
+                self.language_insertions[language.lower()] = self.insertion_probs(config['insertions'])

    def country_template(self, c):
-        return self.config.get(c, self.config['default'])
+        return self.country_formats.get(c, self.country_formats['default'])

-    postprocessing_tags = [
-        (SUBURB, (ROAD,), (CITY_DISTRICT, CITY, ISLAND, STATE_DISTRICT, STATE, POSTCODE, COUNTRY)),
-        (CITY_DISTRICT, (ROAD, SUBURB), (CITY, ISLAND, STATE_DISTRICT, STATE)),
-        (STATE_DISTRICT, (SUBURB, CITY_DISTRICT, CITY, ISLAND), (STATE,)),
-        (STATE, (SUBURB, CITY_DISTRICT, CITY, ISLAND, STATE_DISTRICT), (COUNTRY,)),
-    ]
-
-    template_tag_replacements = [
-        ('county', STATE_DISTRICT),
-    ]
-
-    def is_reverse(self, key, template):
+    def is_reverse(self, template):
        address_parts_match = self.template_address_parts_re.search(template)
        admin_parts_match = list(self.template_admin_parts_re.finditer(template))

-        if not address_parts_match:
-            raise ValueError('Template for {} does not contain any address parts'.format(key))
-        elif not admin_parts_match:
-            raise ValueError('Template for {} does not contain any admin parts'.format(key))
-
        # last instance of city/state/country occurs before the first instance of house_number/road
        return admin_parts_match[-1].start() < address_parts_match.start()

@@ -170,25 +262,24 @@ class AddressFormatter(object):
        """ For constructing """
        return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys]))

-    def insert_component(self, template, tag, before=(), after=(), separate=True, is_reverse=False):
-        if not before and not after:
+    def insert_component(self, template, tag, before=None, after=None, first=False, last=False, separate=True, is_reverse=False):
+        if not before and not after and not first and not last:
            return

+        template = template.rstrip()
+
        tag_match = re.compile('\{{{key}\}}'.format(key=tag)).search(template)

        if before:
-            before_match = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in before])).search(template)
+            before_match = re.compile('\{{{key}\}}'.format(key=before)).search(template)
            if before_match and tag_match and before_match.start() > tag_match.start():
                return template

        if after:
-            after_match = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in after])).search(template)
+            after_match = re.compile('\{{{key}\}}'.format(key=after)).search(template)
            if after_match and tag_match and tag_match.start() > after_match.start():
                return template

-        before = set(before)
-        after = set(after)
-
        key_added = False
        skip_next_non_token = False
        new_components = []
@@ -201,7 +292,7 @@ class AddressFormatter(object):

            if hasattr(el, 'parsed'):
                keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key')]
-                if set(keys) & before and not key_added:
+                if (before in set(keys) or first) and not key_added:
                    token = new_components[-1] if new_components and '{' not in new_components[-1] else '\n'
                    new_components.extend([tag_token, token])
                    key_added = True
@@ -214,7 +305,7 @@ class AddressFormatter(object):
                        new_components.pop()
                    continue

-                if set(keys) & after and not key_added:
+                if (after in set(keys) or i == num_tokens - 1) and not key_added:
                    token = '\n'
                    if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
                        token = parsed._parse_tree[i + 1]
@@ -226,7 +317,7 @@ class AddressFormatter(object):
                    skip_next_non_token = True
                    continue

-                if el.key in before and not key_added:
+                if (el.key == before or first) and not key_added:
                    token = '\n'
                    if new_components and '{' not in new_components[-1]:
                        token = new_components[-1]
@@ -235,7 +326,7 @@ class AddressFormatter(object):

                new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))

-                if el.key in after and not key_added:
+                if (el.key == after or i == num_tokens - 1) and not key_added:
                    token = '\n'
                    if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
                        token = parsed._parse_tree[i + 1]
@@ -244,30 +335,44 @@ class AddressFormatter(object):
            elif not skip_next_non_token:
                new_components.append(el)

+            if i == num_tokens - 1 and not key_added:
+                key_added = True
+                new_components.append(tag_token)
+
            skip_next_non_token = False

        return ''.join(new_components)

-    def add_postprocessing_tags(self, template):
+    def add_postprocessing_tags(self, template, country, language=None):
        is_reverse = self.is_reverse(template)
-        for key, pre_keys, post_keys in self.postprocessing_tags:
-            key_included = key in template
+        for key, pre_keys, post_keys in self.get_admin_components(country, language=language):
+            key_tag = six.u('{{{{{{{key}}}}}}}').format(key=key)
+
+            key_included = key_tag in template
            new_components = []
            if key_included:
                continue

-            for line in template.split('\n'):
-                pre_key = re.compile('|'.join(pre_keys)).search(line)
-                post_key = re.compile('|'.join(post_keys)).search(line)
+            pre_key_regex = re.compile('|'.join(['{{{}}}'.format(k) for k in pre_keys]))
+            post_key_regex = re.compile('|'.join(['{{{}}}'.format(k) for k in post_keys]))
+
+            for line in template.split(six.u('\n')):
+                if not line.strip():
+                    continue
+                pre_key = pre_keys and pre_key_regex.search(line)
+                post_key = post_keys and post_key_regex.search(line)
                if post_key and not pre_key and not key_included:
                    if not is_reverse:
-                        new_components.append(u'{{{{{{{key}}}}}}}'.format(key=key))
+                        new_components.append(key_tag)
                        key_included = True
+
                new_components.append(line.rstrip('\n'))
                if post_key and not pre_key and not key_included and is_reverse:
-                    new_components.append(u'{{{{{{{key}}}}}}}'.format(key=key))
+                    new_components.append(key_tag)
                    key_included = True
-            template = u'\n'.join(new_components)
+            if not post_keys and not key_included:
+                new_components.append(key_tag)
+            template = six.u('\n').join(new_components)
        return template

    def render_template(self, template, components, tagged=False):
@@ -298,15 +403,6 @@ class AddressFormatter(object):
                return True
        return False

-    def apply_replacements(self, template, components):
-        if not template.get('replace'):
-            return
-        for key in components.keys():
-            value = components[key]
-            for regex, replacement in template['replace']:
-                value = re.sub(regex, replacement, value)
-                components[key] = value
-
    def post_replacements(self, template, text):
        components = []
        seen = set()
@@ -322,10 +418,64 @@ class AddressFormatter(object):
                text = re.sub(regex, replacement, text)
        return text

+    def revised_template(self, components, country, language=None):
+        template = self.get_template(country, language=language)
+        if not template or 'address_template' not in template:
+            return None
+
+        country = country.lower()
+
+        template = template['address_template']
+
+        cache_keys = []
+
+        for component in components:
+            scope = country
+            insertions = nested_get(self.country_insertions, (country, component), default=None)
+
+            if insertions is None and language:
+                country_language = '{}_{}'.format(country, language)
+                insertions = nested_get(self.country_insertions, (country_language, component), default=None)
+                scope = country_language
+
+            if insertions is None and language:
+                insertions = nested_get(self.language_insertions, (language, component), default=None)
+                scope = language
+
+            if insertions is None:
+                insertions = nested_get(self.global_insertions, (component,), default=None)
+                scope = None
+
+            if insertions is not None:
+                values, probs = insertions
+                order, other = weighted_choice(values, probs)
+
+                insertion_id = (scope, component, order, other)
+                cache_keys.append(insertion_id)
+
+                cache_key = (country, language) + tuple(sorted(cache_keys))  # base template depends on country/language; without them, global-scope insertions collide across countries
+
+                if cache_key in self.template_cache:
+                    template = self.template_cache[cache_key]
+                    continue
+
+                if order == self.BEFORE and other in components:
+                    template = self.insert_component(template, component, before=other)
+                elif order == self.AFTER and other in components:
+                    template = self.insert_component(template, component, after=other)
+                elif order == self.LAST:
+                    template = self.insert_component(template, component, last=True)
+                elif order == self.FIRST:
+                    template = self.insert_component(template, component, first=True)
+                else:
+                    continue
+
+                self.template_cache[cache_key] = template
+
+        return template
+
    def tag_template_separators(self, template):
-        template = re.sub(r'},', '}} ,/{} '.format(self.separator_tag), template)
-        template = re.sub(r'}-', '}} -/{} '.format(self.separator_tag), template)
-        template = re.sub(r' - ', ' -/{} '.format(self.separator_tag), template)
+        template = re.sub(r'}\s*([,\-;])\s*', r'}} \1/{} '.format(self.separator_tag), template)
        return template

    def strip_component(self, value, tagged=False):
@@ -374,27 +524,46 @@ class AddressFormatter(object):
                else:
                    end = num_tokens - j - 1

-            return u' '.join(tokens[start:end])
+            return six.u(' ').join(tokens[start:end])
+
+    def get_template(self, country, language=None):
+        template = None
+        if language:
+            # For countries like China and Japan where the country format varies
+            # based on which language is being used
+            template = self.country_formats.get('{}_{}'.format(country.upper(), language.lower()), None)
+
+        if not template:
+            template = self.country_formats.get(country.upper())

-    def format_address(self, country, components, 
-                       minimal_only=True, tag_components=True, replace_aliases=True,
-                       template_replacements=False):
-        template = self.config.get(country.upper())
        if not template:
            return None
-        template_text = template['address_template']
+
+        use_country = template.get('use_country')
+        if use_country and use_country.upper() in self.country_formats:
+            template = self.country_formats[use_country.upper()]
+
+        if 'address_template' not in template:
+            return None
+
+        return template
+
+    def format_address(self, country, components, language=None,
+                       minimal_only=True, tag_components=True, replace_aliases=True):
+        template = self.get_template(country, language=language)
+        if not template:
+            return None
+
+        template_text = self.revised_template(components, country, language=language)
        if replace_aliases:
-            self.replace_aliases(components)
+            self.aliases.replace(components)

        if minimal_only and not self.minimal_components(components):
            return None

-        if template_replacements:
-            self.apply_replacements(template, components)
-
        if tag_components:
            template_text = self.tag_template_separators(template_text)
-            components = {k: u' '.join([u'{}/{}'.format(t.replace(' ', ''), k.replace(' ', '_'))
+            components = {k: six.u(' ').join([six.u('{}/{}').format(t.replace(' ', ''), k.replace(' ', '_'))
                                              for t, c in tokenize(v)])
                          for k, v in components.iteritems()}