[addresses] adding new config for postal codes around the world. Allows appending the ISO alpha-2 country code to the beginning of the postcode as in e.g. SI-1000 (only used if the postcode begins with a digit). This system was used for postal codes in continental Europe as a recommendation from the CEPT. Now 7 member states still use it, so in those countries add the country-code with higher probability. The config also contains the license plate codes for countries where e.g. L-1234 might be used instead of LU-1234. Allows configuring in which countries postcodes should be validated using Google's per-country validation regexes (and the ability to override with a custom regex), and in which countries other admin component names should be stripped.

2017-02-10 18:38:32 -05:00
parent 109aa76718
commit 293587bae9
8 changed files with 447 additions and 29 deletions
--- a/resources/postal_codes/config.yaml
+++ b/resources/postal_codes/config.yaml
@@ -0,0 +1,300 @@
 global:
    # Defaults applied to every country; per-country entries override them.
    # Validate using google I18N regexes
    validate_postcode: false
    # Strip other components like city, state, etc. from the postcode
    strip_components: false
    # Add country code to the beginning of the string
    add_country_code: false
    # Probability (0.0-1.0) that the country code is actually prepended when
    # add_country_code is true. NOTE: the key is spelled "probablity" and the
    # Python code reads it with exactly this spelling, so do not "fix" it here
    # without changing the reader as well.
    country_code_probablity: 0.0
    # Probability (0.0-1.0) that a hyphen separates the country code from the
    # postcode (e.g. SI-1000 rather than SI1000)
    country_code_hyphen_probability: 0.0
 # Reusable presets. Each one is a YAML anchor that country entries either
 # alias directly (e.g. "al: *use_country_code_seldomly") or merge into a
 # larger mapping with "<<:". They differ only in how often the country code
 # is prepended; the hyphen probability is the same in all three.

 # Country code prepended 10% of the time
 use_country_code_seldomly: &use_country_code_seldomly
    add_country_code: true
    country_code_probablity: 0.1
    country_code_hyphen_probability: 0.8
 # Country code prepended 40% of the time
 use_country_code_sometimes: &use_country_code_sometimes
    add_country_code: true
    country_code_probablity: 0.4
    country_code_hyphen_probability: 0.8
 # Country code prepended 70% of the time
 use_country_code_frequently: &use_country_code_frequently
    add_country_code: true
    country_code_probablity: 0.7
    country_code_hyphen_probability: 0.8
 countries:
    # Albania
    al: *use_country_code_seldomly
    # Austria
    at:         
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: AT
            probability: 0.6
            alternatives:
                - alternative: A
                  probability: 0.4
    # Åland Islands (same as Finland)
    ax: *use_country_code_frequently
    # Azerbaijan
    az: *use_country_code_frequently
    # Barbados
    bb: *use_country_code_frequently
    # Belgium
    be: 
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: BE
            probability: 0.6
            alternatives:
                - alternative: B
                  probability: 0.4
    # Bulgaria
    bg: *use_country_code_seldomly
    # Bosnia and Herzegovina (ISO 3166-1 alpha-2 is BA; BS is the Bahamas)
    ba: *use_country_code_seldomly
    # Canada
    ca:
        validate_postcode: true
    # Switzerland
    ch: *use_country_code_seldomly
    # Cyprus
    cy: *use_country_code_seldomly
    # Czech Republic
    cz: *use_country_code_seldomly
    # Germany
    de:        
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: DE
            probability: 0.6
            alternatives:
                - alternative: D
                  probability: 0.4
    # Denmark
    dk: *use_country_code_seldomly
    # Estonia
    ee: *use_country_code_seldomly
    # Spain
    es: 
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: ES
            probability: 0.6
            alternatives:
                - alternative: E
                  probability: 0.4
    # Finland
    fi: *use_country_code_frequently
    # France
    fr:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: FR
            probability: 0.6
            alternatives:
                - alternative: F
                  probability: 0.4
    # United Kingdom
    gb:
        validate_postcode: true
    # Guernsey
    gg:
        validate_postcode: true
    # Greece
    gr: *use_country_code_seldomly
    # Honduras
    hn: *use_country_code_frequently
    # Croatia
    hr: *use_country_code_frequently
    # Hungary
    hu:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: HU
            probability: 0.6
            alternatives:
                - alternative: H
                  probability: 0.4
    # Isle of Man
    im:
        validate_postcode: true
    # Iceland
    is: *use_country_code_seldomly
    # Italy
    it:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: IT
            probability: 0.6
            alternatives:
                - alternative: I
                  probability: 0.4
    # Jersey
    je:
        validate_postcode: true
    # Cayman Islands
    ky: *use_country_code_frequently
    # Saint Lucia
    lc: *use_country_code_frequently
    # Lithuania
    lt: *use_country_code_frequently
    # Luxembourg
    lu:
        <<: *use_country_code_sometimes
        country_code_phrase:
            default: L
            probability: 0.7
            alternatives:
                - alternative: LU
                  probability: 0.3
    # Latvia
    lv: *use_country_code_frequently
    # Monaco
    mc: *use_country_code_seldomly
    # Moldova
    md: *use_country_code_frequently
    # Montenegro
    me: *use_country_code_seldomly
    # Macedonia
    mk: *use_country_code_seldomly
    # Malta
    mt: 
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: MT
            probability: 0.6
            alternatives:
                - alternative: M
                  probability: 0.4
    # Netherlands
    nl: *use_country_code_seldomly
    # Norway
    "no":
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: "NO"
            probability: 0.6
            alternatives:
                - alternative: N
                  probability: 0.4
    # Peru
    pe: *use_country_code_seldomly
    # Poland
    pl: *use_country_code_seldomly
    # Portugal
    pt: 
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: PT
            probability: 0.6
            alternatives:
                - alternative: P
                  probability: 0.4
    # Réunion
    re: *use_country_code_seldomly
    # Romania
    ro: *use_country_code_seldomly
    # Serbia
    rs: *use_country_code_seldomly
    # Sweden
    se:
        <<: *use_country_code_sometimes
        country_code_phrase:
            default: SE
            probability: 0.6
            alternatives:
                - alternative: S
                  probability: 0.4
    # Slovenia
    si: *use_country_code_frequently
    # Svalbard and Jan Mayen
    sj: *use_country_code_seldomly
    # Slovakia
    sk: *use_country_code_seldomly
    # San Marino
    sm: *use_country_code_seldomly
    # Turkey
    tr: *use_country_code_seldomly
    # United States
    us:
        validate_postcode: true
        strip_components: true
        # Custom regex used in place of the Google one: a 5-digit ZIP code,
        # optionally followed by a 4-digit extension that may be separated
        # by a single space or hyphen (12345, 12345-6789, 12345 6789, 123456789)
        override_regex: "(\\d{5})(?:[ \\-]?(\\d{4}))?"
    # Vatican
    va: 
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: VA
            probability: 0.6
            alternatives:
                - alternative: V
                  probability: 0.4
    # Saint Vincent and the Grenadines
    vc: *use_country_code_frequently
    # British Virgin Islands
    vg: *use_country_code_frequently
    # Samoa
    ws: *use_country_code_frequently
    # Kosovo
    xk: *use_country_code_seldomly
--- a/scripts/geodata/geoplanet/geoplanet_training_data.py
+++ b/scripts/geodata/geoplanet/geoplanet_training_data.py
@@ -18,7 +18,7 @@ from geodata.address_expansions.gazetteers import *
 from geodata.address_formatting.formatter import AddressFormatter
 from geodata.countries.names import country_names
-from geodata.i18n.google import postcode_regexes
+from geodata.postal_codes.validation import postcode_regexes
 from geodata.names.normalization import name_affixes
 from geodata.places.config import place_config
--- a/scripts/geodata/i18n/google.py
+++ b/scripts/geodata/i18n/google.py
@@ -35,26 +35,3 @@ class GoogleI18N(object):
 google_i18n = GoogleI18N()
 class PostcodeRegexes(object):
    """Lazily compiled, per-country postcode regexes from the Google i18n data.

    Regexes are compiled on first request and cached by lower-cased
    country code.
    """

    def __init__(self):
        # Not read anywhere in this class; kept so the attribute still exists.
        self.responses = {}
        # Lower-cased country code -> compiled regex, or None when the
        # country's data has no postcode expression.
        self.postcode_regexes = {}

    def get(self, country_code):
        """Return the compiled postcode regex for ``country_code``, or None.

        :param country_code: country code string (any case)
        :return: a case-insensitive compiled regex, or None when no data or
                 no ``zip`` expression is available for the country
        """
        key = country_code.lower()

        # Membership test rather than ``is None``: a cached None (country
        # without a postcode expression) must count as a cache hit, otherwise
        # the lookup below is repeated on every call.
        if key in self.postcode_regexes:
            return self.postcode_regexes[key]

        response = google_i18n.get(country_code)
        if not response:
            # Nothing is cached in this case, so a later call looks it up again.
            return None

        postcode_expression = response.get('zip')
        regex = re.compile(postcode_expression, re.I) if postcode_expression else None
        self.postcode_regexes[key] = regex
        return regex
 # Module-level shared instance, so compiled regexes are cached once per process.
 postcode_regexes = PostcodeRegexes()
--- a/scripts/geodata/osm/formatter.py
+++ b/scripts/geodata/osm/formatter.py
@@ -34,7 +34,6 @@ from geodata.configs.utils import nested_get
 from geodata.countries.country_names import *
 from geodata.language_id.disambiguation import *
 from geodata.language_id.sample import INTERNET_LANGUAGE_DISTRIBUTION
 from geodata.i18n.google import postcode_regexes
 from geodata.i18n.languages import *
 from geodata.intersections.query import Intersection, IntersectionQuery
 from geodata.address_formatting.formatter import AddressFormatter
@@ -45,6 +44,7 @@ from geodata.osm.intersections import OSMIntersectionReader
 from geodata.places.config import place_config
 from geodata.polygons.language_polys import *
 from geodata.polygons.reverse_geocode import *
 from geodata.postal_codes.validation import postcode_regexes
 from geodata.i18n.unicode_paths import DATA_DIR
 from geodata.text.tokenize import tokenize, token_types
 from geodata.text.utils import is_numeric
@@ -1101,10 +1101,11 @@ class OSMAddressFormatter(object):
                            if u';' in v:
                                v = random.choice(v.split(u';'))
-                            for p in v.split(','):
+                            if u',' in v:
-                                if self.valid_postal_code(country, p):
+                                for p in v.split(','):
-                                    revised_tags[AddressFormatter.POSTCODE] = p.strip()
+                                    if self.valid_postal_code(country, p):
-                                    break
+                                        revised_tags[AddressFormatter.POSTCODE] = p.strip()
                                        break
                    elif k == AddressFormatter.HOUSE:
                        building_venue_names.append((v, building_is_generic_place, building_is_known_venue_type))
--- a/scripts/geodata/postal_codes/__init__.py
+++ b/scripts/geodata/postal_codes/__init__.py
--- a/scripts/geodata/postal_codes/config.py
+++ b/scripts/geodata/postal_codes/config.py
@@ -0,0 +1,51 @@
 import copy
 import operator
 import os
 import random
 import six
 import yaml
 from collections import defaultdict
 from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
 from geodata.address_formatting.formatter import AddressFormatter
 from geodata.configs.utils import nested_get, recursive_merge
 # Directory that contains this module, with symlinks resolved.
 this_dir = os.path.realpath(os.path.dirname(__file__))
 # Default config path: three directory levels above this module, then
 # resources/postal_codes/config.yaml.
 POSTAL_CODES_CONFIG_FILE = os.path.join(
     this_dir,
     os.pardir, os.pardir, os.pardir,
     'resources', 'postal_codes', 'config.yaml',
 )
 class PostalCodesConfig(object):
    """Per-country postal code settings loaded from a YAML config file.

    The file's ``global`` section holds the defaults. Each entry under
    ``countries`` is merged onto a deep copy of those defaults, so a country
    only needs to list the settings it changes.
    """

    def __init__(self, config_file=POSTAL_CODES_CONFIG_FILE):
        """Load and merge the configuration.

        :param config_file: path to the YAML config file
        :raises KeyError: if the file has no ``global`` section
        """
        # Not used within this class.
        self.cache = {}

        # Use a context manager so the file handle is closed, and safe_load
        # because yaml.load() without an explicit Loader is deprecated (and an
        # error in PyYAML >= 6). safe_load still resolves the anchors, aliases
        # and "<<" merge keys this config relies on.
        with open(config_file) as f:
            postal_codes_config = yaml.safe_load(f)

        self.global_config = postal_codes_config['global']
        self.country_configs = {}

        # "or {}" covers a ``countries:`` key that is present but empty,
        # which YAML parses as None.
        countries = postal_codes_config.pop('countries', None) or {}
        for country, country_config in six.iteritems(countries):
            # Deep copy so merging one country never mutates the shared defaults.
            global_config_copy = copy.deepcopy(self.global_config)
            # presumably country values override the global ones — confirm
            # against recursive_merge
            self.country_configs[country] = recursive_merge(global_config_copy, country_config)

        self.country_configs[None] = self.global_config

    def get_property(self, key, country=None, default=None):
        """Look up a setting, preferring the country's config over the global one.

        :param key: a dotted string (``'a.b.c'``), which is split on ``.``,
                    or an already-split sequence of path components
        :param country: country code (any case); when falsy, or when no
                        non-empty config exists for it, the global config
                        is used
        :param default: value handed to ``nested_get`` as its default
        :return: whatever ``nested_get`` returns for the chosen config
        """
        if isinstance(key, six.string_types):
            key = key.split(u'.')

        config = self.global_config
        if country:
            country_config = self.country_configs.get(country.lower(), {})
            if country_config:
                config = country_config

        return nested_get(config, key, default=default)
 # Module-level shared instance; constructing it reads the default YAML config
 # file, so that happens as soon as this module is imported.
 postal_codes_config = PostalCodesConfig()
--- a/scripts/geodata/postal_codes/phrases.py
+++ b/scripts/geodata/postal_codes/phrases.py
@@ -0,0 +1,56 @@
 import random
 from geodata.configs.utils import alternative_probabilities
 from geodata.math.sampling import weighted_choice, cdf
 from geodata.postal_codes.config import postal_codes_config
 from geodata.postal_codes.validation import postcode_regexes
 class PostalCodes(object):
    """Helpers for validating postcodes and prefixing them with a country code."""

    @classmethod
    def is_valid(cls, postal_code, country):
        """Return True if the stripped postcode fully matches the country's regex.

        When no regex is available for the country the postcode is accepted.
        """
        pattern = postcode_regexes.get(country)
        if not pattern:
            return True

        candidate = postal_code.strip()
        found = pattern.match(candidate)
        # The match must consume the whole string, not just a prefix.
        return bool(found) and found.end() == len(candidate)

    @classmethod
    def needs_validation(cls, country):
        """Return the ``validate_postcode`` setting for the country (default False)."""
        setting = postal_codes_config.get_property('validate_postcode', country=country, default=False)
        return setting

    @classmethod
    def should_strip_components(cls, country_code):
        """Return the ``strip_components`` setting for the country."""
        setting = postal_codes_config.get_property('strip_components', country=country_code)
        return setting

    @classmethod
    def add_country_code(cls, postal_code, country):
        """Randomly prefix a postcode with the country code, per the config.

        The stripped postcode is returned unchanged when the country has
        ``add_country_code`` disabled, when the random draw does not fall
        below the configured probability, or when the postcode is empty or
        does not start with a digit. Otherwise a prefix is chosen (a weighted
        pick from ``country_code_phrase`` if configured, else the upper-cased
        country code) and joined to the postcode, with a hyphen in between
        according to ``country_code_hyphen_probability``.
        """
        get_setting = postal_codes_config.get_property
        code = postal_code.strip()

        if not get_setting('add_country_code', country=country):
            return code

        threshold = get_setting('country_code_probablity', country=country, default=0.0)
        # The random draw comes before the content checks, so one random
        # number is consumed whether or not the postcode is eligible.
        if random.random() >= threshold:
            return code
        if not code or not code[0].isdigit():
            return code

        phrase_config = get_setting('country_code_phrase', country=country, default=None)
        if phrase_config is not None:
            choices, weights = alternative_probabilities(phrase_config)
            prefix = weighted_choice(choices, cdf(weights))
        else:
            prefix = country.upper()

        hyphen_probability = get_setting('country_code_hyphen_probability', country=country, default=0.0)
        separator = u'-' if random.random() < hyphen_probability else u''

        return u'{}{}{}'.format(prefix, separator, code)
--- a/scripts/geodata/postal_codes/validation.py
+++ b/scripts/geodata/postal_codes/validation.py
@@ -0,0 +1,33 @@
 import re
 from geodata.i18n.google import google_i18n
 from geodata.postal_codes.config import postal_codes_config
 class PostcodeRegexes(object):
    """Lazily compiled, per-country postcode validation regexes.

    A country's ``override_regex`` from the postal codes config takes
    precedence; otherwise the ``zip`` expression from the Google i18n data
    is used. Results are cached by lower-cased country code.
    """

    def __init__(self):
        # Not read anywhere in this class; kept so the attribute still exists.
        self.responses = {}
        # Lower-cased country code -> compiled regex, or None when the
        # country's data has no postcode expression.
        self.postcode_regexes = {}

    def get(self, country_code):
        """Return the compiled postcode regex for ``country_code``, or None.

        :param country_code: country code string (any case)
        :return: a case-insensitive compiled regex, or None when there is no
                 override and no ``zip`` expression for the country
        """
        key = country_code.lower()

        # Membership test rather than ``is None``: a cached None (country
        # without a postcode expression) must count as a cache hit, otherwise
        # the config and i18n lookups are repeated on every call.
        if key in self.postcode_regexes:
            return self.postcode_regexes[key]

        override_regex = postal_codes_config.get_property('override_regex', country=country_code)
        if override_regex:
            regex = re.compile(override_regex, re.I)
            self.postcode_regexes[key] = regex
            return regex

        response = google_i18n.get(country_code)
        if not response:
            # Nothing is cached in this case, so a later call looks it up again.
            return None

        postcode_expression = response.get('zip')
        regex = re.compile(postcode_expression, re.I) if postcode_expression else None
        self.postcode_regexes[key] = regex
        return regex
 # Module-level shared instance imported by other modules, so compiled regexes
 # are cached once per process.
 postcode_regexes = PostcodeRegexes()