[addresses] Adding a new config for postal codes around the world. Allows prepending the ISO alpha-2 country code to the postcode, e.g. SI-1000 (only used if the postcode begins with a digit). This system was recommended by the CEPT for postal codes in continental Europe. Seven member states still use it, so in those countries the country code is added with higher probability. The config also contains the license plate codes for countries where e.g. L-1234 might be used instead of LU-1234. It also allows configuring in which countries postcodes should be validated using Google's per-country validation regexes (with the ability to override with a custom regex), and in which countries other admin component names should be stripped.

2017-02-10 18:38:32 -05:00
parent 109aa76718
commit 293587bae9
8 changed files with 447 additions and 29 deletions
--- a/resources/postal_codes/config.yaml
+++ b/resources/postal_codes/config.yaml
@@ -0,0 +1,300 @@
+global:
+    # Validate using google I18N regexes
+    validate_postcode: false
+    # Strip other components like city, state, etc. from the postcode
+    strip_components: false
+
+    # Add country code to the beginning of the string
+    add_country_code: false
+    # Probability that the country code is actually added when add_country_code
+    # is true. NOTE: "probablity" (sic) - the spelling must stay in sync with
+    # the key string read in scripts/geodata/postal_codes/phrases.py.
+    country_code_probablity: 0.0
+    # Probability that a hyphen is placed between the country code and the postcode
+    country_code_hyphen_probability: 0.0
+
+
+# Reusable presets. Each one defines a YAML anchor that the country entries
+# below either alias directly or merge in with "<<". The three presets differ
+# only in how often the country code is added.
+
+# Country code added with probability 0.1
+use_country_code_seldomly: &use_country_code_seldomly
+    add_country_code: true
+    country_code_probablity: 0.1
+    country_code_hyphen_probability: 0.8
+
+# Country code added with probability 0.4
+use_country_code_sometimes: &use_country_code_sometimes
+    add_country_code: true
+    country_code_probablity: 0.4
+    country_code_hyphen_probability: 0.8
+
+# Country code added with probability 0.7
+use_country_code_frequently: &use_country_code_frequently
+    add_country_code: true
+    country_code_probablity: 0.7
+    country_code_hyphen_probability: 0.8
+
+
+countries:
+    # Albania
+    al: *use_country_code_seldomly
+
+    # Austria
+    at:         
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: AT
+            probability: 0.6
+            alternatives:
+                - alternative: A
+                  probability: 0.4
+
+    # Åland Islands (same as Finland)
+    ax: *use_country_code_frequently
+
+    # Azerbaijan
+    az: *use_country_code_frequently
+
+    # Barbados
+    bb: *use_country_code_frequently
+
+    # Belgium
+    be: 
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: BE
+            probability: 0.6
+            alternatives:
+                - alternative: B
+                  probability: 0.4
+
+    # Bulgaria
+    bg: *use_country_code_seldomly
+
+    # Bosnia and Herzegovina (ISO 3166-1 alpha-2 "ba"; "bs" is the Bahamas)
+    ba: *use_country_code_seldomly
+
+    # Canada
+    ca:
+        validate_postcode: true
+
+    # Switzerland
+    ch: *use_country_code_seldomly
+
+    # Cyprus
+    cy: *use_country_code_seldomly
+
+    # Czech Republic
+    cz: *use_country_code_seldomly
+
+    # Germany
+    de:        
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: DE
+            probability: 0.6
+            alternatives:
+                - alternative: D
+                  probability: 0.4
+
+    # Denmark
+    dk: *use_country_code_seldomly
+
+    # Estonia
+    ee: *use_country_code_seldomly
+
+    # Spain
+    es: 
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: ES
+            probability: 0.6
+            alternatives:
+                - alternative: E
+                  probability: 0.4
+
+    # Finland
+    fi: *use_country_code_frequently
+
+    # France
+    fr:
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: FR
+            probability: 0.6
+            alternatives:
+                - alternative: F
+                  probability: 0.4
+
+    # United Kingdom
+    gb:
+        validate_postcode: true
+
+    # Guernsey
+    gg:
+        validate_postcode: true
+
+    # Greece
+    gr: *use_country_code_seldomly
+
+    # Honduras
+    hn: *use_country_code_frequently
+
+    # Croatia
+    hr: *use_country_code_frequently
+
+    # Hungary
+    hu:
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: HU
+            probability: 0.6
+            alternatives:
+                - alternative: H
+                  probability: 0.4
+
+    # Isle of Man
+    im:
+        validate_postcode: true
+
+    # Iceland
+    is: *use_country_code_seldomly
+
+    # Italy
+    it:
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: IT
+            probability: 0.6
+            alternatives:
+                - alternative: I
+                  probability: 0.4
+
+    # Jersey
+    je:
+        validate_postcode: true
+
+    # Cayman Islands
+    ky: *use_country_code_frequently
+
+    # Saint Lucia
+    lc: *use_country_code_frequently
+
+    # Lithuania
+    lt: *use_country_code_frequently
+
+    # Luxembourg
+    lu:
+        <<: *use_country_code_sometimes
+        country_code_phrase:
+            default: L
+            probability: 0.7
+            alternatives:
+                - alternative: LU
+                  probability: 0.3
+
+    # Latvia
+    lv: *use_country_code_frequently
+
+    # Monaco
+    mc: *use_country_code_seldomly
+
+    # Moldova
+    md: *use_country_code_frequently
+
+    # Montenegro
+    me: *use_country_code_seldomly
+
+    # Macedonia
+    mk: *use_country_code_seldomly
+
+    # Malta
+    mt: 
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: MT
+            probability: 0.6
+            alternatives:
+                - alternative: M
+                  probability: 0.4
+
+    # Netherlands
+    nl: *use_country_code_seldomly
+
+    # Norway
+    "no":
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: "NO"
+            probability: 0.6
+            alternatives:
+                - alternative: N
+                  probability: 0.4
+
+    # Peru
+    pe: *use_country_code_seldomly
+
+    # Poland
+    pl: *use_country_code_seldomly
+
+    # Portugal
+    pt: 
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: PT
+            probability: 0.6
+            alternatives:
+                - alternative: P
+                  probability: 0.4
+
+    # Réunion
+    re: *use_country_code_seldomly
+
+    # Romania
+    ro: *use_country_code_seldomly
+
+    # Serbia
+    rs: *use_country_code_seldomly
+
+    # Sweden
+    se:
+        <<: *use_country_code_sometimes
+        country_code_phrase:
+            default: SE
+            probability: 0.6
+            alternatives:
+                - alternative: S
+                  probability: 0.4
+
+    # Slovenia
+    si: *use_country_code_frequently
+
+    # Svalbard and Jan Mayen
+    sj: *use_country_code_seldomly
+
+    # Slovakia
+    sk: *use_country_code_seldomly
+
+    # San Marino
+    sm: *use_country_code_seldomly
+
+    # Turkey
+    tr: *use_country_code_seldomly
+
+    # United States
+    us:
+        validate_postcode: true
+        strip_components: true
+        # Used instead of the Google regex: five digits, optionally followed by
+        # a four-digit extension with an optional space or hyphen in between
+        override_regex: "(\\d{5})(?:[ \\-]?(\\d{4}))?"
+
+    # Vatican
+    va: 
+        <<: *use_country_code_seldomly
+        country_code_phrase:
+            default: VA
+            probability: 0.6
+            alternatives:
+                - alternative: V
+                  probability: 0.4
+
+    # Saint Vincent and the Grenadines
+    vc: *use_country_code_frequently
+
+    # British Virgin Islands
+    vg: *use_country_code_frequently
+
+    # Samoa
+    ws: *use_country_code_frequently
+
+    # Kosovo
+    xk: *use_country_code_seldomly
--- a/scripts/geodata/geoplanet/geoplanet_training_data.py
+++ b/scripts/geodata/geoplanet/geoplanet_training_data.py
@@ -18,7 +18,7 @@ from geodata.address_expansions.gazetteers import *
 from geodata.address_formatting.formatter import AddressFormatter

 from geodata.countries.names import country_names
-from geodata.i18n.google import postcode_regexes
+from geodata.postal_codes.validation import postcode_regexes
 from geodata.names.normalization import name_affixes
 from geodata.places.config import place_config

--- a/scripts/geodata/i18n/google.py
+++ b/scripts/geodata/i18n/google.py
@@ -35,26 +35,3 @@ class GoogleI18N(object):


 google_i18n = GoogleI18N()
-
-
-class PostcodeRegexes(object):
-    def __init__(self):
-        self.responses = {}
-        self.postcode_regexes = {}
-
-    def get(self, country_code):
-        ret = self.postcode_regexes.get(country_code.lower())
-        if ret is None:
-            response = google_i18n.get(country_code)
-            if response:
-                postcode_expression = response.get('zip')
-                if not postcode_expression:
-                    self.postcode_regexes[country_code.lower()] = None
-                    return None
-                ret = re.compile(postcode_expression, re.I)
-                self.postcode_regexes[country_code.lower()] = ret
-
-        return ret
-
-
-postcode_regexes = PostcodeRegexes()
--- a/scripts/geodata/osm/formatter.py
+++ b/scripts/geodata/osm/formatter.py
@@ -34,7 +34,6 @@ from geodata.configs.utils import nested_get
 from geodata.countries.country_names import *
 from geodata.language_id.disambiguation import *
 from geodata.language_id.sample import INTERNET_LANGUAGE_DISTRIBUTION
-from geodata.i18n.google import postcode_regexes
 from geodata.i18n.languages import *
 from geodata.intersections.query import Intersection, IntersectionQuery
 from geodata.address_formatting.formatter import AddressFormatter
@@ -45,6 +44,7 @@ from geodata.osm.intersections import OSMIntersectionReader
 from geodata.places.config import place_config
 from geodata.polygons.language_polys import *
 from geodata.polygons.reverse_geocode import *
+from geodata.postal_codes.validation import postcode_regexes
 from geodata.i18n.unicode_paths import DATA_DIR
 from geodata.text.tokenize import tokenize, token_types
 from geodata.text.utils import is_numeric
@@ -1101,6 +1101,7 @@ class OSMAddressFormatter(object):
                            if u';' in v:
                                v = random.choice(v.split(u';'))

+                            if u',' in v:
                                for p in v.split(','):
                                    if self.valid_postal_code(country, p):
                                        revised_tags[AddressFormatter.POSTCODE] = p.strip()
--- a/scripts/geodata/postal_codes/__init__.py
+++ b/scripts/geodata/postal_codes/__init__.py
--- a/scripts/geodata/postal_codes/config.py
+++ b/scripts/geodata/postal_codes/config.py
@@ -0,0 +1,51 @@
+import copy
+import operator
+import os
+import random
+import six
+import yaml
+
+from collections import defaultdict
+
+from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
+from geodata.address_formatting.formatter import AddressFormatter
+from geodata.configs.utils import nested_get, recursive_merge
+
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+
+POSTAL_CODES_CONFIG_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                        'resources', 'postal_codes', 'config.yaml')
+
+
+class PostalCodesConfig(object):
+    """Per-country postal code settings loaded from a YAML config file.
+
+    The file must contain a ``global`` mapping of default settings and may
+    contain a ``countries`` mapping keyed by country code. Each country's
+    effective config is produced by ``recursive_merge`` from a deep copy of
+    the global settings and that country's own entries.
+    """
+
+    def __init__(self, config_file=POSTAL_CODES_CONFIG_FILE):
+        """Read ``config_file`` and build the per-country configs.
+
+        :param config_file: path to the YAML file; defaults to
+            ``POSTAL_CODES_CONFIG_FILE``.
+        :raises KeyError: if the parsed mapping has no ``global`` section.
+        """
+        self.cache = {}
+
+        # safe_load handles the anchors, aliases and "<<" merge keys used by
+        # the config, and unlike a bare yaml.load() call it does not need an
+        # explicit Loader argument (mandatory in newer PyYAML releases).
+        # The context manager closes the file handle once parsing is done.
+        with open(config_file) as f:
+            postal_codes_config = yaml.safe_load(f)
+
+        self.global_config = postal_codes_config['global']
+        self.country_configs = {}
+
+        # An empty "countries:" key parses as None, so fall back to {}.
+        countries = postal_codes_config.pop('countries', None) or {}
+
+        for country, country_config in six.iteritems(countries):
+            # recursive_merge is handed a fresh deep copy, so whatever it does
+            # to its first argument cannot alter the shared global defaults.
+            global_config_copy = copy.deepcopy(self.global_config)
+            self.country_configs[country] = recursive_merge(global_config_copy, country_config)
+
+        # Lets a lookup with country=None resolve to the global defaults.
+        self.country_configs[None] = self.global_config
+
+    def get_property(self, key, country=None, default=None):
+        """Look up a setting, preferring the country's config over the global one.
+
+        :param key: a string, which is split on ``.`` into a list of path
+            components, or an already-split sequence; handed to ``nested_get``.
+        :param country: country code, lowercased before the lookup. When it
+            is falsy, or no non-empty config exists for it, the global
+            config is used instead.
+        :param default: passed through to ``nested_get`` as ``default``.
+        :return: whatever ``nested_get`` returns for the selected config.
+        """
+        if isinstance(key, six.string_types):
+            key = key.split(u'.')
+
+        config = self.global_config
+        if country:
+            # A missing or empty country config falls back to the globals.
+            config = self.country_configs.get(country.lower()) or config
+
+        return nested_get(config, key, default=default)
+
+postal_codes_config = PostalCodesConfig()
--- a/scripts/geodata/postal_codes/phrases.py
+++ b/scripts/geodata/postal_codes/phrases.py
@@ -0,0 +1,56 @@
+import random
+
+from geodata.configs.utils import alternative_probabilities
+from geodata.math.sampling import weighted_choice, cdf
+from geodata.postal_codes.config import postal_codes_config
+from geodata.postal_codes.validation import postcode_regexes
+
+
+class PostalCodes(object):
+    """Helpers for validating postal codes and prefixing them with a country code."""
+
+    @classmethod
+    def is_valid(cls, postal_code, country):
+        """Return True if the stripped postal code is fully matched by the country's regex.
+
+        When ``postcode_regexes.get(country)`` yields nothing, the postal
+        code is accepted as valid.
+        """
+        pattern = postcode_regexes.get(country)
+        if not pattern:
+            return True
+
+        candidate = postal_code.strip()
+        found = pattern.match(candidate)
+        # match() only anchors at the start, so also require that the match
+        # extends to the end of the string.
+        return bool(found) and found.end() == len(candidate)
+
+    @classmethod
+    def needs_validation(cls, country):
+        """Return the ``validate_postcode`` setting for ``country`` (False if unset)."""
+        setting = postal_codes_config.get_property('validate_postcode', country=country, default=False)
+        return setting
+
+    @classmethod
+    def should_strip_components(cls, country_code):
+        """Return the ``strip_components`` setting for ``country_code``."""
+        setting = postal_codes_config.get_property('strip_components', country=country_code)
+        return setting
+
+    @classmethod
+    def add_country_code(cls, postal_code, country):
+        """Randomly prefix the stripped postal code with a country code phrase.
+
+        The stripped code is returned unchanged when the ``add_country_code``
+        setting is off for ``country``, when the random draw is not below
+        ``country_code_probablity``, or when the code is empty or does not
+        start with a digit. Otherwise the prefix is sampled from the
+        ``country_code_phrase`` setting if present (else the upper-cased
+        ``country``), and is joined to the code with a hyphen with
+        probability ``country_code_hyphen_probability``.
+        """
+        code = postal_code.strip()
+
+        enabled = postal_codes_config.get_property('add_country_code', country=country)
+        if not enabled:
+            return code
+
+        cc_probability = postal_codes_config.get_property('country_code_probablity', country=country, default=0.0)
+        # The random draw happens before the cheap string checks.
+        skip = random.random() >= cc_probability
+        if skip or not code or not code[0].isdigit():
+            return code
+
+        phrases = postal_codes_config.get_property('country_code_phrase', country=country, default=None)
+        if phrases is not None:
+            alternates, probs = alternative_probabilities(phrases)
+            prefix = weighted_choice(alternates, cdf(probs))
+        else:
+            prefix = country.upper()
+
+        cc_hyphen_probability = postal_codes_config.get_property('country_code_hyphen_probability', country=country, default=0.0)
+        separator = u'-' if random.random() < cc_hyphen_probability else u''
+
+        return u'{}{}{}'.format(prefix, separator, code)
--- a/scripts/geodata/postal_codes/validation.py
+++ b/scripts/geodata/postal_codes/validation.py
@@ -0,0 +1,33 @@
+import re
+from geodata.i18n.google import google_i18n
+from geodata.postal_codes.config import postal_codes_config
+
+
+class PostcodeRegexes(object):
+    """Lazily compiled, per-country postcode validation regexes.
+
+    A country's ``override_regex`` setting from ``postal_codes_config`` takes
+    precedence; otherwise the ``zip`` expression from the ``google_i18n``
+    response for that country is used. Results are cached by lowercased
+    country code.
+    """
+
+    def __init__(self):
+        self.responses = {}
+        # Lowercased country code -> compiled regex, or None when the
+        # country's response carries no "zip" expression.
+        self.postcode_regexes = {}
+
+    def get(self, country_code):
+        """Return the compiled, case-insensitive postcode regex for a country.
+
+        :param country_code: country code string; case is ignored for caching.
+        :return: a compiled regex, or None if no expression is available.
+        """
+        key = country_code.lower()
+
+        # Membership test rather than .get(): a cached None means the country
+        # is known to have no pattern and must not trigger another lookup.
+        if key in self.postcode_regexes:
+            return self.postcode_regexes[key]
+
+        override_regex = postal_codes_config.get_property('override_regex', country=country_code)
+        if override_regex:
+            ret = re.compile(override_regex, re.I)
+            self.postcode_regexes[key] = ret
+            return ret
+
+        response = google_i18n.get(country_code)
+        if not response:
+            # Nothing is cached here, so the lookup is retried on the next call.
+            return None
+
+        postcode_expression = response.get('zip')
+        ret = re.compile(postcode_expression, re.I) if postcode_expression else None
+        self.postcode_regexes[key] = ret
+        return ret
+
+
+postcode_regexes = PostcodeRegexes()