[addresses] adding new config for postal codes around the world. Allows appending the ISO alpha-2 country code to the beginning of the postcode as in e.g. SI-1000 (only used if the postcode begins with a digit). This system was used for postal codes in continental Europe as a recommendation from the CEPT. Now 7 member states still use it, so in those countries add the country-code with higher probability. The config also contains the license plate codes for countries where e.g. L-1234 might be used instead of LU-1234. Allows configuring in which countries postcodes should be validated using Google's per-country validation regexes (and the ability to override with a custom regex), and in which countries other admin component names should be stripped.

This commit is contained in:
Al
2017-02-10 18:38:32 -05:00
parent 109aa76718
commit 293587bae9
8 changed files with 447 additions and 29 deletions

View File

@@ -0,0 +1,300 @@
global:
# Validate using google I18N regexes
validate_postcode: false
# Strip other components like city, state, etc. from the postcode
strip_components: false
# Add country code to the beginning of the string
add_country_code: false
country_code_probablity: 0.0
country_code_hyphen_probability: 0.0
use_country_code_seldomly: &use_country_code_seldomly
add_country_code: true
country_code_probablity: 0.1
country_code_hyphen_probability: 0.8
use_country_code_sometimes: &use_country_code_sometimes
add_country_code: true
country_code_probablity: 0.4
country_code_hyphen_probability: 0.8
use_country_code_frequently: &use_country_code_frequently
add_country_code: true
country_code_probablity: 0.7
country_code_hyphen_probability: 0.8
countries:
# Albania
al: *use_country_code_seldomly
# Austria
at:
<<: *use_country_code_seldomly
country_code_phrase:
default: AT
probability: 0.6
alternatives:
- alternative: A
probability: 0.4
# Åland Islands (same as Finland)
ax: *use_country_code_frequently
# Azerbaijan
az: *use_country_code_frequently
# Barbados
bb: *use_country_code_frequently
# Belgium
be:
<<: *use_country_code_seldomly
country_code_phrase:
default: BE
probability: 0.6
alternatives:
- alternative: B
probability: 0.4
# Bulgaria
bg: *use_country_code_seldomly
# Bosnia and Herzegovina (ISO 3166-1 alpha-2 code is "ba"; "bs" is the Bahamas)
ba: *use_country_code_seldomly
# Canada
ca:
validate_postcode: true
# Switzerland
ch: *use_country_code_seldomly
# Cyprus
cy: *use_country_code_seldomly
# Czech Republic
cz: *use_country_code_seldomly
# Germany
de:
<<: *use_country_code_seldomly
country_code_phrase:
default: DE
probability: 0.6
alternatives:
- alternative: D
probability: 0.4
# Denmark
dk: *use_country_code_seldomly
# Estonia
ee: *use_country_code_seldomly
# Spain
es:
<<: *use_country_code_seldomly
country_code_phrase:
default: ES
probability: 0.6
alternatives:
- alternative: E
probability: 0.4
# Finland
fi: *use_country_code_frequently
# France
fr:
<<: *use_country_code_seldomly
country_code_phrase:
default: FR
probability: 0.6
alternatives:
- alternative: F
probability: 0.4
# United Kingdom
gb:
validate_postcode: true
# Guernsey
gg:
validate_postcode: true
# Greece
gr: *use_country_code_seldomly
# Honduras
hn: *use_country_code_frequently
# Croatia
hr: *use_country_code_frequently
# Hungary
hu:
<<: *use_country_code_seldomly
country_code_phrase:
default: HU
probability: 0.6
alternatives:
- alternative: H
probability: 0.4
# Isle of Man
im:
validate_postcode: true
# Iceland
is: *use_country_code_seldomly
# Italy
it:
<<: *use_country_code_seldomly
country_code_phrase:
default: IT
probability: 0.6
alternatives:
- alternative: I
probability: 0.4
# Jersey
je:
validate_postcode: true
# Cayman Islands
ky: *use_country_code_frequently
# Saint Lucia
lc: *use_country_code_frequently
# Lithuania
lt: *use_country_code_frequently
# Luxembourg
lu:
<<: *use_country_code_sometimes
country_code_phrase:
default: L
probability: 0.7
alternatives:
- alternative: LU
probability: 0.3
# Latvia
lv: *use_country_code_frequently
# Monaco
mc: *use_country_code_seldomly
# Moldova
md: *use_country_code_frequently
# Montenegro
me: *use_country_code_seldomly
# Macedonia
mk: *use_country_code_seldomly
# Malta
mt:
<<: *use_country_code_seldomly
country_code_phrase:
default: MT
probability: 0.6
alternatives:
- alternative: M
probability: 0.4
# Netherlands
nl: *use_country_code_seldomly
# Norway
"no":
<<: *use_country_code_seldomly
country_code_phrase:
default: "NO"
probability: 0.6
alternatives:
- alternative: N
probability: 0.4
# Peru
pe: *use_country_code_seldomly
# Poland
pl: *use_country_code_seldomly
# Portugal
pt:
<<: *use_country_code_seldomly
country_code_phrase:
default: PT
probability: 0.6
alternatives:
- alternative: P
probability: 0.4
# Réunion
re: *use_country_code_seldomly
# Romania
ro: *use_country_code_seldomly
# Serbia
rs: *use_country_code_seldomly
# Sweden
se:
<<: *use_country_code_sometimes
country_code_phrase:
default: SE
probability: 0.6
alternatives:
- alternative: S
probability: 0.4
# Slovenia
si: *use_country_code_frequently
# Svalbard and Jan Mayen
sj: *use_country_code_seldomly
# Slovakia
sk: *use_country_code_seldomly
# San Marino
sm: *use_country_code_seldomly
# Turkey
tr: *use_country_code_seldomly
# United States
us:
validate_postcode: true
strip_components: true
# Custom pattern taking precedence over the Google regex: five digits with an
# optional four-digit extension, separated by an optional space or hyphen
override_regex: "(\\d{5})(?:[ \\-]?(\\d{4}))?"
# Vatican
va:
<<: *use_country_code_seldomly
country_code_phrase:
default: VA
probability: 0.6
alternatives:
- alternative: V
probability: 0.4
# Saint Vincent and the Grenadines
vc: *use_country_code_frequently
# British Virgin Islands
vg: *use_country_code_frequently
# Samoa
ws: *use_country_code_frequently
# Kosovo
xk: *use_country_code_seldomly

View File

@@ -18,7 +18,7 @@ from geodata.address_expansions.gazetteers import *
from geodata.address_formatting.formatter import AddressFormatter
from geodata.countries.names import country_names
from geodata.i18n.google import postcode_regexes
from geodata.postal_codes.validation import postcode_regexes
from geodata.names.normalization import name_affixes
from geodata.places.config import place_config

View File

@@ -35,26 +35,3 @@ class GoogleI18N(object):
google_i18n = GoogleI18N()
class PostcodeRegexes(object):
    """Lazily compiled, per-country postcode regexes taken from ``google_i18n``.

    Results are memoised in ``self.postcode_regexes`` keyed by the lower-cased
    country code. A stored ``None`` means "this country has no postcode
    expression" and is a valid cache hit.
    """

    def __init__(self):
        self.responses = {}
        self.postcode_regexes = {}

    def get(self, country_code):
        """Return the compiled postcode regex for ``country_code``, or None.

        :param country_code: country code string; lookup is case-insensitive.
        :return: a compiled, case-insensitive pattern, or None when no
            postcode expression is available for the country.
        """
        key = country_code.lower()

        # Membership test rather than ``.get(...) is None`` so that a cached
        # negative result (None) is honoured instead of re-querying each call.
        if key in self.postcode_regexes:
            return self.postcode_regexes[key]

        response = google_i18n.get(country_code)
        if not response:
            # No data at all for this country: deliberately not cached, so a
            # later call can try again.
            return None

        postcode_expression = response.get('zip')
        pattern = re.compile(postcode_expression, re.I) if postcode_expression else None
        self.postcode_regexes[key] = pattern
        return pattern


postcode_regexes = PostcodeRegexes()

View File

@@ -34,7 +34,6 @@ from geodata.configs.utils import nested_get
from geodata.countries.country_names import *
from geodata.language_id.disambiguation import *
from geodata.language_id.sample import INTERNET_LANGUAGE_DISTRIBUTION
from geodata.i18n.google import postcode_regexes
from geodata.i18n.languages import *
from geodata.intersections.query import Intersection, IntersectionQuery
from geodata.address_formatting.formatter import AddressFormatter
@@ -45,6 +44,7 @@ from geodata.osm.intersections import OSMIntersectionReader
from geodata.places.config import place_config
from geodata.polygons.language_polys import *
from geodata.polygons.reverse_geocode import *
from geodata.postal_codes.validation import postcode_regexes
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.text.tokenize import tokenize, token_types
from geodata.text.utils import is_numeric
@@ -1101,10 +1101,11 @@ class OSMAddressFormatter(object):
if u';' in v:
v = random.choice(v.split(u';'))
for p in v.split(','):
if self.valid_postal_code(country, p):
revised_tags[AddressFormatter.POSTCODE] = p.strip()
break
if u',' in v:
for p in v.split(','):
if self.valid_postal_code(country, p):
revised_tags[AddressFormatter.POSTCODE] = p.strip()
break
elif k == AddressFormatter.HOUSE:
building_venue_names.append((v, building_is_generic_place, building_is_known_venue_type))

View File

View File

@@ -0,0 +1,51 @@
import copy
import operator
import os
import random
import six
import yaml
from collections import defaultdict
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.address_formatting.formatter import AddressFormatter
from geodata.configs.utils import nested_get, recursive_merge
# Directory containing this module, with the path canonicalised by realpath.
this_dir = os.path.realpath(os.path.dirname(__file__))
# Default config location: three directory levels above this module, then
# resources/postal_codes/config.yaml.
POSTAL_CODES_CONFIG_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'postal_codes', 'config.yaml')
class PostalCodesConfig(object):
    """Per-country postal code settings loaded from a YAML config file.

    The file is expected to have a ``global`` mapping of defaults and an
    optional ``countries`` mapping of per-country overrides. Each country's
    effective config is a deep copy of the global config merged with that
    country's overrides via ``recursive_merge``.
    """

    def __init__(self, config_file=POSTAL_CODES_CONFIG_FILE):
        """Load and pre-merge the config.

        :param config_file: path to the YAML config file.
        :raises KeyError: if the file has no ``global`` section.
        """
        self.cache = {}

        # Open in binary mode so PyYAML does its own encoding detection
        # (the config contains non-ASCII comments), and close the handle
        # deterministically instead of leaking it.
        with open(config_file, 'rb') as f:
            # safe_load: the config only needs plain scalars, anchors and
            # merge keys; a bare yaml.load() can construct arbitrary objects
            # and is rejected by current PyYAML without an explicit Loader.
            postal_codes_config = yaml.safe_load(f)

        self.global_config = postal_codes_config['global']
        self.country_configs = {}

        # ``or {}`` guards against a present-but-empty ``countries:`` key.
        countries = postal_codes_config.pop('countries', {}) or {}
        for country, country_config in six.iteritems(countries):
            # Merge onto a fresh copy so the shared global config is never mutated.
            global_config_copy = copy.deepcopy(self.global_config)
            self.country_configs[country] = recursive_merge(global_config_copy, country_config)

        # Lookups with no country fall back to the global defaults.
        self.country_configs[None] = self.global_config

    def get_property(self, key, country=None, default=None):
        """Look up a (possibly nested) config value.

        :param key: dotted string path (e.g. ``"a.b"``) or a sequence of keys.
        :param country: optional country code, matched case-insensitively;
            unknown countries use the global config.
        :param default: value passed through to ``nested_get`` as its default.
        :return: whatever ``nested_get`` returns for the selected config.
        """
        if isinstance(key, six.string_types):
            key = key.split(u'.')

        config = self.global_config
        if country:
            country_config = self.country_configs.get(country.lower(), {})
            if country_config:
                config = country_config

        return nested_get(config, key, default=default)


postal_codes_config = PostalCodesConfig()

View File

@@ -0,0 +1,56 @@
import random
from geodata.configs.utils import alternative_probabilities
from geodata.math.sampling import weighted_choice, cdf
from geodata.postal_codes.config import postal_codes_config
from geodata.postal_codes.validation import postcode_regexes
class PostalCodes(object):
    """Postal code helpers driven by ``postal_codes_config`` and ``postcode_regexes``."""

    @classmethod
    def is_valid(cls, postal_code, country):
        """Return True if the stripped postal code is matched in full by the country's regex.

        When ``postcode_regexes.get`` yields nothing for the country, the
        postal code is accepted as valid.
        """
        pattern = postcode_regexes.get(country)
        if not pattern:
            return True

        candidate = postal_code.strip()
        match = pattern.match(candidate)
        # The match has to consume the entire string, not merely a prefix.
        return bool(match) and match.end() == len(candidate)

    @classmethod
    def needs_validation(cls, country):
        """Return the ``validate_postcode`` setting for the country (False if unset)."""
        return postal_codes_config.get_property('validate_postcode', country=country, default=False)

    @classmethod
    def should_strip_components(cls, country_code):
        """Return the ``strip_components`` setting for the country."""
        return postal_codes_config.get_property('strip_components', country=country_code)

    @classmethod
    def add_country_code(cls, postal_code, country):
        """Randomly prefix a digit-leading postal code with a country code phrase.

        The stripped postal code is returned unchanged when the country is not
        configured for prefixes, when the random draw does not fall below the
        configured probability, or when the code is empty or does not start
        with a digit. Otherwise a phrase (the upper-cased country code, or a
        configured alternative) is prepended, joined by a hyphen with the
        configured hyphen probability.
        """
        lookup = postal_codes_config.get_property
        code = postal_code.strip()

        if not lookup('add_country_code', country=country):
            return code

        threshold = lookup('country_code_probablity', country=country, default=0.0)
        # Draw before inspecting the code so the random stream is consumed
        # regardless of what the postal code looks like.
        draw_failed = random.random() >= threshold
        if draw_failed or not code or not code[0].isdigit():
            return code

        phrases = lookup('country_code_phrase', country=country, default=None)
        if phrases is not None:
            # Sample among the configured default and alternative phrases.
            choices, weights = alternative_probabilities(phrases)
            weights_cdf = cdf(weights)
            prefix = weighted_choice(choices, weights_cdf)
        else:
            prefix = country.upper()

        hyphen_probability = lookup('country_code_hyphen_probability', country=country, default=0.0)
        separator = u'-' if random.random() < hyphen_probability else u''

        return u'{}{}{}'.format(prefix, separator, code)

View File

@@ -0,0 +1,33 @@
import re
from geodata.i18n.google import google_i18n
from geodata.postal_codes.config import postal_codes_config
class PostcodeRegexes(object):
    """Lazily compiled, per-country postcode validation regexes.

    A country's ``override_regex`` from ``postal_codes_config`` takes
    precedence; otherwise the ``zip`` expression from ``google_i18n`` is used.
    Results are memoised in ``self.postcode_regexes`` keyed by the lower-cased
    country code. A stored ``None`` means "no postcode expression for this
    country" and is a valid cache hit.
    """

    def __init__(self):
        self.responses = {}
        self.postcode_regexes = {}

    def get(self, country_code):
        """Return the compiled postcode regex for ``country_code``, or None.

        :param country_code: country code string; lookup is case-insensitive.
        :return: a compiled, case-insensitive pattern, or None when neither an
            override nor a Google ``zip`` expression is available.
        """
        key = country_code.lower()

        # Membership test rather than ``.get(...) is None`` so that a cached
        # negative result (None) is honoured instead of re-querying each call.
        if key in self.postcode_regexes:
            return self.postcode_regexes[key]

        override_regex = postal_codes_config.get_property('override_regex', country=country_code)
        if override_regex:
            pattern = re.compile(override_regex, re.I)
            self.postcode_regexes[key] = pattern
            return pattern

        response = google_i18n.get(country_code)
        if not response:
            # No data at all for this country: deliberately not cached, so a
            # later call can try again.
            return None

        postcode_expression = response.get('zip')
        pattern = re.compile(postcode_expression, re.I) if postcode_expression else None
        self.postcode_regexes[key] = pattern
        return pattern


postcode_regexes = PostcodeRegexes()