[addresses] adding new config for postal codes around the world. Allows appending the ISO alpha-2 country code to the beginning of the postcode as in e.g. SI-1000 (only used if the postcode begins with a digit). This system was used for postal codes in continental Europe as a recommendation from the CEPT. Now 7 member states still use it, so in those countries add the country-code with higher probability. The config also contains the license plate codes for countries where e.g. L-1234 might be used instead of LU-1234. Allows configuring in which countries postcodes should be validated using Google's per-country validation regexes (and the ability to override with a custom regex), and in which countries other admin component names should be stripped.
This commit is contained in:
300
resources/postal_codes/config.yaml
Normal file
300
resources/postal_codes/config.yaml
Normal file
@@ -0,0 +1,300 @@
|
|||||||
|
global:
|
||||||
|
# Validate postcodes using Google's per-country i18n regexes
|
||||||
|
validate_postcode: false
|
||||||
|
# Strip other components like city, state, etc. from the postcode
|
||||||
|
strip_components: false
|
||||||
|
|
||||||
|
# Add country code to the beginning of the string
|
||||||
|
add_country_code: false
|
||||||
|
country_code_probablity: 0.0
|
||||||
|
country_code_hyphen_probability: 0.0
|
||||||
|
|
||||||
|
|
||||||
|
use_country_code_seldomly: &use_country_code_seldomly
|
||||||
|
add_country_code: true
|
||||||
|
country_code_probablity: 0.1
|
||||||
|
country_code_hyphen_probability: 0.8
|
||||||
|
|
||||||
|
use_country_code_sometimes: &use_country_code_sometimes
|
||||||
|
add_country_code: true
|
||||||
|
country_code_probablity: 0.4
|
||||||
|
country_code_hyphen_probability: 0.8
|
||||||
|
|
||||||
|
use_country_code_frequently: &use_country_code_frequently
|
||||||
|
add_country_code: true
|
||||||
|
country_code_probablity: 0.7
|
||||||
|
country_code_hyphen_probability: 0.8
|
||||||
|
|
||||||
|
|
||||||
|
countries:
|
||||||
|
# Albania
|
||||||
|
al: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Austria
|
||||||
|
at:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: AT
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: A
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Åland Islands (same as Finland)
|
||||||
|
ax: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Azerbaijan
|
||||||
|
az: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Barbados
|
||||||
|
bb: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Belgium
|
||||||
|
be:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: BE
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: B
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Bulgaria
|
||||||
|
bg: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Bosnia and Herzegovina
|
||||||
|
# NOTE: ISO 3166-1 alpha-2 for Bosnia and Herzegovina is "ba" ("bs" is the Bahamas)
ba: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Canada
|
||||||
|
ca:
|
||||||
|
validate_postcode: true
|
||||||
|
|
||||||
|
# Switzerland
|
||||||
|
ch: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Cyprus
|
||||||
|
cy: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Czech Republic
|
||||||
|
cz: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Germany
|
||||||
|
de:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: DE
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: D
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Denmark
|
||||||
|
dk: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Estonia
|
||||||
|
ee: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Spain
|
||||||
|
es:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: ES
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: E
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Finland
|
||||||
|
fi: *use_country_code_frequently
|
||||||
|
|
||||||
|
# France
|
||||||
|
fr:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: FR
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: F
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# United Kingdom
|
||||||
|
gb:
|
||||||
|
validate_postcode: true
|
||||||
|
|
||||||
|
# Guernsey
|
||||||
|
gg:
|
||||||
|
validate_postcode: true
|
||||||
|
|
||||||
|
# Greece
|
||||||
|
gr: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Honduras
|
||||||
|
hn: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Croatia
|
||||||
|
hr: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Hungary
|
||||||
|
hu:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: HU
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: H
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Isle of Man
|
||||||
|
im:
|
||||||
|
validate_postcode: true
|
||||||
|
|
||||||
|
# Iceland
|
||||||
|
is: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Italy
|
||||||
|
it:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: IT
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: I
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Jersey
|
||||||
|
je:
|
||||||
|
validate_postcode: true
|
||||||
|
|
||||||
|
# Cayman Islands
|
||||||
|
ky: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Saint Lucia
|
||||||
|
lc: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Lithuania
|
||||||
|
lt: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Luxembourg
|
||||||
|
lu:
|
||||||
|
<<: *use_country_code_sometimes
|
||||||
|
country_code_phrase:
|
||||||
|
default: L
|
||||||
|
probability: 0.7
|
||||||
|
alternatives:
|
||||||
|
- alternative: LU
|
||||||
|
probability: 0.3
|
||||||
|
|
||||||
|
# Latvia
|
||||||
|
lv: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Monaco
|
||||||
|
mc: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Moldova
|
||||||
|
md: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Montenegro
|
||||||
|
me: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Macedonia
|
||||||
|
mk: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Malta
|
||||||
|
mt:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: MT
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: M
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Netherlands
|
||||||
|
nl: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Norway
|
||||||
|
"no":
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: "NO"
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: N
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Peru
|
||||||
|
pe: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Poland
|
||||||
|
pl: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Portugal
|
||||||
|
pt:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: PT
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: P
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Réunion
|
||||||
|
re: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Romania
|
||||||
|
ro: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Serbia
|
||||||
|
rs: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Sweden
|
||||||
|
se:
|
||||||
|
<<: *use_country_code_sometimes
|
||||||
|
country_code_phrase:
|
||||||
|
default: SE
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: S
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Slovenia
|
||||||
|
si: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Svalbard and Jan Mayen
|
||||||
|
sj: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Slovakia
|
||||||
|
sk: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# San Marino
|
||||||
|
sm: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# Turkey
|
||||||
|
tr: *use_country_code_seldomly
|
||||||
|
|
||||||
|
# United States
us:
|
||||||
|
validate_postcode: true
|
||||||
|
strip_components: true
|
||||||
|
override_regex: "(\\d{5})(?:[ \\-]?(\\d{4}))?"
|
||||||
|
|
||||||
|
# Vatican
|
||||||
|
va:
|
||||||
|
<<: *use_country_code_seldomly
|
||||||
|
country_code_phrase:
|
||||||
|
default: VA
|
||||||
|
probability: 0.6
|
||||||
|
alternatives:
|
||||||
|
- alternative: V
|
||||||
|
probability: 0.4
|
||||||
|
|
||||||
|
# Saint Vincent and the Grenadines
|
||||||
|
vc: *use_country_code_frequently
|
||||||
|
|
||||||
|
# British Virgin Islands
|
||||||
|
vg: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Samoa
|
||||||
|
ws: *use_country_code_frequently
|
||||||
|
|
||||||
|
# Kosovo
|
||||||
|
xk: *use_country_code_seldomly
|
||||||
@@ -18,7 +18,7 @@ from geodata.address_expansions.gazetteers import *
|
|||||||
from geodata.address_formatting.formatter import AddressFormatter
|
from geodata.address_formatting.formatter import AddressFormatter
|
||||||
|
|
||||||
from geodata.countries.names import country_names
|
from geodata.countries.names import country_names
|
||||||
from geodata.i18n.google import postcode_regexes
|
from geodata.postal_codes.validation import postcode_regexes
|
||||||
from geodata.names.normalization import name_affixes
|
from geodata.names.normalization import name_affixes
|
||||||
from geodata.places.config import place_config
|
from geodata.places.config import place_config
|
||||||
|
|
||||||
|
|||||||
@@ -35,26 +35,3 @@ class GoogleI18N(object):
|
|||||||
|
|
||||||
|
|
||||||
google_i18n = GoogleI18N()
|
google_i18n = GoogleI18N()
|
||||||
|
|
||||||
|
|
||||||
class PostcodeRegexes(object):
    """Cache of compiled postcode regexes keyed by lowercased country code."""

    def __init__(self):
        self.responses = {}
        # Maps lowercased country code -> compiled regex, or None when the
        # country's response carried no 'zip' expression
        self.postcode_regexes = {}

    def get(self, country_code):
        """Return the compiled postcode regex for country_code, or None.

        :param country_code: country code string; lookups are case-insensitive
        :return: a case-insensitive compiled pattern built from the 'zip'
                 entry of google_i18n's response, or None if there is no
                 response or no 'zip' expression
        """
        key = country_code.lower()

        # Use a membership test instead of .get(): a cached None means
        # "known to have no regex" and must not trigger another lookup.
        if key in self.postcode_regexes:
            return self.postcode_regexes[key]

        response = google_i18n.get(country_code)
        if not response:
            # Nothing came back; leave the cache untouched so a later call
            # can try again (same as the previous behavior).
            return None

        postcode_expression = response.get('zip')
        ret = re.compile(postcode_expression, re.I) if postcode_expression else None
        self.postcode_regexes[key] = ret
        return ret
|
|
||||||
|
|
||||||
|
|
||||||
postcode_regexes = PostcodeRegexes()
|
|
||||||
|
|||||||
@@ -34,7 +34,6 @@ from geodata.configs.utils import nested_get
|
|||||||
from geodata.countries.country_names import *
|
from geodata.countries.country_names import *
|
||||||
from geodata.language_id.disambiguation import *
|
from geodata.language_id.disambiguation import *
|
||||||
from geodata.language_id.sample import INTERNET_LANGUAGE_DISTRIBUTION
|
from geodata.language_id.sample import INTERNET_LANGUAGE_DISTRIBUTION
|
||||||
from geodata.i18n.google import postcode_regexes
|
|
||||||
from geodata.i18n.languages import *
|
from geodata.i18n.languages import *
|
||||||
from geodata.intersections.query import Intersection, IntersectionQuery
|
from geodata.intersections.query import Intersection, IntersectionQuery
|
||||||
from geodata.address_formatting.formatter import AddressFormatter
|
from geodata.address_formatting.formatter import AddressFormatter
|
||||||
@@ -45,6 +44,7 @@ from geodata.osm.intersections import OSMIntersectionReader
|
|||||||
from geodata.places.config import place_config
|
from geodata.places.config import place_config
|
||||||
from geodata.polygons.language_polys import *
|
from geodata.polygons.language_polys import *
|
||||||
from geodata.polygons.reverse_geocode import *
|
from geodata.polygons.reverse_geocode import *
|
||||||
|
from geodata.postal_codes.validation import postcode_regexes
|
||||||
from geodata.i18n.unicode_paths import DATA_DIR
|
from geodata.i18n.unicode_paths import DATA_DIR
|
||||||
from geodata.text.tokenize import tokenize, token_types
|
from geodata.text.tokenize import tokenize, token_types
|
||||||
from geodata.text.utils import is_numeric
|
from geodata.text.utils import is_numeric
|
||||||
@@ -1101,10 +1101,11 @@ class OSMAddressFormatter(object):
|
|||||||
if u';' in v:
|
if u';' in v:
|
||||||
v = random.choice(v.split(u';'))
|
v = random.choice(v.split(u';'))
|
||||||
|
|
||||||
for p in v.split(','):
|
if u',' in v:
|
||||||
if self.valid_postal_code(country, p):
|
for p in v.split(','):
|
||||||
revised_tags[AddressFormatter.POSTCODE] = p.strip()
|
if self.valid_postal_code(country, p):
|
||||||
break
|
revised_tags[AddressFormatter.POSTCODE] = p.strip()
|
||||||
|
break
|
||||||
elif k == AddressFormatter.HOUSE:
|
elif k == AddressFormatter.HOUSE:
|
||||||
building_venue_names.append((v, building_is_generic_place, building_is_known_venue_type))
|
building_venue_names.append((v, building_is_generic_place, building_is_known_venue_type))
|
||||||
|
|
||||||
|
|||||||
0
scripts/geodata/postal_codes/__init__.py
Normal file
0
scripts/geodata/postal_codes/__init__.py
Normal file
51
scripts/geodata/postal_codes/config.py
Normal file
51
scripts/geodata/postal_codes/config.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
import copy
|
||||||
|
import operator
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import six
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
||||||
|
from geodata.address_formatting.formatter import AddressFormatter
|
||||||
|
from geodata.configs.utils import nested_get, recursive_merge
|
||||||
|
|
||||||
|
|
||||||
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||||
|
|
||||||
|
POSTAL_CODES_CONFIG_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
|
'resources', 'postal_codes', 'config.yaml')
|
||||||
|
|
||||||
|
|
||||||
|
class PostalCodesConfig(object):
    """Postal code settings loaded from a YAML config file.

    The file must contain a ``global`` mapping of default settings and may
    contain a ``countries`` mapping keyed by country code. Each country's
    settings are combined with a deep copy of the global settings using
    ``recursive_merge``.
    """

    def __init__(self, config_file=POSTAL_CODES_CONFIG_FILE):
        """Load and pre-merge the configuration.

        :param config_file: path to the YAML config file
        """
        self.cache = {}

        # Read as bytes so PyYAML detects the encoding itself (the file has
        # non-ASCII comments), close the handle deterministically, and use
        # safe_load: the config only needs plain scalars/maps/lists plus
        # anchors, aliases and "<<" merge keys, all of which it supports.
        with open(config_file, 'rb') as f:
            postal_codes_config = yaml.safe_load(f)

        self.global_config = postal_codes_config['global']
        self.country_configs = {}

        # A bare "countries:" key parses as None, so fall back to an empty map
        countries = postal_codes_config.pop('countries', None) or {}

        for country_code, country_config in six.iteritems(countries):
            # Copy first so merging never mutates the shared global defaults
            global_config_copy = copy.deepcopy(self.global_config)
            self.country_configs[country_code] = recursive_merge(global_config_copy, country_config)

        self.country_configs[None] = self.global_config

    def get_property(self, key, country=None, default=None):
        """Look up a setting, preferring the country's config when one exists.

        :param key: either a dot-separated string (split on ".") or an
                    already-split sequence of keys, passed to ``nested_get``
        :param country: optional country code; it is lowercased before the
                        lookup. If falsy, or if no non-empty config exists
                        for it, the global config is used instead.
        :param default: forwarded to ``nested_get`` as its ``default``
        :return: the result of ``nested_get`` on the selected config
        """
        if isinstance(key, six.string_types):
            key = key.split(u'.')

        config = self.global_config

        if country:
            country_config = self.country_configs.get(country.lower(), {})
            if country_config:
                config = country_config

        return nested_get(config, key, default=default)
|
||||||
|
|
||||||
|
postal_codes_config = PostalCodesConfig()
|
||||||
56
scripts/geodata/postal_codes/phrases.py
Normal file
56
scripts/geodata/postal_codes/phrases.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
import random
|
||||||
|
|
||||||
|
from geodata.configs.utils import alternative_probabilities
|
||||||
|
from geodata.math.sampling import weighted_choice, cdf
|
||||||
|
from geodata.postal_codes.config import postal_codes_config
|
||||||
|
from geodata.postal_codes.validation import postcode_regexes
|
||||||
|
|
||||||
|
|
||||||
|
class PostalCodes(object):
    """Class-level helpers for validating postcodes and prefixing country codes."""

    @classmethod
    def is_valid(cls, postal_code, country):
        """Check postal_code against the country's postcode regex.

        Returns True when no regex is available for the country; otherwise
        True only if the regex matches the whole stripped postcode.
        """
        pattern = postcode_regexes.get(country)
        if not pattern:
            # Nothing to validate against, so accept the value
            return True

        candidate = postal_code.strip()
        match = pattern.match(candidate)
        # match() only anchors at the start; require it to consume everything
        return bool(match and match.end() == len(candidate))

    @classmethod
    def needs_validation(cls, country):
        """Return the 'validate_postcode' setting for country (False if unset)."""
        return postal_codes_config.get_property('validate_postcode', country=country, default=False)

    @classmethod
    def should_strip_components(cls, country_code):
        """Return the 'strip_components' setting for country_code."""
        return postal_codes_config.get_property('strip_components', country=country_code)

    @classmethod
    def add_country_code(cls, postal_code, country):
        """Randomly prefix the stripped postcode with a country code phrase.

        The prefix is added only if 'add_country_code' is enabled for the
        country, a random draw falls below 'country_code_probablity', and the
        postcode is non-empty and starts with a digit. A second random draw
        against 'country_code_hyphen_probability' decides whether a hyphen
        separates the prefix from the postcode.
        """
        get_property = postal_codes_config.get_property

        postal_code = postal_code.strip()
        if not get_property('add_country_code', country=country):
            return postal_code

        cc_probability = get_property('country_code_probablity', country=country, default=0.0)
        # The random draw stays first so it happens regardless of the postcode
        if not (random.random() < cc_probability and postal_code and postal_code[0].isdigit()):
            return postal_code

        phrase_config = get_property('country_code_phrase', country=country, default=None)
        if phrase_config is not None:
            alternates, probs = alternative_probabilities(phrase_config)
            prefix = weighted_choice(alternates, cdf(probs))
        else:
            # No configured phrases: fall back to the uppercased country code
            prefix = country.upper()

        cc_hyphen_probability = get_property('country_code_hyphen_probability', country=country, default=0.0)
        separator = u'-' if random.random() < cc_hyphen_probability else u''

        return u'{}{}{}'.format(prefix, separator, postal_code)
|
||||||
33
scripts/geodata/postal_codes/validation.py
Normal file
33
scripts/geodata/postal_codes/validation.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import re
|
||||||
|
from geodata.i18n.google import google_i18n
|
||||||
|
from geodata.postal_codes.config import postal_codes_config
|
||||||
|
|
||||||
|
|
||||||
|
class PostcodeRegexes(object):
    """Cache of compiled postcode regexes keyed by lowercased country code.

    A regex comes from the 'override_regex' setting in the postal codes
    config when one is set for the country, and otherwise from the 'zip'
    entry of google_i18n's response for that country.
    """

    def __init__(self):
        self.responses = {}
        # Maps lowercased country code -> compiled regex, or None when the
        # country's response carried no 'zip' expression
        self.postcode_regexes = {}

    def get(self, country_code):
        """Return the compiled postcode regex for country_code, or None.

        :param country_code: country code string; lookups are case-insensitive
        :return: a case-insensitive compiled pattern, or None if there is no
                 override, no response, or no 'zip' expression
        """
        key = country_code.lower()

        # Use a membership test instead of .get(): a cached None means
        # "known to have no regex" and must not trigger another lookup.
        if key in self.postcode_regexes:
            return self.postcode_regexes[key]

        # A configured override takes precedence over google_i18n's regex
        override_regex = postal_codes_config.get_property('override_regex', country=country_code)
        if override_regex:
            ret = re.compile(override_regex, re.I)
            self.postcode_regexes[key] = ret
            return ret

        response = google_i18n.get(country_code)
        if not response:
            # Nothing came back; leave the cache untouched so a later call
            # can try again (same as the previous behavior).
            return None

        postcode_expression = response.get('zip')
        ret = re.compile(postcode_expression, re.I) if postcode_expression else None
        self.postcode_regexes[key] = ret
        return ret
|
||||||
|
|
||||||
|
|
||||||
|
postcode_regexes = PostcodeRegexes()
|
||||||
Reference in New Issue
Block a user