# resources/postal_codes/config.yaml
#
# Per-country settings for postal code handling.
#
# Keys:
#   validate_postcode               - validate using the Google I18N regexes
#   strip_components                - strip city/state/etc. from the postcode
#   add_country_code                - allow prefixing the postcode with a
#                                     country code phrase
#   country_code_probablity         - probability that the prefix is added
#                                     NOTE: "probablity" (sic) is the spelling
#                                     the code looks up; do not correct it
#                                     here without changing the reader too.
#   country_code_hyphen_probability - probability that a hyphen separates the
#                                     prefix from the postcode
#   country_code_phrase             - weighted choice of prefixes; when absent
#                                     the upper-cased country code is used
#   override_regex                  - regex used instead of the Google I18N one
#
# Every entry under "countries" is merged on top of "global".

global:
    # Validate using google I18N regexes
    validate_postcode: false
    # Strip other components like city, state, etc. from the postcode
    strip_components: false

    # Add country code to the beginning of the string
    add_country_code: false
    country_code_probablity: 0.0
    country_code_hyphen_probability: 0.0


# Reusable presets, referenced below through YAML aliases / merge keys.
use_country_code_seldomly: &use_country_code_seldomly
    add_country_code: true
    country_code_probablity: 0.1
    country_code_hyphen_probability: 0.8

use_country_code_sometimes: &use_country_code_sometimes
    add_country_code: true
    country_code_probablity: 0.4
    country_code_hyphen_probability: 0.8

use_country_code_frequently: &use_country_code_frequently
    add_country_code: true
    country_code_probablity: 0.7
    country_code_hyphen_probability: 0.8


# Keys are lower-case ISO 3166-1 alpha-2 country codes.
countries:
    # Albania
    al: *use_country_code_seldomly

    # Austria
    at:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: AT
            probability: 0.6
            alternatives:
                - alternative: A
                  probability: 0.4

    # Åland Islands (same as Finland)
    ax: *use_country_code_frequently

    # Azerbaijan
    az: *use_country_code_frequently

    # Barbados
    bb: *use_country_code_frequently

    # Belgium
    be:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: BE
            probability: 0.6
            alternatives:
                - alternative: B
                  probability: 0.4

    # Bulgaria
    bg: *use_country_code_seldomly

    # Bosnia and Herzegovina
    # (ISO code is "ba"; "bs" is the Bahamas)
    ba: *use_country_code_seldomly

    # Canada
    ca:
        validate_postcode: true

    # Switzerland
    ch: *use_country_code_seldomly

    # Cyprus
    cy: *use_country_code_seldomly

    # Czech Republic
    cz: *use_country_code_seldomly

    # Germany
    de:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: DE
            probability: 0.6
            alternatives:
                - alternative: D
                  probability: 0.4

    # Denmark
    dk: *use_country_code_seldomly

    # Estonia
    ee: *use_country_code_seldomly

    # Spain
    es:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: ES
            probability: 0.6
            alternatives:
                - alternative: E
                  probability: 0.4

    # Finland
    fi: *use_country_code_frequently

    # France
    fr:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: FR
            probability: 0.6
            alternatives:
                - alternative: F
                  probability: 0.4

    # United Kingdom
    gb:
        validate_postcode: true

    # Guernsey
    gg:
        validate_postcode: true

    # Greece
    gr: *use_country_code_seldomly

    # Honduras
    hn: *use_country_code_frequently

    # Croatia
    hr: *use_country_code_frequently

    # Hungary
    hu:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: HU
            probability: 0.6
            alternatives:
                - alternative: H
                  probability: 0.4

    # Isle of Man
    im:
        validate_postcode: true

    # Iceland
    is: *use_country_code_seldomly

    # Italy
    it:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: IT
            probability: 0.6
            alternatives:
                - alternative: I
                  probability: 0.4

    # Jersey
    je:
        validate_postcode: true

    # Cayman Islands
    ky: *use_country_code_frequently

    # Saint Lucia
    lc: *use_country_code_frequently

    # Lithuania
    lt: *use_country_code_frequently

    # Luxembourg
    lu:
        <<: *use_country_code_sometimes
        country_code_phrase:
            default: L
            probability: 0.7
            alternatives:
                - alternative: LU
                  probability: 0.3

    # Latvia
    lv: *use_country_code_frequently

    # Monaco
    mc: *use_country_code_seldomly

    # Moldova
    md: *use_country_code_frequently

    # Montenegro
    me: *use_country_code_seldomly

    # Macedonia
    mk: *use_country_code_seldomly

    # Malta
    mt:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: MT
            probability: 0.6
            alternatives:
                - alternative: M
                  probability: 0.4

    # Netherlands
    nl: *use_country_code_seldomly

    # Norway
    # (quoted so YAML does not read the key/value as a boolean)
    "no":
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: "NO"
            probability: 0.6
            alternatives:
                - alternative: N
                  probability: 0.4

    # Peru
    pe: *use_country_code_seldomly

    # Poland
    pl: *use_country_code_seldomly

    # Portugal
    pt:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: PT
            probability: 0.6
            alternatives:
                - alternative: P
                  probability: 0.4

    # Réunion
    re: *use_country_code_seldomly

    # Romania
    ro: *use_country_code_seldomly

    # Serbia
    rs: *use_country_code_seldomly

    # Sweden
    se:
        <<: *use_country_code_sometimes
        country_code_phrase:
            default: SE
            probability: 0.6
            alternatives:
                - alternative: S
                  probability: 0.4

    # Slovenia
    si: *use_country_code_frequently

    # Svalbard and Jan Mayen
    sj: *use_country_code_seldomly

    # Slovakia
    sk: *use_country_code_seldomly

    # San Marino
    sm: *use_country_code_seldomly

    # Turkey
    tr: *use_country_code_seldomly

    # United States
    us:
        validate_postcode: true
        strip_components: true
        override_regex: "(\\d{5})(?:[ \\-]?(\\d{4}))?"

    # Vatican
    va:
        <<: *use_country_code_seldomly
        country_code_phrase:
            default: VA
            probability: 0.6
            alternatives:
                - alternative: V
                  probability: 0.4

    # Saint Vincent and the Grenadines
    vc: *use_country_code_frequently

    # British Virgin Islands
    vg: *use_country_code_frequently

    # Samoa
    ws: *use_country_code_frequently

    # Kosovo
    xk: *use_country_code_seldomly
100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -34,7 +34,6 @@ from geodata.configs.utils import nested_get from geodata.countries.country_names import * from geodata.language_id.disambiguation import * from geodata.language_id.sample import INTERNET_LANGUAGE_DISTRIBUTION -from geodata.i18n.google import postcode_regexes from geodata.i18n.languages import * from geodata.intersections.query import Intersection, IntersectionQuery from geodata.address_formatting.formatter import AddressFormatter @@ -45,6 +44,7 @@ from geodata.osm.intersections import OSMIntersectionReader from geodata.places.config import place_config from geodata.polygons.language_polys import * from geodata.polygons.reverse_geocode import * +from geodata.postal_codes.validation import postcode_regexes from geodata.i18n.unicode_paths import DATA_DIR from geodata.text.tokenize import tokenize, token_types from geodata.text.utils import is_numeric @@ -1101,10 +1101,11 @@ class OSMAddressFormatter(object): if u';' in v: v = random.choice(v.split(u';')) - for p in v.split(','): - if self.valid_postal_code(country, p): - revised_tags[AddressFormatter.POSTCODE] = p.strip() - break + if u',' in v: + for p in v.split(','): + if self.valid_postal_code(country, p): + revised_tags[AddressFormatter.POSTCODE] = p.strip() + break elif k == AddressFormatter.HOUSE: building_venue_names.append((v, building_is_generic_place, building_is_known_venue_type)) diff --git a/scripts/geodata/postal_codes/__init__.py b/scripts/geodata/postal_codes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/geodata/postal_codes/config.py b/scripts/geodata/postal_codes/config.py new file mode 100644 index 00000000..3661e895 --- /dev/null +++ b/scripts/geodata/postal_codes/config.py @@ -0,0 +1,51 @@ +import copy +import operator +import os +import random +import six +import yaml + +from collections import defaultdict + +from geodata.address_expansions.address_dictionaries 
this_dir = os.path.realpath(os.path.dirname(__file__))

POSTAL_CODES_CONFIG_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                        'resources', 'postal_codes', 'config.yaml')


class PostalCodesConfig(object):
    """Postal code settings loaded from a YAML file.

    The file's ``global`` mapping supplies the defaults. Every entry of its
    ``countries`` mapping is merged on top of a deep copy of those defaults,
    so each country ends up with a complete, independent config.
    """

    def __init__(self, config_file=POSTAL_CODES_CONFIG_FILE):
        """Load and merge the configuration.

        :param config_file: path to the YAML config file
        :raises KeyError: if the file has no ``global`` section
        """
        self.cache = {}

        # Close the handle deterministically, and use safe_load: the config
        # only needs plain scalars/mappings/anchors, and yaml.load() without
        # an explicit Loader is rejected by newer PyYAML releases.
        with open(config_file) as f:
            postal_codes_config = yaml.safe_load(f)

        self.global_config = postal_codes_config['global']
        self.country_configs = {}

        # An empty "countries:" key parses as None; treat it like a missing key.
        countries = postal_codes_config.pop('countries', None) or {}

        for country, country_config in six.iteritems(countries):
            # Merge into a copy so countries never mutate the shared defaults.
            global_config_copy = copy.deepcopy(self.global_config)
            self.country_configs[country] = recursive_merge(global_config_copy, country_config)

        self.country_configs[None] = self.global_config

    def get_property(self, key, country=None, default=None):
        """Look up a setting, preferring the country's config when it exists.

        :param key: dotted string (split on ``.``) or a sequence of path parts
        :param country: country code, matched case-insensitively; when falsy
                        or unknown the global config is used
        :param default: value passed to ``nested_get`` as its default
        :return: whatever ``nested_get`` returns for the chosen config
        """
        if isinstance(key, six.string_types):
            key = key.split(u'.')

        config = self.global_config

        if country:
            country_config = self.country_configs.get(country.lower())
            if country_config:
                config = country_config

        return nested_get(config, key, default=default)


postal_codes_config = PostalCodesConfig()
class PostalCodes(object):
    """Class-level helpers for validating and decorating postal codes."""

    @classmethod
    def is_valid(cls, postal_code, country):
        """Return True when the stripped postal code is matched in full.

        Countries for which no regex is available are treated as valid.
        """
        pattern = postcode_regexes.get(country)
        if not pattern:
            return True

        candidate = postal_code.strip()
        match = pattern.match(candidate)
        # match() only anchors at the start, so also require that the match
        # runs to the end of the string.
        return bool(match and match.end() == len(candidate))

    @classmethod
    def needs_validation(cls, country):
        """Return the country's ``validate_postcode`` setting (default False)."""
        return postal_codes_config.get_property('validate_postcode',
                                                country=country,
                                                default=False)

    @classmethod
    def should_strip_components(cls, country_code):
        """Return the country's ``strip_components`` setting."""
        return postal_codes_config.get_property('strip_components',
                                                country=country_code)

    @classmethod
    def add_country_code(cls, postal_code, country):
        """Randomly prefix the stripped postal code with a country code phrase.

        The code is returned unprefixed when the country has
        ``add_country_code`` disabled, when the random draw misses
        ``country_code_probablity``, or when the code is empty or does not
        start with a digit.
        """
        get_property = postal_codes_config.get_property

        postal_code = postal_code.strip()
        if not get_property('add_country_code', country=country):
            return postal_code

        cc_probability = get_property('country_code_probablity', country=country, default=0.0)
        # The random draw comes first, exactly as before, so the sequence of
        # random numbers consumed is unchanged.
        wants_prefix = random.random() < cc_probability
        if not (wants_prefix and postal_code and postal_code[0].isdigit()):
            return postal_code

        country_code_phrases = get_property('country_code_phrase', country=country, default=None)
        if country_code_phrases is not None:
            # Pick one of the configured phrases according to its weight.
            alternates, probs = alternative_probabilities(country_code_phrases)
            country_code_phrase = weighted_choice(alternates, cdf(probs))
        else:
            country_code_phrase = country.upper()

        cc_hyphen_probability = get_property('country_code_hyphen_probability', country=country, default=0.0)
        separator = u'-' if random.random() < cc_hyphen_probability else u''

        return u'{}{}{}'.format(country_code_phrase, separator, postal_code)
class PostcodeRegexes(object):
    """Lazily built cache of compiled postal code regexes, keyed by country."""

    def __init__(self):
        self.responses = {}
        # lower-cased country code -> compiled regex, or None when the
        # country is known to have no postal code expression
        self.postcode_regexes = {}

    def get(self, country_code):
        """Return the compiled, case-insensitive postcode regex for a country.

        An ``override_regex`` from the postal codes config takes precedence
        over the ``zip`` expression of the Google I18N data.

        :param country_code: country code, matched case-insensitively
        :return: compiled regex, or None when no expression is available
        """
        key = country_code.lower()

        # Membership test rather than ``.get(...) is None``: a cached None
        # ("no regex for this country") must count as a hit, otherwise the
        # config and the Google data are re-queried on every call.
        if key in self.postcode_regexes:
            return self.postcode_regexes[key]

        expression = postal_codes_config.get_property('override_regex', country=country_code)
        if not expression:
            response = google_i18n.get(country_code)
            if not response:
                # No data for this country; as before, nothing is cached
                # here, so a later call looks it up again.
                return None
            expression = response.get('zip')

        regex = re.compile(expression, re.I) if expression else None
        self.postcode_regexes[key] = regex
        return regex


postcode_regexes = PostcodeRegexes()