From 182c0b3d26130ebf3c2bcb716f09d5a227fe014e Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 5 Oct 2016 17:05:24 -0400 Subject: [PATCH] [addresses] adding country-specific cleanups for Kingston (city=Kingston 12 split into city=Kingston, postcode=12) and Dublin (e.g. Dublin 3 specified various ways will be treated as a city_district, whereas Eirecodes are treated as postal codes) --- scripts/geodata/addresses/components.py | 96 ++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 2 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 563ad9db..2a43f06a 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -3,6 +3,7 @@ import operator import os import pycountry import random +import re import six import yaml @@ -45,7 +46,6 @@ this_dir = os.path.realpath(os.path.dirname(__file__)) PARSER_DEFAULT_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources', 'parser', 'default.yaml') - class ComponentDependencies(object): ''' Declare an address component and its dependencies e.g. @@ -133,6 +133,9 @@ class AddressComponents(object): ALPHANUMERIC_PHRASE = 'alphanumeric' STANDALONE_PHRASE = 'standalone' + IRELAND = 'ie' + JAMAICA = 'jm' + class zones: COMMERCIAL = 'commercial' RESIDENTIAL = 'residential' @@ -675,6 +678,87 @@ class AddressComponents(object): else: return ':{}'.format(language_script or non_local_language or language) + # e.g. Dublin 3 + dublin_postal_district_regex_str = '(?:[1-9]|1[1-9]|2[0-4]|6w)' + dublin_postal_district_regex = re.compile('^{}$'.format(dublin_postal_district_regex_str), re.I) + dublin_city_district_regex = re.compile('dublin {}$'.format(dublin_postal_district_regex_str), re.I) + + def format_dublin_postal_district(self, address_components): + ''' + Dublin postal districts + ----------------------- + + Since the introduction of the Eire code, former Dublin postcodes + are basically being used as what we would call a city_district in + libpostal, so fix that here. + + If addr:city is given as "Dublin 3", make it city_district instead + If addr:city is given as "Dublin" or "City of Dublin" and addr:postcode + is given as "3", remove city/postcode and make it city_district "Dublin 3" + ''' + + city = address_components.get(AddressFormatter.CITY) + # Change to city_district + if city and self.dublin_city_district_regex.match(city): + address_components[AddressFormatter.CITY_DISTRICT] = address_components.pop(AddressFormatter.CITY) + postcode = address_components.get(AddressFormatter.POSTCODE) + if postcode and (self.dublin_postal_district_regex.match(postcode) or self.dublin_city_district_regex.match(postcode)): + address_components.pop(AddressFormatter.POSTCODE) + return True + elif city.lower() in ('dublin', 'city of dublin', 'dublin city') and AddressFormatter.POSTCODE in address_components: + postcode = address_components[AddressFormatter.POSTCODE] + if self.dublin_postal_district_regex.match(postcode): + address_components.pop(AddressFormatter.CITY) + address_components[AddressFormatter.CITY_DISTRICT] = 'Dublin {}'.format(address_components.pop(AddressFormatter.POSTCODE)) + return True + elif self.dublin_city_district_regex.match(postcode): + address_components[AddressFormatter.CITY_DISTRICT] = address_components.pop(AddressFormatter.POSTCODE) + return True + return False + + # e.g. Kingston 5 + kingston_postcode_regex = re.compile('(kingston )?([1-9]|1[1-9]|20|c\.?s\.?o\.?)$', re.I) + + def format_kingston_postcode(self, address_components): + ''' + Kingston postcodes + ------------------ + Jamaica does not have a postcode system, except in Kingston where + there are postal zones 1-20 plus the Central Sorting Office (CSO). + + These are not always consistently labeled in OSM, so normalize here. + + If city is given as "Kingston 20", separate into city="Kingston", postcode="20" + ''' + city = address_components.get(AddressFormatter.CITY) + postcode = address_components.get(AddressFormatter.POSTCODE) + + if city: + match = self.kingston_postcode_regex.match(city) + if match: + city, postcode = match.groups() + if city: + address_components[AddressFormatter.CITY] = city + else: + address_components.pop(AddressFormatter.CITY) + + if postcode: + address_components[AddressFormatter.POSTCODE] = postcode + + return True + elif postcode: + match = self.kingston_postcode_regex.match(postcode) + if match: + city, postcode = match.groups() + if city and AddressFormatter.CITY not in address_components: + address_components[AddressFormatter.CITY] = city + + if postcode: + address_components[AddressFormatter.POSTCODE] = postcode + + return True + return False + def add_admin_boundaries(self, address_components, osm_components, country, language, @@ -1138,6 +1222,12 @@ class AddressComponents(object): else: address_components.pop(AddressFormatter.HOUSE_NUMBER, None) + def country_specific_cleanup(self, address_components, country): + if country == self.IRELAND: + return self.format_dublin_postal_district(address_components) + elif country == self.JAMAICA: + return self.format_kingston_postcode(address_components) + def add_house_number_phrase(self, address_components, language, country=None): house_number = address_components.get(AddressFormatter.HOUSE_NUMBER, None) if not is_numeric(house_number) and (not house_number or house_number.lower() not in self.latin_alphabet_lower): @@ -1292,6 +1382,9 @@ class AddressComponents(object): street = address_components.get(AddressFormatter.ROAD) + self.cleanup_boundary_names(address_components) + self.country_specific_cleanup(address_components, country) + self.replace_name_affixes(address_components, non_local_language or language) self.replace_names(address_components) @@ -1303,7 +1396,6 @@ class AddressComponents(object): self.cleanup_house_number(address_components) self.remove_numeric_boundary_names(address_components) - self.cleanup_boundary_names(address_components) self.add_house_number_phrase(address_components, language, country=country) self.add_postcode_phrase(address_components, language, country=country) self.add_metro_station_phrase(address_components, language, country=country)