From 09f808ca47a5f33f2ea01b85d2d843259b039b6e Mon Sep 17 00:00:00 2001
From: Al
Date: Tue, 13 Dec 2016 17:03:26 -0500
Subject: [PATCH] [geoplanet] only add short postal codes to GeoPlanet data set if they match the Google regexes

---
 .../geodata/geoplanet/geoplanet_training_data.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/scripts/geodata/geoplanet/geoplanet_training_data.py b/scripts/geodata/geoplanet/geoplanet_training_data.py
index 3578978c..c83a3e24 100644
--- a/scripts/geodata/geoplanet/geoplanet_training_data.py
+++ b/scripts/geodata/geoplanet/geoplanet_training_data.py
@@ -18,6 +18,7 @@
 from geodata.address_expansions.gazetteers import *
 from geodata.address_formatting.formatter import AddressFormatter
 from geodata.countries.names import country_names
+from geodata.i18n.google import postcode_regexes
 from geodata.names.normalization import name_affixes
 from geodata.places.config import place_config
 
@@ -199,6 +200,18 @@ class GeoPlanetFormatter(object):
 
             language = self.language_codes[language]
 
+            if len(postal_code) <= 3:
+                postcode_regex = postcode_regexes.get(country)
+
+                valid_postcode = False
+                if postcode_regex:
+                    match = postcode_regex.match(postal_code)
+                    if match and match.end() == len(postal_code):
+                        valid_postcode = True
+
+                if not valid_postcode:
+                    continue
+
             # If the county/state is coterminous with a city and contains only one place,
             # set the parent_id to the city instead
             if parent_id in self.coterminous_admins: