[osm] Pulling valid postal codes out into a method

This commit is contained in:
Al
2016-08-13 01:49:26 -04:00
parent 55895369b8
commit bc8acb196c

View File

@@ -421,6 +421,41 @@ class OSMAddressFormatter(object):
formatted_addresses.append(formatted_address)
return formatted_addresses
def extract_valid_postal_codes(self, country, postal_code, validate=True):
'''
"Valid" postal codes
--------------------
This is only really for place formatting at the moment so we don't
accidentally inject too many bad postcodes into the training data.
Sometimes for a given boundary name the postcode might be a delimited
sequence of postcodes like "10013;10014;10015", it might be a range
like "10013-10015", or it might be some non-postcode text. The idea
here is to validate the text using Google's libaddressinput postcode
regexes for the given country. These regexes are a little simplistic
and don't account for variations like the common use of shortened
postcodes in cities like Berlin or Oslo where the first digit is always
known to be the same given the city. Still, at the level of places, that's
probably fine.
'''
postal_codes = []
if postal_code:
valid_postcode = False
if validate:
postcode_regex = postcode_regexes.get(country)
if postcode_regex:
match = postcode_regex.match(postal_code)
if match and match.end() == len(postal_code):
valid_postcode = True
postal_codes.append(postal_code)
else:
valid_postcode = True
if not valid_postcode:
postal_codes = parse_osm_number_range(postal_code, parse_letter_range=False)
return postal_code
def node_place_tags(self, tags):
try:
latitude, longitude = latlon_to_decimal(tags['lat'], tags['lon'])
@@ -490,18 +525,10 @@ class OSMAddressFormatter(object):
place_tags = []
postal_code = revised_tags.get(AddressFormatter.POSTCODE, None)
postal_codes = []
if postal_code:
valid_postcode = False
postcode_regex = postcode_regexes.get(country)
if postcode_regex:
match = postcode_regex.match(postal_code)
if match and match.end() == len(postal_code):
valid_postcode = True
postal_codes.append(postal_code)
if not valid_postcode:
postal_codes = parse_osm_number_range(postal_code, parse_letter_range=False)
postal_codes = self.extract_valid_postal_codes(country, postal_code)
try:
population = int(tags.get('population', 0))