From 7a360f421186818fba29bcd6c567d067650d3bba Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 8 Feb 2017 16:13:25 -0500 Subject: [PATCH] [osm] addr:postcode can be all over the place in OSM. Start with postcodes containing commas or semicolons. If addr:postcode (on address of building) contains either, iterate over the values and pick the first one that matches a postcode validation regex for that country --- scripts/geodata/osm/formatter.py | 75 ++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index 8bd8f46a..8791627e 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -562,6 +562,17 @@ class OSMAddressFormatter(object): formatted_addresses.append(formatted_address) return formatted_addresses + @classmethod + def valid_postal_code(self, country, postal_code): + postcode_regex = postcode_regexes.get(country) + + if postcode_regex: + postal_code = postal_code.strip() + match = postcode_regex.match(postal_code) + if match and match.end() == len(postal_code): + return True + return False + def extract_valid_postal_codes(self, country, postal_code, validate=True): ''' "Valid" postal codes @@ -583,15 +594,11 @@ class OSMAddressFormatter(object): if postal_code: valid_postcode = False if validate: - postcode_regex = postcode_regexes.get(country) values = number_split_regex.split(postal_code) - - if postcode_regex: - for p in values: - match = postcode_regex.match(p) - if match and match.end() == len(p): - valid_postcode = True - postal_codes.append(p) + for p in values: + if self.valid_postal_code(country, p): + valid_postcode = True + postal_codes.append(p) else: valid_postcode = True @@ -599,12 +606,9 @@ class OSMAddressFormatter(object): postal_codes = parse_osm_number_range(postal_code, parse_letter_range=False, max_range=1000) if validate: valid_postal_codes = [] - postcode_regex = postcode_regexes.get(country) - if postcode_regex: - for pc in postal_codes: - match = postcode_regex.match(pc) - if match and match.end() == len(pc): - valid_postal_codes.append(pc) + for pc in postal_codes: + if self.valid_postal_code(country, pc): + valid_postal_codes.append(pc) postal_codes = valid_postal_codes return postal_codes @@ -776,13 +780,13 @@ class OSMAddressFormatter(object): language_suffix = '' if name and name.strip(): - if six.u(';') in name: - name = random.choice(name.split(six.u(';'))) - elif six.u(',') in name: - name = name.split(six.u(','), 1)[0] + if u';' in name: + name = random.choice(name.split(u';')) + elif u',' in name: + name = name.split(u',', 1)[0] - if six.u('|') in name: - name = name.replace(six.u('|'), six.u('')) + if u'|' in name: + name = name.replace(u'|', u'') name = self.components.strip_whitespace_and_hyphens(name) @@ -1053,6 +1057,20 @@ class OSMAddressFormatter(object): num_basements = None zone = None + postal_code = revised_tags.get(AddressFormatter.POSTCODE, None) + + if postal_code and u';' in postal_code: + postal_code = random.choice(postal_code.split(u';')) + + if postal_code and u',' in postal_code: + for p in postal_code.split(u','): + if self.valid_postal_code(country, p): + revised_tags[AddressFormatter.POSTCODE] = postal_code = p.strip() + break + else: + revised_tags.pop(AddressFormatter.POSTCODE) + postal_code = None + building_venue_names = [] building_components = self.building_components(latitude, longitude) @@ -1072,8 +1090,21 @@ class OSMAddressFormatter(object): building_is_known_venue_type = building_is_known_venue_type or self.is_known_venue_type(building_tags) for k, v in six.iteritems(building_tags): - if k not in revised_tags and k in (AddressFormatter.HOUSE_NUMBER, AddressFormatter.ROAD, AddressFormatter.POSTCODE): + if k not in revised_tags and k in (AddressFormatter.HOUSE_NUMBER, AddressFormatter.ROAD): revised_tags[k] = v + elif k not in revised_tags and k == AddressFormatter.POSTCODE: + m = number_split_regex.search(v) + + if not m: + revised_tags[k] = v + else: + if u';' in v: + v = random.choice(v.split(u';')) + + for p in v.split(','): + if self.valid_postal_code(country, p): + revised_tags[AddressFormatter.POSTCODE] = p.strip() + break elif k == AddressFormatter.HOUSE: building_venue_names.append((v, building_is_generic_place, building_is_known_venue_type)) @@ -1156,8 +1187,6 @@ class OSMAddressFormatter(object): formatted_addresses.extend(self.formatted_places(address_components, country, language)) - - # In Japan an address without places is basically just house_number + metro_station (if given) # However, where there are streets, it's useful to have address-only queries as well if country != Countries.JAPAN: