From c3c949a147dd2fb7de3a9c5acddf1a8b2849c44a Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 1 Sep 2016 17:41:27 -0400 Subject: [PATCH] [openaddresses] adding the Netherlands with some hacks for house number until the new format function is deployed in OpenAddresses --- resources/parser/data_sets/openaddresses.yaml | 4 +++ scripts/geodata/openaddresses/formatter.py | 31 ++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/resources/parser/data_sets/openaddresses.yaml b/resources/parser/data_sets/openaddresses.yaml index ff783d27..00189000 100644 --- a/resources/parser/data_sets/openaddresses.yaml +++ b/resources/parser/data_sets/openaddresses.yaml @@ -791,6 +791,10 @@ countries: files: - filename: statewide.csv + nl: + files: + - filename: countrywide.json + us: cldr_country_probability: 0.05 ignore_fields_containing: diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 23e475c6..d6683273 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -34,6 +34,8 @@ sin_numero_regex = re.compile('^\s*s\s\s*/\s*n\s*$') fraction_regex = re.compile('^\s*[\d]+[\s]*/[\s]*(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)[\s]*$', re.I) number_space_letter_regex = re.compile('^[\d]+ [a-z]', re.I) +dutch_house_number_regex = re.compile('([\d]+)( [a-z])?( [\d]+)?', re.I) + SPANISH = 'es' PORTUGUESE = 'pt' @@ -128,7 +130,7 @@ class OpenAddressesFormatter(object): }, PORTUGUESE: { AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero, - } + }, } def get_property(self, key, *configs): @@ -198,6 +200,26 @@ class OpenAddressesFormatter(object): street = six.u('Calle {}').format(street) return street + # HACK: remove method when #1932 is resolved in OpenAddresses + def dutch_house_number(self, house_number): + house_number = safe_decode(house_number) + match = dutch_house_number_regex.match(house_number) + if not match: + return house_number + + number, letter, additional = match.groups() + + parts = [] + if number: + parts.append(number) + if letter: + parts.append(letter) + if additional: + if parts: + parts.append(six.u('-')) + parts.append(additional) + return six.u('').join(parts) + def strip_unit_phrases_for_language(self, value, language): if language in self.unit_type_regexes: return self.unit_type_regexes[language].sub(six.u(''), value) @@ -238,6 +260,9 @@ class OpenAddressesFormatter(object): latitude_index = headers.index('LAT') longitude_index = headers.index('LON') + # HACK: remove when #1932 is resolved in OpenAddresses + is_netherlands = 'nl' in path.lower().split(os.path.sep)[-3:] + for row in reader: try: latitude = float(row[latitude_index]) @@ -260,6 +285,10 @@ class OpenAddressesFormatter(object): if key == AddressFormatter.ROAD and language == SPANISH: value = self.spanish_street_name(value) + # HACK: remove when #1932 is resolved in OpenAddresses + if key == AddressFormatter.HOUSE_NUMBER and is_netherlands: + value = self.dutch_house_number(value) + if key in AddressFormatter.BOUNDARY_COMPONENTS: value = self.components.cleaned_name(value, first_comma_delimited_phrase=True) if value and len(value) < 2 or is_numeric(value):