From 3ae7a15960c3d5dfa254ef0b6c65f3bf1fd9d8f9 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 27 Aug 2016 15:03:23 -0400 Subject: [PATCH] =?UTF-8?q?[openaddresses]=20Adding=20a=20few=20special=20?= =?UTF-8?q?cases=20for=20Spanish.=20Rewrite=20simple=20numeric=20street=20?= =?UTF-8?q?names=20to=20include=20the=20oft-omitted=20Calle=20(e.g.=2027?= =?UTF-8?q?=20=3D>=20Calle=2027),=20which=20is=20uniformly=20omitted=20in?= =?UTF-8?q?=20the=20Spanish-language=20data=20in=20OpenAddresses=20while?= =?UTF-8?q?=20still=20being=20valid=20for=20grid-based=20cities=20like=20M?= =?UTF-8?q?=C3=A9rida.=20Humans=20and=20signs=20usually=20add=20Calle=20fo?= =?UTF-8?q?r=20numeric=20streets=20while=20it=20may=20be=20omitted=20for?= =?UTF-8?q?=20named=20streets?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/geodata/openaddresses/formatter.py | 40 +++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index b9cd5b08..1d5362bd 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + import csv import itertools import os @@ -33,6 +35,9 @@ null_regex = re.compile('^\s*(?:null|none)\s*$', re.I) unknown_regex = re.compile('^\s*(?:unknown)\s*$', re.I) number_sign_regex = re.compile('^#', re.UNICODE) not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I) +sin_numero_regex = re.compile('^\s*s\s\s*/\s*n\s*$') + +SPANISH = 'es' class OpenAddressesFormatter(object): @@ -99,12 +104,24 @@ class OpenAddressesFormatter(object): except (ValueError, TypeError): return house_number.strip('# ') and is_numeric(house_number) and not all((c == '0' for c in house_number if c.isdigit())) + @classmethod + def validate_house_number_spanish(cls, house_number): + if sin_numero_regex.match(house_number): + return True + return cls.validate_house_number(house_number) + component_validators = { AddressFormatter.HOUSE_NUMBER: validators.validate_house_number, AddressFormatter.ROAD: validators.validate_street, AddressFormatter.POSTCODE: validators.validate_postcode, } + language_validators = { + SPANISH: { + AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_spanish, + }, + } + def get_property(self, key, *configs): for config in configs: value = config.get(key, None) @@ -155,6 +172,23 @@ class OpenAddressesFormatter(object): pass return num + def spanish_street_name(self, street): + ''' + Most Spanish street names begin with Calle officially + but since it's so common, this is often omitted entirely. + As such, for Spanish-speaking places with numbered streets + like Mérida in Mexico, it would be legitimate to have a + simple number like "27" for the street name in a GIS + data set which omits the Calle. However, we don't really + want to train on "27/road 1/house_number" as that's not + typically how a numeric-only street would be written. However, + we don't want to neglect entire cities like Mérida which are + predominantly a grid, so add Calle (may be abbreviated later). + ''' + if is_numeric(street): + street = six.u('Calle {}').format(street) + return street + def strip_unit_phrases_for_language(self, value, language): if language in self.unit_type_regexes: return self.unit_type_regexes[language].sub(six.u(''), value) @@ -206,7 +240,11 @@ class OpenAddressesFormatter(object): if not value: continue - validator = self.component_validators.get(key, None) + if key == AddressFormatter.ROAD and language == SPANISH: + value = self.spanish_street_name(value) + + validator = self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None)) + if validator is not None and not validator(value): continue