From de2dffa3152eb6a78c456e9d7d10090fd4292ee3 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 2 Jan 2017 23:41:01 -0500 Subject: [PATCH] [addresses] adding Calle to purely numeric Spanish street names in OSM as well --- scripts/geodata/addresses/components.py | 27 +++++++++++++++++++++- scripts/geodata/openaddresses/formatter.py | 20 +--------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 3a77ae1e..d6507c5b 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -68,6 +68,7 @@ MACAO = 'mo' JAPANESE_ROMAJI = 'ja_rm' ENGLISH = 'en' +SPANISH = 'es' JAPANESE = 'ja' CHINESE = 'zh' @@ -884,6 +885,24 @@ class AddressComponents(object): if genitive_probability is not None and random.random() < float(genitive_probability): address_components[component] = self.genitive_name(address_components[component], language) + @classmethod + def spanish_street_name(cls, street): + ''' + Most Spanish street names begin with Calle officially + but since it's so common, this is often omitted entirely. + As such, for Spanish-speaking places with numbered streets + like Mérida in Mexico, it would be legitimate to have a + simple number like "27" for the street name in a GIS + data set which omits the Calle. However, we don't really + want to train on "27/road 1/house_number" as that's not + typically how a numeric-only street would be written. However, + we don't want to neglect entire cities like Mérida which are + predominantly a grid, so add Calle (may be abbreviated later). + ''' + if is_numeric(street): + street = six.u('Calle {}').format(street) + return street + def abbreviated_state(self, state, country, language): abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability'))) @@ -1672,8 +1691,14 @@ class AddressComponents(object): self.add_neighborhoods(address_components, neighborhoods, country, language, non_local_language=non_local_language, language_suffix=language_suffix) - street = address_components.get(AddressFormatter.ROAD) self.cleanup_street(address_components) + street = address_components.get(AddressFormatter.ROAD) + + if language == SPANISH and street: + norm_street = self.spanish_street_name(street) + if norm_street: + address_components[AddressFormatter.ROAD] = norm_street + street = norm_street self.cleanup_boundary_names(address_components) diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index a7aa49d0..01d51452 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -195,24 +195,6 @@ class OpenAddressesFormatter(object): pass return num - @classmethod - def spanish_street_name(cls, street): - ''' - Most Spanish street names begin with Calle officially - but since it's so common, this is often omitted entirely. - As such, for Spanish-speaking places with numbered streets - like Mérida in Mexico, it would be legitimate to have a - simple number like "27" for the street name in a GIS - data set which omits the Calle. However, we don't really - want to train on "27/road 1/house_number" as that's not - typically how a numeric-only street would be written. However, - we don't want to neglect entire cities like Mérida which are - predominantly a grid, so add Calle (may be abbreviated later). - ''' - if is_numeric(street): - street = six.u('Calle {}').format(street) - return street - def strip_unit_phrases_for_language(self, value, language): if language in self.unit_type_regexes: return self.unit_type_regexes[language].sub(six.u(''), value) @@ -300,7 +282,7 @@ class OpenAddressesFormatter(object): value = mapped_values[key].get(value, value) if key == AddressFormatter.ROAD and language == SPANISH: - value = self.spanish_street_name(value) + value = self.components.spanish_street_name(value) if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE: if add_osm_boundaries: