[openaddresses] formatting Chinese house number (with annex adding a second number potentially) and adding Spanish street names after the language is known by reverse geocoding

This commit is contained in:
Al
2017-01-28 01:01:26 -05:00
parent c9417436f7
commit 2953759321

View File

@@ -24,6 +24,8 @@ from geodata.language_id.disambiguation import UNKNOWN_LANGUAGE, get_string_scri
from geodata.math.sampling import cdf, weighted_choice from geodata.math.sampling import cdf, weighted_choice
from geodata.openaddresses.config import openaddresses_config from geodata.openaddresses.config import openaddresses_config
from geodata.places.config import place_config from geodata.places.config import place_config
from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types
from geodata.text.utils import is_numeric, is_numeric_strict from geodata.text.utils import is_numeric, is_numeric_strict
from geodata.csv_utils import tsv_string, unicode_csv_reader from geodata.csv_utils import tsv_string, unicode_csv_reader
@@ -49,6 +51,7 @@ dutch_house_number_regex = re.compile('([\d]+)( [a-z])?( [\d]+)?', re.I)
SPANISH = 'es' SPANISH = 'es'
PORTUGUESE = 'pt' PORTUGUESE = 'pt'
RUSSIAN = 'ru' RUSSIAN = 'ru'
CHINESE = 'zh'
class OpenAddressesFormatter(object): class OpenAddressesFormatter(object):
@@ -142,12 +145,20 @@ class OpenAddressesFormatter(object):
return True return True
return cls.validate_house_number(house_number) return cls.validate_house_number(house_number)
@classmethod
def validate_chinese_house_number(cls, house_number):
    """Return True if a Chinese house number holds only numbers and suffix markers.

    An empty or missing value is rejected. Otherwise the string is
    tokenized and every (token, token type) pair must either have a type
    listed in ``token_types.NUMERIC_TOKEN_TYPES`` or be one of the
    accepted suffix characters.
    """
    if not house_number:
        return False

    # NOTE(review): the three marker literals were empty strings here, so no
    # non-empty token could ever match. 号 and 栋 are the suffixes that
    # chinese_annex_regex recognizes; 附 (annex) is inferred from the change
    # description -- confirm this set against the intended data.
    suffix_markers = (u'号', u'栋', u'附')
    numeric_types = token_types.NUMERIC_TOKEN_TYPES

    return all(
        token_type in numeric_types or token in suffix_markers
        for token, token_type in tokenize(house_number)
    )
component_validators = { component_validators = {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number, AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
AddressFormatter.ROAD: validators.validate_street, AddressFormatter.ROAD: validators.validate_street,
AddressFormatter.POSTCODE: validators.validate_postcode, AddressFormatter.POSTCODE: validators.validate_postcode,
} }
language_validators = { language_validators = {
SPANISH: { SPANISH: {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero, AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
@@ -157,9 +168,20 @@ class OpenAddressesFormatter(object):
}, },
RUSSIAN: { RUSSIAN: {
AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number, AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number,
},
CHINESE: {
AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number,
} }
} }
# Matches a run of digits that is not already followed by a house-number
# suffix (号 or 栋). The lookahead also rejects a following digit: without
# that, the greedy digit run can backtrack to a shorter prefix whose next
# character is a digit, and u'12号' would be rewritten to u'1号2号'.
chinese_annex_regex = re.compile(u'([\\d]+)(?![\\d号栋])', re.U)

@classmethod
def format_chinese_house_number(cls, house_number):
    """Append 号 to every number in the house number that lacks a suffix.

    Numbers already followed by 号 or 栋 are left untouched. A value with
    several numbers (e.g. a main number plus an annex number) gets the
    suffix added after each bare number. Empty or None input is returned
    unchanged.
    """
    if not house_number:
        return house_number
    return cls.chinese_annex_regex.sub(u'\\1号', house_number)
def get_property(self, key, *configs): def get_property(self, key, *configs):
for config in configs: for config in configs:
value = config.get(key, None) value = config.get(key, None)
@@ -296,9 +318,6 @@ class OpenAddressesFormatter(object):
if key in mapped_values: if key in mapped_values:
value = mapped_values[key].get(value, value) value = mapped_values[key].get(value, value)
if key == AddressFormatter.ROAD and language == SPANISH:
value = self.components.spanish_street_name(value)
if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE: if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
if add_osm_boundaries: if add_osm_boundaries:
continue continue
@@ -355,6 +374,9 @@ class OpenAddressesFormatter(object):
street = street.strip() street = street.strip()
street = AddressComponents.cleaned_name(street) street = AddressComponents.cleaned_name(street)
if street and language == SPANISH:
street = self.components.spanish_street_name(street)
if language == UNKNOWN_LANGUAGE: if language == UNKNOWN_LANGUAGE:
strip_unit_language = candidate_languages[0][0] if candidate_languages else None strip_unit_language = candidate_languages[0][0] if candidate_languages else None
else: else:
@@ -370,6 +392,10 @@ class OpenAddressesFormatter(object):
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None) house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
if house_number: if house_number:
house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas) house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)
if language == CHINESE:
house_number = self.format_chinese_house_number(house_number)
if house_number is not None: if house_number is not None:
components[AddressFormatter.HOUSE_NUMBER] = house_number components[AddressFormatter.HOUSE_NUMBER] = house_number