[openaddresses] Format Chinese house numbers (an annex may add a second number) and apply Spanish street-name formatting only after the language is known from reverse geocoding
This commit is contained in:
@@ -24,6 +24,8 @@ from geodata.language_id.disambiguation import UNKNOWN_LANGUAGE, get_string_scri
|
|||||||
from geodata.math.sampling import cdf, weighted_choice
|
from geodata.math.sampling import cdf, weighted_choice
|
||||||
from geodata.openaddresses.config import openaddresses_config
|
from geodata.openaddresses.config import openaddresses_config
|
||||||
from geodata.places.config import place_config
|
from geodata.places.config import place_config
|
||||||
|
from geodata.text.tokenize import tokenize
|
||||||
|
from geodata.text.token_types import token_types
|
||||||
from geodata.text.utils import is_numeric, is_numeric_strict
|
from geodata.text.utils import is_numeric, is_numeric_strict
|
||||||
|
|
||||||
from geodata.csv_utils import tsv_string, unicode_csv_reader
|
from geodata.csv_utils import tsv_string, unicode_csv_reader
|
||||||
@@ -49,6 +51,7 @@ dutch_house_number_regex = re.compile('([\d]+)( [a-z])?( [\d]+)?', re.I)
|
|||||||
SPANISH = 'es'
|
SPANISH = 'es'
|
||||||
PORTUGUESE = 'pt'
|
PORTUGUESE = 'pt'
|
||||||
RUSSIAN = 'ru'
|
RUSSIAN = 'ru'
|
||||||
|
CHINESE = 'zh'
|
||||||
|
|
||||||
|
|
||||||
class OpenAddressesFormatter(object):
|
class OpenAddressesFormatter(object):
|
||||||
@@ -142,12 +145,20 @@ class OpenAddressesFormatter(object):
|
|||||||
return True
|
return True
|
||||||
return cls.validate_house_number(house_number)
|
return cls.validate_house_number(house_number)
|
||||||
|
|
||||||
|
@classmethod
def validate_chinese_house_number(cls, house_number):
    """Check that a Chinese house number contains only expected tokens.

    The value is tokenized and accepted only if every token is either of
    a numeric token type or is exactly one of the characters 号, 栋 or 附
    (presumably the number, building and annex markers).

    :param house_number: raw house number string; may be None or empty.
    :return: False for empty/missing input or when any other token is
        present, True otherwise.
    """
    if not house_number:
        return False

    allowed_markers = (u'号', u'栋', u'附')

    # tokenize() yields (token, token_class) pairs.
    for token, token_class in tokenize(house_number):
        if token_class in token_types.NUMERIC_TOKEN_TYPES:
            continue
        if token not in allowed_markers:
            return False
    return True
|
||||||
|
|
||||||
component_validators = {
|
component_validators = {
|
||||||
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
|
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
|
||||||
AddressFormatter.ROAD: validators.validate_street,
|
AddressFormatter.ROAD: validators.validate_street,
|
||||||
AddressFormatter.POSTCODE: validators.validate_postcode,
|
AddressFormatter.POSTCODE: validators.validate_postcode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
language_validators = {
|
language_validators = {
|
||||||
SPANISH: {
|
SPANISH: {
|
||||||
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
|
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
|
||||||
@@ -157,9 +168,20 @@ class OpenAddressesFormatter(object):
|
|||||||
},
|
},
|
||||||
RUSSIAN: {
|
RUSSIAN: {
|
||||||
AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number,
|
AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number,
|
||||||
|
},
|
||||||
|
CHINESE: {
|
||||||
|
AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Matches a complete run of digits that is not already followed by 号 or 栋.
# The lookahead also excludes digits: without that, the greedy digit run can
# backtrack and match a prefix of a number that *is* followed by a marker
# (e.g. "12" inside "123号"), which would turn "123号" into "12号3号".
chinese_annex_regex = re.compile(u'(\\d+)(?![\\d号栋])', re.U)

@classmethod
def format_chinese_house_number(cls, house_number):
    """Append 号 to every bare number in a Chinese house number.

    Digit runs already followed by 号 or 栋 are left untouched, so the
    call is safe on values that are partly or fully formatted, e.g.
    u'123附4' -> u'123号附4号' while u'123号' stays u'123号'.

    :param house_number: house number string; may be None or empty.
    :return: the formatted string, or the input unchanged when it is
        None or empty.
    """
    if not house_number:
        return house_number
    return cls.chinese_annex_regex.sub(u'\\1号', house_number)
|
||||||
|
|
||||||
def get_property(self, key, *configs):
|
def get_property(self, key, *configs):
|
||||||
for config in configs:
|
for config in configs:
|
||||||
value = config.get(key, None)
|
value = config.get(key, None)
|
||||||
@@ -296,9 +318,6 @@ class OpenAddressesFormatter(object):
|
|||||||
if key in mapped_values:
|
if key in mapped_values:
|
||||||
value = mapped_values[key].get(value, value)
|
value = mapped_values[key].get(value, value)
|
||||||
|
|
||||||
if key == AddressFormatter.ROAD and language == SPANISH:
|
|
||||||
value = self.components.spanish_street_name(value)
|
|
||||||
|
|
||||||
if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
|
if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
|
||||||
if add_osm_boundaries:
|
if add_osm_boundaries:
|
||||||
continue
|
continue
|
||||||
@@ -355,6 +374,9 @@ class OpenAddressesFormatter(object):
|
|||||||
street = street.strip()
|
street = street.strip()
|
||||||
street = AddressComponents.cleaned_name(street)
|
street = AddressComponents.cleaned_name(street)
|
||||||
|
|
||||||
|
if street and language == SPANISH:
|
||||||
|
street = self.components.spanish_street_name(street)
|
||||||
|
|
||||||
if language == UNKNOWN_LANGUAGE:
|
if language == UNKNOWN_LANGUAGE:
|
||||||
strip_unit_language = candidate_languages[0][0] if candidate_languages else None
|
strip_unit_language = candidate_languages[0][0] if candidate_languages else None
|
||||||
else:
|
else:
|
||||||
@@ -370,6 +392,10 @@ class OpenAddressesFormatter(object):
|
|||||||
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
|
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
|
||||||
if house_number:
|
if house_number:
|
||||||
house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)
|
house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)
|
||||||
|
|
||||||
|
if language == CHINESE:
|
||||||
|
house_number = self.format_chinese_house_number(house_number)
|
||||||
|
|
||||||
if house_number is not None:
|
if house_number is not None:
|
||||||
components[AddressFormatter.HOUSE_NUMBER] = house_number
|
components[AddressFormatter.HOUSE_NUMBER] = house_number
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user