[addresses] Add generated units, floors, etc. in expanded address components
This commit is contained in:
@@ -8,11 +8,17 @@ from collections import defaultdict
|
||||
|
||||
from geodata.address_formatting.formatter import AddressFormatter
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.entrances import Entrance
|
||||
from geodata.addresses.house_numbers import HouseNumber
|
||||
from geodata.addresses.po_boxes import POBox
|
||||
from geodata.addresses.postcodes import PostCode
|
||||
from geodata.addresses.staircases import Staircase
|
||||
from geodata.addresses.units import Unit
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.coordinates.conversion import latlon_to_decimal
|
||||
from geodata.countries.country_names import *
|
||||
from geodata.countries.names import *
|
||||
from geodata.language_id.disambiguation import *
|
||||
from geodata.language_id.sample import sample_random_language
|
||||
from geodata.math.sampling import cdf, weighted_choice
|
||||
@@ -79,6 +85,17 @@ class AddressExpander(object):
|
||||
'ISO3166-1:alpha2', 'ISO3166-1:alpha3',
|
||||
'short_name', 'alt_name', 'official_name'])
|
||||
|
||||
NULL_PHRASE = 'null'
|
||||
ALPHANUMERIC_PHRASE = 'alphanumeric'
|
||||
STANDALONE_PHRASE = 'standalone'
|
||||
|
||||
sub_building_component_class_map = {
|
||||
AddressFormatter.ENTRANCE: Entrance,
|
||||
AddressFormatter.STAIRCASE: Staircase,
|
||||
AddressFormatter.LEVEL: Floor,
|
||||
AddressFormatter.UNIT: Unit,
|
||||
}
|
||||
|
||||
def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames):
|
||||
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
||||
|
||||
@@ -257,6 +274,49 @@ class AddressExpander(object):
|
||||
self.formatter.aliases.replace(address_components)
|
||||
return address_components
|
||||
|
||||
def generated_type(self, component, existing_components, language, country=None):
|
||||
component_config = address_config.get_property('components.{}'.format(component), language, country=country)
|
||||
if not component_config:
|
||||
return None
|
||||
|
||||
prob_dist = component_config
|
||||
|
||||
if 'conditional' in component_config:
|
||||
for c, vals in six.iteritems(component_config['conditionals']):
|
||||
if c in existing_components:
|
||||
prob_dist = vals['probabilities']
|
||||
break
|
||||
|
||||
for num_type in (cls.NULL_PHRASE, cls.ALPHANUMERIC_PHRASE, cls.STANDALONE_PHRASE):
|
||||
key = '{}_probability'.format(num_type)
|
||||
prob = alphanumeric_props.get(key)
|
||||
if prob is not None:
|
||||
values.append(num_type)
|
||||
probs.append(prob)
|
||||
|
||||
probs = cdf(probs)
|
||||
num_type = weighted_choice(values, probs)
|
||||
|
||||
if num_type == cls.NULL_PHRASE:
|
||||
return None
|
||||
else:
|
||||
return num_type
|
||||
|
||||
def is_numeric(self, component, language, country=None):
|
||||
tokens = tokenize(component)
|
||||
return sum((1 for t, c in tokens if c == token_types.NUMERIC or c not in token_types.WORD_TOKEN_TYPES)) == len(tokens)
|
||||
|
||||
def get_component_phrase(self, cls, component, language, country=None):
|
||||
component = safe_decode(component)
|
||||
if self.is_numeric(component):
|
||||
phrase = cls.phrase(component, language, country=country)
|
||||
if phrase != component:
|
||||
return phrase
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
return component
|
||||
|
||||
def cldr_country_name(self, country_code, language):
|
||||
'''
|
||||
Country names
|
||||
@@ -551,6 +611,41 @@ class AddressExpander(object):
|
||||
if component not in address_components and random.random() < add_neighborhood_prob:
|
||||
address_components[component] = neighborhoods[0]
|
||||
|
||||
def add_sub_building_component(self, component, address_components, language, country, random_kwargs=None, phrase_kwargs=None):
|
||||
existing = address_components.get(component, None)
|
||||
|
||||
component_class = self.sub_building_component_class_map[component]
|
||||
if existing is None:
|
||||
generated_type = self.generated_type(component, address_components, language, country=country)
|
||||
if generated_type == self.ALPHANUMERIC_PHRASE:
|
||||
num = component_class.random(language, country=country, **(random_kwargs or {}))
|
||||
elif generated_type == self.STANDALONE_PHRASE:
|
||||
num = None
|
||||
else:
|
||||
return
|
||||
|
||||
phrase = component_class.phrase(num, language, country=countyr, **(phrase_kwargs or {}))
|
||||
|
||||
if phrase:
|
||||
address_components[component] = phrase
|
||||
else:
|
||||
phrase = self.get_component_phrase(existing, language, country=country)
|
||||
if phrase and phrase != existing:
|
||||
address_components[component] = phrase
|
||||
|
||||
def add_sub_building_components(self, address_components, language, country=None, num_floors=None, num_basements=None, zone=None):
|
||||
self.add_sub_building_component(AddressFormatter.ENTRANCE, address_components, language, country=country)
|
||||
self.add_sub_building_component(AddressFormatter.STAIRCASE, address_components, language, country=country)
|
||||
|
||||
self.add_sub_building_component(AddressFormatter.LEVEL, address_components, language, country=country,
|
||||
random_kwargs=dict(num_floors=num_floors, num_basements=num_basements),
|
||||
phrase_kwargs=dict(num_floors=num_floors))
|
||||
|
||||
self.add_sub_building_component(AddressFormatter.UNIT, address_components, language, country=country,
|
||||
random_kwargs=dict(num_floors=num_floors, num_basements=num_basements),
|
||||
phrase_kwargs=dict(zone=zone))
|
||||
|
||||
|
||||
def replace_name_affixes(self, address_components, language):
|
||||
'''
|
||||
Name normalization
|
||||
@@ -643,7 +738,20 @@ class AddressExpander(object):
|
||||
else:
|
||||
address_components.pop(AddressFormatter.HOUSE_NUMBER, None)
|
||||
|
||||
def expanded_address_components(self, address_components, latitude, longitude):
|
||||
def add_house_number_phrase(self, address_components, language, country=None):
|
||||
house_number = address_components.get(AddressFormatter.HOUSE_NUMBER, None)
|
||||
phrase = HouseNumber.phrase(house_number, language, country=country)
|
||||
if phrase and phrase != house_number:
|
||||
address_components[AddressFormatter.HOUSE_NUMBER] = phrase
|
||||
|
||||
def add_postcode_phrase(self, address_components, language, country=None):
|
||||
postcode = address_components.get(AddressFormatter.POSTCODE, None)
|
||||
if postcode:
|
||||
phrase = PostCode.phrase(postcode, language, country=country)
|
||||
if phrase and phrase != postcode:
|
||||
address_components[AddressFormatter.POSTCODE] = phrase
|
||||
|
||||
def expanded_address_components(self, address_components, latitude, longitude, num_floors=None, num_basements=None, zone=None):
|
||||
try:
|
||||
latitude, longitude = latlon_to_decimal(latitude, longitude)
|
||||
except Exception:
|
||||
@@ -697,6 +805,11 @@ class AddressExpander(object):
|
||||
self.prune_duplicate_names(address_components)
|
||||
|
||||
self.cleanup_house_number(address_components)
|
||||
self.add_house_number_phrase(address_components, language, country=country)
|
||||
self.add_postcode_phrase(address_components, language, country=country)
|
||||
|
||||
self.add_sub_building_components(address_components, language, country=country,
|
||||
num_floors=num_floors, num_basements=num_basements, zone=zone)
|
||||
|
||||
return address_components, country, language
|
||||
|
||||
|
||||
Reference in New Issue
Block a user