# Source viewer metadata: 216 lines, 8.5 KiB, Python
import copy
|
|
import operator
|
|
import os
|
|
import random
|
|
import six
|
|
import yaml
|
|
|
|
from collections import defaultdict
|
|
|
|
from geodata.addresses.dependencies import ComponentDependencies
|
|
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
|
from geodata.address_formatting.formatter import AddressFormatter
|
|
from geodata.configs.utils import nested_get, recursive_merge
|
|
from geodata.math.sampling import cdf, weighted_choice
|
|
|
|
from geodata.encoding import safe_encode
|
|
|
|
# Directory containing this module, with symlinks in the directory path resolved.
this_dir = os.path.realpath(os.path.dirname(__file__))

# Path segments leading from this module's directory to the global places
# config: three directory levels up, then resources/places/countries.
_place_config_parts = [
    os.pardir, os.pardir, os.pardir,
    'resources', 'places', 'countries', 'global.yaml',
]

# Default YAML file read by PlaceConfig when no config_file is given.
PLACE_CONFIG_FILE = os.path.join(this_dir, *_place_config_parts)
|
|
|
|
|
|
class PlaceConfig(object):
    """Config-driven random inclusion/dropout of administrative place components.

    The configuration is read from a YAML file with a top-level ``global``
    section and an optional ``countries`` mapping. Each country section is
    merged with a copy of the global section, and the result is consulted to
    decide which admin components (suburb, city, state, ...) of an address
    to keep, drop, or add.
    """

    # Address components treated as administrative/place components.
    ADMIN_COMPONENTS = {
        AddressFormatter.SUBURB,
        AddressFormatter.CITY_DISTRICT,
        AddressFormatter.CITY,
        AddressFormatter.ISLAND,
        AddressFormatter.STATE_DISTRICT,
        AddressFormatter.STATE,
        AddressFormatter.COUNTRY_REGION,
        AddressFormatter.COUNTRY,
        AddressFormatter.WORLD_REGION,
    }

    # Comparison keys recognised inside a population exception entry.
    numeric_ops = {
        'lte': operator.le,
        'gt': operator.gt,
        'lt': operator.lt,
        'gte': operator.ge,
    }

    def __init__(self, config_file=PLACE_CONFIG_FILE):
        """Load the place config and build per-country merged configs.

        :param config_file: path to the YAML config; it must contain a
            ``global`` key and may contain a ``countries`` mapping.
        :raises KeyError: if the file has no ``global`` section.
        """
        # Not read anywhere in this class; kept so existing attribute access works.
        self.cache = {}

        # safe_load + context manager: yaml.load() without an explicit Loader
        # is rejected by PyYAML >= 6, and the bare open() leaked the handle.
        # NOTE(review): assumes the config only uses standard YAML tags.
        with open(config_file) as f:
            place_config = yaml.safe_load(f)

        self.global_config = place_config['global']
        self.country_configs = {}

        # (country, component) -> (values, cumulative probabilities)
        self.cdf_cache = {}

        # ``or {}`` also covers a ``countries:`` key whose value is null.
        countries = place_config.pop('countries', None) or {}

        for country, country_config in six.iteritems(countries):
            # Merge into a deep copy so the shared global config is not mutated.
            global_config_copy = copy.deepcopy(self.global_config)
            self.country_configs[country] = recursive_merge(global_config_copy, country_config)

        # The ``None`` key holds the global config (no specific country).
        self.country_configs[None] = self.global_config

        self.setup_component_dependencies()

    def setup_component_dependencies(self):
        """Build a ComponentDependencies object for every country config.

        For each config, every component that declares ``dependencies`` maps
        to that value; admin components without an entry map to an empty
        list. Results are stored in ``self.component_dependencies`` keyed by
        country (``None`` for the global config).
        """
        self.component_dependencies = {}

        for country, conf in six.iteritems(self.country_configs):
            graph = {
                k: c['dependencies']
                for k, c in six.iteritems(conf['components'])
                if 'dependencies' in c
            }
            graph.update({c: [] for c in self.ADMIN_COMPONENTS if c not in graph})

            self.component_dependencies[country] = ComponentDependencies(graph)

    def get_property(self, key, country=None, default=None):
        """Look up a (possibly nested) config value.

        :param key: a dotted string (``'a.b.c'``) or a sequence of keys.
        :param country: optional country code; it is lower-cased before the
            lookup. The global config is used when it is falsy, unknown, or
            maps to an empty config.
        :param default: passed through to ``nested_get``.
        :return: the result of ``nested_get`` on the selected config.
        """
        if isinstance(key, six.string_types):
            key = key.split('.')

        config = self.global_config

        if country:
            country_config = self.country_configs.get(country.lower(), {})
            if country_config:
                config = country_config

        return nested_get(config, key, default=default)

    def include_by_population_exceptions(self, population_exceptions, population):
        """Randomly decide inclusion based on population-conditioned rules.

        Each rule is a mapping whose keys found in ``numeric_ops`` are
        comparisons against ``population``; other keys are ignored as
        conditions. A rule applies when it has at least one comparison and
        all of its comparisons hold. For each applicable rule, in order, a
        random draw is compared with the rule's ``probability`` (default
        0.0); the first successful draw returns True.

        :param population_exceptions: iterable of rule mappings (may be empty
            or None).
        :param population: value convertible to int; anything that is not
            counts as 0.
        :return: True if some applicable rule's draw succeeds, else False.
        """
        if not population_exceptions:
            return False

        try:
            population = int(population)
        except (TypeError, ValueError):
            population = 0

        for exc in population_exceptions:
            conditions = [(self.numeric_ops[k], exc[k]) for k in exc if k in self.numeric_ops]

            # A rule without any recognised comparison never applies.
            if not conditions:
                continue

            if not all(op(population, bound) for op, bound in conditions):
                continue

            probability = exc.get('probability', 0.0)
            if random.random() < probability:
                return True

        return False

    def include_component_simple(self, component, containing_ids, country=None):
        """Randomly decide inclusion from the component's configured probability.

        If the component's ``containing`` list has an entry whose
        ``(type, id)`` pair is in ``containing_ids``, the first such entry's
        ``probability`` is used; otherwise the component's own ``probability``
        (default 0.0) is used.

        :param component: component name used as a config key.
        :param containing_ids: collection of ``(type, encoded id)`` pairs.
        :param country: optional country code for the config lookup.
        :return: True when the random draw is below the chosen probability.
        """
        containing = self.get_property(('components', component, 'containing'),
                                       country=country, default=None)

        if containing is not None:
            for c in containing:
                if (c['type'], safe_encode(c['id'])) in containing_ids:
                    return random.random() < c['probability']

        probability = self.get_property(('components', component, 'probability'),
                                        country=country, default=0.0)

        return random.random() < probability

    def include_component(self, component, containing_ids, country=None, population=None, check_population=True, unambiguous_city=False):
        """Randomly decide whether ``component`` should be included.

        Population rules are consulted first unless ``check_population`` is
        False or ``unambiguous_city`` is True; if they say to include, the
        result is True. Otherwise the decision falls through to
        ``include_component_simple``.

        :param population: population value; a falsy value is treated as 0.
        :return: True to include the component, False to drop it.
        """
        if check_population and not unambiguous_city:
            population_exceptions = self.get_property(('components', component, 'population'),
                                                      country=country, default=None)
            if population_exceptions and self.include_by_population_exceptions(
                    population_exceptions, population=population or 0):
                return True

        return self.include_component_simple(component, containing_ids, country=country)

    def drop_invalid_components(self, address_components, country, original_bitset=None):
        """Remove, in place, components that fail the dependency check.

        Components are visited in ``dependency_order``. A component present
        in ``address_components`` is removed when it has an entry in the
        country's dependencies, the bitwise AND of the current component
        bitset with that entry is zero, and ``original_bitset`` is either
        None or has a non-zero AND with that entry.

        NOTE(review): presumably ``deps[c]`` is a bitmask of the components
        ``c`` depends on -- confirm against ComponentDependencies.

        :param address_components: mutable mapping of component -> value;
            nothing happens if it is empty or None.
        :param country: key into ``self.component_dependencies``; the
            ``None`` entry is used when the country is unknown.
        :param original_bitset: optional bitset of the components present
            before dropout.
        :return: None
        """
        if not address_components:
            return

        component_bitset = ComponentDependencies.component_bitset(address_components)

        deps = self.component_dependencies.get(country, self.component_dependencies[None])
        dep_order = deps.dependency_order

        for c in dep_order:
            if c not in address_components:
                continue

            if (c in deps and not component_bitset & deps[c] and
                    (original_bitset is None or original_bitset & deps[c])):
                address_components.pop(c)
                # Toggle c's bit so later checks no longer see it as present
                # (assumes component_bitset had c's bit set -- TODO confirm).
                component_bitset ^= ComponentDependencies.component_bit_values[c]

    def city_replacements(self, country):
        """Return the configured ``city_replacements`` for ``country`` as a set.

        An empty set is returned when nothing is configured, instead of
        failing on ``set(None)``.
        """
        return set(self.get_property(('city_replacements', ), country=country) or ())

    def dropout_components(self, components, boundaries=(), country=None, population=None, unambiguous_city=False):
        """Return a copy of ``components`` with admin components randomly dropped/added.

        Steps:

        1. Each admin component already present is kept or dropped according
           to ``include_component``; components listed in the country's
           ``city_replacements`` are never dropped when no city is present.
        2. Admin components that are absent but have a configured ``value``
           (or weighted ``values``) may be added, again subject to
           ``include_component``.
        3. ``drop_invalid_components`` is applied to the result.

        :param components: dict of component -> value; it is not modified.
        :param boundaries: iterable of mappings with ``type`` and ``id``
            keys; entries missing either are skipped.
        :param country: optional country code for config lookups.
        :param population: population passed on to ``include_component``.
        :param unambiguous_city: passed on to ``include_component``.
        :return: new dict of components after dropout.
        """
        containing_ids = set()

        for boundary in boundaries:
            object_type = boundary.get('type')
            object_id = safe_encode(boundary.get('id', ''))
            if not (object_type and object_id):
                continue
            containing_ids.add((object_type, object_id))

        original_bitset = ComponentDependencies.component_bitset(components)

        # Group the admin components present by their value, to find
        # components sharing the same name.
        names = defaultdict(list)
        admin_components = [c for c in components if c in self.ADMIN_COMPONENTS]
        for c in admin_components:
            names[components[c]].append(c)

        same_name = set()
        for shared in six.itervalues(names):
            if len(shared) > 1:
                same_name.update(shared)

        new_components = components.copy()

        city_replacements = set()
        if AddressFormatter.CITY not in components:
            city_replacements = self.city_replacements(country)

        for component in admin_components:
            include = self.include_component(component, containing_ids, country=country,
                                             population=population,
                                             unambiguous_city=unambiguous_city)

            if not include and component not in city_replacements:
                # Note: this check is for cities that have the same name as their admin
                # areas e.g. Luxembourg, Luxembourg. In cases like this, if we were to drop
                # city, we don't want to include country on its own. This should help the parser
                # default to the city in ambiguous cases where only one component is specified.
                if not (component == AddressFormatter.CITY and component in same_name):
                    new_components.pop(component, None)
                else:
                    value = components[component]
                    for c in names[value]:
                        new_components.pop(c, None)

        for component in self.ADMIN_COMPONENTS:
            value = self.get_property(('components', component, 'value'),
                                      country=country, default=None)

            if not value:
                # No fixed value: sample one from the weighted ``values``
                # list, caching the cumulative distribution per
                # (country, component).
                values, probs = self.cdf_cache.get((country, component), (None, None))
                if values is None:
                    values = self.get_property(('components', component, 'values'),
                                               country=country, default=None)
                    if values is not None:
                        values, probs = zip(*[(v['value'], float(v['probability'])) for v in values])
                        probs = cdf(probs)
                        self.cdf_cache[(country, component)] = (values, probs)

                if values is not None:
                    value = weighted_choice(values, probs)

            # Only add components that were absent from the input.
            if (value is not None and component not in components and
                    self.include_component(component, containing_ids, country=country,
                                           population=population,
                                           unambiguous_city=unambiguous_city)):
                new_components[component] = value

        self.drop_invalid_components(new_components, country, original_bitset=original_bitset)

        return new_components
|
|
|
|
|
|
# Shared module-level instance built with the default config file.
# NOTE: constructing it opens and parses PLACE_CONFIG_FILE at import time.
place_config = PlaceConfig()
|