From 996a38d017e105e2deb816c76327204451aa8151 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 22 Sep 2016 17:45:14 -0400 Subject: [PATCH] [places] adding probability distributions on added place components so can have West Indies, W.I. etc. --- resources/places/countries/global.yaml | 8 +++++++- scripts/geodata/places/config.py | 18 ++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/resources/places/countries/global.yaml b/resources/places/countries/global.yaml index 9b5960ca..974f06ec 100644 --- a/resources/places/countries/global.yaml +++ b/resources/places/countries/global.yaml @@ -32,7 +32,13 @@ add_west_indies: &add_west_indies components: world_region: &add_west_indies_world_region probability: 0.1 - value: West Indies + values: + - value: West Indies + probability: 0.6 + - value: W.I. + probability: 0.2 + - value: WI + probability: 0.2 countries: # Anguilla diff --git a/scripts/geodata/places/config.py b/scripts/geodata/places/config.py index 4ab86d5a..8fba4301 100644 --- a/scripts/geodata/places/config.py +++ b/scripts/geodata/places/config.py @@ -9,8 +9,8 @@ from collections import defaultdict from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries from geodata.address_formatting.formatter import AddressFormatter -from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge -from geodata.math.sampling import cdf, check_probability_distribution +from geodata.configs.utils import nested_get, recursive_merge +from geodata.math.sampling import cdf, weighted_choice from geodata.encoding import safe_encode @@ -46,6 +46,8 @@ class PlaceConfig(object): self.global_config = place_config['global'] self.country_configs = {} + self.cdf_cache = {} + countries = place_config.pop('countries', {}) for k, v in six.iteritems(countries): @@ -144,6 +146,18 @@ class PlaceConfig(object): for component in self.ADMIN_COMPONENTS: value = self.get_property(('components', component, 'value'), country=country, default=None) + if not value: + values, probs = self.cdf_cache.get((country, component), (None, None)) + if values is None: + values = self.get_property(('components', component, 'values'), country=country, default=None) + if values is not None: + values, probs = zip(*[(v['value'], float(v['probability'])) for v in values]) + probs = cdf(probs) + self.cdf_cache[(country, component)] = (values, probs) + + if values is not None: + value = weighted_choice(values, probs) + if value is not None and component not in components and self.include_component(component, containing_ids, country=country, population=population): new_components[component] = value