[config] Adding default/alternative probability distribution to config.utils
This commit is contained in:
@@ -7,7 +7,7 @@ import yaml
|
|||||||
from collections import Mapping
|
from collections import Mapping
|
||||||
|
|
||||||
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
||||||
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
|
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge, alternative_probabilities
|
||||||
from geodata.math.sampling import cdf, check_probability_distribution
|
from geodata.math.sampling import cdf, check_probability_distribution
|
||||||
|
|
||||||
|
|
||||||
@@ -77,21 +77,15 @@ class AddressConfig(object):
|
|||||||
if properties is None:
|
if properties is None:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
probs = []
|
alternatives, probs = alternative_probabilities(properties)
|
||||||
alternatives = []
|
|
||||||
|
|
||||||
if 'probability' in properties:
|
forms = []
|
||||||
prob = properties['probability']
|
form_probs = []
|
||||||
props = properties['default']
|
|
||||||
|
for props, prob in zip(alternatives, probs):
|
||||||
phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
|
phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
|
||||||
probs.extend([prob * p for p in phrase_probs])
|
forms.extend([(p, props) for p in phrases])
|
||||||
alternatives.extend([(p, props) for p in phrases])
|
form_probs.extend([prob * p for p in phrase_probs])
|
||||||
elif 'alternatives' not in properties:
|
|
||||||
prob = 1.0
|
|
||||||
props = properties['default']
|
|
||||||
phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
|
|
||||||
probs.extend([prob * p for p in phrase_probs])
|
|
||||||
alternatives.extend([(p, props) for p in phrases])
|
|
||||||
|
|
||||||
sample_probability = properties.get('sample_probability')
|
sample_probability = properties.get('sample_probability')
|
||||||
if sample_probability is not None:
|
if sample_probability is not None:
|
||||||
@@ -102,25 +96,17 @@ class AddressConfig(object):
|
|||||||
sample_phrases.append(canonical)
|
sample_phrases.append(canonical)
|
||||||
sample_phrases.extend(surface_forms)
|
sample_phrases.extend(surface_forms)
|
||||||
# Note: use the outer properties dictionary e.g. units.alphanumeric
|
# Note: use the outer properties dictionary e.g. units.alphanumeric
|
||||||
alternatives.extend([(p, properties) for p in sample_phrases])
|
forms.extend([(p, properties) for p in sample_phrases])
|
||||||
probs.extend([float(sample_probability) / len(sample_phrases)] * len(sample_phrases))
|
form_probs.extend([float(sample_probability) / len(sample_phrases)] * len(sample_phrases))
|
||||||
|
|
||||||
alts = properties.get('alternatives', [])
|
|
||||||
for alt in alts:
|
|
||||||
prob = alt.get('probability', 1.0 / len(alts))
|
|
||||||
props = alt['alternative']
|
|
||||||
phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
|
|
||||||
probs.extend([prob * p for p in phrase_probs])
|
|
||||||
alternatives.extend([(p, props) for p in phrases])
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
check_probability_distribution(probs)
|
check_probability_distribution(form_probs)
|
||||||
except AssertionError:
|
except AssertionError:
|
||||||
print 'values were: {}'.format(alternatives)
|
print 'values were: {}'.format(forms)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
probs_cdf = cdf(probs)
|
form_probs_cdf = cdf(form_probs)
|
||||||
self.cache[key] = (alternatives, probs_cdf)
|
self.cache[key] = (forms, form_probs_cdf)
|
||||||
return self.cache[key]
|
return self.cache[key]
|
||||||
|
|
||||||
def form_probabilities(self, properties, language, dictionaries=()):
|
def form_probabilities(self, properties, language, dictionaries=()):
|
||||||
|
|||||||
@@ -17,15 +17,43 @@ class DoesNotExist:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def nested_get(obj, keys, default=DoesNotExist):
    """Walk a chain of keys through nested dict-like objects.

    :param obj: the outermost dict-like object.
    :param keys: sequence of keys to follow, outermost first. An empty
        sequence returns ``obj`` itself.
    :param default: value returned when the path cannot be followed
        (defaults to the ``DoesNotExist`` sentinel).
    :return: the value stored under the final key, or ``default`` if an
        intermediate value is not dict-like or the final key is absent.
    """
    if not len(keys):
        return obj

    current = obj
    try:
        for name in keys[:-1]:
            current = current.get(name, {})
            # Anything without ``items`` cannot be descended into further.
            if not hasattr(current, 'items'):
                return default
        return current.get(keys[-1], default)
    except AttributeError:
        # ``current`` had no ``get`` method, i.e. it was not dict-like.
        return default
|
||||||
|
|
||||||
|
|
||||||
|
def alternative_probabilities(properties):
    """Collect a config node's default/alternative entries and their probabilities.

    The node is read as follows:

    * If it has a ``'probability'`` key, its ``'default'`` entry is included
      with that probability.
    * Otherwise, if it has no ``'alternatives'`` key, its ``'default'`` entry
      is included with probability 1.0.
    * Every item of ``'alternatives'`` (if present) contributes its
      ``'alternative'`` entry, using the item's ``'probability'`` or, when
      that is missing, a uniform ``1.0 / len(alternatives)``.

    :param properties: dict-like config node, or ``None``.
    :return: ``(alternatives, probs)`` as two parallel lists, or
        ``(None, None)`` when ``properties`` is ``None``.
    :raises KeyError: if a required ``'default'`` or ``'alternative'`` key
        is missing.
    """
    if properties is None:
        # Return a pair (not a bare None) so callers that unpack
        # ``alternatives, probs = alternative_probabilities(...)`` keep working.
        return None, None

    alternatives = []
    probs = []

    if 'probability' in properties:
        prob = properties['probability']
        props = properties['default']
        probs.append(prob)
        alternatives.append(props)
    elif 'alternatives' not in properties:
        # A lone default with no explicit probability is always chosen.
        props = properties['default']
        probs.append(1.0)
        alternatives.append(props)

    alts = properties.get('alternatives', [])
    for alt in alts:
        prob = alt.get('probability', 1.0 / len(alts))
        props = alt['alternative']
        probs.append(prob)
        alternatives.append(props)

    return alternatives, probs
|
||||||
|
|||||||
Reference in New Issue
Block a user