diff --git a/scripts/geodata/addresses/sampling.py b/scripts/geodata/addresses/sampling.py new file mode 100644 index 00000000..a0278464 --- /dev/null +++ b/scripts/geodata/addresses/sampling.py @@ -0,0 +1,46 @@ +import bisect +import random +import sys + +FLOAT_EPSILON = 1e-09 + + +def weighted_choice(values, cdf): + """Pick one of n values given a discrete cumulative distribution""" + assert values and cdf, 'values and probabilities cannot be empty/None' + assert len(values) == len(cdf), 'len(values) != len(probs)' + assert all(p >= 0.0 and p <= (1.0 + FLOAT_EPSILON) for p in cdf), 'Probabilities not valid: {}'.format(cdf) + + x = random.random() + i = bisect.bisect(cdf, x) + return values[i] + + +def isclose(a, b, rel_tol=FLOAT_EPSILON, abs_tol=0.0): + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + + +def check_probability_distribution(probs): + cumulative = 0.0 + for p in probs: + assert p >= 0.0, 'Probabilities cannot be negative' + assert p <= 1.0, 'Probabilities cannot be > 1.0' + cumulative += p + assert isclose(cumulative, 1.0), 'Probabilities must sum to 1: probs={}, cumulative={}'.format(probs, cumulative) + + +def cdf(probs): + total = 0.0 + cumulative = [0.0] * len(probs) + for i, p in enumerate(probs): + total += p + cumulative[i] = total + + return cumulative + + +def zipfian_distribution(n, b=1.0): + """Distribution where the ith item's frequency is proportional to its rank""" + frequencies = [1. / (i ** b) for i in xrange(1, n + 1)] + total = sum(frequencies) + return [f / total for f in frequencies]