[addresses] utilities for sampling from an arbitrary discrete distribution, building cumulative distributions, and sampling from a Zipfian distribution, which seems to be a reasonable way of generating plausible apartment/floor numbers when the height/number of units is unknown. Picking a letter uniformly at random means P('Unit A') == P('Unit Z'), when 'A' should be much more likely. Sampling from a Zipfian distribution gets the desired effect in situations where address components are numbered by "counting from 0/1/A", while still allowing for a long tail.

2016-04-14 01:13:39 -04:00
parent 58feeab714
commit fe006e0d62
1 changed files with 46 additions and 0 deletions
--- a/scripts/geodata/addresses/sampling.py
+++ b/scripts/geodata/addresses/sampling.py
@@ -0,0 +1,46 @@
+import bisect
+import random
+import sys
+
+# Tolerance for floating-point round-off. In this module it is the slack
+# allowed above 1.0 when weighted_choice validates a cumulative
+# distribution, and the default relative tolerance of isclose.
+FLOAT_EPSILON = 1e-9
+
+
+def weighted_choice(values, cdf):
+    """Pick one of n values given a discrete cumulative distribution.
+
+    values: non-empty sequence of candidate items.
+    cdf: non-empty sequence of cumulative probabilities, one per item in
+        values. bisect requires it to be sorted in ascending order, and
+        each entry must lie in [0.0, 1.0 + FLOAT_EPSILON]. Note that this
+        parameter shadows the module-level cdf() function inside the body.
+
+    Returns values[i] for the first i whose cumulative probability is
+    strictly greater than a uniform random draw from [0.0, 1.0).
+
+    Raises AssertionError if either argument is empty/None, if the lengths
+    differ, or if any cumulative probability is out of range.
+    """
+    assert values and cdf, 'values and probabilities cannot be empty/None'
+    assert len(values) == len(cdf), 'len(values) != len(probs)'
+    assert all(p >= 0.0 and p <= (1.0 + FLOAT_EPSILON) for p in cdf), 'Probabilities not valid: {}'.format(cdf)
+
+    x = random.random()
+    i = bisect.bisect(cdf, x)
+    # Floating-point accumulation can leave cdf[-1] slightly below 1.0
+    # (e.g. 0.9999999999999999). If x lands at or above that value, bisect
+    # returns len(cdf), which would index past the end of values. Clamp so
+    # the leftover probability mass is assigned to the last item.
+    return values[min(i, len(values) - 1)]
+
+
+def isclose(a, b, rel_tol=FLOAT_EPSILON, abs_tol=0.0):
+    """Return True if a and b are equal within the given tolerances.
+
+    The allowed difference is the larger of abs_tol and rel_tol scaled by
+    the larger magnitude of the two inputs.
+    """
+    difference = abs(a - b)
+    largest_magnitude = max(abs(a), abs(b))
+    allowed = max(rel_tol * largest_magnitude, abs_tol)
+    return difference <= allowed
+
+
+def check_probability_distribution(probs):
+    """Assert that probs is a valid discrete probability distribution.
+
+    Every entry must lie in [0.0, 1.0] and the entries must sum to 1.0,
+    as judged by isclose with its default tolerances. Returns None on
+    success and raises AssertionError with a descriptive message otherwise.
+    """
+    running_sum = 0.0
+    for prob in probs:
+        # Range checks run per element, before it is added to the sum.
+        assert prob >= 0.0, 'Probabilities cannot be negative'
+        assert prob <= 1.0, 'Probabilities cannot be > 1.0'
+        running_sum = running_sum + prob
+    sums_to_one = isclose(running_sum, 1.0)
+    assert sums_to_one, 'Probabilities must sum to 1: probs={}, cumulative={}'.format(probs, running_sum)
+
+
+def cdf(probs):
+    """Build the cumulative distribution for a series of probabilities.
+
+    probs: any iterable of numbers (list, tuple, generator, ...).
+
+    Returns a new list in which element i is the sum of the first i + 1
+    entries of probs, accumulated as floats starting from 0.0. An empty
+    input yields an empty list.
+    """
+    total = 0.0
+    # Append instead of pre-sizing with len(probs) so that generators and
+    # other unsized iterables are accepted as well.
+    cumulative = []
+    for p in probs:
+        total += p
+        cumulative.append(total)
+
+    return cumulative
+
+
+def zipfian_distribution(n, b=1.0):
+    """Distribution where the ith item's frequency is proportional to 1 / i**b.
+
+    n: number of items; ranks run from 1 to n inclusive.
+    b: exponent applied to the rank. Larger values concentrate more
+        probability on the first ranks; b=0 gives a uniform distribution.
+
+    Returns a list of n probabilities, normalized by the sum of the
+    unnormalized frequencies. Returns an empty list when n < 1.
+    """
+    # range (not xrange) keeps this working on both Python 2 and Python 3.
+    frequencies = [1. / (i ** b) for i in range(1, n + 1)]
+    total = sum(frequencies)
+    return [f / total for f in frequencies]