From 7eb18f3538b5114dd8fe34844baf339baa9dfd32 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sat, 3 Oct 2015 13:20:19 -0400
Subject: [PATCH] [languages] Function to sample a random language from a
 discrete distribution (e.g. languages on the Internet, languages in a
 country, etc.)

---
 scripts/geodata/language_id/sample.py | 53 +++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 scripts/geodata/language_id/sample.py

diff --git a/scripts/geodata/language_id/sample.py b/scripts/geodata/language_id/sample.py
new file mode 100644
index 00000000..2c4d2d3b
--- /dev/null
+++ b/scripts/geodata/language_id/sample.py
@@ -0,0 +1,53 @@
+import random
+import bisect
+
+from collections import OrderedDict
+
+'''
+Top content languages on the Internet. Not a true probability
+distribution: the shares don't sum to 1 and a website can be
+in more than one language. Reference:
+
+https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
+'''
+INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict([  # language code -> share of websites (see reference above)
+    ('en', 0.555),
+    ('ru', 0.059),
+    ('de', 0.058),
+    ('ja', 0.05),
+    ('es', 0.046),
+    ('fr', 0.04),
+    ('zh', 0.028),
+    ('pt', 0.025),
+    ('it', 0.019),
+    ('pl', 0.017),
+    ('tr', 0.015),
+    ('nl', 0.013),
+    ('fa', 0.009),
+    ('ar', 0.008),
+    ('ko', 0.007),
+])  # the listed shares total 0.949, so cdf() normalizes them before sampling
+
+
+def cdf(probs):
+    '''Return the cumulative distribution of probs, normalized to end at 1.0.'''
+    running = []
+    cumulative = 0.0
+    for w in probs:  # single pass, so one-shot iterators/generators work too
+        cumulative += w
+        running.append(cumulative)
+    # Divide by the final running sum so the last entry is exactly 1.0
+    return [partial / cumulative for partial in running]
+
+
+MOST_COMMON_INTERNET_LANGUAGES = list(INTERNET_LANGUAGE_DISTRIBUTION)  # a real list: Python 3 key views can't be indexed
+INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())  # same order as the keys above
+
+
+def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
+                           cdf=INTERNET_LANGUAGES_CDF):
+    '''Return one of keys, drawn at random with the weights encoded in cdf.'''
+    assert len(keys) == len(cdf)
+    # Clamp: if rounding leaves cdf[-1] below 1.0, bisect can return len(cdf)
+    idx = min(bisect.bisect(cdf, random.random()), len(cdf) - 1)
+    return keys[idx]