[languages] Function to sample a random language from a discrete distribution (e.g. languages on the Internet, languages in a country, etc.)

2015-10-03 13:20:19 -04:00
parent 0aa6950b6c
commit 7eb18f3538
1 changed files with 53 additions and 0 deletions
--- a/scripts/geodata/language_id/sample.py
+++ b/scripts/geodata/language_id/sample.py
@@ -0,0 +1,53 @@
+import random
+import bisect
+
+from collections import OrderedDict
+
+# Share of website content per language for the top languages on the
+# Internet, listed from most to least common.
+#
+# NOTE: this is not a probability distribution -- the shares don't sum
+# to 1, and a website can be in more than one language. Reference:
+#
+# https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
+INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict((
+    ('en', 0.555), ('ru', 0.059), ('de', 0.058),
+    ('ja', 0.05), ('es', 0.046), ('fr', 0.04),
+    ('zh', 0.028), ('pt', 0.025), ('it', 0.019),
+    ('pl', 0.017), ('tr', 0.015), ('nl', 0.013),
+    ('fa', 0.009), ('ar', 0.008), ('ko', 0.007),
+))
+
+
+def cdf(probs):
+    """Build a normalized cumulative distribution from a series of weights.
+
+    The weights do not need to sum to 1; each running total is divided
+    by the overall total.
+
+    Args:
+        probs: iterable of numeric weights. It is traversed exactly once,
+            so one-shot iterators and generators are accepted.
+
+    Returns:
+        A list of floats with one entry per weight, where entry i is
+        sum(weights[0..i]) / sum(weights). Empty input gives [].
+
+    Raises:
+        ZeroDivisionError: if the weights are non-empty but sum to zero.
+    """
+    running_totals = []
+    cumulative = 0.0
+    for w in probs:
+        cumulative += w
+        running_totals.append(cumulative)
+
+    if not running_totals:
+        return []
+
+    # Normalize by the final running total rather than a separate
+    # sum(probs): the last entry is then exactly 1.0 (x / x), whereas
+    # sum() may use a different float summation algorithm and leave the
+    # last entry a few ulps away from 1.0.
+    total = float(cumulative)
+    return [partial / total for partial in running_totals]
+
+
+# Parallel sequences: MOST_COMMON_INTERNET_LANGUAGES[i] is the language
+# whose cumulative share is INTERNET_LANGUAGES_CDF[i].
+# The keys are materialized as a list because on Python 3 ``keys()``
+# returns a view that does not support indexing, and
+# sample_random_language indexes into its ``keys`` argument.
+MOST_COMMON_INTERNET_LANGUAGES = list(INTERNET_LANGUAGE_DISTRIBUTION.keys())
+INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())
+
+
+def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
+                           cdf=INTERNET_LANGUAGES_CDF):
+    """Draw one key at random according to a cumulative distribution.
+
+    Args:
+        keys: the items to sample from; keys[i] is paired with cdf[i].
+            Non-indexable collections (e.g. Python 3 dict key views)
+            are copied into a list first.
+        cdf: sorted cumulative probabilities, one per key (for example
+            the output of the module-level cdf() function).
+
+    Returns:
+        One element of keys.
+
+    Raises:
+        AssertionError: if keys and cdf differ in length (only when
+            assertions are enabled).
+        IndexError: if keys is empty.
+    """
+    assert len(keys) == len(cdf)
+
+    # Python 3 dict views (OrderedDict.keys()) support len() but not
+    # indexing, so turn them into a real sequence before the lookup.
+    if not hasattr(keys, '__getitem__'):
+        keys = list(keys)
+
+    sample = random.random()
+    # bisect returns len(cdf) when sample >= cdf[-1], which happens if
+    # the last CDF entry is below 1.0 (rounding, or a caller-supplied
+    # CDF); clamp so such samples map to the last key instead of
+    # raising IndexError.
+    idx = min(bisect.bisect(cdf, sample), len(keys) - 1)
+    return keys[idx]