'''
Sample a random language from a discrete distribution (e.g. languages
on the Internet, languages in a country, etc.).

Provenance: scripts/geodata/language_id/sample.py, introduced by commit
7eb18f3538b5114dd8fe34844baf339baa9dfd32 ("[languages] Function to
sample a random language from a discrete distribution").
'''
import random
import bisect

from collections import OrderedDict

# Top languages on the Interwebs. Not a probability distribution
# as it doesn't sum to 1 and websites can be in more than one
# language. The weights are normalized by cdf() below. Reference:
#
# https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict([
    ('en', 0.555),
    ('ru', 0.059),
    ('de', 0.058),
    ('ja', 0.05),
    ('es', 0.046),
    ('fr', 0.04),
    ('zh', 0.028),
    ('pt', 0.025),
    ('it', 0.019),
    ('pl', 0.017),
    ('tr', 0.015),
    ('nl', 0.013),
    ('fa', 0.009),
    ('ar', 0.008),
    ('ko', 0.007),
])


def cdf(probs):
    '''
    Build a normalized cumulative distribution from non-normalized weights.

    :param probs: iterable of numeric weights; they need not sum to 1.
    :return: list of running totals divided by the overall total, one
             entry per weight. An empty input gives an empty list.
    :raises ZeroDivisionError: if the input is non-empty and sums to zero.
    '''
    # Materialize once: the weights are traversed twice (sum + running
    # total), which would silently produce [] for a one-shot iterator.
    weights = list(probs)
    total = float(sum(weights))

    result = []
    cumulative = 0.0
    for w in weights:
        cumulative += w
        result.append(cumulative / total)
    return result


# list(...) keeps these indexable on Python 3, where keys()/values()
# return views that support neither subscripting nor bisect.
MOST_COMMON_INTERNET_LANGUAGES = list(INTERNET_LANGUAGE_DISTRIBUTION.keys())
INTERNET_LANGUAGES_CDF = cdf(list(INTERNET_LANGUAGE_DISTRIBUTION.values()))


def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
                           cdf=INTERNET_LANGUAGES_CDF):
    '''
    Draw one key at random according to a cumulative distribution.

    :param keys: indexable sequence of candidate values (language codes
                 by default).
    :param cdf: sorted sequence of cumulative probabilities, parallel to
                keys. Note this parameter shadows the module-level cdf()
                function inside this body.
    :return: the element of keys selected by a uniform draw from [0, 1).
    :raises ValueError: if keys and cdf differ in length.
    :raises IndexError: if keys is empty.
    '''
    # Explicit raise rather than assert, which is stripped under -O.
    if len(keys) != len(cdf):
        raise ValueError('keys and cdf must have the same length')

    sample = random.random()
    # bisect returns len(cdf) when the sample lies beyond the last entry
    # (a CDF that stops short of 1.0); clamp so the last key absorbs
    # the remaining probability mass instead of raising IndexError.
    idx = min(bisect.bisect(cdf, sample), len(keys) - 1)
    return keys[idx]