54 lines
1.2 KiB
Python
54 lines
1.2 KiB
Python
import random
|
|
import bisect
|
|
|
|
from collections import OrderedDict
|
|
|
|
'''
|
|
Top languages on the Interwebs. Not a probability distribution
|
|
as it doesn't sum to 1 and websites can be in more than one
|
|
language. Reference:
|
|
|
|
https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
|
|
'''
|
|
# Maps two-letter language codes to their weight. Entries are listed from
# the largest weight to the smallest, and the OrderedDict keeps that order
# for anything that iterates over the keys or values.
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict((
    ("en", 0.555),
    ("ru", 0.059),
    ("de", 0.058),
    ("ja", 0.05),
    ("es", 0.046),
    ("fr", 0.04),
    ("zh", 0.028),
    ("pt", 0.025),
    ("it", 0.019),
    ("pl", 0.017),
    ("tr", 0.015),
    ("nl", 0.013),
    ("fa", 0.009),
    ("ar", 0.008),
    ("ko", 0.007),
))
|
|
|
|
|
|
def cdf(probs):
    """Return the normalised cumulative distribution of a series of weights.

    The weights do not need to sum to 1; every running total is divided by
    the overall total, so the last entry of the result is exactly 1.0.

    Args:
        probs: iterable of numeric weights. It is consumed in a single pass,
            so generators and other one-shot iterators are accepted.

    Returns:
        A list of floats, one per weight, holding the cumulative fraction of
        the total weight up to and including that position. An empty input
        gives an empty list.

    Raises:
        ZeroDivisionError: if the weights are non-empty but sum to zero.
    """
    running_totals = []
    cumulative = 0.0
    for w in probs:
        cumulative += w
        running_totals.append(cumulative)

    if not running_totals:
        return []

    # Normalise by the final running total rather than a separate sum():
    # a second pass would find a one-shot iterator already exhausted, and
    # sum() may round differently from this loop, which could leave the
    # last entry marginally off 1.0.
    total = running_totals[-1]
    return [partial / total for partial in running_totals]
|
|
|
|
|
|
# Language codes in distribution order. Materialised as a list because
# sample_random_language indexes into it by position, and on Python 3
# dict.keys() returns a view that does not support indexing.
MOST_COMMON_INTERNET_LANGUAGES = list(INTERNET_LANGUAGE_DISTRIBUTION.keys())
# Cumulative distribution over the same languages, in the same order, so
# position i here corresponds to MOST_COMMON_INTERNET_LANGUAGES[i].
INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())
|
|
|
|
|
|
def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
                           cdf=INTERNET_LANGUAGES_CDF):
    """Draw one entry of ``keys`` at random, weighted according to ``cdf``.

    Args:
        keys: the candidate values, in the same order as ``cdf``. Any
            iterable is accepted; one that cannot be indexed by position
            (such as a dict keys view) is copied into a list first.
        cdf: sorted sequence of cumulative probabilities, one per key.

    Returns:
        The element of ``keys`` whose cumulative-probability bucket contains
        a uniform random draw from [0, 1).

    Raises:
        AssertionError: if ``keys`` and ``cdf`` differ in length.
        IndexError: if ``keys`` is empty.
    """
    # A dict keys view (what dict.keys() returns on Python 3) cannot be
    # indexed, so turn anything without __getitem__ into a list.
    if not hasattr(keys, '__getitem__'):
        keys = list(keys)

    assert len(keys) == len(cdf)

    sample = random.random()
    # bisect returns len(cdf) when the draw is >= the last cumulative value
    # (possible if the supplied cdf stops short of 1.0); clamp so that case
    # resolves to the final key instead of running off the end.
    idx = min(bisect.bisect(cdf, sample), len(keys) - 1)
    return keys[idx]
|