[languages] Function to sample a random language from a discrete distribution (e.g. languages on the Internet, languages in a country, etc.)
This commit is contained in:
53
scripts/geodata/language_id/sample.py
Normal file
53
scripts/geodata/language_id/sample.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import random
|
||||
import bisect
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
# Top content languages of websites on the Internet, keyed by language
# code and listed from the largest share to the smallest.
#
# These shares are weights, not a probability distribution: they do not
# sum to 1, and a single website can be in more than one language.
#
# Reference:
# https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict((
    ('en', 0.555),
    ('ru', 0.059),
    ('de', 0.058),
    ('ja', 0.05),
    ('es', 0.046),
    ('fr', 0.04),
    ('zh', 0.028),
    ('pt', 0.025),
    ('it', 0.019),
    ('pl', 0.017),
    ('tr', 0.015),
    ('nl', 0.013),
    ('fa', 0.009),
    ('ar', 0.008),
    ('ko', 0.007),
))
|
||||
|
||||
|
||||
def cdf(probs):
    """Return the normalized cumulative distribution for a set of weights.

    probs -- iterable of non-negative weights.  They do not need to sum
             to 1; each running total is divided by the overall total.
             Any iterable is accepted, including one-shot iterators.

    Returns a list with one entry per weight, where entry i is the sum
    of the weights up to and including position i divided by the sum of
    all weights.  An empty input produces an empty list.

    Raises ZeroDivisionError if the weights are non-empty and sum to zero.
    """
    # Materialize once: the weights are traversed twice (total, then the
    # running sum), which would exhaust a generator on the first pass.
    weights = list(probs)
    total = float(sum(weights))

    result = []
    cumulative = 0.0
    for w in weights:
        cumulative += w
        result.append(cumulative / total)

    # The running sum and sum() may round differently, which can leave
    # the last entry just below 1.0.  A sample from [0, 1) could then
    # land past the end of the table, so pin the final value exactly.
    if result:
        result[-1] = 1.0

    return result
|
||||
|
||||
|
||||
# Language codes and their cumulative distribution, in matching order:
# position i of one list corresponds to position i of the other.
#
# The keys are copied into a list because sample_random_language indexes
# into them; on Python 3 dict.keys() returns a view that cannot be indexed.
MOST_COMMON_INTERNET_LANGUAGES = list(INTERNET_LANGUAGE_DISTRIBUTION.keys())
INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())
|
||||
|
||||
|
||||
def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
                           cdf=INTERNET_LANGUAGES_CDF):
    """Draw one key at random according to a cumulative distribution.

    keys -- indexable sequence of items to choose from (by default the
            most common internet language codes).
    cdf  -- ascending sequence of cumulative probabilities, one per key.
            Inside this function the name refers to this argument, not
            to the module-level cdf() function.

    Returns the element of keys whose cumulative interval contains a
    uniform random sample from [0, 1).

    Raises ValueError if keys and cdf differ in length, and IndexError
    if keys is empty.
    """
    # Checked explicitly rather than with assert, which is skipped when
    # Python runs with -O.
    if len(keys) != len(cdf):
        raise ValueError('keys and cdf must have the same length')

    sample = random.random()

    # bisect returns the position of the first entry greater than the
    # sample.  If the final entry is slightly below 1.0 that position can
    # be len(cdf), so clamp it to the last valid index.
    idx = min(bisect.bisect(cdf, sample), len(keys) - 1)
    return keys[idx]
|
||||
Reference in New Issue
Block a user