Files
libpostal/scripts/geodata/language_id/sample.py

54 lines
1.2 KiB
Python

import random
import bisect
from collections import OrderedDict
'''
Top languages on the Interwebs. Not a probability distribution
as it doesn't sum to 1 and websites can be in more than one
language. Reference:
https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
'''
# Approximate share of websites whose content is in each language, keyed by
# two-letter language code. Entries run from the largest share to the
# smallest; OrderedDict preserves that order for keys() and values(), which
# the sampling code relies on to keep languages and weights aligned.
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict((
    ('en', 0.555),  # English
    ('ru', 0.059),  # Russian
    ('de', 0.058),  # German
    ('ja', 0.05),   # Japanese
    ('es', 0.046),  # Spanish
    ('fr', 0.04),   # French
    ('zh', 0.028),  # Chinese
    ('pt', 0.025),  # Portuguese
    ('it', 0.019),  # Italian
    ('pl', 0.017),  # Polish
    ('tr', 0.015),  # Turkish
    ('nl', 0.013),  # Dutch
    ('fa', 0.009),  # Persian
    ('ar', 0.008),  # Arabic
    ('ko', 0.007),  # Korean
))
def cdf(probs):
    """Return the normalized cumulative distribution of a series of weights.

    The weights do not have to sum to 1; every running total is divided by
    the overall sum, so the final element of the result is 1.0.

    :param probs: iterable of non-negative numeric weights. Any iterable is
        accepted, including one-shot iterators and generators.
    :return: list of floats with one entry per weight, where entry ``i`` is
        ``sum(weights[:i + 1]) / sum(weights)``. Empty input gives ``[]``.
    :raises ZeroDivisionError: if the input is non-empty and sums to zero.
    """
    # The weights are traversed twice (once for the total, once for the
    # running sum), so materialize them first. Without this a generator is
    # exhausted by sum() and the loop below silently produces nothing.
    weights = list(probs)
    total = float(sum(weights))
    result = []
    cumulative = 0.0
    for w in weights:
        cumulative += w
        result.append(cumulative / total)
    return result
# Language codes in distribution order. Materialized as a list because on
# Python 3 ``dict.keys()`` returns a view that does not support indexing,
# and sample_random_language picks its result with ``keys[idx]``.
MOST_COMMON_INTERNET_LANGUAGES = list(INTERNET_LANGUAGE_DISTRIBUTION.keys())
# Normalized cumulative weights, position-aligned with the list above
# (keys() and values() of the same OrderedDict iterate in the same order).
INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())
def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
                           cdf=INTERNET_LANGUAGES_CDF):
    """Pick one key at random according to a cumulative distribution.

    A uniform sample from ``random.random()`` (in ``[0.0, 1.0)``) is located
    in ``cdf`` by binary search, so ``keys[i]`` is returned when
    ``cdf[i - 1] <= sample < cdf[i]``.

    :param keys: the candidate values, position-aligned with ``cdf``.
        Defaults to the most common internet language codes.
    :param cdf: sorted, non-decreasing sequence of cumulative probabilities
        with one entry per key; the last entry is expected to be 1.0.
    :return: the element of ``keys`` selected by the sample.
    :raises AssertionError: if ``keys`` and ``cdf`` differ in length.
    :raises IndexError: if ``keys`` is empty.
    """
    # Dict views (what dict.keys() returns on Python 3) have a length but
    # cannot be indexed; turn them into a list so keys[idx] works.
    if not hasattr(keys, '__getitem__'):
        keys = list(keys)
    assert len(keys) == len(cdf), 'keys and cdf must have the same length'
    sample = random.random()
    # bisect returns len(cdf) when the sample is >= the final cumulative
    # value, which happens if rounding leaves that value slightly below 1.0.
    # Clamp so the leftover probability mass goes to the last key instead of
    # raising IndexError.
    idx = min(bisect.bisect(cdf, sample), len(keys) - 1)
    return keys[idx]