# libpostal/scripts/geodata/language_id/sample.py

import random
import bisect

from collections import OrderedDict

# Share of websites using each of the most common content languages on the
# Internet, keyed by two-letter language code and listed from most to least
# common. These are relative weights rather than a probability distribution:
# they do not sum to 1, and a single website can be in more than one
# language. Reference:
#
# https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict((
    ('en', 0.555),  # English
    ('ru', 0.059),  # Russian
    ('de', 0.058),  # German
    ('ja', 0.05),   # Japanese
    ('es', 0.046),  # Spanish
    ('fr', 0.04),   # French
    ('zh', 0.028),  # Chinese
    ('pt', 0.025),  # Portuguese
    ('it', 0.019),  # Italian
    ('pl', 0.017),  # Polish
    ('tr', 0.015),  # Turkish
    ('nl', 0.013),  # Dutch
    ('fa', 0.009),  # Persian
    ('ar', 0.008),  # Arabic
    ('ko', 0.007),  # Korean
))


def cdf(probs):
    '''
    Turn a series of weights into a normalized cumulative distribution.

    The weights do not have to sum to 1: every running total is divided
    by the final running total. Because the divisor is the last running
    total itself (rather than a separately computed sum), the final entry
    is exactly 1.0 whenever that total is finite and non-zero.

    :param probs: iterable of numeric weights (expected to be
                  non-negative). It is consumed in a single pass, so
                  generators and other one-shot iterators are supported.
    :return: list of floats, one per weight, where entry i is the sum of
             the first i + 1 weights divided by the sum of all weights.
             Empty input gives an empty list.
    :raises ZeroDivisionError: if plain Python numbers are given and
                               they sum to zero.
    '''
    running_totals = []
    cumulative = 0.0
    for w in probs:
        cumulative += w
        running_totals.append(cumulative)

    if not running_totals:
        return []

    # Normalize by the last running total so rounding differences between
    # two separate summations can never leave the final value short of 1.0.
    total = running_totals[-1]
    return [partial / total for partial in running_totals]


# Materialized as a list because sample_random_language indexes into it by
# position; on Python 3, dict.keys() returns a view that cannot be indexed.
MOST_COMMON_INTERNET_LANGUAGES = list(INTERNET_LANGUAGE_DISTRIBUTION.keys())
# Cumulative weights, aligned index-for-index with the language list above.
INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())


def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
                           cdf=INTERNET_LANGUAGES_CDF):
    '''
    Pick one element of keys at random, weighted according to cdf.

    A uniform sample from [0, 1) is located in cdf with a binary search
    and the key at the matching position is returned.

    :param keys: indexable sequence of candidate values (language codes
                 by default).
    :param cdf: sorted sequence of cumulative probabilities, the same
                length as keys, with cdf[i] belonging to keys[i].
    :return: the selected element of keys.
    :raises AssertionError: if keys and cdf differ in length (only while
                            assertions are enabled).
    :raises IndexError: if keys and cdf are empty.
    '''
    assert len(keys) == len(cdf)

    sample = random.random()
    # bisect returns len(cdf) when the sample is >= the last entry, which
    # happens if that entry falls short of 1.0 (rounding, or a cdf that
    # is not fully normalized). Clamp so the draw lands on the last key
    # instead of running off the end of the sequence.
    idx = min(bisect.bisect(cdf, sample), len(cdf) - 1)
    return keys[idx]