import random

import six

from geodata.addresses.config import address_config
from geodata.addresses.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.encoding import safe_decode
from geodata.numbers.ordinals import ordinal_expressions

# Memoized cumulative distributions, keyed by (alphabet tuple, exponent b)
alphabets = {}


def sample_alphabet(alphabet, b=1.5):
    '''
    Sample an "alphabet" using a Zipfian distribution (frequent items are very
    frequent, long tail of infrequent items).

    If we look at something like unit numbers, "Unit A" or "Unit B" are much
    more likely than "Unit X" or "Unit Z" simply because most dwellings only
    have a few units. Sampling letters from a Zipfian distribution rather than
    uniformly means that instead of every letter having the same likelihood
    (1/26), letters toward the beginning of the alphabet are much more likely
    to be selected. Letters toward the end can still be selected sometimes,
    but are not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in
    order of frequency.

    :param alphabet: iterable of hashable items, most frequent first
    :param b: exponent passed to zipfian_distribution
    :return: one item of the alphabet, as chosen by weighted_choice
    '''
    alphabet = tuple(alphabet)

    # The exponent is part of the cache key: the distribution depends on it,
    # so a CDF computed for one value of b must not be reused for another.
    cache_key = (alphabet, b)
    if cache_key not in alphabets:
        alphabets[cache_key] = cdf(zipfian_distribution(len(alphabet), b))

    return weighted_choice(alphabet, alphabets[cache_key])


latin_alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)]


def _join_phrase_and_number(phrase, number, direction, whitespace):
    '''
    Put the phrase to the left or to the right of the number, separated by a
    single space when whitespace is truthy. A direction must be specified;
    for any other value the naked number is returned.
    '''
    separator = six.u(' ') if whitespace else six.u('')

    # Phrase goes to the left of the number
    if direction == 'left':
        return six.u('{}{}{}').format(phrase, separator, number)
    # Phrase goes to the right of the number
    elif direction == 'right':
        return six.u('{}{}{}').format(number, separator, phrase)
    # Need to specify a direction, otherwise return naked number
    return safe_decode(number)


class NumberPhrases(object):
    @classmethod
    def phrase(cls, number, language, country=None):
        '''
        Decorate a number with a phrase from the "numbers" config,
        e.g. "No 1" or "#1".

        :param number: the number (or already-formatted number string)
        :param language: language passed to the address config lookup
        :param country: optional country passed to the address config lookup
        :return: the decorated number, or the decoded number itself when the
                 config offers no alternatives or no usable direction
        '''
        values, probs = address_config.alternative_probabilities(
            'numbers', language, dictionaries=['number'], country=country)
        if not values:
            return safe_decode(number)

        phrase, phrase_props = weighted_choice(values, probs)

        # Collect the number types which have a configured probability
        num_types = []
        num_type_probs = []
        for candidate in ('numeric', 'numeric_affix'):
            prob = phrase_props.get('{}_probability'.format(candidate))
            if prob is not None:
                num_types.append(candidate)
                num_type_probs.append(prob)

        if len(num_types) > 1:
            num_type = weighted_choice(num_types, cdf(num_type_probs))
        elif num_types:
            # Only one type is configured, so use that one rather than
            # assuming it is always 'numeric'
            num_type = num_types[0]
        else:
            num_type = 'numeric'

        props = phrase_props[num_type]

        if num_type == 'numeric_affix':
            phrase = props['affix']
            # Numeric affix default is no whitespace e.g. "#1"
            whitespace_default = False
        else:
            # Numeric phrase the default is with whitespace e.g. "No 1"
            whitespace_default = True

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        # Title case unless the config specifies otherwise
        if props.get('title_case', True):
            phrase = phrase.title()

        return _join_phrase_and_number(phrase, number, direction, whitespace)


class NumberedComponent(object):
    @classmethod
    def numeric_phrase(cls, key, num, language, country=None, dictionaries=()):
        '''
        Build a phrase for a numbered address component, e.g. "Floor 2",
        "2/F" or "2nd Floor".

        :param key: config key passed to the address config lookup
        :param num: the number; values that parse as neither int nor float
                    (e.g. "A" or "2L") are treated as alphanumeric
        :param language: language passed to the address config lookup
        :param country: optional country passed to the address config lookup
        :param dictionaries: dictionaries passed to the address config lookup
        :return: the formatted phrase; the bare phrase when the number is
                 expressed as a standalone word or cannot be rendered; the
                 decoded number when the config offers no alternatives or no
                 usable direction
        '''
        is_alpha = False
        try:
            num = int(num)
        except ValueError:
            try:
                num = float(num)
            except ValueError:
                is_alpha = True

        # Pick a phrase given the probability distribution from the config
        values, probs = address_config.alternative_probabilities(
            key, language, dictionaries=dictionaries, country=country)
        if not values:
            # Nothing configured: return the bare number, as
            # NumberPhrases.phrase does
            return safe_decode(num)

        phrase, phrase_props = weighted_choice(values, probs)

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        num_types = []
        num_type_probs = []
        for candidate in ('standalone', 'numeric', 'numeric_affix', 'ordinal'):
            # Distinct local name so the "key" argument is not shadowed
            prob_key = '{}_probability'.format(candidate)
            prob = phrase_props.get(prob_key)
            if prob is not None:
                num_types.append(candidate)
                num_type_probs.append(prob)

        if not num_type_probs:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove
        # ordinal/affix items
        if is_alpha:
            usable = [(t, p) for t, p in zip(num_types, num_type_probs)
                      if t in ('numeric', 'standalone')]
            if not usable:
                # No configured form can carry an alphanumeric value, so
                # fall back to the bare phrase instead of failing to unpack
                return phrase
            num_types, num_type_probs = zip(*usable)
            total = sum(num_type_probs)
            num_type_probs = [p / total for p in num_type_probs]

        if len(num_types) > 1:
            num_type = weighted_choice(num_types, cdf(num_type_probs))
        else:
            # Only one type is configured, so use that one
            num_type = num_types[0]

        if num_type == 'standalone':
            return phrase

        props = phrase_props[num_type]

        # The numeric adjustments only make sense for real numbers; an
        # alphanumeric value is still a string at this point
        if not is_alpha:
            if phrase_props.get('number_abs_value', False):
                num = abs(num)

            if 'number_min_abs_value' in phrase_props and num < phrase_props['number_min_abs_value']:
                return phrase

            if phrase_props.get('number_subtract_abs_value'):
                num -= phrase_props['number_subtract_abs_value']

        num = safe_decode(num)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = NumberPhrases.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            # Affixes attach directly to the number by default
            whitespace_default = False
        elif num_type == 'ordinal':
            num = ordinal_expressions.suffixed_number(num, language)

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        return _join_phrase_and_number(phrase, num, direction, whitespace)