'''
Base class for numbered components (floors, units, house numbers in some
languages/countries). Can generate many variants of a number
(e.g. Floor 2, 2nd Floor, Floor #2, Floor No. 2, etc.)
'''
import random
import six

from geodata.addresses.config import address_config
from geodata.addresses.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.encoding import safe_decode
from geodata.numbers.ordinals import ordinal_expressions


# Cache of cumulative distributions, keyed by (alphabet tuple, exponent b)
alphabets = {}


def sample_alphabet(alphabet, b=1.5):
    '''
    Sample an "alphabet" using a Zipfian distribution (frequent items are very
    frequent, long tail of infrequent items). If we look at something like
    unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
    "Unit Z" simply because most dwellings only have a few units. Sampling
    letters from a Zipfian distribution rather than uniformly means that instead
    of every letter having the same likelihood (1/26), letters toward the beginning
    of the alphabet are much more likely to be selected. Letters toward the end can
    still be selected sometimes, but are not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in order
    of frequency.

    :param alphabet: iterable of items, most frequent first
    :param b: parameter passed to zipfian_distribution
    :return: one item of the alphabet
    '''
    alphabet = tuple(alphabet)
    # The distribution depends on b as well as on the alphabet, so both must
    # be part of the cache key; otherwise the first b used for an alphabet
    # would silently win for every later call.
    cache_key = (alphabet, b)
    if cache_key not in alphabets:
        probs = zipfian_distribution(len(alphabet), b)
        alphabets[cache_key] = cdf(probs)

    return weighted_choice(alphabet, alphabets[cache_key])


latin_alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)]


def _join_phrase(phrase, number, direction, whitespace):
    '''
    Combine a phrase and a number according to the configured direction.

    :param phrase: the word/affix to attach
    :param number: the number (or already-decorated number string)
    :param direction: 'left' puts the phrase before the number, 'right' after
    :param whitespace: truthy to separate the two parts with a single space
    :return: the combined string, or the naked number if the direction is
             neither 'left' nor 'right'
    '''
    whitespace_phrase = six.u(' ') if whitespace else six.u('')
    # Phrase goes to the left of the number
    if direction == 'left':
        return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
    # Phrase goes to the right of the number
    elif direction == 'right':
        return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
    # Need to specify a direction, otherwise return naked number
    else:
        return safe_decode(number)


class NumberPhrases(object):
    @classmethod
    def phrase(cls, number, language, country=None):
        '''
        Decorate a number with a phrase from the 'numbers' section of the
        address config, e.g. "No 1" or "#1".

        :param number: the number to decorate
        :param language: language used to look up the config
        :param country: optional country passed through to the config lookup
        :return: the decorated number, or the number alone when the config
                 offers no phrases or no usable direction
        '''
        values, probs = address_config.alternative_probabilities('numbers', language, dictionaries=['number'], country=country)
        if not values:
            return safe_decode(number)

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        for num_type in ('numeric', 'numeric_affix'):
            prob = phrase_props.get('{}_probability'.format(num_type), None)
            if prob is not None:
                values.append(num_type)
                probs.append(prob)

        if len(values) < 2:
            num_type = 'numeric'
        else:
            num_type = weighted_choice(values, cdf(probs))

        props = phrase_props[num_type]

        if num_type == 'numeric_affix':
            phrase = props['affix']
            # Numeric affix default is no whitespace e.g. "#1"
            whitespace_default = False
        else:
            # Numeric phrase the default is with whitespace e.g. "No 1"
            whitespace_default = True

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)
        if props.get('title_case', True):
            # Title case unless the config specifies otherwise
            phrase = phrase.title()

        return _join_phrase(phrase, number, direction, whitespace)


class NumberedComponent(object):
    @classmethod
    def numeric_phrase(cls, key, num, language, country=None, dictionaries=()):
        '''
        Build a phrase for a numbered component such as "Floor 2", "2nd Floor"
        or "2/F".

        :param key: config key identifying the component
        :param num: the number; anything that parses as neither int nor float
                    is treated as alphanumeric (e.g. "A" or "2L")
        :param language: language used to look up the config
        :param country: optional country passed through to the config lookup
        :param dictionaries: dictionaries passed through to the config lookup
        :return: the generated phrase; the bare phrase when it stands alone or
                 no number type is usable; the bare number when the config has
                 no phrases or no usable direction
        '''
        is_alpha = False
        try:
            num = int(num)
        except ValueError:
            try:
                num = float(num)
            except ValueError:
                is_alpha = True

        # Pick a phrase given the probability distribution from the config
        values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)
        if not values:
            # Nothing configured: same fallback as NumberPhrases.phrase
            return safe_decode(num)

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        have_standalone = False
        for num_type in ('standalone', 'numeric', 'numeric_affix', 'ordinal'):
            # Separate name so the "key" argument is not shadowed
            prob_key = '{}_probability'.format(num_type)
            prob = phrase_props.get(prob_key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                values.append(num_type)
                probs.append(prob)

        if not probs:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if is_alpha:
            eligible = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'standalone')]
            if not eligible:
                # Only ordinal/affix forms are configured and none of them can
                # carry an alphanumeric value, so fall back to the bare phrase
                return phrase
            values, probs = zip(*eligible)
            total = float(sum(probs))
            # Renormalize only when possible; an all-zero total is left as is
            if total > 0:
                probs = [p / total for p in probs]

        if len(values) < 2:
            num_type = 'standalone' if have_standalone else 'numeric'
        else:
            num_type = weighted_choice(values, cdf(probs))

        if num_type == 'standalone':
            return phrase

        props = phrase_props[num_type]

        # Arithmetic adjustments only make sense for real numbers; for
        # alphanumeric values num is still the original string here
        if not is_alpha:
            if phrase_props.get('number_abs_value', False):
                num = abs(num)

            if 'number_min_abs_value' in phrase_props and num < phrase_props['number_min_abs_value']:
                return phrase

            if phrase_props.get('number_subtract_abs_value'):
                num -= phrase_props['number_subtract_abs_value']

        num = safe_decode(num)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = NumberPhrases.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            whitespace_default = False
        elif num_type == 'ordinal':
            num = ordinal_expressions.suffixed_number(num, language)

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        return _join_phrase(phrase, num, direction, whitespace)