355 lines
13 KiB
Python
355 lines
13 KiB
Python
import random
|
|
import six
|
|
|
|
from geodata.addresses.config import address_config
|
|
from geodata.encoding import safe_decode
|
|
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
|
from geodata.math.floats import isclose
|
|
from geodata.numbers.ordinals import ordinal_expressions
|
|
from geodata.numbers.spellout import numeric_expressions
|
|
from geodata.text.tokenize import tokenize, token_types
|
|
|
|
# Memoized cumulative distributions, keyed by (alphabet tuple, b), so that each
# distinct alphabet/parameter combination is only computed once.
alphabets = {}


def sample_alphabet(alphabet, b=1.5):
    '''
    Sample an "alphabet" using a Zipfian distribution (frequent items are very
    frequent, long tail of infrequent items). If we look at something like
    unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
    "Unit Z" simply because most dwellings only have a few units. Sampling
    letters from a Zipfian distribution rather than uniformly means that instead
    of every letter having the same likelihood (1/26), letters toward the
    beginning of the alphabet are much more likely to be selected. Letters
    toward the end can still be selected sometimes, but are not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in
    order of frequency.

    :param alphabet: iterable of hashable items, most frequent first
    :param b: shape parameter forwarded to zipfian_distribution
    :return: one item of the alphabet, picked with weighted_choice
    '''
    alphabet = tuple(alphabet)

    # The distribution depends on b as well as on the alphabet, so both have
    # to be part of the cache key; keying on the alphabet alone would hand
    # back the CDF built for whichever b happened to be used first.
    cache_key = (alphabet, b)
    if cache_key not in alphabets:
        probs = zipfian_distribution(len(alphabet), b)
        alphabets[cache_key] = cdf(probs)

    return weighted_choice(alphabet, alphabets[cache_key])
|
|
|
|
# The 26 uppercase Latin letters, 'A' through 'Z', in alphabetical order
latin_alphabet = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
|
|
|
|
|
|
class NumericPhrase(object):
    '''
    Base class for rendering a number together with a configured phrase,
    either as a separate word (NUMERIC) or as an affix attached directly to
    the number (NUMERIC_AFFIX).

    Subclasses are expected to define ``key`` and ``dictionaries``; both are
    handed to ``address_config.alternative_probabilities`` to look up the
    candidate phrases.
    '''
    key = None

    NUMERIC = 'numeric'
    NUMERIC_AFFIX = 'numeric_affix'

    @classmethod
    def pick_phrase_and_type(cls, number, language, country=None):
        '''
        Choose a phrase and a rendering type for the given language/country.

        :return: a ``(num_type, phrase, props)`` triple, where ``props`` is
                 the config entry stored under ``num_type`` for the chosen
                 phrase. When the config offers no phrases the triple is
                 ``(None, text, None)`` with ``text`` being the decoded
                 number (or None if ``number`` is None), so that callers can
                 always unpack three values.
        '''
        values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
        if not values:
            bare_number = safe_decode(number) if number is not None else None
            return None, bare_number, None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Collect the rendering types that have an explicit probability
        for num_type in (cls.NUMERIC, cls.NUMERIC_AFFIX):
            prob = phrase_props.get('{}_probability'.format(num_type), None)
            if prob is not None:
                values.append(num_type)
                probs.append(prob)

        probs = cdf(probs)

        # With fewer than two weighted options there is nothing to sample
        if len(values) < 2:
            num_type = cls.NUMERIC
        else:
            num_type = weighted_choice(values, probs)

        return num_type, phrase, phrase_props[num_type]

    @classmethod
    def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
        '''
        Join ``phrase`` and ``number`` according to ``props``.

        :param number: the number as a string, or None to get the phrase alone
        :param phrase: phrase text (replaced by props['affix'] for affixes)
        :param num_type: cls.NUMERIC or cls.NUMERIC_AFFIX
        :param props: config dict; 'direction' is required, while 'affix',
                      'zero_pad', 'zero_char', 'whitespace' and 'title_case'
                      are consulted when relevant
        :param whitespace_default: whether to separate phrase and number with
                                   a space when props has no 'whitespace' key
        :return: the combined string, the phrase alone if number is None, or
                 the decoded number if the direction is not left/right
        '''
        if num_type == cls.NUMERIC_AFFIX:
            phrase = props['affix']
            # number may legitimately be None (phrase-only output), so it has
            # to be checked before calling string methods on it
            if 'zero_pad' in props and number is not None and number.isdigit():
                number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        # Title case unless the config specifies otherwise
        if props.get('title_case', True):
            phrase = phrase.title()

        if number is None:
            return phrase

        whitespace_phrase = u' ' if whitespace else u''

        # Phrase goes to the left of the number
        if direction == 'left':
            return u'{}{}{}'.format(phrase, whitespace_phrase, number)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return u'{}{}{}'.format(number, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(number)

    @classmethod
    def phrase(cls, number, language, country=None):
        '''
        Render ``number`` with a randomly chosen phrase for the language.

        :return: the combined phrase/number string; if no phrase is
                 configured, the decoded number itself (None for a None
                 number)
        '''
        num_type, phrase, props = cls.pick_phrase_and_type(number, language, country=country)
        if props is None:
            # Nothing configured: the second element is the bare number
            return phrase

        whitespace_default = num_type == cls.NUMERIC
        return cls.combine_with_number(number, phrase, num_type, props, whitespace_default=whitespace_default)
|
|
|
|
|
|
class Number(NumericPhrase):
    '''
    NumericPhrase specialisation that looks its phrases up under the
    'numbers' config key using the 'number' dictionary.
    '''
    dictionaries = ['number']
    key = 'numbers'
|
|
|
|
|
|
class NumberedComponent(object):
    '''
    Helpers for rendering a numbered component: choosing what kind of
    identifier to use and combining a number with a phrase drawn from the
    address config.
    '''
    NUMERIC = 'numeric'
    ALPHA = 'alpha'
    ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric'
    NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha'
    ROMAN_NUMERAL = 'roman_numeral'

    @classmethod
    def choose_alphanumeric_type(cls, key, language, country=None):
        '''
        Sample one of the identifier types above from the probabilities
        stored under ``key`` in the address config.

        :return: ``(num_type, num_type_props)``, where the props default to
                 an empty dict, or ``(None, None)`` if ``key`` has no config
        '''
        alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
        if alphanumeric_props is None:
            return None, None

        values = []
        probs = []

        for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.ROMAN_NUMERAL):
            prob_key = '{}_probability'.format(num_type)
            prob = alphanumeric_props.get(prob_key)
            if prob is not None:
                values.append(num_type)
                probs.append(prob)

        probs = cdf(probs)
        num_type = weighted_choice(values, probs)
        num_type_props = alphanumeric_props.get(num_type, {})

        return num_type, num_type_props

    @classmethod
    def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
        '''
        Combine ``num`` with a phrase sampled from the address config.

        :param key: config key passed to
                    address_config.alternative_probabilities
        :param num: the number/identifier (int, string or None). Values that
                    parse as int or float are converted to that type first.
        :param language: language used for the config lookup and for
                         ordinal/spellout expressions
        :param country: optional country for the config lookup
        :param dictionaries: dictionaries passed to the config lookup
        :param strict_numeric: if true, an identifier containing letters is
                               returned as-is (decoded), without a phrase
        :param is_alpha: if true, '<key>.alpha' is looked up first and
                         ``key`` is only used when that yields no values
        :return: the rendered string; the phrase alone (standalone choice,
                 no usable probabilities, or ``num`` is None); the decoded
                 number (null choice or no configured phrases); or None when
                 there are no configured phrases and ``num`` is None, when
                 the number falls outside the configured bounds, or when no
                 rendering type is applicable
        '''
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                num = int(num)
                is_integer = True
            except ValueError:
                try:
                    num = float(num)
                except ValueError:
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        # Only actual letters make the identifier "alpha";
                        # isalnum() would also flag digit-only tokens such as
                        # the "12" in "12-14"
                        if any(ch.isalpha() for ch in safe_decode(t)):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)
        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
            prob_key = '{}_probability'.format(num_type)
            prob = phrase_props.get(prob_key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # A type configured without a probability takes all of the
                # probability mass and ends the scan
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            allowed = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')]
            # zip(*[]) cannot be unpacked into two names, so bail out
            # explicitly when nothing survives the filter
            if not allowed:
                return None

            values, probs = zip(*allowed)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            else:
                num_type = 'numeric'
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            if phrase_props.get('number_abs_value', False):
                num = abs(num)

            if 'number_min_abs_value' in phrase_props and num < phrase_props['number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num > phrase_props['number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num -= phrase_props['number_subtract_abs_value']

        num = safe_decode(num)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric' and safe_decode(num).isdigit():
            values = []
            probs = []
            for cardinal_type in ('roman_numeral', 'spellout'):
                prob_key = '{}_probability'.format(cardinal_type)
                if prob_key in props:
                    values.append(cardinal_type)
                    probs.append(props[prob_key])

            # Whatever probability is left over keeps the plain digits
            values.append(None)
            probs.append(1.0 - sum(probs))

            probs = cdf(probs)

            cardinal_type = weighted_choice(values, probs)
            cardinal_expression = None
            if cardinal_type == 'roman_numeral':
                cardinal_expression = numeric_expressions.roman_numeral(num)
            elif cardinal_type == 'spellout':
                cardinal_expression = numeric_expressions.spellout_cardinal(num, language, gender=props.get('gender', None))

            if cardinal_expression is not None:
                num = cardinal_expression

        elif num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            values = []
            probs = []

            for ordinal_type in ('roman_numeral', 'spellout'):
                prob_key = '{}_probability'.format(ordinal_type)
                if prob_key in props:
                    values.append(ordinal_type)
                    probs.append(props[prob_key])

            # Whatever probability is left over goes to the suffixed digits
            values.append('digit_suffix')
            probs.append(1.0 - sum(probs))

            probs = cdf(probs)

            ordinal_type = weighted_choice(values, probs)

            ordinal_expression = None
            if ordinal_type == 'digit_suffix':
                ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))

            elif ordinal_type == 'roman_numeral':
                ordinal_expression = numeric_expressions.roman_numeral(num)
            elif ordinal_type == 'spellout':
                ordinal_expression = numeric_expressions.spellout_ordinal(num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        # Optionally drop the phrase altogether and return just the number
        if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = u' ' if whitespace else u''

        # Phrase goes to the left of the number
        if direction == 'left':
            return u'{}{}{}'.format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return u'{}{}{}'.format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)
|