424 lines
15 KiB
Python
424 lines
15 KiB
Python
# -*- coding: utf-8 -*-
|
||
import random
|
||
import six
|
||
|
||
from geodata.addresses.config import address_config
|
||
from geodata.encoding import safe_decode
|
||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||
from geodata.math.floats import isclose
|
||
from geodata.numbers.ordinals import ordinal_expressions
|
||
from geodata.numbers.spellout import numeric_expressions
|
||
from geodata.text.tokenize import tokenize, token_types
|
||
|
||
# Memo of cumulative distributions built by sample_alphabet; filled lazily
# the first time a given alphabet is sampled.
alphabets = {}
|
||
|
||
|
||
def sample_alphabet(alphabet, b=1.5):
    '''
    Sample an "alphabet" using a Zipfian distribution (frequent items are very
    frequent, long tail of infrequent items). If we look at something like
    unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
    "Unit Z" simply because most dwellings only have a few units. Sampling
    letters from a Zipfian distribution rather than uniformly means that instead
    of every letter having the same likelihood (1/26), letters toward the beginning
    of the alphabet are much more likely to be selected. Letters toward the end can
    still be selected sometimes, but are not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in order
    of frequency.

    :param alphabet: iterable of items, most frequent first
    :param b: second argument forwarded to zipfian_distribution
    :return: one item of the alphabet, chosen by weighted_choice
    '''
    alphabet = tuple(alphabet)

    # The distribution depends on b as well as on the items, so both belong
    # in the memo key. Keying on the alphabet alone would hand back the CDF
    # computed for whichever b happened to be used first.
    cache_key = (alphabet, b)

    probs_cdf = alphabets.get(cache_key)
    if probs_cdf is None:
        probs_cdf = cdf(zipfian_distribution(len(alphabet), b))
        alphabets[cache_key] = probs_cdf

    return weighted_choice(alphabet, probs_cdf)
|
||
|
||
# Upper-case Latin letters 'A' through 'Z', in alphabetical order
latin_alphabet = list(map(chr, range(ord('A'), ord('Z') + 1)))
|
||
|
||
|
||
class Digits(object):
    '''
    Randomly rewrites a string of digits into an alternative rendering:
    spelled-out words, Unicode full-width digits or a Roman numeral. The
    choice is driven by "<type>_probability" entries in a props dictionary.
    '''
    # Rendering types
    ASCII = 'ascii'
    SPELLOUT = 'spellout'
    UNICODE_FULL_WIDTH = 'unicode_full_width'
    ROMAN_NUMERAL = 'roman_numeral'

    # Kinds of number used when spelling out
    CARDINAL = 'cardinal'
    ORDINAL = 'ordinal'

    # ASCII digit -> full-width digit (U+FF10 through U+FF19). Written as
    # escapes so the table cannot silently collapse into an identity mapping
    # if the file is re-encoded or Unicode-normalised.
    unicode_full_width_map = {
        '0': six.u('\uff10'),
        '1': six.u('\uff11'),
        '2': six.u('\uff12'),
        '3': six.u('\uff13'),
        '4': six.u('\uff14'),
        '5': six.u('\uff15'),
        '6': six.u('\uff16'),
        '7': six.u('\uff17'),
        '8': six.u('\uff18'),
        '9': six.u('\uff19'),
    }

    @classmethod
    def rewrite_full_width(cls, s):
        '''
        Replace every ASCII digit in s with its full-width counterpart,
        leaving all other characters untouched.
        '''
        return six.u('').join([cls.unicode_full_width_map.get(c, c) for c in s])

    @classmethod
    def rewrite_roman_numeral(cls, s):
        '''
        Return the Roman numeral for s when s is all digits and
        numeric_expressions.roman_numeral yields a truthy result,
        otherwise return s unchanged.
        '''
        if s.isdigit():
            roman_numeral = numeric_expressions.roman_numeral(s)
            if roman_numeral:
                return roman_numeral
        return s

    @classmethod
    def rewrite_spellout(cls, s, lang, num_type, props):
        '''
        Spell out s in the given language as a cardinal or ordinal
        (per num_type), title-cased. The optional 'gender' and 'category'
        entries of props are forwarded to the spellout functions.

        Returns s unchanged if it is not all digits, if num_type is neither
        CARDINAL nor ORDINAL, or if no spellout is produced.
        '''
        if not s.isdigit():
            return s

        num = int(s)
        gender = props.get('gender')
        category = props.get('category')

        spellout = None
        if num_type == cls.CARDINAL:
            spellout = numeric_expressions.spellout_cardinal(num, lang, gender=gender, category=category)
        elif num_type == cls.ORDINAL:
            spellout = numeric_expressions.spellout_ordinal(num, lang, gender=gender, category=category)

        if spellout:
            return spellout.title()
        return s

    @classmethod
    def rewrite(cls, d, lang, props, num_type=CARDINAL):
        '''
        Pick a rendering for the digit string d at random and apply it.

        props may contain 'spellout_probability',
        'unicode_full_width_probability' and 'roman_numeral_probability'.
        Whatever probability mass they leave unclaimed goes to keeping the
        ASCII digits. For Roman numerals, 'ordinal_suffix_probability' is the
        chance of appending the ordinal suffix for d.

        If props is empty or None, d is returned exactly as passed in.
        '''
        if not props:
            return d

        d = safe_decode(d)

        values = []
        probs = []

        for digit_type in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL):
            key = '{}_probability'.format(digit_type)
            if key in props:
                values.append(digit_type)
                probs.append(props[key])

        # Remaining probability mass means "leave the digits as they are"
        total = sum(probs)
        if not isclose(total, 1.0):
            values.append(cls.ASCII)
            probs.append(1.0 - total)

        probs = cdf(probs)
        digit_type = weighted_choice(values, probs)

        if digit_type == cls.SPELLOUT:
            return cls.rewrite_spellout(d, lang, num_type, props)
        elif digit_type == cls.ROMAN_NUMERAL:
            roman_numeral = cls.rewrite_roman_numeral(d)
            if random.random() < props.get('ordinal_suffix_probability', 0.0):
                # The suffix is looked up for the original digits, not the numeral
                ordinal_suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
                if ordinal_suffix:
                    roman_numeral = six.u('{}{}').format(roman_numeral, ordinal_suffix)
            return roman_numeral
        elif digit_type == cls.UNICODE_FULL_WIDTH:
            return cls.rewrite_full_width(d)

        # ASCII, or anything unrecognised: keep the digits as they are
        return d
|
||
|
||
|
||
class NumericPhrase(object):
    '''
    Base class for phrases that accompany a number. A phrase is sampled from
    the address config and attached to the number either as a plain phrase
    or as an affix.

    Subclasses set `key` (the config key) and must also define
    `dictionaries`, which pick_phrase_and_type passes to the config lookup.
    '''
    key = None

    NUMERIC = 'numeric'
    NUMERIC_AFFIX = 'numeric_affix'

    @classmethod
    def pick_phrase_and_type(cls, number, language, country=None):
        '''
        Sample a phrase and the way it should be attached to the number.

        :return: tuple (num_type, phrase, props). When the config has no
                 alternatives for this key the tuple is
                 (None, <decoded number or None>, None).
        '''
        values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
        if not values:
            return None, safe_decode(number) if number is not None else None, None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        for num_type in (cls.NUMERIC, cls.NUMERIC_AFFIX):
            prob = phrase_props.get('{}_probability'.format(num_type), None)
            if prob is not None:
                values.append(num_type)
                probs.append(prob)

        if not probs:
            # No explicit probabilities: default to the plain numeric form
            num_type = cls.NUMERIC
        else:
            probs = cdf(probs)
            num_type = weighted_choice(values, probs)

        return num_type, phrase, phrase_props[num_type]

    @classmethod
    def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
        '''
        Join phrase and number according to props.

        props entries read here: 'affix', 'zero_pad', 'zero_char',
        'direction' (required), 'whitespace', 'whitespace_probability' and
        'title_case'.

        :return: the combined string; the phrase alone if number is None;
                 the bare number if props is None or the direction is
                 neither 'left' nor 'right'.
        '''
        if props is None:
            # pick_phrase_and_type hands back props=None when nothing is
            # configured; there is no phrase to attach, so return the naked
            # number instead of failing on props['direction'].
            return None if number is None else safe_decode(number)

        if num_type == cls.NUMERIC_AFFIX:
            phrase = props['affix']
            # number may legitimately be None (handled below), so it has to
            # be checked before calling string methods on it
            if 'zero_pad' in props and number is not None and number.isdigit():
                number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)
        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        if props.get('title_case', True):
            # Title case unless the config specifies otherwise
            phrase = phrase.title()

        if number is None:
            return phrase

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(number)

    @classmethod
    def phrase(cls, number, language, country=None):
        '''
        Sample a phrase for the number and return the combined string.
        Plain numeric phrases are separated by whitespace by default,
        affixes are not.
        '''
        num_type, phrase, props = cls.pick_phrase_and_type(number, language, country=country)
        whitespace_default = num_type == cls.NUMERIC
        return cls.combine_with_number(number, phrase, num_type, props, whitespace_default=whitespace_default)
|
||
|
||
|
||
class Number(NumericPhrase):
    '''
    Numeric phrase read from the "numbers" config key, using the "number"
    dictionary.
    '''
    dictionaries = ['number']
    key = 'numbers'
|
||
|
||
|
||
class NumberedComponent(object):
    '''
    Shared logic for components that carry a number, where the number can be
    expressed in several ways next to a phrase sampled from the address
    config.
    '''
    NUMERIC = 'numeric'
    ALPHA = 'alpha'
    ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric'
    NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha'
    ROMAN_NUMERAL = 'roman_numeral'

    @classmethod
    def choose_alphanumeric_type(cls, key, language, country=None):
        '''
        Sample one of the alphanumeric types from the "<type>_probability"
        entries of the config property stored under key.

        :return: tuple (num_type, props for that type, {} if absent), or
                 (None, None) when the property is missing or declares no
                 probabilities.
        '''
        alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
        if alphanumeric_props is None:
            return None, None

        values = []
        probs = []

        for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.ROMAN_NUMERAL):
            prob = alphanumeric_props.get('{}_probability'.format(num_type))
            if prob is not None:
                values.append(num_type)
                probs.append(prob)

        if not values:
            return None, None

        probs = cdf(probs)
        num_type = weighted_choice(values, probs)
        num_type_props = alphanumeric_props.get(num_type, {})

        return num_type, num_type_props

    @classmethod
    def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
        '''
        Build a phrase for num using a phrase sampled from the config entry
        key (or "<key>.alpha" first when is_alpha is set).

        :param num: the number; may be None, an integer-like value, a float
                    string or an alphanumeric string such as "2L"
        :param strict_numeric: if true, a value containing letters is
                               returned as-is without any phrase
        :return: the phrase string, the bare number, or None when the
                 sampled phrase cannot be used for this number (outside the
                 configured min/max, or no usable number type)
        '''
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                int(num)
                is_integer = True
            except ValueError:
                try:
                    float(num)
                except ValueError:
                    # Neither int nor float: inspect the tokens to see
                    # whether there are letters and/or numeric parts
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        if any((ch.isalpha() for ch in t)):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)
        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        '''
        There are a few ways we can express the number itself

        1. Alias it as some standalone word like basement (for floor "-1")
        2. Use the number itself, so "Floor 2"
        3. Append/prepend an affix e.g. 2/F for second floor
        4. As an ordinal expression e.g. "2nd Floor"
        '''
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
            prob = phrase_props.get('{}_probability'.format(num_type))
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # A type configured without a probability takes all the
                # remaining mass and ends the scan
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            usable = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')]
            if not usable:
                # Only ordinal/affix forms are configured; unpacking
                # zip(*[]) would raise, so treat it like the zero-total case
                return None
            values, probs = zip(*usable)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            elif 'numeric' in phrase_props:
                num_type = 'numeric'
            else:
                # The only candidate is e.g. 'ordinal' and there is no
                # 'numeric' entry to fall back on: use the candidate rather
                # than failing on phrase_props['numeric']
                num_type = values[0]
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            num = int(num)
            if phrase_props.get('number_abs_value', False):
                num = abs(num)

            if 'number_min_abs_value' in phrase_props and num < phrase_props['number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num > phrase_props['number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num -= phrase_props['number_subtract_abs_value']

        num = safe_decode(num)
        digits_props = props.get('digits')
        if digits_props:
            # Inherit the gender and category e.g. for ordinals. Work on a
            # copy so the shared config dictionary is not modified.
            digits_props = dict(digits_props)
            for k in ('gender', 'category'):
                if k in props:
                    digits_props[k] = props[k]
            num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        # Optionally drop the phrase altogether and return just the number
        if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)
|