Files
libpostal/scripts/geodata/addresses/numbering.py

355 lines
13 KiB
Python

import random
import six
from geodata.addresses.config import address_config
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.math.floats import isclose
from geodata.numbers.ordinals import ordinal_expressions
from geodata.numbers.spellout import numeric_expressions
from geodata.text.tokenize import tokenize, token_types
# Module-level cache of cumulative distributions filled in lazily by
# sample_alphabet, so each distribution is computed only once.
alphabets = dict()
def sample_alphabet(alphabet, b=1.5):
    '''
    Sample one item from an "alphabet" using a Zipfian distribution (frequent
    items are very frequent, long tail of infrequent items). If we look at
    something like unit numbers, "Unit A" or "Unit B" are much more likely
    than "Unit X" or "Unit Z" simply because most dwellings only have a few
    units. Sampling letters from a Zipfian distribution rather than uniformly
    means that instead of every letter having the same likelihood (1/26),
    letters toward the beginning of the alphabet are much more likely to be
    selected. Letters toward the end can still be selected sometimes, but are
    not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in
    order of frequency.

    :param alphabet: iterable of items, most frequent first
    :param b: parameter passed through to zipfian_distribution
    :return: one item of the alphabet, as chosen by weighted_choice
    '''
    alphabet = tuple(alphabet)

    # The distribution depends on both the alphabet and b, so both must be
    # part of the cache key. Keying on the alphabet alone would make a second
    # call with a different b silently reuse the first call's distribution.
    cache_key = (alphabet, b)

    probs_cdf = alphabets.get(cache_key)
    if probs_cdf is None:
        probs_cdf = cdf(zipfian_distribution(len(alphabet), b))
        alphabets[cache_key] = probs_cdf

    return weighted_choice(alphabet, probs_cdf)
# Upper-case Latin letters 'A' through 'Z', in alphabetical order
latin_alphabet = list(map(chr, range(ord('A'), ord('Z') + 1)))
class NumericPhrase(object):
    '''
    Base class for attaching a configured phrase or affix to a number.

    Subclasses set `key` (the config key to look phrases up under) and
    `dictionaries` (passed to address_config.alternative_probabilities).
    '''
    key = None

    NUMERIC = 'numeric'
    NUMERIC_AFFIX = 'numeric_affix'

    @classmethod
    def pick_phrase_and_type(cls, number, language, country=None):
        '''
        Pick a phrase from the config and decide how to combine it with a number.

        :param number: unused here; kept so the signature stays stable
        :param language: language passed to the config lookup
        :param country: optional country passed to the config lookup
        :return: a 3-tuple (num_type, phrase, props) where num_type is
                 NUMERIC or NUMERIC_AFFIX and props is the phrase's config
                 section for that type. When no phrases are configured the
                 tuple is (None, None, None), so callers can always unpack
                 three values.
        '''
        values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
        if not values:
            # Always return a 3-tuple: a bare scalar here would make the
            # unpacking in phrase() raise (or split a 3-character string).
            return None, None, None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        for num_type in (cls.NUMERIC, cls.NUMERIC_AFFIX):
            key = '{}_probability'.format(num_type)
            prob = phrase_props.get(key, None)
            if prob is not None:
                values.append(num_type)
                probs.append(prob)

        probs = cdf(probs)

        # With fewer than two candidates there is nothing to sample
        if len(values) < 2:
            num_type = cls.NUMERIC
        else:
            num_type = weighted_choice(values, probs)

        return num_type, phrase, phrase_props[num_type]

    @classmethod
    def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
        '''
        Join a phrase (or affix) and a number according to the given props.

        :param number: number as text, or None to get the phrase alone
        :param phrase: phrase text; replaced by props['affix'] for affixes
        :param num_type: NUMERIC or NUMERIC_AFFIX
        :param props: config dict read for 'direction' (required) and the
                      optional 'affix', 'zero_pad', 'zero_char', 'whitespace'
                      and 'title_case' keys; None means no phrase was
                      available
        :param whitespace_default: whether to separate phrase and number with
                                   a space when props has no 'whitespace' key
        :return: the combined text; the phrase alone if number is None; the
                 bare number if props is None or the direction is neither
                 'left' nor 'right'
        '''
        if props is None:
            # No phrase was configured, fall back to the naked number
            return safe_decode(number) if number is not None else None

        if num_type == cls.NUMERIC_AFFIX:
            phrase = props['affix']
            # Only pad when there is a number to pad; number may be None
            if 'zero_pad' in props and number is not None and number.isdigit():
                number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        if props.get('title_case', True):
            # Title case unless the config specifies otherwise
            phrase = phrase.title()

        if number is None:
            return phrase

        whitespace_phrase = six.u(' ') if whitespace else six.u('')

        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(number)

    @classmethod
    def phrase(cls, number, language, country=None):
        '''
        Pick a phrase for the language/country and combine it with the number.

        :return: the combined text, or the bare number (None if number is
                 None) when no phrases are configured
        '''
        num_type, phrase, props = cls.pick_phrase_and_type(number, language, country=country)
        # Plain numeric phrases are separated from the number by a space
        # unless the config says otherwise
        whitespace_default = num_type == cls.NUMERIC
        return cls.combine_with_number(number, phrase, num_type, props, whitespace_default=whitespace_default)
class Number(NumericPhrase):
    '''
    Numeric phrase looked up under the 'numbers' config key, using the
    'number' dictionary.
    '''
    dictionaries = ['number']
    key = 'numbers'
class NumberedComponent(object):
    '''
    Helpers for rendering numbered address components (the comments below
    use examples such as "Floor 2" and "Unit 2L") from the address config.
    '''
    NUMERIC = 'numeric'
    ALPHA = 'alpha'
    ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric'
    NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha'
    ROMAN_NUMERAL = 'roman_numeral'

    @classmethod
    def choose_alphanumeric_type(cls, key, language, country=None):
        '''
        Sample one of the alphanumeric types configured under the given key.

        :param key: config property holding the '<type>_probability' entries
        :param language: language passed to the config lookup
        :param country: optional country passed to the config lookup
        :return: (num_type, num_type_props), where num_type_props is the
                 config section for the chosen type (empty dict if absent),
                 or (None, None) if the property is not configured
        '''
        alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
        if alphanumeric_props is None:
            return None, None

        values = []
        probs = []

        for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.ROMAN_NUMERAL):
            prob_key = '{}_probability'.format(num_type)
            prob = alphanumeric_props.get(prob_key)
            if prob is not None:
                values.append(num_type)
                probs.append(prob)

        probs = cdf(probs)
        num_type = weighted_choice(values, probs)
        num_type_props = alphanumeric_props.get(num_type, {})

        return num_type, num_type_props

    @classmethod
    def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
        '''
        Render a number together with a phrase sampled from the config.

        :param key: config key the candidate phrases are looked up under
        :param num: the number; may be None, an integer or float (or text
                    parseable as one), or arbitrary text such as "2L"
        :param language: language passed to the config lookups
        :param country: optional country passed to the config lookups
        :param dictionaries: dictionaries passed to the config lookups
        :param strict_numeric: if True, text that is neither an int nor a
                               float and contains an alphanumeric token is
                               returned as-is (decoded)
        :param is_alpha: if True, phrases under '<key>.alpha' are tried
                         before those under key
        :return: the rendered text. None is returned when num is None and no
                 phrase is configured, when an integer falls outside the
                 configured min/max, or when no usable number type remains
                 for an alphanumeric value.
        '''
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False

        if num is not None:
            try:
                num = int(num)
                is_integer = True
            except ValueError:
                try:
                    # NOTE(review): num is rebound to the parsed float, so
                    # text such as "1e5" is later rendered from the float
                    # rather than preserved verbatim - confirm intent.
                    num = float(num)
                except ValueError:
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        # NOTE(review): isalnum() is also true for
                        # digit-only tokens, so has_alpha can be set for
                        # input without letters - confirm intent.
                        if t.isalnum():
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)
        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
            prob_key = '{}_probability'.format(num_type)
            prob = phrase_props.get(prob_key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # A type configured without a probability takes all of the
                # weight and ends the scan
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            plain_types = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')]
            if not plain_types:
                # Nothing left to choose from. Unpacking zip(*[]) would raise
                # ValueError, so bail out the same way as the zero-total
                # case below.
                return None

            values, probs = zip(*plain_types)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            # Renormalize so the remaining probabilities sum to 1
            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            else:
                num_type = 'numeric'
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            if phrase_props.get('number_abs_value', False):
                num = abs(num)

            if 'number_min_abs_value' in phrase_props and num < phrase_props['number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num > phrase_props['number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num -= phrase_props['number_subtract_abs_value']

        num = safe_decode(num)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric' and safe_decode(num).isdigit():
            values = []
            probs = []
            for cardinal_type in ('roman_numeral', 'spellout'):
                prob_key = '{}_probability'.format(cardinal_type)
                if prob_key in props:
                    values.append(cardinal_type)
                    probs.append(props[prob_key])

            # Remaining probability mass keeps the plain digits
            values.append(None)
            probs.append(1.0 - sum(probs))

            probs = cdf(probs)

            cardinal_type = weighted_choice(values, probs)
            cardinal_expression = None
            if cardinal_type == 'roman_numeral':
                cardinal_expression = numeric_expressions.roman_numeral(num)
            elif cardinal_type == 'spellout':
                cardinal_expression = numeric_expressions.spellout_cardinal(num, language, gender=props.get('gender', None))

            if cardinal_expression is not None:
                num = cardinal_expression
        elif num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            values = []
            probs = []
            for ordinal_type in ('roman_numeral', 'spellout'):
                prob_key = '{}_probability'.format(ordinal_type)
                if prob_key in props:
                    values.append(ordinal_type)
                    probs.append(props[prob_key])

            # Remaining probability mass goes to the digit + suffix form
            values.append('digit_suffix')
            probs.append(1.0 - sum(probs))

            probs = cdf(probs)

            ordinal_type = weighted_choice(values, probs)
            ordinal_expression = None
            if ordinal_type == 'digit_suffix':
                ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))
            elif ordinal_type == 'roman_numeral':
                ordinal_expression = numeric_expressions.roman_numeral(num)
            elif ordinal_type == 'spellout':
                ordinal_expression = numeric_expressions.spellout_ordinal(num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        # Sometimes drop the phrase entirely and return just the number
        if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')

        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)