[addresses] base class for numbered components (floors, units, house numbers in some languages/countries). Can generate many variants of a number (e.g. Floor 2, 2nd Floor, Floor #2, Floor No. 2, etc.)
This commit is contained in:
195
scripts/geodata/addresses/numbering.py
Normal file
195
scripts/geodata/addresses/numbering.py
Normal file
@@ -0,0 +1,195 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.numbers.ordinals import ordinal_expressions
|
||||
|
||||
|
||||
alphabets = {}
|
||||
|
||||
|
||||
def sample_alphabet(alphabet, b=1.5):
|
||||
'''
|
||||
Sample an "alphabet" using a Zipfian distribution (frequent items are very
|
||||
frequent, long tail of infrequent items). If we look at something like
|
||||
unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
|
||||
"Unit Z" simply because most dwellings only have a few units. Sampling
|
||||
letters from a Zipfian distribution rather than uniformly means that instead
|
||||
of every letter having the same likelihood (1/26), letters toward the beginning
|
||||
of the alphabet are much more likely to be selected. Letters toward the end can
|
||||
still be selected sometimes, but are not very likely.
|
||||
|
||||
Note letters don't necessarily need to be sorted alphabetically, just in order
|
||||
of frequency.
|
||||
'''
|
||||
global alphabets
|
||||
alphabet = tuple(alphabet)
|
||||
if alphabet not in alphabets:
|
||||
probs = zipfian_distribution(len(alphabet), b)
|
||||
probs_cdf = cdf(probs)
|
||||
|
||||
alphabets[alphabet] = probs_cdf
|
||||
|
||||
probs_cdf = alphabets[alphabet]
|
||||
return weighted_choice(alphabet, probs_cdf)
|
||||
|
||||
latin_alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
|
||||
|
||||
|
||||
class NumberPhrases(object):
|
||||
@classmethod
|
||||
def phrase(cls, number, language, country=None):
|
||||
values, probs = address_config.alternative_probabilities('numbers', language, dictionaries=['number'], country=country)
|
||||
if not values:
|
||||
return safe_decode(number)
|
||||
|
||||
phrase, phrase_props = weighted_choice(values, probs)
|
||||
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
for num_type in ('numeric', 'numeric_affix'):
|
||||
key = '{}_probability'.format(num_type)
|
||||
prob = phrase_props.get(key, None)
|
||||
if prob is not None:
|
||||
values.append(num_type)
|
||||
probs.append(prob)
|
||||
|
||||
probs = cdf(probs)
|
||||
|
||||
if len(values) < 2:
|
||||
num_type = 'numeric'
|
||||
else:
|
||||
num_type = weighted_choice(values, probs)
|
||||
|
||||
props = phrase_props[num_type]
|
||||
|
||||
if num_type == 'numeric':
|
||||
# Numeric phrase the default is with whitespace e.g. "No 1"
|
||||
whitespace_default = True
|
||||
elif num_type == 'numeric_affix':
|
||||
phrase = props['affix']
|
||||
# Numeric affix default is no whitespace e.g. "#1"
|
||||
whitespace_default = False
|
||||
|
||||
direction = props['direction']
|
||||
whitespace = props.get('whitespace', whitespace_default)
|
||||
if props.get('title_case', True):
|
||||
# Title case unless the config specifies otherwise
|
||||
phrase = phrase.title()
|
||||
|
||||
whitespace_phrase = six.u(' ') if whitespace else six.u('')
|
||||
# Phrase goes to the left of hte number
|
||||
if direction == 'left':
|
||||
return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
|
||||
# Phrase goes to the right of the number
|
||||
elif direction == 'right':
|
||||
return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
|
||||
# Need to specify a direction, otherwise return naked number
|
||||
else:
|
||||
return safe_decode(number)
|
||||
|
||||
|
||||
class NumberedComponent(object):
|
||||
@classmethod
|
||||
def numeric_phrase(cls, key, num, language, country=None, dictionaries=()):
|
||||
is_alpha = False
|
||||
try:
|
||||
num = int(num)
|
||||
except ValueError:
|
||||
try:
|
||||
num = float(num)
|
||||
except ValueError:
|
||||
is_alpha = True
|
||||
|
||||
# Pick a phrase given the probability distribution from the config
|
||||
values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)
|
||||
phrase, phrase_props = weighted_choice(values, probs)
|
||||
|
||||
values = []
|
||||
probs = []
|
||||
|
||||
# Dictionaries are lowercased, so title case here
|
||||
if phrase_props.get('title_case', True):
|
||||
phrase = phrase.title()
|
||||
|
||||
'''
|
||||
There are a few ways we can express the number itself
|
||||
|
||||
1. Alias it as some standalone word like basement (for floor "-1")
|
||||
2. Use the number itself, so "Floor 2"
|
||||
3. Append/prepend an affix e.g. 2/F for second floor
|
||||
4. As an ordinal expression e.g. "2nd Floor"
|
||||
'''
|
||||
have_standalone = False
|
||||
for num_type in ('standalone', 'numeric', 'numeric_affix', 'ordinal'):
|
||||
key = '{}_probability'.format(num_type)
|
||||
prob = phrase_props.get(key)
|
||||
if prob is not None:
|
||||
if num_type == 'standalone':
|
||||
have_standalone = True
|
||||
values.append(num_type)
|
||||
probs.append(prob)
|
||||
|
||||
if not probs:
|
||||
return phrase
|
||||
|
||||
# If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
|
||||
if is_alpha:
|
||||
values, probs = zip(*[(v, p) for v, p in zip(values, probs) if v in ('numeric', 'standalone')])
|
||||
total = sum(probs)
|
||||
probs = [p / total for p in probs]
|
||||
|
||||
probs = cdf(probs)
|
||||
|
||||
if len(values) < 2:
|
||||
num_type = 'standalone' if have_standalone else 'numeric'
|
||||
else:
|
||||
num_type = weighted_choice(values, probs)
|
||||
|
||||
if num_type == 'standalone':
|
||||
return phrase
|
||||
|
||||
props = phrase_props[num_type]
|
||||
|
||||
if phrase_props.get('number_abs_value', False):
|
||||
num = abs(num)
|
||||
|
||||
if 'number_min_abs_value' in phrase_props and num < phrase_props['number_min_abs_value']:
|
||||
return phrase
|
||||
|
||||
if phrase_props.get('number_subtract_abs_value'):
|
||||
num -= phrase_props['number_subtract_abs_value']
|
||||
|
||||
num = safe_decode(num)
|
||||
|
||||
# Do we add the numeric phrase e.g. Floor No 1
|
||||
add_number_phrase = props.get('add_number_phrase', False)
|
||||
if add_number_phrase and random.random() < props['add_number_phrase_probability']:
|
||||
num = NumberPhrases.phrase(num, language, country=country)
|
||||
|
||||
whitespace_default = True
|
||||
|
||||
if num_type == 'numeric_affix':
|
||||
phrase = props['affix']
|
||||
if props.get('upper_case', True):
|
||||
phrase = phrase.upper()
|
||||
whitespace_default = False
|
||||
elif num_type == 'ordinal':
|
||||
num = ordinal_expressions.suffixed_number(num, language)
|
||||
|
||||
direction = props['direction']
|
||||
whitespace = props.get('whitespace', whitespace_default)
|
||||
|
||||
whitespace_phrase = six.u(' ') if whitespace else six.u('')
|
||||
# Phrase goes to the left of hte number
|
||||
if direction == 'left':
|
||||
return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
|
||||
# Phrase goes to the right of the number
|
||||
elif direction == 'right':
|
||||
return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
|
||||
# Need to specify a direction, otherwise return naked number
|
||||
else:
|
||||
return safe_decode(num)
|
||||
Reference in New Issue
Block a user