From fc94753481337b42e14dc4ab388d2107d6b75935 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 18 May 2016 00:43:01 -0400 Subject: [PATCH] [po boxes] random PO box generation --- scripts/geodata/addresses/floors.py | 3 +- scripts/geodata/addresses/numbering.py | 3 +- scripts/geodata/addresses/po_boxes.py | 39 +++++++++++++++++++++++++- scripts/geodata/addresses/units.py | 3 +- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/scripts/geodata/addresses/floors.py b/scripts/geodata/addresses/floors.py index 7dc2b87d..fd4ff87d 100644 --- a/scripts/geodata/addresses/floors.py +++ b/scripts/geodata/addresses/floors.py @@ -2,14 +2,13 @@ import random import six from geodata.addresses.config import address_config + from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet from geodata.encoding import safe_decode from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf class Floor(NumberedComponent): - config_key = 'levels' - # When we don't know the number of floors, use a Zipfian distribution # to choose randomly between 1 and max_floors with 1 being much more # likely than 2, etc. diff --git a/scripts/geodata/addresses/numbering.py b/scripts/geodata/addresses/numbering.py index b84500dc..3c43f946 100644 --- a/scripts/geodata/addresses/numbering.py +++ b/scripts/geodata/addresses/numbering.py @@ -107,7 +107,6 @@ class NumberedComponent(object): ALPHA = 'alpha' ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric' NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha' - DIRECTIONAL = 'directional' @classmethod def choose_alphanumeric_type(cls, key, language, country=None): @@ -116,7 +115,7 @@ class NumberedComponent(object): values = [] probs = [] - for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.DIRECTIONAL): + for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA): key = '{}_probability'.format(num_type) prob = alphanumeric_props.get(key) if prob is not None: diff --git a/scripts/geodata/addresses/po_boxes.py b/scripts/geodata/addresses/po_boxes.py index f9482b1b..cd2f4703 100644 --- a/scripts/geodata/addresses/po_boxes.py +++ b/scripts/geodata/addresses/po_boxes.py @@ -1,8 +1,10 @@ import random import six +from geodata.addresses.config import address_config from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet from geodata.encoding import safe_decode +from geodata.math.sampling import cdf, weighted_choice class POBox(NumberedComponent): @@ -24,9 +26,44 @@ class POBox(NumberedComponent): return six.u('').join([safe_decode(cls.random_digits(num_digits)), suffix]) @classmethod - def random_letter(cls, alphabet=latin_alphabet): + def random_letter(cls, language, country=None): + alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet) return sample_alphabet(alphabet) + @classmethod + def random(cls, language, country=None): + num_type, num_type_props = cls.choose_alphanumeric_type('po_boxes.alphanumeric', language, country=country) + + if num_type != cls.ALPHA: + digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[]) + values = [] + probs = [] + + for val in digit_config: + values.append(val['length']) + probs.append(val['probability']) + + probs = cdf(probs) + + num_digits = weighted_choice(values, probs) + + number = cls.random_digits(num_digits) + + if num_type == cls.NUMERIC: + return safe_decode(number) + else: + letter = cls.random_letter(language, country=country) + + whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0)) + whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('') + + if num_type == cls.ALPHA_PLUS_NUMERIC: + return six.u('{}{}{}').format(letter, whitespace_phrase, number) + elif num_type == cls.NUMERIC_PLUS_ALPHA: + return six.u('{}{}{}').format(number, whitespace_phrase, letter) + else: + return cls.random_letter(language, country=country) + @classmethod def phrase(cls, box_number, language, country=None): return cls.numeric_phrase('po_boxes.alphanumeric', safe_decode(box_number), language, diff --git a/scripts/geodata/addresses/units.py b/scripts/geodata/addresses/units.py index 5f3230b1..b3f55612 100644 --- a/scripts/geodata/addresses/units.py +++ b/scripts/geodata/addresses/units.py @@ -5,6 +5,7 @@ from geodata.addresses.config import address_config from geodata.addresses.directions import RelativeDirection from geodata.addresses.floors import Floor from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet +from geodata.configs.utils import nested_get from geodata.encoding import safe_decode from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf @@ -67,7 +68,7 @@ class Unit(NumberedComponent): else: number = weighted_choice(cls.positive_units_letters, cls.positive_units_letters_cdf) - whitespace_probability = num_type_props.get('{}_whitespace_probability'.format(num_type)) + whitespace_probability = nested_get(num_type_props, (num_type, 'whitespace_probability')) whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('') if num_type == cls.ALPHA_PLUS_NUMERIC: