Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions
--- a/scripts/geodata/addresses/init.py
+++ b/scripts/geodata/addresses/init.py
--- a/scripts/geodata/addresses/blocks.py
+++ b/scripts/geodata/addresses/blocks.py
@@ -0,0 +1,59 @@
+import random
+import six
+
+from geodata.addresses.config import address_config
+from geodata.addresses.numbering import NumberedComponent
+from geodata.encoding import safe_decode
+
+from geodata.configs.utils import nested_get
+from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
+from geodata.encoding import safe_decode
+from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
+
+
+class Block(NumberedComponent):
+    '''
+    Random block identifiers (numeric, alphabetic or mixed) and the
+    phrases built around them, driven by the 'blocks.*' address config keys.
+    '''
+    # Highest block number that can be sampled
+    max_blocks = 10
+
+    # Candidate numbers 1..max_blocks, weighted by a Zipfian distribution
+    # (exponent 2.0) and stored as a CDF for weighted_choice
+    block_range = range(1, max_blocks + 1)
+    block_range_probs = zipfian_distribution(len(block_range), 2.0)
+    block_range_cdf = cdf(block_range_probs)
+
+    @classmethod
+    def random(cls, language, country=None):
+        '''
+        Sample a block identifier for the given language/country.
+
+        Returns None when no alphanumeric type could be chosen, or when the
+        chosen type is not one of the types handled below.
+        '''
+        num_type, num_type_props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
+        if num_type is None:
+            return None
+
+        if num_type == cls.NUMERIC:
+            return safe_decode(weighted_choice(cls.block_range, cls.block_range_cdf))
+
+        # Every remaining type needs a letter
+        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
+        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
+        # With probability (1 - alphabet_probability) fall back to Latin letters
+        if alphabet_probability is not None and random.random() >= alphabet_probability:
+            alphabet = latin_alphabet
+        letter = sample_alphabet(alphabet, 2.0)
+
+        if num_type == cls.ALPHA:
+            return safe_decode(letter)
+
+        number = weighted_choice(cls.block_range, cls.block_range_cdf)
+
+        # Optional space between the letter and the number; the random draw
+        # only happens when the configured probability is non-zero
+        whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
+        if whitespace_probability and random.random() < whitespace_probability:
+            separator = six.u(' ')
+        else:
+            separator = six.u('')
+
+        if num_type == cls.ALPHA_PLUS_NUMERIC:
+            return six.u('{}{}{}').format(letter, separator, number)
+        if num_type == cls.NUMERIC_PLUS_ALPHA:
+            return six.u('{}{}{}').format(number, separator, letter)
+        return None
+
+    @classmethod
+    def phrase(cls, block, language, country=None):
+        '''
+        Wrap a block identifier in a phrase with probability
+        'blocks.alphanumeric_phrase_probability' (default 0.0).
+
+        Returns None when block is None or when no phrase is generated.
+        '''
+        if block is None:
+            return None
+
+        phrase_prob = address_config.get_property('blocks.alphanumeric_phrase_probability', language, country=country, default=0.0)
+        if not (random.random() < phrase_prob):
+            return None
+
+        return cls.numeric_phrase('blocks.alphanumeric', block, language,
+                                  dictionaries=['qualifiers'], country=country)
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
--- a/scripts/geodata/addresses/config.py
+++ b/scripts/geodata/addresses/config.py
@@ -0,0 +1,152 @@
+
+import copy
+import os
+import six
+import yaml
+
+from collections import Mapping
+
+from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
+from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge, alternative_probabilities
+from geodata.math.sampling import cdf, check_probability_distribution
+
+
+# Absolute, symlink-resolved directory containing this module; the resource
+# paths below are built relative to it.
+this_dir = os.path.realpath(os.path.dirname(__file__))
+
+# <this_dir>/../../../resources/addresses: default directory that
+# AddressConfig lists for per-language *.yaml files.
+ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                  'resources', 'addresses')
+
+# <this_dir>/../../../resources/dictionaries: default value of
+# AddressConfig's dictionaries_dir argument.
+DICTIONARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                'resources', 'dictionaries')
+
+
+class AddressConfig(object):
+    '''
+    Per-language address configuration loaded from YAML files, plus helpers
+    for turning config entries into probability distributions over phrases.
+
+    Attributes:
+        address_configs: language (YAML file name without '.yaml') -> config
+            dict. Each config has a 'countries' key whose values are the
+            language-level config merged with the country overrides.
+        cache: memoized results of alternative_probabilities.
+        sample_phrases: (language, dictionary) -> {canonical: surface forms},
+            built from address_phrase_dictionaries.
+    '''
+    def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
+        '''
+        Load every '*.yaml' file in config_dir.
+
+        dictionaries_dir is accepted for interface compatibility but is not
+        read by this constructor.
+        '''
+        self.address_configs = {}
+        self.cache = {}
+
+        for filename in os.listdir(config_dir):
+            if not filename.endswith('.yaml'):
+                continue
+            # Open the file from the directory that was listed (config_dir),
+            # and close the handle as soon as it has been parsed.
+            with open(os.path.join(config_dir, filename)) as f:
+                # NOTE(review): yaml.load can construct arbitrary objects;
+                # consider yaml.safe_load if these files are plain data.
+                config = yaml.load(f)
+            countries = config.pop('countries', {})
+
+            for k in countries.keys():
+                country_config = countries[k]
+                # Each country starts from a private copy of the language config
+                config_copy = copy.deepcopy(config)
+                countries[k] = recursive_merge(config_copy, country_config)
+
+            config['countries'] = countries
+
+            lang = filename.rsplit('.yaml')[0]
+            self.address_configs[lang] = config
+
+        self.sample_phrases = {}
+
+        for language in address_phrase_dictionaries.languages:
+            for dictionary in address_phrase_dictionaries.language_dictionaries[language]:
+                self.sample_phrases[(language, dictionary)] = {}
+                for phrases in address_phrase_dictionaries.phrases[(language, dictionary)]:
+                    # First entry is the canonical form, the rest are variants
+                    self.sample_phrases[(language, dictionary)][phrases[0]] = phrases[1:]
+
+    def get_property(self, key, language, country=None, default=None):
+        '''
+        Look up a dotted key (e.g. 'blocks.alphanumeric') in the config for
+        language, using the merged country config when one exists.
+
+        Returns default when the key is not present.
+        '''
+        keys = key.split('.')
+        config = self.address_configs.get(language, {})
+
+        if country:
+            country_config = config.get('countries', {}).get(country, {})
+            if country_config:
+                config = country_config
+
+        value = nested_get(config, keys)
+        if value is not DoesNotExist:
+            return value
+
+        return default
+
+    def cache_key(self, prop, language, dictionaries=(), country=None):
+        '''Build the hashable key used to memoize alternative_probabilities.'''
+        return (prop, language, country, tuple(dictionaries))
+
+    def alternative_probabilities(self, prop, language, dictionaries=(), country=None):
+        '''
+        Get a probability distribution over alternatives.
+
+        Returns a (forms, cdf) pair where forms is a list of
+        (phrase, properties) tuples, or (None, None) when the property or its
+        alternatives are missing. Successful results are cached.
+
+        Raises AssertionError (re-raised from check_probability_distribution)
+        when the combined probabilities are not a valid distribution.
+        '''
+        key = self.cache_key(prop, language, dictionaries, country=country)
+        if key not in self.cache:
+            properties = self.get_property(prop, language, country=country, default=None)
+
+            if properties is None:
+                return None, None
+
+            alternatives, probs = alternative_probabilities(properties)
+            if alternatives is None:
+                return None, None
+
+            forms = []
+            form_probs = []
+
+            for props, prob in zip(alternatives, probs):
+                phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
+                forms.extend([(p, props) for p in phrases])
+                # Joint probability of the alternative and the specific form
+                form_probs.extend([prob * p for p in phrase_probs])
+
+            sample_probability = properties.get('sample_probability')
+            if sample_probability is not None:
+                sample_phrases = []
+                for dictionary in dictionaries:
+                    # Default must be a mapping since it is iterated as one
+                    phrases = self.sample_phrases.get((language, dictionary), {})
+                    for canonical, surface_forms in six.iteritems(phrases):
+                        sample_phrases.append(canonical)
+                        sample_phrases.extend(surface_forms)
+                # With nothing to sample, skip the split instead of dividing
+                # by zero; the distribution check below then reports the
+                # missing probability mass.
+                if sample_phrases:
+                    # Note: use the outer properties dictionary e.g. units.alphanumeric
+                    forms.extend([(p, properties) for p in sample_phrases])
+                    per_phrase_prob = float(sample_probability) / len(sample_phrases)
+                    form_probs.extend([per_phrase_prob] * len(sample_phrases))
+
+            try:
+                check_probability_distribution(form_probs)
+            except AssertionError:
+                print('values were: {}'.format(forms))
+                raise
+
+            form_probs_cdf = cdf(form_probs)
+            self.cache[key] = (forms, form_probs_cdf)
+        return self.cache[key]
+
+    def form_probabilities(self, properties, language, dictionaries=()):
+        '''
+        Expand one alternative into its surface forms and their probabilities.
+
+        The canonical form is always included; the abbreviated form is added
+        when 'abbreviated_probability' is set; dictionary variants of the
+        canonical phrase are added when both 'sample' and
+        'sample_probability' are set. If sampling is requested but no
+        variants exist, the remaining probabilities are renormalized.
+
+        Returns (alternatives, probs) as parallel lists.
+        Raises AssertionError when probs is not a valid distribution.
+        '''
+        probs = []
+        alternatives = []
+        canonical_prob = properties.get('canonical_probability', 1.0)
+        canonical = properties['canonical']
+
+        alternatives.append(canonical)
+        probs.append(canonical_prob)
+
+        if 'abbreviated_probability' in properties:
+            probs.append(properties['abbreviated_probability'])
+            abbreviated = properties['abbreviated']
+            assert isinstance(abbreviated, six.string_types)
+            alternatives.append(abbreviated)
+
+        if properties.get('sample', False) and 'sample_probability' in properties:
+            sample_prob = properties['sample_probability']
+            samples = set()
+            for dictionary in dictionaries:
+                phrases = self.sample_phrases.get((language, dictionary), {})
+                samples |= set(phrases.get(canonical, []))
+            if 'sample_exclude' in properties:
+                samples -= set(properties['sample_exclude'])
+            if samples:
+                # Spread the sample probability evenly over the variants
+                for phrase in samples:
+                    probs.append(sample_prob / float(len(samples)))
+                    alternatives.append(phrase)
+            else:
+                # No variants available: rescale what we have to sum to 1
+                total = sum(probs)
+                probs = [p / total for p in probs]
+
+        try:
+            check_probability_distribution(probs)
+        except AssertionError:
+            print('values were: {}'.format(alternatives))
+            raise
+
+        return alternatives, probs
+
+# Shared module-level instance, built with the default directories when this
+# module is imported.
+address_config = AddressConfig()
--- a/scripts/geodata/addresses/conjunctions.py
+++ b/scripts/geodata/addresses/conjunctions.py
@@ -0,0 +1,37 @@
+import six
+from geodata.addresses.config import address_config
+from geodata.encoding import safe_decode
+from geodata.math.sampling import weighted_choice
+
+
+class Conjunction(object):
+    '''
+    Joins several phrases with a language-specific conjunction chosen at
+    random from the config entry named by `key`.
+    '''
+    # Separators used for the leading phrases when the config has no 'default_join'
+    DEFAULT_WHITESPACE_JOIN = ', '
+    DEFAULT_NON_WHITESPACE_JOIN = ''
+    key = 'and'
+
+    @classmethod
+    def join(cls, phrases, language, country=None):
+        '''
+        Join phrases into a single unicode string.
+
+        The last 'max_phrase_join' phrases (default 2) are joined with the
+        sampled conjunction; any earlier phrases are joined, and followed,
+        by the default separator.
+
+        Raises ValueError if phrases has no __iter__ attribute.
+        '''
+        if not hasattr(phrases, '__iter__'):
+            raise ValueError('Param phrases must be iterable')
+
+        values, probs = address_config.alternative_probabilities(cls.key, language, country=country)
+        phrase, props = weighted_choice(values, probs)
+
+        whitespace = props.get('whitespace', True)
+        phrases = [safe_decode(p) for p in phrases]
+        max_phrase_join = props.get('max_phrase_join', 2)
+
+        prefix = six.u('')
+        if len(phrases) > max_phrase_join:
+            fallback_join = cls.DEFAULT_WHITESPACE_JOIN if whitespace else cls.DEFAULT_NON_WHITESPACE_JOIN
+            default_join = safe_decode(props.get('default_join', fallback_join))
+            # The trailing empty string makes the prefix end with the separator
+            prefix = default_join.join(phrases[:-max_phrase_join] + [six.u('')])
+
+        if whitespace:
+            # Pad the conjunction with a space on each side
+            padding = six.u(' ')
+            phrase = six.u('{}{}{}').format(padding, phrase, padding)
+
+        tail = phrase.join(phrases[-max_phrase_join:])
+        return six.u('').join([prefix, tail])
--- a/scripts/geodata/addresses/conscription_numbers.py
+++ b/scripts/geodata/addresses/conscription_numbers.py
@@ -0,0 +1,19 @@
+import random
+
+from geodata.addresses.config import address_config
+from geodata.addresses.numbering import NumberedComponent
+from geodata.encoding import safe_decode
+
+
+class ConscriptionNumber(NumberedComponent):
+    '''Phrases for conscription numbers ('conscription_numbers.alphanumeric').'''
+    @classmethod
+    def phrase(cls, number, language, country=None):
+        '''
+        Build a phrase around a conscription number.
+
+        Returns None when number is None; otherwise the result of
+        numeric_phrase using the 'house_numbers' dictionary.
+        '''
+        if number is None:
+            return number
+
+        key = 'conscription_numbers.alphanumeric'
+        dictionaries = ['house_numbers']
+
+        return cls.numeric_phrase(key, safe_decode(number), language,
+                                  dictionaries=dictionaries, country=country)
--- a/scripts/geodata/addresses/dependencies.py
+++ b/scripts/geodata/addresses/dependencies.py
@@ -0,0 +1,42 @@
+import operator
+import six
+
+from geodata.graph.topsort import topsort
+
+
+class ComponentDependencies(object):
+    '''
+    Declare an address component and its dependencies e.g.
+    a house_number cannot be used in the absence of a road name.
+
+    graph maps each component name to an iterable of the component names it
+    depends on. Dependencies are stored as integer bitsets; a component with
+    no dependencies gets a bitset with the lowest len(graph) bits set.
+    '''
+
+    # name -> single-bit value. Stored on the class, so bit positions are
+    # shared by every instance and assigned in first-seen order.
+    component_bit_values = {}
+
+    def __init__(self, graph):
+        self.dependencies = {}
+
+        # Lowest len(graph) bits set; 0 for an empty graph
+        self.all_values = (1 << len(graph)) - 1
+
+        self.dependency_order = list(topsort(graph))
+
+        for component, deps in six.iteritems(graph):
+            self.dependencies[component] = self.component_bitset(deps) if deps else self.all_values
+
+    def __getitem__(self, key):
+        '''Return the dependency bitset for a component (KeyError if unknown).'''
+        return self.dependencies.__getitem__(key)
+
+    def __contains__(self, key):
+        '''True when a component was declared in the graph.'''
+        return self.dependencies.__contains__(key)
+
+    @classmethod
+    def get_component_bit_value(cls, name):
+        '''Return the bit assigned to name, allocating the next free bit if needed.'''
+        val = cls.component_bit_values.get(name)
+        if val is None:
+            num_values = len(cls.component_bit_values)
+            val = 1 << num_values
+            cls.component_bit_values[name] = val
+        return val
+
+    @classmethod
+    def component_bitset(cls, components):
+        '''OR together the bits of all components; 0 when components is empty.'''
+        # reduce is not a builtin on Python 3; functools has it on both 2 and 3
+        from functools import reduce
+        return reduce(operator.or_, [cls.get_component_bit_value(name) for name in components], 0)
--- a/scripts/geodata/addresses/directions.py
+++ b/scripts/geodata/addresses/directions.py
@@ -0,0 +1,37 @@
+from geodata.addresses.config import address_config
+from geodata.addresses.numbering import NumericPhrase
+from geodata.math.sampling import weighted_choice
+
+
+class RelativeDirection(NumericPhrase):
+    '''
+    NumericPhrase configured with the 'directions' config key and the
+    'unit_directions' dictionary.
+    '''
+    key = 'directions'
+    dictionaries = ['unit_directions']
+
+
+class AnteroposteriorDirection(RelativeDirection):
+    '''RelativeDirection restricted to the 'directions.anteroposterior' config key.'''
+    key = 'directions.anteroposterior'
+
+
+class LateralDirection(RelativeDirection):
+    '''RelativeDirection restricted to the 'directions.lateral' config key.'''
+    key = 'directions.lateral'
+
+
+class CardinalDirection(NumericPhrase):
+    '''
+    NumericPhrase configured with the 'cardinal_directions' config key and
+    the 'cardinal_directions' dictionary.
+    '''
+    key = 'cardinal_directions'
+    dictionaries = ['cardinal_directions']
+
+
+class Direction(object):
+    '''Picks between a cardinal and a relative direction phrase at random.'''
+    CARDINAL = 'cardinal'
+    RELATIVE = 'relative'
+
+    @classmethod
+    def random(cls, language, country=None, cardinal_proability=0.5):
+        '''
+        Return a direction phrase for language/country.
+
+        cardinal_proability (sic) is the probability of a cardinal direction;
+        otherwise a relative direction is used.
+        '''
+        # Two-entry CDF: [P(cardinal), 1.0]
+        kind = weighted_choice([cls.CARDINAL, cls.RELATIVE],
+                               [cardinal_proability, 1.0])
+        phrase_cls = CardinalDirection if kind == cls.CARDINAL else RelativeDirection
+        return phrase_cls.phrase(None, language, country=country)
--- a/scripts/geodata/addresses/entrances.py
+++ b/scripts/geodata/addresses/entrances.py
@@ -0,0 +1,66 @@
+import random
+import six
+
+from geodata.addresses.config import address_config
+from geodata.addresses.numbering import NumberedComponent
+from geodata.encoding import safe_decode
+
+from geodata.configs.utils import nested_get
+from geodata.addresses.directions import RelativeDirection
+from geodata.addresses.floors import Floor
+from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
+from geodata.encoding import safe_decode
+from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
+
+
+class Entrance(NumberedComponent):
+    '''
+    Random entrance identifiers (numeric, hyphenated, alphabetic or mixed)
+    and their phrases, driven by the 'entrances.alphanumeric' config key.
+    '''
+    # Highest entrance number that can be sampled
+    max_entrances = 10
+
+    # Candidate numbers 1..max_entrances with Zipfian (exponent 2.0) weights,
+    # stored as a CDF for weighted_choice
+    entrance_range = range(1, max_entrances + 1)
+    entrance_range_probs = zipfian_distribution(len(entrance_range), 2.0)
+    entrance_range_cdf = cdf(entrance_range_probs)
+
+    @classmethod
+    def random(cls, language, country=None):
+        '''
+        Sample an entrance identifier for the given language/country.
+
+        Returns None when no alphanumeric type could be chosen, or when the
+        chosen type is not one of the types handled below.
+        '''
+        num_type, num_type_props = cls.choose_alphanumeric_type('entrances.alphanumeric', language, country=country)
+        if num_type is None:
+            return None
+
+        if num_type == cls.NUMERIC:
+            return safe_decode(weighted_choice(cls.entrance_range, cls.entrance_range_cdf))
+
+        if num_type == cls.HYPHENATED_NUMBER:
+            # Second number is the first plus another sampled offset (>= 1)
+            start = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
+            end = start + weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
+            return u'{}-{}'.format(start, end)
+
+        # Every remaining type needs a letter
+        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
+        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
+        # With probability (1 - alphabet_probability) fall back to Latin letters
+        if alphabet_probability is not None and random.random() >= alphabet_probability:
+            alphabet = latin_alphabet
+        letter = sample_alphabet(alphabet, 2.0)
+
+        if num_type == cls.ALPHA:
+            return safe_decode(letter)
+
+        number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
+
+        whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
+        hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
+
+        # One draw decides between a space, a hyphen or no separator
+        r = random.random()
+        if r < whitespace_probability:
+            separator = u' '
+        elif r < (whitespace_probability + hyphen_probability):
+            separator = u'-'
+        else:
+            separator = u''
+
+        if num_type == cls.ALPHA_PLUS_NUMERIC:
+            return six.u('{}{}{}').format(letter, separator, number)
+        if num_type == cls.NUMERIC_PLUS_ALPHA:
+            return six.u('{}{}{}').format(number, separator, letter)
+        return None
+
+    @classmethod
+    def phrase(cls, entrance, language, country=None):
+        '''
+        Build a phrase around an entrance identifier using the 'entrances'
+        dictionary. Returns None when entrance is None.
+        '''
+        if entrance is not None:
+            return cls.numeric_phrase('entrances.alphanumeric', entrance, language,
+                                      dictionaries=['entrances'], country=country)
+        return None
--- a/scripts/geodata/addresses/floors.py
+++ b/scripts/geodata/addresses/floors.py
@@ -0,0 +1,165 @@
+import random
+import six
+
+from geodata.addresses.config import address_config
+
+from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
+from geodata.encoding import safe_decode
+from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
+from geodata.numbers.spellout import numeric_expressions
+
+
+class Floor(NumberedComponent):
+    '''
+    Random floor/level numbers and the phrases built around them, driven by
+    the 'levels.*' address config keys.
+    '''
+    # When we don't know the number of floors, use a Zipfian distribution
+    # to choose randomly between 1 and max_floors with 1 being much more
+    # likely than 2, etc.
+    max_floors = 10
+    max_basements = 2
+    # 0..max_floors followed by -1..-max_basements; list() keeps the
+    # concatenation valid where range() is not a list
+    numbered_floors = list(range(max_floors + 1)) + list(range(-1, -max_basements - 1, -1))
+    floor_probs = zipfian_distribution(len(numbered_floors), 0.75)
+    floor_probs_cdf = cdf(floor_probs)
+
+    # For use with letters e.g. A0 is probably not as common
+    floors_letters = list(range(1, max_floors + 1)) + [0]
+    floors_letters_probs = zipfian_distribution(len(floors_letters), 2.0)
+    floors_letters_cdf = cdf(floors_letters_probs)
+
+    @classmethod
+    def sample_floors(cls, num_floors, num_basements=0):
+        '''
+        Uniformly sample an integer in [-num_basements, num_floors - 1]
+        (upper bound 0 when num_floors is not positive).
+        '''
+        num_floors = int(num_floors)
+        return random.randint(-num_basements, (num_floors - 1) if num_floors > 0 else 0)
+
+    @classmethod
+    def sample_floors_range(cls, min_floor, max_floor):
+        '''
+        Uniformly sample an integer in [min_floor, max_floor - 1], or return
+        min_floor when max_floor is not greater than min_floor.
+        '''
+        return random.randint(min_floor, (max_floor - 1) if max_floor > min_floor else min_floor)
+
+    @classmethod
+    def random_int(cls, language, country=None, num_floors=None, num_basements=None):
+        '''
+        Sample an integer floor number.
+
+        With a usable num_floors the sample is uniform: over the whole
+        building when it has at most max_floors floors, otherwise over the
+        floors above max_floors. Without one (or when it cannot be converted
+        to int) the Zipfian numbered_floors distribution is used.
+        language and country are not used here.
+        '''
+        number = None
+        if num_floors is not None:
+            try:
+                num_floors = int(num_floors)
+            except (ValueError, TypeError):
+                return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)
+
+            if num_floors <= cls.max_floors:
+                number = cls.sample_floors(num_floors, num_basements=num_basements or 0)
+            else:
+                number = cls.sample_floors_range(cls.max_floors + 1, num_floors)
+
+        else:
+            number = weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)
+
+        return number
+
+    @classmethod
+    def random_from_int(cls, number, language, country=None):
+        '''
+        Render an integer floor as a string in a randomly chosen format
+        (numeric, roman numeral, hyphenated range, letter, or letter+number).
+
+        Non-negative numbers are shifted by 'levels.numbering_starts_at'.
+        For the letter+number formats the number is re-sampled from
+        floors_letters rather than taken from the argument.
+
+        Returns None when no alphanumeric type could be chosen or the chosen
+        type is not handled here.
+        '''
+        num_type, num_type_props = cls.choose_alphanumeric_type('levels.alphanumeric', language, country=country)
+        if num_type is None:
+            return None
+
+        numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
+
+        if number >= 0:
+            number += numbering_starts_at
+
+        if num_type == cls.NUMERIC:
+            return safe_decode(number)
+        elif num_type == cls.ROMAN_NUMERAL:
+            roman_numeral = numeric_expressions.roman_numeral(number)
+            if roman_numeral is not None:
+                return roman_numeral
+            else:
+                # No roman numeral available: fall back to plain digits
+                return safe_decode(number)
+        elif num_type == cls.HYPHENATED_NUMBER:
+            # sample_floors_range is a classmethod, so it must be called via cls
+            number2 = number + cls.sample_floors_range(1, cls.max_floors)
+            return u'{}-{}'.format(number, number2)
+        else:
+            alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
+            alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
+            # With probability (1 - alphabet_probability) fall back to Latin letters
+            if alphabet_probability is not None and random.random() >= alphabet_probability:
+                alphabet = latin_alphabet
+            letter = sample_alphabet(alphabet)
+            if num_type == cls.ALPHA:
+                return letter
+            else:
+                number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)
+
+                if num_type == cls.ALPHA_PLUS_NUMERIC:
+                    return six.u('{}{}').format(letter, number)
+                elif num_type == cls.NUMERIC_PLUS_ALPHA:
+                    return six.u('{}{}').format(number, letter)
+
+        return None
+
+    @classmethod
+    def random(cls, language, country=None, num_floors=None, num_basements=None):
+        '''Sample an integer floor with random_int and render it with random_from_int.'''
+        number = cls.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
+        return cls.random_from_int(number, language, country=country)
+
+    @classmethod
+    def phrase(cls, floor, language, country=None, num_floors=None):
+        '''
+        Build a phrase for a floor value.
+
+        Non-numeric floors go straight to the 'levels.alphanumeric' phrase.
+        Numeric floors are first matched against 'levels.aliases' (half
+        floors, floors below -1, the top floor, or an exact floor number);
+        when an alias matches, its phrase is used instead.
+
+        Returns None when floor is None.
+        '''
+        if floor is None:
+            return None
+
+        integer_floor = False
+        floor = safe_decode(floor)
+        try:
+            floor = int(floor)
+            integer_floor = True
+        except (ValueError, TypeError):
+            try:
+                floor = float(floor)
+                integer_floor = int(floor) == floor
+            except (ValueError, TypeError):
+                # Not a number at all (e.g. a letter)
+                return cls.numeric_phrase('levels.alphanumeric', floor, language,
+                                          dictionaries=['level_types_numbered'], country=country)
+
+        numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
+        try:
+            num_floors = int(num_floors)
+            top_floor = num_floors if numbering_starts_at == 1 else num_floors - 1
+            is_top = num_floors and floor == top_floor
+        except (ValueError, TypeError):
+            # num_floors unknown or unusable
+            is_top = False
+
+        alias_prefix = 'levels.aliases'
+        aliases = address_config.get_property(alias_prefix, language, country=country)
+        if aliases:
+            alias = None
+
+            if not integer_floor and floor >= 0 and 'half_floors' in aliases:
+                floor = int(floor)
+                alias = 'half_floors'
+            elif not integer_floor and floor < 0 and 'half_floors_negative' in aliases:
+                floor = int(floor)
+                alias = 'half_floors_negative'
+            elif floor < -1 and '<-1' in aliases:
+                alias = '<-1'
+            elif is_top and 'top' in aliases:
+                alias = 'top'
+            elif safe_decode(floor) in aliases:
+                alias = safe_decode(floor)
+
+            floor = safe_decode(floor)
+
+            if alias:
+                alias_props = aliases.get(alias)
+
+                # Aliases upon aliases, e.g. for something like "Upper Mezzanine"
+                # where it's an alias for "1" under the half_floors key
+                if safe_decode(floor) in alias_props.get('aliases', {}):
+                    alias_prefix = '{}.{}.aliases'.format(alias_prefix, alias)
+                    alias = safe_decode(floor)
+
+            if alias:
+                return cls.numeric_phrase('{}.{}'.format(alias_prefix, alias), floor, language,
+                                          dictionaries=['level_types_basement',
+                                                        'level_types_mezzanine',
+                                                        'level_types_numbered',
+                                                        'level_types_standalone',
+                                                        'level_types_sub_basement'],
+                                          country=country)
+
+        return cls.numeric_phrase('levels.alphanumeric', floor, language,
+                                  dictionaries=['level_types_numbered'], country=country)
--- a/scripts/geodata/addresses/house_numbers.py
+++ b/scripts/geodata/addresses/house_numbers.py
@@ -0,0 +1,26 @@
+import random
+
+from geodata.addresses.config import address_config
+from geodata.addresses.numbering import NumberedComponent
+from geodata.encoding import safe_decode
+
+
+class HouseNumber(NumberedComponent):
+    '''Phrases for house numbers, including the "no number" case.'''
+    @classmethod
+    def phrase(cls, number, language, country=None):
+        '''
+        Maybe wrap a house number in a phrase.
+
+        With a number, the 'house_numbers.alphanumeric' phrase is used with
+        probability 'house_numbers.alphanumeric_phrase_probability';
+        otherwise the decoded number is returned unchanged. With number=None
+        the 'house_numbers.no_number' phrase is used with probability
+        'house_numbers.no_number_probability'; otherwise None is returned.
+        '''
+        if number is None:
+            prob_key = 'house_numbers.no_number_probability'
+            key = 'house_numbers.no_number'
+            dictionaries = ['no_number']
+            default = None
+        else:
+            prob_key = 'house_numbers.alphanumeric_phrase_probability'
+            key = 'house_numbers.alphanumeric'
+            dictionaries = ['house_numbers', 'number']
+            default = safe_decode(number)
+
+        phrase_prob = address_config.get_property(prob_key, language, country=country, default=0.0)
+        if not (random.random() < phrase_prob):
+            return default
+
+        return cls.numeric_phrase(key, safe_decode(number), language,
+                                  dictionaries=dictionaries, country=country)
--- a/scripts/geodata/addresses/metro_stations.py
+++ b/scripts/geodata/addresses/metro_stations.py
@@ -0,0 +1,24 @@
+from geodata.addresses.config import address_config
+
+import random
+
+from geodata.addresses.config import address_config
+from geodata.addresses.numbering import NumericPhrase 
+from geodata.encoding import safe_decode
+
+
+class MetroStationPhrase(NumericPhrase):
+    '''
+    NumericPhrase configured with the 'metro_stations.alphanumeric' config
+    key and the 'qualifiers' dictionary.
+    '''
+    key = 'metro_stations.alphanumeric'
+    dictionaries = ['qualifiers']
+
+
+class MetroStation(object):
+    '''Optionally decorates a metro station name with a phrase.'''
+    @classmethod
+    def phrase(cls, station, language, country=None):
+        '''
+        Return a MetroStationPhrase for station with probability
+        'metro_stations.alphanumeric_phrase_probability' (default 0.0).
+
+        Returns None when station is None or when no phrase is generated.
+        '''
+        if station is None:
+            return None
+
+        phrase_prob = address_config.get_property('metro_stations.alphanumeric_phrase_probability', language, country=country, default=0.0)
+        use_phrase = random.random() < phrase_prob
+        return MetroStationPhrase.phrase(station, language, country=country) if use_phrase else None
--- a/scripts/geodata/addresses/numbering.py
+++ b/scripts/geodata/addresses/numbering.py
@@ -0,0 +1,434 @@
+# -*- coding: utf-8 -*-
+import random
+import six
+
+from geodata.addresses.config import address_config
+from geodata.encoding import safe_decode
+from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
+from geodata.math.floats import isclose
+from geodata.numbers.ordinals import ordinal_expressions
+from geodata.numbers.spellout import numeric_expressions
+from geodata.text.tokenize import tokenize, token_types
+
+alphabets = {}
+
+
+def sample_alphabet(alphabet, b=1.5):
+    '''
+    Sample an "alphabet" using a Zipfian distribution (frequent items are very
+    frequent, long tail of infrequent items). If we look at something like
+    unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
+    "Unit Z" simply because most dwellings only have a few units. Sampling
+    letters from a Zipfian distribution rather than uniformly means that instead
+    of every letter having the same likelihood (1/26), letters toward the beginning
+    of the alphabet are much more likely to be selected. Letters toward the end can
+    still be selected sometimes, but are not very likely.
+
+    Note letters don't necessarily need to be sorted alphabetically, just in order
+    of frequency.
+    '''
+    global alphabets
+    alphabet = tuple(alphabet)
+    if alphabet not in alphabets:
+        probs = zipfian_distribution(len(alphabet), b)
+        probs_cdf = cdf(probs)
+
+        alphabets[alphabet] = probs_cdf
+
+    probs_cdf = alphabets[alphabet]
+    return weighted_choice(alphabet, probs_cdf)
+
+# The 26 uppercase letters 'A'..'Z', in alphabetical order
+latin_alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
+
+
+class Digits(object):
+    ASCII = 'ascii'
+    SPELLOUT = 'spellout'
+    UNICODE_FULL_WIDTH = 'unicode_full_width'
+    ROMAN_NUMERAL = 'roman_numeral'
+
+    CARDINAL = 'cardinal'
+    ORDINAL = 'ordinal'
+
+    unicode_full_width_map = {
+        '0': safe_decode('０'),
+        '1': safe_decode('１'),
+        '2': safe_decode('２'),
+        '3': safe_decode('３'),
+        '4': safe_decode('４'),
+        '5': safe_decode('５'),
+        '6': safe_decode('６'),
+        '7': safe_decode('７'),
+        '8': safe_decode('８'),
+        '9': safe_decode('９'),
+    }
+
+    full_width_digit_map = {
+        v: k for k, v in six.iteritems(unicode_full_width_map)
+    }
+
+    @classmethod
+    def rewrite_full_width(cls, s):
+        return six.u('').join([cls.unicode_full_width_map.get(c, c) for c in s])
+
+    @classmethod
+    def rewrite_standard_width(cls, s):
+        return six.u('').join([cls.full_width_digit_map.get(c, c) for c in s])
+
+    @classmethod
+    def rewrite_roman_numeral(cls, s):
+        roman_numeral = None
+        if s.isdigit():
+            roman_numeral = numeric_expressions.roman_numeral(s)
+
+        if roman_numeral:
+            return roman_numeral
+        else:
+            return s
+
+    @classmethod
+    def rewrite_spellout(cls, s, lang, num_type, props):
+        if s.isdigit():
+            num = int(s)
+            spellout = None
+            gender = props.get('gender')
+            category = props.get('category')
+
+            if num_type == cls.CARDINAL:
+                spellout = numeric_expressions.spellout_cardinal(num, lang, gender=gender, category=category)
+            elif num_type == cls.ORDINAL:
+                spellout = numeric_expressions.spellout_ordinal(num, lang, gender=gender, category=category)
+
+            if spellout:
+                return spellout.title()
+            return s
+        else:
+            return s
+
+    @classmethod
+    def rewrite(cls, d, lang, props, num_type=CARDINAL):
+        if not props:
+            return d
+
+        d = safe_decode(d)
+
+        values = []
+        probs = []
+
+        for digit_type in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL):
+            key = '{}_probability'.format(digit_type)
+            if key in props:
+                values.append(digit_type)
+                probs.append(props[key])
+
+        if not isclose(sum(probs), 1.0):
+            values.append(cls.ASCII)
+            probs.append(1.0 - sum(probs))
+
+        probs = cdf(probs)
+        digit_type = weighted_choice(values, probs)
+
+        if digit_type == cls.ASCII:
+            return d
+        elif digit_type == cls.SPELLOUT:
+            return cls.rewrite_spellout(d, lang, num_type, props)
+        elif digit_type == cls.ROMAN_NUMERAL:
+            roman_numeral = cls.rewrite_roman_numeral(d)
+            if random.random() < props.get('ordinal_suffix_probability', 0.0):
+                ordinal_suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
+                if ordinal_suffix:
+                    roman_numeral = six.u('{}{}').format(roman_numeral, ordinal_suffix)
+            return roman_numeral
+        elif digit_type == cls.UNICODE_FULL_WIDTH:
+            return cls.rewrite_full_width(d)
+        else:
+            return d
+
+
+class NumericPhrase(object):
+    key = None
+
+    NUMERIC = 'numeric'
+    NUMERIC_AFFIX = 'numeric_affix'
+
+    @classmethod
+    def pick_phrase_and_type(cls, number, language, country=None):
+        values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
+        if not values:
+            return None, safe_decode(number) if number is not None else None, None
+
+        phrase, phrase_props = weighted_choice(values, probs)
+
+        values = []
+        probs = []
+
+        for num_type in (cls.NUMERIC, cls.NUMERIC_AFFIX):
+            key = '{}_probability'.format(num_type)
+            prob = phrase_props.get(key, None)
+            if prob is not None:
+                values.append(num_type)
+                probs.append(prob)
+
+        if not probs:
+            num_type = cls.NUMERIC
+        else:
+            probs = cdf(probs)
+            num_type = weighted_choice(values, probs)
+
+        return num_type, phrase, phrase_props[num_type]
+
+    @classmethod
+    def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
+
+        if num_type == cls.NUMERIC_AFFIX:
+            phrase = props['affix']
+            if 'zero_pad' in props and number.isdigit():
+                number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))
+
+        direction = props['direction']
+        whitespace = props.get('whitespace', whitespace_default)
+        whitespace_probability = props.get('whitespace_probability')
+        if whitespace_probability is not None:
+            whitespace = random.random() < whitespace_probability
+
+        if props.get('title_case', True):
+            # Title case unless the config specifies otherwise
+            phrase = phrase.title()
+
+        if number is None:
+            return phrase
+
+        whitespace_phrase = six.u(' ') if whitespace else six.u('')
+        # Phrase goes to the left of hte number
+        if direction == 'left':
+            return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
+        # Phrase goes to the right of the number
+        elif direction == 'right':
+            return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
+        # Need to specify a direction, otherwise return naked number
+        else:
+            return safe_decode(number)
+
+    @classmethod
+    def phrase(cls, number, language, country=None):
+        num_type, phrase, props = cls.pick_phrase_and_type(number, language, country=country)
+        whitespace_default = num_type == cls.NUMERIC
+        return cls.combine_with_number(number, phrase, num_type, props, whitespace_default=whitespace_default)
+
+
+class Number(NumericPhrase):
+    key = 'numbers'
+    dictionaries = ['number']
+
+
+class NumberedComponent(object):
+    NUMERIC = 'numeric'
+    ALPHA = 'alpha'
+    ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric'
+    NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha'
+    HYPHENATED_NUMBER = 'hyphenated_number'
+    ROMAN_NUMERAL = 'roman_numeral'
+
+    @classmethod
+    def choose_alphanumeric_type(cls, key, language, country=None):
+        alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
+        if alphanumeric_props is None:
+            return None, None
+
+        values = []
+        probs = []
+
+        for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.HYPHENATED_NUMBER, cls.ROMAN_NUMERAL):
+            key = '{}_probability'.format(num_type)
+            prob = alphanumeric_props.get(key)
+            if prob is not None:
+                values.append(num_type)
+                probs.append(prob)
+
+        if not values:
+            return None, None
+
+        probs = cdf(probs)
+        num_type = weighted_choice(values, probs)
+        num_type_props = alphanumeric_props.get(num_type, {})
+
+        return num_type, num_type_props
+
+    @classmethod
+    def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
+        has_alpha = False
+        has_numeric = True
+        is_integer = False
+        is_none = False
+        if num is not None:
+            try:
+                num_int = int(num)
+                is_integer = True
+            except ValueError:
+                try:
+                    num_float = float(num)
+                except ValueError:
+                    tokens = tokenize(safe_decode(num))
+                    has_numeric = False
+                    for t, c in tokens:
+                        if c == token_types.NUMERIC:
+                            has_numeric = True
+                        if any((ch.isalpha() for ch in t)):
+                            has_alpha = True
+
+                    if strict_numeric and has_alpha:
+                        return safe_decode(num)
+
+        else:
+            is_none = True
+
+        values, probs = None, None
+
+        if is_alpha:
+            values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)
+
+        # Pick a phrase given the probability distribution from the config
+        if values is None:
+            values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)
+
+        if not values:
+            return safe_decode(num) if not is_none else None
+
+        phrase, phrase_props = weighted_choice(values, probs)
+
+        values = []
+        probs = []
+
+        # Dictionaries are lowercased, so title case here
+        if phrase_props.get('title_case', True):
+            phrase = phrase.title()
+
+        '''
+        There are a few ways we can express the number itself
+
+        1. Alias it as some standalone word like basement (for floor "-1")
+        2. Use the number itself, so "Floor 2"
+        3. Append/prepend an affix e.g. 2/F for second floor
+        4. As an ordinal expression e.g. "2nd Floor"
+        '''
+        have_standalone = False
+        have_null = False
+        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
+            key = '{}_probability'.format(num_type)
+            prob = phrase_props.get(key)
+            if prob is not None:
+                if num_type == 'standalone':
+                    have_standalone = True
+                elif num_type == 'null':
+                    have_null = True
+                values.append(num_type)
+                probs.append(prob)
+            elif num_type in phrase_props:
+                values.append(num_type)
+                probs.append(1.0)
+                break
+
+        if not probs or is_none:
+            return phrase
+
+        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
+        if has_alpha:
+            values, probs = zip(*[(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')])
+            total = float(sum(probs))
+            if isclose(total, 0.0):
+                return None
+
+            probs = [p / total for p in probs]
+
+        probs = cdf(probs)
+
+        if len(values) < 2:
+            if have_standalone:
+                num_type = 'standalone'
+            elif have_null:
+                num_type = 'null'
+            else:
+                num_type = 'numeric'
+        else:
+            num_type = weighted_choice(values, probs)
+
+        if num_type == 'standalone':
+            return phrase
+        elif num_type == 'null':
+            return safe_decode(num)
+
+        props = phrase_props[num_type]
+
+        if is_integer:
+            num_int = int(num)
+            if phrase_props.get('number_abs_value', False):
+                num_int = abs(num_int)
+                num = num_int
+
+            if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
+                return None
+
+            if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
+                return None
+
+            if phrase_props.get('number_subtract_abs_value'):
+                num_int -= phrase_props['number_subtract_abs_value']
+                num = num_int
+
+        num = safe_decode(num)
+        digits_props = props.get('digits')
+        if digits_props:
+            # Inherit the gender and category e.g. for ordinals
+            for k in ('gender', 'category'):
+                if k in props:
+                    digits_props[k] = props[k]
+            num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)
+
+        # Do we add the numeric phrase e.g. Floor No 1
+        add_number_phrase = props.get('add_number_phrase', False)
+        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
+            num = Number.phrase(num, language, country=country)
+
+        whitespace_default = True
+
+        if num_type == 'numeric_affix':
+            phrase = props['affix']
+            if props.get('upper_case', True):
+                phrase = phrase.upper()
+            if 'zero_pad' in props and num.isdigit():
+                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
+            whitespace_default = False
+        elif num_type == 'ordinal' and safe_decode(num).isdigit():
+            ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))
+
+            if ordinal_expression is not None:
+                num = ordinal_expression
+
+        if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
+            if random.random() < props['null_phrase_probability']:
+                return num
+
+        direction = props['direction']
+        whitespace = props.get('whitespace', whitespace_default)
+
+        whitespace_probability = props.get('whitespace_probability')
+        if whitespace_probability is not None:
+            whitespace = random.random() < whitespace_probability
+
+        # Occasionally switch up if direction_probability is specified
+        if random.random() > props.get('direction_probability', 1.0):
+            if direction == 'left':
+                direction = 'right'
+            elif direction == 'right':
+                direction = 'left'
+
+        whitespace_phrase = six.u(' ') if whitespace else six.u('')
+        # Phrase goes to the left of hte number
+        if direction == 'left':
+            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
+        # Phrase goes to the right of the number
+        elif direction == 'right':
+            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
+        # Need to specify a direction, otherwise return naked number
+        else:
+            return safe_decode(num)
--- a/scripts/geodata/addresses/po_boxes.py
+++ b/scripts/geodata/addresses/po_boxes.py
@@ -0,0 +1,76 @@
+import random
+import six
+
+from geodata.addresses.config import address_config
+from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
+from geodata.encoding import safe_decode
+from geodata.math.sampling import cdf, weighted_choice
+
+
+class POBox(NumberedComponent):
+    @classmethod
+    def random_digits(cls, num_digits):
+        # Note: PO Boxes can have leading zeros but not important for the parser
+        # since it only cares about how many digits there are in a number
+        low = 10 ** (num_digits - 1)
+        high = (10 ** num_digits) - 1
+
+        return random.randint(low, high)
+
+    @classmethod
+    def random_digits_with_prefix(cls, num_digits, prefix=six.u('')):
+        return six.u('').join([prefix, safe_decode(cls.random_digits(num_digits))])
+
+    @classmethod
+    def random_digits_with_suffix(cls, num_digits, suffix=six.u('')):
+        return six.u('').join([safe_decode(cls.random_digits(num_digits)), suffix])
+
+    @classmethod
+    def random_letter(cls, language, country=None):
+        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
+        return sample_alphabet(alphabet)
+
+    @classmethod
+    def random(cls, language, country=None):
+        num_type, num_type_props = cls.choose_alphanumeric_type('po_boxes.alphanumeric', language, country=country)
+        if num_type is None:
+            return None
+
+        if num_type != cls.ALPHA:
+            digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[])
+            values = []
+            probs = []
+
+            for val in digit_config:
+                values.append(val['length'])
+                probs.append(val['probability'])
+
+            probs = cdf(probs)
+
+            num_digits = weighted_choice(values, probs)
+
+            digits = cls.random_digits(num_digits)
+            number = Digits.rewrite(digits, language, num_type_props)
+
+
+            if num_type == cls.NUMERIC:
+                return safe_decode(number)
+            else:
+                letter = cls.random_letter(language, country=country)
+
+                whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
+                whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')
+
+                if num_type == cls.ALPHA_PLUS_NUMERIC:
+                    return six.u('{}{}{}').format(letter, whitespace_phrase, number)
+                elif num_type == cls.NUMERIC_PLUS_ALPHA:
+                    return six.u('{}{}{}').format(number, whitespace_phrase, letter)
+        else:
+            return cls.random_letter(language, country=country)
+
+    @classmethod
+    def phrase(cls, box_number, language, country=None):
+        if box_number is None:
+            return None
+        return cls.numeric_phrase('po_boxes.alphanumeric', safe_decode(box_number), language,
+                                  dictionaries=['post_office'], country=country)
--- a/scripts/geodata/addresses/postcodes.py
+++ b/scripts/geodata/addresses/postcodes.py
@@ -0,0 +1,11 @@
+from geodata.addresses.numbering import NumberedComponent
+from geodata.encoding import safe_decode
+
+
+class PostCode(NumberedComponent):
+    @classmethod
+    def phrase(cls, postcode, language, country=None):
+        if postcode is None:
+            return None
+        return cls.numeric_phrase('postcodes.alphanumeric', postcode, language,
+                                  dictionaries=['postcodes'], country=country)
--- a/scripts/geodata/addresses/staircases.py
+++ b/scripts/geodata/addresses/staircases.py
@@ -0,0 +1,66 @@
+import random
+import six
+
+from geodata.addresses.config import address_config
+from geodata.addresses.numbering import NumberedComponent
+from geodata.encoding import safe_decode
+
+from geodata.configs.utils import nested_get
+from geodata.addresses.directions import RelativeDirection
+from geodata.addresses.floors import Floor
+from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
+from geodata.encoding import safe_decode
+from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
+
+
+class Staircase(NumberedComponent):
+    max_staircases = 10
+
+    staircase_range = range(1, max_staircases + 1)
+    staircase_range_probs = zipfian_distribution(len(staircase_range), 2.0)
+    staircase_range_cdf = cdf(staircase_range_probs)
+
+    @classmethod
+    def random(cls, language, country=None):
+        num_type, num_type_props = cls.choose_alphanumeric_type('staircases.alphanumeric', language, country=country)
+        if num_type is None:
+            return None
+
+        if num_type == cls.NUMERIC:
+            number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
+            return safe_decode(number)
+        elif num_type == cls.HYPHENATED_NUMBER:
+            number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
+            number2 = number + weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
+            return u'{}-{}'.format(number, number2)
+        else:
+            alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
+            alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
+            if alphabet_probability is not None and random.random() >= alphabet_probability:
+                alphabet = latin_alphabet
+            letter = sample_alphabet(alphabet, 2.0)
+            if num_type == cls.ALPHA:
+                return safe_decode(letter)
+            else:
+                number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
+
+                whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
+                hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
+                whitespace_phrase = u''
+                r = random.random()
+                if r < whitespace_probability:
+                    whitespace_phrase = u' '
+                elif r < (whitespace_probability + hyphen_probability):
+                    whitespace_phrase = u'-'
+
+                if num_type == cls.ALPHA_PLUS_NUMERIC:
+                    return six.u('{}{}{}').format(letter, whitespace_phrase, number)
+                elif num_type == cls.NUMERIC_PLUS_ALPHA:
+                    return six.u('{}{}{}').format(number, whitespace_phrase, letter)
+
+    @classmethod
+    def phrase(cls, staircase, language, country=None):
+        if staircase is None:
+            return None
+        return cls.numeric_phrase('staircases.alphanumeric', staircase, language,
+                                  dictionaries=['staircases'], country=country)
--- a/scripts/geodata/addresses/units.py
+++ b/scripts/geodata/addresses/units.py
@@ -0,0 +1,285 @@
+import itertools
+import random
+import six
+
+from geodata.addresses.config import address_config
+from geodata.addresses.directions import RelativeDirection, LateralDirection, AnteroposteriorDirection
+from geodata.addresses.floors import Floor
+from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
+from geodata.configs.utils import nested_get
+from geodata.encoding import safe_decode
+from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
+from geodata.text.utils import is_numeric_strict
+
+
+class Unit(NumberedComponent):
+    # When we don't know the number of units, use a Zipfian distribution
+    # to choose randomly between 1 and max_units with 1 being much more
+    # likely than 2, etc.
+    max_units = 99
+    max_basements = 2
+
+    hundreds_numbered_units_tens = [range(101, 110) + [100],
+                                    range(201, 210) + [200],
+                                    range(301, 310) + [300],
+                                    range(401, 410) + [400],
+                                    range(501, 510) + [500],
+                                    ]
+
+    hundreds_numbered_units = [range(110, 200),
+                               range(210, 300),
+                               range(310, 400),
+                               range(410, 500),
+                               range(510, 600),
+                               ]
+
+    thousands_numbered_units = [range(1001, 1030) + [1000],
+                                range(2001, 2030) + [2000],
+                                range(3001, 3030) + [3000],
+                                range(4001, 4030) + [4000],
+                                range(5001, 5030) + [5000]
+                                ]
+
+    numbered_units = range(1, 10)
+    numbered_units.extend(itertools.chain(*itertools.izip(*hundreds_numbered_units_tens)))
+    numbered_units.extend(range(10, 100))
+    numbered_units.extend(itertools.chain(*itertools.izip(*hundreds_numbered_units)))
+    numbered_units.extend(itertools.chain(*itertools.izip(*thousands_numbered_units)))
+    numbered_units.extend(range(10001, 10100) + [10000])
+    numbered_units.append(0)
+    numbered_units.extend(range(0, -max_basements - 1, -1))
+
+    unit_probs = zipfian_distribution(len(numbered_units), 0.7)
+    unit_probs_cdf = cdf(unit_probs)
+
+    num_digits = [2, 3, 4]
+    num_digits_probs = zipfian_distribution(len(num_digits), 4.0)
+    num_digits_cdf = cdf(num_digits_probs)
+
+    # For use with floors e.g. #301 more common than #389
+    positive_units_floors = range(1, 10) + [0] + range(10, max_units + 1)
+    positive_units_floors_probs = zipfian_distribution(len(positive_units_floors), 0.6)
+    positive_units_floors_cdf = cdf(positive_units_floors_probs)
+
+    # For basic positive units
+    positive_units = range(1, max_units + 1)
+    positive_units_probs = zipfian_distribution(len(positive_units), 0.6)
+    positive_units_cdf = cdf(positive_units_probs)
+
+    # For use with letters e.g. A0 less common
+    positive_units_letters = range(1, max_units + 1) + [0]
+    positive_units_letters_probs = zipfian_distribution(len(positive_units_letters), 0.6)
+    positive_units_letters_cdf = cdf(positive_units_letters_probs)
+
+    RESIDENTIAL = 'residential'
+    COMMERCIAL = 'commercial'
+    INDUSTRIAL = 'industrial'
+    UNIVERSITY = 'university'
+
+    @classmethod
+    def sample_num_digits(cls):
+        return weighted_choice(cls.num_digits, cls.num_digits_cdf)
+
+    @classmethod
+    def for_floor(cls, floor_number, num_digits=None):
+        num_digits = num_digits if num_digits is not None else cls.sample_num_digits()
+        unit = weighted_choice(cls.positive_units_floors, cls.positive_units_floors_cdf)
+        return six.u('{}{}').format(floor_number, safe_decode(unit).zfill(num_digits))
+
+    @classmethod
+    def random(cls, language, country=None, num_floors=None, num_basements=None, floor=None):
+        num_type, num_type_props = cls.choose_alphanumeric_type('units.alphanumeric', language, country=country)
+        if num_type is None:
+            return None
+
+        use_floor_prob = address_config.get_property('units.alphanumeric.use_floor_probability', language, country=country, default=0.0)
+
+        use_positive_numbers_prob = address_config.get_property('units.alphanumeric.use_positive_numbers_probability', language, country=country, default=0.0)
+
+        if (num_floors is None and floor is None) or random.random() >= use_floor_prob:
+            if random.random() >= use_positive_numbers_prob:
+                number = weighted_choice(cls.numbered_units, cls.unit_probs_cdf)
+            else:
+                number = weighted_choice(cls.positive_units, cls.positive_units_cdf)
+        else:
+            if floor is None or not floor.isdigit():
+                floor = Floor.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
+
+            floor_numbering_starts_at = address_config.get_property('levels.numbering_starts_at', language, country=country, default=0)
+            ground_floor_starts_at = address_config.get_property('units.alphanumeric.use_floor_ground_starts_at', language, country=country, default=None)
+
+            if ground_floor_starts_at is not None:
+                try:
+                    floor = int(floor)
+                    if floor >= floor_numbering_starts_at:
+                        floor -= floor_numbering_starts_at
+                    floor += ground_floor_starts_at
+                    floor = safe_decode(floor)
+                except (TypeError, ValueError):
+                    pass
+
+            use_floor_affix_prob = address_config.get_property('units.alphanumeric.use_floor_numeric_affix_probability', language, country=country, default=0.0)
+            if use_floor_affix_prob and random.random() < use_floor_affix_prob:
+                floor_phrase = Floor.phrase(floor, language, country=country)
+                # Only works if the floor phrase is strictly numeric e.g. "1" or "H1"
+                if is_numeric_strict(floor_phrase):
+                    unit = weighted_choice(cls.positive_units, cls.positive_units_cdf)
+
+                    unit_num_digits = address_config.get_property('units.alphanumeric.use_floor_unit_num_digits', language, country=country, default=None)
+                    if unit_num_digits is not None:
+                        unit = safe_decode(unit).zfill(unit_num_digits)
+
+                    return six.u('{}{}').format(floor_phrase, unit)
+
+            floor_num_digits = address_config.get_property('units.alphanumeric.use_floor_floor_num_digits', language, country=country, default=None)
+            if floor_num_digits is not None and floor.isdigit():
+                floor = floor.zfill(floor_num_digits)
+
+            number = cls.for_floor(floor)
+
+        if num_type == cls.NUMERIC:
+            return safe_decode(number)
+        elif num_type == cls.HYPHENATED_NUMBER:
+            number2 = weighted_choice(cls.positive_units, cls.positive_units_cdf)
+            range_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.range_probability', language, country=country, default=0.5))
+            direction = address_config.get_property('units.alphanumeric.hyphenated_number.direction', language, country=country, default='right')
+            direction_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.direction_probability', language, country=country, default=0.0))
+
+            if random.random() < direction_prob:
+                direction = 'left' if direction == 'right' else 'right'
+
+            direction_right = direction == 'right'
+
+            if random.random() < range_prob:
+                if direction_right:
+                    number2 += number
+                else:
+                    number2 = max(0, number - number2)
+            if direction == 'right':
+                return u'{}-{}'.format(number, number2)
+            else:
+                return u'{}-{}'.format(number2, number)
+        else:
+            alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
+            alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
+            if alphabet_probability is not None and random.random() >= alphabet_probability:
+                alphabet = latin_alphabet
+            letter = sample_alphabet(alphabet)
+            if num_type == cls.ALPHA:
+                return safe_decode(letter)
+            else:
+                if num_floors is None:
+                    number = weighted_choice(cls.positive_units_letters, cls.positive_units_letters_cdf)
+
+                whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
+                hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
+                whitespace_phrase = u''
+                r = random.random()
+                if r < whitespace_probability:
+                    whitespace_phrase = u' '
+                elif r < (whitespace_probability + hyphen_probability):
+                    whitespace_phrase = u'-' 
+
+                if num_type == cls.ALPHA_PLUS_NUMERIC:
+                    return six.u('{}{}{}').format(letter, whitespace_phrase, number)
+                elif num_type == cls.NUMERIC_PLUS_ALPHA:
+                    return six.u('{}{}{}').format(number, whitespace_phrase, letter)
+
+    @classmethod
+    def add_direction(cls, key, unit, language, country=None):
+        add_direction_probability = address_config.get_property('{}.add_direction_probability'.format(key),
+                                                                language, country=country, default=0.0)
+        if not random.random() < add_direction_probability:
+            return unit
+        add_direction_numeric = address_config.get_property('{}.add_direction_numeric'.format(key),
+                                                            language, country=country)
+        try:
+            unit = int(unit)
+            integer_unit = True
+        except (ValueError, TypeError):
+            integer_unit = False
+
+        if add_direction_numeric and integer_unit:
+            return RelativeDirection.phrase(unit, language, country=country)
+        elif not integer_unit:
+            add_direction_standalone = address_config.get_property('{}.add_direction_standalone'.format(key),
+                                                                   language, country=country)
+            if add_direction_standalone:
+                return RelativeDirection.phrase(None, language, country=country)
+
+    @classmethod
+    def add_quadrant(cls, key, unit, language, country=None):
+        add_quadrant_probability = address_config.get_property('{}.add_quadrant_probability'.format(key),
+                                                               language, country=country, default=0.0)
+        if not random.random() < add_quadrant_probability:
+            return unit
+        add_quadrant_numeric = address_config.get_property('{}.add_quadrant_numeric'.format(key),
+                                                           language, country=country)
+        try:
+            unit = int(unit)
+            integer_unit = True
+        except (ValueError, TypeError):
+            integer_unit = False
+
+        first_direction = address_config.get_property('{}.add_quadrant_first_direction'.format(key),
+                                                      language, country=country)
+
+        if first_direction == 'lateral':
+            ordering = (LateralDirection, AnteroposteriorDirection)
+        elif first_direction == 'anteroposterior':
+            ordering = (AnteroposteriorDirection, LateralDirection)
+        else:
+            return unit
+
+        if not integer_unit:
+            add_quadrant_standalone = address_config.get_property('{}.add_quadrant_standalone'.format(key),
+                                                                  language, country=country)
+            if add_quadrant_standalone:
+                unit = None
+            else:
+                return None
+
+        last_num_type = None
+        for i, c in enumerate(ordering):
+            num_type, phrase, props = c.pick_phrase_and_type(unit, language, country=country)
+            whitespace_default = num_type == c.NUMERIC or last_num_type == c.NUMERIC
+            unit = c.combine_with_number(unit, phrase, num_type, props, whitespace_default=whitespace_default)
+            last_num_type = num_type
+
+        return unit
+
+    @classmethod
+    def phrase(cls, unit, language, country=None, zone=None):
+        if unit is not None:
+            key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)
+
+            if not address_config.get_property(key, language, country=country):
+                return None
+
+            is_alpha = safe_decode(unit).isalpha()
+
+            direction_unit = None
+            add_direction = address_config.get_property('{}.add_direction'.format(key), language, country=country)
+            if add_direction:
+                direction_unit = cls.add_direction(key, unit, language, country=country)
+
+            if direction_unit and direction_unit != unit:
+                unit = direction_unit
+                is_alpha = False
+            else:
+                add_quadrant = address_config.get_property('{}.add_quadrant'.format(key), language, country=country)
+                if add_quadrant:
+                    unit = cls.add_quadrant(key, unit, language, country=country)
+                    is_alpha = False
+
+            return cls.numeric_phrase(key, safe_decode(unit), language,
+                                      dictionaries=['unit_types_numbered'], country=country, is_alpha=is_alpha)
+        else:
+            key = 'units.standalone'
+            values, probs = address_config.alternative_probabilities(key, language,
+                                                                     dictionaries=['unit_types_standalone'],
+                                                                     country=country)
+            if values is None:
+                return None
+            phrase, phrase_props = weighted_choice(values, probs)
+            return phrase.title()