Initial fork commit
This commit is contained in:
0
scripts/geodata/addresses/__init__.py
Normal file
0
scripts/geodata/addresses/__init__.py
Normal file
59
scripts/geodata/addresses/blocks.py
Normal file
59
scripts/geodata/addresses/blocks.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
|
||||
|
||||
class Block(NumberedComponent):
    '''
    Random block identifiers (numeric, alphabetic or a mix of the two)
    and the optional phrase that accompanies them.
    '''

    max_blocks = 10

    # Block numbers 1..max_blocks, sampled with a Zipfian distribution.
    block_range = range(1, max_blocks + 1)
    block_range_probs = zipfian_distribution(len(block_range), 2.0)
    block_range_cdf = cdf(block_range_probs)

    @classmethod
    def random(cls, language, country=None):
        '''
        Generate a random block identifier for the given language/country.

        Returns None when no alphanumeric type is configured under
        'blocks.alphanumeric', or when the chosen type is not one this
        method knows how to render.
        '''
        kind, kind_props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
        if kind is None:
            return None

        if kind == cls.NUMERIC:
            return safe_decode(weighted_choice(cls.block_range, cls.block_range_cdf))

        # Everything below needs a letter. Use the configured alphabet, but
        # fall back to Latin when the alphabet_probability draw fails.
        letters = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        letters_prob = address_config.get_property('alphabet_probability', language, country=country, default=None)
        if letters_prob is not None and random.random() >= letters_prob:
            letters = latin_alphabet

        letter = sample_alphabet(letters, 2.0)
        if kind == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.block_range, cls.block_range_cdf)

        # Only draw a random number when a non-zero probability is configured.
        space_prob = float(kind_props.get('whitespace_probability', 0.0))
        if space_prob and random.random() < space_prob:
            separator = six.u(' ')
        else:
            separator = six.u('')

        if kind == cls.ALPHA_PLUS_NUMERIC:
            parts = (letter, separator, number)
        elif kind == cls.NUMERIC_PLUS_ALPHA:
            parts = (number, separator, letter)
        else:
            return None

        return six.u('{}{}{}').format(*parts)

    @classmethod
    def phrase(cls, block, language, country=None):
        '''
        With probability 'blocks.alphanumeric_phrase_probability' (default
        0.0), return the numeric phrase for block; otherwise return None.
        A block of None always yields None.
        '''
        if block is None:
            return None

        threshold = address_config.get_property('blocks.alphanumeric_phrase_probability', language, country=country, default=0.0)
        if random.random() < threshold:
            return cls.numeric_phrase('blocks.alphanumeric', block, language,
                                      dictionaries=['qualifiers'], country=country)
        return None
|
||||
2022
scripts/geodata/addresses/components.py
Normal file
2022
scripts/geodata/addresses/components.py
Normal file
File diff suppressed because it is too large
Load Diff
152
scripts/geodata/addresses/config.py
Normal file
152
scripts/geodata/addresses/config.py
Normal file
@@ -0,0 +1,152 @@
|
||||
|
||||
import copy
|
||||
import os
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from collections import Mapping
|
||||
|
||||
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
||||
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge, alternative_probabilities
|
||||
from geodata.math.sampling import cdf, check_probability_distribution
|
||||
|
||||
|
||||
# Directory containing this module; the resource paths below are relative to it.
this_dir = os.path.realpath(os.path.dirname(__file__))

# Both resource directories sit under "resources", three levels above this module.
ADDRESS_CONFIG_DIR = os.path.join(this_dir, *([os.pardir] * 3 + ['resources', 'addresses']))

DICTIONARIES_DIR = os.path.join(this_dir, *([os.pardir] * 3 + ['resources', 'dictionaries']))
|
||||
|
||||
|
||||
class AddressConfig(object):
    '''
    Per-language address configuration loaded from YAML files, plus
    cached probability distributions over the phrase forms they define.
    '''

    def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
        '''
        Load every *.yaml file in config_dir, keyed by file name without
        the extension, and index the sample phrases of every address
        phrase dictionary.

        dictionaries_dir is accepted for interface compatibility; this
        constructor does not read it.
        '''
        self.address_configs = {}
        self.cache = {}

        for filename in os.listdir(config_dir):
            if not filename.endswith('.yaml'):
                continue

            # Read from the directory that was passed in (not the module
            # default) and close the handle once parsed. Binary mode lets
            # the YAML parser detect the encoding itself.
            # NOTE(review): safe_load only builds plain YAML types; confirm
            # that no config file relies on Python-specific tags.
            with open(os.path.join(config_dir, filename), 'rb') as f:
                config = yaml.safe_load(f)
            countries = config.pop('countries', {})

            # Each country config is the language-level config with the
            # country's overrides merged on top.
            for k in list(countries):
                country_config = countries[k]
                config_copy = copy.deepcopy(config)
                countries[k] = recursive_merge(config_copy, country_config)

            config['countries'] = countries

            lang = os.path.splitext(filename)[0]
            self.address_configs[lang] = config

        # (language, dictionary) => {canonical phrase: [other surface forms]}
        self.sample_phrases = {}

        for language in address_phrase_dictionaries.languages:
            for dictionary in address_phrase_dictionaries.language_dictionaries[language]:
                self.sample_phrases[(language, dictionary)] = {}
                for phrases in address_phrase_dictionaries.phrases[(language, dictionary)]:
                    self.sample_phrases[(language, dictionary)][phrases[0]] = phrases[1:]

    def get_property(self, key, language, country=None, default=None):
        '''
        Look up a dot-separated key in the config for language.

        When country is given and has a non-empty config for this language,
        the lookup uses that (already merged) country config instead.
        Returns default when the key does not exist.
        '''
        keys = key.split('.')
        config = self.address_configs.get(language, {})

        if country:
            country_config = config.get('countries', {}).get(country, {})
            if country_config:
                config = country_config

        value = nested_get(config, keys)
        if value is not DoesNotExist:
            return value

        return default

    def cache_key(self, prop, language, dictionaries=(), country=None):
        '''Build the hashable key used to memoize alternative_probabilities.'''
        return (prop, language, country, tuple(dictionaries))

    def alternative_probabilities(self, prop, language, dictionaries=(), country=None):
        '''
        Get a probability distribution over alternatives

        Returns (forms, cdf) where forms is a list of (phrase, properties)
        tuples, or (None, None) when prop is not configured or defines no
        alternatives. Successful results are cached per
        (prop, language, country, dictionaries).
        '''
        key = self.cache_key(prop, language, dictionaries, country=country)
        if key not in self.cache:
            properties = self.get_property(prop, language, country=country, default=None)

            if properties is None:
                return None, None

            # Inside a method the bare name is the module-level helper
            # imported from geodata.configs.utils, not this method.
            alternatives, probs = alternative_probabilities(properties)
            if alternatives is None:
                return None, None

            forms = []
            form_probs = []

            for props, prob in zip(alternatives, probs):
                phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
                forms.extend([(p, props) for p in phrases])
                form_probs.extend([prob * p for p in phrase_probs])

            sample_probability = properties.get('sample_probability')
            if sample_probability is not None:
                sample_phrases = []
                for dictionary in dictionaries:
                    # The default must be a mapping: it is iterated as one.
                    phrases = self.sample_phrases.get((language, dictionary), {})
                    for canonical, surface_forms in six.iteritems(phrases):
                        sample_phrases.append(canonical)
                        sample_phrases.extend(surface_forms)
                # With nothing to sample there is nothing to spread the
                # probability over; the distribution check below reports
                # the problem instead of a ZeroDivisionError here.
                if sample_phrases:
                    # Note: use the outer properties dictionary e.g. units.alphanumeric
                    forms.extend([(p, properties) for p in sample_phrases])
                    form_probs.extend([float(sample_probability) / len(sample_phrases)] * len(sample_phrases))

            try:
                check_probability_distribution(form_probs)
            except AssertionError:
                print('values were: {}'.format(forms))
                raise

            form_probs_cdf = cdf(form_probs)
            self.cache[key] = (forms, form_probs_cdf)
        return self.cache[key]

    def form_probabilities(self, properties, language, dictionaries=()):
        '''
        Build the distribution over the surface forms of one alternative.

        The forms are the canonical phrase, the abbreviation (when
        'abbreviated_probability' is set) and, when sampling is enabled,
        the dictionary surface forms of the canonical phrase minus any
        'sample_exclude' entries.

        Returns (alternatives, probs) as parallel lists. Raises
        AssertionError when the abbreviation is not a string or the
        probabilities fail the distribution check.
        '''
        probs = []
        alternatives = []
        canonical_prob = properties.get('canonical_probability', 1.0)
        canonical = properties['canonical']

        alternatives.append(canonical)
        probs.append(canonical_prob)

        if 'abbreviated_probability' in properties:
            probs.append(properties['abbreviated_probability'])
            abbreviated = properties['abbreviated']
            assert isinstance(abbreviated, six.string_types)
            alternatives.append(abbreviated)

        if properties.get('sample', False) and 'sample_probability' in properties:
            sample_prob = properties['sample_probability']
            samples = set()
            for dictionary in dictionaries:
                phrases = self.sample_phrases.get((language, dictionary), {})
                samples |= set(phrases.get(canonical, []))
            if 'sample_exclude' in properties:
                samples -= set(properties['sample_exclude'])
            if samples:
                for phrase in samples:
                    probs.append(sample_prob / float(len(samples)))
                    alternatives.append(phrase)
            else:
                # Sampling was requested but found nothing: renormalize
                # the remaining probabilities so they sum to one.
                total = sum(probs)
                probs = [p / total for p in probs]

        try:
            check_probability_distribution(probs)
        except AssertionError:
            print('values were: {}'.format(alternatives))
            raise

        return alternatives, probs
|
||||
|
||||
# Shared module-level instance. Constructing it at import time lists the
# default config directory and parses every *.yaml file found there.
address_config = AddressConfig()
|
||||
37
scripts/geodata/addresses/conjunctions.py
Normal file
37
scripts/geodata/addresses/conjunctions.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import six
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice
|
||||
|
||||
|
||||
class Conjunction(object):
    '''Join several phrases with a randomly chosen conjunction (config key 'and').'''

    DEFAULT_WHITESPACE_JOIN = ', '
    DEFAULT_NON_WHITESPACE_JOIN = ''
    key = 'and'

    @classmethod
    def join(cls, phrases, language, country=None):
        '''
        Join phrases into a single string.

        At most 'max_phrase_join' (default 2) trailing phrases are joined
        with the sampled conjunction. Any earlier phrases are joined, and
        followed, by 'default_join'.

        Raises ValueError when phrases has no __iter__ attribute.
        '''
        if not hasattr(phrases, '__iter__'):
            raise ValueError('Param phrases must be iterable')

        choices, choices_cdf = address_config.alternative_probabilities(cls.key, language, country=country)
        conjunction, props = weighted_choice(choices, choices_cdf)

        use_whitespace = props.get('whitespace', True)
        decoded = [safe_decode(p) for p in phrases]
        limit = props.get('max_phrase_join', 2)

        prefix = six.u('')
        if len(decoded) > limit:
            fallback = cls.DEFAULT_WHITESPACE_JOIN if use_whitespace else cls.DEFAULT_NON_WHITESPACE_JOIN
            separator = safe_decode(props.get('default_join', fallback))
            # The trailing empty string makes the separator also follow
            # the last of the leading phrases.
            prefix = separator.join(decoded[:-limit] + [six.u('')])

        if use_whitespace:
            conjunction = six.u('{}{}{}').format(six.u(' '), conjunction, six.u(' '))

        tail = conjunction.join(decoded[-limit:])
        return six.u('').join([prefix, tail])
|
||||
19
scripts/geodata/addresses/conscription_numbers.py
Normal file
19
scripts/geodata/addresses/conscription_numbers.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import random
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class ConscriptionNumber(NumberedComponent):
    '''Phrases for conscription numbers (config key 'conscription_numbers.alphanumeric').'''

    @classmethod
    def phrase(cls, number, language, country=None):
        '''
        Return the numeric phrase for number, using the 'house_numbers'
        dictionary. A number of None is returned unchanged.
        '''
        if number is None:
            return number

        key = 'conscription_numbers.alphanumeric'
        dictionaries = ['house_numbers']

        return cls.numeric_phrase(key, safe_decode(number), language,
                                  dictionaries=dictionaries, country=country)
|
||||
42
scripts/geodata/addresses/dependencies.py
Normal file
42
scripts/geodata/addresses/dependencies.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import operator
|
||||
import six
|
||||
|
||||
from geodata.graph.topsort import topsort
|
||||
|
||||
|
||||
class ComponentDependencies(object):
    '''
    Declare an address component and its dependencies e.g.
    a house_number cannot be used in the absence of a road name.

    Dependencies are stored as integer bitsets. Every component name gets
    one bit, allocated on first use and shared by all instances through
    the class-level component_bit_values mapping.
    '''

    component_bit_values = {}

    def __init__(self, graph):
        '''
        graph maps each component name to an iterable of the component
        names it depends on. A component with no dependencies is given
        the all-ones mask all_values.
        '''
        self.dependencies = {}

        # len(graph) low bits set. Unlike parsing a string of '1's, this
        # also works for an empty graph and on Python 3 (no `long`).
        self.all_values = (1 << len(graph)) - 1

        self.dependency_order = list(topsort(graph))

        for component, deps in six.iteritems(graph):
            self.dependencies[component] = self.component_bitset(deps) if deps else self.all_values

    def __getitem__(self, key):
        return self.dependencies.__getitem__(key)

    def __contains__(self, key):
        return self.dependencies.__contains__(key)

    @classmethod
    def get_component_bit_value(cls, name):
        '''Return the bit assigned to name, allocating the next free bit if needed.'''
        val = cls.component_bit_values.get(name)
        if val is None:
            num_values = len(cls.component_bit_values)
            val = 1 << num_values
            cls.component_bit_values[name] = val
        return val

    @classmethod
    def component_bitset(cls, components):
        '''
        Return the bitwise OR of the bit values of components
        (0 for an empty iterable).
        '''
        # A plain loop avoids `reduce`, which is not a builtin on Python 3
        # and raises on an empty sequence without an initial value.
        bits = 0
        for name in components:
            bits |= cls.get_component_bit_value(name)
        return bits
|
||||
37
scripts/geodata/addresses/directions.py
Normal file
37
scripts/geodata/addresses/directions.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumericPhrase
|
||||
from geodata.math.sampling import weighted_choice
|
||||
|
||||
|
||||
class RelativeDirection(NumericPhrase):
    '''
    Numeric phrase configured under the 'directions' key, using the
    'unit_directions' phrase dictionary.
    '''
    key = 'directions'
    dictionaries = ['unit_directions']
|
||||
|
||||
|
||||
class AnteroposteriorDirection(RelativeDirection):
    '''Relative direction restricted to the 'directions.anteroposterior' config key.'''
    key = 'directions.anteroposterior'
|
||||
|
||||
|
||||
class LateralDirection(RelativeDirection):
    '''Relative direction restricted to the 'directions.lateral' config key.'''
    key = 'directions.lateral'
|
||||
|
||||
|
||||
class CardinalDirection(NumericPhrase):
    '''
    Numeric phrase configured under the 'cardinal_directions' key, using
    the 'cardinal_directions' phrase dictionary.
    '''
    key = 'cardinal_directions'
    dictionaries = ['cardinal_directions']
|
||||
|
||||
|
||||
class Direction(object):
    '''Random choice between a cardinal and a relative direction phrase.'''

    CARDINAL = 'cardinal'
    RELATIVE = 'relative'

    @classmethod
    def random(cls, language, country=None, cardinal_proability=0.5):
        '''
        Return a CardinalDirection phrase with probability
        cardinal_proability, otherwise a RelativeDirection phrase.
        Both are requested without a number.
        '''
        # The second list is a CDF, hence the closing 1.0.
        direction_type = weighted_choice([cls.CARDINAL, cls.RELATIVE],
                                         [cardinal_proability, 1.0])
        phrase_class = CardinalDirection if direction_type == cls.CARDINAL else RelativeDirection
        return phrase_class.phrase(None, language, country=country)
|
||||
66
scripts/geodata/addresses/entrances.py
Normal file
66
scripts/geodata/addresses/entrances.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.addresses.directions import RelativeDirection
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
|
||||
|
||||
class Entrance(NumberedComponent):
    '''
    Random entrance identifiers (numeric, hyphenated, alphabetic or
    mixed) and the phrase that accompanies them.
    '''

    max_entrances = 10

    # Entrance numbers 1..max_entrances, sampled with a Zipfian distribution.
    entrance_range = range(1, max_entrances + 1)
    entrance_range_probs = zipfian_distribution(len(entrance_range), 2.0)
    entrance_range_cdf = cdf(entrance_range_probs)

    @classmethod
    def random(cls, language, country=None):
        '''
        Generate a random entrance identifier for the given
        language/country.

        Returns None when no alphanumeric type is configured under
        'entrances.alphanumeric', or when the chosen type is not one this
        method knows how to render.
        '''
        kind, kind_props = cls.choose_alphanumeric_type('entrances.alphanumeric', language, country=country)
        if kind is None:
            return None

        if kind == cls.NUMERIC:
            return safe_decode(weighted_choice(cls.entrance_range, cls.entrance_range_cdf))

        if kind == cls.HYPHENATED_NUMBER:
            # The upper bound is the lower bound plus a second sample.
            first = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
            second = first + weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
            return u'{}-{}'.format(first, second)

        # Everything below needs a letter. Use the configured alphabet, but
        # fall back to Latin when the alphabet_probability draw fails.
        letters = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        letters_prob = address_config.get_property('alphabet_probability', language, country=country, default=None)
        if letters_prob is not None and random.random() >= letters_prob:
            letters = latin_alphabet

        letter = sample_alphabet(letters, 2.0)
        if kind == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)

        space_prob = float(kind_props.get('whitespace_probability', 0.0))
        hyphen_prob = float(kind_props.get('hyphen_probability', 0.0))

        # One draw selects between space, hyphen and no separator.
        draw = random.random()
        if draw < space_prob:
            separator = u' '
        elif draw < (space_prob + hyphen_prob):
            separator = u'-'
        else:
            separator = u''

        if kind == cls.ALPHA_PLUS_NUMERIC:
            parts = (letter, separator, number)
        elif kind == cls.NUMERIC_PLUS_ALPHA:
            parts = (number, separator, letter)
        else:
            return None

        return six.u('{}{}{}').format(*parts)

    @classmethod
    def phrase(cls, entrance, language, country=None):
        '''
        Return the numeric phrase for entrance, using the 'entrances'
        dictionary. An entrance of None yields None.
        '''
        if entrance is None:
            return None
        return cls.numeric_phrase('entrances.alphanumeric', entrance, language,
                                  dictionaries=['entrances'], country=country)
|
||||
165
scripts/geodata/addresses/floors.py
Normal file
165
scripts/geodata/addresses/floors.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
|
||||
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
from geodata.numbers.spellout import numeric_expressions
|
||||
|
||||
|
||||
class Floor(NumberedComponent):
    '''Random floor numbers and the phrases/aliases used to describe them.'''

    # When we don't know the number of floors, use a Zipfian distribution
    # to choose randomly between 1 and max_floors with 1 being much more
    # likely than 2, etc.
    max_floors = 10
    max_basements = 2
    # list(...) keeps the concatenation valid on Python 3, where range
    # objects do not support "+".
    numbered_floors = list(range(max_floors + 1)) + list(range(-1, -max_basements - 1, -1))
    floor_probs = zipfian_distribution(len(numbered_floors), 0.75)
    floor_probs_cdf = cdf(floor_probs)

    # For use with letters e.g. A0 is probably not as common
    floors_letters = list(range(1, max_floors + 1)) + [0]
    floors_letters_probs = zipfian_distribution(len(floors_letters), 2.0)
    floors_letters_cdf = cdf(floors_letters_probs)

    @classmethod
    def sample_floors(cls, num_floors, num_basements=0):
        '''
        Uniformly pick a floor between -num_basements and num_floors - 1
        inclusive (the upper bound is 0 when num_floors is not positive).
        '''
        num_floors = int(num_floors)
        return random.randint(-num_basements, (num_floors - 1) if num_floors > 0 else 0)

    @classmethod
    def sample_floors_range(cls, min_floor, max_floor):
        '''
        Uniformly pick a floor between min_floor and max_floor - 1
        inclusive, or min_floor itself when max_floor <= min_floor.
        '''
        return random.randint(min_floor, (max_floor - 1) if max_floor > min_floor else min_floor)

    @classmethod
    def random_int(cls, language, country=None, num_floors=None, num_basements=None):
        '''
        Pick a random integer floor.

        When num_floors is missing or not convertible to int, the floor
        comes from the Zipfian default distribution. Buildings with at
        most max_floors floors sample uniformly, including basements.
        Taller ones sample uniformly from max_floors + 1 upwards.
        '''
        number = None
        if num_floors is not None:
            try:
                num_floors = int(num_floors)
            except (ValueError, TypeError):
                return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)

            if num_floors <= cls.max_floors:
                number = cls.sample_floors(num_floors, num_basements=num_basements or 0)
            else:
                number = cls.sample_floors_range(cls.max_floors + 1, num_floors)

        else:
            number = weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)

        return number

    @classmethod
    def random_from_int(cls, number, language, country=None):
        '''
        Render an integer floor as a string using a randomly chosen
        alphanumeric type from 'levels.alphanumeric'.

        Returns None when no type is configured or the chosen type is not
        handled here.
        '''
        num_type, num_type_props = cls.choose_alphanumeric_type('levels.alphanumeric', language, country=country)
        if num_type is None:
            return None

        numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))

        # Only non-negative floors are shifted; basements keep their number.
        if number >= 0:
            number += numbering_starts_at

        if num_type == cls.NUMERIC:
            return safe_decode(number)
        elif num_type == cls.ROMAN_NUMERAL:
            roman_numeral = numeric_expressions.roman_numeral(number)
            if roman_numeral is not None:
                return roman_numeral
            else:
                return safe_decode(number)
        elif num_type == cls.HYPHENATED_NUMBER:
            # sample_floors_range is a classmethod and must be reached
            # through cls; the bare name raised NameError.
            number2 = number + cls.sample_floors_range(1, cls.max_floors)
            return u'{}-{}'.format(number, number2)
        else:
            alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
            alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
            if alphabet_probability is not None and random.random() >= alphabet_probability:
                alphabet = latin_alphabet
            letter = sample_alphabet(alphabet)
            if num_type == cls.ALPHA:
                return letter
            else:
                # For letter+number forms the number is re-sampled.
                number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)

                if num_type == cls.ALPHA_PLUS_NUMERIC:
                    return six.u('{}{}').format(letter, number)
                elif num_type == cls.NUMERIC_PLUS_ALPHA:
                    return six.u('{}{}').format(number, letter)

        return None

    @classmethod
    def random(cls, language, country=None, num_floors=None, num_basements=None):
        '''Pick a random floor with random_int and render it with random_from_int.'''
        number = cls.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
        return cls.random_from_int(number, language, country=country)

    @classmethod
    def phrase(cls, floor, language, country=None, num_floors=None):
        '''
        Return a phrase for floor, preferring a configured alias
        ('levels.aliases') over the generic 'levels.alphanumeric' phrase.

        floor may be an integer, a half floor such as 1.5, or a
        non-numeric string. Non-numeric floors go straight to the generic
        phrase. A floor of None yields None.
        '''
        if floor is None:
            return None

        integer_floor = False
        floor = safe_decode(floor)
        try:
            floor = int(floor)
            integer_floor = True
        except (ValueError, TypeError):
            try:
                floor = float(floor)
                integer_floor = int(floor) == floor
            except (ValueError, TypeError):
                return cls.numeric_phrase('levels.alphanumeric', floor, language,
                                          dictionaries=['level_types_numbered'], country=country)

        numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
        try:
            num_floors = int(num_floors)
            top_floor = num_floors if numbering_starts_at == 1 else num_floors - 1
            is_top = num_floors and floor == top_floor
        except (ValueError, TypeError):
            # num_floors unknown or not numeric: cannot tell the top floor.
            is_top = False

        alias_prefix = 'levels.aliases'
        aliases = address_config.get_property(alias_prefix, language, country=country)
        if aliases:
            alias = None

            if not integer_floor and floor >= 0 and 'half_floors' in aliases:
                floor = int(floor)
                alias = 'half_floors'
            elif not integer_floor and floor < 0 and 'half_floors_negative' in aliases:
                floor = int(floor)
                alias = 'half_floors_negative'
            elif floor < -1 and '<-1' in aliases:
                alias = '<-1'
            elif is_top and 'top' in aliases:
                alias = 'top'
            elif safe_decode(floor) in aliases:
                alias = safe_decode(floor)

            floor = safe_decode(floor)

            if alias:
                alias_props = aliases.get(alias)

                # Aliases upon aliases, e.g. for something like "Upper Mezzanine"
                # where it's an alias for "1" under the half_floors key
                if safe_decode(floor) in alias_props.get('aliases', {}):
                    alias_prefix = '{}.{}.aliases'.format(alias_prefix, alias)
                    alias = safe_decode(floor)

            if alias:
                return cls.numeric_phrase('{}.{}'.format(alias_prefix, alias), floor, language,
                                          dictionaries=['level_types_basement',
                                                        'level_types_mezzanine',
                                                        'level_types_numbered',
                                                        'level_types_standalone',
                                                        'level_types_sub_basement'],
                                          country=country)

        return cls.numeric_phrase('levels.alphanumeric', floor, language,
                                  dictionaries=['level_types_numbered'], country=country)
|
||||
26
scripts/geodata/addresses/house_numbers.py
Normal file
26
scripts/geodata/addresses/house_numbers.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import random
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class HouseNumber(NumberedComponent):
    '''Phrases for house numbers, including the "no number" case.'''

    @classmethod
    def phrase(cls, number, language, country=None):
        '''
        With the configured probability, return a numeric phrase for
        number; otherwise return the decoded number.

        When number is None, the 'house_numbers.no_number' settings and
        the 'no_number' dictionary are used, and the fallback is None.
        '''
        if number is None:
            prob_key, key = 'house_numbers.no_number_probability', 'house_numbers.no_number'
            dictionaries = ['no_number']
            fallback = None
        else:
            prob_key, key = 'house_numbers.alphanumeric_phrase_probability', 'house_numbers.alphanumeric'
            dictionaries = ['house_numbers', 'number']
            fallback = safe_decode(number)

        threshold = address_config.get_property(prob_key, language, country=country, default=0.0)
        if random.random() < threshold:
            return cls.numeric_phrase(key, safe_decode(number), language,
                                      dictionaries=dictionaries, country=country)
        return fallback
|
||||
24
scripts/geodata/addresses/metro_stations.py
Normal file
24
scripts/geodata/addresses/metro_stations.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from geodata.addresses.config import address_config
|
||||
|
||||
import random
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumericPhrase
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class MetroStationPhrase(NumericPhrase):
    '''
    Numeric phrase configured under the 'metro_stations.alphanumeric'
    key, using the 'qualifiers' phrase dictionary.
    '''
    key = 'metro_stations.alphanumeric'
    dictionaries = ['qualifiers']
|
||||
|
||||
|
||||
class MetroStation(object):
    '''Optionally wraps a metro station name in a MetroStationPhrase.'''

    @classmethod
    def phrase(cls, station, language, country=None):
        '''
        With probability 'metro_stations.alphanumeric_phrase_probability'
        (default 0.0), return MetroStationPhrase.phrase for station;
        otherwise return None. A station of None always yields None.
        '''
        if station is None:
            return None

        threshold = address_config.get_property('metro_stations.alphanumeric_phrase_probability', language, country=country, default=0.0)
        use_phrase = random.random() < threshold
        return MetroStationPhrase.phrase(station, language, country=country) if use_phrase else None
|
||||
434
scripts/geodata/addresses/numbering.py
Normal file
434
scripts/geodata/addresses/numbering.py
Normal file
@@ -0,0 +1,434 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
from geodata.math.floats import isclose
|
||||
from geodata.numbers.ordinals import ordinal_expressions
|
||||
from geodata.numbers.spellout import numeric_expressions
|
||||
from geodata.text.tokenize import tokenize, token_types
|
||||
|
||||
# Cache of cumulative distributions built by sample_alphabet, keyed by
# (alphabet tuple, Zipf exponent).
alphabets = {}


def sample_alphabet(alphabet, b=1.5):
    '''
    Sample an "alphabet" using a Zipfian distribution (frequent items are very
    frequent, long tail of infrequent items). If we look at something like
    unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
    "Unit Z" simply because most dwellings only have a few units. Sampling
    letters from a Zipfian distribution rather than uniformly means that instead
    of every letter having the same likelihood (1/26), letters toward the beginning
    of the alphabet are much more likely to be selected. Letters toward the end can
    still be selected sometimes, but are not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in order
    of frequency.

    alphabet is any iterable of items; b is the exponent passed to
    zipfian_distribution. Returns one item of alphabet.
    '''
    alphabet = tuple(alphabet)
    # The exponent is part of the key: otherwise the first exponent used
    # for an alphabet would be silently reused by every later call.
    cache_key = (alphabet, b)

    probs_cdf = alphabets.get(cache_key)
    if probs_cdf is None:
        probs_cdf = cdf(zipfian_distribution(len(alphabet), b))
        alphabets[cache_key] = probs_cdf

    return weighted_choice(alphabet, probs_cdf)
|
||||
|
||||
# Upper-case Latin letters 'A'..'Z', in alphabetical order.
latin_alphabet = list(map(chr, range(ord('A'), ord('Z') + 1)))
|
||||
|
||||
|
||||
class Digits(object):
    '''
    Rewrites digit strings in alternative written forms: spelled out,
    Unicode full-width, or Roman numerals.
    '''

    ASCII = 'ascii'
    SPELLOUT = 'spellout'
    UNICODE_FULL_WIDTH = 'unicode_full_width'
    ROMAN_NUMERAL = 'roman_numeral'

    CARDINAL = 'cardinal'
    ORDINAL = 'ordinal'

    # ASCII digit => full-width digit (U+FF10..U+FF19). Escapes are used
    # so the mapping cannot be collapsed back to ASCII by text
    # normalization, which would turn both width rewrites into no-ops.
    unicode_full_width_map = {
        '0': u'\uff10',
        '1': u'\uff11',
        '2': u'\uff12',
        '3': u'\uff13',
        '4': u'\uff14',
        '5': u'\uff15',
        '6': u'\uff16',
        '7': u'\uff17',
        '8': u'\uff18',
        '9': u'\uff19',
    }

    # Inverse mapping: full-width digit => ASCII digit.
    full_width_digit_map = {
        v: k for k, v in unicode_full_width_map.items()
    }

    @classmethod
    def rewrite_full_width(cls, s):
        '''Replace every ASCII digit in s with its full-width form.'''
        return u''.join([cls.unicode_full_width_map.get(c, c) for c in s])

    @classmethod
    def rewrite_standard_width(cls, s):
        '''Replace every full-width digit in s with its ASCII form.'''
        return u''.join([cls.full_width_digit_map.get(c, c) for c in s])

    @classmethod
    def rewrite_roman_numeral(cls, s):
        '''
        Return the Roman numeral for the digit string s, or s unchanged
        when it is not all digits or no numeral is available.
        '''
        roman_numeral = None
        if s.isdigit():
            roman_numeral = numeric_expressions.roman_numeral(s)

        if roman_numeral:
            return roman_numeral
        else:
            return s

    @classmethod
    def rewrite_spellout(cls, s, lang, num_type, props):
        '''
        Spell out the digit string s in lang as a cardinal or ordinal
        (per num_type), title-cased. Returns s unchanged when it is not
        all digits or no spellout is available.
        '''
        if s.isdigit():
            num = int(s)
            spellout = None
            gender = props.get('gender')
            category = props.get('category')

            if num_type == cls.CARDINAL:
                spellout = numeric_expressions.spellout_cardinal(num, lang, gender=gender, category=category)
            elif num_type == cls.ORDINAL:
                spellout = numeric_expressions.spellout_ordinal(num, lang, gender=gender, category=category)

            if spellout:
                return spellout.title()
            return s
        else:
            return s

    @classmethod
    def rewrite(cls, d, lang, props, num_type=CARDINAL):
        '''
        Randomly rewrite the number d in one of the digit forms whose
        '<form>_probability' key is present in props. Any probability
        mass left over means "keep the ASCII form".

        Returns d untouched when props is empty or None.
        '''
        if not props:
            return d

        d = safe_decode(d)

        values = []
        probs = []

        for digit_type in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL):
            key = '{}_probability'.format(digit_type)
            if key in props:
                values.append(digit_type)
                probs.append(props[key])

        # Whatever is not assigned to a rewrite goes to plain ASCII.
        if not isclose(sum(probs), 1.0):
            values.append(cls.ASCII)
            probs.append(1.0 - sum(probs))

        probs = cdf(probs)
        digit_type = weighted_choice(values, probs)

        if digit_type == cls.ASCII:
            return d
        elif digit_type == cls.SPELLOUT:
            return cls.rewrite_spellout(d, lang, num_type, props)
        elif digit_type == cls.ROMAN_NUMERAL:
            roman_numeral = cls.rewrite_roman_numeral(d)
            # Optionally append an ordinal suffix to the numeral.
            if random.random() < props.get('ordinal_suffix_probability', 0.0):
                ordinal_suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
                if ordinal_suffix:
                    roman_numeral = u'{}{}'.format(roman_numeral, ordinal_suffix)
            return roman_numeral
        elif digit_type == cls.UNICODE_FULL_WIDTH:
            return cls.rewrite_full_width(d)
        else:
            return d
|
||||
|
||||
|
||||
class NumericPhrase(object):
|
||||
key = None
|
||||
|
||||
NUMERIC = 'numeric'
|
||||
NUMERIC_AFFIX = 'numeric_affix'
|
||||
|
||||
@classmethod
def pick_phrase_and_type(cls, number, language, country=None):
    '''
    Sample a phrase for cls.key and decide whether it is used as a
    plain numeric phrase or as a numeric affix.

    Returns (num_type, phrase, props). When no alternatives are
    configured, returns (None, decoded number or None, None).
    '''
    choices, choices_cdf = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
    if not choices:
        decoded = safe_decode(number) if number is not None else None
        return None, decoded, None

    phrase, phrase_props = weighted_choice(choices, choices_cdf)

    # Keep only the numbering types that have a configured probability.
    type_names = (cls.NUMERIC, cls.NUMERIC_AFFIX)
    configured = [(name, phrase_props.get('{}_probability'.format(name), None)) for name in type_names]
    type_values = [name for name, prob in configured if prob is not None]
    type_probs = [prob for name, prob in configured if prob is not None]

    if type_probs:
        num_type = weighted_choice(type_values, cdf(type_probs))
    else:
        num_type = cls.NUMERIC

    return num_type, phrase, phrase_props[num_type]
|
||||
|
||||
@classmethod
|
||||
def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
|
||||
|
||||
if num_type == cls.NUMERIC_AFFIX:
|
||||
phrase = props['affix']
|
||||
if 'zero_pad' in props and number.isdigit():
|
||||
number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))
|
||||
|
||||
direction = props['direction']
|
||||
whitespace = props.get('whitespace', whitespace_default)
|
||||
whitespace_probability = props.get('whitespace_probability')
|
||||
if whitespace_probability is not None:
|
||||
whitespace = random.random() < whitespace_probability
|
||||
|
||||
if props.get('title_case', True):
|
||||
# Title case unless the config specifies otherwise
|
||||
phrase = phrase.title()
|
||||
|
||||
if number is None:
|
||||
return phrase
|
||||
|
||||
whitespace_phrase = six.u(' ') if whitespace else six.u('')
|
||||
# Phrase goes to the left of hte number
|
||||
if direction == 'left':
|
||||
return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
|
||||
# Phrase goes to the right of the number
|
||||
elif direction == 'right':
|
||||
return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
|
||||
# Need to specify a direction, otherwise return naked number
|
||||
else:
|
||||
return safe_decode(number)
|
||||
|
||||
@classmethod
|
||||
def phrase(cls, number, language, country=None):
|
||||
num_type, phrase, props = cls.pick_phrase_and_type(number, language, country=country)
|
||||
whitespace_default = num_type == cls.NUMERIC
|
||||
return cls.combine_with_number(number, phrase, num_type, props, whitespace_default=whitespace_default)
|
||||
|
||||
|
||||
class Number(NumericPhrase):
    """Numeric phrase read from the ``numbers`` config key.

    Phrases are drawn from the ``number`` dictionary.
    """
    dictionaries = ['number']
    key = 'numbers'
|
||||
|
||||
|
||||
class NumberedComponent(object):
    """Base class for address components identified by a number and/or letter.

    Provides the shared sampling of an alphanumeric "shape" for a component
    and the rendering of a number together with a dictionary phrase.
    """
    NUMERIC = 'numeric'
    ALPHA = 'alpha'
    ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric'
    NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha'
    HYPHENATED_NUMBER = 'hyphenated_number'
    ROMAN_NUMERAL = 'roman_numeral'

    @classmethod
    def choose_alphanumeric_type(cls, key, language, country=None):
        """Sample one of the alphanumeric types configured under ``key``.

        :return: ``(num_type, num_type_props)``; ``(None, None)`` when the key
            is not configured or no type has a probability assigned.
        """
        alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
        if alphanumeric_props is None:
            return None, None

        values = []
        probs = []

        for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.HYPHENATED_NUMBER, cls.ROMAN_NUMERAL):
            prob = alphanumeric_props.get('{}_probability'.format(num_type))
            if prob is not None:
                values.append(num_type)
                probs.append(prob)

        if not values:
            return None, None

        probs = cdf(probs)
        num_type = weighted_choice(values, probs)
        num_type_props = alphanumeric_props.get(num_type, {})

        return num_type, num_type_props

    @classmethod
    def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
        """Render ``num`` together with a phrase sampled from the config.

        :param key: address-config key holding the phrase alternatives
        :param num: the number (or alphanumeric identifier); ``None`` yields
            the bare phrase
        :param language: language code used for config/dictionary lookups
        :param country: optional country for config overrides
        :param dictionaries: dictionary names forwarded to the config lookup
        :param strict_numeric: if true, identifiers containing letters are
            returned as is, without a phrase
        :param is_alpha: if true, the ``'<key>.alpha'`` alternatives are
            tried before the ones under ``key``
        :return: the rendered text, or ``None`` when no rendering applies
        """
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                int(num)
                is_integer = True
            except ValueError:
                try:
                    float(num)
                except ValueError:
                    # Not a plain number: inspect the tokens for digits/letters
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        if any((ch.isalpha() for ch in t)):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)
        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
            prob = phrase_props.get('{}_probability'.format(num_type))
            if prob is not None:
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # A type configured without a probability takes all the
                # remaining mass and ends the scan
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            allowed = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')]
            # Unpacking zip(*[]) would raise, so bail out the same way as
            # for a zero total when no rendering type survives the filter
            if not allowed:
                return None
            values, probs = zip(*allowed)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            # Only one candidate: use it directly. (Hard-coding 'numeric' here
            # would index a missing key when the sole candidate is an
            # affix or ordinal.)
            num_type = values[0]
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            num_int = int(num)
            if phrase_props.get('number_abs_value', False):
                num_int = abs(num_int)
                num = num_int

            if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num_int -= phrase_props['number_subtract_abs_value']
                num = num_int

        num = safe_decode(num)
        digits_props = props.get('digits')
        if digits_props:
            # Work on a copy so the shared config object is not modified
            digits_props = dict(digits_props)
            # Inherit the gender and category e.g. for ordinals
            for k in ('gender', 'category'):
                if k in props:
                    digits_props[k] = props[k]
            num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        # Sometimes drop the phrase entirely and return just the number
        if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)
|
||||
76
scripts/geodata/addresses/po_boxes.py
Normal file
76
scripts/geodata/addresses/po_boxes.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import cdf, weighted_choice
|
||||
|
||||
|
||||
class POBox(NumberedComponent):
    """Random generation and phrase rendering for post office box numbers."""

    @classmethod
    def random_digits(cls, num_digits):
        """Return a random integer with exactly ``num_digits`` digits."""
        # Note: PO Boxes can have leading zeros but not important for the parser
        # since it only cares about how many digits there are in a number
        low = 10 ** (num_digits - 1)
        high = (10 ** num_digits) - 1

        return random.randint(low, high)

    @classmethod
    def random_digits_with_prefix(cls, num_digits, prefix=six.u('')):
        """Return ``prefix`` followed by ``num_digits`` random digits."""
        return six.u('').join([prefix, safe_decode(cls.random_digits(num_digits))])

    @classmethod
    def random_digits_with_suffix(cls, num_digits, suffix=six.u('')):
        """Return ``num_digits`` random digits followed by ``suffix``."""
        return six.u('').join([safe_decode(cls.random_digits(num_digits)), suffix])

    @classmethod
    def random_letter(cls, language, country=None):
        """Sample a letter from the configured alphabet (Latin by default)."""
        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        return sample_alphabet(alphabet)

    @classmethod
    def random(cls, language, country=None):
        """Generate a random PO box identifier for the language/country.

        :return: the identifier as text, or ``None`` when no alphanumeric
            type is configured, when a number is needed but no digit lengths
            are configured, or when the sampled type is not one of numeric,
            alpha, alpha+numeric or numeric+alpha.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type('po_boxes.alphanumeric', language, country=country)
        if num_type is None:
            return None

        if num_type == cls.ALPHA:
            return cls.random_letter(language, country=country)

        digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[])
        # Without configured digit lengths there is nothing to sample from
        if not digit_config:
            return None

        values = []
        probs = []

        for val in digit_config:
            values.append(val['length'])
            probs.append(val['probability'])

        probs = cdf(probs)

        num_digits = weighted_choice(values, probs)

        digits = cls.random_digits(num_digits)
        number = Digits.rewrite(digits, language, num_type_props)

        if num_type == cls.NUMERIC:
            return safe_decode(number)

        letter = cls.random_letter(language, country=country)

        whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
        whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, whitespace_phrase, number)
        elif num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, whitespace_phrase, letter)

        # Other alphanumeric types are not generated for PO boxes
        return None

    @classmethod
    def phrase(cls, box_number, language, country=None):
        """Render ``box_number`` with a PO box phrase; ``None`` stays ``None``."""
        if box_number is None:
            return None
        return cls.numeric_phrase('po_boxes.alphanumeric', safe_decode(box_number), language,
                                  dictionaries=['post_office'], country=country)
|
||||
11
scripts/geodata/addresses/postcodes.py
Normal file
11
scripts/geodata/addresses/postcodes.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
|
||||
class PostCode(NumberedComponent):
    """Phrase rendering for postal codes."""

    @classmethod
    def phrase(cls, postcode, language, country=None):
        """Render ``postcode`` with a phrase from the ``postcodes`` dictionary.

        Returns ``None`` when ``postcode`` is ``None``.
        """
        if postcode is not None:
            return cls.numeric_phrase('postcodes.alphanumeric', postcode, language,
                                      dictionaries=['postcodes'], country=country)
        return None
|
||||
66
scripts/geodata/addresses/staircases.py
Normal file
66
scripts/geodata/addresses/staircases.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.numbering import NumberedComponent
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.addresses.directions import RelativeDirection
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
|
||||
|
||||
class Staircase(NumberedComponent):
    """Random generation and phrase rendering for staircase identifiers."""
    max_staircases = 10

    # Staircase numbers 1..max_staircases, weighted by a Zipfian distribution
    staircase_range = range(1, max_staircases + 1)
    staircase_range_probs = zipfian_distribution(len(staircase_range), 2.0)
    staircase_range_cdf = cdf(staircase_range_probs)

    @classmethod
    def random(cls, language, country=None):
        """Generate a random staircase identifier for the language/country.

        Returns ``None`` when no alphanumeric type is configured, or when the
        sampled type is not one this method generates.
        """
        def draw_number():
            return weighted_choice(cls.staircase_range, cls.staircase_range_cdf)

        num_type, num_type_props = cls.choose_alphanumeric_type('staircases.alphanumeric', language, country=country)
        if num_type is None:
            return None

        if num_type == cls.NUMERIC:
            return safe_decode(draw_number())

        if num_type == cls.HYPHENATED_NUMBER:
            first = draw_number()
            # The second number is offset from the first by another draw
            second = first + draw_number()
            return u'{}-{}'.format(first, second)

        # Every remaining type needs a letter
        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        if alphabet_probability is not None and random.random() >= alphabet_probability:
            alphabet = latin_alphabet
        letter = sample_alphabet(alphabet, 2.0)

        if num_type == cls.ALPHA:
            return safe_decode(letter)

        number = draw_number()

        space_prob = float(num_type_props.get('whitespace_probability', 0.0))
        hyphen_prob = float(num_type_props.get('hyphen_probability', 0.0))

        # One roll decides between a space, a hyphen, or no separator
        roll = random.random()
        if roll < space_prob:
            separator = u' '
        elif roll < (space_prob + hyphen_prob):
            separator = u'-'
        else:
            separator = u''

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, separator, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, separator, letter)
        return None

    @classmethod
    def phrase(cls, staircase, language, country=None):
        """Render ``staircase`` with a staircase phrase; ``None`` stays ``None``."""
        if staircase is not None:
            return cls.numeric_phrase('staircases.alphanumeric', staircase, language,
                                      dictionaries=['staircases'], country=country)
        return None
|
||||
285
scripts/geodata/addresses/units.py
Normal file
285
scripts/geodata/addresses/units.py
Normal file
@@ -0,0 +1,285 @@
|
||||
import itertools
|
||||
import random
|
||||
import six
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.addresses.directions import RelativeDirection, LateralDirection, AnteroposteriorDirection
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
|
||||
from geodata.configs.utils import nested_get
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
|
||||
from geodata.text.utils import is_numeric_strict
|
||||
|
||||
|
||||
class Unit(NumberedComponent):
    """Random generation and phrase rendering for unit/apartment identifiers."""
    # When we don't know the number of units, use a Zipfian distribution
    # to choose randomly between 1 and max_units with 1 being much more
    # likely than 2, etc.
    max_units = 99
    max_basements = 2

    # NOTE: list(range(...)) and the builtin zip are used instead of
    # ``range(...) + [...]`` / ``itertools.izip`` so that the class body
    # evaluates to the same lists on both Python 2 and Python 3.
    hundreds_numbered_units_tens = [list(range(101, 110)) + [100],
                                    list(range(201, 210)) + [200],
                                    list(range(301, 310)) + [300],
                                    list(range(401, 410)) + [400],
                                    list(range(501, 510)) + [500],
                                    ]

    hundreds_numbered_units = [list(range(110, 200)),
                               list(range(210, 300)),
                               list(range(310, 400)),
                               list(range(410, 500)),
                               list(range(510, 600)),
                               ]

    thousands_numbered_units = [list(range(1001, 1030)) + [1000],
                                list(range(2001, 2030)) + [2000],
                                list(range(3001, 3030)) + [3000],
                                list(range(4001, 4030)) + [4000],
                                list(range(5001, 5030)) + [5000]
                                ]

    # Candidate unit numbers ordered from most to least likely; the groups
    # above are interleaved column-wise (101, 201, ..., 102, 202, ...).
    numbered_units = list(range(1, 10))
    numbered_units.extend(itertools.chain(*zip(*hundreds_numbered_units_tens)))
    numbered_units.extend(range(10, 100))
    numbered_units.extend(itertools.chain(*zip(*hundreds_numbered_units)))
    numbered_units.extend(itertools.chain(*zip(*thousands_numbered_units)))
    numbered_units.extend(list(range(10001, 10100)) + [10000])
    # NOTE(review): 0 is appended here and is also the first element of the
    # basement range below, so it occurs twice - confirm that is intended.
    numbered_units.append(0)
    numbered_units.extend(range(0, -max_basements - 1, -1))

    unit_probs = zipfian_distribution(len(numbered_units), 0.7)
    unit_probs_cdf = cdf(unit_probs)

    num_digits = [2, 3, 4]
    num_digits_probs = zipfian_distribution(len(num_digits), 4.0)
    num_digits_cdf = cdf(num_digits_probs)

    # For use with floors e.g. #301 more common than #389
    positive_units_floors = list(range(1, 10)) + [0] + list(range(10, max_units + 1))
    positive_units_floors_probs = zipfian_distribution(len(positive_units_floors), 0.6)
    positive_units_floors_cdf = cdf(positive_units_floors_probs)

    # For basic positive units
    positive_units = list(range(1, max_units + 1))
    positive_units_probs = zipfian_distribution(len(positive_units), 0.6)
    positive_units_cdf = cdf(positive_units_probs)

    # For use with letters e.g. A0 less common
    positive_units_letters = list(range(1, max_units + 1)) + [0]
    positive_units_letters_probs = zipfian_distribution(len(positive_units_letters), 0.6)
    positive_units_letters_cdf = cdf(positive_units_letters_probs)

    RESIDENTIAL = 'residential'
    COMMERCIAL = 'commercial'
    INDUSTRIAL = 'industrial'
    UNIVERSITY = 'university'

    @classmethod
    def sample_num_digits(cls):
        """Sample how many digits the unit part of a floor-based number gets."""
        return weighted_choice(cls.num_digits, cls.num_digits_cdf)

    @classmethod
    def for_floor(cls, floor_number, num_digits=None):
        """Return ``floor_number`` followed by a zero-filled random unit number."""
        num_digits = num_digits if num_digits is not None else cls.sample_num_digits()
        unit = weighted_choice(cls.positive_units_floors, cls.positive_units_floors_cdf)
        return six.u('{}{}').format(floor_number, safe_decode(unit).zfill(num_digits))

    @classmethod
    def random(cls, language, country=None, num_floors=None, num_basements=None, floor=None):
        """Generate a random unit identifier for the language/country.

        :param num_floors: number of floors in the building, if known
        :param num_basements: number of basements, if known
        :param floor: the floor the unit is on, if known
        :return: the identifier, or ``None`` when no alphanumeric type is
            configured or the sampled type is not generated here
        """
        num_type, num_type_props = cls.choose_alphanumeric_type('units.alphanumeric', language, country=country)
        if num_type is None:
            return None

        use_floor_prob = address_config.get_property('units.alphanumeric.use_floor_probability', language, country=country, default=0.0)

        use_positive_numbers_prob = address_config.get_property('units.alphanumeric.use_positive_numbers_probability', language, country=country, default=0.0)

        if (num_floors is None and floor is None) or random.random() >= use_floor_prob:
            if random.random() >= use_positive_numbers_prob:
                number = weighted_choice(cls.numbered_units, cls.unit_probs_cdf)
            else:
                number = weighted_choice(cls.positive_units, cls.positive_units_cdf)
        else:
            # Decode first so a non-text floor does not break .isdigit()
            if floor is None or not safe_decode(floor).isdigit():
                floor = Floor.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)

            floor_numbering_starts_at = address_config.get_property('levels.numbering_starts_at', language, country=country, default=0)
            ground_floor_starts_at = address_config.get_property('units.alphanumeric.use_floor_ground_starts_at', language, country=country, default=None)

            if ground_floor_starts_at is not None:
                try:
                    floor = int(floor)
                    if floor >= floor_numbering_starts_at:
                        floor -= floor_numbering_starts_at
                    floor += ground_floor_starts_at
                    floor = safe_decode(floor)
                except (TypeError, ValueError):
                    # Non-integer floor: leave it as it is
                    pass

            use_floor_affix_prob = address_config.get_property('units.alphanumeric.use_floor_numeric_affix_probability', language, country=country, default=0.0)
            if use_floor_affix_prob and random.random() < use_floor_affix_prob:
                floor_phrase = Floor.phrase(floor, language, country=country)
                # Only works if the floor phrase is strictly numeric e.g. "1" or "H1"
                if is_numeric_strict(floor_phrase):
                    unit = weighted_choice(cls.positive_units, cls.positive_units_cdf)

                    unit_num_digits = address_config.get_property('units.alphanumeric.use_floor_unit_num_digits', language, country=country, default=None)
                    if unit_num_digits is not None:
                        unit = safe_decode(unit).zfill(unit_num_digits)

                    return six.u('{}{}').format(floor_phrase, unit)

            floor_num_digits = address_config.get_property('units.alphanumeric.use_floor_floor_num_digits', language, country=country, default=None)
            if floor_num_digits is not None:
                floor_text = safe_decode(floor)
                if floor_text.isdigit():
                    floor = floor_text.zfill(floor_num_digits)

            number = cls.for_floor(floor)

        if num_type == cls.NUMERIC:
            return safe_decode(number)
        elif num_type == cls.HYPHENATED_NUMBER:
            number2 = weighted_choice(cls.positive_units, cls.positive_units_cdf)
            range_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.range_probability', language, country=country, default=0.5))
            direction = address_config.get_property('units.alphanumeric.hyphenated_number.direction', language, country=country, default='right')
            direction_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.direction_probability', language, country=country, default=0.0))

            if random.random() < direction_prob:
                direction = 'left' if direction == 'right' else 'right'

            direction_right = direction == 'right'

            if random.random() < range_prob:
                # ``number`` is text when it was built by for_floor(), so
                # convert before doing arithmetic; skip the range adjustment
                # if it is not an integer
                try:
                    base = int(number)
                except (TypeError, ValueError):
                    base = None

                if base is not None:
                    if direction_right:
                        number2 += base
                    else:
                        number2 = max(0, base - number2)

            if direction == 'right':
                return u'{}-{}'.format(number, number2)
            else:
                return u'{}-{}'.format(number2, number)
        else:
            alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
            alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
            if alphabet_probability is not None and random.random() >= alphabet_probability:
                alphabet = latin_alphabet
            letter = sample_alphabet(alphabet)
            if num_type == cls.ALPHA:
                return safe_decode(letter)
            else:
                if num_floors is None:
                    number = weighted_choice(cls.positive_units_letters, cls.positive_units_letters_cdf)

                whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
                hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
                whitespace_phrase = u''
                # One roll decides between a space, a hyphen, or no separator
                r = random.random()
                if r < whitespace_probability:
                    whitespace_phrase = u' '
                elif r < (whitespace_probability + hyphen_probability):
                    whitespace_phrase = u'-'

                if num_type == cls.ALPHA_PLUS_NUMERIC:
                    return six.u('{}{}{}').format(letter, whitespace_phrase, number)
                elif num_type == cls.NUMERIC_PLUS_ALPHA:
                    return six.u('{}{}{}').format(number, whitespace_phrase, letter)

        # Other alphanumeric types are not generated for units
        return None

    @classmethod
    def add_direction(cls, key, unit, language, country=None):
        """Possibly replace ``unit`` with a relative-direction phrase.

        :return: ``unit`` unchanged when the probability roll fails; a
            ``RelativeDirection`` phrase when one applies; ``None`` when the
            roll succeeds but the config does not enable a direction for
            this kind of unit.
        """
        add_direction_probability = address_config.get_property('{}.add_direction_probability'.format(key),
                                                                language, country=country, default=0.0)
        if not random.random() < add_direction_probability:
            return unit
        add_direction_numeric = address_config.get_property('{}.add_direction_numeric'.format(key),
                                                            language, country=country)
        try:
            unit = int(unit)
            integer_unit = True
        except (ValueError, TypeError):
            integer_unit = False

        if add_direction_numeric and integer_unit:
            return RelativeDirection.phrase(unit, language, country=country)
        elif not integer_unit:
            add_direction_standalone = address_config.get_property('{}.add_direction_standalone'.format(key),
                                                                   language, country=country)
            if add_direction_standalone:
                return RelativeDirection.phrase(None, language, country=country)

        return None

    @classmethod
    def add_quadrant(cls, key, unit, language, country=None):
        """Possibly combine ``unit`` with a lateral and an anteroposterior direction.

        :return: ``unit`` unchanged when the probability roll fails; ``unit``
            (converted to int when it is an integer) when no first direction
            is configured; ``None`` for a non-integer unit when standalone
            quadrants are not enabled; otherwise the combined phrase.
        """
        add_quadrant_probability = address_config.get_property('{}.add_quadrant_probability'.format(key),
                                                               language, country=country, default=0.0)
        if not random.random() < add_quadrant_probability:
            return unit
        # NOTE(review): this value is read but never used below - confirm
        # whether integer units were meant to be gated on it.
        add_quadrant_numeric = address_config.get_property('{}.add_quadrant_numeric'.format(key),
                                                           language, country=country)
        try:
            unit = int(unit)
            integer_unit = True
        except (ValueError, TypeError):
            integer_unit = False

        first_direction = address_config.get_property('{}.add_quadrant_first_direction'.format(key),
                                                      language, country=country)

        if first_direction == 'lateral':
            ordering = (LateralDirection, AnteroposteriorDirection)
        elif first_direction == 'anteroposterior':
            ordering = (AnteroposteriorDirection, LateralDirection)
        else:
            return unit

        if not integer_unit:
            add_quadrant_standalone = address_config.get_property('{}.add_quadrant_standalone'.format(key),
                                                                  language, country=country)
            if add_quadrant_standalone:
                unit = None
            else:
                return None

        last_num_type = None
        for c in ordering:
            num_type, phrase, props = c.pick_phrase_and_type(unit, language, country=country)
            whitespace_default = num_type == c.NUMERIC or last_num_type == c.NUMERIC
            unit = c.combine_with_number(unit, phrase, num_type, props, whitespace_default=whitespace_default)
            last_num_type = num_type

        return unit

    @classmethod
    def phrase(cls, unit, language, country=None, zone=None):
        """Render ``unit`` with a unit phrase, or a standalone phrase when it is None.

        :param unit: the unit identifier, or ``None`` for a standalone phrase
        :param zone: optional zone name selecting the ``units.zones.<zone>``
            config instead of ``units.alphanumeric``
        :return: the rendered phrase, or ``None`` when nothing is configured
        """
        if unit is not None:
            key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)

            if not address_config.get_property(key, language, country=country):
                return None

            is_alpha = safe_decode(unit).isalpha()

            direction_unit = None
            add_direction = address_config.get_property('{}.add_direction'.format(key), language, country=country)
            if add_direction:
                direction_unit = cls.add_direction(key, unit, language, country=country)

            if direction_unit and direction_unit != unit:
                unit = direction_unit
                is_alpha = False
            else:
                add_quadrant = address_config.get_property('{}.add_quadrant'.format(key), language, country=country)
                if add_quadrant:
                    quadrant_unit = cls.add_quadrant(key, unit, language, country=country)
                    # add_quadrant() returns None when no quadrant applies;
                    # keep the original unit rather than decoding None
                    if quadrant_unit is not None:
                        unit = quadrant_unit
                        is_alpha = False

            return cls.numeric_phrase(key, safe_decode(unit), language,
                                      dictionaries=['unit_types_numbered'], country=country, is_alpha=is_alpha)
        else:
            key = 'units.standalone'
            values, probs = address_config.alternative_probabilities(key, language,
                                                                     dictionaries=['unit_types_standalone'],
                                                                     country=country)
            # Covers both a missing config and an empty list of alternatives
            if not values:
                return None
            phrase, phrase_props = weighted_choice(values, probs)
            return phrase.title()
|
||||
Reference in New Issue
Block a user