diff --git a/scripts/geodata/addresses/config.py b/scripts/geodata/addresses/config.py index 6d423024..a5e743e7 100644 --- a/scripts/geodata/addresses/config.py +++ b/scripts/geodata/addresses/config.py @@ -6,8 +6,8 @@ import yaml from collections import Mapping -from geodata.addresses.sampling import cdf, check_probability_distribution from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries +from geodata.math.sampling import cdf, check_probability_distribution this_dir = os.path.realpath(os.path.dirname(__file__)) @@ -54,7 +54,7 @@ class AddressConfig(object): self.cache = {} for filename in os.listdir(config_dir): - if filename != 'en.yaml': + if filename not in ('en.yaml', 'es.yaml'): continue config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename))) @@ -100,7 +100,10 @@ class AddressConfig(object): '''Get a probability distribution over alternatives''' key = self.cache_key(prop, language, dictionaries, country=country) if key not in self.cache: - properties = self.get_property(prop, language, country=country) + properties = self.get_property(prop, language, country=country, default=None) + + if properties is None: + return None, None probs = [] alternatives = [] @@ -118,10 +121,21 @@ class AddressConfig(object): probs.extend([prob * p for p in phrase_probs]) alternatives.extend([(p, props) for p in phrases]) + sample_probability = properties.get('sample_probability') + if sample_probability is not None: + sample_phrases = [] + for dictionary in dictionaries: + phrases = self.sample_phrases.get((language, dictionary), []) + for canonical, surface_forms in six.iteritems(phrases): + sample_phrases.append(canonical) + sample_phrases.extend(surface_forms) + # Note: use the outer properties dictionary e.g. units.alphanumeric + alternatives.extend([(p, properties) for p in sample_phrases]) + probs.extend([float(sample_probability) / len(sample_phrases)] * len(sample_phrases)) + alts = properties.get('alternatives', []) - total_before_alts = 0.0 for alt in alts: - prob = alt.get('probability', (1.0 - total_before_alts) / len(alts)) + prob = alt.get('probability', 1.0 / len(alts)) props = alt['alternative'] phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries) probs.extend([prob * p for p in phrase_probs]) @@ -130,7 +144,7 @@ class AddressConfig(object): try: check_probability_distribution(probs) except AssertionError: - print 'values where: {}'.format(alternatives) + print 'values were: {}'.format(alternatives) raise probs_cdf = cdf(probs) diff --git a/scripts/geodata/addresses/conjunctions.py b/scripts/geodata/addresses/conjunctions.py index 38b768e3..75089ec9 100644 --- a/scripts/geodata/addresses/conjunctions.py +++ b/scripts/geodata/addresses/conjunctions.py @@ -1,7 +1,7 @@ import six from geodata.addresses.config import address_config -from geodata.addresses.sampling import weighted_choice from geodata.encoding import safe_decode +from geodata.math.sampling import weighted_choice class Conjunction(object): diff --git a/scripts/geodata/addresses/directions.py b/scripts/geodata/addresses/directions.py index 72f9b31e..ee21d82a 100644 --- a/scripts/geodata/addresses/directions.py +++ b/scripts/geodata/addresses/directions.py @@ -1,10 +1,4 @@ -import random -import six - -from geodata.addresses.config import address_config from geodata.addresses.numbering import NumericPhrase -from geodata.addresses.sampling import weighted_choice -from geodata.encoding import safe_decode class RelativeDirection(NumericPhrase): diff --git a/scripts/geodata/addresses/floors.py b/scripts/geodata/addresses/floors.py index 021d1b7b..3a3eb5df 100644 --- a/scripts/geodata/addresses/floors.py +++ b/scripts/geodata/addresses/floors.py @@ -3,8 +3,8 @@ import six from geodata.addresses.config import address_config from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet -from geodata.addresses.sampling import weighted_choice, zipfian_distribution, cdf from geodata.encoding import safe_decode +from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf class Floor(NumberedComponent): @@ -74,11 +74,11 @@ class Floor(NumberedComponent): try: floor = int(floor) integer_floor = True - except ValueError: + except (ValueError, TypeError): try: floor = float(floor) integer_floor = int(floor) == floor - except ValueError: + except (ValueError, TypeError): return cls.numeric_phrase('levels.alphanumeric', safe_decode(floor), language, dictionaries=['level_types_numbered'], country=country) @@ -121,4 +121,4 @@ class Floor(NumberedComponent): country=country) return cls.numeric_phrase('levels.alphanumeric', safe_decode(floor), language, - dictionaries=['level_types_numbered'], country=country) + dictionaries=['level_types_numbered'], country=country) \ No newline at end of file diff --git a/scripts/geodata/addresses/numbering.py b/scripts/geodata/addresses/numbering.py index 36a4098d..b6fdb0f6 100644 --- a/scripts/geodata/addresses/numbering.py +++ b/scripts/geodata/addresses/numbering.py @@ -2,8 +2,8 @@ import random import six from geodata.addresses.config import address_config -from geodata.addresses.sampling import weighted_choice, zipfian_distribution, cdf from geodata.encoding import safe_decode +from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf from geodata.numbers.ordinals import ordinal_expressions @@ -202,6 +202,9 @@ class NumberedComponent(object): elif num_type == 'ordinal': num = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None)) + if random.random() < props.get('null_phrase_probability', 0.0): + return num + direction = props['direction'] whitespace = props.get('whitespace', whitespace_default) diff --git a/scripts/geodata/math/__init__.py b/scripts/geodata/math/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/geodata/addresses/sampling.py b/scripts/geodata/math/sampling.py similarity index 100% rename from scripts/geodata/addresses/sampling.py rename to scripts/geodata/math/sampling.py