From ce381134fbd191e31a690d7652473aea559f6e40 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 20 May 2016 16:00:04 -0400
Subject: [PATCH] [categories/chains] Reusing category config in chain queries

---
Review notes (commentary only, not part of the commit message):
- Chain.phrase returned CategoryQuery on the null-preposition path, but this
  patch removes the `from geodata.categories.query import *` line that
  provided that name. It now returns ChainQuery like the other paths.
- CategoryPreposition.random returns None when no `*_probability` keys are
  configured instead of sampling from empty lists.
- Docstrings added to the two new methods; the diffstat and hunk counts
  below were updated to match. Blob ids on the `index` lines were not
  regenerated.
- Line breaks and indentation were restored from a whitespace-collapsed
  copy; confirm the indentation of context lines before applying.

 scripts/geodata/categories/config.py      |  3 ++
 scripts/geodata/categories/preposition.py | 39 +++++++++++++++++++++++++
 scripts/geodata/categories/query.py       | 12 ++++++--
 scripts/geodata/chains/query.py           | 40 ++++++++++++++++++++++++-
 4 files changed, 90 insertions(+), 4 deletions(-)
 create mode 100644 scripts/geodata/categories/preposition.py

diff --git a/scripts/geodata/categories/config.py b/scripts/geodata/categories/config.py
index b59e7b1d..4a8f91cb 100644
--- a/scripts/geodata/categories/config.py
+++ b/scripts/geodata/categories/config.py
@@ -20,6 +20,8 @@ class CategoryConfig(object):
         self.language_categories_singular = {}
         self.language_categories_plural = {}
 
+        self.property_names = set()
+
         if not os.path.exists(base_dir):
             raise RuntimeError('{} does not exist'.format(base_dir))
 
@@ -37,6 +39,7 @@ class CategoryConfig(object):
             reader.next() # headers
 
             for key, value, is_plural, phrase in reader:
+                self.property_names.add(key)
                 is_plural = bool(int(is_plural))
                 if is_plural:
                     plural_rules[(key, value)].append(phrase)
diff --git a/scripts/geodata/categories/preposition.py b/scripts/geodata/categories/preposition.py
new file mode 100644
index 00000000..37d2b47d
--- /dev/null
+++ b/scripts/geodata/categories/preposition.py
@@ -0,0 +1,39 @@
+from geodata.addresses.config import address_config
+from geodata.categories.config import category_config
+from geodata.math.sampling import weighted_choice, cdf
+
+
+class CategoryPreposition(object):
+    NEAR = 'near'
+    NEARBY = 'nearby'
+    NEAR_ME = 'near_me'
+    IN = 'in'
+    NULL = 'null'
+
+    @classmethod
+    def random(cls, language, country=None):
+        """Pick a preposition phrase type for the given language/country.
+
+        Returns one of the class constants, or None when there is no
+        'categories' config or it defines no `<type>_probability` keys.
+        """
+        category_props = address_config.get_property('categories', language, country=country)
+        if category_props is None:
+            return None
+
+        values = []
+        probs = []
+
+        for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
+            k = '{}_probability'.format(prep_phrase_type)
+            prob = category_props.get(k, None)
+            if prob is not None:
+                values.append(prep_phrase_type)
+                probs.append(prob)
+
+        if not values:
+            return None
+
+        probs = cdf(probs)
+
+        return weighted_choice(values, probs)
diff --git a/scripts/geodata/categories/query.py b/scripts/geodata/categories/query.py
index 0641a6a4..840f40c8 100644
--- a/scripts/geodata/categories/query.py
+++ b/scripts/geodata/categories/query.py
@@ -14,6 +14,12 @@ NULL_CATEGORY_QUERY = CategoryQuery(None, None, False)
 
 
 class Category(object):
+    NEAR = 'near'
+    NEARBY = 'nearby'
+    NEAR_ME = 'near_me'
+    IN = 'in'
+    NULL = 'null'
+
     @classmethod
     def phrase(cls, language, key, value, is_plural=False, country=None):
         category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
@@ -29,7 +35,7 @@ class Category(object):
         values = []
         probs = []
 
-        for prep_phrase_type in ('near', 'nearby', 'near_me', 'in', 'null'):
+        for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
             k = '{}_probability'.format(prep_phrase_type)
             prob = category_props.get(k, None)
             if prob is not None:
@@ -40,7 +46,7 @@ class Category(object):
 
         prep_phrase_type = weighted_choice(values, probs)
 
-        if prep_phrase_type == 'null':
+        if prep_phrase_type == cls.NULL:
             return CategoryQuery(category_phrase, prep=None, add_place_name=True)
 
         values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
@@ -50,6 +56,6 @@ class Category(object):
         prep_phrase, prep_phrase_props = weighted_choice(values, probs)
         prep_phrase = safe_decode(prep_phrase)
 
-        add_place_name = prep_phrase_type not in ('nearby', 'near_me')
+        add_place_name = prep_phrase_type not in (cls.NEARBY, cls.NEAR_ME)
 
         return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name)
diff --git a/scripts/geodata/chains/query.py b/scripts/geodata/chains/query.py
index 07faa0d4..75656b9b 100644
--- a/scripts/geodata/chains/query.py
+++ b/scripts/geodata/chains/query.py
@@ -1,10 +1,20 @@
 import random
+import six
+
+from collections import namedtuple
 
 from geodata.addresses.config import address_config
 from geodata.address_expansions.gazetteers import chains_gazetteer
-from geodata.categories.query import *
+from geodata.categories.config import category_config
+from geodata.categories.preposition import CategoryPreposition
+from geodata.math.sampling import weighted_choice, cdf
 from geodata.text.normalize import normalized_tokens
 from geodata.text.tokenize import tokenize, token_types
+from geodata.encoding import safe_decode
+
+ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name')
+
+NULL_CHAIN_QUERY = ChainQuery(None, None, False)
 
 
 class Chain(object):
@@ -64,3 +74,31 @@ class Chain(object):
         if not choices:
             return canonical
         return random.choice(choices)
+
+    @classmethod
+    def phrase(cls, chain, language, country=None):
+        """Build a ChainQuery for a chain name plus a sampled preposition.
+
+        Returns NULL_CHAIN_QUERY for an empty chain. Preposition phrases
+        are read from the 'categories' section of the address config.
+        """
+        if not chain:
+            return NULL_CHAIN_QUERY
+
+        chain_phrase = safe_decode(chain)
+
+        prep_phrase_type = CategoryPreposition.random(language, country=country)
+
+        if prep_phrase_type in (None, CategoryPreposition.NULL):
+            return ChainQuery(chain_phrase, prep=None, add_place_name=True)
+
+        values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
+        if not values:
+            return ChainQuery(chain_phrase, prep=None, add_place_name=True)
+
+        prep_phrase, prep_phrase_props = weighted_choice(values, probs)
+        prep_phrase = safe_decode(prep_phrase)
+
+        add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
+
+        return ChainQuery(chain_phrase, prep=prep_phrase, add_place_name=add_place_name)