From ce381134fbd191e31a690d7652473aea559f6e40 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Fri, 20 May 2016 16:00:04 -0400
Subject: [PATCH] [categories/chains] Reusing category config in chain queries

---
 scripts/geodata/categories/config.py      |  3 ++
 scripts/geodata/categories/preposition.py | 31 ++++++++++++++++++++
 scripts/geodata/categories/query.py       | 12 ++++++--
 scripts/geodata/chains/query.py           | 35 ++++++++++++++++++++++-
 4 files changed, 77 insertions(+), 4 deletions(-)
 create mode 100644 scripts/geodata/categories/preposition.py

diff --git a/scripts/geodata/categories/config.py b/scripts/geodata/categories/config.py
index b59e7b1d..4a8f91cb 100644
--- a/scripts/geodata/categories/config.py
+++ b/scripts/geodata/categories/config.py
@@ -20,6 +20,8 @@ class CategoryConfig(object):
         self.language_categories_singular = {}
         self.language_categories_plural = {}
 
+        self.property_names = set()
+
         if not os.path.exists(base_dir):
             raise RuntimeError('{} does not exist'.format(base_dir))
 
@@ -37,6 +39,7 @@ class CategoryConfig(object):
             reader.next()  # headers
 
             for key, value, is_plural, phrase in reader:
+                self.property_names.add(key)
                 is_plural = bool(int(is_plural))
                 if is_plural:
                     plural_rules[(key, value)].append(phrase)
diff --git a/scripts/geodata/categories/preposition.py b/scripts/geodata/categories/preposition.py
new file mode 100644
index 00000000..37d2b47d
--- /dev/null
+++ b/scripts/geodata/categories/preposition.py
@@ -0,0 +1,31 @@
+from geodata.addresses.config import address_config
+from geodata.categories.config import category_config
+from geodata.math.sampling import weighted_choice, cdf
+
+
+class CategoryPreposition(object):
+    """Preposition phrase types that can be sampled for category/chain queries."""
+    NEAR = 'near'
+    NEARBY = 'nearby'
+    NEAR_ME = 'near_me'
+    IN = 'in'
+    NULL = 'null'
+
+    @classmethod
+    def random(cls, language, country=None):
+        """Sample one phrase type, weighted by its '<type>_probability' config value.
+
+        Returns None when there is no 'categories' config for the language/country
+        or when none of the phrase types has a probability configured.
+        """
+        category_props = address_config.get_property('categories', language, country=country)
+        if category_props is None:
+            return None
+
+        values, probs = [], []
+        for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
+            prob = category_props.get('{}_probability'.format(prep_phrase_type), None)
+            if prob is not None:
+                values.append(prep_phrase_type)
+                probs.append(prob)
+        return weighted_choice(values, cdf(probs)) if values else None
diff --git a/scripts/geodata/categories/query.py b/scripts/geodata/categories/query.py
index 0641a6a4..840f40c8 100644
--- a/scripts/geodata/categories/query.py
+++ b/scripts/geodata/categories/query.py
@@ -14,6 +14,12 @@ NULL_CATEGORY_QUERY = CategoryQuery(None, None, False)
 
 
 class Category(object):
+    NEAR = 'near'
+    NEARBY = 'nearby'
+    NEAR_ME = 'near_me'
+    IN = 'in'
+    NULL = 'null'
+
     @classmethod
     def phrase(cls, language, key, value, is_plural=False, country=None):
         category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
@@ -29,7 +35,7 @@ class Category(object):
         values = []
         probs = []
 
-        for prep_phrase_type in ('near', 'nearby', 'near_me', 'in', 'null'):
+        for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
             k = '{}_probability'.format(prep_phrase_type)
             prob = category_props.get(k, None)
             if prob is not None:
@@ -40,7 +46,7 @@ class Category(object):
 
         prep_phrase_type = weighted_choice(values, probs)
 
-        if prep_phrase_type == 'null':
+        if prep_phrase_type == cls.NULL:
             return CategoryQuery(category_phrase, prep=None, add_place_name=True)
 
         values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
@@ -50,6 +56,6 @@ class Category(object):
         prep_phrase, prep_phrase_props = weighted_choice(values, probs)
         prep_phrase = safe_decode(prep_phrase)
 
-        add_place_name = prep_phrase_type not in ('nearby', 'near_me')
+        add_place_name = prep_phrase_type not in (cls.NEARBY, cls.NEAR_ME)
 
         return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name)
diff --git a/scripts/geodata/chains/query.py b/scripts/geodata/chains/query.py
index 07faa0d4..75656b9b 100644
--- a/scripts/geodata/chains/query.py
+++ b/scripts/geodata/chains/query.py
@@ -1,10 +1,20 @@
 import random
+import six
+
+from collections import namedtuple
 
 from geodata.addresses.config import address_config
 from geodata.address_expansions.gazetteers import chains_gazetteer
-from geodata.categories.query import *
+from geodata.categories.config import category_config
+from geodata.categories.preposition import CategoryPreposition
+from geodata.math.sampling import weighted_choice, cdf
 from geodata.text.normalize import normalized_tokens
 from geodata.text.tokenize import tokenize, token_types
+from geodata.encoding import safe_decode
+
+ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name')
+# Empty result: no chain name, no preposition phrase, add_place_name False.
+NULL_CHAIN_QUERY = ChainQuery(None, None, False)
 
 
 class Chain(object):
@@ -64,3 +74,26 @@ class Chain(object):
         if not choices:
             return canonical
         return random.choice(choices)
+
+    @classmethod
+    def phrase(cls, chain, language, country=None):
+        """Build a ChainQuery for `chain`, sampling an optional preposition phrase.
+
+        Returns NULL_CHAIN_QUERY for a falsy `chain`; `prep` is None when no phrase type or phrase is available.
+        """
+        if not chain:
+            return NULL_CHAIN_QUERY
+        chain_phrase = safe_decode(chain)
+        prep_phrase_type = CategoryPreposition.random(language, country=country)
+        if prep_phrase_type in (None, CategoryPreposition.NULL):
+            return ChainQuery(chain_phrase, prep=None, add_place_name=True)
+
+        values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
+        if not values:
+            return ChainQuery(chain_phrase, prep=None, add_place_name=True)
+
+        prep_phrase, _ = weighted_choice(values, probs)
+        prep_phrase = safe_decode(prep_phrase)
+        # NEARBY / NEAR_ME get add_place_name=False (presumably they already imply the location).
+        add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
+        return ChainQuery(chain_phrase, prep=prep_phrase, add_place_name=add_place_name)