[categories/chains] Reusing category config in chain queries
This commit is contained in:
@@ -20,6 +20,8 @@ class CategoryConfig(object):
|
|||||||
self.language_categories_singular = {}
|
self.language_categories_singular = {}
|
||||||
self.language_categories_plural = {}
|
self.language_categories_plural = {}
|
||||||
|
|
||||||
|
self.property_names = set()
|
||||||
|
|
||||||
if not os.path.exists(base_dir):
|
if not os.path.exists(base_dir):
|
||||||
raise RuntimeError('{} does not exist'.format(base_dir))
|
raise RuntimeError('{} does not exist'.format(base_dir))
|
||||||
|
|
||||||
@@ -37,6 +39,7 @@ class CategoryConfig(object):
|
|||||||
reader.next() # headers
|
reader.next() # headers
|
||||||
|
|
||||||
for key, value, is_plural, phrase in reader:
|
for key, value, is_plural, phrase in reader:
|
||||||
|
self.property_names.add(key)
|
||||||
is_plural = bool(int(is_plural))
|
is_plural = bool(int(is_plural))
|
||||||
if is_plural:
|
if is_plural:
|
||||||
plural_rules[(key, value)].append(phrase)
|
plural_rules[(key, value)].append(phrase)
|
||||||
|
|||||||
31
scripts/geodata/categories/preposition.py
Normal file
31
scripts/geodata/categories/preposition.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from geodata.addresses.config import address_config
|
||||||
|
from geodata.categories.config import category_config
|
||||||
|
from geodata.math.sampling import weighted_choice, cdf
|
||||||
|
|
||||||
|
|
||||||
|
class CategoryPreposition(object):
    """Names of the prepositional phrase types usable in category queries.

    Each constant is the prefix of a '<type>_probability' key looked up in
    the 'categories' address config section.
    """
    NEAR = 'near'
    NEARBY = 'nearby'
    NEAR_ME = 'near_me'
    IN = 'in'
    NULL = 'null'

    @classmethod
    def random(cls, language, country=None):
        """Randomly pick a preposition phrase type for a language/country.

        Only the types that have a '<type>_probability' entry in the
        'categories' config participate, weighted by that entry.

        :param language: language passed through to address_config
        :param country: optional country passed through to address_config
        :return: one of the class constants, or None when there is no
                 'categories' config or it defines no probabilities
        """
        category_props = address_config.get_property('categories', language, country=country)
        if category_props is None:
            return None

        values = []
        probs = []

        for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
            k = '{}_probability'.format(prep_phrase_type)
            prob = category_props.get(k, None)
            if prob is not None:
                values.append(prep_phrase_type)
                probs.append(prob)

        # Nothing configured: there is no distribution to sample from, so
        # report "no preposition type" the same way as a missing config
        # instead of handing empty sequences to cdf/weighted_choice.
        if not values:
            return None

        probs = cdf(probs)

        return weighted_choice(values, probs)
|
||||||
@@ -14,6 +14,12 @@ NULL_CATEGORY_QUERY = CategoryQuery(None, None, False)
|
|||||||
|
|
||||||
|
|
||||||
class Category(object):
|
class Category(object):
|
||||||
|
NEAR = 'near'
|
||||||
|
NEARBY = 'nearby'
|
||||||
|
NEAR_ME = 'near_me'
|
||||||
|
IN = 'in'
|
||||||
|
NULL = 'null'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def phrase(cls, language, key, value, is_plural=False, country=None):
|
def phrase(cls, language, key, value, is_plural=False, country=None):
|
||||||
category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
|
category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
|
||||||
@@ -29,7 +35,7 @@ class Category(object):
|
|||||||
values = []
|
values = []
|
||||||
probs = []
|
probs = []
|
||||||
|
|
||||||
for prep_phrase_type in ('near', 'nearby', 'near_me', 'in', 'null'):
|
for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
|
||||||
k = '{}_probability'.format(prep_phrase_type)
|
k = '{}_probability'.format(prep_phrase_type)
|
||||||
prob = category_props.get(k, None)
|
prob = category_props.get(k, None)
|
||||||
if prob is not None:
|
if prob is not None:
|
||||||
@@ -40,7 +46,7 @@ class Category(object):
|
|||||||
|
|
||||||
prep_phrase_type = weighted_choice(values, probs)
|
prep_phrase_type = weighted_choice(values, probs)
|
||||||
|
|
||||||
if prep_phrase_type == 'null':
|
if prep_phrase_type == cls.NULL:
|
||||||
return CategoryQuery(category_phrase, prep=None, add_place_name=True)
|
return CategoryQuery(category_phrase, prep=None, add_place_name=True)
|
||||||
|
|
||||||
values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
|
values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
|
||||||
@@ -50,6 +56,6 @@ class Category(object):
|
|||||||
prep_phrase, prep_phrase_props = weighted_choice(values, probs)
|
prep_phrase, prep_phrase_props = weighted_choice(values, probs)
|
||||||
prep_phrase = safe_decode(prep_phrase)
|
prep_phrase = safe_decode(prep_phrase)
|
||||||
|
|
||||||
add_place_name = prep_phrase_type not in ('nearby', 'near_me')
|
add_place_name = prep_phrase_type not in (cls.NEARBY, cls.NEAR_ME)
|
||||||
|
|
||||||
return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name)
|
return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name)
|
||||||
|
|||||||
@@ -1,10 +1,20 @@
|
|||||||
import random
|
import random
|
||||||
|
import six
|
||||||
|
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
from geodata.addresses.config import address_config
|
from geodata.addresses.config import address_config
|
||||||
from geodata.address_expansions.gazetteers import chains_gazetteer
|
from geodata.address_expansions.gazetteers import chains_gazetteer
|
||||||
from geodata.categories.query import *
|
from geodata.categories.config import category_config
|
||||||
|
from geodata.categories.preposition import CategoryPreposition
|
||||||
|
from geodata.math.sampling import weighted_choice, cdf
|
||||||
from geodata.text.normalize import normalized_tokens
|
from geodata.text.normalize import normalized_tokens
|
||||||
from geodata.text.tokenize import tokenize, token_types
|
from geodata.text.tokenize import tokenize, token_types
|
||||||
|
from geodata.encoding import safe_decode
|
||||||
|
|
||||||
|
# Result record for a chain query: the chain name, the chosen preposition
# phrase (or None) and whether a place name should be added.
ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name')

# Query value used when no chain is supplied.
NULL_CHAIN_QUERY = ChainQuery(name=None, prep=None, add_place_name=False)
|
||||||
|
|
||||||
|
|
||||||
class Chain(object):
|
class Chain(object):
|
||||||
@@ -64,3 +74,26 @@ class Chain(object):
|
|||||||
if not choices:
|
if not choices:
|
||||||
return canonical
|
return canonical
|
||||||
return random.choice(choices)
|
return random.choice(choices)
|
||||||
|
|
||||||
|
@classmethod
def phrase(cls, chain, language, country=None):
    """Build a ChainQuery for a chain name with a random preposition.

    :param chain: chain name; a falsy value yields NULL_CHAIN_QUERY
    :param language: language used for the preposition config lookups
    :param country: optional country used for the config lookups
    :return: a ChainQuery (name, prep, add_place_name)
    """
    if not chain:
        return NULL_CHAIN_QUERY

    chain_phrase = safe_decode(chain)

    prep_phrase_type = CategoryPreposition.random(language, country=country)

    # No preposition type (or the explicit "null" type): the query is the
    # bare chain name. This must be a ChainQuery like every other branch;
    # CategoryQuery is not imported in this module.
    if prep_phrase_type in (None, CategoryPreposition.NULL):
        return ChainQuery(chain_phrase, prep=None, add_place_name=True)

    values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
    if not values:
        return ChainQuery(chain_phrase, prep=None, add_place_name=True)

    # The per-phrase properties returned alongside the phrase are not used.
    prep_phrase, _ = weighted_choice(values, probs)
    prep_phrase = safe_decode(prep_phrase)

    # "nearby" / "near me" phrases are not followed by a place name.
    add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)

    return ChainQuery(chain_phrase, prep=prep_phrase, add_place_name=add_place_name)
|
||||||
|
|||||||
Reference in New Issue
Block a user