diff --git a/scripts/geodata/text/phrases.py b/scripts/geodata/text/phrases.py new file mode 100644 index 00000000..eb9266ec --- /dev/null +++ b/scripts/geodata/text/phrases.py @@ -0,0 +1,74 @@ +import ujson as json + +from collections import * +from marisa_trie import BytesTrie + +SENTINEL = None + + +class PhraseFilter(object): + def __init__(self): + self.configured = False + + def configure(self, *args, **kw): + pass + + serialize = json.dumps + deserialize = json.loads + + def filter(self, tokens): + def return_item(item): + return False, item, [] + + if not tokens: + return + + ent = [] + ent_tokens = [] + + queue = deque(tokens + [(SENTINEL,) * 2]) + skip_until = 0 + + trie = self.trie + + while queue: + item = queue.popleft() + t, c = item + + if t is not SENTINEL and trie.has_keys_with_prefix(u' '.join(ent_tokens + [t])): + ent.append(item) + ent_tokens.append(item[0]) + elif ent_tokens: + res = trie.get(u' '.join(ent_tokens)) or None + if res is not None: + yield (True, ent, map(self.deserialize, res)) + queue.appendleft(item) + ent = [] + ent_tokens = [] + elif len(ent_tokens) == 1: + yield return_item(ent[0]) + ent = [] + ent_tokens = [] + queue.appendleft(item) + else: + have_phrase = False + + for i in xrange(len(ent) - 1, 0, -1): + remainder = ent[i:] + res = trie.get(u' '.join([e[0] for e in ent[:i]])) or None + if res is not None: + yield (True, ent[:i], map(self.deserialize, res)) + have_phrase = True + break + + if not have_phrase: + yield return_item(ent[0]) + + todos = list(remainder) + todos.append(item) + queue.extendleft(reversed(todos)) + + ent = [] + ent_tokens = [] + elif t is not SENTINEL: + yield return_item(item)