[phrases] Adding Python phrase filter from address_normalizer until a Python wrapper around libpostal's trie_search is available
This commit is contained in:
74
scripts/geodata/text/phrases.py
Normal file
74
scripts/geodata/text/phrases.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import ujson as json
|
||||
|
||||
from collections import *
|
||||
from marisa_trie import BytesTrie
|
||||
|
||||
# End-of-input marker: PhraseFilter.filter appends a (SENTINEL, SENTINEL)
# pseudo-token to its work queue so a phrase still pending when the real
# tokens run out gets flushed. Because the marker is None, a real token whose
# text is None is indistinguishable from it.
SENTINEL = None
|
||||
|
||||
|
||||
class PhraseFilter(object):
|
||||
def __init__(self):
|
||||
self.configured = False
|
||||
|
||||
def configure(self, *args, **kw):
|
||||
pass
|
||||
|
||||
serialize = json.dumps
|
||||
deserialize = json.loads
|
||||
|
||||
def filter(self, tokens):
|
||||
def return_item(item):
|
||||
return False, item, []
|
||||
|
||||
if not tokens:
|
||||
return
|
||||
|
||||
ent = []
|
||||
ent_tokens = []
|
||||
|
||||
queue = deque(tokens + [(SENTINEL,) * 2])
|
||||
skip_until = 0
|
||||
|
||||
trie = self.trie
|
||||
|
||||
while queue:
|
||||
item = queue.popleft()
|
||||
t, c = item
|
||||
|
||||
if t is not SENTINEL and trie.has_keys_with_prefix(u' '.join(ent_tokens + [t])):
|
||||
ent.append(item)
|
||||
ent_tokens.append(item[0])
|
||||
elif ent_tokens:
|
||||
res = trie.get(u' '.join(ent_tokens)) or None
|
||||
if res is not None:
|
||||
yield (True, ent, map(self.deserialize, res))
|
||||
queue.appendleft(item)
|
||||
ent = []
|
||||
ent_tokens = []
|
||||
elif len(ent_tokens) == 1:
|
||||
yield return_item(ent[0])
|
||||
ent = []
|
||||
ent_tokens = []
|
||||
queue.appendleft(item)
|
||||
else:
|
||||
have_phrase = False
|
||||
|
||||
for i in xrange(len(ent) - 1, 0, -1):
|
||||
remainder = ent[i:]
|
||||
res = trie.get(u' '.join([e[0] for e in ent[:i]])) or None
|
||||
if res is not None:
|
||||
yield (True, ent[:i], map(self.deserialize, res))
|
||||
have_phrase = True
|
||||
break
|
||||
|
||||
if not have_phrase:
|
||||
yield return_item(ent[0])
|
||||
|
||||
todos = list(remainder)
|
||||
todos.append(item)
|
||||
queue.extendleft(reversed(todos))
|
||||
|
||||
ent = []
|
||||
ent_tokens = []
|
||||
elif t is not SENTINEL:
|
||||
yield return_item(item)
|
||||
Reference in New Issue
Block a user