[scripts] Adding the tokenize/normalize wrappers directly into the internal geodata package so pypostal can be maintained in an independent repo

This commit is contained in:
Al
2016-01-12 13:26:55 -05:00
parent 622dc354e7
commit 58e53cab1c
10 changed files with 731 additions and 5 deletions

View File

@@ -0,0 +1,14 @@
from geodata.encoding import safe_encode, safe_decode
from geodata.text import _tokenize
from geodata.text.token_types import token_types
def tokenize_raw(s):
    """Run the C tokenizer on *s* and return its raw output.

    The input is decoded to unicode first; the result is whatever
    ``_tokenize.tokenize`` produces (per the sibling ``tokenize``,
    an iterable of ``(start, length, token_type)`` triples).
    """
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)
def tokenize(s):
    """Tokenize *s* and return a list of ``(token_text, token_type)`` pairs.

    The string is tokenized in its unicode form, but the resulting
    ``(start, length)`` offsets are applied to the encoded byte string
    and each slice is decoded back to unicode — presumably the C
    tokenizer reports byte offsets (NOTE(review): confirm against the
    ``_tokenize`` extension). Token type ids are mapped to symbolic
    types via ``token_types.from_id``.
    """
    decoded = safe_decode(s)
    encoded = safe_encode(s)
    result = []
    for start, length, token_type in _tokenize.tokenize(decoded):
        piece = encoded[start:start + length]
        result.append((safe_decode(piece), token_types.from_id(token_type)))
    return result