[python] Adding initial pypostal bindings for tokenize so we can remove address_normalizer dependency. Not tested on Python 3.

This commit is contained in:
Al
2015-09-20 14:59:33 -04:00
parent 3fab0f984f
commit 5485ea2197
8 changed files with 319 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
from postal.text.encoding import safe_decode
from postal.text import _tokenize
from postal.text.token_types import token_types
def tokenize_raw(s):
    """Tokenize *s* and return the tokenizer's result unprocessed.

    The input is first passed through ``safe_decode`` and the decoded
    value is handed to ``_tokenize.tokenize``; whatever that call returns
    is returned as-is. ``tokenize()`` in this module unpacks each item of
    that result as a ``(start, length, token_type)`` triple.
    """
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)
def tokenize(s):
    """Tokenize *s* into a list of ``(substring, token_type)`` pairs.

    ``s`` is decoded with ``safe_decode`` and tokenized by
    ``_tokenize.tokenize``, whose items are unpacked as
    ``(start, length, token_type)``. For each item the result holds the
    slice ``s[start:start + length]`` together with
    ``token_types.from_id(token_type)``.
    """
    # NOTE(review): the slice below is taken from the ORIGINAL ``s``, not
    # from the decoded value that was actually tokenized. If the offsets
    # are relative to a different representation (decoded characters vs.
    # raw bytes), substrings may be misaligned for non-ASCII input --
    # confirm against the _tokenize extension before relying on this.
    spans = _tokenize.tokenize(safe_decode(s))

    tokens = []
    for offset, size, type_id in spans:
        text = s[offset:offset + size]
        tokens.append((text, token_types.from_id(type_id)))
    return tokens