13 lines
367 B
Python
13 lines
367 B
Python
from postal.text.encoding import safe_decode
|
|
from postal.text import _tokenize
|
|
from postal.text.token_types import token_types
|
|
|
|
|
|
def tokenize_raw(s):
    """Run the low-level tokenizer on ``s`` and return its result untouched.

    ``s`` is first passed through ``safe_decode``; the decoded value is then
    handed to ``_tokenize.tokenize`` and whatever that call produces is
    returned as-is, with no post-processing.
    """
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)
|
|
|
|
|
|
def tokenize(s):
    """Tokenize ``s`` into a list of ``(text, token_type)`` pairs.

    The decoded form of ``s`` is fed to ``_tokenize.tokenize``, which yields
    ``(start, length, token_type_id)`` triples. For each triple, the token
    text is the slice ``s[start:start + length]`` and the type id is mapped
    through ``token_types.from_id``.
    """
    # NOTE(review): the slice is taken from the original ``s`` while the
    # tokenizer sees ``safe_decode(s)``; confirm the offsets line up with
    # ``s`` for non-ASCII input in both its encoded and decoded forms.
    pairs = []
    for offset, size, type_id in _tokenize.tokenize(safe_decode(s)):
        text = s[offset:offset + size]
        pairs.append((text, token_types.from_id(type_id)))
    return pairs
|