[scripts] Adding the tokenize/normalize wrappers directly into the internal geodata package so pypostal can be maintained in an independent repo
scripts/geodata/text/tokenize.py (new file, 14 additions)
@@ -0,0 +1,14 @@
+from geodata.encoding import safe_encode, safe_decode
+from geodata.text import _tokenize
+from geodata.text.token_types import token_types
+
+
+def tokenize_raw(s):
+    return _tokenize.tokenize(safe_decode(s))
+
+
+def tokenize(s):
+    u = safe_decode(s)
+    s = safe_encode(s)
+    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
+            for start, length, token_type in _tokenize.tokenize(u)]
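A minimal usage sketch of the wrappers added above, assuming the compiled _tokenize C extension is built and the scripts/ directory is on PYTHONPATH so the geodata imports resolve; the sample input and the comments about token type names are illustrative assumptions, not output from the actual library:

# Hypothetical usage of the new geodata.text.tokenize module.
from geodata.text.tokenize import tokenize, tokenize_raw

# tokenize_raw returns raw (start, length, token_type_id) triples from the
# C tokenizer (per the wrapper above, which passes the decoded string in).
print(tokenize_raw('123 Main Street'))

# tokenize returns (token, token_type) pairs, with each numeric id mapped
# through token_types.from_id(); the exact token type names are assumptions.
for token, token_type in tokenize('123 Main Street'):
    print(token, token_type)

The encode/decode round-trip in tokenize() (slicing safe_encode(s) with the offsets, then safe_decode on each span) suggests the underlying C tokenizer reports byte offsets into the UTF-8 encoding of the input rather than character offsets, so spans must be cut from the encoded bytes and decoded back individually.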