[tokenization/osm] Using utf8 encoded version of string for tokens in python tokenizer
@@ -1,4 +1,4 @@
-from postal.text.encoding import safe_decode
+from postal.text.encoding import safe_encode, safe_decode
 from postal.text import _tokenize
 from postal.text.token_types import token_types
 
@@ -8,5 +8,7 @@ def tokenize_raw(s):
 
 
 def tokenize(s):
-    return [(s[start:start + length], token_types.from_id(token_type))
-            for start, length, token_type in _tokenize.tokenize(safe_decode(s))]
+    u = safe_decode(s)
+    s = safe_encode(s)
+    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
+            for start, length, token_type in _tokenize.tokenize(u)]
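
The rationale behind this change (a sketch of the likely motivation, not stated in the commit itself): the C tokenizer exposed by `_tokenize.tokenize` appears to report token start/length as UTF-8 byte offsets, so slicing the decoded unicode string with those offsets goes wrong whenever the input contains multi-byte characters. Slicing the `safe_encode`d byte string and then `safe_decode`-ing each slice keeps the offsets and the data in the same encoding. A minimal, self-contained illustration, using simplified stand-ins for the `safe_encode`/`safe_decode` helpers from `postal.text.encoding`:

def safe_decode(s, encoding='utf-8'):
    # Decode bytes to unicode; pass unicode through unchanged.
    return s.decode(encoding) if isinstance(s, bytes) else s

def safe_encode(s, encoding='utf-8'):
    # Encode unicode to bytes; pass bytes through unchanged.
    return s.encode(encoding) if isinstance(s, str) else s

u = safe_decode('Größe rules')
b = safe_encode(u)

# Suppose the tokenizer reports the second token at byte offset 8 with
# byte length 5: 'Größe' occupies 7 bytes in UTF-8 ('ö' and 'ß' are two
# bytes each), plus one byte for the space.
start, length = 8, 5

print(u[start:start + length])               # 'les'   -- wrong: byte offsets applied to a unicode string
print(safe_decode(b[start:start + length]))  # 'rules' -- right: slice the bytes, then decode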