From 236737eab31af79c1d80be135d4239776be33544 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 21 Sep 2015 17:27:43 -0400 Subject: [PATCH] [tokenization/osm] Using utf8 encoded version of string for tokens in python tokenizer --- python/postal/text/tokenize.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/postal/text/tokenize.py b/python/postal/text/tokenize.py index f058dbc7..e2ed77b5 100644 --- a/python/postal/text/tokenize.py +++ b/python/postal/text/tokenize.py @@ -1,4 +1,4 @@ -from postal.text.encoding import safe_decode +from postal.text.encoding import safe_encode, safe_decode from postal.text import _tokenize from postal.text.token_types import token_types @@ -8,5 +8,7 @@ def tokenize_raw(s): def tokenize(s): - return [(s[start:start + length], token_types.from_id(token_type)) - for start, length, token_type in _tokenize.tokenize(safe_decode(s))] + u = safe_decode(s) + s = safe_encode(s) + return [(safe_decode(s[start:start + length]), token_types.from_id(token_type)) + for start, length, token_type in _tokenize.tokenize(u)]