[fix] convert UTF-8 slices back to unicode before using them with the Python trie

This commit is contained in:
Al
2016-01-23 20:20:23 -05:00
parent 5eb6bb309b
commit 308ceb5a5f

View File

@@ -367,7 +367,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
'''
raw_tokens = tokenize_raw(s)
s_utf8 = safe_encode(s)
tokens = [(s_utf8[o:o+l], token_types.from_id(c)) for o, l, c in raw_tokens]
tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
n = len(tokens)