[fix] convert UTF-8 slices back to unicode before using them with the Python trie

2016-01-23 20:20:23 -05:00
parent 5eb6bb309b
commit 308ceb5a5f
1 changed files with 1 additions and 1 deletions
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -367,7 +367,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
    '''
    raw_tokens = tokenize_raw(s)
    s_utf8 = safe_encode(s)
-    tokens = [(s_utf8[o:o+l], token_types.from_id(c)) for o, l, c in raw_tokens]
+    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

    n = len(tokens)