From 308ceb5a5fb835179522a630e38139ddd9e41923 Mon Sep 17 00:00:00 2001
From: Al
Date: Sat, 23 Jan 2016 20:20:23 -0500
Subject: [PATCH] [fix] convert UTF8 slices back to unicode before using with the Python trie

---
 scripts/geodata/osm/osm_address_training_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index feeedc24..7b45b668 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -367,7 +367,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
     '''
     raw_tokens = tokenize_raw(s)
     s_utf8 = safe_encode(s)
-    tokens = [(s_utf8[o:o+l], token_types.from_id(c)) for o, l, c in raw_tokens]
+    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
     norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
 
     n = len(tokens)