From 308ceb5a5fb835179522a630e38139ddd9e41923 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sat, 23 Jan 2016 20:20:23 -0500
Subject: [PATCH] [fix] convert UTF-8 slices back to unicode before using with
 the Python trie

---
 scripts/geodata/osm/osm_address_training_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index feeedc24..7b45b668 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -367,7 +367,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
     '''
     raw_tokens = tokenize_raw(s)
     s_utf8 = safe_encode(s)
-    tokens = [(s_utf8[o:o+l], token_types.from_id(c)) for o, l, c in raw_tokens]
+    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
     norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
 
     n = len(tokens)