From 5eb6bb309b72aa500d99ec114c71e67fb4a0c481 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sat, 23 Jan 2016 20:09:45 -0500
Subject: [PATCH] [fix] Only adding whitespace back into tokenized strings
 during abbreviation if it existed in the original string

---
 scripts/geodata/osm/osm_address_training_data.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index 27085bd5..feeedc24 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -365,9 +365,13 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
     to real-world input, we can safely replace the canonical phrase with an
     abbreviated version and retain the meaning of the words
     '''
-    tokens = tokenize(s)
+    raw_tokens = tokenize_raw(s)  # (offset, length, type_id) triples indexing into s_utf8
+    s_utf8 = safe_encode(s)
+    tokens = [(s_utf8[o:o+l], token_types.from_id(c)) for o, l, c in raw_tokens]
     norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
 
+    n = len(tokens)
+
     abbreviated = []
 
     i = 0
@@ -382,7 +386,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
             if random.random() > abbreviate_prob:
                 for j, (t_i, c_i) in enumerate(t):
                     abbreviated.append(tokens[i + j][0])
-                    if c_i != token_types.IDEOGRAPHIC_CHAR:
+                    if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]):
                         abbreviated.append(u' ')
                 i += len(t)
                 continue
@@ -460,13 +464,13 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
             else:
                 for j, (t_i, c_i) in enumerate(t):
                     abbreviated.append(tokens[i + j][0])
-                    if c_i != token_types.IDEOGRAPHIC_CHAR:
+                    if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]):
                         abbreviated.append(u' ')
             i += len(t)
 
         else:
             abbreviated.append(tokens[i][0])
-            if (c != token_types.IDEOGRAPHIC_CHAR):
+            if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                 abbreviated.append(u' ')
             i += 1