[fix] During abbreviation, only add whitespace back between tokens if it was present in the original string

2016-01-23 20:09:45 -05:00
parent d61207e95a
commit 5eb6bb309b
1 changed files with 8 additions and 4 deletions
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -365,9 +365,13 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''
-    tokens = tokenize(s)
+    raw_tokens = tokenize_raw(s)
+    s_utf8 = safe_encode(s)
+    tokens = [(s_utf8[o:o+l], token_types.from_id(c)) for o, l, c in raw_tokens]
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

+    n = len(tokens)
+
    abbreviated = []

    i = 0
@@ -382,7 +386,7 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
            if random.random() > abbreviate_prob:
                for j, (t_i, c_i) in enumerate(t):
                    abbreviated.append(tokens[i + j][0])
-                    if c_i != token_types.IDEOGRAPHIC_CHAR:
+                    if i + j < n - 1 and raw_tokens[i + j + 1][1] > raw_tokens[i + j][1]  + 1:
                        abbreviated.append(u' ')
                i += len(t)
                continue
@@ -460,13 +464,13 @@ def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.
            else:
                for j, (t_i, c_i) in enumerate(t):
                    abbreviated.append(tokens[i + j][0])
-                    if c_i != token_types.IDEOGRAPHIC_CHAR:
+                    if i + j < n - 1 and raw_tokens[i + j + 1][1] > raw_tokens[i + j][1]  + 1:
                        abbreviated.append(u' ')
            i += len(t)

        else:
            abbreviated.append(tokens[i][0])
-            if (c != token_types.IDEOGRAPHIC_CHAR):
+            if i < n - 1 and raw_tokens[i + 1][1] > raw_tokens[i][1]  + 1:
                abbreviated.append(u' ')
            i += 1