From d35f5196292cba644e860333a002f01c22d29ac7 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Mon, 7 Dec 2015 19:18:46 -0500
Subject: [PATCH] [expansion] Fixing case where non-ideographic tokens like #
 can potentially be concatenated with surrounding tokens and should be
 normalized with whitespace in between

---
 src/libpostal.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/libpostal.c b/src/libpostal.c
index f7236db0..d663b926 100644
--- a/src/libpostal.c
+++ b/src/libpostal.c
@@ -33,6 +33,10 @@ inline bool is_word_token(uint16_t type) {
     return type == WORD || type == ABBREVIATION || type == ACRONYM || type == IDEOGRAPHIC_CHAR || type == HANGUL_SYLLABLE;
 }
 
+inline bool is_ideographic(uint16_t type) {
+    return type == IDEOGRAPHIC_CHAR || type == HANGUL_SYLLABLE || type == IDEOGRAPHIC_NUMBER;
+}
+
 inline bool is_numeric_token(uint16_t type) {
     return type == NUMERIC;
 }
@@ -163,12 +167,14 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
         int start = 0;
         int end = 0;
 
+        phrase_t phrase = NULL_PHRASE;
+
         key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);
 
         for (int i = 0; i < phrases->n; i++) {
             phrase_lang = phrases->a[i];
 
-            phrase_t phrase = phrase_lang.phrase;
+            phrase = phrase_lang.phrase;
             if (phrase.start < start) {
                 continue;
             }
@@ -195,6 +201,14 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
                 string_tree_finalize_token(tree);       
             }
 
+            if (phrase.start > 0) {
+                token_t prev_token = tokens->a[phrase.start - 1];
+                if (!(prev_token.type == WHITESPACE && !is_ideographic(prev_token.type))) {
+                    string_tree_add_string(tree, " ");
+                    string_tree_finalize_token(tree);
+                }
+            }
+
             expansion_value_t value;
             value.value = phrase.data;
 
@@ -262,6 +276,15 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
                     string_tree_finalize_token(tree);
 
                 }
+
+                if (phrase.start + phrase.len < tokens->n - 1) {
+                    token_t next_token = tokens->a[phrase.start + phrase.len + 1];
+                    if (!(next_token.type == WHITESPACE && !is_ideographic(next_token.type))) {
+                        string_tree_add_string(tree, " ");
+                        string_tree_finalize_token(tree);
+                    }
+                }
+
             }
 
             start = phrase.start + phrase.len;
@@ -272,6 +295,15 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
 
         end = (int)tokens->n;
 
+        if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1) {
+            token_t next_token = tokens->a[phrase.start + phrase.len];
+            if (!(next_token.type == WHITESPACE && !is_ideographic(next_token.type))) {
+                string_tree_add_string(tree, " ");
+                string_tree_finalize_token(tree);
+            }
+        }    
+
+
         for (int j = start; j < end; j++) {
             token_t token = tokens->a[j]; 
             if (token.type != WHITESPACE) {
@@ -282,7 +314,8 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
                 log_debug("Adding space\n");
                 string_tree_add_string(tree, " ");
             }
-            string_tree_finalize_token(tree);       
+            string_tree_finalize_token(tree);
+
         }