From 46141a6c36eca1652a691ae8b364ebd1d9122a76 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sun, 2 Aug 2015 14:34:32 -0600
Subject: [PATCH] [normalize] Adding an option when normalizing tokens to split
 tokens of the form [\w]+[\.\-]?[\d]+ for cases like I35, CR123, R-66, RN.7,
 etc. where the alpha component is an expansion

---
 src/normalize.c | 26 +++++++++++++++++++++++++-
 src/normalize.h |  1 +
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/normalize.c b/src/normalize.c
index c48710ce..e24b6681 100644
--- a/src/normalize.c
+++ b/src/normalize.c
@@ -155,6 +155,9 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
     size_t len = token.len;
     if (token.len == 0) return;
 
+    bool alpha_numeric_split = false;
+    char *append_if_not_numeric = NULL;
+
     int32_t ch;
     ssize_t char_len;
 
@@ -172,6 +175,9 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
         int cat = utf8proc_category(ch);
 
         bool is_letter = utf8_is_letter(cat);
+        bool is_number = utf8_is_number(cat);
+
+        bool is_full_stop = ch == FULL_STOP_CODEPOINT;
 
         if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) {
             cstring_array_append_string(array, " ");
@@ -180,7 +186,25 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
             append_char = false;
         }
 
-        if (ch == FULL_STOP_CODEPOINT) {
+        if ((is_hyphen || is_full_stop) && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) {
+            ptr += char_len;
+            idx += char_len;
+            append_if_not_numeric = is_hyphen ? "-" : ".";
+            append_char = true;
+            continue;
+        }
+
+        if (!is_number && append_if_not_numeric != NULL) {
+            cstring_array_append_string(array, append_if_not_numeric);
+            append_if_not_numeric = NULL;
+        }
+
+        if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) {
+            cstring_array_append_string(array, " ");
+            alpha_numeric_split = true;
+        }
+
+        if (is_full_stop) {
             if (options & NORMALIZE_TOKEN_DELETE_FINAL_PERIOD && idx == len - 1) {
                 break;
             }
diff --git a/src/normalize.h b/src/normalize.h
index a09caf65..4c576b74 100644
--- a/src/normalize.h
+++ b/src/normalize.h
@@ -52,6 +52,7 @@ extern "C" {
 #define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3
 #define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4
 #define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5
+#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC (1 << 6) /* split tokens like "I35" or "R-66" between the letters and the digits */
 
 char *normalize_string_utf8(char *str, uint64_t options);