From 6cfbab9969073d1d57f68b5d98dd5289606277e5 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Wed, 1 Jul 2015 14:52:28 -0400
Subject: [PATCH] [normalization] string normalization module for tokens and
 full strings

---
 src/normalize.c | 207 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/normalize.h |  49 ++++++++++++
 2 files changed, 256 insertions(+)
 create mode 100644 src/normalize.c
 create mode 100644 src/normalize.h

diff --git a/src/normalize.c b/src/normalize.c
new file mode 100644
index 00000000..d4018032
--- /dev/null
+++ b/src/normalize.c
@@ -0,0 +1,207 @@
+#include "normalize.h"
+
+#define FULL_STOP_CODEPOINT 0x002e  // U+002E, the '.' character
+#define APOSTROPHE_CODEPOINT 0x0027  // U+0027, the ASCII apostrophe
+
+
+// Applies the utf8proc transformations selected by the NORMALIZE_STRING_* bits
+// in options (DECOMPOSE, STRIP_ACCENTS, LOWERCASE) to the NUL-terminated str.
+// Returns a newly allocated string the caller must free(), or NULL when str is
+// NULL, when none of those three bits is set, or when utf8proc_map fails.
+char *utf8_normalize_string(char *str, uint64_t options) {
+    if (str == NULL) return NULL;
+
+    int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC;
+    bool have_utf8proc_options = false;
+
+    if (options & NORMALIZE_STRING_DECOMPOSE) {
+        have_utf8proc_options = true;
+        utf8proc_options |= UTF8PROC_OPTIONS_NFKD;
+    }
+    if (options & NORMALIZE_STRING_STRIP_ACCENTS) {
+        have_utf8proc_options = true;
+        utf8proc_options |= UTF8PROC_OPTIONS_STRIP_ACCENTS;
+    }
+    if (options & NORMALIZE_STRING_LOWERCASE) {
+        have_utf8proc_options = true;
+        utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE;
+    }
+    if (!have_utf8proc_options) return NULL;
+
+    // Length 0 is passed; NOTE(review): this relies on UTF8PROC_OPTIONS_BASE including UTF8PROC_NULLTERM - confirm.
+    uint8_t *utf8proc_normalized = NULL;
+    if (utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options) < 0) return NULL;
+    return (char *)utf8proc_normalized;
+}
+
+// Adds Latin-script alternatives for the first len bytes of str to tree.
+// With NORMALIZE_STRING_LATIN_ASCII set: (1) transliterate then utf8-normalize,
+// and (2) utf8-normalize then transliterate, skipping (2) when equal to (1).
+// Without it only the utf8-normalized form is added. NULL results are skipped.
+void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
+    char *transliterated = NULL;
+    char *utf8_normalized = NULL;
+    char *prev_string = NULL;
+    if (options & NORMALIZE_STRING_LATIN_ASCII) {
+        transliterated = transliterate(LATIN_ASCII, str, len);
+        if (transliterated != NULL) {
+            utf8_normalized = utf8_normalize_string(transliterated, options);
+            free(transliterated);
+            transliterated = NULL;
+        }
+        if (utf8_normalized != NULL) {
+            string_tree_add_string(tree, utf8_normalized);
+            prev_string = utf8_normalized;
+            utf8_normalized = NULL;
+        }
+    }
+
+    // utf8_normalize_string reads up to the NUL, but str may extend past len
+    // (it can be one script span of a longer string), so normalize a bounded copy.
+    char *span = malloc(len + 1);
+    if (span != NULL) {
+        memcpy(span, str, len);
+        span[len] = '\0';
+        utf8_normalized = utf8_normalize_string(span, options);
+        free(span);
+    }
+    if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) {
+        transliterated = transliterate(LATIN_ASCII, utf8_normalized, strlen(utf8_normalized));
+        free(utf8_normalized);
+    } else {
+        transliterated = utf8_normalized;
+    }
+    if (transliterated != NULL && (prev_string == NULL || strcmp(prev_string, transliterated) != 0)) {
+        string_tree_add_string(tree, transliterated);
+    }
+    free(transliterated);
+    free(prev_string);
+}
+
+// Splits str into script spans and adds each span's normalized alternatives to
+// a newly created string tree, which is returned. Takes NORMALIZE_STRING_* options.
+string_tree_t *normalize_string(char *str, uint64_t options) {
+    size_t len = strlen(str);
+    string_tree_t *tree = string_tree_new_size(len);
+    size_t consumed = 0;
+    while (consumed < len) {
+        string_script_t script_span = get_string_script(str, len - consumed);
+        script_t script = script_span.script;
+        size_t script_len = script_span.len;
+        bool is_ascii = script_span.ascii;
+        char *transliterated = NULL;
+        // A zero-length span would never advance consumed; stop instead of spinning.
+        if (script_len == 0) break;
+        if (options & NORMALIZE_STRING_LOWERCASE && is_ascii) {
+            // utf8_normalize_string reads to the NUL, so lowercase a terminated
+            // copy of just this span rather than the whole rest of the input.
+            char *span = malloc(script_len + 1);
+            if (span != NULL) {
+                memcpy(span, str, script_len);
+                span[script_len] = '\0';
+                char *utf8_normalized = utf8_normalize_string(span, NORMALIZE_STRING_LOWERCASE);
+                free(span);
+                if (utf8_normalized != NULL) {
+                    string_tree_add_string(tree, utf8_normalized);
+                    free(utf8_normalized);
+                }
+            }
+            string_tree_finalize_token(tree);
+        } else if (options & NORMALIZE_STRING_LATIN_ASCII && script == SCRIPT_LATIN) {
+            add_latin_alternatives(tree, str, script_len, options);
+            string_tree_finalize_token(tree);
+        } else if (options & NORMALIZE_STRING_TRANSLITERATE && script != SCRIPT_UNKNOWN) {
+            char *trans_name;
+            foreach_transliterator(script, "", trans_name, {
+                transliterated = transliterate(trans_name, str, script_len);
+                if (transliterated != NULL) {
+                    add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
+                    free(transliterated);
+                }
+            })
+            string_tree_finalize_token(tree);
+        } else {
+            string_tree_add_string_len(tree, str, script_len);
+        }
+        consumed += script_len;
+        str += script_len;
+    }
+    return tree;
+}
+
+
+// Appends a normalized copy of the token at str + token.offset (token.len bytes)
+// to tree->strings, applying the NORMALIZE_TOKEN_* options, and finalizes the
+// token in tree if anything was appended.
+void add_normalized_token(string_tree_t *tree, char *str, token_t token, uint64_t options) {
+    size_t idx = 0;
+
+    uint8_t *ptr = (uint8_t *)str + token.offset;
+    size_t len = token.len;
+
+    int32_t ch;
+    ssize_t char_len;
+
+    bool last_was_letter = false;
+    bool append_char = true;
+
+    cstring_array *array = tree->strings;
+    size_t initial_n = array->str->n;
+
+    while (idx < len) {
+        // Bound the decoder by the bytes remaining in the token, not the full
+        // token length, so it cannot read past the token's end.
+        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);
+        if (char_len <= 0) break;
+
+        bool is_hyphen = utf8_is_hyphen(ch);
+        int cat = utf8proc_category(ch);
+        bool is_letter = utf8_is_letter(cat);
+
+        if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) {
+            cstring_array_append_string(array, " ");
+            append_char = false;
+        } else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
+            append_char = false;
+        }
+
+        if (ch == FULL_STOP_CODEPOINT) {
+            if (options & NORMALIZE_TOKEN_DELETE_FINAL_PERIOD && idx == len - 1) {
+                break;
+            }
+            if (token.type == ACRONYM && options & NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) {
+                append_char = false;
+            }
+        }
+
+        // English possessives at the token's last two bytes: drop a trailing
+        // "'s", and reduce a trailing "s'" to "s".
+        if (idx == len - 2 && len > 2 && options & NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES) {
+            char this_char = *ptr;
+            char next_char = *(ptr + 1);
+            if (this_char == '\'' && next_char == 's') {
+                break;
+            } else if (this_char == 's' && next_char == '\'') {
+                cstring_array_append_string(array, "s");
+                break;
+            }
+        }
+
+        if (ch == APOSTROPHE_CODEPOINT && options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) {
+            append_char = false;
+        }
+
+        if (append_char) {
+            cstring_array_append_string_len(array, (char *)ptr, char_len);
+        }
+
+        ptr += char_len;
+        idx += char_len;
+        append_char = true;
+        last_was_letter = is_letter;
+    }
+
+    if (array->str->n > initial_n) {
+        string_tree_finalize_token(tree);
+    }
+}
diff --git a/src/normalize.h b/src/normalize.h
new file mode 100644
index 00000000..ae494b15
--- /dev/null
+++ b/src/normalize.h
@@ -0,0 +1,49 @@
+#ifndef NORMALIZE_H
+#define NORMALIZE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+#include "constants.h"
+#include "klib/khash.h"
+#include "numex.h"
+#include "scanner.h"
+#include "string_utils.h"
+#include "utf8proc/utf8proc.h"
+#include "unicode_scripts.h"
+#include "transliterate.h"
+#include "trie.h"
+#include "tokens.h"
+#include "vector.h"
+
+#define NORMALIZE_STRING_LATIN_ASCII (1 << 0)  // NORMALIZE_STRING_*: bit flags for normalize_string()
+#define NORMALIZE_STRING_TRANSLITERATE (1 << 1)
+#define NORMALIZE_STRING_STRIP_ACCENTS (1 << 2)
+#define NORMALIZE_STRING_DECOMPOSE (1 << 3)
+#define NORMALIZE_STRING_LOWERCASE (1 << 4)
+// NORMALIZE_TOKEN_*: token-level bit flags; their values overlap the string flags, so the two sets must not be mixed.
+#define NORMALIZE_TOKEN_REPLACE_HYPHENS (1 << 0)  // parenthesized so ~FLAG and FLAG * k expand as intended
+#define NORMALIZE_TOKEN_DELETE_HYPHENS (1 << 1)
+#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD (1 << 2)
+#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS (1 << 3)
+#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES (1 << 4)
+#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE (1 << 5)
+
+char *utf8_normalize_string(char *str, uint64_t options);
+
+// Both take NORMALIZE_TOKEN_* options. NOTE(review): normalize.c defines only add_normalized_token - confirm add_token_alternatives is still needed.
+bool add_token_alternatives(cstring_array *array, char *str, token_t token, uint64_t options);
+void add_normalized_token(string_tree_t *tree, char *str, token_t token, uint64_t options);
+// Takes NORMALIZE_STRING_* options
+string_tree_t *normalize_string(char *str, uint64_t options);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
\ No newline at end of file