[normalization] string normalization module for tokens and full strings

2015-07-01 14:52:28 -04:00
parent 46e51ae91e
commit 6cfbab9969
2 changed files with 256 additions and 0 deletions
--- a/src/normalize.c
+++ b/src/normalize.c
@@ -0,0 +1,207 @@
+#include "normalize.h"
+
+#define FULL_STOP_CODEPOINT 0x002e  /* '.' -- compared against decoded code points in add_normalized_token */
+#define APOSTROPHE_CODEPOINT 0x0027 /* '\'' -- compared against decoded code points in add_normalized_token */
+
+
+/**
+ * Run a NUL-terminated UTF-8 string through utf8proc_map() using the
+ * transformations selected by `options`.
+ *
+ * Recognized bits:
+ *   NORMALIZE_STRING_DECOMPOSE     -> UTF8PROC_OPTIONS_NFKD
+ *   NORMALIZE_STRING_STRIP_ACCENTS -> UTF8PROC_OPTIONS_STRIP_ACCENTS
+ *   NORMALIZE_STRING_LOWERCASE     -> UTF8PROC_OPTIONS_LOWERCASE
+ *
+ * @param str      input string; read up to its NUL terminator
+ * @param options  bitmask of NORMALIZE_STRING_* flags
+ * @return the buffer produced by utf8proc_map() (callers in this file
+ *         release it with free()), or NULL when `str` is NULL, when none
+ *         of the recognized bits is set, or when utf8proc_map() reports
+ *         an error.
+ */
+char *utf8_normalize_string(char *str, uint64_t options) {
+    if (str == NULL) {
+        return NULL;
+    }
+
+    int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC;
+    bool have_utf8proc_options = false;
+
+    if (options & NORMALIZE_STRING_DECOMPOSE) {
+        have_utf8proc_options = true;
+        utf8proc_options |= UTF8PROC_OPTIONS_NFKD;
+    }
+
+    if (options & NORMALIZE_STRING_STRIP_ACCENTS) {
+        have_utf8proc_options = true;
+        utf8proc_options |= UTF8PROC_OPTIONS_STRIP_ACCENTS;
+    }
+
+    if (options & NORMALIZE_STRING_LOWERCASE) {
+        have_utf8proc_options = true;
+        utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE;
+    }
+
+    /* Nothing for utf8proc to do: signal "no normalized form" to the caller. */
+    if (!have_utf8proc_options) {
+        return NULL;
+    }
+
+    uint8_t *utf8proc_normalized = NULL;
+
+    /* A length of 0 is passed, so the input is consumed up to its NUL
+     * terminator. NOTE(review): this presumably relies on
+     * UTF8PROC_OPTIONS_BASE including UTF8PROC_NULLTERM -- confirm. */
+    ssize_t normalized_len = utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
+
+    /* utf8proc_map() returns a negative error code on failure. */
+    if (normalized_len < 0) {
+        return NULL;
+    }
+
+    return (char *)utf8proc_normalized;
+}
+
+/**
+ * Add up to two normalized alternatives for the Latin-script span
+ * `str[0..len)` to `tree`:
+ *
+ *   1. transliterate(LATIN_ASCII) first, then utf8_normalize_string()
+ *      (only when NORMALIZE_STRING_LATIN_ASCII is set);
+ *   2. utf8_normalize_string() first, then transliterate(LATIN_ASCII)
+ *      when NORMALIZE_STRING_LATIN_ASCII is set.
+ *
+ * The second form is skipped when it is byte-for-byte equal to the
+ * first. The token is not finalized here; callers do that.
+ *
+ * @param tree     string tree receiving the alternatives
+ * @param str      start of the span; need not be NUL-terminated at `len`
+ * @param len      number of bytes of `str` that belong to the span
+ * @param options  bitmask of NORMALIZE_STRING_* flags
+ */
+void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
+    char *transliterated = NULL;
+    char *utf8_normalized = NULL;
+    char *prev_string = NULL;
+
+    if (options & NORMALIZE_STRING_LATIN_ASCII) {
+        transliterated = transliterate(LATIN_ASCII, str, len);
+        if (transliterated != NULL) {
+            utf8_normalized = utf8_normalize_string(transliterated, options);
+            free(transliterated);
+            transliterated = NULL;
+        }
+
+        if (utf8_normalized != NULL) {
+            string_tree_add_string(tree, utf8_normalized);
+            /* Kept so the second alternative can be de-duplicated against it. */
+            prev_string = utf8_normalized;
+            utf8_normalized = NULL;
+        }
+    }
+
+    /* utf8_normalize_string() reads up to the NUL terminator and knows
+     * nothing about `len`. Hand it a private copy of exactly `len` bytes so
+     * that text following the span cannot leak into this alternative. */
+    char *span = malloc(len + 1);
+    if (span != NULL) {
+        memcpy(span, str, len);
+        span[len] = '\0';
+        utf8_normalized = utf8_normalize_string(span, options);
+        free(span);
+    }
+
+    if ((options & NORMALIZE_STRING_LATIN_ASCII) && utf8_normalized != NULL) {
+        transliterated = transliterate(LATIN_ASCII, utf8_normalized, strlen(utf8_normalized));
+        free(utf8_normalized);
+    } else {
+        /* Ownership of the normalized buffer (possibly NULL) moves here. */
+        transliterated = utf8_normalized;
+    }
+
+    if (transliterated != NULL) {
+        if (prev_string == NULL || strcmp(prev_string, transliterated) != 0) {
+            string_tree_add_string(tree, transliterated);
+        }
+        free(transliterated);
+        transliterated = NULL;
+    }
+
+    free(prev_string);
+}
+
+/**
+ * Walk `str` one script span at a time (as reported by
+ * get_string_script()) and collect normalized forms of each span in a
+ * newly created string tree.
+ *
+ * Per span, the first matching rule applies:
+ *   - NORMALIZE_STRING_LOWERCASE set and the span is ASCII: add the
+ *     lowercased span and finalize the token;
+ *   - NORMALIZE_STRING_LATIN_ASCII set and the span is SCRIPT_LATIN: add
+ *     the alternatives from add_latin_alternatives() and finalize;
+ *   - NORMALIZE_STRING_TRANSLITERATE set and the script is known: run each
+ *     transliterator registered for the script, add the Latin alternatives
+ *     of every result, then finalize;
+ *   - otherwise: add the span's bytes unchanged.
+ *
+ * @param str      NUL-terminated input string
+ * @param options  bitmask of NORMALIZE_STRING_* flags
+ * @return the tree created by string_tree_new_size(); the caller owns it.
+ */
+string_tree_t *normalize_string(char *str, uint64_t options) {
+    size_t len = strlen(str);
+    string_tree_t *tree = string_tree_new_size(len);
+
+    size_t consumed = 0;
+
+    while (consumed < len) {
+        string_script_t script_span = get_string_script(str, len - consumed);
+        script_t script = script_span.script;
+        size_t script_len = script_span.len;
+        bool is_ascii = script_span.ascii;
+
+        if (script_len == 0) {
+            /* A zero-length span would never advance `consumed` and the
+             * loop would spin forever. Pass the remaining bytes through
+             * unchanged, as the fallback branch below does, and stop. */
+            string_tree_add_string_len(tree, str, len - consumed);
+            break;
+        }
+
+        if ((options & NORMALIZE_STRING_LOWERCASE) && is_ascii) {
+            char *utf8_normalized = NULL;
+
+            /* utf8_normalize_string() reads up to the NUL terminator, so
+             * give it a copy of this span only; otherwise everything after
+             * the span would be lowercased and added along with it. */
+            char *span = malloc(script_len + 1);
+            if (span != NULL) {
+                memcpy(span, str, script_len);
+                span[script_len] = '\0';
+                utf8_normalized = utf8_normalize_string(span, NORMALIZE_STRING_LOWERCASE);
+                free(span);
+            }
+
+            if (utf8_normalized != NULL) {
+                string_tree_add_string(tree, utf8_normalized);
+                free(utf8_normalized);
+            }
+            string_tree_finalize_token(tree);
+        } else if ((options & NORMALIZE_STRING_LATIN_ASCII) && script == SCRIPT_LATIN) {
+            add_latin_alternatives(tree, str, script_len, options);
+            string_tree_finalize_token(tree);
+        } else if ((options & NORMALIZE_STRING_TRANSLITERATE) && script != SCRIPT_UNKNOWN) {
+            char *trans_name;
+            char *transliterated = NULL;
+            foreach_transliterator(script, "", trans_name, {
+                transliterated = transliterate(trans_name, str, script_len);
+
+                if (transliterated != NULL) {
+                    add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
+                    free(transliterated);
+                }
+            })
+
+            string_tree_finalize_token(tree);
+        } else {
+            /* No applicable normalization: keep the span's bytes as they are. */
+            string_tree_add_string_len(tree, str, script_len);
+        }
+
+        consumed += script_len;
+        str += script_len;
+    }
+
+    return tree;
+}
+
+
+/**
+ * Append a normalized copy of one token to the string currently being
+ * built in `tree->strings`, then finalize the token if any bytes were
+ * appended.
+ *
+ * The token is the `token.len` bytes of `str` starting at `token.offset`.
+ * It is decoded one code point at a time and filtered according to
+ * `options`:
+ *   - NORMALIZE_TOKEN_REPLACE_HYPHENS: a hyphen that follows a letter is
+ *     replaced by a single space;
+ *   - NORMALIZE_TOKEN_DELETE_HYPHENS: any other hyphen is dropped;
+ *   - NORMALIZE_TOKEN_DELETE_FINAL_PERIOD: a '.' in the last byte ends
+ *     processing without being appended;
+ *   - NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS: '.' is dropped when
+ *     token.type is ACRONYM;
+ *   - NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES: a trailing "'s" is dropped
+ *     and a trailing "s'" becomes "s" (tokens longer than two bytes only);
+ *   - NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE: remaining apostrophes are
+ *     dropped.
+ *
+ * Decoding stops at the first byte sequence utf8proc_iterate() rejects.
+ *
+ * @param tree     string tree receiving the normalized token
+ * @param str      buffer that contains the token
+ * @param token    offset, length and type of the token within `str`
+ * @param options  bitmask of NORMALIZE_TOKEN_* flags
+ */
+void add_normalized_token(string_tree_t *tree, char *str, token_t token, uint64_t options) {
+    size_t idx = 0;
+
+    uint8_t *ptr = (uint8_t *)str + token.offset;
+    size_t len = token.len;
+
+    int32_t ch;
+    ssize_t char_len;
+
+    bool last_was_letter = false;
+    bool append_char = true;
+
+    cstring_array *array = tree->strings;
+
+    /* Remembered so we can tell afterwards whether anything was appended. */
+    size_t initial_n = array->str->n;
+
+    while (idx < len) {
+        /* Bound the decoder by the bytes that remain in the token, not by
+         * the full token length, so it cannot read past the token's end. */
+        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);
+
+        if (char_len <= 0) break;
+
+        bool is_hyphen = utf8_is_hyphen(ch);
+        int cat = utf8proc_category(ch);
+
+        bool is_letter = utf8_is_letter(cat);
+
+        if (is_hyphen && last_was_letter && (options & NORMALIZE_TOKEN_REPLACE_HYPHENS)) {
+            cstring_array_append_string(array, " ");
+            append_char = false;
+        } else if (is_hyphen && (options & NORMALIZE_TOKEN_DELETE_HYPHENS)) {
+            append_char = false;
+        }
+
+        if (ch == FULL_STOP_CODEPOINT) {
+            if ((options & NORMALIZE_TOKEN_DELETE_FINAL_PERIOD) && idx == len - 1) {
+                break;
+            }
+
+            if (token.type == ACRONYM && (options & NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)) {
+                append_char = false;
+            }
+        }
+
+        /* Two bytes from the end: look at the final byte pair for an
+         * English possessive suffix. */
+        if (idx == len - 2 && len > 2 && (options & NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES)) {
+            char this_char = *ptr;
+            char next_char = *(ptr + 1);
+
+            if (this_char == '\'' && next_char == 's') {
+                break;
+            } else if (this_char == 's' && next_char == '\'') {
+                cstring_array_append_string(array, "s");
+                break;
+            }
+        }
+
+        if (ch == APOSTROPHE_CODEPOINT && (options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)) {
+            append_char = false;
+        }
+
+        if (append_char) {
+            cstring_array_append_string_len(array, (char *)ptr, char_len);
+        }
+
+        ptr += char_len;
+        idx += char_len;
+        append_char = true;
+
+        last_was_letter = is_letter;
+    }
+
+    /* Only close the token when at least one byte was written for it. */
+    if (array->str->n > initial_n) {
+        string_tree_finalize_token(tree);
+    }
+}