[fix] Stepping through codepoints first then through chars in trie_search_prefixes_from_index (used in transliteration and numex)

2015-12-23 01:58:39 -05:00
parent baa8e3cc3f
commit aaa1fc0387
1 changed files with 51 additions and 27 deletions
--- a/src/trie_search.c
+++ b/src/trie_search.c
@@ -612,24 +612,37 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u
    trie_data_node_t data_node;
    trie_node_t terminal_node;

-    for (; idx < len; last_node = node, last_node_id = node_id) {
-        log_debug("Getting transition index for %d, (%d, %d)\n", last_node_id, last_node.base, last_node.check);
-        node_id = trie_get_transition_index(self, last_node, *ptr);
-        node = trie_get_node(self, node_id);
-        log_debug("Doing %c, got node_id=%d\n", *ptr, node_id);
+    while (idx < len) {
+        char_len = utf8proc_iterate(ptr, len, &codepoint);
+        log_debug("char_len = %zu, char=%d\n", char_len, codepoint);
+        if (char_len <= 0) break;
+        if (!(utf8proc_codepoint_valid(codepoint))) break;
+
+        bool is_hyphen = utf8_is_hyphen(codepoint);
+
+        int cat = utf8proc_category(codepoint);
+        bool is_space = utf8_is_separator(cat);
+
+        uint8_t *char_ptr = ptr;
+        size_t i = 0;
+
+        for (i = 0; i < char_len; char_ptr++, i++, last_node = node, last_node_id = node_id) {
+            node_id = trie_get_transition_index(self, last_node, *char_ptr);
+            node = trie_get_node(self, node_id);
+            log_debug("At idx=%zu, char=%s\n", i, char_ptr);
+
+            if (node.check != last_node_id) {
+                log_debug("Fell off the trie mid-character, breaking\n");
+                break;
+            } else if (node.base < 0) {
+                log_debug("Hit tail node, breaking\n");
+                break;
+            }
+        }
+
        if (node.check != last_node_id) { 
            log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);

-            char_len = utf8proc_iterate(ptr, len, &codepoint);
-
-            if (char_len <= 0) break;
-            if (!(utf8proc_codepoint_valid(codepoint))) break;
-
-            bool is_hyphen = utf8_is_hyphen(codepoint);
-
-            int cat = utf8proc_category(codepoint);
-            bool is_space = utf8_is_separator(cat);
-
            if (is_hyphen || (is_space && *ptr != ' ')) {
                log_debug("Got hyphen or other separator, trying space instead\n");
                node_id = trie_get_transition_index(self, last_node, ' ');
@@ -658,22 +671,33 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u

            unsigned char *current_tail = self->tail->a + current_tail_pos;

-            log_debug("comparing tail: %s vs %s\n", current_tail, ptr + 1);
+            log_debug("comparing tail: %s vs %s\n", current_tail, char_ptr + 1);
            size_t current_tail_len = strlen((char *)current_tail);
-            size_t match_len = 0;
-            size_t offset = 1;
+            size_t match_len = i + 1;
+            size_t offset = i + 1;
+            log_debug("offset=%zu\n", offset);
+
            if (char_len > 1) {
-                match_len = strncmp((char *)ptr + 1, (char *)current_tail, char_len - 1);
-                offset = char_len;
+                log_debug("char_len = %zu\n", char_len);
+                log_debug("Doing strncmp: (%zu) %s vs %s\n", char_len - offset, current_tail, char_ptr + 1);
+
+                if (strncmp((char *)char_ptr + 1, (char *)current_tail, char_len - offset) == 0) {
+                    match_len += char_len - offset;
+                    offset = char_len;
+                    log_debug("in char match_len = %zu\n", match_len);
+                } else {
+                    return NULL_PHRASE;
+                }
            }

-            match_len += utf8_common_prefix_len_ignore_separators((char *)ptr + offset, (char *)current_tail, current_tail_len);
-
-            log_debug("match_len=%zu\n", match_len);
+            if (match_len < current_tail_len) {
+                match_len += utf8_common_prefix_len_ignore_separators((char *)ptr + offset, (char *)current_tail + offset, current_tail_len - offset);
+                log_debug("match_len=%zu\n", match_len);
+            }

            if (match_len >= current_tail_len) {
                if (first_char) phrase_start = idx;
-                phrase_len = (uint32_t)(idx + match_len + 1) - phrase_start;
+                phrase_len = (uint32_t)(idx + match_len) - phrase_start;

                log_debug("tail match! phrase_len=%u\n", phrase_len);
                value = data_node.data;
@@ -699,13 +723,13 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u
            first_char = false;
        }

-        idx++;
-        ptr++;
+        idx += char_len;
+        ptr += char_len;
    }
-
    if (phrase_len == 0) return NULL_PHRASE;

    return (phrase_t) {phrase_start, phrase_len, value};
+
 }

 inline phrase_t trie_search_prefixes_from_index_get_prefix_char(trie_t *self, char *word, size_t len, uint32_t start_node_id) {