From 8fb9bacfa630935d70c11f6f126f504bf25cecfe Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 27 Apr 2015 01:01:43 -0400 Subject: [PATCH] [phrases] New trie_add_nodes_only method for concatenating strings to the trie, plus boolean return values on trie_add_* APIs --- src/trie.c | 62 ++++++++++++++++++++++++++++++++++++++++++------------ src/trie.h | 7 +++--- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/src/trie.c b/src/trie.c index a974b065..b0592264 100644 --- a/src/trie.c +++ b/src/trie.c @@ -609,12 +609,47 @@ void trie_print(trie_t *self) { } -void trie_add_at_index(trie_t *self, uint32_t node_id, char *key, uint32_t data) { + +uint32_t trie_add_nodes_only(trie_t *self, uint32_t node_id, char *key, size_t len) { + unsigned char *ptr = (unsigned char *)key; + uint32_t last_node_id = node_id; + trie_node_t last_node = trie_get_node(self, node_id); + if (last_node.base == NULL_ID) return NULL_ID; + + uint32_t original_node_id = node_id; + + trie_node_t node; + + // Walks node until prefix reached, including the trailing \0 + + for (int i = 0; i < len; ptr++, i++, last_node_id = node_id, last_node = node) { + log_debug("--- char=%c\n", *ptr); + node_id = trie_get_transition_index(self, last_node, *ptr); + log_debug("node_id=%d\n", node_id); + if (node_id != NULL_ID) { + trie_make_room_for(self, node_id); + } + + node = trie_get_node(self, node_id); + log_debug("node.check=%d, last_node_id=%d, node.base=%d\n", node.check, last_node_id, node.base); + + if (node.check < 0 || (node.check != last_node_id)) { + node_id = trie_add_transition(self, last_node_id, *ptr); + if (node_id == TRIE_INDEX_ERROR) { + trie_prune_up_to(self, original_node_id, last_node_id); + return NULL_ID; + } + } + } + return node_id; +} + +bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, uint32_t data) { int num_chars = strlen(key); unsigned char *ptr = (unsigned char *)key; uint32_t last_node_id = node_id; trie_node_t last_node = trie_get_node(self, node_id); - if (last_node.base == NULL_ID) return; + if (last_node.base == NULL_ID) return false; trie_node_t node; @@ -634,25 +669,25 @@ void trie_add_at_index(trie_t *self, uint32_t node_id, char *key, uint32_t data) if (node.check < 0 || (node.check != last_node_id)) { log_debug("last_node_id=%d, ptr=%s, tail_pos=%zu\n", last_node_id, ptr, self->tail->n); trie_separate_tail(self, last_node_id, ptr, data); - return; + return true; } else if (node.base < 0 && node.check == last_node_id) { log_debug("Case 3 insertion\n"); trie_tail_merge(self, node_id, ptr + 1, data); - return; + return true; } } - return; + return true; } -void trie_add(trie_t *self, char *key, uint32_t data) { - if (strlen(key) == 0) return; - trie_add_at_index(self, ROOT_ID, key, data); +bool trie_add(trie_t *self, char *key, uint32_t data) { + if (strlen(key) == 0) return false; + return trie_add_at_index(self, ROOT_ID, key, data); } -void trie_add_suffix(trie_t *self, char *key, uint32_t data) { - if (strlen(key) == 0) return; +bool trie_add_suffix(trie_t *self, char *key, uint32_t data) { + if (strlen(key) == 0) return false; trie_node_t root = trie_get_root(self); uint32_t node_id = trie_get_transition_index(self, root, '\0'); @@ -662,8 +697,9 @@ void trie_add_suffix(trie_t *self, char *key, uint32_t data) { } char *suffix = utf8_reversed_string(key); - trie_add_at_index(self, node_id, suffix, data); + bool success = trie_add_at_index(self, node_id, suffix, data); free(suffix); + return success; } uint32_t trie_get_from_index(trie_t *self, char *word, size_t len, uint32_t i) { @@ -680,7 +716,7 @@ uint32_t trie_get_from_index(trie_t *self, char *word, size_t len, uint32_t i) { /* Include NUL-byte if we're looking for whole phrases. * It may be stored if this phrase is a prefix of a longer one */ - for (int i = 0; i < len; i++, ptr++, node_id = next_id) { + for (int i = 0; i < len + 1; i++, ptr++, node_id = next_id) { next_id = trie_get_transition_index(self, node, *ptr); node = trie_get_node(self, next_id); @@ -700,7 +736,7 @@ uint32_t trie_get_from_index(trie_t *self, char *word, size_t len, uint32_t i) { int tail_match = strncmp((char *)current_tail, query_tail, query_tail_len); if (tail_match == 0) { - return next_id; + return data_node.data; } else { return NULL_ID; } diff --git a/src/trie.h b/src/trie.h index 7196684e..fb7eea67 100644 --- a/src/trie.h +++ b/src/trie.h @@ -91,9 +91,10 @@ void trie_set_tail(trie_t *self, unsigned char *tail, int32_t tail_pos); int32_t trie_separate_tail(trie_t *self, uint32_t from_index, unsigned char *tail, uint32_t data); void trie_tail_merge(trie_t *self, uint32_t old_node_id, unsigned char *suffix, uint32_t data); -void trie_add_at_index(trie_t *self, uint32_t node_id, char *key, uint32_t data); -void trie_add(trie_t *self, char *key, uint32_t data); -void trie_add_suffix(trie_t *self, char *key, uint32_t data); +uint32_t trie_add_nodes_only(trie_t *self, uint32_t node_id, char *key, size_t len); +bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, uint32_t data); +bool trie_add(trie_t *self, char *key, uint32_t data); +bool trie_add_suffix(trie_t *self, char *key, uint32_t data); uint32_t trie_get_from_index(trie_t *self, char *word, size_t len, uint32_t i); uint32_t trie_get_len(trie_t *self, char *word, size_t len);