[phrases] Adding *_from_index methods to trie_search
This commit is contained in:
@@ -18,9 +18,9 @@ phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_no
|
|||||||
|
|
||||||
const uint8_t *ptr = (const uint8_t *)text;
|
const uint8_t *ptr = (const uint8_t *)text;
|
||||||
const uint8_t *fail_ptr = ptr;
|
const uint8_t *fail_ptr = ptr;
|
||||||
trie_node_t node = trie_get_root(self), last_node = node;
|
|
||||||
|
|
||||||
uint32_t node_id = start_node_id;
|
uint32_t node_id = start_node_id;
|
||||||
|
trie_node_t node = trie_get_node(self, node_id), last_node = node;
|
||||||
uint32_t next_id;
|
uint32_t next_id;
|
||||||
|
|
||||||
bool match = false;
|
bool match = false;
|
||||||
@@ -81,8 +81,8 @@ phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_no
|
|||||||
log_debug("done with char, now at %s\n", ptr);
|
log_debug("done with char, now at %s\n", ptr);
|
||||||
}
|
}
|
||||||
fail_ptr = ptr;
|
fail_ptr = ptr;
|
||||||
last_node = node = trie_get_root(self);
|
|
||||||
node_id = start_node_id;
|
node_id = start_node_id;
|
||||||
|
last_node = node = trie_get_node(self, node_id);
|
||||||
phrase_start = phrase_len = 0;
|
phrase_start = phrase_len = 0;
|
||||||
last_state = state;
|
last_state = state;
|
||||||
match = false;
|
match = false;
|
||||||
@@ -214,13 +214,14 @@ int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, toke
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
|
||||||
|
phrase_array *trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, uint32_t start_node_id) {
|
||||||
if (str == NULL || tokens == NULL || tokens->n == 0) return NULL;
|
if (str == NULL || tokens == NULL || tokens->n == 0) return NULL;
|
||||||
|
|
||||||
phrase_array *phrases = phrase_array_new();
|
phrase_array *phrases = phrase_array_new();
|
||||||
|
|
||||||
trie_node_t node = trie_get_root(self), last_node = node;
|
uint32_t node_id = start_node_id, last_node_id = start_node_id;
|
||||||
uint32_t node_id = ROOT_NODE_ID, last_node_id = ROOT_NODE_ID;
|
trie_node_t node = trie_get_node(self, node_id), last_node = node;
|
||||||
|
|
||||||
uint32_t data;
|
uint32_t data;
|
||||||
|
|
||||||
@@ -254,8 +255,8 @@ phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
|||||||
|
|
||||||
if (node.check != last_node_id && last_node.base >= 0) {
|
if (node.check != last_node_id && last_node.base >= 0) {
|
||||||
log_debug("Fell off trie. last_node_id=%d and node.check=%d\n", last_node_id, node.check);
|
log_debug("Fell off trie. last_node_id=%d and node.check=%d\n", last_node_id, node.check);
|
||||||
node = trie_get_root(self);
|
node_id = start_node_id;
|
||||||
node_id = ROOT_NODE_ID;
|
node = trie_get_node(self, node_id);
|
||||||
break;
|
break;
|
||||||
} else if (node.base < 0) {
|
} else if (node.base < 0) {
|
||||||
log_debug("Searching tail at index %d\n", i);
|
log_debug("Searching tail at index %d\n", i);
|
||||||
@@ -278,8 +279,8 @@ phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
|||||||
log_debug("node tail matches first token\n");
|
log_debug("node tail matches first token\n");
|
||||||
int tail_search_result = trie_node_search_tail_tokens(self, node, str, tokens, ptr_len, i+1);
|
int tail_search_result = trie_node_search_tail_tokens(self, node, str, tokens, ptr_len, i+1);
|
||||||
if (tail_search_result == -1) {
|
if (tail_search_result == -1) {
|
||||||
node = trie_get_root(self);
|
node_id = start_node_id;
|
||||||
node_id = ROOT_NODE_ID;
|
node = trie_get_node(self, node_id);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
phrase_len = tail_search_result - phrase_start + 1;
|
phrase_len = tail_search_result - phrase_start + 1;
|
||||||
@@ -289,8 +290,8 @@ phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
node = trie_get_root(self);
|
node_id = start_node_id;
|
||||||
node_id = ROOT_NODE_ID;
|
node = trie_get_node(self, node_id);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -305,8 +306,8 @@ phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
|||||||
i = last_match_index;
|
i = last_match_index;
|
||||||
last_match_index = -1;
|
last_match_index = -1;
|
||||||
phrase_start = 0;
|
phrase_start = 0;
|
||||||
last_node = trie_get_root(self);
|
last_node_id = start_node_id;
|
||||||
last_node_id = ROOT_NODE_ID;
|
last_node = trie_get_node(self, last_node_id);
|
||||||
continue;
|
continue;
|
||||||
} else if (last_state == SEARCH_STATE_PARTIAL_MATCH) {
|
} else if (last_state == SEARCH_STATE_PARTIAL_MATCH) {
|
||||||
log_debug("last_state == SEARCH_STATE_PARTIAL_MATCH\n");
|
log_debug("last_state == SEARCH_STATE_PARTIAL_MATCH\n");
|
||||||
@@ -317,8 +318,8 @@ phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
|||||||
// this token was not a phrase
|
// this token was not a phrase
|
||||||
log_debug("Plain token=%.*s\n", token.len, str + token.offset);
|
log_debug("Plain token=%.*s\n", token.len, str + token.offset);
|
||||||
}
|
}
|
||||||
last_node = trie_get_root(self);
|
last_node_id = start_node_id;
|
||||||
last_node_id = ROOT_NODE_ID;
|
last_node = trie_get_node(self, last_node_id);
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
state = SEARCH_STATE_PARTIAL_MATCH;
|
state = SEARCH_STATE_PARTIAL_MATCH;
|
||||||
@@ -356,8 +357,8 @@ phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
|||||||
log_debug("node->match no continuation\n");
|
log_debug("node->match no continuation\n");
|
||||||
phrase_array_push(phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
|
phrase_array_push(phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
|
||||||
last_match_index = -1;
|
last_match_index = -1;
|
||||||
last_node = node = trie_get_root(self);
|
last_node_id = start_node_id;
|
||||||
last_node_id = node_id = ROOT_NODE_ID;
|
last_node = trie_get_node(self, last_node_id);
|
||||||
state = SEARCH_STATE_BEGIN;
|
state = SEARCH_STATE_BEGIN;
|
||||||
} else {
|
} else {
|
||||||
log_debug("Has continuation, node_id=%d\n", continuation_id);
|
log_debug("Has continuation, node_id=%d\n", continuation_id);
|
||||||
@@ -375,13 +376,17 @@ phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
|||||||
return phrases;
|
return phrases;
|
||||||
}
|
}
|
||||||
|
|
||||||
phrase_t trie_search_suffixes(trie_t *self, char *word) {
|
inline phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
|
||||||
uint32_t node_id = ROOT_NODE_ID, last_node_id = ROOT_NODE_ID;
|
return trie_search_tokens_from_index(self, str, tokens, ROOT_NODE_ID);
|
||||||
trie_node_t last_node = trie_get_root(self);
|
}
|
||||||
node_id = trie_get_transition_index(self, last_node, '\0');
|
|
||||||
|
phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, uint32_t start_node_id) {
|
||||||
|
uint32_t last_node_id = start_node_id;
|
||||||
|
trie_node_t last_node = trie_get_node(self, last_node_id);
|
||||||
|
uint32_t node_id = trie_get_transition_index(self, last_node, '\0');
|
||||||
trie_node_t node = trie_get_node(self, node_id);
|
trie_node_t node = trie_get_node(self, node_id);
|
||||||
|
|
||||||
if (node.check != ROOT_NODE_ID) {
|
if (node.check != start_node_id) {
|
||||||
return (phrase_t){0, 0, 0};
|
return (phrase_t){0, 0, 0};
|
||||||
} else {
|
} else {
|
||||||
last_node = node;
|
last_node = node;
|
||||||
@@ -479,9 +484,14 @@ phrase_t trie_search_suffixes(trie_t *self, char *word) {
|
|||||||
return (phrase_t) {phrase_start, phrase_len, value};
|
return (phrase_t) {phrase_start, phrase_len, value};
|
||||||
}
|
}
|
||||||
|
|
||||||
phrase_t trie_search_prefixes(trie_t *self, char *word) {
|
inline phrase_t trie_search_suffixes(trie_t *self, char *word) {
|
||||||
uint32_t node_id = ROOT_NODE_ID, last_node_id = node_id;
|
return trie_search_suffixes_from_index(self, word, ROOT_NODE_ID);
|
||||||
trie_node_t node = trie_get_root(self), last_node = node;
|
}
|
||||||
|
|
||||||
|
|
||||||
|
phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, uint32_t start_node_id) {
|
||||||
|
uint32_t node_id = start_node_id, last_node_id = node_id;
|
||||||
|
trie_node_t node = trie_get_node(self, node_id), last_node = node;
|
||||||
|
|
||||||
uint32_t value = 0, phrase_start = 0, phrase_len = 0;
|
uint32_t value = 0, phrase_start = 0, phrase_len = 0;
|
||||||
|
|
||||||
@@ -528,3 +538,8 @@ phrase_t trie_search_prefixes(trie_t *self, char *word) {
|
|||||||
return (phrase_t) {phrase_start, phrase_len, value};
|
return (phrase_t) {phrase_start, phrase_len, value};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline phrase_t trie_search_prefixes(trie_t *self, char *word) {
|
||||||
|
return trie_search_prefixes_from_index(self, word, ROOT_NODE_ID);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,9 +24,12 @@ typedef struct phrase {
|
|||||||
VECTOR_INIT(phrase_array, phrase_t)
|
VECTOR_INIT(phrase_array, phrase_t)
|
||||||
|
|
||||||
phrase_array *trie_search(trie_t *self, char *text);
|
phrase_array *trie_search(trie_t *self, char *text);
|
||||||
|
phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id);
|
||||||
phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens);
|
phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens);
|
||||||
|
phrase_array *trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, uint32_t start_node_id);
|
||||||
|
phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, uint32_t start_node_id);
|
||||||
phrase_t trie_search_suffixes(trie_t *self, char *word);
|
phrase_t trie_search_suffixes(trie_t *self, char *word);
|
||||||
|
phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, uint32_t start_node_id);
|
||||||
phrase_t trie_search_prefixes(trie_t *self, char *word);
|
phrase_t trie_search_prefixes(trie_t *self, char *word);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
Reference in New Issue
Block a user