[phrases] Adding with_phrases versions of trie search methods for pre-allocated phrases

This commit is contained in:
Al
2015-09-16 21:23:34 -04:00
parent d5ec005787
commit 23103a21d4
2 changed files with 52 additions and 26 deletions

View File

@@ -7,10 +7,8 @@ typedef enum {
SEARCH_STATE_MATCH SEARCH_STATE_MATCH
} trie_search_state_t; } trie_search_state_t;
phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id) { bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, phrase_array **phrases) {
if (text == NULL) return NULL; if (text == NULL) return false;
phrase_array *phrases = NULL;
ssize_t len, remaining; ssize_t len, remaining;
int32_t unich = 0; int32_t unich = 0;
@@ -35,8 +33,8 @@ phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_no
while(1) { while(1) {
len = utf8proc_iterate(ptr, -1, &unich); len = utf8proc_iterate(ptr, -1, &unich);
remaining = len; remaining = len;
if (len <= 0) return NULL; if (len <= 0) return false;
if (!(utf8proc_codepoint_valid(unich))) return NULL; if (!(utf8proc_codepoint_valid(unich))) return false;
int cat = utf8proc_category(unich); int cat = utf8proc_category(unich);
bool is_letter = utf8_is_letter(cat); bool is_letter = utf8_is_letter(cat);
@@ -68,10 +66,10 @@ phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_no
state = is_letter ? SEARCH_STATE_NO_MATCH : SEARCH_STATE_BEGIN; state = is_letter ? SEARCH_STATE_NO_MATCH : SEARCH_STATE_BEGIN;
if (match) { if (match) {
log_debug("match is true and state==SEARCH_STATE_NO_MATCH\n"); log_debug("match is true and state==SEARCH_STATE_NO_MATCH\n");
if (!phrases) { if (*phrases == NULL) {
phrases = phrase_array_new_size(1); *phrases = phrase_array_new_size(1);
} }
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data}); phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
index = phrase_start + phrase_len; index = phrase_start + phrase_len;
advance_index = false; advance_index = false;
// Set the text back to the end of the last phrase // Set the text back to the end of the last phrase
@@ -122,10 +120,10 @@ phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_no
} else if (match) { } else if (match) {
log_debug("match is true and longer phrase tail did not match\n"); log_debug("match is true and longer phrase tail did not match\n");
log_debug("phrase_start=%d, phrase_len=%d\n", phrase_start, phrase_len); log_debug("phrase_start=%d, phrase_len=%d\n", phrase_start, phrase_len);
if (!phrases) { if (*phrases == NULL) {
phrases = phrase_array_new_size(1); *phrases = phrase_array_new_size(1);
} }
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data}); phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
ptr = fail_ptr; ptr = fail_ptr;
match = false; match = false;
index = phrase_start + phrase_len; index = phrase_start + phrase_len;
@@ -157,7 +155,10 @@ phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_no
if (unich == 0) { if (unich == 0) {
if (last_state == SEARCH_STATE_MATCH) { if (last_state == SEARCH_STATE_MATCH) {
log_debug("Found match at the end\n"); log_debug("Found match at the end\n");
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data}); if (*phrases == NULL) {
*phrases = phrase_array_new_size(1);
}
phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
} }
break; break;
} }
@@ -168,11 +169,19 @@ phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_no
log_debug("index now %llu\n", index); log_debug("index now %llu\n", index);
} // while } // while
return phrases; return true;
}
/*
 * Search `str` for phrases starting at the trie root, appending every match
 * to the caller-supplied array.
 *
 * phrases: address of the caller's phrase_array pointer. If *phrases is NULL,
 *          trie_search_from_index allocates it when the first match is found;
 *          otherwise matches are pushed onto the existing array.
 *
 * Returns the result of trie_search_from_index: false when `str` is NULL or
 * contains an invalid UTF-8 sequence/codepoint, true otherwise.
 */
inline bool trie_search_with_phrases(trie_t *self, char *str, phrase_array **phrases) {
    /* `phrases` is already a phrase_array **; forward it unchanged so the
     * callee writes through to the caller's pointer rather than to this
     * function's local parameter. */
    return trie_search_from_index(self, str, ROOT_NODE_ID, phrases);
}
/*
 * Convenience wrapper around trie_search_with_phrases that starts with no
 * pre-allocated array.
 *
 * Returns the phrase array filled in by the search, or NULL when the search
 * reports failure. The result is also NULL when the search succeeds without
 * allocating an array (no match pushed).
 *
 * NOTE(review): if the search fails after some matches were already pushed,
 * the partially filled array is dropped here without being freed — confirm
 * whether it should be destroyed before returning.
 */
inline phrase_array *trie_search(trie_t *self, char *text) {
    phrase_array *phrases = NULL;
    if (!trie_search_with_phrases(self, text, &phrases)) {
        /* Pointer-returning function: report failure as NULL, not `false`. */
        return NULL;
    }
    return phrases;
}
int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, token_array *tokens, int tail_index, int token_index) { int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, token_array *tokens, int tail_index, int token_index) {
@@ -218,10 +227,8 @@ int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, toke
} }
phrase_array *trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, uint32_t start_node_id) { bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, uint32_t start_node_id, phrase_array **phrases) {
if (str == NULL || tokens == NULL || tokens->n == 0) return NULL; if (str == NULL || tokens == NULL || tokens->n == 0) return false;
phrase_array *phrases = phrase_array_new();
uint32_t node_id = start_node_id, last_node_id = start_node_id; uint32_t node_id = start_node_id, last_node_id = start_node_id;
trie_node_t node = trie_get_node(self, node_id), last_node = node; trie_node_t node = trie_get_node(self, node_id), last_node = node;
@@ -315,7 +322,10 @@ phrase_array *trie_search_tokens_from_index(trie_t *self, char *str, token_array
// check // check
if (last_match_index != -1) { if (last_match_index != -1) {
log_debug("last_match not NULL and state==SEARCH_STATE_NO_MATCH, data=%d", data); log_debug("last_match not NULL and state==SEARCH_STATE_NO_MATCH, data=%d", data);
phrase_array_push(phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data}); if (*phrases == NULL) {
*phrases = phrase_array_new_size(1);
}
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
i = last_match_index; i = last_match_index;
last_match_index = -1; last_match_index = -1;
phrase_start = 0; phrase_start = 0;
@@ -378,7 +388,10 @@ phrase_array *trie_search_tokens_from_index(trie_t *self, char *str, token_array
node = last_node = trie_get_node(self, start_node_id); node = last_node = trie_get_node(self, start_node_id);
} else if (continuation.check != node_id && last_match_index == i) { } else if (continuation.check != node_id && last_match_index == i) {
log_debug("node->match no continuation\n"); log_debug("node->match no continuation\n");
phrase_array_push(phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data}); if (*phrases == NULL) {
*phrases = phrase_array_new_size(1);
}
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
last_match_index = -1; last_match_index = -1;
node_id = last_node_id = start_node_id; node_id = last_node_id = start_node_id;
node = last_node = trie_get_node(self, start_node_id); node = last_node = trie_get_node(self, start_node_id);
@@ -394,14 +407,25 @@ phrase_array *trie_search_tokens_from_index(trie_t *self, char *str, token_array
} }
if (last_match_index != -1) { if (last_match_index != -1) {
phrase_array_push(phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data}); if (*phrases == NULL) {
*phrases = phrase_array_new_size(1);
}
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
} }
return phrases; return true;
}
/*
 * Search the tokenized string `str` for phrases starting at the trie root,
 * appending every match to the caller-supplied array.
 *
 * phrases: address of the caller's phrase_array pointer. If *phrases is NULL,
 *          trie_search_tokens_from_index allocates it when a match is pushed;
 *          otherwise matches are pushed onto the existing array.
 *
 * Returns the result of trie_search_tokens_from_index: false when `str` or
 * `tokens` is NULL or `tokens` is empty, true otherwise.
 */
inline bool trie_search_tokens_with_phrases(trie_t *self, char *str, token_array *tokens, phrase_array **phrases) {
    /* `phrases` is already a phrase_array **; forward it unchanged so the
     * callee writes through to the caller's pointer rather than to this
     * function's local parameter. */
    return trie_search_tokens_from_index(self, str, tokens, ROOT_NODE_ID, phrases);
}
/*
 * Token-based phrase search from the trie root with no pre-allocated array.
 *
 * Delegates to trie_search_tokens_with_phrases with a NULL-initialized array
 * pointer. Yields NULL when that call reports failure; otherwise yields
 * whatever array pointer the search left behind (still NULL if nothing was
 * pushed).
 */
inline phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens) {
    phrase_array *found = NULL;
    bool ok = trie_search_tokens_with_phrases(self, str, tokens, &found);
    return ok ? found : NULL;
}
phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) { phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) {

View File

@@ -28,9 +28,11 @@ VECTOR_INIT(phrase_array, phrase_t)
#define NULL_PHRASE (phrase_t){0, 0, 0}; #define NULL_PHRASE (phrase_t){0, 0, 0};
phrase_array *trie_search(trie_t *self, char *text); phrase_array *trie_search(trie_t *self, char *text);
phrase_array *trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id); bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, phrase_array **phrases);
bool trie_search_with_phrases(trie_t *self, char *text, phrase_array **phrases);
phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens); phrase_array *trie_search_tokens(trie_t *self, char *str, token_array *tokens);
phrase_array *trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, uint32_t start_node_id); bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, uint32_t start_node_id, phrase_array **phrases);
bool trie_search_tokens_with_phrases(trie_t *self, char *text, token_array *tokens, phrase_array **phrases);
phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id); phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id);
phrase_t trie_search_suffixes_from_index_get_suffix_char(trie_t *self, char *word, size_t len, uint32_t start_node_id); phrase_t trie_search_suffixes_from_index_get_suffix_char(trie_t *self, char *word, size_t len, uint32_t start_node_id);
phrase_t trie_search_suffixes(trie_t *self, char *word, size_t len); phrase_t trie_search_suffixes(trie_t *self, char *word, size_t len);