[phrases] Handle the case where trie search finds a match, makes progress beyond the next token, but then has to fall back to the last match. Add a trie search test case.

This commit is contained in:
Al
2016-02-08 01:07:56 -05:00
parent 3701d8380f
commit 9ac0379a65
5 changed files with 82 additions and 17 deletions

View File

@@ -84,7 +84,7 @@ static void add_normalized_strings_token(cstring_array *strings, char *str, toke
}
}
if (is_numeric_token(token.type) && options.split_alpha_from_numeric) {
if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) {
normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
normalize_token(strings, str, token, normalize_token_options);
normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
@@ -699,6 +699,7 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_
string_tree_t *token_tree = string_tree_new_size(len);
add_normalized_strings_tokenized(token_tree, str, tokens, options);
string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree);
string_tree_iterator_t *iter;
@@ -796,9 +797,6 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_
char_array_destroy(temp_string);
}
char **expand_address(char *input, normalize_options_t options, size_t *n) {
options.address_components |= ADDRESS_ANY;

View File

@@ -304,6 +304,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
if (strncmp((char *)current_tail, ptr + 1, ptr_len) == 0) {
log_debug("node tail matches first token\n");
int tail_search_result = trie_node_search_tail_tokens(self, node, str, tokens, ptr_len, i + 1);
log_debug("tail_search_result=%d\n", tail_search_result);
node_id = start_node_id;
node = trie_get_node(self, node_id);
check_continuation = false;
@@ -331,17 +332,18 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
if (node.check <= 0 || node_id == start_node_id) {
log_debug("state = SEARCH_STATE_NO_MATCH\n");
state = SEARCH_STATE_NO_MATCH;
// check
if (last_match_index != -1) {
log_debug("last_match not NULL and state==SEARCH_STATE_NO_MATCH, data=%d", data);
log_debug("last_match not NULL and state==SEARCH_STATE_NO_MATCH, data=%d\n", data);
if (*phrases == NULL) {
*phrases = phrase_array_new_size(1);
}
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
i = last_match_index;
last_match_index = -1;
phrase_start = 0;
phrase_start = phrase_len = 0;
node_id = last_node_id = start_node_id;
node = last_node = trie_get_node(self, start_node_id);
continue;
@@ -360,7 +362,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
state = SEARCH_STATE_PARTIAL_MATCH;
if (!(node.base < 0) && (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN)) {
log_debug("phrase_start=%d\n", i);
log_debug("phrase_start=%d, node.base = %d, last_state=%d\n", i, node.base, last_state);
phrase_start = i;
}
@@ -373,6 +375,8 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
data = data_node.data;
log_debug("data = %d\n", data);
log_debug("phrase_start = %d\n", phrase_start);
last_match_index = i;
log_debug("last_match_index = %d\n", i);
}
@@ -388,7 +392,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
i = last_match_index;
last_match_index = -1;
phrase_start = 0;
phrase_start = phrase_len = 0;
node_id = last_node_id = start_node_id;
node = last_node = trie_get_node(self, start_node_id);
state = SEARCH_STATE_NO_MATCH;
@@ -407,22 +411,24 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
log_debug("Ideographic character\n");
last_node_id = node_id;
last_node = node;
} else if (continuation.check != node_id && last_match_index != i) {
log_debug("No continuation for phrase with start=%d, yielding tokens\n", phrase_start);
state = SEARCH_STATE_NO_MATCH;
phrase_start = 0;
node_id = last_node_id = start_node_id;
node = last_node = trie_get_node(self, start_node_id);
} else if (continuation.check != node_id && last_match_index == i) {
} else if (continuation.check != node_id && last_match_index != -1) {
log_debug("node->match no continuation\n");
if (*phrases == NULL) {
*phrases = phrase_array_new_size(1);
}
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
last_match_index = -1;
i = last_match_index;
last_match_index = -1;
phrase_start = phrase_len = 0;
node_id = last_node_id = start_node_id;
node = last_node = trie_get_node(self, start_node_id);
state = SEARCH_STATE_BEGIN;
} else if (continuation.check != node_id) {
log_debug("No continuation for phrase with start=%d, yielding tokens\n", phrase_start);
state = SEARCH_STATE_NO_MATCH;
phrase_start = phrase_len = 0;
node_id = last_node_id = start_node_id;
node = last_node = trie_get_node(self, start_node_id);
} else {
log_debug("Has continuation, node_id=%d\n", continuation_id);
last_node = node = continuation;