[phrases] fix a case in trie search when searching for tokens in a string tail. If we're on the last token in a sequence and the token matches the tail, check that the tail is complete, and if so return the match before exiting the loop. Affects multiword phrases that tend to appear toward the end of a sequence (long country names like "United States of America", etc.)

This commit is contained in:
Al
2016-12-29 16:15:33 -05:00
parent 2d077699e6
commit bdb51a244e

View File

@@ -198,11 +198,12 @@ int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, toke
if (!(*tail_ptr)) {
log_debug("tail matches!\n");
return token_index-1;
return token_index - 1;
}
log_debug("Searching tail: %s\n", tail_ptr);
for (int i = token_index; i < tokens->n; i++) {
size_t num_tokens = tokens->n;
for (int i = token_index; i < num_tokens; i++) {
token_t token = tokens->a[i];
char *ptr = str + token.offset;
@@ -210,7 +211,7 @@ int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, toke
if (!(*tail_ptr)) {
log_debug("tail matches!\n");
return i-1;
return i - 1;
}
if (token.type == WHITESPACE && *tail_ptr == ' ') continue;
@@ -224,6 +225,10 @@ int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, toke
if (strncmp((char *)tail_ptr, ptr, token_length) == 0) {
tail_ptr += token_length;
if (i == num_tokens - 1 && !(*tail_ptr)) {
return i;
}
} else {
return -1;
}