[phrases] Handle the case where trie search finds a match, makes progress beyond the next token, but then has to fall back to the earlier match. Add a trie search test case.
This commit is contained in:
@@ -84,7 +84,7 @@ static void add_normalized_strings_token(cstring_array *strings, char *str, toke
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (is_numeric_token(token.type) && options.split_alpha_from_numeric) {
|
if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) {
|
||||||
normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
|
normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
|
||||||
normalize_token(strings, str, token, normalize_token_options);
|
normalize_token(strings, str, token, normalize_token_options);
|
||||||
normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
|
normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
|
||||||
@@ -699,6 +699,7 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_
|
|||||||
string_tree_t *token_tree = string_tree_new_size(len);
|
string_tree_t *token_tree = string_tree_new_size(len);
|
||||||
|
|
||||||
add_normalized_strings_tokenized(token_tree, str, tokens, options);
|
add_normalized_strings_tokenized(token_tree, str, tokens, options);
|
||||||
|
|
||||||
string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree);
|
string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree);
|
||||||
|
|
||||||
string_tree_iterator_t *iter;
|
string_tree_iterator_t *iter;
|
||||||
@@ -796,9 +797,6 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_
|
|||||||
char_array_destroy(temp_string);
|
char_array_destroy(temp_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
char **expand_address(char *input, normalize_options_t options, size_t *n) {
|
char **expand_address(char *input, normalize_options_t options, size_t *n) {
|
||||||
options.address_components |= ADDRESS_ANY;
|
options.address_components |= ADDRESS_ANY;
|
||||||
|
|
||||||
|
|||||||
@@ -304,6 +304,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
|
|||||||
if (strncmp((char *)current_tail, ptr + 1, ptr_len) == 0) {
|
if (strncmp((char *)current_tail, ptr + 1, ptr_len) == 0) {
|
||||||
log_debug("node tail matches first token\n");
|
log_debug("node tail matches first token\n");
|
||||||
int tail_search_result = trie_node_search_tail_tokens(self, node, str, tokens, ptr_len, i + 1);
|
int tail_search_result = trie_node_search_tail_tokens(self, node, str, tokens, ptr_len, i + 1);
|
||||||
|
log_debug("tail_search_result=%d\n", tail_search_result);
|
||||||
node_id = start_node_id;
|
node_id = start_node_id;
|
||||||
node = trie_get_node(self, node_id);
|
node = trie_get_node(self, node_id);
|
||||||
check_continuation = false;
|
check_continuation = false;
|
||||||
@@ -331,17 +332,18 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
|
|||||||
|
|
||||||
|
|
||||||
if (node.check <= 0 || node_id == start_node_id) {
|
if (node.check <= 0 || node_id == start_node_id) {
|
||||||
|
log_debug("state = SEARCH_STATE_NO_MATCH\n");
|
||||||
state = SEARCH_STATE_NO_MATCH;
|
state = SEARCH_STATE_NO_MATCH;
|
||||||
// check
|
// check
|
||||||
if (last_match_index != -1) {
|
if (last_match_index != -1) {
|
||||||
log_debug("last_match not NULL and state==SEARCH_STATE_NO_MATCH, data=%d", data);
|
log_debug("last_match not NULL and state==SEARCH_STATE_NO_MATCH, data=%d\n", data);
|
||||||
if (*phrases == NULL) {
|
if (*phrases == NULL) {
|
||||||
*phrases = phrase_array_new_size(1);
|
*phrases = phrase_array_new_size(1);
|
||||||
}
|
}
|
||||||
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
|
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
|
||||||
i = last_match_index;
|
i = last_match_index;
|
||||||
last_match_index = -1;
|
last_match_index = -1;
|
||||||
phrase_start = 0;
|
phrase_start = phrase_len = 0;
|
||||||
node_id = last_node_id = start_node_id;
|
node_id = last_node_id = start_node_id;
|
||||||
node = last_node = trie_get_node(self, start_node_id);
|
node = last_node = trie_get_node(self, start_node_id);
|
||||||
continue;
|
continue;
|
||||||
@@ -360,7 +362,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
|
|||||||
|
|
||||||
state = SEARCH_STATE_PARTIAL_MATCH;
|
state = SEARCH_STATE_PARTIAL_MATCH;
|
||||||
if (!(node.base < 0) && (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN)) {
|
if (!(node.base < 0) && (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN)) {
|
||||||
log_debug("phrase_start=%d\n", i);
|
log_debug("phrase_start=%d, node.base = %d, last_state=%d\n", i, node.base, last_state);
|
||||||
phrase_start = i;
|
phrase_start = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -373,6 +375,8 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
|
|||||||
data = data_node.data;
|
data = data_node.data;
|
||||||
log_debug("data = %d\n", data);
|
log_debug("data = %d\n", data);
|
||||||
|
|
||||||
|
log_debug("phrase_start = %d\n", phrase_start);
|
||||||
|
|
||||||
last_match_index = i;
|
last_match_index = i;
|
||||||
log_debug("last_match_index = %d\n", i);
|
log_debug("last_match_index = %d\n", i);
|
||||||
}
|
}
|
||||||
@@ -388,7 +392,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
|
|||||||
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
|
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
|
||||||
i = last_match_index;
|
i = last_match_index;
|
||||||
last_match_index = -1;
|
last_match_index = -1;
|
||||||
phrase_start = 0;
|
phrase_start = phrase_len = 0;
|
||||||
node_id = last_node_id = start_node_id;
|
node_id = last_node_id = start_node_id;
|
||||||
node = last_node = trie_get_node(self, start_node_id);
|
node = last_node = trie_get_node(self, start_node_id);
|
||||||
state = SEARCH_STATE_NO_MATCH;
|
state = SEARCH_STATE_NO_MATCH;
|
||||||
@@ -407,22 +411,24 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens,
|
|||||||
log_debug("Ideographic character\n");
|
log_debug("Ideographic character\n");
|
||||||
last_node_id = node_id;
|
last_node_id = node_id;
|
||||||
last_node = node;
|
last_node = node;
|
||||||
} else if (continuation.check != node_id && last_match_index != i) {
|
} else if (continuation.check != node_id && last_match_index != -1) {
|
||||||
log_debug("No continuation for phrase with start=%d, yielding tokens\n", phrase_start);
|
|
||||||
state = SEARCH_STATE_NO_MATCH;
|
|
||||||
phrase_start = 0;
|
|
||||||
node_id = last_node_id = start_node_id;
|
|
||||||
node = last_node = trie_get_node(self, start_node_id);
|
|
||||||
} else if (continuation.check != node_id && last_match_index == i) {
|
|
||||||
log_debug("node->match no continuation\n");
|
log_debug("node->match no continuation\n");
|
||||||
if (*phrases == NULL) {
|
if (*phrases == NULL) {
|
||||||
*phrases = phrase_array_new_size(1);
|
*phrases = phrase_array_new_size(1);
|
||||||
}
|
}
|
||||||
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
|
phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data});
|
||||||
|
i = last_match_index;
|
||||||
last_match_index = -1;
|
last_match_index = -1;
|
||||||
|
phrase_start = phrase_len = 0;
|
||||||
node_id = last_node_id = start_node_id;
|
node_id = last_node_id = start_node_id;
|
||||||
node = last_node = trie_get_node(self, start_node_id);
|
node = last_node = trie_get_node(self, start_node_id);
|
||||||
state = SEARCH_STATE_BEGIN;
|
state = SEARCH_STATE_BEGIN;
|
||||||
|
} else if (continuation.check != node_id) {
|
||||||
|
log_debug("No continuation for phrase with start=%d, yielding tokens\n", phrase_start);
|
||||||
|
state = SEARCH_STATE_NO_MATCH;
|
||||||
|
phrase_start = phrase_len = 0;
|
||||||
|
node_id = last_node_id = start_node_id;
|
||||||
|
node = last_node = trie_get_node(self, start_node_id);
|
||||||
} else {
|
} else {
|
||||||
log_debug("Has continuation, node_id=%d\n", continuation_id);
|
log_debug("Has continuation, node_id=%d\n", continuation_id);
|
||||||
last_node = node = continuation;
|
last_node = node = continuation;
|
||||||
|
|||||||
@@ -9,6 +9,6 @@ CFLAGS = $(CFLAGS_BASE)
|
|||||||
|
|
||||||
TESTS = test_libpostal
|
TESTS = test_libpostal
|
||||||
noinst_PROGRAMS = test_libpostal
|
noinst_PROGRAMS = test_libpostal
|
||||||
test_libpostal_SOURCES = test.c test_expand.c test_transliterate.c test_numex.c
|
test_libpostal_SOURCES = test.c test_expand.c test_transliterate.c test_numex.c test_trie.c
|
||||||
test_libpostal_LDADD = ../src/libpostal.la
|
test_libpostal_LDADD = ../src/libpostal.la
|
||||||
test_libpostal_CFLAGS = $(CFLAGS_O3)
|
test_libpostal_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
SUITE_EXTERN(libpostal_expansion_tests);
|
SUITE_EXTERN(libpostal_expansion_tests);
|
||||||
SUITE_EXTERN(libpostal_transliteration_tests);
|
SUITE_EXTERN(libpostal_transliteration_tests);
|
||||||
SUITE_EXTERN(libpostal_numex_tests);
|
SUITE_EXTERN(libpostal_numex_tests);
|
||||||
|
SUITE_EXTERN(libpostal_trie_tests);
|
||||||
|
|
||||||
GREATEST_MAIN_DEFS();
|
GREATEST_MAIN_DEFS();
|
||||||
|
|
||||||
@@ -13,5 +14,6 @@ int main(int argc, char **argv) {
|
|||||||
RUN_SUITE(libpostal_expansion_tests);
|
RUN_SUITE(libpostal_expansion_tests);
|
||||||
RUN_SUITE(libpostal_transliteration_tests);
|
RUN_SUITE(libpostal_transliteration_tests);
|
||||||
RUN_SUITE(libpostal_numex_tests);
|
RUN_SUITE(libpostal_numex_tests);
|
||||||
|
RUN_SUITE(libpostal_trie_tests);
|
||||||
GREATEST_MAIN_END();
|
GREATEST_MAIN_END();
|
||||||
}
|
}
|
||||||
|
|||||||
59
test/test_trie.c
Normal file
59
test/test_trie.c
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
|
||||||
|
#include "greatest.h"
|
||||||
|
#include "../src/scanner.h"
|
||||||
|
#include "../src/trie.h"
|
||||||
|
#include "../src/trie_search.h"
|
||||||
|
|
||||||
|
SUITE(libpostal_trie_tests);
|
||||||
|
|
||||||
|
/*
 * Test helper: store `data` under `key` in `trie` with trie_add(), then read
 * the key back with trie_get_data() and check the round trip.
 *
 * Asserts that the insert reported success, that the lookup reported success,
 * and that the value read back equals the value stored. Ends with PASS().
 */
static greatest_test_res test_trie_add_get(trie_t *trie, char *key, uint32_t data) {
|
||||||
|
// Insert the key/value pair; the insert must report success.
bool added = trie_add(trie, key, data);
|
||||||
|
ASSERT(added);
|
||||||
|
|
||||||
|
// Read the same key back; trie_data is filled in through the out-pointer.
uint32_t trie_data;
|
||||||
|
bool fetched = trie_get_data(trie, key, &trie_data);
|
||||||
|
ASSERT(fetched);
|
||||||
|
// The value stored and the value retrieved must be identical.
ASSERT_EQ(data, trie_data);
|
||||||
|
|
||||||
|
PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Test fixture: populate `trie` with the six keys used by the search test.
 *
 * Each key is inserted and immediately read back through test_trie_add_get(),
 * wrapped in CHECK_CALL. The key set mixes a short key ("st") with longer
 * keys that begin with the same characters ("street", "st rt", "st rd",
 * "state route"), plus the unrelated key "maine".
 */
static greatest_test_res test_trie_setup(trie_t *trie) {
|
||||||
|
CHECK_CALL(test_trie_add_get(trie, "st", 1));
|
||||||
|
CHECK_CALL(test_trie_add_get(trie, "street", 2));
|
||||||
|
// NOTE(review): "st rt" and "st rd" both carry data value 3 - presumably
// intentional (same meaning for both spellings); confirm.
CHECK_CALL(test_trie_add_get(trie, "st rt", 3));
|
||||||
|
CHECK_CALL(test_trie_add_get(trie, "st rd", 3));
|
||||||
|
CHECK_CALL(test_trie_add_get(trie, "state route", 4));
|
||||||
|
CHECK_CALL(test_trie_add_get(trie, "maine", 5));
|
||||||
|
|
||||||
|
PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Token-level trie search test for the "match, advance, then fall back" case.
 *
 * Builds a trie via test_trie_setup(), tokenizes "main st r 20" with
 * tokenize_keep_whitespace(), runs trie_search_tokens() over the tokens, and
 * asserts that exactly one phrase is returned with start == 2 and len == 1.
 *
 * The input is chosen so that "st" is a stored key while the following text
 * ("st r") only begins the longer keys "st rt" / "st rd" without completing
 * them; likewise "main" only begins the key "maine".
 * NOTE(review): presumably token index 2 is "st" because whitespace tokens
 * are kept ("main", " ", "st", ...) - confirm against the tokenizer.
 */
TEST test_trie(void) {
|
||||||
|
trie_t *trie = trie_new();
|
||||||
|
ASSERT(trie != NULL);
|
||||||
|
CHECK_CALL(test_trie_setup(trie));
|
||||||
|
|
||||||
|
char *input = "main st r 20";
|
||||||
|
token_array *tokens = tokenize_keep_whitespace(input);
|
||||||
|
phrase_array *phrases = trie_search_tokens(trie, input, tokens);
|
||||||
|
|
||||||
|
// Exactly one phrase is expected for this input.
ASSERT(phrases != NULL);
|
||||||
|
ASSERT(phrases->n == 1);
|
||||||
|
phrase_t phrase = phrases->a[0];
|
||||||
|
// The phrase must start at token index 2 and span a single token.
ASSERT(phrase.start == 2);
|
||||||
|
ASSERT(phrase.len == 1);
|
||||||
|
|
||||||
|
// NOTE(review): only the trie is released here; `tokens` and `phrases` are
// never freed in this function - confirm whether they should be destroyed
// as well before PASS().
trie_destroy(trie);
|
||||||
|
|
||||||
|
PASS();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Suite body for libpostal_trie_tests: runs the single test_trie test. */
GREATEST_SUITE(libpostal_trie_tests) {
|
||||||
|
RUN_TEST(test_trie);
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user