From 9ac0379a65ea56323c3340bbef6ce530ab1e01c5 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 8 Feb 2016 01:07:56 -0500 Subject: [PATCH] [phrases] Case where trie search finds a match, makes progress beyond the next token but has to fall back. Adding trie search test case --- src/libpostal.c | 6 ++--- src/trie_search.c | 30 ++++++++++++++---------- test/Makefile.am | 2 +- test/test.c | 2 ++ test/test_trie.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 17 deletions(-) create mode 100644 test/test_trie.c diff --git a/src/libpostal.c b/src/libpostal.c index d3c9f81a..2ec83ba7 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -84,7 +84,7 @@ static void add_normalized_strings_token(cstring_array *strings, char *str, toke } } - if (is_numeric_token(token.type) && options.split_alpha_from_numeric) { + if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) { normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; normalize_token(strings, str, token, normalize_token_options); normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; @@ -699,6 +699,7 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_ string_tree_t *token_tree = string_tree_new_size(len); add_normalized_strings_tokenized(token_tree, str, tokens, options); + string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree); string_tree_iterator_t *iter; @@ -796,9 +797,6 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_ char_array_destroy(temp_string); } - - - char **expand_address(char *input, normalize_options_t options, size_t *n) { options.address_components |= ADDRESS_ANY; diff --git a/src/trie_search.c b/src/trie_search.c index edf8967d..bf1f4ce3 100644 --- a/src/trie_search.c +++ b/src/trie_search.c @@ -304,6 +304,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, if (strncmp((char *)current_tail, ptr + 1, ptr_len) == 0) { log_debug("node tail matches first token\n"); int tail_search_result = trie_node_search_tail_tokens(self, node, str, tokens, ptr_len, i + 1); + log_debug("tail_search_result=%d\n", tail_search_result); node_id = start_node_id; node = trie_get_node(self, node_id); check_continuation = false; @@ -331,17 +332,18 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, if (node.check <= 0 || node_id == start_node_id) { + log_debug("state = SEARCH_STATE_NO_MATCH\n"); state = SEARCH_STATE_NO_MATCH; // check if (last_match_index != -1) { - log_debug("last_match not NULL and state==SEARCH_STATE_NO_MATCH, data=%d", data); + log_debug("last_match not NULL and state==SEARCH_STATE_NO_MATCH, data=%d\n", data); if (*phrases == NULL) { *phrases = phrase_array_new_size(1); } phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data}); i = last_match_index; last_match_index = -1; - phrase_start = 0; + phrase_start = phrase_len = 0; node_id = last_node_id = start_node_id; node = last_node = trie_get_node(self, start_node_id); continue; @@ -360,7 +362,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, state = SEARCH_STATE_PARTIAL_MATCH; if (!(node.base < 0) && (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN)) { - log_debug("phrase_start=%d\n", i); + log_debug("phrase_start=%d, node.base = %d, last_state=%d\n", i, node.base, last_state); phrase_start = i; } @@ -373,6 +375,8 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, data = data_node.data; log_debug("data = %d\n", data); + log_debug("phrase_start = %d\n", phrase_start); + last_match_index = i; log_debug("last_match_index = %d\n", i); } @@ -388,7 +392,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data}); i = last_match_index; last_match_index = -1; - phrase_start = 0; + phrase_start = phrase_len = 0; node_id = last_node_id = start_node_id; node = last_node = trie_get_node(self, start_node_id); state = SEARCH_STATE_NO_MATCH; @@ -407,22 +411,24 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, log_debug("Ideographic character\n"); last_node_id = node_id; last_node = node; - } else if (continuation.check != node_id && last_match_index != i) { - log_debug("No continuation for phrase with start=%d, yielding tokens\n", phrase_start); - state = SEARCH_STATE_NO_MATCH; - phrase_start = 0; - node_id = last_node_id = start_node_id; - node = last_node = trie_get_node(self, start_node_id); - } else if (continuation.check != node_id && last_match_index == i) { + } else if (continuation.check != node_id && last_match_index != -1) { log_debug("node->match no continuation\n"); if (*phrases == NULL) { *phrases = phrase_array_new_size(1); } phrase_array_push(*phrases, (phrase_t){phrase_start, last_match_index - phrase_start + 1, data}); - last_match_index = -1; + i = last_match_index; + last_match_index = -1; + phrase_start = phrase_len = 0; node_id = last_node_id = start_node_id; node = last_node = trie_get_node(self, start_node_id); state = SEARCH_STATE_BEGIN; + } else if (continuation.check != node_id) { + log_debug("No continuation for phrase with start=%d, yielding tokens\n", phrase_start); + state = SEARCH_STATE_NO_MATCH; + phrase_start = phrase_len = 0; + node_id = last_node_id = start_node_id; + node = last_node = trie_get_node(self, start_node_id); } else { log_debug("Has continuation, node_id=%d\n", continuation_id); last_node = node = continuation; diff --git a/test/Makefile.am b/test/Makefile.am index 6c657a71..bb019b0b 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -9,6 +9,6 @@ CFLAGS = $(CFLAGS_BASE) TESTS = test_libpostal noinst_PROGRAMS = test_libpostal -test_libpostal_SOURCES = test.c test_expand.c test_transliterate.c test_numex.c +test_libpostal_SOURCES = test.c test_expand.c test_transliterate.c test_numex.c test_trie.c test_libpostal_LDADD = ../src/libpostal.la test_libpostal_CFLAGS = $(CFLAGS_O3) diff --git a/test/test.c b/test/test.c index 6ad486cc..a0ad1d4f 100644 --- a/test/test.c +++ b/test/test.c @@ -3,6 +3,7 @@ SUITE_EXTERN(libpostal_expansion_tests); SUITE_EXTERN(libpostal_transliteration_tests); SUITE_EXTERN(libpostal_numex_tests); +SUITE_EXTERN(libpostal_trie_tests); GREATEST_MAIN_DEFS(); @@ -13,5 +14,6 @@ int main(int argc, char **argv) { RUN_SUITE(libpostal_expansion_tests); RUN_SUITE(libpostal_transliteration_tests); RUN_SUITE(libpostal_numex_tests); + RUN_SUITE(libpostal_trie_tests); GREATEST_MAIN_END(); } diff --git a/test/test_trie.c b/test/test_trie.c new file mode 100644 index 00000000..457e7222 --- /dev/null +++ b/test/test_trie.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +#include "greatest.h" +#include "../src/scanner.h" +#include "../src/trie.h" +#include "../src/trie_search.h" + +SUITE(libpostal_trie_tests); + +static greatest_test_res test_trie_add_get(trie_t *trie, char *key, uint32_t data) { + bool added = trie_add(trie, key, data); + ASSERT(added); + + uint32_t trie_data; + bool fetched = trie_get_data(trie, key, &trie_data); + ASSERT(fetched); + ASSERT_EQ(data, trie_data); + + PASS(); +} + +static greatest_test_res test_trie_setup(trie_t *trie) { + CHECK_CALL(test_trie_add_get(trie, "st", 1)); + CHECK_CALL(test_trie_add_get(trie, "street", 2)); + CHECK_CALL(test_trie_add_get(trie, "st rt", 3)); + CHECK_CALL(test_trie_add_get(trie, "st rd", 3)); + CHECK_CALL(test_trie_add_get(trie, "state route", 4)); + CHECK_CALL(test_trie_add_get(trie, "maine", 5)); + + PASS(); +} + + +TEST test_trie(void) { + trie_t *trie = trie_new(); + ASSERT(trie != NULL); + CHECK_CALL(test_trie_setup(trie)); + + char *input = "main st r 20"; + token_array *tokens = tokenize_keep_whitespace(input); + phrase_array *phrases = trie_search_tokens(trie, input, tokens); + + ASSERT(phrases != NULL); + ASSERT(phrases->n == 1); + phrase_t phrase = phrases->a[0]; + ASSERT(phrase.start == 2); + ASSERT(phrase.len == 1); + + trie_destroy(trie); + + PASS(); +} + +GREATEST_SUITE(libpostal_trie_tests) { + RUN_TEST(test_trie); +}