From 1c5afcafd294e52fa9ec1296a84b1ebb46e581da Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 20 Oct 2017 02:43:39 -0400 Subject: [PATCH] [phrases] when skipping/ignoring hyphens in trie search, make sure that the new longer phrase ends at a word boundary (space, hyphen, end of string, etc.) --- src/trie_search.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/trie_search.c b/src/trie_search.c index 8518db89..fa78adf8 100644 --- a/src/trie_search.c +++ b/src/trie_search.c @@ -633,6 +633,8 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u trie_data_node_t data_node; trie_node_t terminal_node; + bool phrase_at_hyphen = false; + while (idx < len) { char_len = utf8proc_iterate(ptr, len, &codepoint); log_debug("char_len = %zu, char=%d\n", char_len, codepoint); @@ -653,7 +655,7 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u for (i = 0; i < char_len; i++) { node_id = trie_get_transition_index(self, last_node, *char_ptr); node = trie_get_node(self, node_id); - log_debug("At idx=%zu, char=%.*s\n", i, (int)char_len, char_ptr); + log_debug("At idx=%u, i=%zu, char=%.*s\n", idx, i, (int)char_len, char_ptr); if (node.check != last_node_id) { log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id); @@ -665,7 +667,12 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u } if (is_hyphen && node.check != last_node_id) { - log_debug("No space transition\n"); + log_debug("No space transition, phrase_len=%zu\n", phrase_len); + if (phrase_len > 0 && phrase_len == idx) { + log_debug("phrase_at_hyphen\n"); + phrase_at_hyphen = true; + } + ptr += char_len; idx += char_len; separator_char_len = char_len; @@ -720,10 +727,20 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u log_debug("match_len=%zu\n", match_len); if (tail_match_len == current_tail_len - tail_pos) { + if (phrase_at_hyphen) { + char_len = utf8proc_iterate(ptr + char_len, len, &codepoint); + if (char_len > 0 && utf8proc_codepoint_valid(codepoint)) { + int cat = utf8proc_category(codepoint); + + if (codepoint != 0 && !utf8_is_hyphen(codepoint) && !utf8_is_separator(cat) && !utf8_is_punctuation(cat)) { + return (phrase_t){phrase_start, phrase_len, value}; + } + } + } if (first_char) phrase_start = idx; phrase_len = (uint32_t)(idx + match_len) - phrase_start; - log_debug("tail match! phrase_len=%u\n", phrase_len); + log_debug("tail match! phrase_len=%u, len=%zu\n", phrase_len, len); value = data_node.data; return (phrase_t){phrase_start, phrase_len, value}; } else {