diff --git a/src/string_utils.c b/src/string_utils.c index 7d94496b..5fa8dbb1 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -252,14 +252,18 @@ inline int utf8_compare(const char *str1, const char *str2) { return utf8_compare_len(str1, str2, strlen(str1)); } -size_t utf8_common_prefix(const char *str1, const char *str2) { + +size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len) { size_t common_prefix = 0; + if (len == 0) return common_prefix; + int32_t c1 = 0; int32_t c2 = 0; - size_t len1 = strlen(str1); - size_t len2 = strlen(str2); + size_t remaining = len; + + ssize_t len1, len2; uint8_t *ptr1 = (uint8_t *)str1; uint8_t *ptr2 = (uint8_t *)str2; @@ -273,6 +277,9 @@ size_t utf8_common_prefix(const char *str1, const char *str2) { ptr1 += len1; ptr2 += len2; common_prefix += len1; + if (common_prefix >= len) { + return common_prefix; + } } else { break; } @@ -281,6 +288,15 @@ size_t utf8_common_prefix(const char *str1, const char *str2) { return common_prefix; } +size_t utf8_common_prefix(const char *str1, const char *str2) { + size_t len1 = strlen(str1); + size_t len2 = strlen(str2); + + size_t len = len1 <= len2 ? len1 : len2; + + return utf8_common_prefix_len(str1, str2, len); +} + size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len) { if (len == 0) return 0; diff --git a/src/string_utils.h b/src/string_utils.h index 13be4e4f..8253989c 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -72,6 +72,7 @@ char *utf8_lower(const char *s); // returns a copy, caller frees int utf8_compare(const char *str1, const char *str2); int utf8_compare_len(const char *str1, const char *str2, size_t len); size_t utf8_common_prefix(const char *str1, const char *str2); +size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len); size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2); size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len); diff --git a/src/trie_search.c b/src/trie_search.c index 8276cef3..edf8967d 100644 --- a/src/trie_search.c +++ b/src/trie_search.c @@ -696,10 +696,11 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u } } - match_len += utf8_common_prefix_len_ignore_separators((char *)ptr + char_len, (char *)current_tail + tail_pos, current_tail_len - tail_pos); + size_t tail_match_len = utf8_common_prefix_len((char *)ptr + char_len, (char *)current_tail + tail_pos, current_tail_len - tail_pos); + match_len += tail_match_len; log_debug("match_len=%zu\n", match_len); - if (match_len >= current_tail_len) { + if (tail_match_len == current_tail_len - tail_pos) { if (first_char) phrase_start = idx; phrase_len = (uint32_t)(idx + match_len) - phrase_start; @@ -717,7 +718,7 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u if (terminal_node.check == node_id) { log_debug("Transition to NUL byte matched\n"); if (terminal_node.base < 0) { - phrase_len = idx + char_len - phrase_start; + phrase_len = (uint32_t)(idx + char_len) - phrase_start; data_node = trie_get_data_node(self, terminal_node); value = data_node.data; }