From 39e83961ef8d1844925d2eed62e84052b635aa9c Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 19 Dec 2015 01:29:49 -0500 Subject: [PATCH] [fix] Bug in suffix expansion affecting inseparable suffixes like burg as well as ordinal suffixes like first=>1st --- src/numex.c | 4 ++- src/transliterate.c | 4 +-- src/trie_search.c | 84 ++++++++++++++++++++++----------------------- 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/numex.c b/src/numex.c index 22b07046..1883432c 100644 --- a/src/numex.c +++ b/src/numex.c @@ -968,7 +968,9 @@ char *replace_numeric_expressions(char *str, char *lang) { if (result.is_ordinal) { char *ordinal_suffix = get_ordinal_suffix(numeric_string, lang, result); - char_array_append(replacement, ordinal_suffix); + if (ordinal_suffix != NULL) { + char_array_append(replacement, ordinal_suffix); + } } start = result.start + result.len; diff --git a/src/transliterate.c b/src/transliterate.c index be107564..b8982e1e 100644 --- a/src/transliterate.c +++ b/src/transliterate.c @@ -948,10 +948,10 @@ char *transliterate(char *trans_name, char *str, size_t len) { char_array_destroy(revisit); } - free(original_str); - log_debug("original_str=%s\n", original_str); + free(original_str); + str = char_array_to_string(new_str); log_debug("new_str = %s\n", str); diff --git a/src/trie_search.c b/src/trie_search.c index 149f506e..a6d8295b 100644 --- a/src/trie_search.c +++ b/src/trie_search.c @@ -480,50 +480,39 @@ phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, u while(index > 0) { char_len = utf8proc_iterate_reversed(ptr, index, &unich); - if (char_len <= 0) break; - if (!(utf8proc_codepoint_valid(unich))) break; + if (char_len <= 0) return NULL_PHRASE; + if (!(utf8proc_codepoint_valid(unich))) return NULL_PHRASE; index -= char_len; char_ptr = ptr + index; - for (int i=0; i < char_len; i++, char_ptr++, last_node = node, last_node_id = node_id) { - log_debug("char=%c\n", (unsigned char)*char_ptr); + if (in_tail && tail_remaining >= char_len && strncmp((char *)current_tail, (char *)char_ptr, char_len) == 0) { + tail_remaining -= char_len; + current_tail += char_len; + phrase_start = (uint32_t)index; - if (in_tail && *current_tail && *current_tail == *char_ptr) { - tail_remaining--; - current_tail++; - if (i == char_len - 1) { - phrase_start = (uint32_t)index; - if (index == 0 && tail_remaining == 0) { - log_debug("tail match! ..tail_value=%u\n",tail_value); - phrase_len = (uint32_t)(len - index); - value = tail_value; - index = 0; - break; - } else { - phrase_len += char_len; - } + log_debug("tail matched at char %.*s (len=%zd)\n", (int)char_len, char_ptr, char_len); + log_debug("tail_remaining = %zu\n", tail_remaining); - } - continue; - } else if (in_tail && tail_remaining == 0 && i == char_len - 1) { - log_debug("tail match! tail_value=%u\n", tail_value); - phrase_start = (uint32_t)(index + char_len); - phrase_len = (uint32_t)(len - index - char_len); + if (tail_remaining == 0) { + log_debug("tail match! tail_value=%u\n",tail_value); + phrase_len = (uint32_t)(len - index); value = tail_value; index = 0; break; - } else if (in_tail) { - log_debug("Done with tail\n"); - index = 0; - phrase_len = 0; - break; } + continue; + } else if (in_tail) { + break; + } + + for (int i=0; i < char_len; i++, char_ptr++, last_node = node, last_node_id = node_id) { + log_debug("char=%c\n", (unsigned char)*char_ptr); node_id = trie_get_transition_index(self, node, *char_ptr); node = trie_get_node(self, node_id); - if (node.check != last_node_id) { + if (node.check != last_node_id) { log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id); index = 0; break; @@ -542,32 +531,41 @@ phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, u log_debug("tail_remaining=%zu\n", tail_remaining); in_tail = true; + size_t remaining_char_len = char_len - i - 1; + log_debug("remaining_char_len = %zu\n", remaining_char_len); + + if (remaining_char_len > 0 && strncmp((char *)char_ptr, (char *)current_tail, remaining_char_len) == 0) { + log_debug("tail string comparison successful\n"); + tail_remaining -= remaining_char_len; + } else if (remaining_char_len > 0) { + log_debug("tail comparison unsuccessful, \n"); + index = 0; + break; + } + if (tail_remaining == 0) { phrase_start = (uint32_t)index; phrase_len = (uint32_t)(len - index); + log_debug("phrase_start = %d, phrase_len=%d\n", phrase_start, phrase_len); value = tail_value; index = 0; break; } + } else if (i == char_len - 1) { + trie_node_t terminal_node = trie_get_transition(self, node, '\0'); + if (terminal_node.check == node_id) { + int32_t data_index = -1 * terminal_node.base; + trie_data_node_t data_node = self->data->a[data_index]; + value = data_node.data; + phrase_start = (uint32_t)index; + phrase_len = (uint32_t)(len - index); + } } - } } - if (phrase_len > 0) { - trie_node_t terminal_node = trie_get_transition(self, node, '\0'); - if (terminal_node.check == node_id) { - int32_t data_index = -1*terminal_node.base; - trie_data_node_t data_node = self->data->a[data_index]; - value = data_node.data; - log_debug("value = %d\n", value); - } else { - return NULL_PHRASE; - } - } - return (phrase_t) {phrase_start, phrase_len, value}; }