diff --git a/src/normalize.c b/src/normalize.c index 7a16bdee..1e535823 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -421,10 +421,14 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t bool is_number = utf8_is_number(cat); next_char_len = utf8proc_iterate(ptr + char_len, len, &next_ch); - int next_cat = utf8proc_category(next_ch); - bool next_is_number = utf8_is_number(next_cat); - bool next_is_letter = utf8_is_letter(next_cat); - + int next_cat = UTF8PROC_CATEGORY_CN; + bool next_is_number = false; + bool next_is_letter = false; + if (next_char_len > 0) { + next_cat = utf8proc_category(next_ch); + next_is_number = utf8_is_number(next_cat); + next_is_letter = utf8_is_letter(next_cat); + } bool is_full_stop = ch == FULL_STOP_CODEPOINT; diff --git a/src/numex.c b/src/numex.c index c20efd34..18486b11 100644 --- a/src/numex.c +++ b/src/numex.c @@ -725,6 +725,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { while (idx < len) { if (state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN) { char_len = utf8proc_iterate(ptr, len, &codepoint); + if (char_len <= 0) break; cat = utf8proc_category(codepoint); if (codepoint == 0) break; diff --git a/src/string_utils.c b/src/string_utils.c index 1500fa10..08e3118f 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -362,6 +362,7 @@ ssize_t utf8_len(const char *str, size_t len) { while (1) { char_len = utf8proc_iterate(ptr, -1, &ch); + if (char_len <= 0) break; if (ch == 0) break; remaining -= char_len; @@ -387,6 +388,7 @@ uint32_array *unicode_codepoints(const char *str) { while (1) { char_len = utf8proc_iterate(ptr, -1, &ch); + if (char_len <= 0) break; if (ch == 0) break; @@ -527,7 +529,8 @@ size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len) { len1 = utf8proc_iterate(ptr1, -1, &c1); len2 = utf8proc_iterate(ptr2, -1, &c2); - if (c1 <= 0 || c2 <= 0) break; + if (len1 <= 0 || len2 <= 0 || c1 <= 0 || c2 <= 0) break; + if (c1 == c2) { ptr1 += len1; ptr2 += len2; @@ -572,6 +575,9 @@ size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *st len1 = utf8proc_iterate(ptr1, -1, &c1); len2 = utf8proc_iterate(ptr2, -1, &c2); + /* Note: utf8 comparison can handle a non-valid UTF-8 sequence e.g. for trie + ** suffix comparison where we may be in the middle of a multi-byte character + **/ if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) { ptr1++; ptr2++; @@ -631,6 +637,9 @@ bool utf8_equal_ignore_separators_len(const char *str1, const char *str2, size_t len1 = utf8proc_iterate(ptr1, -1, &c1); len2 = utf8proc_iterate(ptr2, -1, &c2); + /* Note: utf8 comparison can handle a non-valid UTF-8 sequence e.g. for trie + ** suffix comparison where we may be in the middle of a multi-byte character + **/ if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) { ptr1++; ptr2++; @@ -821,7 +830,7 @@ size_t string_right_spaces_len(char *str, size_t len) { while (1) { ssize_t char_len = utf8proc_iterate_reversed(ptr, index, &ch); - if (ch <= 0) break; + if (char_len <= 0 || ch == 0) break; if (!utf8_is_whitespace(ch)) { break; @@ -840,6 +849,7 @@ inline size_t string_hyphen_prefix_len(char *str, size_t len) { int32_t unichr; uint8_t *ptr = (uint8_t *)str; ssize_t char_len = utf8proc_iterate(ptr, len, &unichr); + if (char_len <= 0 || unichr == 0) return 0; if (utf8_is_hyphen(unichr)) { return (size_t)char_len; } @@ -851,6 +861,7 @@ inline size_t string_hyphen_suffix_len(char *str, size_t len) { int32_t unichr; uint8_t *ptr = (uint8_t *)str; ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr); + if (char_len <= 0 || unichr == 0) return 0; if (utf8_is_hyphen(unichr)) { return (size_t)char_len; } @@ -867,7 +878,7 @@ size_t string_left_spaces_len(char *str, size_t len) { while (1) { ssize_t char_len = utf8proc_iterate(ptr, len, &ch); - if (ch <= 0) break; + if (char_len <= 0 || ch == 0) break; if (!utf8_is_whitespace(ch)) { break; diff --git a/src/transliterate.c b/src/transliterate.c index 34d8d931..2bca610f 100644 --- a/src/transliterate.c +++ b/src/transliterate.c @@ -735,6 +735,7 @@ char *transliterate(char *trans_name, char *str, size_t len) { step_name = step->name; if (step->type == STEP_RULESET && trans_node_id == NULL_NODE_ID) { log_warn("transliterator \"%s\" does not exist in trie\n", trans_name); + if (allocated_trans_name) free(trans_name); free(str); return NULL; } @@ -746,6 +747,7 @@ char *transliterate(char *trans_name, char *str, size_t len) { if (step_node_id == NULL_NODE_ID) { log_warn("transliterator step \"%s\" does not exist\n", step_name); + if (allocated_trans_name) free(trans_name); free(str); return NULL; } @@ -787,13 +789,9 @@ char *transliterate(char *trans_name, char *str, size_t len) { while (idx < len) { log_debug("idx=%zu, ptr=%s\n", idx, ptr); char_len = utf8proc_iterate(ptr, len, &ch); - if (char_len == UTF8PROC_ERROR_INVALIDUTF8) { - log_warn("invalid UTF-8\n"); - char_len = 1; - ch = (int32_t)*ptr; - } else if (char_len <= 0) { - log_warn("char_len=%zd at idx=%zu\n", char_len, idx); - free(trans_name); + if (char_len <= 0) { + log_warn("invalid UTF-8 at position %zu in transliterating string: %.*s\n", idx, (int)len, str); + if (allocated_trans_name) free(trans_name); free(str); return NULL; } @@ -1047,8 +1045,8 @@ char *transliterate(char *trans_name, char *str, size_t len) { } + if (allocated_trans_name) free(trans_name); return str; - } void transliteration_table_destroy(void) { diff --git a/src/trie_search.c b/src/trie_search.c index fa78adf8..d32352c5 100644 --- a/src/trie_search.c +++ b/src/trie_search.c @@ -736,6 +736,8 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u return (phrase_t){phrase_start, phrase_len, value}; } } + + // Note: don't need to check the < 0 case because we're returning from this branch. } if (first_char) phrase_start = idx; phrase_len = (uint32_t)(idx + match_len) - phrase_start; diff --git a/src/unicode_scripts.c b/src/unicode_scripts.c index f04153c7..7e62e6b7 100644 --- a/src/unicode_scripts.c +++ b/src/unicode_scripts.c @@ -32,7 +32,7 @@ string_script_t get_string_script(char *str, size_t len) { while (idx < len) { ssize_t char_len = utf8proc_iterate(ptr, len, &ch); - if (ch == 0) break; + if (char_len <= 0 ||ch == 0) break; script = get_char_script((uint32_t)ch); @@ -46,6 +46,11 @@ string_script_t get_string_script(char *str, size_t len) { char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch); if (ch == 0) break; + /* Note: don't need to check char_len < 0 here because we're rewinding + ** previously valid UTF-8 characters and if anything invalid is detected, + ** we break out of the outer loop. + **/ + script = get_char_script((uint32_t)ch); if (!is_common_script(script)) { break;