#include "normalize.h"

#define FULL_STOP_CODEPOINT 0x002e
#define APOSTROPHE_CODEPOINT 0x0027

/**
 * Apply the string-level normalizations selected in `options` to a
 * NUL-terminated string.
 *
 * Steps run in a fixed order: trim, lowercase, utf8proc mapping
 * (decompose / compose / strip accents), then hyphen replacement. Each step
 * works on the output of the last step that produced a result; a step whose
 * helper returns NULL is skipped and the previous intermediate is kept.
 *
 * @param str      input string
 * @param options  bitmask of NORMALIZE_STRING_* flags
 * @return the output of the last successful step (callers in this file
 *         release it with free()), or NULL if no step produced a result,
 *         e.g. when none of the relevant flags is set.
 */
char *normalize_string_utf8(char *str, uint64_t options) {
    int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC;
    uint8_t *utf8proc_normalized = NULL;
    bool have_utf8proc_options = false;

    /* Always either NULL or the most recent intermediate result, so it can be
       passed to free() unconditionally before being replaced. */
    char *normalized = NULL;

    if (options & NORMALIZE_STRING_TRIM) {
        char *trimmed = string_trim(str);
        if (trimmed != NULL) {
            normalized = trimmed;
            str = normalized;
        }
    }

    if (options & NORMALIZE_STRING_LOWERCASE) {
        char *lowercased = utf8_lower(str);
        if (lowercased != NULL) {
            free(normalized);
            normalized = lowercased;
            str = normalized;
        }
    }

    if (options & NORMALIZE_STRING_DECOMPOSE) {
        have_utf8proc_options = true;
        utf8proc_options |= UTF8PROC_OPTIONS_NFD;
    }

    if (options & NORMALIZE_STRING_COMPOSE) {
        have_utf8proc_options = true;
        utf8proc_options |= UTF8PROC_OPTIONS_NFC;
    }

    if (options & NORMALIZE_STRING_STRIP_ACCENTS) {
        have_utf8proc_options = true;
        utf8proc_options |= UTF8PROC_OPTIONS_STRIP_ACCENTS;
    }

    if (have_utf8proc_options) {
        /* The return code is not inspected; success is detected through the
           output pointer being non-NULL. */
        utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
        if (utf8proc_normalized != NULL) {
            free(normalized);
            normalized = (char *)utf8proc_normalized;
            str = normalized;
        }
    }

    if (options & NORMALIZE_STRING_REPLACE_HYPHENS && strchr(str, '-') != NULL) {
        char *replaced = string_replace(str, '-', ' ');
        if (replaced != NULL) {
            free(normalized);
            normalized = replaced;
            str = normalized;
        }
    }

    return normalized;
}

/**
 * Transliterate `str` with the Latin-ASCII transliterator (the "simple"
 * variant when NORMALIZE_STRING_SIMPLE_LATIN_ASCII is set) and then run
 * normalize_string_utf8() on the result. If transliteration returns NULL the
 * original string is normalized instead.
 *
 * @param str      input string
 * @param len      length passed to transliterate()
 * @param options  bitmask of NORMALIZE_STRING_* flags
 * @return whatever normalize_string_utf8() returns (may be NULL).
 */
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
    char *latin_transliterator = LATIN_ASCII;
    if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) {
        latin_transliterator = LATIN_ASCII_SIMPLE;
    }

    char *transliterated = transliterate(latin_transliterator, str, len);
    if (transliterated == NULL) {
        return normalize_string_utf8(str, options);
    }

    char *utf8_normalized = normalize_string_utf8(transliterated, options);
    free(transliterated);
    return utf8_normalized;
}

/**
 * Add normalized alternatives of `str` to `tree`.
 *
 * With NORMALIZE_STRING_LATIN_ASCII set, two orderings are tried:
 *   1. transliterate, then normalize;
 *   2. normalize, then transliterate.
 * The second result is only added if it differs from the first. Without that
 * flag only the normalized string is added. If the second ordering yields
 * nothing, the raw `str` is added instead.
 *
 * @param tree     string tree receiving the alternatives
 * @param str      input string
 * @param len      number of bytes of `str` to consider
 * @param options  bitmask of NORMALIZE_STRING_* flags
 */
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
    char *transliterated = NULL;
    char *utf8_normalized = NULL;
    char *prev_string = NULL;

    char *latin_transliterator = LATIN_ASCII;
    if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) {
        latin_transliterator = LATIN_ASCII_SIMPLE;
    }

    /* Ordering 1: transliterate first, then normalize. */
    if (options & NORMALIZE_STRING_LATIN_ASCII) {
        transliterated = transliterate(latin_transliterator, str, len);
        if (transliterated != NULL) {
            utf8_normalized = normalize_string_utf8(transliterated, options);
            free(transliterated);
            transliterated = NULL;
        }

        if (utf8_normalized != NULL) {
            string_tree_add_string(tree, utf8_normalized);
            /* Kept so the second ordering can be de-duplicated against it. */
            prev_string = utf8_normalized;
            utf8_normalized = NULL;
        }
    }

    /* Ordering 2: normalize a bounded copy first. If the copy cannot be
       allocated, skip normalization and fall through to the raw-string
       fallback below instead of handing NULL to normalize_string_utf8(). */
    char *str_copy = strndup(str, len);
    if (str_copy != NULL) {
        utf8_normalized = normalize_string_utf8(str_copy, options);
        free(str_copy);
    }

    if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) {
        transliterated = transliterate(latin_transliterator, utf8_normalized, strlen(utf8_normalized));
        free(utf8_normalized);
    } else {
        transliterated = utf8_normalized;
    }

    if (transliterated != NULL) {
        if (prev_string == NULL || strcmp(prev_string, transliterated) != 0) {
            string_tree_add_string(tree, transliterated);
        }
        free(transliterated);
        transliterated = NULL;
    } else {
        string_tree_add_string(tree, str);
    }

    free(prev_string);
}

/**
 * Build a tree of normalized alternatives for `str`, taking the scripts found
 * in the string and the given languages into account.
 *
 * The string is scanned in script spans. If the whole string is one ASCII
 * span and NORMALIZE_STRING_LOWERCASE is set, a shortcut adds only the
 * HTML-unescaped (when that succeeds), lowercased string. Otherwise every
 * non-Latin script (and Latin, when a language has a Latin transliterator
 * other than LATIN_ASCII) is collected, each combination of applicable
 * transliterators is applied to the string, and add_latin_alternatives() is
 * run on each result as well as on the original string.
 *
 * @param str            NUL-terminated input string
 * @param options        bitmask of NORMALIZE_STRING_* flags
 * @param num_languages  number of entries in `languages`
 * @param languages      language codes used to select transliterators
 * @return a new string tree owned by the caller, or NULL on allocation or
 *         hash-set failure.
 */
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
    size_t len = strlen(str);

    string_tree_t *tree = string_tree_new_size(len);
    if (tree == NULL) {
        return NULL;
    }

    khash_t(int_set) *scripts = kh_init(int_set);
    if (scripts == NULL) {
        string_tree_destroy(tree);
        return NULL;
    }

    size_t consumed = 0;
    char *ptr = str;

    script_t script;
    char *trans_name = NULL;
    char *lang;

    bool transliterate_latin = false;

    while (consumed < len) {
        string_script_t script_span = get_string_script(ptr, len - consumed);
        script = script_span.script;
        size_t script_len = script_span.len;
        bool is_ascii = script_span.ascii;

        // Shortcut if the string is all ASCII
        if (options & NORMALIZE_STRING_LOWERCASE && is_ascii && script_len == len) {
            char *html_escaped = transliterate(HTML_ESCAPE, str, len);
            char *input = html_escaped != NULL ? html_escaped : str;

            char *utf8_normalized = normalize_string_utf8(input, NORMALIZE_STRING_LOWERCASE);
            if (utf8_normalized != NULL) {
                string_tree_add_string(tree, utf8_normalized);
                string_tree_finalize_token(tree);
                free(utf8_normalized);
            }

            /* Released on every path; it used to leak when normalization
               returned NULL. */
            free(html_escaped);

            kh_destroy(int_set, scripts);
            return tree;
        }

        log_debug("script_len=%zu\n", script_len);

        /* Determine (once) whether any requested language has a Latin-script
           transliterator other than the plain LATIN_ASCII one. */
        if (script == SCRIPT_LATIN && num_languages > 0 && !transliterate_latin) {
            for (size_t i = 0; i < num_languages; i++) {
                lang = languages[i];
                foreach_transliterator(script, lang, trans_name, {
                    if (!string_equals(trans_name, LATIN_ASCII)) {
                        transliterate_latin = true;
                        break;
                    }
                })

                if (transliterate_latin) break;
            }
        }

        if ((script != SCRIPT_LATIN || transliterate_latin) && script_len > 0) {
            int ret;
            kh_put(int_set, scripts, (khint_t)script, &ret);
            if (ret < 0) {
                log_error("Error in kh_put\n");
                string_tree_destroy(tree);
                kh_destroy(int_set, scripts);
                return NULL;
            }
        }

        /* A zero-length span makes no progress; stop scanning rather than
           spin forever on the same position. */
        if (script_len == 0) {
            break;
        }

        consumed += script_len;
        ptr += script_len;
    }

    if (!transliterate_latin) {
        add_latin_alternatives(tree, str, len, options);
    }

    size_t transliterate_scripts = kh_size(scripts);

    if (transliterate_scripts > 0) {
        /* One token per script; each token holds that script's candidate
           transliterator names (language-specific ones, then those
           registered under the empty language ""). */
        string_tree_t *transliterators = string_tree_new_size(transliterate_scripts);

        khint_t key;
        kh_foreach_key(scripts, key, {
            script = (script_t)key;
            for (size_t i = 0; i < num_languages; i++) {
                lang = languages[i];
                foreach_transliterator(script, lang, trans_name, {
                    string_tree_add_string(transliterators, trans_name);
                })
            }

            foreach_transliterator(script, "", trans_name, {
                string_tree_add_string(transliterators, trans_name);
            })

            string_tree_finalize_token(transliterators);
        })

        string_tree_iterator_t *trans_iter = string_tree_iterator_new(transliterators);

        for (; !string_tree_iterator_done(trans_iter); string_tree_iterator_next(trans_iter)) {
            /* Chain the selected transliterators: each one consumes the
               output of the previous successful one. `prev` owns the current
               intermediate result (NULL while still on the input). */
            char *prev = NULL;
            char *transliterated = str;
            string_tree_iterator_foreach_token(trans_iter, trans_name, {
                log_debug("Doing %s\n", trans_name);
                transliterated = transliterate(trans_name, transliterated, strlen(transliterated));
                if (transliterated == NULL) {
                    transliterated = prev != NULL ? prev : str;
                    continue;
                }
                if (prev != NULL) {
                    free(prev);
                }
                prev = transliterated;
            })

            add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
            if (transliterated != str) {
                free(transliterated);
            }
        }

        string_tree_iterator_destroy(trans_iter);
        string_tree_destroy(transliterators);
    }

    if (transliterate_latin) {
        add_latin_alternatives(tree, str, len, options);
    }

    kh_destroy(int_set, scripts);

    string_tree_finalize_token(tree);

    return tree;
}

/**
 * Convenience wrapper: normalize_string_languages() with no languages.
 */
inline string_tree_t *normalize_string(char *str, uint64_t options) {
    return normalize_string_languages(str, options, 0, NULL);
}

/**
 * Report whether a NUMERIC token has at least one letter before its first
 * number character.
 *
 * @param str    buffer the token refers to
 * @param token  token whose offset/len are byte positions into `str`
 * @return true if the token is NUMERIC and a letter precedes the first
 *         number character; false otherwise (including when the token
 *         contains no number character or decoding fails first).
 */
bool numeric_starts_with_alpha(char *str, token_t token) {
    if (token.type != NUMERIC || token.len == 0) return false;

    size_t idx = 0;
    uint8_t *ptr = (uint8_t *)str + token.offset;
    size_t len = token.len;

    int32_t ch;
    ssize_t char_len;

    bool contains_letter = false;

    while (idx < len) {
        /* Limit decoding to the bytes left in the token so a multi-byte
           sequence cannot be read past the token's end. */
        char_len = utf8proc_iterate(ptr, len - idx, &ch);
        if (char_len <= 0) break;

        int cat = utf8proc_category(ch);
        bool is_letter = utf8_is_letter(cat);
        bool is_number = utf8_is_number(cat);

        if (is_number) {
            return contains_letter;
        } else if (is_letter) {
            contains_letter = true;
        }

        ptr += char_len;
        idx += char_len;
    }

    return false;
}

/**
 * Append the normalized form of a token to `array` and terminate it.
 *
 * The token is decoded one code point at a time and each code point is
 * copied, replaced or dropped according to the NORMALIZE_TOKEN_* flags in
 * `options` (hyphen replacement/deletion, alpha/numeric splitting, digit
 * replacement, final-period and acronym-period deletion, English possessive
 * dropping, apostrophe deletion). Nothing is appended, and the array is not
 * terminated, when the token is empty.
 *
 * @param array    output character array
 * @param str      buffer the token refers to
 * @param token    token whose offset/len are byte positions into `str`
 * @param options  bitmask of NORMALIZE_TOKEN_* flags
 */
void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options) {
    size_t idx = 0;

    uint8_t *ptr = (uint8_t *)str + token.offset;
    size_t len = token.len;
    if (token.len == 0) return;

    bool alpha_numeric_split = false;
    /* A "-" or "." seen after a letter in a NUMERIC token is held back here
       and only emitted if the next character turns out not to be a number. */
    char *append_if_not_numeric = NULL;

    int32_t ch;
    ssize_t char_len;

    bool last_was_letter = false;
    bool append_char = true;

    while (idx < len) {
        /* Limit decoding to the bytes left in the token so a multi-byte
           sequence cannot be read past the token's end. */
        char_len = utf8proc_iterate(ptr, len - idx, &ch);

        if (char_len <= 0) break;

        bool is_hyphen = utf8_is_hyphen(ch);
        int cat = utf8proc_category(ch);

        bool is_letter = utf8_is_letter(cat);
        bool is_number = utf8_is_number(cat);

        bool is_full_stop = ch == FULL_STOP_CODEPOINT;

        if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) {
            char_array_append(array, " ");
            append_char = false;
        } else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
            append_char = false;
        }

        if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) {
            /* Defer the separator; note last_was_letter is intentionally left
               untouched on this path. */
            ptr += char_len;
            idx += char_len;
            append_if_not_numeric = is_hyphen ? "-" : ".";
            append_char = true;
            continue;
        }

        if (!is_number && append_if_not_numeric != NULL) {
            char_array_append(array, append_if_not_numeric);
            append_if_not_numeric = NULL;
        }

        if (is_number && options & NORMALIZE_TOKEN_REPLACE_DIGITS) {
            char_array_append(array, DIGIT_CHAR);
            append_char = false;
        }

        /* Insert a single space at the first letter-to-number transition. */
        if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) {
            char_array_append(array, " ");
            alpha_numeric_split = true;
        }

        if (is_full_stop) {
            if (options & NORMALIZE_TOKEN_DELETE_FINAL_PERIOD && idx == len - 1) {
                break;
            }

            if (token.type == ACRONYM && options & NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) {
                append_char = false;
            }
        }

        /* Trailing "'s" or "s'" collapses to a plain "s". */
        if (idx == len - 2 && len > 2 && options & NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES) {
            char this_char = *ptr;
            char next_char = *(ptr + 1);

            if ((this_char == '\'' && next_char == 's') || (this_char == 's' && next_char == '\'')) {
                char_array_append(array, "s");
                break;
            }
        }

        if (ch == APOSTROPHE_CODEPOINT && token.type == WORD && options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) {
            append_char = false;
        }

        if (append_char) {
            char_array_append_len(array, (char *)ptr, char_len);
        }

        ptr += char_len;
        idx += char_len;
        append_char = true;

        last_was_letter = is_letter;
    }

    char_array_terminate(array);
}

/**
 * Start a new token in `array` and write the normalized form of `token`
 * into the array's underlying character storage.
 */
inline void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options) {
    cstring_array_start_token(array);
    add_normalized_token(array->str, str, token, options);
}