[fix] using a char_array instead of copying the string in normalize_string
This commit is contained in:
@@ -114,7 +114,7 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
|
|||||||
|
|
||||||
size_t consumed = 0;
|
size_t consumed = 0;
|
||||||
|
|
||||||
char *copy;
|
char_array *array = char_array_new_size(len);
|
||||||
|
|
||||||
while (consumed < len) {
|
while (consumed < len) {
|
||||||
string_script_t script_span = get_string_script(str, len - consumed);
|
string_script_t script_span = get_string_script(str, len - consumed);
|
||||||
@@ -122,15 +122,22 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
|
|||||||
size_t script_len = script_span.len;
|
size_t script_len = script_span.len;
|
||||||
bool is_ascii = script_span.ascii;
|
bool is_ascii = script_span.ascii;
|
||||||
|
|
||||||
|
log_debug("script_len=%zu\n", script_len);
|
||||||
|
|
||||||
char *utf8_normalized = NULL;
|
char *utf8_normalized = NULL;
|
||||||
char *transliterated = NULL;
|
char *transliterated = NULL;
|
||||||
|
|
||||||
if (options & NORMALIZE_STRING_LOWERCASE && is_ascii) {
|
char_array_clear(array);
|
||||||
utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE);
|
char_array_cat_len(array, str, script_len);
|
||||||
if (utf8_normalized != NULL) {
|
char *str_script = char_array_get_string(array);
|
||||||
|
|
||||||
|
if (options & NORMALIZE_STRING_LOWERCASE && is_ascii) {
|
||||||
|
utf8_normalized = normalize_string_utf8(str_script, NORMALIZE_STRING_LOWERCASE);
|
||||||
|
if (utf8_normalized != NULL) {
|
||||||
if (options & NORMALIZE_STRING_LATIN_ASCII) {
|
if (options & NORMALIZE_STRING_LATIN_ASCII) {
|
||||||
|
log_debug("LATIN_ASCII\n");
|
||||||
transliterated = transliterate(LATIN_ASCII, utf8_normalized, strlen(utf8_normalized));
|
transliterated = transliterate(LATIN_ASCII, utf8_normalized, strlen(utf8_normalized));
|
||||||
|
log_debug("done\n");
|
||||||
if (transliterated != NULL) {
|
if (transliterated != NULL) {
|
||||||
string_tree_add_string(tree, transliterated);
|
string_tree_add_string(tree, transliterated);
|
||||||
free(transliterated);
|
free(transliterated);
|
||||||
@@ -141,27 +148,21 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
|
|||||||
}
|
}
|
||||||
free(utf8_normalized);
|
free(utf8_normalized);
|
||||||
utf8_normalized = NULL;
|
utf8_normalized = NULL;
|
||||||
|
|
||||||
}
|
}
|
||||||
} else if (options & NORMALIZE_STRING_LATIN_ASCII && script == SCRIPT_LATIN && script_len > 0) {
|
} else if (options & NORMALIZE_STRING_LATIN_ASCII && script == SCRIPT_LATIN && script_len > 0) {
|
||||||
copy = strndup(str, script_len);
|
add_latin_alternatives(tree, str_script, script_len, options);
|
||||||
if (copy != NULL) {
|
} else if (options & NORMALIZE_STRING_TRANSLITERATE && script != SCRIPT_UNKNOWN && script_len > 0) {
|
||||||
add_latin_alternatives(tree, str, script_len, options);
|
|
||||||
free(copy);
|
|
||||||
copy = NULL;
|
|
||||||
}
|
|
||||||
} else if (options & NORMALIZE_STRING_TRANSLITERATE && script != SCRIPT_UNKNOWN && script_len > 0) {
|
|
||||||
char *trans_name;
|
char *trans_name;
|
||||||
copy = strndup(str, script_len);
|
|
||||||
if (copy != NULL) {
|
add_latin_alternatives(tree, str_script, script_len, options);
|
||||||
add_latin_alternatives(tree, copy, script_len, options);
|
|
||||||
free(copy);
|
|
||||||
copy = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach_transliterator(script, "", trans_name, {
|
foreach_transliterator(script, "", trans_name, {
|
||||||
transliterated = transliterate(trans_name, str, script_len);
|
log_debug("doing %s\n", trans_name);
|
||||||
|
log_debug("str=%s\n", str);
|
||||||
|
log_debug("script_len=%zu\n", script_len);
|
||||||
|
transliterated = transliterate(trans_name, str_script, script_len);
|
||||||
|
|
||||||
|
log_debug("transliterated=%s\n", transliterated);
|
||||||
if (transliterated != NULL) {
|
if (transliterated != NULL) {
|
||||||
add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
|
add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
|
||||||
free(transliterated);
|
free(transliterated);
|
||||||
@@ -169,14 +170,16 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
|
|||||||
})
|
})
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
log_debug("Adding str: %s\n", str);
|
||||||
string_tree_add_string_len(tree, str, script_len);
|
string_tree_add_string_len(tree, str, script_len);
|
||||||
}
|
}
|
||||||
string_tree_finalize_token(tree);
|
string_tree_finalize_token(tree);
|
||||||
|
|
||||||
consumed += script_len;
|
consumed += script_len;
|
||||||
str += script_len;
|
str += script_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char_array_destroy(array);
|
||||||
|
|
||||||
return tree;
|
return tree;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user