[fix] use a reusable char_array for each script span instead of strndup-copying the string in normalize_string

This commit is contained in:
Al
2015-12-23 19:21:54 -05:00
parent 2eea999692
commit 3fbb3c587a

View File

@@ -114,7 +114,7 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
size_t consumed = 0; size_t consumed = 0;
char *copy; char_array *array = char_array_new_size(len);
while (consumed < len) { while (consumed < len) {
string_script_t script_span = get_string_script(str, len - consumed); string_script_t script_span = get_string_script(str, len - consumed);
@@ -122,15 +122,22 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
size_t script_len = script_span.len; size_t script_len = script_span.len;
bool is_ascii = script_span.ascii; bool is_ascii = script_span.ascii;
log_debug("script_len=%zu\n", script_len);
char *utf8_normalized = NULL; char *utf8_normalized = NULL;
char *transliterated = NULL; char *transliterated = NULL;
if (options & NORMALIZE_STRING_LOWERCASE && is_ascii) { char_array_clear(array);
utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE); char_array_cat_len(array, str, script_len);
if (utf8_normalized != NULL) { char *str_script = char_array_get_string(array);
if (options & NORMALIZE_STRING_LOWERCASE && is_ascii) {
utf8_normalized = normalize_string_utf8(str_script, NORMALIZE_STRING_LOWERCASE);
if (utf8_normalized != NULL) {
if (options & NORMALIZE_STRING_LATIN_ASCII) { if (options & NORMALIZE_STRING_LATIN_ASCII) {
log_debug("LATIN_ASCII\n");
transliterated = transliterate(LATIN_ASCII, utf8_normalized, strlen(utf8_normalized)); transliterated = transliterate(LATIN_ASCII, utf8_normalized, strlen(utf8_normalized));
log_debug("done\n");
if (transliterated != NULL) { if (transliterated != NULL) {
string_tree_add_string(tree, transliterated); string_tree_add_string(tree, transliterated);
free(transliterated); free(transliterated);
@@ -141,27 +148,21 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
} }
free(utf8_normalized); free(utf8_normalized);
utf8_normalized = NULL; utf8_normalized = NULL;
} }
} else if (options & NORMALIZE_STRING_LATIN_ASCII && script == SCRIPT_LATIN && script_len > 0) { } else if (options & NORMALIZE_STRING_LATIN_ASCII && script == SCRIPT_LATIN && script_len > 0) {
copy = strndup(str, script_len); add_latin_alternatives(tree, str_script, script_len, options);
if (copy != NULL) { } else if (options & NORMALIZE_STRING_TRANSLITERATE && script != SCRIPT_UNKNOWN && script_len > 0) {
add_latin_alternatives(tree, str, script_len, options);
free(copy);
copy = NULL;
}
} else if (options & NORMALIZE_STRING_TRANSLITERATE && script != SCRIPT_UNKNOWN && script_len > 0) {
char *trans_name; char *trans_name;
copy = strndup(str, script_len);
if (copy != NULL) { add_latin_alternatives(tree, str_script, script_len, options);
add_latin_alternatives(tree, copy, script_len, options);
free(copy);
copy = NULL;
}
foreach_transliterator(script, "", trans_name, { foreach_transliterator(script, "", trans_name, {
transliterated = transliterate(trans_name, str, script_len); log_debug("doing %s\n", trans_name);
log_debug("str=%s\n", str);
log_debug("script_len=%zu\n", script_len);
transliterated = transliterate(trans_name, str_script, script_len);
log_debug("transliterated=%s\n", transliterated);
if (transliterated != NULL) { if (transliterated != NULL) {
add_latin_alternatives(tree, transliterated, strlen(transliterated), options); add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
free(transliterated); free(transliterated);
@@ -169,14 +170,16 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
}) })
} else { } else {
log_debug("Adding str: %s\n", str);
string_tree_add_string_len(tree, str, script_len); string_tree_add_string_len(tree, str, script_len);
} }
string_tree_finalize_token(tree); string_tree_finalize_token(tree);
consumed += script_len; consumed += script_len;
str += script_len; str += script_len;
} }
char_array_destroy(array);
return tree; return tree;