[normalize] use the new utf8proc lowercasing (as opposed to case folding), free copies since none of the string functions operate in-place any more, add minimal HTML escaping transliterator even to ASCII text

This commit is contained in:
Al
2017-01-01 20:06:32 -05:00
parent 5c56a44faa
commit a78937f265

View File

@@ -11,12 +11,26 @@ char *normalize_string_utf8(char *str, uint64_t options) {
bool have_utf8proc_options = false;
char *normalized = NULL;
bool normalized_allocated = false;
if (options & NORMALIZE_STRING_TRIM) {
char *trimmed = string_trim(str);
if (trimmed != NULL) {
normalized = trimmed;
str = normalized;
normalized_allocated = true;
}
}
if (options & NORMALIZE_STRING_LOWERCASE) {
char *lowercased = utf8_lower(str);
if (lowercased != NULL) {
if (normalized_allocated) {
free(normalized);
}
normalized = lowercased;
str = normalized;
normalized_allocated = true;
}
}
@@ -35,22 +49,30 @@ char *normalize_string_utf8(char *str, uint64_t options) {
utf8proc_options |= UTF8PROC_OPTIONS_STRIP_ACCENTS;
}
if (options & NORMALIZE_STRING_LOWERCASE) {
have_utf8proc_options = true;
utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE;
}
if (have_utf8proc_options) {
utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
if (utf8proc_normalized != NULL) {
if (normalized_allocated) {
free(normalized);
}
normalized = (char *)utf8proc_normalized;
str = normalized;
normalized_allocated = true;
}
}
if (options & NORMALIZE_STRING_REPLACE_HYPHENS && strchr(str, '-') != NULL) {
char *replaced = string_replace(str, '-', ' ');
if (replaced != NULL) {
if (normalized_allocated) {
free(normalized);
}
normalized = replaced;
str = normalized;
normalized_allocated = true;
}
}
@@ -156,8 +178,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
// Shortcut if the string is all ASCII
if (options & NORMALIZE_STRING_LOWERCASE && is_ascii && script_len == len) {
char *html_escaped = transliterate(HTML_ESCAPE, str, len);
if (html_escaped != NULL) {
str = html_escaped;
}
utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE);
if (utf8_normalized != NULL) {
if (html_escaped != NULL) {
free(html_escaped);
html_escaped = NULL;
}
string_tree_add_string(tree, utf8_normalized);
string_tree_finalize_token(tree);
free(utf8_normalized);