From 66a71ab70d64e33943abb64ce2bd671f69520cf6 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 11 Aug 2015 23:36:08 -0400 Subject: [PATCH] [normalize] Need to do a Latin-ASCII transliteration even if the string is entirely ASCII since it may contain HTML escapes --- src/normalize.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/normalize.c b/src/normalize.c index 5d21c8d0..30dabfd3 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -116,11 +116,21 @@ string_tree_t *normalize_string(char *str, uint64_t options) { if (options & NORMALIZE_STRING_LOWERCASE && is_ascii) { utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE); if (utf8_normalized != NULL) { - string_tree_add_string(tree, utf8_normalized); + + if (options & NORMALIZE_STRING_LATIN_ASCII) { + transliterated = transliterate(LATIN_ASCII, utf8_normalized, len); + if (transliterated != NULL) { + string_tree_add_string(tree, transliterated); + free(transliterated); + transliterated = NULL; + } + } else { + string_tree_add_string(tree, utf8_normalized); + } free(utf8_normalized); utf8_normalized = NULL; - } + } } else if (options & NORMALIZE_STRING_LATIN_ASCII && script == SCRIPT_LATIN && script_len > 0) { add_latin_alternatives(tree, str, script_len, options); } else if (options & NORMALIZE_STRING_TRANSLITERATE && script != SCRIPT_UNKNOWN && script_len > 0) {