[numex] adding one form of normalization which strips ordinal suffixes so {96th, Ninety-sixth} => 96. This is an additional form of normalization, so there's still one form where the suffixes are kept. One case that's still not handled is something like "IXe Arrondissement"

This commit is contained in:
Al
2017-04-18 21:39:54 -04:00
parent 92051863ba
commit cddc368533
3 changed files with 96 additions and 55 deletions

View File

@@ -3,8 +3,24 @@
#define FULL_STOP_CODEPOINT 0x002e
#define APOSTROPHE_CODEPOINT 0x0027
char *normalize_replace_numex(char *str, size_t num_languages, char **languages) {
char *numex_normalized = NULL;
char *normalize_string_utf8(char *str, uint64_t options) {
for (size_t i = 0; i < num_languages; i++) {
char *lang = languages[i];
char *numex_replaced = replace_numeric_expressions(numex_normalized == NULL ? str : numex_normalized, lang);
if (numex_replaced != NULL) {
if (numex_normalized != NULL) {
free(numex_normalized);
}
numex_normalized = numex_replaced;
}
}
return numex_normalized;
}
char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC;
uint8_t *utf8proc_normalized = NULL;
@@ -63,7 +79,7 @@ char *normalize_string_utf8(char *str, uint64_t options) {
}
}
if (options & NORMALIZE_STRING_REPLACE_HYPHENS && strchr(str, '-') != NULL) {
if (options & NORMALIZE_STRING_REPLACE_HYPHENS && string_contains_hyphen(str)) {
char *replaced = string_replace_char(str, '-', ' ');
if (replaced != NULL) {
if (normalized_allocated) {
@@ -76,11 +92,28 @@ char *normalize_string_utf8(char *str, uint64_t options) {
}
}
if (options & NORMALIZE_STRING_REPLACE_NUMEX && num_languages > 0) {
char *numex_normalized = normalize_replace_numex(str, num_languages, languages);
if (numex_normalized != NULL) {
if (normalized_allocated) {
free(normalized);
}
normalized = numex_normalized;
str = normalized;
normalized_allocated = true;
}
}
return normalized;
}
char *normalize_string_utf8(char *str, uint64_t options) {
return normalize_string_utf8_languages(str, options, 0, NULL);
}
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages) {
char *latin_transliterator = LATIN_ASCII;
if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) {
latin_transliterator = LATIN_ASCII_SIMPLE;
@@ -90,9 +123,9 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) {
char *utf8_normalized;
if (transliterated == NULL) {
utf8_normalized = normalize_string_utf8(str, options);
utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages);
} else {
utf8_normalized = normalize_string_utf8(transliterated, options);
utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages);
free(transliterated);
transliterated = NULL;
}
@@ -100,7 +133,11 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) {
return utf8_normalized;
}
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
return normalize_string_latin_languages(str, len, options, 0, NULL);
}
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options, size_t num_languages, char **languages) {
char *transliterated = NULL;
char *utf8_normalized = NULL;
@@ -114,7 +151,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
if (options & NORMALIZE_STRING_LATIN_ASCII) {
transliterated = transliterate(latin_transliterator, str, len);
if (transliterated != NULL) {
utf8_normalized = normalize_string_utf8(transliterated, options);
utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages);
free(transliterated);
transliterated = NULL;
}
@@ -127,7 +164,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
}
char *str_copy = strndup(str, len);
utf8_normalized = normalize_string_utf8(str_copy, options);
utf8_normalized = normalize_string_utf8_languages(str_copy, options, num_languages, languages);
free(str_copy);
if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) {
@@ -150,9 +187,9 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
if (prev_string != NULL) {
free(prev_string);
}
}
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
size_t len = strlen(str);
string_tree_t *tree = string_tree_new_size(len);
@@ -161,15 +198,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
khash_t(int_set) *scripts = kh_init(int_set);
char *utf8_normalized = NULL;
char *ptr = str;
char *numex_replaced = NULL;
script_t script;
char *trans_name = NULL;
char *lang;
bool transliterate_latin = false;
char *ptr = str;
bool have_latin_transliterator = false;
while (consumed < len) {
string_script_t script_span = get_string_script(ptr, len - consumed);
script = script_span.script;
@@ -182,12 +220,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
if (html_escaped != NULL) {
str = html_escaped;
}
utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE);
options ^= NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_STRIP_ACCENTS | NORMALIZE_STRING_LATIN_ASCII;
utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages);
if (utf8_normalized != NULL) {
if (html_escaped != NULL) {
free(html_escaped);
html_escaped = NULL;
}
string_tree_add_string(tree, utf8_normalized);
string_tree_finalize_token(tree);
free(utf8_normalized);
@@ -200,22 +242,22 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
log_debug("script_len=%zu\n", script_len);
if (script == SCRIPT_LATIN && num_languages > 0 && !transliterate_latin) {
if (script == SCRIPT_LATIN && num_languages > 0 && !have_latin_transliterator) {
for (size_t i = 0; i < num_languages; i++) {
lang = languages[i];
foreach_transliterator(script, lang, trans_name, {
if (!string_equals(trans_name, LATIN_ASCII)) {
transliterate_latin = true;
have_latin_transliterator = true;
break;
}
})
if (transliterate_latin) break;
if (have_latin_transliterator) break;
}
}
if ((script != SCRIPT_LATIN || transliterate_latin) && script_len > 0) {
if ((script != SCRIPT_LATIN || have_latin_transliterator) && script_len > 0) {
int ret;
khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret);
if (ret < 0) {
@@ -230,8 +272,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
ptr += script_len;
}
if (!transliterate_latin) {
add_latin_alternatives(tree, str, len, options);
if (!have_latin_transliterator) {
add_latin_alternatives(tree, str, len, options, num_languages, languages);
}
size_t transliterate_scripts = kh_size(scripts);
@@ -276,7 +318,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
prev = transliterated;
})
add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
add_latin_alternatives(tree, transliterated, strlen(transliterated), options, num_languages, languages);
if (transliterated != str) {
free(transliterated);
}
@@ -287,8 +329,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
}
if (transliterate_latin) {
add_latin_alternatives(tree, str, len, options);
if (have_latin_transliterator) {
add_latin_alternatives(tree, str, len, options, num_languages, languages);
}
kh_destroy(int_set, scripts);