diff --git a/src/libpostal.c b/src/libpostal.c index 92a10dfd..d9b0f436 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -79,6 +79,7 @@ static inline uint64_t get_normalize_string_options(libpostal_normalize_options_ normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0; normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0; normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0; + normalize_string_options |= options.expand_numex ? NORMALIZE_STRING_REPLACE_NUMEX : 0; return normalize_string_options; } @@ -558,7 +559,6 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok address_expansion_t prefix_expansion; address_expansion_t suffix_expansion; - char_array *key = char_array_new_size(token.len); char *expansion; size_t num_strings = 0; @@ -582,10 +582,11 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok } if (!have_suffix && !have_prefix) { - char_array_destroy(key); return false; } - + + char_array *key = char_array_new_size(token.len); + if (have_prefix && have_suffix) { for (size_t i = 0; i < prefix_expansions->n; i++) { prefix_expansion = prefix_expansions->a[i]; @@ -760,16 +761,34 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to if ((suffix.len == 0 && prefix.len == 0)) return false; - return add_affix_expansions(tree, str, lang, token, prefix, suffix, options); } +static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); + + if (len_ordinal_suffix == 0) return false; + + cstring_array *strings = tree->strings; + // Add the original form first. When this function returns true, + // add_normalized_strings_token won't be called a second time. + add_normalized_strings_token(strings, str, token, options); + + char_array *key = char_array_new_size(token.len - len_ordinal_suffix + 1); + char_array_cat_len(key, str + token.offset, token.len - len_ordinal_suffix); + char *expansion = char_array_get_string(key); + cstring_array_add_string(strings, expansion); + char_array_destroy(key); + return true; +} + static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { cstring_array *strings = tree->strings; for (size_t i = 0; i < tokens->n; i++) { token_t token = tokens->a[i]; bool have_phrase = false; + bool have_ordinal = false; if (is_special_token(token.type)) { string_tree_add_string_len(tree, str + token.offset, token.len); @@ -783,9 +802,14 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s have_phrase = true; break; } + + if (normalize_ordinal_suffixes(tree, str, lang, token, options)) { + have_ordinal = true; + break; + } } - if (!have_phrase) { + if (!have_phrase && !have_ordinal) { add_normalized_strings_token(strings, str, token, options); } @@ -826,45 +850,18 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_ char_array_terminate(temp_string); char *tokenized_str = char_array_get_string(temp_string); - - char *new_str = tokenized_str; - char *last_numex_str = NULL; - if (options.expand_numex) { - char *numex_replaced = NULL; - for (size_t i = 0; i < options.num_languages; i++) { - lang = options.languages[i]; - - numex_replaced = replace_numeric_expressions(new_str, lang); - if (numex_replaced != NULL) { - new_str = numex_replaced; - - if (last_numex_str != NULL) { - free(last_numex_str); - } - last_numex_str = numex_replaced; - } - } - - } string_tree_t *alternatives; int ret; - log_debug("new_str=%s\n", new_str); - log_debug("Adding alternatives for single normalization\n"); - alternatives = add_string_alternatives(new_str, options); - - if (last_numex_str != NULL) { - free(last_numex_str); - } + alternatives = add_string_alternatives(tokenized_str, options); if (alternatives == NULL) { log_debug("alternatives = NULL\n"); continue; } - iter = string_tree_iterator_new(alternatives); log_debug("iter->num_tokens=%d\n", iter->num_tokens); diff --git a/src/normalize.c b/src/normalize.c index e2ae9f6e..dd236703 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -3,8 +3,24 @@ #define FULL_STOP_CODEPOINT 0x002e #define APOSTROPHE_CODEPOINT 0x0027 +char *normalize_replace_numex(char *str, size_t num_languages, char **languages) { + char *numex_normalized = NULL; -char *normalize_string_utf8(char *str, uint64_t options) { + for (size_t i = 0; i < num_languages; i++) { + char *lang = languages[i]; + char *numex_replaced = replace_numeric_expressions(numex_normalized == NULL ? str : numex_normalized, lang); + if (numex_replaced != NULL) { + if (numex_normalized != NULL) { + free(numex_normalized); + } + numex_normalized = numex_replaced; + } + } + + return numex_normalized; +} + +char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages) { int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC; uint8_t *utf8proc_normalized = NULL; @@ -63,7 +79,7 @@ char *normalize_string_utf8(char *str, uint64_t options) { } } - if (options & NORMALIZE_STRING_REPLACE_HYPHENS && strchr(str, '-') != NULL) { + if (options & NORMALIZE_STRING_REPLACE_HYPHENS && string_contains_hyphen(str)) { char *replaced = string_replace_char(str, '-', ' '); if (replaced != NULL) { if (normalized_allocated) { @@ -76,11 +92,28 @@ char *normalize_string_utf8(char *str, uint64_t options) { } } + if (options & NORMALIZE_STRING_REPLACE_NUMEX && num_languages > 0) { + char *numex_normalized = normalize_replace_numex(str, num_languages, languages); + if (numex_normalized != NULL) { + if (normalized_allocated) { + free(normalized); + } + normalized = numex_normalized; + str = normalized; + normalized_allocated = true; + } + + } + return normalized; } +char *normalize_string_utf8(char *str, uint64_t options) { + return normalize_string_utf8_languages(str, options, 0, NULL); +} -char *normalize_string_latin(char *str, size_t len, uint64_t options) { + +char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages) { char *latin_transliterator = LATIN_ASCII; if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) { latin_transliterator = LATIN_ASCII_SIMPLE; @@ -90,9 +123,9 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) { char *utf8_normalized; if (transliterated == NULL) { - utf8_normalized = normalize_string_utf8(str, options); + utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages); } else { - utf8_normalized = normalize_string_utf8(transliterated, options); + utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages); free(transliterated); transliterated = NULL; } @@ -100,7 +133,11 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) { return utf8_normalized; } -void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) { +char *normalize_string_latin(char *str, size_t len, uint64_t options) { + return normalize_string_latin_languages(str, len, options, 0, NULL); +} + +void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options, size_t num_languages, char **languages) { char *transliterated = NULL; char *utf8_normalized = NULL; @@ -114,7 +151,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t if (options & NORMALIZE_STRING_LATIN_ASCII) { transliterated = transliterate(latin_transliterator, str, len); if (transliterated != NULL) { - utf8_normalized = normalize_string_utf8(transliterated, options); + utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages); free(transliterated); transliterated = NULL; } @@ -127,7 +164,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t } char *str_copy = strndup(str, len); - utf8_normalized = normalize_string_utf8(str_copy, options); + utf8_normalized = normalize_string_utf8_languages(str_copy, options, num_languages, languages); free(str_copy); if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) { @@ -150,9 +187,9 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t if (prev_string != NULL) { free(prev_string); } - } + string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) { size_t len = strlen(str); string_tree_t *tree = string_tree_new_size(len); @@ -161,15 +198,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu khash_t(int_set) *scripts = kh_init(int_set); char *utf8_normalized = NULL; - - char *ptr = str; + char *numex_replaced = NULL; script_t script; char *trans_name = NULL; char *lang; - bool transliterate_latin = false; + char *ptr = str; + + bool have_latin_transliterator = false; while (consumed < len) { string_script_t script_span = get_string_script(ptr, len - consumed); script = script_span.script; @@ -182,12 +220,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu if (html_escaped != NULL) { str = html_escaped; } - utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE); + + options ^= NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_STRIP_ACCENTS | NORMALIZE_STRING_LATIN_ASCII; + + utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages); if (utf8_normalized != NULL) { if (html_escaped != NULL) { free(html_escaped); html_escaped = NULL; } + string_tree_add_string(tree, utf8_normalized); string_tree_finalize_token(tree); free(utf8_normalized); @@ -200,22 +242,22 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu log_debug("script_len=%zu\n", script_len); - if (script == SCRIPT_LATIN && num_languages > 0 && !transliterate_latin) { + if (script == SCRIPT_LATIN && num_languages > 0 && !have_latin_transliterator) { for (size_t i = 0; i < num_languages; i++) { lang = languages[i]; foreach_transliterator(script, lang, trans_name, { if (!string_equals(trans_name, LATIN_ASCII)) { - transliterate_latin = true; + have_latin_transliterator = true; break; } }) - if (transliterate_latin) break; + if (have_latin_transliterator) break; } } - if ((script != SCRIPT_LATIN || transliterate_latin) && script_len > 0) { + if ((script != SCRIPT_LATIN || have_latin_transliterator) && script_len > 0) { int ret; khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret); if (ret < 0) { @@ -230,8 +272,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu ptr += script_len; } - if (!transliterate_latin) { - add_latin_alternatives(tree, str, len, options); + if (!have_latin_transliterator) { + add_latin_alternatives(tree, str, len, options, num_languages, languages); } size_t transliterate_scripts = kh_size(scripts); @@ -276,7 +318,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu prev = transliterated; }) - add_latin_alternatives(tree, transliterated, strlen(transliterated), options); + add_latin_alternatives(tree, transliterated, strlen(transliterated), options, num_languages, languages); if (transliterated != str) { free(transliterated); } @@ -287,8 +329,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu } - if (transliterate_latin) { - add_latin_alternatives(tree, str, len, options); + if (have_latin_transliterator) { + add_latin_alternatives(tree, str, len, options, num_languages, languages); } kh_destroy(int_set, scripts); diff --git a/src/normalize.h b/src/normalize.h index e18de053..d485f67f 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -33,6 +33,7 @@ As well as normalizations for individual string tokens: #include "string_utils.h" #include "utf8proc/utf8proc.h" #include "unicode_scripts.h" +#include "numex.h" #include "transliterate.h" #include "trie.h" #include "tokens.h" @@ -47,6 +48,7 @@ As well as normalizations for individual string tokens: #define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 #define NORMALIZE_STRING_COMPOSE 1 << 7 #define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8 +#define NORMALIZE_STRING_REPLACE_NUMEX 1 << 9 #define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 #define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1