diff --git a/src/libpostal.c b/src/libpostal.c index d9b0f436..a38b5f31 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -764,10 +764,20 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to return add_affix_expansions(tree, str, lang, token, prefix, suffix, options); } -static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { +static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { + size_t token_digit_len = possible_ordinal_digit_len(str + token.offset, token.len); size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); - if (len_ordinal_suffix == 0) return false; + bool ret = false; + + if (len_ordinal_suffix == 0 || token_digit_len + len_ordinal_suffix < token.len) { + return false; + } else if (len_ordinal_suffix == token.len && i > 0 && prev_token.len > 0) { + size_t prev_token_digit_len = possible_ordinal_digit_len(str + prev_token.offset, prev_token.len); + ret = prev_token_digit_len == prev_token.len; + } else { + ret = true; + } cstring_array *strings = tree->strings; // Add the original form first. When this function returns true, @@ -779,12 +789,14 @@ static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, ch char *expansion = char_array_get_string(key); cstring_array_add_string(strings, expansion); char_array_destroy(key); - return true; + return ret; } static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { cstring_array *strings = tree->strings; + token_t prev_token = (token_t){0, 0, 0}; + for (size_t i = 0; i < tokens->n; i++) { token_t token = tokens->a[i]; bool have_phrase = false; @@ -803,7 +815,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s break; } - if (normalize_ordinal_suffixes(tree, str, lang, token, options)) { + if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) { have_ordinal = true; break; } @@ -814,6 +826,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s } string_tree_finalize_token(tree); + prev_token = token; } } diff --git a/src/numex.c b/src/numex.c index ff34fcfb..f2a0b156 100644 --- a/src/numex.c +++ b/src/numex.c @@ -1009,7 +1009,7 @@ static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, ge } -static size_t possible_ordinal_digit_len(char *str, size_t len) { +size_t possible_ordinal_digit_len(char *str, size_t len) { uint8_t *ptr = (uint8_t *)str; size_t idx = 0; @@ -1053,11 +1053,6 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { return 0; } - size_t ordinal_digit_len = possible_ordinal_digit_len(str, len); - if (ordinal_digit_len == 0) { - return 0; - } - if (numex_table == NULL) { log_error(NUMEX_SETUP_ERROR); return 0; @@ -1081,8 +1076,8 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { phrase_t phrase = trie_search_suffixes_from_index(trie, str, len, prefix.node_id); - if (phrase.len == len - ordinal_digit_len) { - return len - ordinal_digit_len; + if (phrase.len + phrase.start == len) { + return phrase.len; } } } diff --git a/src/numex.h b/src/numex.h index c000ff9c..d80f96e1 100644 --- a/src/numex.h +++ b/src/numex.h @@ -149,6 +149,7 @@ VECTOR_INIT(numex_result_array, numex_result_t) char *replace_numeric_expressions(char *str, char *lang); numex_result_array *convert_numeric_expressions(char *str, char *lang); size_t ordinal_suffix_len(char *s, size_t len, char *lang); +size_t possible_ordinal_digit_len(char *str, size_t len); bool numex_table_write(FILE *file); bool numex_table_save(char *filename);