[numex] adding ability to handle the degree symbol in numex parsing since it's technically a separate token
This commit is contained in:
@@ -764,10 +764,20 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to
|
||||
return add_affix_expansions(tree, str, lang, token, prefix, suffix, options);
|
||||
}
|
||||
|
||||
static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
|
||||
static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) {
|
||||
size_t token_digit_len = possible_ordinal_digit_len(str + token.offset, token.len);
|
||||
size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang);
|
||||
|
||||
if (len_ordinal_suffix == 0) return false;
|
||||
bool ret = false;
|
||||
|
||||
if (len_ordinal_suffix == 0 || token_digit_len + len_ordinal_suffix < token.len) {
|
||||
return false;
|
||||
} else if (len_ordinal_suffix == token.len && i > 0 && prev_token.len > 0) {
|
||||
size_t prev_token_digit_len = possible_ordinal_digit_len(str + prev_token.offset, prev_token.len);
|
||||
ret = prev_token_digit_len == prev_token.len;
|
||||
} else {
|
||||
ret = true;
|
||||
}
|
||||
|
||||
cstring_array *strings = tree->strings;
|
||||
// Add the original form first. When this function returns true,
|
||||
@@ -779,12 +789,14 @@ static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, ch
|
||||
char *expansion = char_array_get_string(key);
|
||||
cstring_array_add_string(strings, expansion);
|
||||
char_array_destroy(key);
|
||||
return true;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
|
||||
cstring_array *strings = tree->strings;
|
||||
|
||||
token_t prev_token = (token_t){0, 0, 0};
|
||||
|
||||
for (size_t i = 0; i < tokens->n; i++) {
|
||||
token_t token = tokens->a[i];
|
||||
bool have_phrase = false;
|
||||
@@ -803,7 +815,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s
|
||||
break;
|
||||
}
|
||||
|
||||
if (normalize_ordinal_suffixes(tree, str, lang, token, options)) {
|
||||
if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) {
|
||||
have_ordinal = true;
|
||||
break;
|
||||
}
|
||||
@@ -814,6 +826,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s
|
||||
}
|
||||
|
||||
string_tree_finalize_token(tree);
|
||||
prev_token = token;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
11
src/numex.c
11
src/numex.c
@@ -1009,7 +1009,7 @@ static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, ge
|
||||
|
||||
}
|
||||
|
||||
static size_t possible_ordinal_digit_len(char *str, size_t len) {
|
||||
size_t possible_ordinal_digit_len(char *str, size_t len) {
|
||||
uint8_t *ptr = (uint8_t *)str;
|
||||
size_t idx = 0;
|
||||
|
||||
@@ -1053,11 +1053,6 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t ordinal_digit_len = possible_ordinal_digit_len(str, len);
|
||||
if (ordinal_digit_len == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (numex_table == NULL) {
|
||||
log_error(NUMEX_SETUP_ERROR);
|
||||
return 0;
|
||||
@@ -1081,8 +1076,8 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) {
|
||||
|
||||
phrase_t phrase = trie_search_suffixes_from_index(trie, str, len, prefix.node_id);
|
||||
|
||||
if (phrase.len == len - ordinal_digit_len) {
|
||||
return len - ordinal_digit_len;
|
||||
if (phrase.len + phrase.start == len) {
|
||||
return phrase.len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,6 +149,7 @@ VECTOR_INIT(numex_result_array, numex_result_t)
|
||||
char *replace_numeric_expressions(char *str, char *lang);
|
||||
numex_result_array *convert_numeric_expressions(char *str, char *lang);
|
||||
size_t ordinal_suffix_len(char *s, size_t len, char *lang);
|
||||
size_t possible_ordinal_digit_len(char *str, size_t len);
|
||||
|
||||
bool numex_table_write(FILE *file);
|
||||
bool numex_table_save(char *filename);
|
||||
|
||||
Reference in New Issue
Block a user