[numex] adding ability to handle the degree symbol in numex parsing since it's technically a separate token

This commit is contained in:
Al
2017-04-19 20:18:21 -04:00
parent 19899b2f7d
commit f3adde746e
3 changed files with 21 additions and 12 deletions

View File

@@ -764,10 +764,20 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to
return add_affix_expansions(tree, str, lang, token, prefix, suffix, options);
}
static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) {
size_t token_digit_len = possible_ordinal_digit_len(str + token.offset, token.len);
size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang);
if (len_ordinal_suffix == 0) return false;
bool ret = false;
if (len_ordinal_suffix == 0 || token_digit_len + len_ordinal_suffix < token.len) {
return false;
} else if (len_ordinal_suffix == token.len && i > 0 && prev_token.len > 0) {
size_t prev_token_digit_len = possible_ordinal_digit_len(str + prev_token.offset, prev_token.len);
ret = prev_token_digit_len == prev_token.len;
} else {
ret = true;
}
cstring_array *strings = tree->strings;
// Add the original form first. When this function returns true,
@@ -779,12 +789,14 @@ static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, ch
char *expansion = char_array_get_string(key);
cstring_array_add_string(strings, expansion);
char_array_destroy(key);
return true;
return ret;
}
static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
cstring_array *strings = tree->strings;
token_t prev_token = (token_t){0, 0, 0};
for (size_t i = 0; i < tokens->n; i++) {
token_t token = tokens->a[i];
bool have_phrase = false;
@@ -803,7 +815,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s
break;
}
if (normalize_ordinal_suffixes(tree, str, lang, token, options)) {
if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) {
have_ordinal = true;
break;
}
@@ -814,6 +826,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s
}
string_tree_finalize_token(tree);
prev_token = token;
}
}

View File

@@ -1009,7 +1009,7 @@ static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, ge
}
static size_t possible_ordinal_digit_len(char *str, size_t len) {
size_t possible_ordinal_digit_len(char *str, size_t len) {
uint8_t *ptr = (uint8_t *)str;
size_t idx = 0;
@@ -1053,11 +1053,6 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) {
return 0;
}
size_t ordinal_digit_len = possible_ordinal_digit_len(str, len);
if (ordinal_digit_len == 0) {
return 0;
}
if (numex_table == NULL) {
log_error(NUMEX_SETUP_ERROR);
return 0;
@@ -1081,8 +1076,8 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) {
phrase_t phrase = trie_search_suffixes_from_index(trie, str, len, prefix.node_id);
if (phrase.len == len - ordinal_digit_len) {
return len - ordinal_digit_len;
if (phrase.len + phrase.start == len) {
return phrase.len;
}
}
}

View File

@@ -149,6 +149,7 @@ VECTOR_INIT(numex_result_array, numex_result_t)
char *replace_numeric_expressions(char *str, char *lang);
numex_result_array *convert_numeric_expressions(char *str, char *lang);
size_t ordinal_suffix_len(char *s, size_t len, char *lang);
size_t possible_ordinal_digit_len(char *str, size_t len);
bool numex_table_write(FILE *file);
bool numex_table_save(char *filename);