[numex] fix numex parsing when the spelled-out number is followed by a comma or other punctuation

This commit is contained in:
Al
2017-04-11 16:28:33 -04:00
parent fc91471434
commit b464eb6c07

View File

@@ -684,6 +684,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
bool is_space = false; bool is_space = false;
bool is_hyphen = false; bool is_hyphen = false;
bool is_punct = false;
char_array *number_str = NULL; char_array *number_str = NULL;
@@ -702,9 +703,13 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
if (codepoint == 0) break; if (codepoint == 0) break;
is_space = utf8_is_separator(cat); is_space = utf8_is_separator(cat);
is_punct = utf8_is_punctuation(cat);
if (is_space) { if (is_space) {
log_debug("is_space\n"); log_debug("is_space\n");
is_hyphen = false; is_hyphen = false;
} else if (is_punct) {
log_debug("is_punct\n");
is_hyphen = false;
} else { } else {
is_hyphen = utf8_is_hyphen(codepoint); is_hyphen = utf8_is_hyphen(codepoint);
if (is_hyphen) { if (is_hyphen) {
@@ -741,7 +746,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
possible_complete_token = true; possible_complete_token = true;
} else { } else {
log_debug("other char\n"); log_debug("other char\n");
if (result.len > 0 && (!whole_tokens_only || (prev_state.state != NUMEX_SEARCH_STATE_MATCH && complete_token))) { if (result.len > 0 && (!whole_tokens_only || (prev_state.state == NUMEX_SEARCH_STATE_MATCH && is_punct) || (prev_state.state != NUMEX_SEARCH_STATE_MATCH && complete_token))) {
results = (results != NULL) ? results : numex_result_array_new_size(1); results = (results != NULL) ? results : numex_result_array_new_size(1);
numex_result_array_push(results, result); numex_result_array_push(results, result);
log_debug("Adding phrase from partial token, value=%" PRId64 "\n", result.value); log_debug("Adding phrase from partial token, value=%" PRId64 "\n", result.value);
@@ -904,6 +909,71 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
return results; return results;
} }
static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lang, gender_t gender, grammatical_category_t category, bool use_default_if_not_found) {
numex_language_t *language = get_numex_language(lang);
if (language == NULL) {
return NULL_PREFIX_RESULT;
}
bool whole_tokens_only = language->whole_tokens_only;
trie_prefix_result_t prefix = trie_get_prefix(trie, lang);
if (prefix.node_id == NULL_NODE_ID) {
return NULL_PREFIX_RESULT;
}
prefix = trie_get_prefix_from_index(trie, ORDINAL_NAMESPACE_PREFIX, ORDINAL_NAMESPACE_PREFIX_LEN, prefix.node_id, prefix.tail_pos);
if (prefix.node_id == NULL_NODE_ID) {
return NULL_PREFIX_RESULT;
}
trie_prefix_result_t ordinal_prefix = prefix;
char *gender_str = GENDER_NONE_PREFIX;
if (gender == GENDER_FEMININE) {
gender_str = GENDER_FEMININE_PREFIX;
} else if (gender == GENDER_MASCULINE) {
gender_str = GENDER_MASCULINE_PREFIX;
} else if (gender == GENDER_NEUTER) {
gender_str = GENDER_NEUTER_PREFIX;
}
prefix = trie_get_prefix_from_index(trie, gender_str, strlen(gender_str), ordinal_prefix.node_id, ordinal_prefix.tail_pos);
if (prefix.node_id == NULL_NODE_ID && gender != GENDER_NONE && use_default_if_not_found) {
prefix = trie_get_prefix_from_index(trie, GENDER_NONE_PREFIX, strlen(GENDER_NONE_PREFIX), ordinal_prefix.node_id, ordinal_prefix.tail_pos);
}
if (prefix.node_id == NULL_NODE_ID) {
return prefix;
}
trie_prefix_result_t gender_prefix = prefix;
char *category_str = CATEGORY_DEFAULT_PREFIX;
if (category == CATEGORY_PLURAL) {
category_str = CATEGORY_PLURAL_PREFIX;
}
prefix = trie_get_prefix_from_index(trie, category_str, strlen(category_str), gender_prefix.node_id, gender_prefix.tail_pos);
if (prefix.node_id == NULL_NODE_ID && category != CATEGORY_DEFAULT && use_default_if_not_found) {
prefix = trie_get_prefix_from_index(trie, CATEGORY_DEFAULT_PREFIX, strlen(CATEGORY_DEFAULT_PREFIX), gender_prefix.node_id, gender_prefix.tail_pos);
}
if (prefix.node_id == NULL_NODE_ID) {
return prefix;
}
prefix = trie_get_prefix_from_index(trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, prefix.node_id, prefix.tail_pos);
return prefix;
}
char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result) { char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result) {
if (numex_table == NULL) { if (numex_table == NULL) {
log_error(NUMEX_SETUP_ERROR); log_error(NUMEX_SETUP_ERROR);
@@ -915,67 +985,8 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result
return NULL; return NULL;
} }
numex_language_t *language = get_numex_language(lang); bool use_default_if_not_found = true;
trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, result.gender, result.category, use_default_if_not_found);
if (language == NULL) {
return NULL;
}
bool whole_tokens_only = language->whole_tokens_only;
trie_prefix_result_t prefix = trie_get_prefix(trie, lang);
if (prefix.node_id == NULL_NODE_ID) {
return NULL;
}
prefix = trie_get_prefix_from_index(trie, ORDINAL_NAMESPACE_PREFIX, ORDINAL_NAMESPACE_PREFIX_LEN, prefix.node_id, prefix.tail_pos);
if (prefix.node_id == NULL_NODE_ID) {
return NULL;
}
trie_prefix_result_t ordinal_prefix = prefix;
char *gender = GENDER_NONE_PREFIX;
if (result.gender == GENDER_FEMININE) {
gender = GENDER_FEMININE_PREFIX;
} else if (result.gender == GENDER_MASCULINE) {
gender = GENDER_MASCULINE_PREFIX;
} else if (result.gender == GENDER_NEUTER) {
gender = GENDER_NEUTER_PREFIX;
}
prefix = trie_get_prefix_from_index(trie, gender, strlen(gender), ordinal_prefix.node_id, ordinal_prefix.tail_pos);
if (prefix.node_id == NULL_NODE_ID && result.gender != GENDER_NONE) {
prefix = trie_get_prefix_from_index(trie, GENDER_NONE_PREFIX, strlen(GENDER_NONE_PREFIX), ordinal_prefix.node_id, ordinal_prefix.tail_pos);
}
if (prefix.node_id == NULL_NODE_ID) {
return NULL;
}
trie_prefix_result_t gender_prefix = prefix;
char *category = CATEGORY_DEFAULT_PREFIX;
if (result.category == CATEGORY_PLURAL) {
category = CATEGORY_PLURAL_PREFIX;
}
prefix = trie_get_prefix_from_index(trie, category, strlen(category), gender_prefix.node_id, gender_prefix.tail_pos);
if (prefix.node_id == NULL_NODE_ID && result.category != CATEGORY_DEFAULT) {
prefix = trie_get_prefix_from_index(trie, CATEGORY_DEFAULT_PREFIX, strlen(CATEGORY_DEFAULT_PREFIX), gender_prefix.node_id, gender_prefix.tail_pos);
}
if (prefix.node_id == NULL_NODE_ID) {
return NULL;
}
prefix = trie_get_prefix_from_index(trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, prefix.node_id, prefix.tail_pos);
if (prefix.node_id == NULL_NODE_ID) { if (prefix.node_id == NULL_NODE_ID) {
return NULL; return NULL;