[numex] Fixing cases with stopwords not attached to a numeric expression
This commit is contained in:
28
src/numex.c
28
src/numex.c
@@ -648,6 +648,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
|||||||
|
|
||||||
numex_search_state_t prev_state = start_state;
|
numex_search_state_t prev_state = start_state;
|
||||||
|
|
||||||
|
phrase_t stopword_phrase;
|
||||||
|
|
||||||
size_t len = strlen(str);
|
size_t len = strlen(str);
|
||||||
size_t idx = 0;
|
size_t idx = 0;
|
||||||
|
|
||||||
@@ -668,6 +670,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
|||||||
char_array *number_str = NULL;
|
char_array *number_str = NULL;
|
||||||
|
|
||||||
bool last_was_separator = false;
|
bool last_was_separator = false;
|
||||||
|
bool last_was_stopword = false;
|
||||||
bool possible_complete_token = false;
|
bool possible_complete_token = false;
|
||||||
bool complete_token = false;
|
bool complete_token = false;
|
||||||
|
|
||||||
@@ -775,7 +778,10 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
|||||||
if (result.len == 0) {
|
if (result.len == 0) {
|
||||||
result.start = idx + phrase.start;
|
result.start = idx + phrase.start;
|
||||||
}
|
}
|
||||||
result.len = idx + phrase.start + phrase.len - result.start;
|
|
||||||
|
if (rule.rule_type != NUMEX_STOPWORD) {
|
||||||
|
result.len = idx + phrase.start + phrase.len - result.start;
|
||||||
|
}
|
||||||
|
|
||||||
log_debug("idx=%zu, phrase.len=%d\n", idx, phrase.len);
|
log_debug("idx=%zu, phrase.len=%d\n", idx, phrase.len);
|
||||||
|
|
||||||
@@ -812,13 +818,21 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
|||||||
} else if (rule.rule_type == NUMEX_STOPWORD && prev_rule.rule_type == NUMEX_NULL) {
|
} else if (rule.rule_type == NUMEX_STOPWORD && prev_rule.rule_type == NUMEX_NULL) {
|
||||||
log_debug("numex stopword\n");
|
log_debug("numex stopword\n");
|
||||||
rule = NUMEX_NULL_RULE;
|
rule = NUMEX_NULL_RULE;
|
||||||
|
last_was_separator = false;
|
||||||
|
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rule.rule_type != NUMEX_STOPWORD) {
|
if (rule.rule_type != NUMEX_STOPWORD) {
|
||||||
prev_rule = rule;
|
prev_rule = rule;
|
||||||
prev_result_len = result.len;
|
prev_result_len = result.len;
|
||||||
|
stopword_phrase = NULL_PHRASE;
|
||||||
|
} else {
|
||||||
|
stopword_phrase = phrase;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
last_was_stopword = rule.rule_type == NUMEX_STOPWORD;
|
||||||
|
|
||||||
if (rule.rule_type == NUMEX_ORDINAL_RULE) {
|
if (rule.rule_type == NUMEX_ORDINAL_RULE) {
|
||||||
result.is_ordinal = true;
|
result.is_ordinal = true;
|
||||||
if (rule.right_context_type == NUMEX_RIGHT_CONTEXT_NONE && !whole_tokens_only) {
|
if (rule.right_context_type == NUMEX_RIGHT_CONTEXT_NONE && !whole_tokens_only) {
|
||||||
@@ -830,7 +844,19 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
|||||||
if (rule.rule_type != NUMEX_NULL && idx + phrase.start + phrase.len == len) {
|
if (rule.rule_type != NUMEX_NULL && idx + phrase.start + phrase.len == len) {
|
||||||
number_finished = true;
|
number_finished = true;
|
||||||
}
|
}
|
||||||
|
} else if (last_was_stopword) {
|
||||||
|
log_debug("last was stopword\n");
|
||||||
|
last_was_separator = false;
|
||||||
|
advance_index = false;
|
||||||
|
idx = stopword_phrase.start;
|
||||||
|
ptr = (uint8_t *)str + stopword_phrase.start;
|
||||||
|
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
|
||||||
|
if (prev_rule.rule_type != NUMEX_NULL) {
|
||||||
|
number_finished = true;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!set_rule) {
|
if (!set_rule) {
|
||||||
rule = prev_rule = NUMEX_NULL_RULE;
|
rule = prev_rule = NUMEX_NULL_RULE;
|
||||||
log_debug("Resetting rules to NUMEX_NULL_RULE\n");
|
log_debug("Resetting rules to NUMEX_NULL_RULE\n");
|
||||||
|
|||||||
Reference in New Issue
Block a user