[numex] When parsing numex, bail on rules in whole_tokens_only languages if there are contiguous rules with no right-context rules (for example, something that wouldn't make sense, like VL in Latin)

This commit is contained in:
Al
2017-10-20 02:34:30 -04:00
parent bd477976d1
commit 9d2a111286

View File

@@ -844,15 +844,21 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) { FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
result.value += rule.value; result.value += rule.value;
log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value); log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value);
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && (!whole_tokens_only || complete_token)) { } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) {
log_debug("Had previous token with no context, finishing previous rule before returning\n"); log_debug("Had previous token with no context, finishing previous rule before returning\n");
if (!whole_tokens_only || complete_token) {
result.len = prev_result_len; result.len = prev_result_len;
number_finished = true; number_finished = true;
advance_index = false; advance_index = false;
state = start_state; state = start_state;
rule = prev_rule = NUMEX_NULL_RULE; rule = prev_rule = NUMEX_NULL_RULE;
prev_result_len = 0; prev_result_len = 0;
} else {
rule = NUMEX_NULL_RULE;
last_was_separator = false;
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
continue;
}
} else if (rule.rule_type != NUMEX_STOPWORD) { } else if (rule.rule_type != NUMEX_STOPWORD) {
result.value = rule.value; result.value = rule.value;
log_debug("Got number, result.value=%" PRId64 "\n", result.value); log_debug("Got number, result.value=%" PRId64 "\n", result.value);
@@ -879,6 +885,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
if (rule.right_context_type == NUMEX_RIGHT_CONTEXT_NONE && !whole_tokens_only) { if (rule.right_context_type == NUMEX_RIGHT_CONTEXT_NONE && !whole_tokens_only) {
number_finished = true; number_finished = true;
} }
log_debug("rule is ordinal\n"); log_debug("rule is ordinal\n");
} }
@@ -1037,6 +1044,7 @@ size_t possible_ordinal_digit_len(char *str, size_t len) {
int32_t ch; int32_t ch;
size_t digit_len = 0; size_t digit_len = 0;
bool seen_first_digit = false;
while (idx < len) { while (idx < len) {
ssize_t char_len = utf8proc_iterate(ptr, len, &ch); ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
@@ -1048,10 +1056,14 @@ size_t possible_ordinal_digit_len(char *str, size_t len) {
// 0-9 only for this // 0-9 only for this
is_digit = ch >= 48 && ch <= 57; is_digit = ch >= 48 && ch <= 57;
if ((idx == 0 && !is_digit) || (idx > 0 && is_digit && !last_was_digit)) { if ((seen_first_digit && is_digit && !last_was_digit)) {
return 0; return 0;
} }
if (is_digit && !seen_first_digit) {
seen_first_digit = true;
}
if (is_digit) { if (is_digit) {
digit_len += char_len; digit_len += char_len;
} }