[numex] when parsing in whole_tokens_only languages, bail on the match and skip the token if contiguous rules occur with no right-context rule joining them (example: something that wouldn't make sense as a numeral, like VL in Latin)

2017-10-20 02:34:30 -04:00
parent bd477976d1
commit 9d2a111286
1 changed file with 21 additions and 9 deletions
--- a/src/numex.c
+++ b/src/numex.c
@@ -844,15 +844,21 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
                       FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
                result.value += rule.value;
                log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value);
-            } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && (!whole_tokens_only || complete_token)) {
+            } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) {
                log_debug("Had previous token with no context, finishing previous rule before returning\n");
-
-                result.len = prev_result_len;
-                number_finished = true;
-                advance_index = false;
-                state = start_state;
-                rule = prev_rule = NUMEX_NULL_RULE;
-                prev_result_len = 0;
+                if (!whole_tokens_only || complete_token) {
+                    result.len = prev_result_len;
+                    number_finished = true;
+                    advance_index = false;
+                    state = start_state;
+                    rule = prev_rule = NUMEX_NULL_RULE;
+                    prev_result_len = 0;
+                } else {
+                    rule = NUMEX_NULL_RULE;
+                    last_was_separator = false;
+                    state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
+                    continue;
+                }
            } else if (rule.rule_type != NUMEX_STOPWORD) {
                result.value = rule.value;
                log_debug("Got number, result.value=%" PRId64 "\n", result.value);
@@ -879,6 +885,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
                if (rule.right_context_type == NUMEX_RIGHT_CONTEXT_NONE && !whole_tokens_only) {
                    number_finished = true;
                }
+
                log_debug("rule is ordinal\n");
            } 

@@ -1037,6 +1044,7 @@ size_t possible_ordinal_digit_len(char *str, size_t len) {
    int32_t ch;

    size_t digit_len = 0;
+    bool seen_first_digit = false;

    while (idx < len) {
        ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
@@ -1048,10 +1056,14 @@ size_t possible_ordinal_digit_len(char *str, size_t len) {
        // 0-9 only for this
        is_digit = ch >= 48 && ch <= 57;

-        if ((idx == 0 && !is_digit) || (idx > 0 && is_digit && !last_was_digit)) {
+        if ((seen_first_digit && is_digit && !last_was_digit)) {
            return 0;
        }

+        if (is_digit && !seen_first_digit) {
+            seen_first_digit = true;
+        }
+
        if (is_digit) {
            digit_len += char_len;
        }