[numex] Add a replace_numeric_expressions function (returns NULL if no replacements were made). Also fix result lengths in situations where two unrelated numbers are separated by a stopword: e.g. in the phrase "one and one" the "and" acts as a delimiter, whereas in a phrase like "one hundred and twenty" the stopword acts as a joiner.

2015-07-24 15:30:58 -04:00
parent 12959aa483
commit 359cd62e20
2 changed files with 49 additions and 3 deletions
--- a/src/numex.c
+++ b/src/numex.c
@@ -1,4 +1,5 @@
 #include <math.h>
 #include <float.h>
 #include "numex.h"
 #include "file_utils.h"
@@ -8,7 +9,7 @@
 #define SEPARATOR_TOKENS "-"
-#define FLOOR_LOG_BASE(num, base) floor((log((float)num) / log((float)base)) + 0.00001)
+#define FLOOR_LOG_BASE(num, base) floor((log((float)num) / log((float)base)) + FLT_EPSILON)
 numex_table_t *numex_table = NULL;
@@ -631,6 +632,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
    numex_result_t prev_result = NULL_NUMEX_RESULT;
    numex_result_t result = prev_result;
    size_t prev_result_len = 0;
    numex_result_array *results = NULL;
    numex_rule_t prev_rule = NUMEX_NULL_RULE;
@@ -666,6 +669,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
    bool possible_complete_token = false;
    bool complete_token = false;
    log_debug("Converting numex for str=%s, lang=%s\n", str, lang);
    while (idx < len) {
        if (state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN) {
            char_len = utf8proc_iterate(ptr, len, &codepoint);
@@ -762,7 +767,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
            }
            result.len = idx + phrase.start + phrase.len - result.start;
-            log_debug("ide=%d, phrase.len=%d\n", idx, phrase.len);
+            log_debug("idx=%d, phrase.len=%d\n", idx, phrase.len);
            log_debug("prev_rule.radix=%d\n", prev_rule.radix);
@@ -778,16 +783,19 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
            } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_ADD) {
                result.value += rule.value;
                log_debug("LEFT_CONTEXT_ADD, value = %lld\n", result.value);
-            } else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 && FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
+            } else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 &&
                       FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
                result.value += rule.value;
                log_debug("Last token was RIGHT_CONTEXT_ADD, value=%lld\n", result.value);
            } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) {
                log_debug("Had previous token with no context, finishing previous rule before returning\n");
                result.len = prev_result_len;
                number_finished = true;
                advance_index = false;
                state = start_state;
                rule = prev_rule = NUMEX_NULL_RULE;
                prev_result_len = 0;
            } else if (rule.rule_type != NUMEX_STOPWORD) {
                result.value = rule.value;
                log_debug("Got number, result.value=%lld\n", result.value);
@@ -795,6 +803,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
            if (rule.rule_type != NUMEX_STOPWORD) {
                prev_rule = rule;
                prev_result_len = result.len;
            }
            if (rule.rule_type == NUMEX_ORDINAL_RULE) {
@@ -929,3 +938,39 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result
 }
 /**
  * Rewrite `str` with every detected numeric expression replaced by digits.
  *
  * Each result from convert_numeric_expressions() is rendered as a base-10
  * integer and spliced in place of the bytes
  * [result.start, result.start + result.len). When the result is flagged
  * ordinal, the text returned by get_ordinal_suffix() is appended after the
  * digits. Text before, between and after the results is copied unchanged.
  *
  * @param str   NUL-terminated input string; it is only read.
  * @param lang  language code forwarded to convert_numeric_expressions() and
  *              get_ordinal_suffix().
  * @return NULL when convert_numeric_expressions() returns NULL or the output
  *         buffer cannot be allocated; otherwise the string handed back by
  *         char_array_to_string().
  *         NOTE(review): presumably heap-allocated and owned by the caller —
  *         confirm against char_array_to_string().
  */
 char *replace_numeric_expressions(char *str, char *lang) {
    numex_result_array *results = convert_numeric_expressions(str, lang);
    if (results == NULL) return NULL;

    size_t len = strlen(str);
    char_array *replacement = char_array_new_size(len);
    if (replacement == NULL) {
        numex_result_array_destroy(results);
        return NULL;
    }

    // Offset of the first input byte not yet copied to the output
    size_t start = 0;

    for (size_t i = 0; i < results->n; i++) {
        numex_result_t result = results->a[i];

        char numeric_string[INT64_MAX_STRING_SIZE] = {0};
        // Bounded write: never run past the stack buffer
        snprintf(numeric_string, sizeof numeric_string, "%" PRId64, result.value);

        // Copy the untouched text that precedes this expression, then the digits
        char_array_append_len(replacement, str + start, result.start - start);
        char_array_append(replacement, numeric_string);

        if (result.is_ordinal) {
            char *ordinal_suffix = get_ordinal_suffix(numeric_string, lang, result);
            // Guard in case no suffix is available for this number/language
            if (ordinal_suffix != NULL) {
                char_array_append(replacement, ordinal_suffix);
            }
        }

        start = result.start + result.len;
    }

    // Trailing text after the last expression
    char_array_append_len(replacement, str + start, len - start);
    char_array_terminate(replacement);

    // The results array is only needed while building the output; release it
    // here so it is not leaked (destroy function comes from the VECTOR_INIT
    // declaration of numex_result_array).
    numex_result_array_destroy(results);

    return char_array_to_string(replacement);
 }
--- a/src/numex.h
+++ b/src/numex.h
@@ -138,6 +138,7 @@ typedef struct numex_result {
 VECTOR_INIT(numex_result_array, numex_result_t)
 /* Returns a copy of str in which each numeric expression reported by
  * convert_numeric_expressions() is replaced by its digit form (plus an ordinal
  * suffix for ordinal results). Returns NULL when convert_numeric_expressions()
  * returns NULL.
  * NOTE(review): the returned string is presumably caller-owned — confirm. */
 char *replace_numeric_expressions(char *str, char *lang);
 /* Scans str for numeric expressions in language lang and returns them as a
  * numex_result_array. May return NULL; callers check for it. */
 numex_result_array *convert_numeric_expressions(char *str, char *lang);
 /* Returns the ordinal suffix text to place after numeric_string for the given
  * lang and result.
  * NOTE(review): ownership of the returned pointer and whether it can be NULL
  * are not visible from this header — confirm. */
 char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result);