From 359cd62e20dbf145a9aeebc250a93c3104bc7e17 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 24 Jul 2015 15:30:58 -0400 Subject: [PATCH] [numex] Adding a replace_numeric_expressions method (returns NULL if no replacements were made), fixing lengths in situations where two unrelated numbers are joined by a stopword e.g. in the phrase "one and one" the "and" acts as a delimiter vs a phrase where the stopword acts as a joiner like "one hundred and twenty" --- src/numex.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++--- src/numex.h | 1 + 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/numex.c b/src/numex.c index 354f79d7..276a00ab 100644 --- a/src/numex.c +++ b/src/numex.c @@ -1,4 +1,5 @@ #include <math.h> +#include <float.h> #include "numex.h" #include "file_utils.h" @@ -8,7 +9,7 @@ #define SEPARATOR_TOKENS "-" -#define FLOOR_LOG_BASE(num, base) floor((log((float)num) / log((float)base)) + 0.00001) +#define FLOOR_LOG_BASE(num, base) floor((log((float)num) / log((float)base)) + FLT_EPSILON) numex_table_t *numex_table = NULL; @@ -631,6 +632,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { numex_result_t prev_result = NULL_NUMEX_RESULT; numex_result_t result = prev_result; + size_t prev_result_len = 0; + numex_result_array *results = NULL; numex_rule_t prev_rule = NUMEX_NULL_RULE; @@ -666,6 +669,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { bool possible_complete_token = false; bool complete_token = false; + log_debug("Converting numex for str=%s, lang=%s\n", str, lang); + while (idx < len) { if (state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN) { char_len = utf8proc_iterate(ptr, len, &codepoint); @@ -762,7 +767,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { } result.len = idx + phrase.start + phrase.len - result.start; - log_debug("ide=%d, phrase.len=%d\n", idx, phrase.len); + log_debug("idx=%d, phrase.len=%d\n", idx, phrase.len); log_debug("prev_rule.radix=%d\n", 
prev_rule.radix); @@ -778,16 +783,19 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_ADD) { result.value += rule.value; log_debug("LEFT_CONTEXT_ADD, value = %lld\n", result.value); - } else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 && FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) { + } else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 && + FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) { result.value += rule.value; log_debug("Last token was RIGHT_CONTEXT_ADD, value=%lld\n", result.value); } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) { log_debug("Had previous token with no context, finishing previous rule before returning\n"); + result.len = prev_result_len; number_finished = true; advance_index = false; state = start_state; rule = prev_rule = NUMEX_NULL_RULE; + prev_result_len = 0; } else if (rule.rule_type != NUMEX_STOPWORD) { result.value = rule.value; log_debug("Got number, result.value=%lld\n", result.value); @@ -795,6 +803,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { if (rule.rule_type != NUMEX_STOPWORD) { prev_rule = rule; + prev_result_len = result.len; } if (rule.rule_type == NUMEX_ORDINAL_RULE) { @@ -929,3 +938,39 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result } +char *replace_numeric_expressions(char *str, char *lang) { + numex_result_array *results = convert_numeric_expressions(str, lang); + if (results == NULL) return NULL; + + size_t len = strlen(str); + + char_array *replacement = char_array_new_size(len); + int start = 0; + int end = 0; + + for (int i = 0; i < results->n; i++) { + numex_result_t result = results->a[i]; + + end = result.start; + + char 
numeric_string[INT64_MAX_STRING_SIZE] = {0}; + sprintf(numeric_string, "%" PRId64, result.value); + + char_array_append_len(replacement, str + start, end - start); + char_array_append(replacement, numeric_string); + + if (result.is_ordinal) { + char *ordinal_suffix = get_ordinal_suffix(numeric_string, lang, result); + char_array_append(replacement, ordinal_suffix); + } + + start = result.start + result.len; + } + + end = start; + char_array_append_len(replacement, str + end, len - end); + char_array_terminate(replacement); + + return char_array_to_string(replacement); +} + diff --git a/src/numex.h b/src/numex.h index c7343c17..2dc0ea25 100644 --- a/src/numex.h +++ b/src/numex.h @@ -138,6 +138,7 @@ typedef struct numex_result { VECTOR_INIT(numex_result_array, numex_result_t) +char *replace_numeric_expressions(char *str, char *lang); numex_result_array *convert_numeric_expressions(char *str, char *lang); char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result);