[numex] Add a replace_numeric_expressions method (returns NULL if no replacements were made). Fix result lengths in situations where two unrelated numbers are joined by a stopword: e.g. in the phrase "one and one" the "and" acts as a delimiter, whereas in a phrase like "one hundred and twenty" the stopword acts as a joiner.

This commit is contained in:
Al
2015-07-24 15:30:58 -04:00
parent 12959aa483
commit 359cd62e20
2 changed files with 49 additions and 3 deletions

View File

@@ -1,4 +1,5 @@
#include <math.h>
#include <float.h>
#include "numex.h"
#include "file_utils.h"
@@ -8,7 +9,7 @@
#define SEPARATOR_TOKENS "-"
#define FLOOR_LOG_BASE(num, base) floor((log((float)num) / log((float)base)) + 0.00001)
/*
 * Floor of the base-`base` logarithm of `num`, evaluated as a double.
 *
 * FLT_EPSILON is added before flooring so that a ratio landing a hair below
 * an integer because of floating-point rounding (e.g. log(1000) / log(10))
 * is not rounded down to the previous integer.
 *
 * Both arguments are parenthesized so the (float) cast applies to the whole
 * argument expression rather than only to its first operand.
 */
#define FLOOR_LOG_BASE(num, base) floor((log((float)(num)) / log((float)(base))) + FLT_EPSILON)
numex_table_t *numex_table = NULL;
@@ -631,6 +632,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
numex_result_t prev_result = NULL_NUMEX_RESULT;
numex_result_t result = prev_result;
size_t prev_result_len = 0;
numex_result_array *results = NULL;
numex_rule_t prev_rule = NUMEX_NULL_RULE;
@@ -666,6 +669,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
bool possible_complete_token = false;
bool complete_token = false;
log_debug("Converting numex for str=%s, lang=%s\n", str, lang);
while (idx < len) {
if (state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN) {
char_len = utf8proc_iterate(ptr, len, &codepoint);
@@ -762,7 +767,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
}
result.len = idx + phrase.start + phrase.len - result.start;
log_debug("ide=%d, phrase.len=%d\n", idx, phrase.len);
log_debug("idx=%d, phrase.len=%d\n", idx, phrase.len);
log_debug("prev_rule.radix=%d\n", prev_rule.radix);
@@ -778,16 +783,19 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_ADD) {
result.value += rule.value;
log_debug("LEFT_CONTEXT_ADD, value = %lld\n", result.value);
} else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 && FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
} else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 &&
FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
result.value += rule.value;
log_debug("Last token was RIGHT_CONTEXT_ADD, value=%lld\n", result.value);
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) {
log_debug("Had previous token with no context, finishing previous rule before returning\n");
result.len = prev_result_len;
number_finished = true;
advance_index = false;
state = start_state;
rule = prev_rule = NUMEX_NULL_RULE;
prev_result_len = 0;
} else if (rule.rule_type != NUMEX_STOPWORD) {
result.value = rule.value;
log_debug("Got number, result.value=%lld\n", result.value);
@@ -795,6 +803,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
if (rule.rule_type != NUMEX_STOPWORD) {
prev_rule = rule;
prev_result_len = result.len;
}
if (rule.rule_type == NUMEX_ORDINAL_RULE) {
@@ -929,3 +938,39 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result
}
/*
 * Build a copy of `str` in which every span reported by
 * convert_numeric_expressions() is replaced by the decimal digits of its
 * value, followed by the ordinal suffix when the result is flagged ordinal.
 * Text between and after the reported spans is copied through unchanged.
 *
 * Returns NULL when convert_numeric_expressions() returns NULL, or when the
 * output buffer cannot be allocated. Otherwise returns the string produced
 * by char_array_to_string().
 * NOTE(review): presumably the caller owns and frees the returned string --
 * confirm against char_array_to_string().
 */
char *replace_numeric_expressions(char *str, char *lang) {
    numex_result_array *results = convert_numeric_expressions(str, lang);
    if (results == NULL) return NULL;

    size_t len = strlen(str);

    char_array *replacement = char_array_new_size(len);
    if (replacement == NULL) {
        /* numex_result_array_destroy is generated by VECTOR_INIT for this type. */
        numex_result_array_destroy(results);
        return NULL;
    }

    /* Offset in str of the first byte not yet copied to the output. */
    size_t start = 0;

    for (size_t i = 0; i < results->n; i++) {
        numex_result_t result = results->a[i];
        size_t end = result.start;

        char numeric_string[INT64_MAX_STRING_SIZE] = {0};
        /* Bounded write: never run past the stack buffer. */
        snprintf(numeric_string, sizeof(numeric_string), "%" PRId64, result.value);

        /* Copy the untouched text before this expression, then the digits. */
        char_array_append_len(replacement, str + start, end - start);
        char_array_append(replacement, numeric_string);

        if (result.is_ordinal) {
            char *ordinal_suffix = get_ordinal_suffix(numeric_string, lang, result);
            /* Guard against a missing suffix rather than appending NULL. */
            if (ordinal_suffix != NULL) {
                char_array_append(replacement, ordinal_suffix);
            }
        }

        /* Skip over the original text of the expression. */
        start = result.start + result.len;
    }

    /* Copy whatever follows the last expression. */
    char_array_append_len(replacement, str + start, len - start);
    char_array_terminate(replacement);

    /* The results array is no longer needed; release it to avoid a leak. */
    numex_result_array_destroy(results);

    return char_array_to_string(replacement);
}

View File

@@ -138,6 +138,7 @@ typedef struct numex_result {
VECTOR_INIT(numex_result_array, numex_result_t)
/*
 * Returns a copy of str with each numeric expression found by
 * convert_numeric_expressions() replaced by its decimal digits (plus the
 * ordinal suffix for ordinal results). Returns NULL when
 * convert_numeric_expressions() returns NULL.
 * NOTE(review): the result comes from char_array_to_string(); presumably the
 * caller is responsible for freeing it -- confirm.
 */
char *replace_numeric_expressions(char *str, char *lang);
numex_result_array *convert_numeric_expressions(char *str, char *lang);
char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result);