From fd1ebba720ca0c27e380ad84da45cd5e6a0f4c03 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 8 Jun 2015 21:29:04 -0400 Subject: [PATCH] [numex] Initial implementation of multilingual numeric expression parser --- src/numex.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 331 insertions(+), 1 deletion(-) diff --git a/src/numex.c b/src/numex.c index dae23fb0..ab2d7877 100644 --- a/src/numex.c +++ b/src/numex.c @@ -1,3 +1,4 @@ +#include #include "numex.h" #include "file_utils.h" @@ -7,6 +8,8 @@ #define SEPARATOR_TOKENS "-" +#define FLOOR_LOG_BASE(num, base) floor((log((float)num) / log((float)base)) + 0.00001) + numex_table_t *numex_table = NULL; numex_table_t *get_numex_table(void) { @@ -445,6 +448,8 @@ bool numex_table_read(FILE *f) { ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal); } + trie_destroy(numex_table->trie); + numex_table->trie = trie_read(f); if (numex_table->trie == NULL) { goto exit_numex_table_load_error; @@ -464,7 +469,9 @@ bool numex_table_load(char *filename) { if ((f = fopen(filename, "rb")) == NULL) { return NULL; } - return numex_table_read(f); + bool ret = numex_table_read(f); + fclose(f); + return ret; } bool numex_table_write(FILE *f) { @@ -564,3 +571,326 @@ void numex_module_teardown(void) { numex_table_destroy(); numex_table = NULL; } + +#define NULL_NUMEX_PHRASE (numex_phrase_t) {0, GENDER_NONE, CATEGORY_DEFAULT, false, 0, 0} + +typedef enum { + NUMEX_SEARCH_STATE_BEGIN, + NUMEX_SEARCH_STATE_SKIP_TOKEN, + NUMEX_SEARCH_STATE_PARTIAL_MATCH, + NUMEX_SEARCH_STATE_MATCH +} numex_search_state_type; + +typedef struct numex_search_state { + uint32_t node_id; + numex_search_state_type state; +} numex_search_state_t; + +#define NULL_NUMEX_SEARCH_STATE (numex_search_state_t) {NULL_NODE_ID, NUMEX_SEARCH_STATE_BEGIN} + + +static inline numex_rule_t get_numex_rule(size_t i) { + if (i >= numex_table->rules->n) return NUMEX_NULL_RULE; + return numex_table->rules->a[i]; +} + +numex_phrase_array *convert_numeric_expressions(char *str, char *lang) { + if (numex_table == NULL) return NULL; + + trie_t *trie = numex_table->trie; + if (trie == NULL) return NULL; + + trie_prefix_result_t result = trie_get_prefix(trie, lang); + + if (result.node_id == NULL_NODE_ID) { + return NULL; + } + + result = trie_get_prefix_from_index(trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, result.node_id, result.tail_pos); + + if (result.node_id == NULL_NODE_ID) { + return NULL; + } + + numex_phrase_t prev_phrase = NULL_NUMEX_PHRASE; + numex_phrase_t phrase = prev_phrase; + + numex_phrase_array *phrases = NULL; + + numex_rule_t prev_rule = NUMEX_NULL_RULE; + numex_rule_t rule = prev_rule; + + numex_search_state_t state = NULL_NUMEX_SEARCH_STATE; + + numex_search_state_t start_state = NULL_NUMEX_SEARCH_STATE; + start_state.node_id = result.node_id; + + numex_search_state_t prev_state = start_state; + + size_t len = strlen(str); + size_t idx = 0; + + int32_t codepoint = 0; + ssize_t char_len = 0; + uint8_t *ptr = (uint8_t *)str; + unsigned char ch = '\0'; + + bool advance_index = true; + bool advance_state = true; + + bool is_space = false; + bool is_hyphen = false; + + bool number_finished = false; + + uint32_t node_id = result.node_id; + uint32_t last_node_id = node_id; + + trie_node_t start_node = trie_get_node(trie, node_id); + + trie_node_t node = start_node; + trie_node_t last_node = start_node; + + char_array *number_str = NULL; + + bool stopword = false; + + while (idx < len) { + char_len = utf8proc_iterate(ptr, len, &codepoint); + if (char_len <= 0) { + return NULL; + } + + int remaining = char_len; + + if (!utf8proc_codepoint_valid(ch)) { + log_warn("Invalid codepoint: %d\n", codepoint); + idx += char_len; + ptr += char_len; + state = prev_state = start_state; + last_node = start_node; + last_node_id = start_state.node_id; + continue; + } + + int cat = utf8proc_category(codepoint); + + is_space = utf8_is_separator(cat); + if (is_space) { + log_info("is_space\n"); + is_hyphen = false; + } else { + is_hyphen = utf8_is_hyphen(codepoint); + if (is_hyphen) { + log_info("is_hyphen\n"); + } + } + + log_debug("Got char '%.*s', len=%zu at idx=%zu\n", (int)char_len, str + idx, char_len, idx); + + if (prev_state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN && !is_space && !is_hyphen) { + log_debug("Skipping\n"); + idx += char_len; + ptr += char_len; + + node = last_node = start_node; + node_id = last_node_id = start_state.node_id; + continue; + } + + state = start_state; + + uint8_t *back_ptr = ptr; + + bool check_match = false; + + for (int i = 0; remaining > 0; remaining--) { + ch = (unsigned char) *ptr++; + log_debug("char=%c, last_node_id=%d\n", ch, last_node_id); + + node_id = trie_get_transition_index(trie, last_node, ch); + node = trie_get_node(trie, node_id); + + if (node.check != last_node_id) { + log_debug("node.check != last_node_id\n"); + uint32_t match_id = trie_get_transition_index(trie, last_node, '\0'); + trie_node_t match_node = trie_get_node(trie, match_id); + if (match_node.check != last_node_id) { + state = start_state; + last_node = start_node; + last_node_id = start_state.node_id; + + log_debug("No NUL-byte transition, resetting state, last_node_id=%d\n", last_node_id); + + if (!is_space && !is_hyphen) { + log_debug("Fell off trie inside token. Setting to skip\n"); + state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN; + ptr += remaining; + break; + } + + } else { + log_debug("Have NUL-byte transition\n"); + + check_match = true; + node_id = match_id; + node = match_node; + last_node = start_node; + last_node_id = start_state.node_id; + ptr = back_ptr; + remaining = 0; + advance_index = false; + } + + } else { + + log_debug("not null\n"); + state.state = NUMEX_SEARCH_STATE_PARTIAL_MATCH; + if (phrase.len == 0) { + phrase.start = idx; + phrase.len = char_len; + } + + if (node.base >= 0) { + last_node = node; + last_node_id = node_id; + } else if (node.base < 0) { + log_debug("node.base < 0\n"); + remaining--; + check_match = true; + } + } + + + if (check_match) { + + trie_data_node_t data_node = trie_get_data_node(trie, node); + + unsigned char *current_tail = trie->tail->a + data_node.tail; + + size_t tail_len = strlen((char *)current_tail); + char *query_tail = (char *)ptr; + size_t query_tail_len = strlen((char *)query_tail); + + log_info("query_tail=%s, current_tail=%s, bytes=%zu\n", query_tail, current_tail, query_tail_len); + + if (tail_len <= query_tail_len && strncmp((char *)current_tail, query_tail, tail_len) == 0) { + bool set_rule = false; + state.state = NUMEX_SEARCH_STATE_MATCH; + + phrase.len = idx - phrase.start + tail_len; + log_info("phrase.start=%d\n, idx=%d, phrase.len=%d\n", phrase.start, idx, phrase.len); + + ptr += remaining + tail_len; + + char_len += remaining + tail_len; + remaining = 0; + + rule = get_numex_rule((size_t)data_node.data); + + log_info("rule.value=%lld\n", rule.value); + + if (rule.rule_type != NUMEX_NULL) { + set_rule = true; + + if (rule.gender != GENDER_NONE) { + phrase.gender = rule.gender; + } + + if (rule.category != CATEGORY_DEFAULT) { + phrase.category = rule.category; + } + + /* e.g. in English, "two hundred", when you get to hundred, multiply by the + left value mod the current value, which also covers things like + "one thousand two hundred" although in those cases should be less commmon in addresses + */ + + if (rule.rule_type == NUMEX_ORDINAL_RULE) { + phrase.is_ordinal = true; + number_finished = true; + log_info("rule is ordinal\n"); + } + + log_info("prev_rule.radix=%d\n", prev_rule.radix); + + if (rule.left_context_type == NUMEX_LEFT_CONTEXT_MULTIPLY) { + int64_t multiplier = phrase.value % rule.value; + if (multiplier != 0) { + phrase.value -= multiplier; + } else { + multiplier = 1; + } + phrase.value += rule.value * multiplier; + log_info("LEFT_CONTEXT_MULTIPLY, value = %lld\n", phrase.value); + } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_ADD) { + phrase.value += rule.value; + log_info("LEFT_CONTEXT_MULTIPLY, value = %lld\n", phrase.value); + } else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 && FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) { + phrase.value += rule.value; + log_info("Last token was RIGHT_CONTEXT_ADD, value=%lld\n", phrase.value); + } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) { + log_info("Had previous token with no context, finishing previous rule before returning\n"); + + number_finished = true; + advance_index = false; + state = start_state; + last_node = start_node; + last_node_id = start_state.node_id; + ptr = back_ptr; + rule = prev_rule = NUMEX_NULL_RULE; + break; + } else if (rule.rule_type != NUMEX_STOPWORD) { + phrase.value = rule.value; + log_info("Got number, phrase.value=%lld\n", phrase.value); + } + + if (rule.rule_type != NUMEX_STOPWORD) { + prev_rule = rule; + } else { + stopword = true; + } + + } + if (!set_rule) { + rule = prev_rule = NUMEX_NULL_RULE; + log_info("Resetting\n"); + } + set_rule = false; + + } else { + log_info("Tail did not match\n"); + advance_index = false; + } + + state = start_state; + last_node = start_node; + last_node_id = start_state.node_id; + + check_match = false; + } + + + } + + if (advance_index) { + idx += char_len; + } + + if (number_finished) { + phrases = (phrases != NULL) ? phrases : numex_phrase_array_new_size(1); + numex_phrase_array_push(phrases, phrase); + log_info("phrase.value=%lld\n", phrase.value); + phrase = NULL_NUMEX_PHRASE; + number_finished = false; + } + + + prev_state = state; + + advance_index = true; + + } + + return phrases; +}