[numex] Initial implementation of multilingual numeric expression parser

2015-06-08 21:29:04 -04:00
parent 6267b3a431
commit fd1ebba720
1 changed files with 331 additions and 1 deletions
--- a/src/numex.c
+++ b/src/numex.c
@@ -1,3 +1,4 @@
+#include <math.h>
 #include "numex.h"
 #include "file_utils.h"

@@ -7,6 +8,8 @@

 #define SEPARATOR_TOKENS "-"

+#define FLOOR_LOG_BASE(num, base) floor((log((double)(num)) / log((double)(base))) + 0.00001) /* floor(log_base(num)); the epsilon absorbs rounding at exact powers */
+
 numex_table_t *numex_table = NULL;

 numex_table_t *get_numex_table(void) {
@@ -445,6 +448,8 @@ bool numex_table_read(FILE *f) {
        ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal);
    }

+    trie_destroy(numex_table->trie);
+
    numex_table->trie = trie_read(f);
    if (numex_table->trie == NULL) {
        goto exit_numex_table_load_error;
@@ -464,7 +469,9 @@ bool numex_table_load(char *filename) {
    if ((f = fopen(filename, "rb")) == NULL) {
        return NULL;
    }
-    return numex_table_read(f);
+    bool ret = numex_table_read(f);
+    fclose(f);
+    return ret;
 }

 bool numex_table_write(FILE *f) {
@@ -564,3 +571,326 @@ void numex_module_teardown(void) {
    numex_table_destroy();
    numex_table = NULL;
 }
+
+#define NULL_NUMEX_PHRASE (numex_phrase_t) {0, GENDER_NONE, CATEGORY_DEFAULT, false, 0, 0} /* empty phrase; used to initialize and reset the phrase accumulator */
+
+typedef enum { /* progress of the trie search within convert_numeric_expressions */
+    NUMEX_SEARCH_STATE_BEGIN, /* default state of NULL_NUMEX_SEARCH_STATE */
+    NUMEX_SEARCH_STATE_SKIP_TOKEN, /* fell off the trie inside a token; the rest of the token is consumed unmatched */
+    NUMEX_SEARCH_STATE_PARTIAL_MATCH, /* the byte just read followed a trie transition */
+    NUMEX_SEARCH_STATE_MATCH /* the remaining key tail matched the input */
+} numex_search_state_type;
+
+typedef struct numex_search_state { /* search position plus match progress */
+    uint32_t node_id; /* trie node id associated with this state */
+    numex_search_state_type state; /* match progress, see numex_search_state_type */
+} numex_search_state_t;
+
+#define NULL_NUMEX_SEARCH_STATE (numex_search_state_t) {NULL_NODE_ID, NUMEX_SEARCH_STATE_BEGIN} /* no node, search not started */
+
+
+static inline numex_rule_t get_numex_rule(size_t i) {
+    /* Bounds-checked lookup: an index past the end of the rule array yields NUMEX_NULL_RULE. */
+    return (i < numex_table->rules->n) ? numex_table->rules->a[i] : NUMEX_NULL_RULE;
+}
+
+/*
+ * Scan str for numeric expressions of language lang by walking the numex trie
+ * one UTF-8 codepoint at a time, folding matched rules into a running phrase.
+ *
+ * Returns an array of the finished phrases, allocated on first use, or NULL
+ * when the table/trie is not loaded, lang has no namespace in the trie, a
+ * codepoint fails to decode, or no phrase was finished.
+ * NOTE(review): a phrase still pending when the input ends is not pushed, and
+ * the decode-failure return does not release an already allocated array.
+ */
+numex_phrase_array *convert_numeric_expressions(char *str, char *lang) {
+    if (numex_table == NULL) return NULL;
+
+    trie_t *trie = numex_table->trie;
+    if (trie == NULL) return NULL;
+
+    trie_prefix_result_t result = trie_get_prefix(trie, lang); /* keys are looked up under the language prefix */
+
+    if (result.node_id == NULL_NODE_ID) {
+        return NULL;
+    }
+
+    result = trie_get_prefix_from_index(trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, result.node_id, result.tail_pos);
+
+    if (result.node_id == NULL_NODE_ID) {
+        return NULL;
+    }
+
+    numex_phrase_t phrase = NULL_NUMEX_PHRASE;
+
+    numex_phrase_array *phrases = NULL;
+
+    numex_rule_t prev_rule = NUMEX_NULL_RULE;
+    numex_rule_t rule = prev_rule;
+
+    numex_search_state_t state = NULL_NUMEX_SEARCH_STATE;
+
+    numex_search_state_t start_state = NULL_NUMEX_SEARCH_STATE;
+    start_state.node_id = result.node_id;
+
+    numex_search_state_t prev_state = start_state;
+
+    size_t len = strlen(str);
+    size_t idx = 0;
+
+    int32_t codepoint = 0;
+    ssize_t char_len = 0;
+    uint8_t *ptr = (uint8_t *)str;
+    unsigned char ch = '\0';
+
+    bool advance_index = true;
+
+    bool is_space = false;
+    bool is_hyphen = false;
+
+    bool number_finished = false;
+
+    uint32_t node_id = result.node_id;
+    uint32_t last_node_id = node_id;
+
+    trie_node_t start_node = trie_get_node(trie, node_id);
+
+    trie_node_t node = start_node;
+    trie_node_t last_node = start_node;
+
+    while (idx < len) {
+        char_len = utf8proc_iterate(ptr, len, &codepoint); /* decode one codepoint; char_len is its byte length */
+        if (char_len <= 0) {
+            return NULL;
+        }
+
+        int remaining = char_len;
+
+        if (!utf8proc_codepoint_valid(codepoint)) { /* check the decoded codepoint, not the last trie byte */
+            log_warn("Invalid codepoint: %d\n", codepoint);
+            idx += char_len;
+            ptr += char_len;
+            state = prev_state = start_state;
+            last_node = start_node;
+            last_node_id = start_state.node_id;
+            continue;
+        }
+
+        int cat = utf8proc_category(codepoint);
+
+        is_space = utf8_is_separator(cat);
+        if (is_space) {
+            log_info("is_space\n");
+            is_hyphen = false;
+        } else {
+            is_hyphen = utf8_is_hyphen(codepoint);
+            if (is_hyphen) {
+                log_info("is_hyphen\n");
+            }
+        }
+
+        log_debug("Got char '%.*s', len=%zd at idx=%zu\n", (int)char_len, str + idx, char_len, idx);
+
+        if (prev_state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN && !is_space && !is_hyphen) { /* still inside a token that already failed to match */
+            log_debug("Skipping\n");
+            idx += char_len;
+            ptr += char_len;
+
+            node = last_node = start_node;
+            node_id = last_node_id = start_state.node_id;
+            continue;
+        }
+
+        state = start_state;
+
+        uint8_t *back_ptr = ptr; /* first byte of this codepoint, used to rewind */
+
+        bool check_match = false;
+
+        for (; remaining > 0; remaining--) {
+            ch = (unsigned char) *ptr++;
+            log_debug("char=%c, last_node_id=%d\n", ch, last_node_id);
+
+            node_id = trie_get_transition_index(trie, last_node, ch);
+            node = trie_get_node(trie, node_id);
+
+            if (node.check != last_node_id) {
+                log_debug("node.check != last_node_id\n");
+                uint32_t match_id = trie_get_transition_index(trie, last_node, '\0'); /* does the path so far end a key? */
+                trie_node_t match_node = trie_get_node(trie, match_id);
+                if (match_node.check != last_node_id) {
+                    state = start_state;
+                    last_node = start_node;
+                    last_node_id = start_state.node_id;
+
+                    log_debug("No NUL-byte transition, resetting state, last_node_id=%d\n", last_node_id);
+
+                    if (!is_space && !is_hyphen) {
+                        log_debug("Fell off trie inside token. Setting to skip\n");
+                        state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
+                        ptr += remaining;
+                        break;
+                    }
+
+                } else {
+                    log_debug("Have NUL-byte transition\n");
+
+                    check_match = true;
+                    node_id = match_id;
+                    node = match_node;
+                    last_node = start_node;
+                    last_node_id = start_state.node_id;
+                    ptr = back_ptr; /* rewind so this codepoint is read again from the start node */
+                    remaining = 0;
+                    advance_index = false;
+                }
+
+            } else {
+
+                log_debug("not null\n");
+                state.state = NUMEX_SEARCH_STATE_PARTIAL_MATCH;
+                if (phrase.len == 0) {
+                    phrase.start = idx;
+                    phrase.len = char_len;
+                }
+
+                if (node.base >= 0) {
+                    last_node = node;
+                    last_node_id = node_id;
+                } else if (node.base < 0) {
+                    log_debug("node.base < 0\n");
+                    remaining--;
+                    check_match = true;
+                }
+            }
+
+
+            if (check_match) {
+
+                trie_data_node_t data_node = trie_get_data_node(trie, node);
+
+                unsigned char *current_tail = trie->tail->a + data_node.tail;
+
+                size_t tail_len = strlen((char *)current_tail);
+                char *query_tail = (char *)ptr;
+                size_t query_tail_len = strlen((char *)query_tail);
+
+                log_info("query_tail=%s, current_tail=%s, bytes=%zu\n", query_tail, current_tail, query_tail_len);
+
+                if (tail_len <= query_tail_len && strncmp((char *)current_tail, query_tail, tail_len) == 0) {
+                    bool set_rule = false;
+                    state.state = NUMEX_SEARCH_STATE_MATCH;
+
+                    phrase.len = idx - phrase.start + tail_len;
+                    log_info("phrase.start=%d\n, idx=%d, phrase.len=%d\n", phrase.start, idx, phrase.len);
+
+                    ptr += remaining + tail_len; /* consume the rest of this codepoint plus the matched tail */
+
+                    char_len += remaining + tail_len;
+                    remaining = 0;
+
+                    rule = get_numex_rule((size_t)data_node.data);
+
+                    log_info("rule.value=%lld\n", rule.value);
+
+                    if (rule.rule_type != NUMEX_NULL) {
+                        set_rule = true;
+
+                        if (rule.gender != GENDER_NONE) {
+                            phrase.gender = rule.gender;
+                        }
+
+                        if (rule.category != CATEGORY_DEFAULT) {
+                            phrase.category = rule.category;
+                        }
+
+                        /* e.g. in English, "two hundred": when you get to "hundred", multiply it by
+                           the left value mod the current value, which also covers things like
+                           "one thousand two hundred", although those cases should be less common in addresses
+                        */
+
+                        if (rule.rule_type == NUMEX_ORDINAL_RULE) {
+                            phrase.is_ordinal = true;
+                            number_finished = true;
+                            log_info("rule is ordinal\n");
+                        }
+
+                        log_info("prev_rule.radix=%d\n", prev_rule.radix);
+
+                        if (rule.left_context_type == NUMEX_LEFT_CONTEXT_MULTIPLY) {
+                            int64_t multiplier = phrase.value % rule.value;
+                            if (multiplier != 0) {
+                                phrase.value -= multiplier;
+                            } else {
+                                multiplier = 1;
+                            }
+                            phrase.value += rule.value * multiplier;
+                            log_info("LEFT_CONTEXT_MULTIPLY, value = %lld\n", phrase.value);
+                        } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_ADD) {
+                            phrase.value += rule.value;
+                            log_info("LEFT_CONTEXT_ADD, value = %lld\n", phrase.value);
+                        } else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 && FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
+                            phrase.value += rule.value;
+                            log_info("Last token was RIGHT_CONTEXT_ADD, value=%lld\n", phrase.value);
+                        } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) {
+                            log_info("Had previous token with no context, finishing previous rule before returning\n");
+
+                            number_finished = true;
+                            advance_index = false;
+                            state = start_state;
+                            last_node = start_node;
+                            last_node_id = start_state.node_id;
+                            ptr = back_ptr;
+                            rule = prev_rule = NUMEX_NULL_RULE;
+                            break;
+                        } else if (rule.rule_type != NUMEX_STOPWORD) {
+                            phrase.value = rule.value;
+                            log_info("Got number, phrase.value=%lld\n", phrase.value);
+                        }
+
+                        if (rule.rule_type != NUMEX_STOPWORD) {
+                            prev_rule = rule;
+                        }
+
+                    }
+                    if (!set_rule) {
+                        rule = prev_rule = NUMEX_NULL_RULE;
+                        log_info("Resetting\n");
+                    }
+
+                } else {
+                    log_info("Tail did not match\n");
+                    advance_index = false;
+                }
+
+                state = start_state;
+                last_node = start_node;
+                last_node_id = start_state.node_id;
+
+                check_match = false;
+            }
+
+        }
+
+        if (advance_index) {
+            idx += char_len;
+        }
+
+        if (number_finished) {
+            phrases = (phrases != NULL) ? phrases : numex_phrase_array_new_size(1); /* allocate on first finished phrase */
+            numex_phrase_array_push(phrases, phrase);
+            log_info("phrase.value=%lld\n", phrase.value);
+            phrase = NULL_NUMEX_PHRASE;
+            number_finished = false;
+        }
+
+
+        prev_state = state;
+
+        advance_index = true;
+
+    }
+
+    return phrases;
+}