982 lines
26 KiB
C
982 lines
26 KiB
C
#include <math.h>
|
|
#include <float.h>
|
|
#include "numex.h"
|
|
#include "file_utils.h"
|
|
|
|
/* Magic number stored as the first 32-bit value of a serialized numex table:
   numex_table_write emits it and numex_table_read rejects any file that does
   not start with it. */
#define NUMEX_TABLE_SIGNATURE 0xBBBBBBBB
|
|
|
|
/* String of separator token characters (currently just the ASCII hyphen).
   NOTE(review): not referenced anywhere in the visible part of this file -
   confirm whether it is still needed. */
#define SEPARATOR_TOKENS "-"
|
|
|
|
/* floor(log_base(num)), computed as a ratio of natural logarithms.
   FLT_EPSILON is added before flooring so that a quotient landing a hair
   below an exact integer (e.g. log(1000)/log(10)) is not rounded down.
   Both arguments are parenthesized so arbitrary expressions can be passed;
   each argument is evaluated exactly once. */
#define FLOOR_LOG_BASE(num, base) floor((log((float)(num)) / log((float)(base))) + FLT_EPSILON)
|
|
|
|
/* Module-level table shared by every function in this file. It is NULL until
   numex_module_setup / numex_table_read assigns it, and numex_module_teardown
   resets it to NULL. */
numex_table_t *numex_table = NULL;
|
|
|
|
numex_table_t *get_numex_table(void) {
|
|
return numex_table;
|
|
}
|
|
|
|
void numex_table_destroy(void) {
|
|
numex_table_t *numex_table = get_numex_table();
|
|
if (numex_table == NULL) return;
|
|
|
|
if (numex_table->trie != NULL) {
|
|
trie_destroy(numex_table->trie);
|
|
}
|
|
|
|
if (numex_table->languages != NULL) {
|
|
numex_language_t *language;
|
|
kh_foreach_value(numex_table->languages, language, {
|
|
numex_language_destroy(language);
|
|
})
|
|
|
|
kh_destroy(str_numex_language, numex_table->languages);
|
|
}
|
|
|
|
if (numex_table->rules != NULL) {
|
|
numex_rule_array_destroy(numex_table->rules);
|
|
}
|
|
|
|
if (numex_table->ordinal_indicators != NULL) {
|
|
ordinal_indicator_array_destroy(numex_table->ordinal_indicators);
|
|
}
|
|
|
|
free(numex_table);
|
|
}
|
|
|
|
numex_table_t *numex_table_init(void) {
|
|
numex_table_t *numex_table = get_numex_table();
|
|
|
|
if (numex_table == NULL) {
|
|
numex_table = malloc(sizeof(numex_table_t));
|
|
|
|
if (numex_table == NULL) return NULL;
|
|
|
|
numex_table->trie = trie_new();
|
|
if (numex_table->trie == NULL) {
|
|
goto exit_numex_table_created;
|
|
}
|
|
|
|
|
|
numex_table->languages = kh_init(str_numex_language);
|
|
if (numex_table->languages == NULL) {
|
|
goto exit_numex_table_created;
|
|
}
|
|
|
|
numex_table->rules = numex_rule_array_new();
|
|
if (numex_table->rules == NULL) {
|
|
goto exit_numex_table_created;
|
|
}
|
|
|
|
numex_table->ordinal_indicators = ordinal_indicator_array_new();
|
|
if (numex_table->ordinal_indicators == NULL) {
|
|
goto exit_numex_table_created;
|
|
}
|
|
|
|
}
|
|
|
|
return numex_table;
|
|
exit_numex_table_created:
|
|
numex_table_destroy();
|
|
exit(1);
|
|
}
|
|
|
|
numex_table_t *numex_table_new(void) {
|
|
numex_table_t *numex_table = numex_table_init();
|
|
if (numex_table != NULL) {
|
|
numex_rule_t null_rule = NUMEX_NULL_RULE;
|
|
numex_rule_array_push(numex_table->rules, null_rule);
|
|
numex_rule_t stopword_rule = NUMEX_STOPWORD_RULE;
|
|
numex_rule_array_push(numex_table->rules, stopword_rule);
|
|
}
|
|
return numex_table;
|
|
}
|
|
|
|
|
|
numex_language_t *numex_language_new(char *name, bool whole_tokens_only, size_t rules_index, size_t num_rules, size_t ordinals_index, size_t num_ordinals) {
|
|
numex_language_t *language = malloc(sizeof(numex_language_t));
|
|
if (language == NULL) return NULL;
|
|
|
|
language->name = strdup(name);
|
|
language->whole_tokens_only = whole_tokens_only;
|
|
language->rules_index = rules_index;
|
|
language->num_rules = num_rules;
|
|
language->ordinals_index = ordinals_index;
|
|
language->num_ordinals = num_ordinals;
|
|
|
|
return language;
|
|
}
|
|
|
|
/* Frees a language descriptor together with its name string.
   Passing NULL is a no-op. */
void numex_language_destroy(numex_language_t *self) {
    if (self != NULL) {
        /* free(NULL) is defined to do nothing, so no separate name check. */
        free(self->name);
        free(self);
    }
}
|
|
|
|
/* Registers a language in the module-level table, keyed by language->name.

   On success the table holds the pointer (numex_table_destroy releases it).
   If a different language was already registered under the same name, that
   older descriptor is destroyed and replaced, and the hash key is re-pointed
   at the new descriptor's name so it never refers to freed memory.

   Returns false if there is no table, if language (or its name) is NULL, or
   if the hash insertion fails; true otherwise. */
bool numex_table_add_language(numex_language_t *language) {
    if (numex_table == NULL || numex_table->languages == NULL) {
        return false;
    }

    if (language == NULL || language->name == NULL) {
        return false;
    }

    int ret;
    khiter_t k = kh_put(str_numex_language, numex_table->languages, language->name, &ret);
    if (ret < 0) {
        /* kh_put reports allocation failure with a negative code. */
        return false;
    }

    if (ret == 0) {
        /* Key already present: swap out the previous owner of this slot. */
        numex_language_t *existing = kh_value(numex_table->languages, k);
        if (existing != language) {
            kh_key(numex_table->languages, k) = language->name;
            numex_language_destroy(existing);
        }
    }

    kh_value(numex_table->languages, k) = language;

    return true;
}
|
|
|
|
numex_language_t *get_numex_language(char *name) {
|
|
if (numex_table == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
khiter_t k;
|
|
k = kh_get(str_numex_language, numex_table->languages, name);
|
|
return k != kh_end(numex_table->languages) ? kh_value(numex_table->languages, k) : NULL;
|
|
}
|
|
|
|
/* Deserializes one language record from f.

   Record layout (mirrors numex_language_write): uint64 name length
   (including the terminating NUL), the name bytes, uint8 whole_tokens_only,
   then uint64 rules_index, num_rules, ordinals_index, num_ordinals.

   The name buffer is heap-allocated because its length comes from the file
   and must not be trusted to fit on the stack; it is also force-terminated
   so a corrupt record cannot cause an over-read.

   Returns a newly allocated descriptor, or NULL on a short/invalid read or
   allocation failure. */
numex_language_t *numex_language_read(FILE *f) {
    numex_language_t *language = NULL;
    char *name = NULL;

    uint64_t lang_name_len = 0;
    uint8_t whole_tokens_only = 0;
    uint64_t rules_index = 0;
    uint64_t num_rules = 0;
    uint64_t ordinals_index = 0;
    uint64_t num_ordinals = 0;

    if (!file_read_uint64(f, &lang_name_len)) {
        return NULL;
    }

    /* The writer always stores strlen + 1, so zero is never valid. */
    if (lang_name_len == 0 || lang_name_len > SIZE_MAX) {
        return NULL;
    }

    name = malloc((size_t)lang_name_len);
    if (name == NULL) {
        return NULL;
    }

    if (!file_read_chars(f, name, (size_t)lang_name_len)) {
        goto exit_language_read;
    }
    name[lang_name_len - 1] = '\0';

    if (!file_read_uint8(f, &whole_tokens_only) ||
        !file_read_uint64(f, &rules_index) ||
        !file_read_uint64(f, &num_rules) ||
        !file_read_uint64(f, &ordinals_index) ||
        !file_read_uint64(f, &num_ordinals)) {
        goto exit_language_read;
    }

    /* numex_language_new copies the name, so the buffer is released below
       on every path. */
    language = numex_language_new(name, whole_tokens_only != 0,
                                  (size_t)rules_index, (size_t)num_rules,
                                  (size_t)ordinals_index, (size_t)num_ordinals);

exit_language_read:
    free(name);
    return language;
}
|
|
|
|
/* Serializes one language record to f.

   Fields are written in this order: uint64 name length (strlen + 1, so the
   terminating NUL is included), the name bytes, uint8 whole_tokens_only,
   then uint64 rules_index, num_rules, ordinals_index, num_ordinals.

   Returns true if every write succeeded; stops at the first failure and
   returns false. */
bool numex_language_write(numex_language_t *language, FILE *f) {
    size_t name_size = strlen(language->name) + 1;

    /* || short-circuits, so nothing is written after the first failure. */
    bool failed = !file_write_uint64(f, (uint64_t)name_size) ||
                  !file_write_chars(f, language->name, name_size) ||
                  !file_write_uint8(f, language->whole_tokens_only) ||
                  !file_write_uint64(f, language->rules_index) ||
                  !file_write_uint64(f, language->num_rules) ||
                  !file_write_uint64(f, language->ordinals_index) ||
                  !file_write_uint64(f, language->num_ordinals);

    return !failed;
}
|
|
|
|
/* Deserializes one rule from f into *rule.

   Stream layout (mirrors numex_rule_write): uint64 left_context_type,
   right_context_type, rule_type, gender, category; uint32 radix; uint64
   value.

   The five leading fields are read into 64-bit temporaries and then
   assigned, rather than reading 8 bytes straight into the struct members
   through a pointer cast: the width of those members is not shown in this
   file, and if they are narrower than 64 bits such a read would overrun
   the neighbouring field.

   Returns true on success. On failure returns false; the five leading
   fields of *rule are then left untouched. */
bool numex_rule_read(FILE *f, numex_rule_t *rule) {
    uint64_t left_context_type = 0;
    uint64_t right_context_type = 0;
    uint64_t rule_type = 0;
    uint64_t gender = 0;
    uint64_t category = 0;

    if (!file_read_uint64(f, &left_context_type) ||
        !file_read_uint64(f, &right_context_type) ||
        !file_read_uint64(f, &rule_type) ||
        !file_read_uint64(f, &gender) ||
        !file_read_uint64(f, &category)) {
        return false;
    }

    if (!file_read_uint32(f, &rule->radix)) {
        return false;
    }

    if (!file_read_uint64(f, (uint64_t *)&rule->value)) {
        return false;
    }

    rule->left_context_type = left_context_type;
    rule->right_context_type = right_context_type;
    rule->rule_type = rule_type;
    rule->gender = gender;
    rule->category = category;

    return true;
}
|
|
|
|
/* Serializes one rule to f.

   Fields are written in this order: left_context_type, right_context_type,
   rule_type, gender and category as uint64, radix as uint32, value as
   uint64.

   Returns true if every write succeeded; stops at the first failure and
   returns false. */
bool numex_rule_write(numex_rule_t rule, FILE *f) {
    /* && short-circuits, so a failed write prevents the remaining ones. */
    return file_write_uint64(f, (uint64_t)rule.left_context_type) &&
           file_write_uint64(f, (uint64_t)rule.right_context_type) &&
           file_write_uint64(f, (uint64_t)rule.rule_type) &&
           file_write_uint64(f, (uint64_t)rule.gender) &&
           file_write_uint64(f, (uint64_t)rule.category) &&
           file_write_uint32(f, (uint32_t)rule.radix) &&
           file_write_uint64(f, (uint64_t)rule.value);
}
|
|
|
|
/* Frees an ordinal indicator together with its key and suffix strings.
   Passing NULL is a no-op. */
void ordinal_indicator_destroy(ordinal_indicator_t *self) {
    if (self != NULL) {
        /* free(NULL) is defined to do nothing, so the members need no
           individual checks. */
        free(self->key);
        free(self->suffix);
        free(self);
    }
}
|
|
|
|
/* Allocates an ordinal indicator.

   key      - lookup key; copied with strdup
   gender   - stored as-is
   category - stored as-is
   suffix   - ordinal suffix text; copied with strdup

   Returns the new indicator, or NULL if key or suffix is NULL or an
   allocation fails. Release with ordinal_indicator_destroy(). */
ordinal_indicator_t *ordinal_indicator_new(char *key, gender_t gender, grammatical_category_t category, char *suffix) {
    if (key == NULL || suffix == NULL) {
        return NULL;
    }

    /* calloc so that key and suffix start out NULL: the failure path hands
       the struct to ordinal_indicator_destroy(), which frees both members. */
    ordinal_indicator_t *ordinal = calloc(1, sizeof(*ordinal));
    if (ordinal == NULL) {
        return NULL;
    }

    ordinal->key = strdup(key);
    if (ordinal->key == NULL) {
        ordinal_indicator_destroy(ordinal);
        return NULL;
    }

    ordinal->suffix = strdup(suffix);
    if (ordinal->suffix == NULL) {
        ordinal_indicator_destroy(ordinal);
        return NULL;
    }

    ordinal->category = category;
    ordinal->gender = gender;

    return ordinal;
}
|
|
|
|
ordinal_indicator_t *ordinal_indicator_read(FILE *f) {
|
|
size_t key_len;
|
|
if (!file_read_uint64(f, (uint64_t *)&key_len)) {
|
|
return NULL;
|
|
}
|
|
|
|
char key[key_len];
|
|
|
|
if (!file_read_chars(f, key, key_len)) {
|
|
return NULL;
|
|
}
|
|
|
|
gender_t gender;
|
|
if (!file_read_uint64(f, (uint64_t *)&gender)) {
|
|
return NULL;
|
|
}
|
|
|
|
grammatical_category_t category;
|
|
if (!file_read_uint64(f, (uint64_t *)&category)) {
|
|
return NULL;
|
|
}
|
|
|
|
size_t ordinal_suffix_len;
|
|
if (!file_read_uint64(f, (uint64_t *)&ordinal_suffix_len)) {
|
|
return NULL;
|
|
}
|
|
|
|
char ordinal_suffix[ordinal_suffix_len];
|
|
|
|
if (!file_read_chars(f, ordinal_suffix, ordinal_suffix_len)) {
|
|
return NULL;
|
|
}
|
|
|
|
return ordinal_indicator_new(key, gender, category, ordinal_suffix);
|
|
}
|
|
|
|
|
|
/* Serializes one ordinal indicator to f.

   Fields are written in this order: uint64 key length (strlen + 1), key
   bytes, uint64 gender, uint64 category, uint64 suffix length (strlen + 1),
   suffix bytes.

   Returns true if every write succeeded; stops at the first failure and
   returns false. */
bool ordinal_indicator_write(ordinal_indicator_t *ordinal, FILE *f) {
    size_t key_size = strlen(ordinal->key) + 1;

    bool header_ok = file_write_uint64(f, key_size) &&
                     file_write_chars(f, ordinal->key, key_size) &&
                     file_write_uint64(f, (uint64_t)ordinal->gender) &&
                     file_write_uint64(f, (uint64_t)ordinal->category);
    if (!header_ok) {
        return false;
    }

    /* The suffix length is only computed once the leading fields are out,
       matching the original statement order. */
    size_t suffix_size = strlen(ordinal->suffix) + 1;

    return file_write_uint64(f, suffix_size) &&
           file_write_chars(f, ordinal->suffix, suffix_size);
}
|
|
|
|
|
|
bool numex_table_read(FILE *f) {
|
|
if (f == NULL) {
|
|
log_warn("FILE pointer was NULL in numex_table_read\n");
|
|
return false;
|
|
}
|
|
|
|
uint32_t signature;
|
|
|
|
log_debug("Reading signature\n");
|
|
|
|
if (!file_read_uint32(f, &signature) || signature != NUMEX_TABLE_SIGNATURE) {
|
|
return false;
|
|
}
|
|
|
|
numex_table = numex_table_init();
|
|
|
|
log_debug("Numex table initialized\n");
|
|
|
|
size_t num_languages;
|
|
|
|
if (!file_read_uint64(f, (uint64_t *)&num_languages)) {
|
|
goto exit_numex_table_load_error;
|
|
}
|
|
|
|
log_debug("read num_languages = %d\n", num_languages);
|
|
|
|
int i = 0;
|
|
|
|
numex_language_t *language;
|
|
|
|
for (i = 0; i < num_languages; i++) {
|
|
language = numex_language_read(f);
|
|
if (language == NULL || !numex_table_add_language(language)) {
|
|
goto exit_numex_table_load_error;
|
|
}
|
|
}
|
|
|
|
log_debug("read languages\n");
|
|
|
|
|
|
size_t num_rules;
|
|
|
|
if (!file_read_uint64(f, (uint64_t *)&num_rules)) {
|
|
goto exit_numex_table_load_error;
|
|
}
|
|
|
|
log_debug("read num_rules = %zu\n", num_rules);
|
|
|
|
numex_rule_t rule;
|
|
|
|
for (i = 0; i < num_rules; i++) {
|
|
if (!numex_rule_read(f, &rule)) {
|
|
goto exit_numex_table_load_error;
|
|
}
|
|
numex_rule_array_push(numex_table->rules, rule);
|
|
}
|
|
|
|
log_debug("read rules\n");
|
|
|
|
size_t num_ordinals;
|
|
|
|
if (!file_read_uint64(f, (uint64_t *)&num_ordinals)) {
|
|
goto exit_numex_table_load_error;
|
|
}
|
|
|
|
ordinal_indicator_t *ordinal;
|
|
|
|
for (i = 0; i < num_ordinals; i++) {
|
|
ordinal = ordinal_indicator_read(f);
|
|
if (ordinal == NULL) {
|
|
goto exit_numex_table_load_error;
|
|
}
|
|
ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal);
|
|
}
|
|
|
|
trie_destroy(numex_table->trie);
|
|
|
|
numex_table->trie = trie_read(f);
|
|
if (numex_table->trie == NULL) {
|
|
goto exit_numex_table_load_error;
|
|
}
|
|
|
|
log_debug("read trie\n");
|
|
|
|
return true;
|
|
|
|
exit_numex_table_load_error:
|
|
numex_table_destroy();
|
|
return false;
|
|
}
|
|
|
|
/* Opens filename in binary read mode and loads the numex table from it via
   numex_table_read().
   Returns false if filename is NULL, the file cannot be opened, or reading
   fails; true otherwise. The file is always closed before returning. */
bool numex_table_load(char *filename) {
    if (filename == NULL) {
        return false;
    }

    FILE *f = fopen(filename, "rb");
    if (f == NULL) {
        return false;
    }

    bool ret = numex_table_read(f);
    fclose(f);
    return ret;
}
|
|
|
|
bool numex_table_write(FILE *f) {
|
|
if (!file_write_uint32(f, (uint32_t)NUMEX_TABLE_SIGNATURE)) {
|
|
return false;
|
|
}
|
|
|
|
size_t num_languages = kh_size(numex_table->languages);
|
|
|
|
if (!file_write_uint64(f, (uint64_t)num_languages)) {
|
|
return false;
|
|
}
|
|
|
|
numex_language_t *language;
|
|
|
|
kh_foreach_value(numex_table->languages, language, {
|
|
if (!numex_language_write(language, f)) {
|
|
return false;
|
|
}
|
|
})
|
|
|
|
size_t num_rules = numex_table->rules->n;
|
|
|
|
if (!file_write_uint64(f, (uint64_t)num_rules)) {
|
|
return false;
|
|
}
|
|
|
|
numex_rule_t rule;
|
|
|
|
int i = 0;
|
|
|
|
for (i = 0; i < num_rules; i++) {
|
|
rule = numex_table->rules->a[i];
|
|
|
|
if (!numex_rule_write(rule, f)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
size_t num_ordinals = numex_table->ordinal_indicators->n;
|
|
|
|
if (!file_write_uint64(f, (uint64_t)num_ordinals)) {
|
|
return false;
|
|
}
|
|
|
|
ordinal_indicator_t *ordinal;
|
|
|
|
for (i = 0; i < num_ordinals; i++) {
|
|
ordinal = numex_table->ordinal_indicators->a[i];
|
|
|
|
if (!ordinal_indicator_write(ordinal, f)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (!trie_write(numex_table->trie, f)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool numex_table_save(char *filename) {
|
|
if (numex_table == NULL || filename == NULL) {
|
|
return false;
|
|
}
|
|
|
|
FILE *f;
|
|
|
|
if ((f = fopen(filename, "wb")) != NULL) {
|
|
bool ret = numex_table_write(f);
|
|
fclose(f);
|
|
return ret;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* Initializes numex trie/module
|
|
Must be called only once before the module can be used
|
|
*/
|
|
bool numex_module_setup(char *filename) {
|
|
if (filename == NULL) {
|
|
numex_table = numex_table_new();
|
|
return numex_table != NULL;
|
|
} else if (numex_table == NULL) {
|
|
return numex_table_load(filename);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/* Teardown method for the module
|
|
Called once when done with the module (usually at
|
|
the end of a main method)
|
|
*/
|
|
void numex_module_teardown(void) {
|
|
numex_table_destroy();
|
|
numex_table = NULL;
|
|
}
|
|
|
|
/* Empty result used to (re)initialize the accumulator in
   convert_numeric_expressions. The initializer is positional;
   NOTE(review): presumably the order is value, gender, category, is_ordinal,
   start, len - confirm against the numex_result_t declaration. */
#define NULL_NUMEX_RESULT (numex_result_t) {0, GENDER_NONE, CATEGORY_DEFAULT, false, 0, 0}
|
|
|
|
/* States of the scanner in convert_numeric_expressions. */
typedef enum {
    NUMEX_SEARCH_STATE_BEGIN = 0,          /* initial state; also used for the start state */
    NUMEX_SEARCH_STATE_SKIP_TOKEN = 1,     /* no phrase matched here; consume characters one at a time */
    NUMEX_SEARCH_STATE_PARTIAL_MATCH = 2,  /* not assigned anywhere in the visible code */
    NUMEX_SEARCH_STATE_MATCH = 3           /* a trie phrase matched at the current position */
} numex_search_state_type;
|
|
|
|
typedef struct numex_search_state {
|
|
uint32_t node_id;
|
|
numex_search_state_type state;
|
|
} numex_search_state_t;
|
|
|
|
/* Default scanner state: no trie node (NULL_NODE_ID) and the BEGIN state. */
#define NULL_NUMEX_SEARCH_STATE (numex_search_state_t) {NULL_NODE_ID, NUMEX_SEARCH_STATE_BEGIN}
|
|
|
|
|
|
static inline numex_rule_t get_numex_rule(size_t i) {
|
|
if (i >= numex_table->rules->n) return NUMEX_NULL_RULE;
|
|
return numex_table->rules->a[i];
|
|
}
|
|
|
|
numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
|
if (numex_table == NULL) return NULL;
|
|
|
|
trie_t *trie = numex_table->trie;
|
|
if (trie == NULL) return NULL;
|
|
|
|
numex_language_t *language = get_numex_language(lang);
|
|
|
|
if (language == NULL) return NULL;
|
|
|
|
bool whole_tokens_only = language->whole_tokens_only;
|
|
|
|
trie_prefix_result_t prefix = trie_get_prefix(trie, lang);
|
|
|
|
if (prefix.node_id == NULL_NODE_ID) {
|
|
return NULL;
|
|
}
|
|
|
|
prefix = trie_get_prefix_from_index(trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, prefix.node_id, prefix.tail_pos);
|
|
|
|
if (prefix.node_id == NULL_NODE_ID) {
|
|
return NULL;
|
|
}
|
|
|
|
numex_result_t prev_result = NULL_NUMEX_RESULT;
|
|
numex_result_t result = prev_result;
|
|
|
|
size_t prev_result_len = 0;
|
|
|
|
numex_result_array *results = NULL;
|
|
|
|
numex_rule_t prev_rule = NUMEX_NULL_RULE;
|
|
numex_rule_t rule = prev_rule;
|
|
|
|
numex_search_state_t state = NULL_NUMEX_SEARCH_STATE;
|
|
|
|
numex_search_state_t start_state = NULL_NUMEX_SEARCH_STATE;
|
|
uint32_t start_node_id = prefix.node_id;
|
|
start_state.node_id = start_node_id;
|
|
|
|
numex_search_state_t prev_state = start_state;
|
|
|
|
size_t len = strlen(str);
|
|
size_t idx = 0;
|
|
|
|
int cat;
|
|
int32_t codepoint = 0;
|
|
ssize_t char_len = 0;
|
|
uint8_t *ptr = (uint8_t *)str;
|
|
unsigned char ch = '\0';
|
|
|
|
bool advance_index = true;
|
|
bool advance_state = true;
|
|
|
|
bool number_finished = false;
|
|
|
|
bool is_space = false;
|
|
bool is_hyphen = false;
|
|
|
|
char_array *number_str = NULL;
|
|
|
|
bool last_was_separator = false;
|
|
bool possible_complete_token = false;
|
|
bool complete_token = false;
|
|
|
|
log_debug("Converting numex for str=%s, lang=%s\n", str, lang);
|
|
|
|
while (idx < len) {
|
|
if (state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN) {
|
|
char_len = utf8proc_iterate(ptr, len, &codepoint);
|
|
cat = utf8proc_category(codepoint);
|
|
|
|
if (codepoint == 0) break;
|
|
|
|
is_space = utf8_is_separator(cat);
|
|
if (is_space) {
|
|
log_debug("is_space\n");
|
|
is_hyphen = false;
|
|
} else {
|
|
is_hyphen = utf8_is_hyphen(codepoint);
|
|
if (is_hyphen) {
|
|
log_debug("is_hyphen\n");
|
|
}
|
|
}
|
|
|
|
idx += char_len;
|
|
ptr += char_len;
|
|
|
|
if (is_space || is_hyphen) {
|
|
state = start_state;
|
|
last_was_separator = true;
|
|
if (possible_complete_token) {
|
|
log_debug("Complete token\n");
|
|
complete_token = true;
|
|
possible_complete_token = false;
|
|
} else if (prev_state.state == NUMEX_SEARCH_STATE_MATCH) {
|
|
log_debug("Complete token\n");
|
|
complete_token = true;
|
|
prev_state = NULL_NUMEX_SEARCH_STATE;
|
|
} else {
|
|
complete_token = false;
|
|
}
|
|
} else if (whole_tokens_only && last_was_separator) {
|
|
log_debug("last was separator\n");
|
|
last_was_separator = false;
|
|
possible_complete_token = true;
|
|
} else {
|
|
log_debug("other char\n");
|
|
if (result.len > 0 && (!whole_tokens_only || (prev_state.state != NUMEX_SEARCH_STATE_MATCH && complete_token))) {
|
|
results = (results != NULL) ? results : numex_result_array_new_size(1);
|
|
numex_result_array_push(results, result);
|
|
log_debug("Adding phrase from partial token, value=%lld\n", result.value);
|
|
prev_rule = rule = NUMEX_NULL_RULE;
|
|
}
|
|
result = NULL_NUMEX_RESULT;
|
|
rule = prev_rule = NUMEX_NULL_RULE;
|
|
last_was_separator = false;
|
|
possible_complete_token = false;
|
|
complete_token = false;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
phrase_t phrase = trie_search_prefixes_from_index(trie, str + idx, start_node_id);
|
|
|
|
state = start_state;
|
|
|
|
if (phrase.len == 0) {
|
|
log_debug("phrase.len == 0, skipping token\n");
|
|
last_was_separator = false;
|
|
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
|
|
continue;
|
|
}
|
|
|
|
uint32_t rule_index = phrase.data;
|
|
|
|
bool set_rule = false;
|
|
state.state = NUMEX_SEARCH_STATE_MATCH;
|
|
|
|
log_debug("phrase.len=%lld, phrase.data=%d\n", phrase.len, phrase.data);
|
|
|
|
rule = get_numex_rule((size_t)phrase.data);
|
|
log_debug("rule.value=%lld\n", rule.value);
|
|
|
|
if (rule.rule_type != NUMEX_NULL) {
|
|
set_rule = true;
|
|
|
|
if (rule.gender != GENDER_NONE) {
|
|
result.gender = rule.gender;
|
|
}
|
|
|
|
if (rule.category != CATEGORY_DEFAULT) {
|
|
result.category = rule.category;
|
|
}
|
|
|
|
/* e.g. in English, "two hundred", when you get to hundred, multiply by the
|
|
left value mod the current value, which also covers things like
|
|
"one thousand two hundred" although those cases should be less commmon in addresses
|
|
*/
|
|
|
|
if (result.len == 0) {
|
|
result.start = idx + phrase.start;
|
|
}
|
|
result.len = idx + phrase.start + phrase.len - result.start;
|
|
|
|
log_debug("idx=%d, phrase.len=%d\n", idx, phrase.len);
|
|
|
|
log_debug("prev_rule.radix=%d\n", prev_rule.radix);
|
|
|
|
if (rule.left_context_type == NUMEX_LEFT_CONTEXT_MULTIPLY) {
|
|
int64_t multiplier = result.value % rule.value;
|
|
if (multiplier != 0) {
|
|
result.value -= multiplier;
|
|
} else {
|
|
multiplier = 1;
|
|
}
|
|
result.value += rule.value * multiplier;
|
|
log_debug("LEFT_CONTEXT_MULTIPLY, value = %lld\n", result.value);
|
|
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_ADD) {
|
|
result.value += rule.value;
|
|
log_debug("LEFT_CONTEXT_ADD, value = %lld\n", result.value);
|
|
} else if (prev_rule.right_context_type == NUMEX_RIGHT_CONTEXT_ADD && rule.value > 0 && prev_rule.radix > 0 &&
|
|
FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
|
|
result.value += rule.value;
|
|
log_debug("Last token was RIGHT_CONTEXT_ADD, value=%lld\n", result.value);
|
|
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) {
|
|
log_debug("Had previous token with no context, finishing previous rule before returning\n");
|
|
|
|
result.len = prev_result_len;
|
|
number_finished = true;
|
|
advance_index = false;
|
|
state = start_state;
|
|
rule = prev_rule = NUMEX_NULL_RULE;
|
|
prev_result_len = 0;
|
|
} else if (rule.rule_type != NUMEX_STOPWORD) {
|
|
result.value = rule.value;
|
|
log_debug("Got number, result.value=%lld\n", result.value);
|
|
} else if (rule.rule_type == NUMEX_STOPWORD && prev_rule.rule_type == NUMEX_NULL) {
|
|
log_debug("numex stopword\n");
|
|
rule = NUMEX_NULL_RULE;
|
|
}
|
|
|
|
if (rule.rule_type != NUMEX_STOPWORD) {
|
|
prev_rule = rule;
|
|
prev_result_len = result.len;
|
|
}
|
|
|
|
if (rule.rule_type == NUMEX_ORDINAL_RULE) {
|
|
result.is_ordinal = true;
|
|
if (rule.right_context_type == NUMEX_RIGHT_CONTEXT_NONE && !whole_tokens_only) {
|
|
number_finished = true;
|
|
}
|
|
log_debug("rule is ordinal\n");
|
|
}
|
|
|
|
if (rule.rule_type != NUMEX_NULL && idx + phrase.start + phrase.len == len) {
|
|
number_finished = true;
|
|
}
|
|
}
|
|
if (!set_rule) {
|
|
rule = prev_rule = NUMEX_NULL_RULE;
|
|
log_debug("Resetting rules to NUMEX_NULL_RULE\n");
|
|
}
|
|
|
|
set_rule = false;
|
|
|
|
if (advance_index) {
|
|
idx += phrase.start + phrase.len;
|
|
ptr += phrase.start + phrase.len;
|
|
}
|
|
|
|
advance_index = true;
|
|
|
|
if (number_finished) {
|
|
results = (results != NULL) ? results : numex_result_array_new_size(1);
|
|
numex_result_array_push(results, result);
|
|
log_debug("Adding phrase, value=%lld\n", result.value);
|
|
result = NULL_NUMEX_RESULT;
|
|
number_finished = false;
|
|
}
|
|
|
|
prev_state = state;
|
|
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
/* Finds the ordinal suffix for a number written in digits.

   numeric_string - the number as a digit string
   lang           - language name
   result         - supplies the gender and category used to pick the suffix

   The trie is walked along "<lang>", the ordinal namespace prefix, a gender
   prefix, a category prefix and the namespace separator; the suffix search
   on numeric_string then starts from that node. If no entry exists for the
   requested gender (or category), the lookup falls back to the "none"
   gender (or default category) branch.

   Returns a pointer to the suffix string owned by the table (do not free),
   or NULL if an argument is NULL, the module is not set up, the language is
   unknown, or no suffix matches. */
char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result) {
    if (numeric_string == NULL || lang == NULL) {
        return NULL;
    }

    if (numex_table == NULL) {
        return NULL;
    }

    trie_t *trie = numex_table->trie;
    if (trie == NULL) {
        return NULL;
    }

    /* Only used to confirm that the language is registered. */
    if (get_numex_language(lang) == NULL) {
        return NULL;
    }

    trie_prefix_result_t prefix = trie_get_prefix(trie, lang);

    if (prefix.node_id == NULL_NODE_ID) {
        return NULL;
    }

    prefix = trie_get_prefix_from_index(trie, ORDINAL_NAMESPACE_PREFIX, ORDINAL_NAMESPACE_PREFIX_LEN, prefix.node_id, prefix.tail_pos);

    if (prefix.node_id == NULL_NODE_ID) {
        return NULL;
    }

    trie_prefix_result_t ordinal_prefix = prefix;

    char *gender = GENDER_NONE_PREFIX;
    if (result.gender == GENDER_FEMININE) {
        gender = GENDER_FEMININE_PREFIX;
    } else if (result.gender == GENDER_MASCULINE) {
        gender = GENDER_MASCULINE_PREFIX;
    } else if (result.gender == GENDER_NEUTER) {
        gender = GENDER_NEUTER_PREFIX;
    }

    prefix = trie_get_prefix_from_index(trie, gender, strlen(gender), ordinal_prefix.node_id, ordinal_prefix.tail_pos);

    /* No entries for this specific gender: retry with the genderless branch. */
    if (prefix.node_id == NULL_NODE_ID && result.gender != GENDER_NONE) {
        prefix = trie_get_prefix_from_index(trie, GENDER_NONE_PREFIX, strlen(GENDER_NONE_PREFIX), ordinal_prefix.node_id, ordinal_prefix.tail_pos);
    }

    if (prefix.node_id == NULL_NODE_ID) {
        return NULL;
    }

    trie_prefix_result_t gender_prefix = prefix;

    char *category = CATEGORY_DEFAULT_PREFIX;

    if (result.category == CATEGORY_PLURAL) {
        category = CATEGORY_PLURAL_PREFIX;
    }

    prefix = trie_get_prefix_from_index(trie, category, strlen(category), gender_prefix.node_id, gender_prefix.tail_pos);

    /* No entries for this specific category: retry with the default one. */
    if (prefix.node_id == NULL_NODE_ID && result.category != CATEGORY_DEFAULT) {
        prefix = trie_get_prefix_from_index(trie, CATEGORY_DEFAULT_PREFIX, strlen(CATEGORY_DEFAULT_PREFIX), gender_prefix.node_id, gender_prefix.tail_pos);
    }

    if (prefix.node_id == NULL_NODE_ID) {
        return NULL;
    }

    prefix = trie_get_prefix_from_index(trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, prefix.node_id, prefix.tail_pos);

    if (prefix.node_id == NULL_NODE_ID) {
        return NULL;
    }

    phrase_t phrase = trie_search_suffixes_from_index(trie, numeric_string, prefix.node_id);

    if (phrase.len == 0) {
        return NULL;
    }

    /* The phrase payload indexes the ordinal-indicator array; reject
       anything out of range. */
    if (phrase.data >= numex_table->ordinal_indicators->n) {
        return NULL;
    }

    ordinal_indicator_t *ordinal = numex_table->ordinal_indicators->a[phrase.data];
    if (ordinal == NULL) {
        return NULL;
    }

    return ordinal->suffix;
}
|
|
|
|
char *replace_numeric_expressions(char *str, char *lang) {
|
|
numex_result_array *results = convert_numeric_expressions(str, lang);
|
|
if (results == NULL) return NULL;
|
|
|
|
size_t len = strlen(str);
|
|
|
|
char_array *replacement = char_array_new_size(len);
|
|
int start = 0;
|
|
int end = 0;
|
|
|
|
for (int i = 0; i < results->n; i++) {
|
|
numex_result_t result = results->a[i];
|
|
|
|
end = result.start;
|
|
|
|
char numeric_string[INT64_MAX_STRING_SIZE] = {0};
|
|
sprintf(numeric_string, "%" PRId64, result.value);
|
|
|
|
char_array_append_len(replacement, str + start, end - start);
|
|
char_array_append(replacement, numeric_string);
|
|
|
|
if (result.is_ordinal) {
|
|
char *ordinal_suffix = get_ordinal_suffix(numeric_string, lang, result);
|
|
char_array_append(replacement, ordinal_suffix);
|
|
}
|
|
|
|
start = result.start + result.len;
|
|
}
|
|
|
|
end = start;
|
|
char_array_append_len(replacement, str + end, len - end);
|
|
char_array_terminate(replacement);
|
|
numex_result_array_destroy(results);
|
|
|
|
return char_array_to_string(replacement);
|
|
}
|
|
|