155 lines
4.8 KiB
C
155 lines
4.8 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
|
|
#include "constants.h"
|
|
#include "log/log.h"
|
|
#include "numex.h"
|
|
#include "numex_rule.h"
|
|
#include "numex_data.c"
|
|
|
|
|
|
int main(int argc, char **argv) {
|
|
char *filename;
|
|
|
|
if (argc == 2) {
|
|
filename = argv[1];
|
|
} else {
|
|
filename = DEFAULT_NUMEX_PATH;
|
|
}
|
|
|
|
FILE *f = fopen(filename, "wb");
|
|
|
|
if (f == NULL) {
|
|
log_error("File could not be opened, ensure directory exists: %s\n", filename);
|
|
numex_module_teardown();
|
|
exit(1);
|
|
}
|
|
|
|
if (!numex_module_init()) {
|
|
log_error("Numex table initialization unsuccessful\n");
|
|
numex_module_teardown();
|
|
exit(1);
|
|
}
|
|
|
|
numex_table_t *numex_table = get_numex_table();
|
|
|
|
size_t num_languages = sizeof(numex_languages) / sizeof(numex_language_source_t);
|
|
|
|
size_t num_source_keys = sizeof(numex_keys) / sizeof(char *);
|
|
size_t num_source_rules = sizeof(numex_rules) / sizeof(numex_rule_t);
|
|
|
|
if (num_source_keys != num_source_rules) {
|
|
log_error("num_sourcE_keys != num_source_rules, aborting\n");
|
|
numex_module_teardown();
|
|
exit(1);
|
|
}
|
|
|
|
size_t num_ordinal_indicator_rules = sizeof(ordinal_indicator_rules) / sizeof(ordinal_indicator_t);
|
|
|
|
char_array *key = char_array_new();
|
|
|
|
for (int i = 0; i < num_languages; i++) {
|
|
numex_language_source_t lang_source = numex_languages[i];
|
|
|
|
char *lang = lang_source.name;
|
|
|
|
int j;
|
|
|
|
size_t rule_index = lang_source.rule_index;
|
|
size_t num_rules = lang_source.num_rules;
|
|
size_t ordinal_indicator_index = lang_source.ordinal_indicator_index;
|
|
size_t num_ordinal_indicators = lang_source.num_ordinal_indicators;
|
|
|
|
numex_rule_t rule;
|
|
|
|
uint32_t value;
|
|
|
|
log_info("Doing language=%s\n", lang);
|
|
|
|
for (j = rule_index; j < rule_index + num_rules; j++) {
|
|
char *numex_key = numex_keys[j];
|
|
numex_rule_t rule = numex_rules[j];
|
|
|
|
value = rule.rule_type != NUMEX_STOPWORD ? numex_table->rules->n : NUMEX_STOPWORD_INDEX;
|
|
numex_rule_array_push(numex_table->rules, rule);
|
|
|
|
char_array_clear(key);
|
|
char_array_cat(key, lang);
|
|
char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
|
|
char_array_cat(key, numex_key);
|
|
|
|
char *str_key = char_array_get_string(key);
|
|
|
|
trie_add(numex_table->trie, str_key, value);
|
|
}
|
|
|
|
for (j = ordinal_indicator_index; j < ordinal_indicator_index + num_ordinal_indicators; j++) {
|
|
value = numex_table->ordinal_indicators->n;
|
|
ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j];
|
|
ordinal_indicator_t *ordinal = ordinal_indicator_new(ordinal_source.key, ordinal_source.gender, ordinal_source.category, ordinal_source.suffix);
|
|
ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal);
|
|
|
|
char_array_clear(key);
|
|
char_array_cat(key, lang);
|
|
char_array_cat(key, ORDINAL_NAMESPACE_PREFIX);
|
|
|
|
switch (ordinal_source.gender) {
|
|
case GENDER_MASCULINE:
|
|
char_array_cat(key, GENDER_MASCULINE_PREFIX);
|
|
break;
|
|
case GENDER_FEMININE:
|
|
char_array_cat(key, GENDER_FEMININE_PREFIX);
|
|
break;
|
|
case GENDER_NEUTER:
|
|
char_array_cat(key, GENDER_NEUTER_PREFIX);
|
|
break;
|
|
case GENDER_NONE:
|
|
default:
|
|
char_array_cat(key, GENDER_NONE_PREFIX);
|
|
}
|
|
|
|
switch (ordinal_source.category) {
|
|
case CATEGORY_PLURAL:
|
|
char_array_cat(key, CATEGORY_PLURAL_PREFIX);
|
|
break;
|
|
case CATEGORY_DEFAULT:
|
|
default:
|
|
char_array_cat(key, CATEGORY_DEFAULT_PREFIX);
|
|
|
|
}
|
|
|
|
char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
|
|
|
|
char *reversed = utf8_reversed_string(ordinal_source.key);
|
|
char_array_cat(key, reversed);
|
|
free(reversed);
|
|
|
|
char *str_key = char_array_get_string(key);
|
|
|
|
if (trie_get(numex_table->trie, str_key) == NULL_NODE_ID) {
|
|
trie_add(numex_table->trie, str_key, value);
|
|
} else {
|
|
log_warn("Key exists: %s, skipping\n", str_key);
|
|
}
|
|
}
|
|
|
|
numex_language_t *language = numex_language_new(lang_source.name, lang_source.whole_tokens_only, lang_source.rule_index, lang_source.num_rules, lang_source.ordinal_indicator_index, lang_source.num_ordinal_indicators);
|
|
numex_table_add_language(language);
|
|
|
|
}
|
|
|
|
char_array_destroy(key);
|
|
|
|
if (!numex_table_write(f)) {
|
|
log_error("Error writing numex table\n");
|
|
exit(1);
|
|
}
|
|
|
|
fclose(f);
|
|
|
|
numex_module_teardown();
|
|
|
|
log_info("Done\n");
|
|
}
|