From 3a74a8c179c928702d55adc1f210f0c09b314570 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 16 May 2015 23:22:16 -0400 Subject: [PATCH] [transliteration] script to build transliteration table, trie, C structures, etc. from the rules --- src/transliteration_table_builder.c | 601 ++++++++++++++++++++++++++++ 1 file changed, 601 insertions(+) create mode 100644 src/transliteration_table_builder.c diff --git a/src/transliteration_table_builder.c b/src/transliteration_table_builder.c new file mode 100644 index 00000000..9d81ca55 --- /dev/null +++ b/src/transliteration_table_builder.c @@ -0,0 +1,601 @@ +/* transliteration_table_create.c +Creates the transliteration data structures from generated rule file. +Only used once at setup/make time, not overly concerned with optimization +*/ + +#include +#include + +#include "collections.h" +#include "log/log.h" +#include "klib/ksort.h" +#include "string_utils.h" +#include "trie.h" +#include "transliterate.h" +#include "transliteration_rule.h" +#include "transliteration_data.c" + +#include "utf8proc/utf8proc.h" + +#define MAX_UTF8_CHAR_SIZE 4 + +string_tree_t *regex_string_tree(char *regex, size_t len) { + uint8_t *char_ptr = (uint8_t *)regex; + char last_ch = '\0'; + bool in_group = false; + bool in_set = false; + bool in_brackets = false; + + int32_t codepoint; + int32_t last_codepoint = 0; + ssize_t char_len; + + size_t bracket_start; + size_t bracket_len; + + char temp_char[MAX_UTF8_CHAR_SIZE]; + ssize_t temp_char_len; + + string_tree_t *tree = string_tree_new(); + + if (len == 0) { + // Single token with no + string_tree_add_string_len(tree, regex, len); + string_tree_finalize_token(tree); + return tree; + } + + uint32_array *char_set = uint32_array_new(); + + size_t idx = 0; + + int i, j, k; + + bool add_to_index = false; + + while (idx < len) { + char_len = utf8proc_iterate(char_ptr, len, &codepoint); + if (char_len <= 0) { + uint32_array_destroy(char_set); + string_tree_destroy(tree); + return NULL; + } + + if (!(utf8proc_codepoint_valid(codepoint))) { + idx += char_len; + char_ptr += char_len; + continue; + } + + add_to_index = true; + + if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("begin set\n"); + in_set = true; + add_to_index = false; + uint32_array_clear(char_set); + } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_set) { + log_debug("end set"); + + for (j = 0; j < char_set->n; j++) { + temp_char_len = utf8proc_encode_char(char_set->a[j], (uint8_t *)temp_char); + log_debug("Adding string %.*s\n", temp_char_len, temp_char); + string_tree_add_string_len(tree, temp_char, temp_char_len); + } + string_tree_finalize_token(tree); + + uint32_array_clear(char_set); + // Add a special codepoint to the sequence to distinguish from an escaped square bracket + codepoint = END_SET_CODEPOINT; + in_set = false; + add_to_index = false; + } else if (codepoint == LCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + in_brackets = true; + bracket_start = idx + char_len; + bracket_len = 0; + } else if (codepoint == RCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_brackets) { + log_debug("Adding bracketed string: %.*s\n", regex + bracket_start, bracket_len); + string_tree_add_string_len(tree, regex + bracket_start, bracket_len); + in_brackets = false; + } else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("group\n"); + add_to_index = false; + } else if (codepoint == STAR_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("star\n"); + // For *, we add an optional transition to the empty + codepoint = REPEAT_ZERO_CODEPOINT; + } else if (codepoint == PLUS_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("plus\n"); + codepoint = REPEAT_ONE_CODEPOINT; + } else if (codepoint == DOLLAR_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("dollar\n"); + codepoint = WORD_BOUNDARY_CODEPOINT; + + if (in_set) { + uint32_array_push(char_set, codepoint); + add_to_index = false; + } + } else if (in_set) { + log_debug("in set\n"); + // Queue node, we'll add them to the trie + uint32_array_push(char_set, codepoint); + add_to_index = false; + } else if (in_brackets) { + add_to_index = false; + bracket_len += char_len; + } else if (codepoint == BACKSLASH_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + add_to_index = false; + } + + log_debug("codepoint = %d\n", codepoint); + + if (add_to_index) { + temp_char_len = utf8proc_encode_char(codepoint, (uint8_t *)temp_char); + log_debug("char = %.*s\n", temp_char_len, temp_char); + string_tree_add_string_len(tree, temp_char, temp_char_len); + string_tree_finalize_token(tree); + } + + idx += char_len; + char_ptr += char_len; + } + + uint32_array_destroy(char_set); + + return tree; + +} + +group_capture_array *parse_groups(char *regex, size_t len) { + uint8_t *char_ptr = (uint8_t *)regex; + char last_ch = '\0'; + bool in_group = false; + bool in_set = false; + + int32_t codepoint, last_codepoint = 0; + ssize_t char_len; + + char temp_char[MAX_UTF8_CHAR_SIZE]; + ssize_t temp_char_len; + + + if (len == 0) { + return NULL; + } + + group_capture_array *groups = group_capture_array_new_size(1); + + size_t idx = 0; + + int i, j, k; + + size_t pos = 0; + size_t group_start = 0; + size_t chars_in_group = 0; + + while (idx < len) { + char_len = utf8proc_iterate(char_ptr, len, &codepoint); + if (char_len <= 0) { + log_error("char %s had len=%zd\n", char_ptr, char_len); + return NULL; + } + + if (!(utf8proc_codepoint_valid(codepoint))) { + idx += char_len; + char_ptr += char_len; + pos++; + continue; + } + + if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("begin set\n"); + in_set = true; + } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("end set"); + pos++; + in_set = false; + } else if (codepoint == LPAREN_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("begin group\n"); + in_group = true; + group_start = pos; + } else if (codepoint == RPAREN_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { + log_debug("close group\n"); + in_group = false; + group_capture_array_push(groups, (group_capture_t){group_start, pos - group_start}); + } else { + log_debug("other char\n"); + pos++; + } + + idx += char_len; + char_ptr += char_len; + + } + + return groups; + +} + + +// Methods used by trie builder and setup/teardown +bool transliteration_table_add_step(transliteration_table_t *self, step_type_t type, char *name) { + transliteration_step_t *step = transliteration_step_new(name, type); + + step_array_push(self->steps, step); + return true; +} + +int main(int argc, char **argv) { + char *filename; + + if (argc == 2) { + filename = argv[1]; + } else { + filename = DEFAULT_TRANSLITERATION_PATH; + } + + FILE *f = fopen(filename, "wb"); + + if (f == NULL) { + log_error("File could not be opened, ensure directory exists: %s", filename); + exit(1); + } + + size_t num_source_rules = sizeof(rules_source) / sizeof(transliteration_rule_source_t); + size_t num_source_steps = sizeof(steps_source) / sizeof(transliteration_step_source_t); + size_t num_source_transliterators = sizeof(transliterators_source) / sizeof(transliterator_source_t); + + char *key; + size_t key_len; + + context_type_t pre_context_type; + size_t pre_context_max_len; + char *pre_context; + size_t pre_context_len; + + context_type_t post_context_type; + size_t post_context_max_len; + char *post_context; + size_t post_context_len; + + char *replacement; + size_t replacement_len; + + int move; + char *group_regex_str; + size_t group_regex_len; + + transliteration_module_setup(NULL); + + transliteration_table_t *trans_table = get_transliteration_table(); + + trie_t *trie = trans_table->trie; + + for (int i = 0; i < num_source_transliterators; i++) { + transliterator_source_t trans_source = transliterators_source[i]; + + size_t trans_name_len = strlen(trans_source.name); + + log_info("Doing transliterator: %s\n", trans_source.name); + + char_array *trans_key = char_array_from_string(trans_source.name); + char_array_cat(trans_key, NAMESPACE_SEPARATOR_CHAR); + + transliterator_t *trans = transliterator_new(trans_source.name, trans_source.internal, trans_table->steps->n, trans_source.steps_length); + + for (int j = 0; j < trans_source.steps_length; j++) { + transliteration_step_source_t step_source = steps_source[trans_source.steps_start + j]; + + size_t step_name_len = strlen(step_source.name); + + log_info("Doing step: %s, type=%d\n", step_source.name, step_source.type); + + if (!transliteration_table_add_step(trans_table, step_source.type, step_source.name)) { + log_error("Step couldn't be added\n"); + goto exit_teardown; + } + + if (step_source.type != STEP_RULESET) { + continue; + } + + char_array *step_key = char_array_from_string(char_array_get_string(trans_key)); + char_array_cat(step_key, step_source.name); + char_array_cat(step_key, NAMESPACE_SEPARATOR_CHAR); + + for (int k = 0; k < step_source.rules_length; k++) { + transliteration_rule_source_t rule_source = rules_source[step_source.rules_start + k]; + key = rule_source.key; + key_len = rule_source.key_len; + + pre_context_type = rule_source.pre_context_type; + pre_context_max_len = rule_source.pre_context_max_len; + pre_context = rule_source.pre_context; + pre_context_len = rule_source.pre_context_len; + + post_context_type = rule_source.post_context_type; + post_context_max_len = rule_source.post_context_max_len; + post_context = rule_source.post_context; + post_context_len = rule_source.post_context_len; + + replacement = rule_source.replacement; + replacement_len = rule_source.replacement_len; + + move = rule_source.move; + group_regex_str = rule_source.group_regex_str; + group_regex_len = rule_source.group_regex_len; + + uint32_t data = trans_table->replacements->n; + + char_array *rule_key = char_array_from_string(char_array_get_string(step_key)); + size_t step_len = rule_key->n; + + uint32_t replacement_string_index = cstring_array_num_strings(trans_table->replacement_strings); + cstring_array_add_string_len(trans_table->replacement_strings, replacement, replacement_len); + + group_capture_array *groups = parse_groups(group_regex_str, group_regex_len); + + transliteration_replacement_t *trans_repl = transliteration_replacement_new(replacement_string_index, move, groups); + + uint32_t replacement_index = trans_table->replacements->n; + transliteration_replacement_array_push(trans_table->replacements, trans_repl); + + int c; + + char *token; + + string_tree_t *tree = regex_string_tree(key, key_len); + + string_tree_t *pre_context_tree = NULL; + string_tree_iterator_t *pre_context_iter = NULL; + + cstring_array *pre_context_strings = NULL; + + if (pre_context_type != CONTEXT_TYPE_NONE) { + pre_context_strings = cstring_array_new(); + } + + if (pre_context_type == CONTEXT_TYPE_REGEX) { + log_debug("pre_context_type == CONTEXT_TYPE_REGEX\n"); + pre_context_tree = regex_string_tree(pre_context, pre_context_len); + + pre_context_iter = string_tree_iterator_new(pre_context_tree); + + char_array *pre_context_perm = char_array_new_size(pre_context_len); + + for (; string_tree_iterator_done(pre_context_iter); string_tree_iterator_next(pre_context_iter)) { + char_array_clear(pre_context_perm); + for (c = 0; c < pre_context_iter->num_tokens; c++) { + token = string_tree_iterator_get_string(pre_context_iter, c); + if (token == NULL || strlen(token) == 0) { + log_warn("pre_token_context is NULL or 0 length: %s\n", token); + } + char_array_cat(pre_context_perm, token); + } + token = char_array_get_string(pre_context_perm); + if (token == NULL || strlen(token) == 0) { + log_warn("pre_perm is NULL or 0 length\n"); + } + cstring_array_add_string(pre_context_strings, token); + } + + char_array_destroy(pre_context_perm); + string_tree_iterator_destroy(pre_context_iter); + string_tree_destroy(pre_context_tree); + } else if (pre_context_type == CONTEXT_TYPE_STRING) { + if (pre_context == NULL || strlen(pre_context) == 0) { + log_warn("pre_context STRING NULL or 0 length\n"); + } + cstring_array_add_string(pre_context_strings, pre_context); + } else if (pre_context_type == CONTEXT_TYPE_WORD_BOUNDARY) { + cstring_array_add_string(pre_context_strings, WORD_BOUNDARY_CHAR); + } + + size_t num_pre_context_strings; + if (pre_context_type != CONTEXT_TYPE_NONE) { + num_pre_context_strings = cstring_array_num_strings(pre_context_strings); + log_info("num_pre_context_strings = %zu\n", num_pre_context_strings); + } else { + num_pre_context_strings = 0; + } + + + string_tree_t *post_context_tree = NULL; + string_tree_iterator_t *post_context_iter = NULL; + + cstring_array *post_context_strings = NULL; + + if (post_context_type != CONTEXT_TYPE_NONE) { + post_context_strings = cstring_array_new(); + } + + if (post_context_type == CONTEXT_TYPE_REGEX) { + log_debug("post_context_type == CONTEXT_TYPE_REGEX\n"); + post_context_tree = regex_string_tree(post_context, post_context_len); + + post_context_iter = string_tree_iterator_new(post_context_tree); + + char_array *post_context_perm = char_array_new_size(post_context_len); + + for (; string_tree_iterator_done(post_context_iter); string_tree_iterator_next(post_context_iter)) { + char_array_clear(post_context_perm); + for (c = 0; c < post_context_iter->num_tokens; c++) { + token = string_tree_iterator_get_string(post_context_iter, c); + if (token == NULL || strlen(token) == 0) { + log_error("post_token_context is NULL or 0 length\n"); + } + char_array_cat(post_context_perm, token); + } + + cstring_array_add_string(post_context_strings, char_array_get_string(post_context_perm)); + } + + char_array_destroy(post_context_perm); + string_tree_iterator_destroy(post_context_iter); + string_tree_destroy(post_context_tree); + } else if (post_context_type == CONTEXT_TYPE_STRING) { + if (post_context == NULL || strlen(post_context) == 0) { + log_error("post_context STRING NULL or 0 length\n"); + } + cstring_array_add_string(post_context_strings, post_context); + } else if (post_context_type == CONTEXT_TYPE_WORD_BOUNDARY) { + cstring_array_add_string(post_context_strings, WORD_BOUNDARY_CHAR); + } + + size_t num_post_context_strings = 0; + if (post_context_type != CONTEXT_TYPE_NONE) { + num_post_context_strings = cstring_array_num_strings(post_context_strings); + log_info("num_post_context_strings = %zu\n", num_post_context_strings); + } + + cstring_array *context_strings = NULL; + size_t num_context_strings = 0; + char *context_start_char = NULL; + bool combined_context_strings = false; + + int ante, post; + + if (num_pre_context_strings > 0 && num_post_context_strings > 0) { + context_start_char = PRE_CONTEXT_CHAR; + combined_context_strings = true; + size_t max_string_size = 2 * MAX_UTF8_CHAR_SIZE + + ((pre_context_max_len * MAX_UTF8_CHAR_SIZE) * + (post_context_max_len * MAX_UTF8_CHAR_SIZE)); + num_context_strings = num_pre_context_strings * num_post_context_strings; + char_array *context = char_array_new_size(max_string_size); + context_strings = cstring_array_new_size(num_context_strings * max_string_size + num_context_strings); + for (ante = 0; ante < num_pre_context_strings; ante++) { + char_array_clear(context); + + token = cstring_array_get_token(pre_context_strings, ante); + if (token == NULL || strlen(token) == 0) { + log_error("pre_context token was NULL or 0 length\n"); + goto exit_teardown; + } + + char_array_cat(context, token); + + for (post = 0; post < num_post_context_strings; post++) { + char_array_cat(context, POST_CONTEXT_CHAR); + token = cstring_array_get_token(post_context_strings, post); + char_array_cat(context, token); + if (token == NULL || strlen(token) == 0) { + log_error("post_context token was NULL or 0 length\n"); + goto exit_teardown; + } + + token = char_array_get_string(context); + cstring_array_add_string(context_strings, token); + + } + + } + + char_array_destroy(context); + + } else if (num_pre_context_strings > 0) { + context_start_char = PRE_CONTEXT_CHAR; + num_context_strings = num_pre_context_strings; + context_strings = pre_context_strings; + } else if (num_post_context_strings > 0) { + context_start_char = POST_CONTEXT_CHAR; + num_context_strings = num_post_context_strings; + context_strings = post_context_strings; + } + + if (num_context_strings > 0) { + log_info("num_context_strings = %zu\n", num_context_strings); + } + + + if (tree == NULL) { + log_error("Tree was NULL, rule=%s\n", key); + goto exit_teardown; + } + + string_tree_iterator_t *iter = string_tree_iterator_new(tree); + + //log_info("iter->remaining=%d\n", iter->remaining); + + char *key_str; + + for (; string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { + rule_key->n = step_len; + + for (c = 0; c < iter->num_tokens; c++) { + token = string_tree_iterator_get_string(iter, c); + if (token == NULL) { + log_error("string_tree_iterator_get_string was NULL: %s\n", key); + goto exit_teardown; + } + char_array_cat(rule_key, token); + } + + + size_t context_key_len; + + if (num_context_strings == 0) { + token = char_array_get_string(rule_key); + trie_add(trans_table->trie, token, replacement_index); + } else { + char_array_cat(rule_key, context_start_char); + context_key_len = rule_key->n; + + for (c = 0; c < num_context_strings; c++) { + rule_key->n = context_key_len; + token = cstring_array_get_token(context_strings, c); + if (token == NULL) { + log_error("token was NULL for c=%d\n", c); + } + char_array_cat(rule_key, token); + token = char_array_get_string(rule_key); + trie_add(trans_table->trie, token, replacement_index); + } + + } + + } + + string_tree_iterator_destroy(iter); + string_tree_destroy(tree); + + char_array_destroy(rule_key); + + if (pre_context_strings != NULL) { + cstring_array_destroy(pre_context_strings); + } + + if (post_context_strings != NULL) { + cstring_array_destroy(post_context_strings); + } + + // Only needed if we created a combined context array + if (combined_context_strings) { + cstring_array_destroy(context_strings); + } + } + + char_array_destroy(step_key); + + } + + char_array_destroy(trans_key); + + if (!transliteration_table_add_transliterator(trans)) { + goto exit_teardown; + } + + } + + transliteration_table_write(f); + fclose(f); + transliteration_module_teardown(); + log_info("Done!\n"); + exit(EXIT_SUCCESS); + +exit_teardown: + log_error("FAIL\n"); + transliteration_module_teardown(); + exit(EXIT_FAILURE); + +} \ No newline at end of file