[transliteration] transliteration struct definitions, memory allocation, builder methods and I/O; stubbing the transliterate method for the moment

This commit is contained in:
Al
2015-05-16 23:23:23 -04:00
parent 3a74a8c179
commit b983a83a89
2 changed files with 859 additions and 0 deletions

713
src/transliterate.c Normal file
View File

@@ -0,0 +1,713 @@
#include "transliterate.h"
#include "file_utils.h"
// Magic number stored as the first 32-bit value of a serialized transliteration
// table; transliteration_table_read rejects files that do not start with it.
#define TRANSLITERATION_TABLE_SIGNATURE 0xAAAAAAAA
// Module-level table instance shared by every function in this file.
// NULL until a table has been created or loaded.
static transliteration_table_t *trans_table = NULL;
/* Accessor for the module-level transliteration table.
 * Returns NULL when no table has been created or loaded. */
transliteration_table_t *get_transliteration_table(void)
{
    transliteration_table_t *current = trans_table;
    return current;
}
/**
 * Allocate a transliterator and populate its fields.
 *
 * The name is duplicated, so the caller keeps ownership of `name`.
 *
 * @param name          NUL-terminated transliterator name (must not be NULL)
 * @param internal      stored verbatim in the `internal` field
 * @param steps_index   stored verbatim in the `steps_index` field
 * @param steps_length  stored verbatim in the `steps_length` field
 * @return a new transliterator to be released with transliterator_destroy(),
 *         or NULL if `name` is NULL or an allocation fails
 */
transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length) {
    if (name == NULL) {
        return NULL;
    }

    transliterator_t *trans = malloc(sizeof(*trans));
    if (trans == NULL) {
        return NULL;
    }

    trans->name = strdup(name);
    if (trans->name == NULL) {
        // Never hand back a half-built object: the name is the hash key
        free(trans);
        return NULL;
    }

    trans->internal = internal;
    trans->steps_index = steps_index;
    trans->steps_length = steps_length;

    return trans;
}
/* Release a transliterator together with the name string it owns.
 * Passing NULL is allowed and does nothing. */
void transliterator_destroy(transliterator_t *self) {
    if (self != NULL) {
        free(self->name);   /* free(NULL) is a no-op, so no extra guard is needed */
        free(self);
    }
}
/**
 * Look up a transliterator by name in the module-level table.
 *
 * @param name  NUL-terminated transliterator name
 * @return the registered transliterator (still owned by the table), or NULL
 *         when the table is not set up, `name` is NULL, or no entry matches
 */
transliterator_t *get_transliterator(char *name) {
    transliteration_table_t *table = get_transliteration_table();
    if (table == NULL || table->transliterators == NULL || name == NULL) {
        return NULL;
    }

    khiter_t k = kh_get(str_transliterator, table->transliterators, name);

    // kh_get returns kh_end() when the key is absent; only a different
    // iterator refers to a valid bucket whose value may be read.
    if (k == kh_end(table->transliterators)) {
        return NULL;
    }

    return kh_value(table->transliterators, k);
}
// N.B. stub: no transliteration is performed yet. The input pointer is handed
// back unchanged and the transliterator name is ignored.
char *transliterate(char *trans_name, char *str) {
    (void)trans_name;

    char *result = str;
    return result;
}
void transliteration_table_destroy(void) {
transliteration_table_t *trans_table = get_transliteration_table();
if (trans_table == NULL) return;
if (trans_table->trie) {
trie_destroy(trans_table->trie);
}
if (trans_table->transliterators) {
transliterator_t *trans;
kh_foreach_value(trans_table->transliterators, trans, {
transliterator_destroy(trans);
})
kh_destroy(str_transliterator, trans_table->transliterators);
}
if (trans_table->steps) {
step_array_destroy(trans_table->steps);
}
if (trans_table->replacements) {
transliteration_replacement_array_destroy(trans_table->replacements);
}
if (trans_table->replacement_strings) {
cstring_array_destroy(trans_table->replacement_strings);
}
free(trans_table);
}
transliteration_table_t *transliteration_table_init(void) {
transliteration_table_t *trans_table = get_transliteration_table();
if (trans_table == NULL) {
trans_table = malloc(sizeof(transliteration_table_t));
trans_table->trie = trie_new();
if (trans_table->trie == NULL) {
goto exit_trans_table_created;
}
trans_table->transliterators = kh_init(str_transliterator);
if (trans_table->transliterators == NULL) {
goto exit_trans_table_created;
}
trans_table->steps = step_array_new();
if (trans_table->steps == NULL) {
goto exit_trans_table_created;
}
trans_table->replacements = transliteration_replacement_array_new();
if (trans_table->replacements == NULL) {
goto exit_trans_table_created;
}
trans_table->replacement_strings = cstring_array_new();
if (trans_table->replacement_strings == NULL) {
goto exit_trans_table_created;
}
}
return trans_table;
exit_trans_table_created:
transliteration_table_destroy();
exit(1);
}
/**
 * Allocate a transliteration step with a private copy of `name`.
 *
 * @param name  NUL-terminated step name (must not be NULL)
 * @param type  kind of step
 * @return a new step to be released with transliteration_step_destroy(),
 *         or NULL if `name` is NULL or an allocation fails
 */
transliteration_step_t *transliteration_step_new(char *name, step_type_t type) {
    if (name == NULL) {
        return NULL;
    }

    transliteration_step_t *self = malloc(sizeof(*self));
    if (self == NULL) {
        return NULL;
    }

    self->name = strdup(name);
    if (self->name == NULL) {
        // Bail out here: the object must not be touched or returned once freed
        free(self);
        return NULL;
    }

    self->type = type;
    return self;
}
/* Release a step and the name string it owns. NULL is accepted and ignored. */
void transliteration_step_destroy(transliteration_step_t *self) {
    if (self != NULL) {
        free(self->name);   /* free(NULL) is harmless */
        free(self);
    }
}
/**
 * Allocate a replacement record.
 *
 * @param string_index  stored verbatim in the `string_index` field
 * @param move          stored verbatim in the `move` field
 * @param groups        capture groups, may be NULL; on success the new
 *                      replacement takes ownership and frees them in
 *                      transliteration_replacement_destroy()
 * @return the new replacement, or NULL if allocation fails (in which case
 *         `groups` is left untouched and still belongs to the caller)
 */
transliteration_replacement_t *transliteration_replacement_new(uint32_t string_index, int32_t move, group_capture_array *groups) {
    transliteration_replacement_t *replacement = malloc(sizeof(*replacement));
    if (replacement == NULL) {
        return NULL;
    }

    // num_groups mirrors the element count of the array, 0 when there is none
    replacement->num_groups = (groups == NULL) ? 0 : groups->n;
    replacement->groups = groups;
    replacement->string_index = string_index;
    replacement->move = move;

    return replacement;
}
/* Release a replacement record and the capture-group array it owns, if any.
 * NULL is accepted and ignored. */
void transliteration_replacement_destroy(transliteration_replacement_t *self) {
    if (self != NULL) {
        group_capture_array *owned_groups = self->groups;
        if (owned_groups != NULL) {
            group_capture_array_destroy(owned_groups);
        }
        free(self);
    }
}
/**
 * Register a transliterator in the module-level table under its name.
 *
 * On success the table takes ownership of `trans`; its `name` string is used
 * as the hash key and therefore must stay alive as long as the entry exists.
 * If an entry with the same name is already present it is replaced and the
 * previous transliterator is destroyed.
 *
 * @param trans  transliterator to register (must have a non-NULL name)
 * @return true on success; false if no table is set up, `trans` is invalid,
 *         or the hash map could not be grown
 */
bool transliteration_table_add_transliterator(transliterator_t *trans) {
    if (trans_table == NULL || trans_table->transliterators == NULL ||
        trans == NULL || trans->name == NULL) {
        return false;
    }

    int ret;
    khiter_t k = kh_put(str_transliterator, trans_table->transliterators, trans->name, &ret);
    if (ret < 0) {
        // kh_put failed; k is not a valid bucket and must not be written to
        return false;
    }

    if (ret == 0) {
        // Key already present: the bucket still references the old entry
        transliterator_t *previous = kh_value(trans_table->transliterators, k);
        if (previous == trans) {
            return true;
        }
        // Re-point the key at the new name before the old one is freed
        kh_key(trans_table->transliterators, k) = trans->name;
        transliterator_destroy(previous);
    }

    kh_value(trans_table->transliterators, k) = trans;

    return true;
}
/**
 * Repeatedly replace every phrase found by the trie with its replacement
 * string until a search over the current text yields no phrases.
 *
 * @param trie          trie searched for phrases
 * @param replacements  replacement strings, indexed by each phrase's `data`
 * @param input         NUL-terminated text to process; never modified or freed
 * @return `input` itself when nothing matched (or when `input` is NULL),
 *         otherwise a newly allocated string the caller must free
 */
char *transliterator_replace_strings(trie_t *trie, cstring_array *replacements, char *input) {
    if (input == NULL) {
        return NULL;
    }

    char *current = input;
    bool is_original = true;
    size_t len = strlen(current);

    // We may go through several rounds of replacements
    while (1) {
        phrase_array *phrases = trie_search(trie, current);
        if (!phrases) {
            break;
        }

        char_array *str = char_array_new_size(len);

        // `start` is the offset in `current` just past the previous phrase
        size_t start = 0;

        for (size_t i = 0; i < phrases->n; i++) {
            phrase_t phrase = phrases->a[i];
            size_t end = phrase.start;

            // Copy the untouched text between phrases from the string that
            // was actually searched, then the replacement for this phrase.
            char_array_append_len(str, current + start, end - start);
            char_array_append(str, cstring_array_get_token(replacements, phrase.data));
            start = phrase.start + phrase.len;
        }

        // Tail after the last phrase (not from the phrase's own start,
        // which would re-append the text that was just replaced)
        char_array_append_len(str, current + start, len - start);
        char_array_terminate(str);

        // NOTE(review): `phrases` is never released here; presumably it needs
        // its array destructor once the loop above is done - confirm the API.

        if (!is_original) {
            free(current);
        }

        // Destroys the char array itself, but not the string it holds
        current = char_array_to_string(str);
        is_original = false;

        // The next round searches the new text, so its length must follow
        len = strlen(current);
    }

    return current;
}
/**
 * Deserialize one transliterator written by transliterator_write().
 *
 * Layout: int64 name length (including the NUL byte), the name bytes,
 * int8 internal flag, int32 steps index, int32 steps length.
 *
 * @param f  stream positioned at the start of a transliterator record
 * @return a newly allocated transliterator, or NULL on a read error, an
 *         invalid name length, or allocation failure
 */
transliterator_t *transliterator_read(FILE *f) {
    transliterator_t *trans = NULL;
    int64_t name_len_raw;
    int8_t internal;
    int32_t steps_index;
    int32_t steps_length;

    if (!file_read_int64(f, &name_len_raw)) {
        return NULL;
    }

    // The length comes from the file: reject values that cannot be a real
    // string size instead of trusting them for a stack allocation.
    if (name_len_raw <= 0 || (uint64_t)name_len_raw > SIZE_MAX) {
        return NULL;
    }
    size_t trans_name_len = (size_t)name_len_raw;

    char *name = malloc(trans_name_len);
    if (name == NULL) {
        return NULL;
    }

    if (!file_read_chars(f, name, trans_name_len)) {
        goto exit_free_name;
    }
    // The writer includes the NUL byte; enforce it in case the file is corrupt
    name[trans_name_len - 1] = '\0';

    if (!file_read_int8(f, &internal)) {
        goto exit_free_name;
    }

    if (!file_read_int32(f, &steps_index)) {
        goto exit_free_name;
    }

    if (!file_read_int32(f, &steps_length)) {
        goto exit_free_name;
    }

    // transliterator_new copies the name, so the temporary buffer is freed below
    trans = transliterator_new(name, (uint8_t)internal, (uint32_t)steps_index, (uint32_t)steps_length);

exit_free_name:
    free(name);
    return trans;
}
/* Serialize one transliterator: int64 name size (NUL byte included), the
 * name bytes, the int8 internal flag, then steps_index and steps_length as
 * int32 values. Stops at the first failed write and returns false. */
bool transliterator_write(transliterator_t *trans, FILE *f) {
    size_t name_size = strlen(trans->name) + 1;

    // && short-circuits, so the writes happen in order and stop on failure
    return file_write_int64(f, name_size) &&
           file_write_chars(f, trans->name, name_size) &&
           file_write_int8(f, (int8_t)trans->internal) &&
           file_write_int32(f, trans->steps_index) &&
           file_write_int32(f, trans->steps_length);
}
transliteration_step_t *transliteration_step_read(FILE *f) {
size_t step_name_len;
log_info("reading step\n");;
transliteration_step_t *step = malloc(sizeof(transliteration_step_t));
if (step == NULL) {
return NULL;
}
if (!file_read_int32(f, (int32_t *) &step->type)) {
goto exit_step_destroy;
}
if (!file_read_int64(f, (int64_t *) &step_name_len)) {
goto exit_step_destroy;
}
char *name = malloc(step_name_len);
if (name == NULL) {
goto exit_step_destroy;
}
if (!file_read_chars(f, name, step_name_len)) {
free(name);
goto exit_step_destroy;
}
step->name = name;
return step;
exit_step_destroy:
free(step);
}
/* Serialize one step: int32 type, int64 name size, then the name bytes.
 * Returns false as soon as any write fails. */
bool transliteration_step_write(transliteration_step_t *step, FILE *f) {
    bool type_written = file_write_int32(f, step->type);
    if (!type_written) {
        return false;
    }

    // Include the NUL byte
    size_t name_size = strlen(step->name) + 1;

    return file_write_int64(f, name_size) &&
           file_write_chars(f, step->name, name_size);
}
/**
 * Read one capture group: two consecutive 32-bit values, start then length.
 *
 * @param f      stream positioned at a capture-group record
 * @param group  destination; only written when both reads succeed
 * @return true on success, false on a read error
 */
bool group_capture_read(FILE *f, group_capture_t *group) {
    int32_t start_raw;
    int32_t len_raw;

    if (!file_read_int32(f, &start_raw) ||
        !file_read_int32(f, &len_raw)) {
        return false;
    }

    // The members are size_t, which may be wider than 32 bits: go through
    // temporaries so every byte of each member is defined. The values are
    // unsigned on disk, hence the uint32_t conversion before widening.
    group->start = (uint32_t)start_raw;
    group->len = (uint32_t)len_raw;

    return true;
}
/* Write one capture group as two int32 values: start first, then length.
 * The length is not written when writing the start fails. */
bool group_capture_write(group_capture_t group, FILE *f) {
    if (!file_write_int32(f, group.start)) {
        return false;
    }
    if (!file_write_int32(f, group.len)) {
        return false;
    }
    return true;
}
/**
 * Deserialize one replacement written by transliteration_replacement_write().
 *
 * Layout: int32 string index, int32 move, int64 group count, followed by
 * that many capture groups.
 *
 * @param f  stream positioned at the start of a replacement record
 * @return a newly allocated replacement owning its capture groups, or NULL
 *         on a read error, an invalid group count, or allocation failure
 */
transliteration_replacement_t *transliteration_replacement_read(FILE *f) {
    int32_t string_index;
    if (!file_read_int32(f, &string_index)) {
        return NULL;
    }

    int32_t move;
    if (!file_read_int32(f, &move)) {
        return NULL;
    }

    int64_t num_groups_raw;
    if (!file_read_int64(f, &num_groups_raw)) {
        return NULL;
    }
    if (num_groups_raw < 0 || (uint64_t)num_groups_raw > SIZE_MAX) {
        return NULL;
    }
    size_t num_groups = (size_t)num_groups_raw;

    group_capture_array *groups = group_capture_array_new_size(num_groups);
    if (groups == NULL) {
        return NULL;
    }

    // Zero-initialised so no indeterminate bytes are ever copied into the array
    group_capture_t group = {0};
    for (size_t i = 0; i < num_groups; i++) {
        if (!group_capture_read(f, &group)) {
            group_capture_array_destroy(groups);
            return NULL;
        }
        group_capture_array_push(groups, group);
    }

    transliteration_replacement_t *replacement =
        transliteration_replacement_new((uint32_t)string_index, move, groups);
    if (replacement == NULL) {
        // Ownership was not transferred, so the array is still ours to free
        group_capture_array_destroy(groups);
    }

    return replacement;
}
/* Serialize one replacement: int32 string index, int32 move, int64 group
 * count, then each capture group in order. Returns false on the first
 * failed write. */
bool transliteration_replacement_write(transliteration_replacement_t *replacement, FILE *f) {
    bool header_ok = file_write_int32(f, replacement->string_index) &&
                     file_write_int32(f, replacement->move) &&
                     file_write_int64(f, replacement->num_groups);
    if (!header_ok) {
        return false;
    }

    size_t idx = 0;
    while (idx < replacement->num_groups) {
        group_capture_t current = replacement->groups->a[idx];
        if (!group_capture_write(current, f)) {
            return false;
        }
        idx++;
    }

    return true;
}
/**
 * Load the module-level transliteration table from a stream produced by
 * transliteration_table_write().
 *
 * Layout: 32-bit signature, transliterators, steps, replacements,
 * replacement-string token indices, the replacement-string bytes, the trie.
 * Each array is preceded by its int64 element count.
 *
 * @param f  open stream positioned at the start of the table
 * @return true on success; false if `f` is NULL, the signature does not
 *         match, or any section fails to load. On a load failure the
 *         partially built table is destroyed and the module is left with
 *         no table.
 */
bool transliteration_table_read(FILE *f) {
    if (f == NULL) {
        return false;
    }

    uint32_t signature;

    log_info("Reading signature\n");

    if (!file_read_int32(f, (int32_t *)&signature) || signature != TRANSLITERATION_TABLE_SIGNATURE) {
        return false;
    }

    trans_table = transliteration_table_init();
    if (trans_table == NULL) {
        return false;
    }

    log_info("Table initialized\n");

    // NOTE(review): the int64 counts below are read straight into size_t
    // variables, which assumes size_t is 64 bits wide on the target.
    size_t i;
    size_t num_transliterators;

    if (!file_read_int64(f, (int64_t *)&num_transliterators)) {
        goto exit_trans_table_load_error;
    }

    log_info("num_transliterators = %zu\n", num_transliterators);

    transliterator_t *trans;

    for (i = 0; i < num_transliterators; i++) {
        trans = transliterator_read(f);
        if (trans == NULL) {
            log_error("trans was NULL\n");
            goto exit_trans_table_load_error;
        } else {
            log_info("read trans with name: %s\n", trans->name);
        }
        if (!transliteration_table_add_transliterator(trans)) {
            // The table did not take ownership, so release it here
            transliterator_destroy(trans);
            goto exit_trans_table_load_error;
        }
    }

    log_info("Read transliterators\n");

    size_t num_steps;

    if (!file_read_int64(f, (int64_t *)&num_steps)) {
        goto exit_trans_table_load_error;
    }

    log_info("num_steps = %zu\n", num_steps);

    step_array_resize(trans_table->steps, num_steps);

    log_info("resized\n");

    transliteration_step_t *step;

    for (i = 0; i < num_steps; i++) {
        step = transliteration_step_read(f);
        if (step == NULL) {
            goto exit_trans_table_load_error;
        }
        log_info("Read step with name %s and type %d\n", step->name, step->type);
        step_array_push(trans_table->steps, step);
    }

    log_info("Done with steps\n");

    transliteration_replacement_t *replacement;

    size_t num_replacements;

    if (!file_read_int64(f, (int64_t *)&num_replacements)) {
        goto exit_trans_table_load_error;
    }

    log_info("num_replacements = %zu\n", num_replacements);

    transliteration_replacement_array_resize(trans_table->replacements, num_replacements);

    log_info("resized\n");

    for (i = 0; i < num_replacements; i++) {
        replacement = transliteration_replacement_read(f);
        if (replacement == NULL) {
            goto exit_trans_table_load_error;
        }
        transliteration_replacement_array_push(trans_table->replacements, replacement);
    }

    log_info("Done with replacements\n");

    size_t num_replacement_tokens;

    if (!file_read_int64(f, (int64_t *)&num_replacement_tokens)) {
        goto exit_trans_table_load_error;
    }

    log_info("num_replacement_tokens = %zu\n", num_replacement_tokens);

    uint32_array_resize(trans_table->replacement_strings->indices, num_replacement_tokens);

    log_info("resized\n");

    uint32_t token_index;

    for (i = 0; i < num_replacement_tokens; i++) {
        if (!file_read_int32(f, (int32_t *)&token_index)) {
            goto exit_trans_table_load_error;
        }
        uint32_array_push(trans_table->replacement_strings->indices, token_index);
    }

    log_info("Done with replacement token indices\n");

    size_t replacement_strings_len;

    if (!file_read_int64(f, (int64_t *)&replacement_strings_len)) {
        goto exit_trans_table_load_error;
    }

    log_info("replacement_strings_len = %zu\n", replacement_strings_len);

    char_array_resize(trans_table->replacement_strings->str, replacement_strings_len);

    log_info("resized\n");

    if (!file_read_chars(f, trans_table->replacement_strings->str->a, replacement_strings_len)) {
        goto exit_trans_table_load_error;
    }

    log_info("Read replacement_strings\n");

    // The bytes were read straight into the buffer, so set the used length by hand
    trans_table->replacement_strings->str->n = replacement_strings_len;

    // Free the default trie
    trie_destroy(trans_table->trie);

    trans_table->trie = trie_read(f);
    log_info("Read trie\n");
    if (trans_table->trie == NULL) {
        goto exit_trans_table_load_error;
    }

    return true;

exit_trans_table_load_error:
    transliteration_table_destroy();
    // Make sure no dangling pointer survives, so a later load can start clean
    trans_table = NULL;
    return false;
}
bool transliteration_table_write(FILE *f) {
if (f == NULL) {
return false;
}
char *trans_name;
transliterator_t *trans;
if (!file_write_int32(f, TRANSLITERATION_TABLE_SIGNATURE)) {
return false;
}
size_t num_transliterators = kh_size(trans_table->transliterators);
if (!file_write_int64(f, (int64_t)num_transliterators)) {
return false;
}
kh_foreach_value(trans_table->transliterators, trans, {
if (!transliterator_write(trans, f)) {
return false;
}
})
transliteration_step_t *step;
int i;
size_t num_steps = trans_table->steps->n;
if (!file_write_int64(f, (int64_t)num_steps)) {
return false;
}
for (i = 0; i < num_steps; i++) {
step = trans_table->steps->a[i];
if (!transliteration_step_write(step, f)) {
return false;
}
}
size_t num_replacements = trans_table->replacements->n;
if (!file_write_int64(f, (int64_t)num_replacements)) {
return false;
}
transliteration_replacement_t *replacement;
for (i = 0; i < trans_table->replacements->n; i++) {
replacement = trans_table->replacements->a[i];
if (!transliteration_replacement_write(replacement, f)) {
return false;
}
}
size_t replacement_tokens_len = trans_table->replacement_strings->indices->n;
if (!file_write_int64(f, (int64_t) replacement_tokens_len)) {
return false;
}
for (i = 0; i < replacement_tokens_len; i++) {
if (!file_write_int32(f, (int32_t)trans_table->replacement_strings->indices->a[i])) {
return false;
}
}
size_t replacement_strings_len = trans_table->replacement_strings->str->n;
if (!file_write_int64(f, (int64_t) replacement_strings_len)) {
return false;
}
if (!file_write_chars(f, trans_table->replacement_strings->str->a, replacement_strings_len)) {
return false;
}
if (!trie_write(trans_table->trie, f)) {
return false;
}
return true;
}
/* Load the module-level table from the file at `filename`.
 * Refuses (returns false) when `filename` is NULL, a table is already set,
 * or the file cannot be opened; otherwise returns the result of reading it. */
bool transliteration_table_load(char *filename) {
    if (filename == NULL || trans_table != NULL) {
        return false;
    }

    FILE *f = fopen(filename, "rb");
    if (f == NULL) {
        return false;
    }

    bool ret = transliteration_table_read(f);
    fclose(f);
    return ret;
}
/**
 * Write the module-level table to the file at `filename`, replacing any
 * existing content.
 *
 * @param filename  destination path
 * @return true only if the table was written AND the file was closed
 *         without error; false if no table is set up, `filename` is NULL,
 *         the file cannot be opened, or writing/flushing fails
 */
bool transliteration_table_save(char *filename) {
    if (trans_table == NULL || filename == NULL) {
        return false;
    }

    FILE *f = fopen(filename, "wb");
    if (f == NULL) {
        return false;
    }

    bool ret = transliteration_table_write(f);

    // fclose flushes buffered data; a failure there means the file on disk is
    // incomplete even though every individual write call succeeded.
    if (fclose(f) != 0) {
        ret = false;
    }

    return ret;
}
/* Set up the module-level table.
 * - A table already exists: nothing is done and false is returned.
 * - filename is NULL: an empty table is created and true is returned.
 * - Otherwise: the table is loaded from `filename` and the load result is
 *   returned. */
bool transliteration_module_setup(char *filename) {
    if (trans_table != NULL) {
        return false;
    }

    if (filename != NULL) {
        return transliteration_table_load(filename);
    }

    // Just init the table
    trans_table = transliteration_table_init();
    return true;
}
void transliteration_module_teardown(void) {
transliteration_table_destroy();
}

146
src/transliterate.h Normal file
View File

@@ -0,0 +1,146 @@
#ifndef TRANSLITERATE_H
#define TRANSLITERATE_H
#include <regex.h>
#include <stdbool.h>
#include <stdlib.h>
#include "collections.h"
#include "klib/khash.h"
#include "string_utils.h"
#include "trie.h"
#include "trie_search.h"
// Default location of the serialized transliteration table (a relative path)
#define DEFAULT_TRANSLITERATION_PATH "../data/transliteration/transliteration.dat"
// NOTE(review): presumably the maximum length of a transliterator name; it is
// not referenced by the code visible alongside this header - confirm usage.
#define MAX_TRANS_NAME_LEN 100
// Kind tag stored in every transliteration_step_t. The numeric values are
// what gets written to and read from serialized tables, so they are spelled
// out explicitly and must not be reordered.
typedef enum {
    STEP_RULESET = 0,
    STEP_TRANSFORM = 1,
    STEP_UNICODE_NORMALIZATION = 2
} step_type_t;
// One step of a transliterator: a kind tag plus a name.
typedef struct transliteration_step {
// Kind of step
step_type_t type;
// Heap-allocated, NUL-terminated name; owned by the step and freed by
// transliteration_step_destroy
char *name;
} transliteration_step_t;
// Allocates a step holding its own copy of `name`
transliteration_step_t *transliteration_step_new(char *name, step_type_t type);
// Frees a step and its name; accepts NULL
void transliteration_step_destroy(transliteration_step_t *self);
// Declares the step_array vector type used for the table's steps.
// NOTE(review): presumably its destroy function calls
// transliteration_step_destroy on each element - confirm in collections.h.
VECTOR_INIT_FREE_DATA(step_array, transliteration_step_t *, transliteration_step_destroy)
// A named transliterator registered in the transliteration table.
typedef struct transliterator {
// Heap-allocated name, owned by the transliterator; also used as the key
// under which it is stored in the table's hash map
char *name;
// Flag serialized as a single byte.
// NOTE(review): presumably marks transliterators for internal use only - confirm.
uint8_t internal;
// NOTE(review): presumably the index of this transliterator's first step in
// the table's step array, and the number of steps it spans - confirm.
uint32_t steps_index;
// Serialized as a 32-bit value even though it is declared size_t
size_t steps_length;
} transliterator_t;
// NOTE(review): not referenced by the code visible alongside this header;
// presumably a limit related to capture groups - confirm usage.
#define MAX_GROUP_LEN 5
// A capture group described by a start offset and a length.
// Both members are serialized as 32-bit values.
typedef struct group_capture {
size_t start;
size_t len;
} group_capture_t;
// Declares the group_capture_array vector type (elements stored by value)
VECTOR_INIT(group_capture_array, group_capture_t)
// A replacement record stored in the transliteration table.
typedef struct transliteration_replacement {
// NOTE(review): presumably an index into the table's replacement strings - confirm.
uint32_t string_index;
// NOTE(review): presumably a signed cursor adjustment applied after the
// replacement - confirm against the transliterate implementation.
int32_t move;
// Number of capture groups; set from groups->n, or 0 when groups is NULL
size_t num_groups;
// Capture groups, may be NULL; owned by the replacement and freed by
// transliteration_replacement_destroy
group_capture_array *groups;
} transliteration_replacement_t;
// Allocates a replacement; takes ownership of `groups`
transliteration_replacement_t *transliteration_replacement_new(
uint32_t string_index,
int32_t move,
group_capture_array *groups
);
// Frees a replacement and its capture groups; accepts NULL
void transliteration_replacement_destroy(transliteration_replacement_t *self);
// Declares the transliteration_replacement_array vector type used by the table.
// NOTE(review): presumably its destroy function calls
// transliteration_replacement_destroy on each element - confirm in collections.h.
VECTOR_INIT_FREE_DATA(transliteration_replacement_array, transliteration_replacement_t *, transliteration_replacement_destroy)
// Hash map type from a C-string name to a transliterator pointer
KHASH_MAP_INIT_STR(str_transliterator, transliterator_t *)
// The complete transliteration data set. A single module-level instance is
// created, loaded, saved and destroyed by the functions in transliterate.c.
typedef struct transliteration_table {
// Transliterators keyed by name; the table owns and frees the values
khash_t(str_transliterator) *transliterators;
// All steps of all transliterators in one array
step_array *steps;
// Trie serialized as the last section of the table file
trie_t *trie;
// All replacement records
transliteration_replacement_array *replacements;
// Replacement strings: a character buffer plus token indices
cstring_array *replacement_strings;
} transliteration_table_t;
// Marker strings, their code points and their byte lengths.
// Each *_LEN macro is the length of the matching string literal without the
// terminating NUL. It is computed with sizeof so that it is a compile-time
// constant and does not depend on <string.h> being included.
#define NAMESPACE_SEPARATOR_CHAR "|"
#define NAMESPACE_SEPARATOR_CHAR_LEN (sizeof(NAMESPACE_SEPARATOR_CHAR) - 1)

// Control characters are special
#define WORD_BOUNDARY_CHAR "\x01"
#define WORD_BOUNDARY_CODEPOINT 1
#define WORD_BOUNDARY_CHAR_LEN (sizeof(WORD_BOUNDARY_CHAR) - 1)

#define PRE_CONTEXT_CHAR "\x02"
#define PRE_CONTEXT_CODEPOINT 2
#define PRE_CONTEXT_CHAR_LEN (sizeof(PRE_CONTEXT_CHAR) - 1)

#define POST_CONTEXT_CHAR "\x03"
#define POST_CONTEXT_CODEPOINT 3
#define POST_CONTEXT_CHAR_LEN (sizeof(POST_CONTEXT_CHAR) - 1)

#define EMPTY_TRANSITION_CHAR "\x04"
#define EMPTY_TRANSITION_CODEPOINT 4
#define EMPTY_TRANSITION_CHAR_LEN (sizeof(EMPTY_TRANSITION_CHAR) - 1)

#define REPEAT_ZERO_CHAR "\x05"
#define REPEAT_ZERO_CODEPOINT 5
#define REPEAT_ZERO_CHAR_LEN (sizeof(REPEAT_ZERO_CHAR) - 1)

#define REPEAT_ONE_CHAR "\x06"
#define REPEAT_ONE_CODEPOINT 6
#define REPEAT_ONE_CHAR_LEN (sizeof(REPEAT_ONE_CHAR) - 1)

#define BEGIN_SET_CHAR "\x0e"
#define BEGIN_SET_CODEPOINT 14
#define BEGIN_SET_CHAR_LEN (sizeof(BEGIN_SET_CHAR) - 1)

#define END_SET_CHAR "\x0f"
#define END_SET_CODEPOINT 15
#define END_SET_CHAR_LEN (sizeof(END_SET_CHAR) - 1)

#define GROUP_INDICATOR_CHAR "\x10"
#define GROUP_INDICATOR_CODEPOINT 16
#define GROUP_INDICATOR_CHAR_LEN (sizeof(GROUP_INDICATOR_CHAR) - 1)

// Code points of the ASCII punctuation characters named below
#define DOLLAR_CODEPOINT 36
#define LPAREN_CODEPOINT 40
#define RPAREN_CODEPOINT 41
#define STAR_CODEPOINT 42
#define PLUS_CODEPOINT 43
#define LSQUARE_CODEPOINT 91
#define BACKSLASH_CODEPOINT 92
#define RSQUARE_CODEPOINT 93
#define LCURLY_CODEPOINT 123
#define RCURLY_CODEPOINT 125
// Primary API
// Returns the module-level table, or NULL when none has been set up
transliteration_table_t *get_transliteration_table(void);
// Allocates a transliterator holding its own copy of `name`; NULL on allocation failure
transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length);
// Frees a transliterator and its name; accepts NULL
void transliterator_destroy(transliterator_t *self);
// Registers `trans` in the module-level table under trans->name;
// returns false when no table is set up
bool transliteration_table_add_transliterator(transliterator_t *trans);
// Looks up a transliterator by name in the module-level table
transliterator_t *get_transliterator(char *name);
// Currently a stub: returns `str` unchanged
char *transliterate(char *trans_name, char *str);
// Serializes the module-level table to an open stream; false on any write failure
bool transliteration_table_write(FILE *file);
// Serializes the module-level table to the file at `filename`
bool transliteration_table_save(char *filename);
// Module setup/teardown
// With a NULL filename an empty table is created; otherwise the table is
// loaded from the file. Returns false if a table already exists.
bool transliteration_module_setup(char *filename);
// Destroys the module-level table
void transliteration_module_teardown(void);
#endif