[transliteration] transliteration struct definitions, memory allocation, builder methods and I/O, stubbing transliterate method for the moment

This commit is contained in:
Al
2015-05-16 23:23:23 -04:00
parent 3a74a8c179
commit b983a83a89
2 changed files with 859 additions and 0 deletions

146
src/transliterate.h Normal file
View File

@@ -0,0 +1,146 @@
#ifndef TRANSLITERATE_H
#define TRANSLITERATE_H
#include <regex.h>
#include <stdbool.h>
#include <stdlib.h>
#include "collections.h"
#include "klib/khash.h"
#include "string_utils.h"
#include "trie.h"
#include "trie_search.h"
// Relative path of the default transliteration data file.
// NOTE(review): presumably the file passed to transliteration_module_setup when no other is given — confirm against callers.
#define DEFAULT_TRANSLITERATION_PATH "../data/transliteration/transliteration.dat"
// Upper bound on a transliterator name length.
// NOTE(review): not used in this header; confirm whether the limit counts the NUL terminator.
#define MAX_TRANS_NAME_LEN 100
// Tag identifying the kind of a transliteration step (stored in
// transliteration_step_t.type). The numeric values are written out
// explicitly and are the same as the implicit 0, 1, 2 numbering.
typedef enum {
    STEP_RULESET = 0,
    STEP_TRANSFORM = 1,
    STEP_UNICODE_NORMALIZATION = 2
} step_type_t;
// One step of a transliterator: a step kind paired with a name.
typedef struct transliteration_step {
// Which kind of step this is (see step_type_t).
step_type_t type;
// Name associated with the step.
// NOTE(review): whether this string is owned (copied) or borrowed is decided in the .c file — confirm.
char *name;
} transliteration_step_t;
// Creates a step from a name and a step kind.
// NOTE(review): presumably heap-allocates and returns NULL on failure; whether `name` is copied is not visible here — confirm in the implementation.
transliteration_step_t *transliteration_step_new(char *name, step_type_t type);
// Destroys a step previously obtained from transliteration_step_new.
void transliteration_step_destroy(transliteration_step_t *self);
// Instantiates the step_array container type holding transliteration_step_t pointers,
// with transliteration_step_destroy supplied as the per-element destructor.
// NOTE(review): VECTOR_INIT_FREE_DATA presumably comes from collections.h — confirm.
VECTOR_INIT_FREE_DATA(step_array, transliteration_step_t *, transliteration_step_destroy)
// A named transliterator described by a range of steps.
typedef struct transliterator {
// Name of the transliterator.
char *name;
// NOTE(review): looks like a boolean "internal" flag stored in one byte — confirm its exact meaning.
uint8_t internal;
// NOTE(review): presumably the position of this transliterator's first step in transliteration_table_t.steps — confirm.
uint32_t steps_index;
// NOTE(review): presumably the number of consecutive steps starting at steps_index — confirm.
size_t steps_length;
} transliterator_t;
// NOTE(review): limit related to capture groups; not referenced in this header, so its exact meaning
// (maximum group count vs. maximum length) must be confirmed in the implementation.
#define MAX_GROUP_LEN 5
// A captured group expressed as a start offset and a length.
// NOTE(review): units (bytes vs. codepoints) are not visible here — confirm.
typedef struct group_capture {
size_t start;
size_t len;
} group_capture_t;
// Instantiates the group_capture_array container type holding group_capture_t values.
VECTOR_INIT(group_capture_array, group_capture_t)
// A replacement entry: a reference to a replacement string, a signed move value,
// and the capture groups attached to it.
typedef struct transliteration_replacement {
// NOTE(review): presumably an index into transliteration_table_t.replacement_strings — confirm.
uint32_t string_index;
// Signed offset; NOTE(review): presumably a cursor adjustment applied after the replacement — confirm.
int32_t move;
// NOTE(review): presumably the number of entries in `groups` (0 when groups is NULL) — confirm.
size_t num_groups;
// Capture groups for this replacement; NOTE(review): may be NULL when there are none — confirm.
group_capture_array *groups;
} transliteration_replacement_t;
// Creates a replacement from a string index, a move value and a group array.
// NOTE(review): presumably takes ownership of `groups` and returns NULL on allocation failure — confirm in the implementation.
transliteration_replacement_t *transliteration_replacement_new(
uint32_t string_index,
int32_t move,
group_capture_array *groups
);
// Destroys a replacement previously obtained from transliteration_replacement_new.
void transliteration_replacement_destroy(transliteration_replacement_t *self);
// Instantiates the transliteration_replacement_array container type holding replacement pointers,
// with transliteration_replacement_destroy supplied as the per-element destructor.
VECTOR_INIT_FREE_DATA(transliteration_replacement_array, transliteration_replacement_t *, transliteration_replacement_destroy)
// Instantiates the str_transliterator khash map type: C-string keys mapping to transliterator_t pointers.
KHASH_MAP_INIT_STR(str_transliterator, transliterator_t *)
// Aggregate holding all transliteration data.
typedef struct transliteration_table {
// Map from string key to transliterator; NOTE(review): presumably keyed by transliterator name — confirm.
khash_t(str_transliterator) *transliterators;
// Steps of all transliterators; NOTE(review): presumably addressed via transliterator_t.steps_index/steps_length — confirm.
step_array *steps;
// Trie over rule keys; NOTE(review): key format and stored values are defined in the implementation — confirm.
trie_t *trie;
// Replacement entries.
transliteration_replacement_array *replacements;
// Replacement strings; NOTE(review): presumably addressed by transliteration_replacement_t.string_index — confirm.
cstring_array *replacement_strings;
} transliteration_table_t;
// Marker strings used when building keys. Each marker is defined three ways:
//   X_CHAR       the marker as a string literal,
//   X_CODEPOINT  the numeric value of its single character,
//   X_CHAR_LEN   its length in bytes, excluding the NUL terminator.
//
// The *_LEN macros are computed with sizeof on the literal rather than strlen():
// the value and type (size_t) are the same, but the result is a compile-time
// constant (usable as an array size or in a static assertion) and does not
// require <string.h> to be in scope wherever this header is included.

// Namespace separator string.
#define NAMESPACE_SEPARATOR_CHAR "|"
#define NAMESPACE_SEPARATOR_CHAR_LEN (sizeof(NAMESPACE_SEPARATOR_CHAR) - 1)

// Control characters are special
#define WORD_BOUNDARY_CHAR "\x01"
#define WORD_BOUNDARY_CODEPOINT 1
#define WORD_BOUNDARY_CHAR_LEN (sizeof(WORD_BOUNDARY_CHAR) - 1)

#define PRE_CONTEXT_CHAR "\x02"
#define PRE_CONTEXT_CODEPOINT 2
#define PRE_CONTEXT_CHAR_LEN (sizeof(PRE_CONTEXT_CHAR) - 1)

#define POST_CONTEXT_CHAR "\x03"
#define POST_CONTEXT_CODEPOINT 3
#define POST_CONTEXT_CHAR_LEN (sizeof(POST_CONTEXT_CHAR) - 1)

#define EMPTY_TRANSITION_CHAR "\x04"
#define EMPTY_TRANSITION_CODEPOINT 4
#define EMPTY_TRANSITION_CHAR_LEN (sizeof(EMPTY_TRANSITION_CHAR) - 1)

#define REPEAT_ZERO_CHAR "\x05"
#define REPEAT_ZERO_CODEPOINT 5
#define REPEAT_ZERO_CHAR_LEN (sizeof(REPEAT_ZERO_CHAR) - 1)

#define REPEAT_ONE_CHAR "\x06"
#define REPEAT_ONE_CODEPOINT 6
#define REPEAT_ONE_CHAR_LEN (sizeof(REPEAT_ONE_CHAR) - 1)

#define BEGIN_SET_CHAR "\x0e"
#define BEGIN_SET_CODEPOINT 14
#define BEGIN_SET_CHAR_LEN (sizeof(BEGIN_SET_CHAR) - 1)

#define END_SET_CHAR "\x0f"
#define END_SET_CODEPOINT 15
#define END_SET_CHAR_LEN (sizeof(END_SET_CHAR) - 1)

#define GROUP_INDICATOR_CHAR "\x10"
#define GROUP_INDICATOR_CODEPOINT 16
#define GROUP_INDICATOR_CHAR_LEN (sizeof(GROUP_INDICATOR_CHAR) - 1)
// Codepoints of ASCII punctuation characters, written in hexadecimal
// with the character itself noted beside each value.
#define DOLLAR_CODEPOINT    0x24  // '$'
#define LPAREN_CODEPOINT    0x28  // '('
#define RPAREN_CODEPOINT    0x29  // ')'
#define STAR_CODEPOINT      0x2A  // '*'
#define PLUS_CODEPOINT      0x2B  // '+'
#define LSQUARE_CODEPOINT   0x5B  // '['
#define BACKSLASH_CODEPOINT 0x5C  // '\' (backslash)
#define RSQUARE_CODEPOINT   0x5D  // ']'
#define LCURLY_CODEPOINT    0x7B  // '{'
#define RCURLY_CODEPOINT    0x7D  // '}'
// Primary API
// Returns the transliteration table.
// NOTE(review): takes no arguments, so presumably returns a module-level table set up by transliteration_module_setup — confirm.
transliteration_table_t *get_transliteration_table(void);
// Creates a transliterator from its name, internal flag, and step range (index and length).
// NOTE(review): ownership of `name` and the failure return value are not visible here — confirm.
transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length);
// Destroys a transliterator previously obtained from transliterator_new.
void transliterator_destroy(transliterator_t *self);
// Adds a transliterator to the table.
// NOTE(review): no table argument is taken, so presumably operates on the module-level table; presumably returns false on failure — confirm.
bool transliteration_table_add_transliterator(transliterator_t *trans);
// Looks up a transliterator by name.
// NOTE(review): presumably returns NULL when the name is unknown — confirm.
transliterator_t *get_transliterator(char *name);
// Transliterates `str` using the transliterator named `trans_name`.
// NOTE(review): ownership of the returned string (caller frees?) and the error return are not visible here — confirm.
char *transliterate(char *trans_name, char *str);
// Writes the transliteration table to an already-open stream.
// NOTE(review): presumably returns false on write failure — confirm.
bool transliteration_table_write(FILE *file);
// Saves the transliteration table to the file at `filename`.
// NOTE(review): presumably opens the file and delegates to transliteration_table_write — confirm.
bool transliteration_table_save(char *filename);
// Module setup/teardown
// Initializes the transliteration module from `filename`.
// NOTE(review): presumably loads the table from that file (DEFAULT_TRANSLITERATION_PATH when NULL?) and returns false on failure — confirm.
bool transliteration_module_setup(char *filename);
// Tears down the transliteration module.
// NOTE(review): presumably frees everything allocated by transliteration_module_setup — confirm.
void transliteration_module_teardown(void);
#endif