// Transliteration: table data structures, rule-syntax constants and the public API.
#ifndef TRANSLITERATE_H
#define TRANSLITERATE_H

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "collections.h"
#include "constants.h"
#include "klib/khash.h"
#include "string_utils.h"
#include "trie.h"
#include "trie_search.h"
#include "unicode_scripts.h"

|
// Transliterator name "latin-ascii".
// NOTE(review): presumably a key accepted by get_transliterator()/transliterate() -- confirm.
#define LATIN_ASCII "latin-ascii"

// Default data file path: LIBPOSTAL_TRANSLITERATION_DIR, PATH_SEPARATOR and
// "transliteration.dat" joined by adjacent string-literal concatenation.
#define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR "transliteration.dat"

// Limit on transliterator name length.
// NOTE(review): whether this counts the terminating NUL is not visible here -- confirm.
#define MAX_TRANS_NAME_LEN 100

// Kind tag for a transliteration step (the type of transliteration_step_t.type).
// The enumerators keep their implicit values 0, 1 and 2.
// NOTE(review): the meanings below are inferred from the names only -- confirm.
typedef enum {
    STEP_RULESET,               // presumably: apply a set of rewrite rules
    STEP_TRANSFORM,             // presumably: invoke another named transliterator
    STEP_UNICODE_NORMALIZATION  // presumably: apply a Unicode normalization form
} step_type_t;

typedef struct transliteration_step {
|
|
step_type_t type;
|
|
char *name;
|
|
} transliteration_step_t;
|
|
|
|
transliteration_step_t *transliteration_step_new(char *name, step_type_t type);
|
|
void transliteration_step_destroy(transliteration_step_t *self);
|
|
|
|
VECTOR_INIT_FREE_DATA(step_array, transliteration_step_t *, transliteration_step_destroy)
|
|
|
|
// A named transliterator described by a run of steps.
typedef struct transliterator {
    char *name;             // transliterator name
    uint8_t internal;       // flag; NOTE(review): presumably marks non-public transliterators -- confirm
    uint32_t steps_index;   // NOTE(review): presumably index of the first step in the table's step array -- confirm
    size_t steps_length;    // NOTE(review): presumably number of steps starting at steps_index -- confirm
} transliterator_t;

// NOTE(review): presumably a limit related to capture groups in rules; how it is
// applied is not visible in this header -- confirm.
#define MAX_GROUP_LEN 5

// A captured group expressed as an offset and a length.
// NOTE(review): units (bytes vs. code points) are not visible here -- confirm.
typedef struct group_capture {
    size_t start;
    size_t len;
} group_capture_t;

VECTOR_INIT(group_capture_array, group_capture_t)
|
|
|
|
typedef struct transliteration_replacement {
|
|
uint32_t string_index;
|
|
uint32_t revisit_index;
|
|
size_t num_groups;
|
|
group_capture_array *groups;
|
|
} transliteration_replacement_t;
|
|
|
|
transliteration_replacement_t *transliteration_replacement_new(
|
|
uint32_t string_index,
|
|
uint32_t revisit_index,
|
|
group_capture_array *groups
|
|
);
|
|
|
|
void transliteration_replacement_destroy(transliteration_replacement_t *self);
|
|
|
|
VECTOR_INIT_FREE_DATA(transliteration_replacement_array, transliteration_replacement_t *, transliteration_replacement_destroy)
|
|
|
|
// Instantiates the khash map type `str_transliterator`: C-string keys mapped to
// transliterator_t pointers.
KHASH_MAP_INIT_STR(str_transliterator, transliterator_t *)

// Hash for a (script, language) key: the script value XORed with the string
// hash of the language. A NULL language contributes 0.
#define kh_script_lang_hash(key) ((khint_t)(key).script ^ (((key).language == NULL) ? 0 : kh_str_hash_func((key).language)))

// Equality for (script, language) keys. Scripts must match. Languages match
// when both are NULL or when both are non-NULL and compare equal with strcmp.
// The NULL check mirrors kh_script_lang_hash, so a key the hash accepts is
// never passed to strcmp as a null pointer. Arguments are evaluated more than once.
#define kh_script_lang_equal(a, b)                              \
    (((a).script == (b).script) &&                              \
     (((a).language == NULL || (b).language == NULL)            \
          ? ((a).language == (b).language)                      \
          : (strcmp((a).language, (b).language) == 0)))

// A contiguous range of transliterators: first index plus count.
// foreach_transliterator iterates [transliterator_index,
// transliterator_index + num_transliterators) over the table's
// transliterator_names.
typedef struct transliterator_index {
    size_t transliterator_index;
    size_t num_transliterators;
} transliterator_index_t;

// The empty range {0, 0}.
// NOTE(review): presumably returned when a script/language has no transliterators -- confirm.
#define NULL_TRANSLITERATOR_INDEX (transliterator_index_t) {0, 0}

// Instantiates the khash map type `script_language_index`: script_language_t
// keys mapped to transliterator_index_t values (the `1` selects map rather than
// set mode), hashed and compared with kh_script_lang_hash / kh_script_lang_equal.
KHASH_INIT(script_language_index, script_language_t, transliterator_index_t, 1, kh_script_lang_hash, kh_script_lang_equal)

typedef struct transliteration_table {
|
|
khash_t(str_transliterator) *transliterators;
|
|
|
|
khash_t(script_language_index) *script_languages;
|
|
cstring_array *transliterator_names;
|
|
|
|
step_array *steps;
|
|
trie_t *trie;
|
|
|
|
transliteration_replacement_array *replacements;
|
|
cstring_array *replacement_strings;
|
|
cstring_array *revisit_strings;
|
|
} transliteration_table_t;
|
|
|
|
// Control characters are special.
//
// Each marker is defined three ways: *_CHAR is a one-byte string literal,
// *_CODEPOINT is the numeric value of that byte, and *_CHAR_LEN is the byte
// length of the literal. The lengths use sizeof(literal) - 1, which is a
// compile-time constant of type size_t with the same value strlen would return
// (none of the literals contains an embedded NUL).
//
// NOTE(review): "\x86" and "\x87" are single raw bytes; the UTF-8 encodings of
// U+0086 / U+0087 are two bytes long. Confirm that the code comparing
// *_CODEPOINT against decoded input expects this.
#define WORD_BOUNDARY_CHAR "\x01"
#define WORD_BOUNDARY_CODEPOINT 1
#define WORD_BOUNDARY_CHAR_LEN (sizeof(WORD_BOUNDARY_CHAR) - 1)

#define PRE_CONTEXT_CHAR "\x86"
#define PRE_CONTEXT_CODEPOINT 134
#define PRE_CONTEXT_CHAR_LEN (sizeof(PRE_CONTEXT_CHAR) - 1)

#define POST_CONTEXT_CHAR "\x87"
#define POST_CONTEXT_CODEPOINT 135
#define POST_CONTEXT_CHAR_LEN (sizeof(POST_CONTEXT_CHAR) - 1)

#define EMPTY_TRANSITION_CHAR "\x04"
#define EMPTY_TRANSITION_CODEPOINT 4
#define EMPTY_TRANSITION_CHAR_LEN (sizeof(EMPTY_TRANSITION_CHAR) - 1)

#define REPEAT_CHAR "\x05"
#define REPEAT_CODEPOINT 5
#define REPEAT_CHAR_LEN (sizeof(REPEAT_CHAR) - 1)

#define GROUP_INDICATOR_CHAR "\x1d"
#define GROUP_INDICATOR_CODEPOINT 29
#define GROUP_INDICATOR_CHAR_LEN (sizeof(GROUP_INDICATOR_CHAR) - 1)

#define BEGIN_SET_CHAR "\x0f"
#define BEGIN_SET_CODEPOINT 15
#define BEGIN_SET_CHAR_LEN (sizeof(BEGIN_SET_CHAR) - 1)

#define END_SET_CHAR "\x0e"
#define END_SET_CODEPOINT 14
#define END_SET_CHAR_LEN (sizeof(END_SET_CHAR) - 1)

// ASCII code points of punctuation characters, by name.
// NOTE(review): presumably the metacharacters of the rule syntax -- confirm.
#define DOLLAR_CODEPOINT 36     // '$'

#define LPAREN_CODEPOINT 40     // '('
#define RPAREN_CODEPOINT 41     // ')'

#define STAR_CODEPOINT 42       // '*'
#define PLUS_CODEPOINT 43       // '+'

#define LSQUARE_CODEPOINT 91    // '['
#define BACKSLASH_CODEPOINT 92  // backslash
#define RSQUARE_CODEPOINT 93    // ']'

#define LCURLY_CODEPOINT 123    // '{'
#define RCURLY_CODEPOINT 125    // '}'

// Primary API
|
|
transliteration_table_t *get_transliteration_table(void);
|
|
|
|
transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length);
|
|
void transliterator_destroy(transliterator_t *self);
|
|
|
|
bool transliteration_table_add_transliterator(transliterator_t *trans);
|
|
|
|
// Looks up a transliterator by name.
// NOTE(review): presumably returns NULL when the name is unknown -- confirm.
transliterator_t *get_transliterator(char *name);

// Transliterates `str` (of length `len`) with the transliterator named `trans_name`.
// NOTE(review): presumably `len` is in bytes and the result is a newly allocated
// string the caller must free, or NULL on failure -- confirm against the implementation.
char *transliterate(char *trans_name, char *str, size_t len);

bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index);
|
|
transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language);
|
|
|
|
// Runs `code` once for each transliterator name registered for (script, language).
//
//   script, language   - forwarded once to get_transliterator_index_for_script_language
//   transliterator_var - assignable variable that receives each name in turn
//   code               - statement or braced block executed per name
//
// Iteration covers the returned range [transliterator_index,
// transliterator_index + num_transliterators) of the table's
// transliterator_names and stops early if a name is NULL. `code` runs inside a
// for loop, so `break` ends the iteration and `continue` moves to the next name.
// The locals __trans_table, __index and __i are visible to `code`.
//
// The expansion already ends with a semicolon, so call sites may omit their
// own. For the same reason the macro must not be used as the unbraced body of
// an if that has an else.
#define foreach_transliterator(script, language, transliterator_var, code) do {                                 \
    transliteration_table_t *__trans_table = get_transliteration_table();                                       \
    transliterator_index_t __index = get_transliterator_index_for_script_language(script, language);            \
    for (size_t __i = __index.transliterator_index;                                                             \
         __i < __index.transliterator_index + __index.num_transliterators;                                      \
         __i++) {                                                                                               \
        transliterator_var = cstring_array_get_string(__trans_table->transliterator_names, (uint32_t)__i);      \
        if (transliterator_var == NULL) break;                                                                  \
        code;                                                                                                   \
    }                                                                                                           \
} while (0);

// Writes the transliteration table to an open stream; returns a success flag.
// NOTE(review): the on-disk format and whether the stream must be opened in
// binary mode are not visible here -- confirm.
bool transliteration_table_write(FILE *file);

// Saves the transliteration table to the file named `filename`; returns a
// success flag.
bool transliteration_table_save(char *filename);

// Module setup/teardown

// Initializes the module without a data file; returns a success flag.
// NOTE(review): presumably creates an empty table for building data -- confirm.
bool transliteration_module_init(void);

// Sets the module up from the data file `filename`; returns a success flag.
// NOTE(review): presumably a NULL filename falls back to
// DEFAULT_TRANSLITERATION_PATH -- confirm.
bool transliteration_module_setup(char *filename);

// Tears the module down.
// NOTE(review): presumably frees the table returned by get_transliteration_table -- confirm.
void transliteration_module_teardown(void);

#endif
|