diff --git a/src/string_utils.c b/src/string_utils.c index d7b8840b..b9ad7d4e 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -614,7 +614,7 @@ inline int32_t cstring_array_get_offset(cstring_array *self, uint32_t i) { return (int32_t)self->indices->a[i]; } -inline char *cstring_array_get_token(cstring_array *self, uint32_t i) { +inline char *cstring_array_get_string(cstring_array *self, uint32_t i) { int32_t data_index = cstring_array_get_offset(self, i); if (data_index < 0) return NULL; return self->str->a + data_index; @@ -822,7 +822,7 @@ char *string_tree_iterator_get_string(string_tree_iterator_t *self, uint32_t i) uint32_t base_index = self->tree->token_indices->a[i]; uint32_t offset = self->path[i]; - return cstring_array_get_token(self->tree->strings, base_index + offset); + return cstring_array_get_string(self->tree->strings, base_index + offset); } bool string_tree_iterator_done(string_tree_iterator_t *self) { diff --git a/src/string_utils.h b/src/string_utils.h index f9ef0ea6..90d78df1 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -145,7 +145,7 @@ uint32_t cstring_array_add_string_len(cstring_array *self, char *str, size_t len void cstring_array_append_string(cstring_array *self, char *str); void cstring_array_append_string_len(cstring_array *self, char *str, size_t len); int32_t cstring_array_get_offset(cstring_array *self, uint32_t i); -char *cstring_array_get_token(cstring_array *self, uint32_t i); +char *cstring_array_get_string(cstring_array *self, uint32_t i); int64_t cstring_array_token_length(cstring_array *self, uint32_t i); void cstring_array_destroy(cstring_array *self); diff --git a/src/tokens.c b/src/tokens.c index e3543fdd..f0bdc135 100644 --- a/src/tokens.c +++ b/src/tokens.c @@ -37,7 +37,7 @@ tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens) char *tokenized_string_get_token(tokenized_string_t *self, uint32_t index) { if (index < self->tokens->n) { - return cstring_array_get_token(self->str, index); + return cstring_array_get_string(self->str, index); } else { return NULL; } diff --git a/src/transliterate.c b/src/transliterate.c index 32198c7e..d097029b 100644 --- a/src/transliterate.c +++ b/src/transliterate.c @@ -611,7 +611,7 @@ static char *replace_groups(trie_t *trie, char *str, char *replacement, group_ca log_debug("in group ref\n"); sscanf((char *)replacement_ptr, "%d", &group_ref); log_debug("Got group_ref=%d\n", group_ref); - char *group = cstring_array_get_token(group_strings, group_ref-1); + char *group = cstring_array_get_string(group_strings, group_ref-1); log_debug("Got group=%s\n", group); if (group != NULL) { char_array_cat(ret, group); @@ -817,11 +817,11 @@ char *transliterate(char *trans_name, char *str, size_t len) { if (replacement != NULL) { - char *replacement_string = cstring_array_get_token(trans_table->replacement_strings, replacement->string_index); + char *replacement_string = cstring_array_get_string(trans_table->replacement_strings, replacement->string_index); char *revisit_string = NULL; if (replacement->revisit_index != 0) { log_debug("revisit_index = %d\n", replacement->revisit_index); - revisit_string = cstring_array_get_token(trans_table->revisit_strings, replacement->revisit_index); + revisit_string = cstring_array_get_string(trans_table->revisit_strings, replacement->revisit_index); } bool free_revisit = false; @@ -936,17 +936,17 @@ char *transliterate(char *trans_name, char *str, size_t len) { } else if (step->type == STEP_UNICODE_NORMALIZATION) { log_debug("unicode normalization\n"); - int utf8proc_options = UTF8PROC_NULLTERM | UTF8PROC_STABLE; + int utf8proc_options = UTF8PROC_OPTIONS_BASE; if (strcmp(step->name, NFD) == 0) { - utf8proc_options = utf8proc_options | UTF8PROC_DECOMPOSE; + utf8proc_options = UTF8PROC_OPTIONS_NFD; } else if (strcmp(step->name, NFC) == 0) { - utf8proc_options = utf8proc_options | UTF8PROC_COMPOSE; + utf8proc_options = UTF8PROC_OPTIONS_NFC; } else if (strcmp(step->name, NFKD) == 0) { - utf8proc_options = utf8proc_options | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT; + utf8proc_options = UTF8PROC_OPTIONS_NFKD; } else if (strcmp(step->name, NFKC) == 0) { - utf8proc_options = utf8proc_options | UTF8PROC_COMPOSE | UTF8PROC_COMPAT; + utf8proc_options = UTF8PROC_OPTIONS_NKFC; } else if (strcmp(step->name, STRIP_MARK) == 0) { - utf8proc_options = utf8proc_options | UTF8PROC_STRIPMARK; + utf8proc_options = UTF8PROC_OPTIONS_STRIP_ACCENTS; } uint8_t *utf8proc_normalized = NULL; @@ -1199,7 +1199,7 @@ char *transliterator_replace_strings(trie_t *trie, cstring_array *replacements, phrase = phrases->a[i]; end = phrase.start; char_array_append_len(str, input + start, end - start); - char_array_append(str, cstring_array_get_token(replacements, phrase.data)); + char_array_append(str, cstring_array_get_string(replacements, phrase.data)); start = phrase.start + phrase.len; } diff --git a/src/transliterate.h b/src/transliterate.h index 3dcc6778..f3c07950 100644 --- a/src/transliterate.h +++ b/src/transliterate.h @@ -12,6 +12,8 @@ #include "trie_search.h" #include "unicode_scripts.h" +#define LATIN_ASCII "latin-ascii" + #define DEFAULT_TRANSLITERATION_PATH "../data/transliteration/transliteration.dat" #define MAX_TRANS_NAME_LEN 100 @@ -160,7 +162,7 @@ transliterator_index_t get_transliterator_index_for_script_language(script_t scr #define foreach_transliterator(script, language, transliterator_var, code) do { \ transliterator_index_t __index = get_transliterator_index_for_script_language(script, language); \ for (int __i = __index.transliterator_index; __i < __index.transliterator_index + __index.num_transliterators; __i++) { \ - transliterator_var = cstring_array_get_token(trans_table->transliterator_names, __i); \ + transliterator_var = cstring_array_get_string(trans_table->transliterator_names, __i); \ if (transliterator_var == NULL) break; \ code; \ } \ diff --git a/src/transliteration_table_builder.c b/src/transliteration_table_builder.c index 6bde224b..9a5f498c 100644 --- a/src/transliteration_table_builder.c +++ b/src/transliteration_table_builder.c @@ -457,7 +457,7 @@ int main(int argc, char **argv) { for (ante = 0; ante < num_pre_context_strings; ante++) { char_array_clear(context); - token = cstring_array_get_token(pre_context_strings, ante); + token = cstring_array_get_string(pre_context_strings, ante); if (token == NULL || strlen(token) == 0) { log_error("pre_context token was NULL or 0 length\n"); goto exit_teardown; @@ -469,7 +469,7 @@ int main(int argc, char **argv) { for (post = 0; post < num_post_context_strings; post++) { context->n = context_len; char_array_cat(context, POST_CONTEXT_CHAR); - token = cstring_array_get_token(post_context_strings, post); + token = cstring_array_get_string(post_context_strings, post); char_array_cat(context, token); if (token == NULL || strlen(token) == 0) { log_error("post_context token was NULL or 0 length\n"); @@ -542,7 +542,7 @@ int main(int argc, char **argv) { for (c = 0; c < num_context_strings; c++) { rule_key->n = context_key_len; - token = cstring_array_get_token(context_strings, c); + token = cstring_array_get_string(context_strings, c); if (token == NULL) { log_error("token was NULL for c=%d\n", c); }