[utils] cstring_array_get_token=>cstring_array_get_string
This commit is contained in:
@@ -614,7 +614,7 @@ inline int32_t cstring_array_get_offset(cstring_array *self, uint32_t i) {
|
|||||||
return (int32_t)self->indices->a[i];
|
return (int32_t)self->indices->a[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
inline char *cstring_array_get_token(cstring_array *self, uint32_t i) {
|
inline char *cstring_array_get_string(cstring_array *self, uint32_t i) {
|
||||||
int32_t data_index = cstring_array_get_offset(self, i);
|
int32_t data_index = cstring_array_get_offset(self, i);
|
||||||
if (data_index < 0) return NULL;
|
if (data_index < 0) return NULL;
|
||||||
return self->str->a + data_index;
|
return self->str->a + data_index;
|
||||||
@@ -822,7 +822,7 @@ char *string_tree_iterator_get_string(string_tree_iterator_t *self, uint32_t i)
|
|||||||
uint32_t base_index = self->tree->token_indices->a[i];
|
uint32_t base_index = self->tree->token_indices->a[i];
|
||||||
uint32_t offset = self->path[i];
|
uint32_t offset = self->path[i];
|
||||||
|
|
||||||
return cstring_array_get_token(self->tree->strings, base_index + offset);
|
return cstring_array_get_string(self->tree->strings, base_index + offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool string_tree_iterator_done(string_tree_iterator_t *self) {
|
bool string_tree_iterator_done(string_tree_iterator_t *self) {
|
||||||
|
|||||||
@@ -145,7 +145,7 @@ uint32_t cstring_array_add_string_len(cstring_array *self, char *str, size_t len
|
|||||||
void cstring_array_append_string(cstring_array *self, char *str);
|
void cstring_array_append_string(cstring_array *self, char *str);
|
||||||
void cstring_array_append_string_len(cstring_array *self, char *str, size_t len);
|
void cstring_array_append_string_len(cstring_array *self, char *str, size_t len);
|
||||||
int32_t cstring_array_get_offset(cstring_array *self, uint32_t i);
|
int32_t cstring_array_get_offset(cstring_array *self, uint32_t i);
|
||||||
char *cstring_array_get_token(cstring_array *self, uint32_t i);
|
char *cstring_array_get_string(cstring_array *self, uint32_t i);
|
||||||
int64_t cstring_array_token_length(cstring_array *self, uint32_t i);
|
int64_t cstring_array_token_length(cstring_array *self, uint32_t i);
|
||||||
|
|
||||||
void cstring_array_destroy(cstring_array *self);
|
void cstring_array_destroy(cstring_array *self);
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens)
|
|||||||
|
|
||||||
char *tokenized_string_get_token(tokenized_string_t *self, uint32_t index) {
|
char *tokenized_string_get_token(tokenized_string_t *self, uint32_t index) {
|
||||||
if (index < self->tokens->n) {
|
if (index < self->tokens->n) {
|
||||||
return cstring_array_get_token(self->str, index);
|
return cstring_array_get_string(self->str, index);
|
||||||
} else {
|
} else {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -611,7 +611,7 @@ static char *replace_groups(trie_t *trie, char *str, char *replacement, group_ca
|
|||||||
log_debug("in group ref\n");
|
log_debug("in group ref\n");
|
||||||
sscanf((char *)replacement_ptr, "%d", &group_ref);
|
sscanf((char *)replacement_ptr, "%d", &group_ref);
|
||||||
log_debug("Got group_ref=%d\n", group_ref);
|
log_debug("Got group_ref=%d\n", group_ref);
|
||||||
char *group = cstring_array_get_token(group_strings, group_ref-1);
|
char *group = cstring_array_get_string(group_strings, group_ref-1);
|
||||||
log_debug("Got group=%s\n", group);
|
log_debug("Got group=%s\n", group);
|
||||||
if (group != NULL) {
|
if (group != NULL) {
|
||||||
char_array_cat(ret, group);
|
char_array_cat(ret, group);
|
||||||
@@ -817,11 +817,11 @@ char *transliterate(char *trans_name, char *str, size_t len) {
|
|||||||
|
|
||||||
|
|
||||||
if (replacement != NULL) {
|
if (replacement != NULL) {
|
||||||
char *replacement_string = cstring_array_get_token(trans_table->replacement_strings, replacement->string_index);
|
char *replacement_string = cstring_array_get_string(trans_table->replacement_strings, replacement->string_index);
|
||||||
char *revisit_string = NULL;
|
char *revisit_string = NULL;
|
||||||
if (replacement->revisit_index != 0) {
|
if (replacement->revisit_index != 0) {
|
||||||
log_debug("revisit_index = %d\n", replacement->revisit_index);
|
log_debug("revisit_index = %d\n", replacement->revisit_index);
|
||||||
revisit_string = cstring_array_get_token(trans_table->revisit_strings, replacement->revisit_index);
|
revisit_string = cstring_array_get_string(trans_table->revisit_strings, replacement->revisit_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool free_revisit = false;
|
bool free_revisit = false;
|
||||||
@@ -936,17 +936,17 @@ char *transliterate(char *trans_name, char *str, size_t len) {
|
|||||||
|
|
||||||
} else if (step->type == STEP_UNICODE_NORMALIZATION) {
|
} else if (step->type == STEP_UNICODE_NORMALIZATION) {
|
||||||
log_debug("unicode normalization\n");
|
log_debug("unicode normalization\n");
|
||||||
int utf8proc_options = UTF8PROC_NULLTERM | UTF8PROC_STABLE;
|
int utf8proc_options = UTF8PROC_OPTIONS_BASE;
|
||||||
if (strcmp(step->name, NFD) == 0) {
|
if (strcmp(step->name, NFD) == 0) {
|
||||||
utf8proc_options = utf8proc_options | UTF8PROC_DECOMPOSE;
|
utf8proc_options = UTF8PROC_OPTIONS_NFD;
|
||||||
} else if (strcmp(step->name, NFC) == 0) {
|
} else if (strcmp(step->name, NFC) == 0) {
|
||||||
utf8proc_options = utf8proc_options | UTF8PROC_COMPOSE;
|
utf8proc_options = UTF8PROC_OPTIONS_NFC;
|
||||||
} else if (strcmp(step->name, NFKD) == 0) {
|
} else if (strcmp(step->name, NFKD) == 0) {
|
||||||
utf8proc_options = utf8proc_options | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT;
|
utf8proc_options = UTF8PROC_OPTIONS_NFKD;
|
||||||
} else if (strcmp(step->name, NFKC) == 0) {
|
} else if (strcmp(step->name, NFKC) == 0) {
|
||||||
utf8proc_options = utf8proc_options | UTF8PROC_COMPOSE | UTF8PROC_COMPAT;
|
utf8proc_options = UTF8PROC_OPTIONS_NKFC;
|
||||||
} else if (strcmp(step->name, STRIP_MARK) == 0) {
|
} else if (strcmp(step->name, STRIP_MARK) == 0) {
|
||||||
utf8proc_options = utf8proc_options | UTF8PROC_STRIPMARK;
|
utf8proc_options = UTF8PROC_OPTIONS_STRIP_ACCENTS;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t *utf8proc_normalized = NULL;
|
uint8_t *utf8proc_normalized = NULL;
|
||||||
@@ -1199,7 +1199,7 @@ char *transliterator_replace_strings(trie_t *trie, cstring_array *replacements,
|
|||||||
phrase = phrases->a[i];
|
phrase = phrases->a[i];
|
||||||
end = phrase.start;
|
end = phrase.start;
|
||||||
char_array_append_len(str, input + start, end - start);
|
char_array_append_len(str, input + start, end - start);
|
||||||
char_array_append(str, cstring_array_get_token(replacements, phrase.data));
|
char_array_append(str, cstring_array_get_string(replacements, phrase.data));
|
||||||
start = phrase.start + phrase.len;
|
start = phrase.start + phrase.len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,8 @@
|
|||||||
#include "trie_search.h"
|
#include "trie_search.h"
|
||||||
#include "unicode_scripts.h"
|
#include "unicode_scripts.h"
|
||||||
|
|
||||||
|
#define LATIN_ASCII "latin-ascii"
|
||||||
|
|
||||||
#define DEFAULT_TRANSLITERATION_PATH "../data/transliteration/transliteration.dat"
|
#define DEFAULT_TRANSLITERATION_PATH "../data/transliteration/transliteration.dat"
|
||||||
|
|
||||||
#define MAX_TRANS_NAME_LEN 100
|
#define MAX_TRANS_NAME_LEN 100
|
||||||
@@ -160,7 +162,7 @@ transliterator_index_t get_transliterator_index_for_script_language(script_t scr
|
|||||||
#define foreach_transliterator(script, language, transliterator_var, code) do { \
|
#define foreach_transliterator(script, language, transliterator_var, code) do { \
|
||||||
transliterator_index_t __index = get_transliterator_index_for_script_language(script, language); \
|
transliterator_index_t __index = get_transliterator_index_for_script_language(script, language); \
|
||||||
for (int __i = __index.transliterator_index; __i < __index.transliterator_index + __index.num_transliterators; __i++) { \
|
for (int __i = __index.transliterator_index; __i < __index.transliterator_index + __index.num_transliterators; __i++) { \
|
||||||
transliterator_var = cstring_array_get_token(trans_table->transliterator_names, __i); \
|
transliterator_var = cstring_array_get_string(trans_table->transliterator_names, __i); \
|
||||||
if (transliterator_var == NULL) break; \
|
if (transliterator_var == NULL) break; \
|
||||||
code; \
|
code; \
|
||||||
} \
|
} \
|
||||||
|
|||||||
@@ -457,7 +457,7 @@ int main(int argc, char **argv) {
|
|||||||
for (ante = 0; ante < num_pre_context_strings; ante++) {
|
for (ante = 0; ante < num_pre_context_strings; ante++) {
|
||||||
char_array_clear(context);
|
char_array_clear(context);
|
||||||
|
|
||||||
token = cstring_array_get_token(pre_context_strings, ante);
|
token = cstring_array_get_string(pre_context_strings, ante);
|
||||||
if (token == NULL || strlen(token) == 0) {
|
if (token == NULL || strlen(token) == 0) {
|
||||||
log_error("pre_context token was NULL or 0 length\n");
|
log_error("pre_context token was NULL or 0 length\n");
|
||||||
goto exit_teardown;
|
goto exit_teardown;
|
||||||
@@ -469,7 +469,7 @@ int main(int argc, char **argv) {
|
|||||||
for (post = 0; post < num_post_context_strings; post++) {
|
for (post = 0; post < num_post_context_strings; post++) {
|
||||||
context->n = context_len;
|
context->n = context_len;
|
||||||
char_array_cat(context, POST_CONTEXT_CHAR);
|
char_array_cat(context, POST_CONTEXT_CHAR);
|
||||||
token = cstring_array_get_token(post_context_strings, post);
|
token = cstring_array_get_string(post_context_strings, post);
|
||||||
char_array_cat(context, token);
|
char_array_cat(context, token);
|
||||||
if (token == NULL || strlen(token) == 0) {
|
if (token == NULL || strlen(token) == 0) {
|
||||||
log_error("post_context token was NULL or 0 length\n");
|
log_error("post_context token was NULL or 0 length\n");
|
||||||
@@ -542,7 +542,7 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
for (c = 0; c < num_context_strings; c++) {
|
for (c = 0; c < num_context_strings; c++) {
|
||||||
rule_key->n = context_key_len;
|
rule_key->n = context_key_len;
|
||||||
token = cstring_array_get_token(context_strings, c);
|
token = cstring_array_get_string(context_strings, c);
|
||||||
if (token == NULL) {
|
if (token == NULL) {
|
||||||
log_error("token was NULL for c=%d\n", c);
|
log_error("token was NULL for c=%d\n", c);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user