[normalize] cstring_array instead of string_tree for token-based normalization

This commit is contained in:
Al
2015-07-28 19:09:50 -04:00
parent 90d4da9e72
commit 551904d202
2 changed files with 7 additions and 9 deletions

View File

@@ -148,11 +148,12 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
}
void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint64_t options) {
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options) {
size_t idx = 0;
uint8_t *ptr = (uint8_t *)str + token.offset;
size_t len = token.len;
if (token.len == 0) return;
int32_t ch;
ssize_t char_len;
@@ -160,9 +161,7 @@ void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint6
bool last_was_letter = false;
bool append_char = true;
cstring_array *array = tree->strings;
size_t initial_n = array->str->n;
cstring_array_start_token(array);
while (idx < len) {
char_len = utf8proc_iterate(ptr, len, &ch);
@@ -203,7 +202,7 @@ void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint6
}
}
if (ch == APOSTROPHE_CODEPOINT && options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) {
if (ch == APOSTROPHE_CODEPOINT && token.type == WORD && options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) {
append_char = false;
}
@@ -219,7 +218,6 @@ void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint6
}
if (array->str->n > initial_n) {
string_tree_finalize_token(tree);
}
cstring_array_terminate(array);
}

View File

@@ -58,7 +58,7 @@ char *normalize_string_utf8(char *str, uint64_t options);
char *normalize_string_latin(char *str, size_t len, uint64_t options);
// Takes NORMALIZE_TOKEN_* options
void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint64_t options);
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options);
// Takes NORMALIZE_STRING_* options
string_tree_t *normalize_string(char *str, uint64_t options);