[normalize] Use cstring_array instead of string_tree for token-based normalization
This commit is contained in:
@@ -148,11 +148,12 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint64_t options) {
|
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options) {
|
||||||
size_t idx = 0;
|
size_t idx = 0;
|
||||||
|
|
||||||
uint8_t *ptr = (uint8_t *)str + token.offset;
|
uint8_t *ptr = (uint8_t *)str + token.offset;
|
||||||
size_t len = token.len;
|
size_t len = token.len;
|
||||||
|
if (token.len == 0) return;
|
||||||
|
|
||||||
int32_t ch;
|
int32_t ch;
|
||||||
ssize_t char_len;
|
ssize_t char_len;
|
||||||
@@ -160,9 +161,7 @@ void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint6
|
|||||||
bool last_was_letter = false;
|
bool last_was_letter = false;
|
||||||
bool append_char = true;
|
bool append_char = true;
|
||||||
|
|
||||||
cstring_array *array = tree->strings;
|
cstring_array_start_token(array);
|
||||||
|
|
||||||
size_t initial_n = array->str->n;
|
|
||||||
|
|
||||||
while (idx < len) {
|
while (idx < len) {
|
||||||
char_len = utf8proc_iterate(ptr, len, &ch);
|
char_len = utf8proc_iterate(ptr, len, &ch);
|
||||||
@@ -203,7 +202,7 @@ void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint6
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ch == APOSTROPHE_CODEPOINT && options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) {
|
if (ch == APOSTROPHE_CODEPOINT && token.type == WORD && options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) {
|
||||||
append_char = false;
|
append_char = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -219,7 +218,6 @@ void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint6
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (array->str->n > initial_n) {
|
cstring_array_terminate(array);
|
||||||
string_tree_finalize_token(tree);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ char *normalize_string_utf8(char *str, uint64_t options);
|
|||||||
char *normalize_string_latin(char *str, size_t len, uint64_t options);
|
char *normalize_string_latin(char *str, size_t len, uint64_t options);
|
||||||
|
|
||||||
// Takes NORMALIZE_TOKEN_* options
|
// Takes NORMALIZE_TOKEN_* options
|
||||||
void add_token_alternatives(string_tree_t *tree, char *str, token_t token, uint64_t options);
|
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options);
|
||||||
|
|
||||||
// Takes NORMALIZE_STRING_* options
|
// Takes NORMALIZE_STRING_* options
|
||||||
string_tree_t *normalize_string(char *str, uint64_t options);
|
string_tree_t *normalize_string(char *str, uint64_t options);
|
||||||
|
|||||||
Reference in New Issue
Block a user