[normalize] Adding a char_array version of normalize token
This commit is contained in:
@@ -147,8 +147,7 @@ string_tree_t *normalize_string(char *str, uint64_t options) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void append_normalized_token(char_array *array, char *str, token_t token, uint64_t options) {
|
||||||
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options) {
|
|
||||||
size_t idx = 0;
|
size_t idx = 0;
|
||||||
|
|
||||||
uint8_t *ptr = (uint8_t *)str + token.offset;
|
uint8_t *ptr = (uint8_t *)str + token.offset;
|
||||||
@@ -164,8 +163,6 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
|
|||||||
bool last_was_letter = false;
|
bool last_was_letter = false;
|
||||||
bool append_char = true;
|
bool append_char = true;
|
||||||
|
|
||||||
cstring_array_start_token(array);
|
|
||||||
|
|
||||||
while (idx < len) {
|
while (idx < len) {
|
||||||
char_len = utf8proc_iterate(ptr, len, &ch);
|
char_len = utf8proc_iterate(ptr, len, &ch);
|
||||||
|
|
||||||
@@ -180,7 +177,7 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
|
|||||||
bool is_full_stop = ch == FULL_STOP_CODEPOINT;
|
bool is_full_stop = ch == FULL_STOP_CODEPOINT;
|
||||||
|
|
||||||
if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) {
|
if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) {
|
||||||
cstring_array_append_string(array, " ");
|
char_array_append(array, " ");
|
||||||
append_char = false;
|
append_char = false;
|
||||||
} else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
|
} else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
|
||||||
append_char = false;
|
append_char = false;
|
||||||
@@ -195,12 +192,12 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!is_number && append_if_not_numeric != NULL) {
|
if (!is_number && append_if_not_numeric != NULL) {
|
||||||
cstring_array_append_string(array, append_if_not_numeric);
|
char_array_append(array, append_if_not_numeric);
|
||||||
append_if_not_numeric = NULL;
|
append_if_not_numeric = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) {
|
if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) {
|
||||||
cstring_array_append_string(array, " ");
|
char_array_append(array, " ");
|
||||||
alpha_numeric_split = true;
|
alpha_numeric_split = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -221,7 +218,7 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
|
|||||||
if (this_char == '\'' && next_char == 's') {
|
if (this_char == '\'' && next_char == 's') {
|
||||||
break;
|
break;
|
||||||
} else if (this_char == 's' && next_char == '\'') {
|
} else if (this_char == 's' && next_char == '\'') {
|
||||||
cstring_array_append_string(array, "s");
|
char_array_append(array, "s");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -231,7 +228,7 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (append_char) {
|
if (append_char) {
|
||||||
cstring_array_append_string_len(array, (char *)ptr, char_len);
|
char_array_append_len(array, (char *)ptr, char_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
ptr += char_len;
|
ptr += char_len;
|
||||||
@@ -242,6 +239,14 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options) {
|
||||||
|
|
||||||
|
cstring_array_start_token(array);
|
||||||
|
|
||||||
|
append_normalized_token(array->str, str, token, options);
|
||||||
|
|
||||||
cstring_array_terminate(array);
|
cstring_array_terminate(array);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,6 +58,7 @@ char *normalize_string_utf8(char *str, uint64_t options);
|
|||||||
char *normalize_string_latin(char *str, size_t len, uint64_t options);
|
char *normalize_string_latin(char *str, size_t len, uint64_t options);
|
||||||
|
|
||||||
// Takes NORMALIZE_TOKEN_* options
|
// Takes NORMALIZE_TOKEN_* options
|
||||||
|
void append_normalized_token(char_array *array, char *str, token_t token, uint64_t options);
|
||||||
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options);
|
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options);
|
||||||
|
|
||||||
// Takes NORMALIZE_STRING_* options
|
// Takes NORMALIZE_STRING_* options
|
||||||
|
|||||||
Reference in New Issue
Block a user