[fix] terminate the char_array if input token is zero-length in add_normalized_token

This commit is contained in:
Al
2017-04-28 11:25:07 -04:00
parent 04eb2d4539
commit 6ea2273263

View File

@@ -388,94 +388,95 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
uint8_t *ptr = (uint8_t *)str + token.offset; uint8_t *ptr = (uint8_t *)str + token.offset;
size_t len = token.len; size_t len = token.len;
if (token.len == 0) return; if (token.len > 0) {
bool alpha_numeric_split = false; bool alpha_numeric_split = false;
char *append_if_not_numeric = NULL; char *append_if_not_numeric = NULL;
int32_t ch; int32_t ch;
ssize_t char_len; ssize_t char_len;
bool last_was_letter = false; bool last_was_letter = false;
bool append_char = true; bool append_char = true;
while (idx < len) { while (idx < len) {
char_len = utf8proc_iterate(ptr, len, &ch); char_len = utf8proc_iterate(ptr, len, &ch);
if (char_len <= 0) break; if (char_len <= 0) break;
bool is_hyphen = utf8_is_hyphen(ch); bool is_hyphen = utf8_is_hyphen(ch);
int cat = utf8proc_category(ch); int cat = utf8proc_category(ch);
bool is_letter = utf8_is_letter(cat); bool is_letter = utf8_is_letter(cat);
bool is_number = utf8_is_number(cat); bool is_number = utf8_is_number(cat);
bool is_full_stop = ch == FULL_STOP_CODEPOINT; bool is_full_stop = ch == FULL_STOP_CODEPOINT;
if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) { if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) {
char_array_append(array, " "); char_array_append(array, " ");
append_char = false; append_char = false;
} else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) { } else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
append_char = false;
}
if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) {
ptr += char_len;
idx += char_len;
append_if_not_numeric = is_hyphen ? "-" : ".";
append_char = true;
continue;
}
if (!is_number && append_if_not_numeric != NULL) {
char_array_append(array, append_if_not_numeric);
append_if_not_numeric = NULL;
}
if (is_number && options & NORMALIZE_TOKEN_REPLACE_DIGITS) {
char_array_append(array, DIGIT_CHAR);
append_char = false;
}
if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) {
char_array_append(array, " ");
alpha_numeric_split = true;
}
if (is_full_stop) {
if (options & NORMALIZE_TOKEN_DELETE_FINAL_PERIOD && idx == len - 1) {
break;
}
if (token.type == ACRONYM && options & NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) {
append_char = false; append_char = false;
} }
}
if (idx == len - 2 && len > 2 && options & NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES) { if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) {
char this_char = *ptr; ptr += char_len;
char next_char = *(ptr + 1); idx += char_len;
append_if_not_numeric = is_hyphen ? "-" : ".";
if ((this_char == '\'' && next_char == 's') || (this_char == 's' && next_char == '\'')) { append_char = true;
char_array_append(array, "s"); continue;
break;
} }
if (!is_number && append_if_not_numeric != NULL) {
char_array_append(array, append_if_not_numeric);
append_if_not_numeric = NULL;
}
if (is_number && options & NORMALIZE_TOKEN_REPLACE_DIGITS) {
char_array_append(array, DIGIT_CHAR);
append_char = false;
}
if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) {
char_array_append(array, " ");
alpha_numeric_split = true;
}
if (is_full_stop) {
if (options & NORMALIZE_TOKEN_DELETE_FINAL_PERIOD && idx == len - 1) {
break;
}
if (token.type == ACRONYM && options & NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) {
append_char = false;
}
}
if (idx == len - 2 && len > 2 && options & NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES) {
char this_char = *ptr;
char next_char = *(ptr + 1);
if ((this_char == '\'' && next_char == 's') || (this_char == 's' && next_char == '\'')) {
char_array_append(array, "s");
break;
}
}
if (ch == APOSTROPHE_CODEPOINT && token.type == WORD && options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) {
append_char = false;
}
if (append_char) {
char_array_append_len(array, (char *)ptr, char_len);
}
ptr += char_len;
idx += char_len;
append_char = true;
last_was_letter = is_letter;
} }
if (ch == APOSTROPHE_CODEPOINT && token.type == WORD && options & NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) {
append_char = false;
}
if (append_char) {
char_array_append_len(array, (char *)ptr, char_len);
}
ptr += char_len;
idx += char_len;
append_char = true;
last_was_letter = is_letter;
} }
char_array_terminate(array); char_array_terminate(array);