From 70195fffd5dcd8d152ead538d57f34901ae7080d Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 27 Mar 2015 20:55:36 -0400
Subject: [PATCH] [utils] new methods on string_utils for better dynamic strings which retains the benefits of sds without having to worry about the pointer changing, renaming contiguous string array methods to something more succinct

---
Review notes (changes relative to the submission as received):

 - utf8proc_iterate_reversed: len is now initialized to 0; it was
   incremented and passed to utf8proc_iterate without ever being set.
 - string_strip_whitespace: the rewritten leading-whitespace scan is
   dropped and the existing loop kept. The rewrite advanced ptr past the
   first non-space character (reading beyond the terminator of an empty
   string), never used initial_spaces/ending_spaces, and returned str
   with its leading whitespace still in place, contradicting the new
   comment above the function.
 - char_array_from_string: returns the array (the function fell off the
   end without a return) and checks the allocation.
 - char_array_to_string: reads array->a before free(array).
 - char_array_cat_printf: every va_copy is paired with va_end, the early
   return also ends args, the NULL check tests array->a rather than the
   offset pointer, and the unused local arg is removed.
 - cstring_array_new: str is set to NULL before the first failure path so
   cstring_array_destroy never reads an uninitialized member.
 - cstring_array_new_size, cstring_array_from_char_array,
   cstring_array_split: allocation results are checked before use.
 - cstring_array_from_char_array: loop bound rewritten as i + 1 < n.
 - cstring_array_get_token: returns NULL for an out-of-range index
   instead of str->a - 1.
 - cstring_array_split: *count is set to the number of tokens, a zero
   separator_len no longer loops forever, strncmp replaces memcmp, and
   the unused local index is removed.

Not addressed here: string_utils.h declares char_array_cat_unterminated,
char_array_cat_unterminated_len and cstring_array_join_strings, none of
which are defined in this patch, and does not declare char_array_add,
char_array_add_len, cstring_array_start_token or
utf8proc_iterate_reversed. Bare "#include" lines reached review with
their targets already missing and are left untouched. Hunk headers and
the diffstat are recomputed for the changed hunks; the index blob ids
are the original ones.

 src/string_utils.c | 292 +++++++++++++++++++++++++++++++++++++++------
 src/string_utils.h |  67 +++++++++--
 src/tokens.c       |  32 ++++--
 src/tokens.h       |   8 +-
 4 files changed, 346 insertions(+), 53 deletions(-)

diff --git a/src/string_utils.c b/src/string_utils.c
index 45bcd16b..6438a03c 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -1,8 +1,8 @@
-#include "string_utils.h"
 #include
+#include "string_utils.h"
 
 
-#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n) - 1)
+#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n))
 
 int string_compare_case_insensitive(const char *str1, const char *str2) {
     int c1, c2;
@@ -80,9 +80,9 @@ void string_lower(char *s) {
     for (; *s; ++s) *s = tolower(*s);
 }
 
-uint string_translate(sds str, char *word_chars, char *word_repls, size_t trans_len) {
+uint string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) {
     uint num_replacements = 0;
-    size_t len = sdslen(str);
+
     for (int i = 0; i < len; i++) {
         for (int j = 0; j < trans_len; j++) {
             if (str[i] == word_chars[j]) {
@@ -95,6 +95,21 @@ uint string_translate(sds str, char *word_chars, char *word_repls, size_t trans_
     return num_replacements;
 }
 
+ssize_t utf8proc_iterate_reversed(const uint8_t *str, const uint8_t *start, int32_t *dst) {
+    ssize_t len = 0; // bytes stepped back so far; must start at zero before len++ below
+
+    const uint8_t *ptr = str;
+
+    *dst = -1;
+
+    do {
+        if (ptr <= start) return 0;
+        ptr--; len++;
+    } while ((*ptr & 0xC0) == 0x80);
+
+    return utf8proc_iterate(ptr, len, dst);
+}
+
 char *utf8_reversed_string(const char *s) {
     int32_t unich;
     ssize_t len, remaining;
@@ -136,6 +151,15 @@ bool utf8_is_letter(int32_t ch) {
 }
 
 
+/* Caution: this function does not make a copy of str. Keep original pointer and free that, e.g.
+
+char *str = strdup("foobar");
+// Use stripped for comparison, etc. but copy the string if you need to keep a pointer to it
+char *stripped = string_strip_whitespace(str);
+// Only free the original pointer to str
+free(str);
+*/
+
 char *string_strip_whitespace(char *str) {
     char *end;
 
@@ -152,47 +176,253 @@ char *string_strip_whitespace(char *str) {
     return str;
 }
 
-void contiguous_string_array_add_string_unterminated(char_array *array, char *str) {
-    while (*str) {
+/* Wraps an existing string in a char_array without copying it.
+   Returns NULL if the char_array could not be allocated. */
+char_array *char_array_from_string(char *str) {
+    char_array *array = char_array_new();
+    if (array == NULL) return NULL;
+    array->a = str;
+    array->m = array->n = strlen(str);
+    return array;
+}
+
+/* Returns the underlying buffer, optionally freeing the array struct itself. */
+char *char_array_to_string(char_array *array, bool free_array) {
+    char *str = array->a; // read before the struct is released
+    if (free_array) free(array);
+    return str;
+}
+
+static inline void char_array_strip_nul_byte(char_array *array) {
+    if (array->n > 0 && array->a[array->n - 1] == '\0') {
+        array->n--;
+    }
+}
+
+void char_array_append(char_array *array, char *str) {
+    while(*str) {
         char_array_push(array, *str++);
     }
 }
 
-void contiguous_string_array_add_string(char_array *array, char *str) {
-    contiguous_string_array_add_string_unterminated(array, str);
-    char_array_push(array, '\0');
-}
-
-void contiguous_string_array_add_string_unterminated_len(char_array *array, char *str, size_t len) {
-    for (int i = 0; i < len; i++) {
+void char_array_append_len(char_array *array, char *str, size_t len) {
+    for (size_t i = 0; i < len; i++) {
         char_array_push(array, *str++);
     }
 }
 
-void contiguous_string_array_add_string_len(char_array *array, char *str, size_t len) {
-    contiguous_string_array_add_string_unterminated_len(array, str, len);
+void char_array_terminate(char_array *array) {
     char_array_push(array, '\0');
 }
 
 
-// Designed for using the char_array and uchar_array to store lots of short strings
-int contiguous_string_array_next_index(char_array *string_array, int i, size_t n) {
-    if (INVALID_INDEX(i, string_array->n)) {
-        return -1;
+void char_array_cat(char_array *array, char *str) {
+    char_array_strip_nul_byte(array);
+    char_array_append(array, str);
+    char_array_terminate(array);
+}
+
+void char_array_cat_len(char_array *array, char *str, size_t len) {
+    char_array_strip_nul_byte(array);
+    char_array_append_len(array, str, len);
+    char_array_terminate(array);
+}
+
+void char_array_add(char_array *array, char *str) {
+    char_array_append(array, str);
+    char_array_terminate(array);
+}
+
+void char_array_add_len(char_array *array, char *str, size_t len) {
+    char_array_append_len(array, str, len);
+    char_array_terminate(array);
+}
+
+
+static void vchar_array_append_joined(char_array *array, char *separator, int count, va_list args) {
+    if (count <= 0) {
+        return;
     }
 
-    int len = 0;
-    char *array = string_array->a + i;
-
-
-    while (*array && i + len <= n - 1) {
-        array++;
-        len++;
+    for (size_t i = 0; i < count - 1; i++) {
+        char *arg = va_arg(args, char *);
+        char_array_append(array, arg);
+        char_array_append(array, separator);
     }
 
-    if (len < n - 1) {
-        return len + 1;
-    }
-
-    return -1;
+    char *arg = va_arg(args, char *);
+    char_array_append(array, arg);
+    char_array_terminate(array);
 }
+
+void char_array_add_joined(char_array *array, char *separator, int count, ...) {
+    va_list args;
+    va_start(args, count);
+    vchar_array_append_joined(array, separator, count, args);
+    va_end(args);
+}
+
+void char_array_cat_joined(char_array *array, char *separator, int count, ...) {
+    char_array_strip_nul_byte(array);
+    va_list args;
+    va_start(args, count);
+    vchar_array_append_joined(array, separator, count, args);
+    va_end(args);
+}
+
+// Based on antirez's sdscatvprintf implementation
+void char_array_cat_printf(char_array *array, char *format, ...) {
+    va_list args;
+    va_start(args, format);
+
+    char_array_strip_nul_byte(array);
+
+    va_list cpy;
+
+    char *buf;
+    size_t buflen;
+
+    size_t last_n = array->n;
+    size_t size = array->m < 8 ? 16 : array->m * 2;
+
+    while(1) {
+        char_array_resize(array, size);
+        if (array->a == NULL) { // presumably resize leaves a NULL buffer on failure - TODO confirm
+            va_end(args);
+            return;
+        }
+        buf = array->a + last_n;
+        buflen = size - last_n;
+        array->a[size-2] = '\0';
+        va_copy(cpy, args);
+        vsnprintf(buf, buflen, format, cpy);
+        va_end(cpy);
+        if (array->a[size-2] != '\0') {
+            size *= 2;
+            continue;
+        } else {
+            array->n += strlen(buf);
+        }
+        break;
+    }
+
+    va_end(args);
+}
+
+cstring_array_t *cstring_array_new(void) {
+    cstring_array_t *array = malloc(sizeof(cstring_array_t));
+    if (array == NULL) return NULL;
+
+    // Clear str first so cstring_array_destroy never sees an uninitialized pointer
+    array->str = NULL;
+    array->indices = uint32_array_new();
+    if (array->indices == NULL) {
+        cstring_array_destroy(array);
+        return NULL;
+    }
+
+    array->str = char_array_new();
+    if (array->str == NULL) {
+        cstring_array_destroy(array);
+        return NULL;
+    }
+
+    return array;
+}
+
+
+void cstring_array_destroy(cstring_array_t *self) {
+    if (self == NULL) return;
+    if (self->indices) {
+        uint32_array_destroy(self->indices);
+    }
+    if (self->str) {
+        char_array_destroy(self->str);
+    }
+    free(self);
+}
+
+cstring_array_t *cstring_array_new_size(size_t size) {
+    cstring_array_t *array = cstring_array_new();
+    if (array == NULL) return NULL;
+    char_array_resize(array->str, size);
+    return array;
+}
+
+cstring_array_t *cstring_array_from_char_array(char_array *str) {
+    cstring_array_t *array = malloc(sizeof(cstring_array_t));
+    if (array == NULL) return NULL;
+
+    array->str = str;
+    array->indices = uint32_array_new_size(1);
+    if (array->indices == NULL) {
+        free(array); // str still belongs to the caller
+        return NULL;
+    }
+    uint32_array_push(array->indices, 0);
+    char *ptr = str->a;
+    uint32_t i = 0;
+    // i + 1 < n avoids the n - 1 wraparound on an empty array (n looks unsigned: it is assigned from strlen above)
+    for (i = 0; i + 1 < str->n; i++, ptr++) {
+        if (*ptr == '\0') {
+            uint32_array_push(array->indices, i + 1);
+        }
+    }
+    return array;
+}
+
+void cstring_array_start_token(cstring_array_t *self) {
+    uint32_array_push(self->indices, self->str->n);
+}
+
+void cstring_array_add_string(cstring_array_t *self, char *str) {
+    cstring_array_start_token(self);
+    char_array_append(self->str, str);
+    char_array_terminate(self->str);
+}
+
+void cstring_array_add_string_len(cstring_array_t *self, char *str, size_t len) {
+    cstring_array_start_token(self);
+    char_array_append_len(self->str, str, len);
+    char_array_terminate(self->str);
+}
+
+int32_t cstring_array_get_offset(cstring_array_t *self, uint32_t i) {
+    if (INVALID_INDEX(i, self->indices->n)) {
+        return -1;
+    }
+    return (int32_t)self->indices->a[i];
+}
+
+char *cstring_array_get_token(cstring_array_t *self, uint32_t i) {
+    int32_t data_index = cstring_array_get_offset(self, i);
+    if (data_index < 0) return NULL; // out-of-range index
+    return self->str->a + data_index;
+}
+
+/* Splits str on separator into a cstring_array; *count receives the number of tokens. */
+cstring_array_t *cstring_array_split(char *str, const char *separator, size_t separator_len, int *count) {
+    *count = 0;
+    char_array *array = char_array_new_size(strlen(str));
+    if (array == NULL) return NULL;
+
+    while (*str) {
+        // strncmp stops at str's terminator, so a separator longer than the remaining input is never read past the end
+        if (separator_len > 0 && strncmp(str, separator, separator_len) == 0) {
+            char_array_push(array, '\0');
+            str += separator_len;
+        } else {
+            char_array_push(array, *str);
+            str++;
+        }
+    }
+    char_array_push(array, '\0');
+
+    cstring_array_t *result = cstring_array_from_char_array(array);
+    if (result == NULL) {
+        char_array_destroy(array);
+        return NULL;
+    }
+    *count = (int)result->indices->n;
+    return result;
+}
diff --git a/src/string_utils.h b/src/string_utils.h
index 720cdcc4..ccd88365 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -8,14 +8,12 @@ extern "C" {
 #include
 #include
 #include
+#include
 #include "collections.h"
 #include "sds/sds.h"
 #include "utf8proc/utf8proc.h"
 #include "vector.h"
 
-VECTOR_INIT_FREE_DATA(string_array, sds, sdsfree)
-
-
 // NOTE: this particular implementation works only for ASCII strings
 int string_compare_case_insensitive(const char *str1, const char *str2);
 int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len);
@@ -27,20 +25,71 @@ void string_upper(char *str);
 bool string_starts_with(const char *str, const char *start);
 bool string_ends_with(const char *str, const char *ending);
 
-uint string_translate(sds str, char *word_chars, char *word_repls, size_t trans_len);
+uint string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len);
 
 char *utf8_reversed_string(const char *s); // returns a copy, caller frees
 bool utf8_is_letter(int32_t ch);
 
 char *string_strip_whitespace(char *str);
 
-void contiguous_string_array_add_string_unterminated(char_array *array, char *str);
-void contiguous_string_array_add_string(char_array *array, char *str);
-void contiguous_string_array_add_string_unterminated_len(char_array *array, char *str, size_t len);
-void contiguous_string_array_add_string_len(char_array *array, char *str, size_t len);
+/* Caller has to free the original string,
+   also keep in mind that after operating on a char array,
+   the pointer to the original string may get realloc'd and change
+   so need to set the char pointer to array.a when done.
+   Consider a macro which does this consistently
+*/
+char_array *char_array_from_string(char *str);
+char *char_array_to_string(char_array *array, bool free_array);
 
-int contiguous_string_array_next_index(char_array *string_array, int i, size_t n);
+void char_array_append(char_array *array, char *str);
+void char_array_append_len(char_array *array, char *str, size_t len);
+void char_array_terminate(char_array *array);
 
+// Similar to strcat, strips NUL-byte and guarantees 0-terminated
+void char_array_cat(char_array *array, char *str);
+void char_array_cat_len(char_array *array, char *str, size_t len);
+
+// Strips NUL-byte but does not NUL-terminate
+void char_array_cat_unterminated(char_array *array, char *str);
+void char_array_cat_unterminated_len(char_array *array, char *str, size_t len);
+
+// Cat with printf args
+void char_array_cat_printf(char_array *array, char *format, ...);
+
+void char_array_add_joined(char_array *array, char *separator, int count, ...);
+void char_array_cat_joined(char_array *array, char *separator, int count, ...);
+
+
+/*
+cstring_arrays represent n strings stored contiguously, delimited by NUL-byte.
+
+Instead of storing an array of char pointers (char **), cstring_arrays use this format:
+
+array->indices = {0, 4, 9};
+array->str = {'f', 'o', 'o', '\0', 'b', 'a', 'r', '\0', 'b', 'a', 'z', '\0'};
+
+*/
+
+typedef struct cstring_array {
+    uint32_array *indices;
+    char_array *str;
+} cstring_array_t;
+
+cstring_array_t *cstring_array_new(void);
+
+cstring_array_t *cstring_array_new_size(size_t size);
+
+cstring_array_t *cstring_array_from_char_array(char_array *str);
+
+cstring_array_t *cstring_array_split(char *str, const char *separator, size_t separator_len, int *count);
+
+void cstring_array_join_strings(cstring_array_t *self, char *separator, int count, ...);
+void cstring_array_add_string(cstring_array_t *self, char *s);
+void cstring_array_add_string_len(cstring_array_t *self, char *s, size_t len);
+int32_t cstring_array_get_offset(cstring_array_t *self, uint32_t i);
+char *cstring_array_get_token(cstring_array_t *self, uint32_t i);
+
+void cstring_array_destroy(cstring_array_t *self);
 
 #ifdef __cplusplus
 }
diff --git a/src/tokens.c b/src/tokens.c
index c2889b67..d79e7e7c 100644
--- a/src/tokens.c
+++ b/src/tokens.c
@@ -3,27 +3,41 @@
 
 tokenized_string_t *tokenized_string_new(void) {
     tokenized_string_t *self = malloc(sizeof(tokenized_string_t));
-    self->str = char_array_new();
+    self->str = cstring_array_new();
     self->tokens = token_array_new();
 
     return self;
 }
 
-void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, uint64_t position) {
+
+void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, size_t position) {
     char *ptr = (char *) (src + position);
 
-    size_t offset = self->str->n;
-    contiguous_string_array_add_string_len(self->str, ptr, len);
+    cstring_array_add_string_len(self->str, ptr, len);
 
-    token_t token = (token_t){offset, len, token_type, position};
+    token_t token = (token_t){position, len, token_type};
     token_array_push(self->tokens, token);
 
 }
 
-char *tokenized_string_get_token(tokenized_string_t *self, uint64_t index) {
+tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens) {
+    tokenized_string_t *self = malloc(sizeof(tokenized_string_t));
+    self->str = cstring_array_new_size(strlen(src));
+    self->tokens = tokens;
+
+    token_t token;
+
+    for (int i = 0; i < tokens->n; i++) {
+        token = tokens->a[i];
+        cstring_array_add_string_len(self->str, src + token.offset, token.len);
+    }
+    return self;
+}
+
+
+char *tokenized_string_get_token(tokenized_string_t *self, uint32_t index) {
     if (index < self->tokens->n) {
-        uint64_t i = self->tokens->a[index].offset;
-        return (char *)self->str->a + i;
+        return cstring_array_get_token(self->str, index);
     } else {
         return NULL;
     }
@@ -33,7 +47,7 @@ void tokenized_string_destroy(tokenized_string_t *self) {
     if (!self) return;
 
     if (self->str)
-        char_array_destroy(self->str);
+        cstring_array_destroy(self->str);
     if (self->tokens)
         token_array_destroy(self->tokens);
     free(self);
diff --git a/src/tokens.h b/src/tokens.h
index f4563a63..913ae9a3 100644
--- a/src/tokens.h
+++ b/src/tokens.h
@@ -18,19 +18,19 @@ typedef struct token {
     size_t offset;
     size_t len;
     uint16_t type;
-    uint64_t src_position;
 } token_t;
 
 VECTOR_INIT(token_array, token_t)
 
 typedef struct tokenized_string {
-    char_array *str;
+    cstring_array_t *str;
     token_array *tokens;
 } tokenized_string_t;
 
 tokenized_string_t *tokenized_string_new(void);
-void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, uint64_t src_position);
-char *tokenized_string_get_token(tokenized_string_t *self, uint64_t index);
+tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens);
+void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, size_t position);
+char *tokenized_string_get_token(tokenized_string_t *self, uint32_t index);
 void tokenized_string_destroy(tokenized_string_t *self);
 
 