[utils] new methods on string_utils for better dynamic strings which retains the benefits of sds without having to worry about the pointer changing, renaming contiguous string array methods to something more succinct

This commit is contained in:
Al
2015-03-27 20:55:36 -04:00
parent 2d1c24a6e9
commit 70195fffd5
4 changed files with 332 additions and 55 deletions

View File

@@ -1,8 +1,8 @@
#include "string_utils.h"
#include <stdio.h>
#include "string_utils.h"
#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n) - 1)
#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n))
int string_compare_case_insensitive(const char *str1, const char *str2) {
int c1, c2;
@@ -80,9 +80,9 @@ void string_lower(char *s) {
for (; *s; ++s) *s = tolower(*s);
}
uint string_translate(sds str, char *word_chars, char *word_repls, size_t trans_len) {
uint string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) {
uint num_replacements = 0;
size_t len = sdslen(str);
for (int i = 0; i < len; i++) {
for (int j = 0; j < trans_len; j++) {
if (str[i] == word_chars[j]) {
@@ -95,6 +95,21 @@ uint string_translate(sds str, char *word_chars, char *word_repls, size_t trans_
return num_replacements;
}
ssize_t utf8proc_iterate_reversed(const uint8_t *str, const uint8_t *start, int32_t *dst) {
ssize_t len;
const uint8_t *ptr = str;
*dst = -1;
do {
if (ptr <= start) return 0;
ptr--; len++;
} while ((*ptr & 0xC0) == 0x80);
return utf8proc_iterate(ptr, len, dst);
}
char *utf8_reversed_string(const char *s) {
int32_t unich;
ssize_t len, remaining;
@@ -136,14 +151,31 @@ bool utf8_is_letter(int32_t ch) {
}
/* Caution: this function does not make a copy of str. Keep original pointer and free that, e.g.
char *str = strdup("foobar");
// Use stripped for comparison, etc. but copy the string if you need to keep a pointer to it
char *stripped = string_strip_whitespace(str);
// Only free the original pointer to str
free(str);
*/
char *string_strip_whitespace(char *str) {
char *end;
while (isspace(*str)) str++;
size_t initial_spaces = 0;
size_t ending_spaces = 0;
if (*str == '\0')
char *ptr = str;
while (isspace(*ptr++)) {
initial_spaces++;
}
if (*ptr == '\0')
return str;
end = str + strlen(str) - 1;
while (end > str && isspace(*end)) end--;
@@ -152,47 +184,229 @@ char *string_strip_whitespace(char *str) {
return str;
}
void contiguous_string_array_add_string_unterminated(char_array *array, char *str) {
while (*str) {
char_array *char_array_from_string(char *str) {
char_array *array = char_array_new();
array->a = str;
array->m = array->n = strlen(str);
}
char *char_array_to_string(char_array *array, bool free_array) {
if (free_array) free(array);
return array->a;
}
static inline void char_array_strip_nul_byte(char_array *array) {
if (array->n > 0 && array->a[array->n - 1] == '\0') {
array->n--;
}
}
void char_array_append(char_array *array, char *str) {
while(*str) {
char_array_push(array, *str++);
}
}
void contiguous_string_array_add_string(char_array *array, char *str) {
contiguous_string_array_add_string_unterminated(array, str);
char_array_push(array, '\0');
}
void contiguous_string_array_add_string_unterminated_len(char_array *array, char *str, size_t len) {
for (int i = 0; i < len; i++) {
void char_array_append_len(char_array *array, char *str, size_t len) {
for (size_t i = 0; i < len; i++) {
char_array_push(array, *str++);
}
}
void contiguous_string_array_add_string_len(char_array *array, char *str, size_t len) {
contiguous_string_array_add_string_unterminated_len(array, str, len);
void char_array_terminate(char_array *array) {
char_array_push(array, '\0');
}
// Designed for using the char_array and uchar_array to store lots of short strings
int contiguous_string_array_next_index(char_array *string_array, int i, size_t n) {
if (INVALID_INDEX(i, string_array->n)) {
return -1;
void char_array_cat(char_array *array, char *str) {
char_array_strip_nul_byte(array);
char_array_append(array, str);
char_array_terminate(array);
}
void char_array_cat_len(char_array *array, char *str, size_t len) {
char_array_strip_nul_byte(array);
char_array_append_len(array, str, len);
char_array_terminate(array);
}
void char_array_add(char_array *array, char *str) {
char_array_append(array, str);
char_array_terminate(array);
}
void char_array_add_len(char_array *array, char *str, size_t len) {
char_array_append_len(array, str, len);
char_array_terminate(array);
}
static void vchar_array_append_joined(char_array *array, char *separator, int count, va_list args) {
if (count <= 0) {
return;
}
int len = 0;
char *array = string_array->a + i;
while (*array && i + len <= n - 1) {
array++;
len++;
for (size_t i = 0; i < count - 1; i++) {
char *arg = va_arg(args, char *);
char_array_append(array, arg);
char_array_append(array, separator);
}
if (len < n - 1) {
return len + 1;
}
return -1;
char *arg = va_arg(args, char *);
char_array_append(array, arg);
char_array_terminate(array);
}
void char_array_add_joined(char_array *array, char *separator, int count, ...) {
va_list args;
va_start(args, count);
vchar_array_append_joined(array, separator, count, args);
va_end(args);
}
void char_array_cat_joined(char_array *array, char *separator, int count, ...) {
char_array_strip_nul_byte(array);
va_list args;
va_start(args, count);
vchar_array_append_joined(array, separator, count, args);
va_end(args);
}
// Based on antirez's sdscatvprintf implementation
void char_array_cat_printf(char_array *array, char *format, ...) {
va_list args;
va_start(args, format);
char_array_strip_nul_byte(array);
va_list cpy;
char *arg;
char *buf;
size_t buflen;
size_t last_n = array->n;
size_t size = array->m < 8 ? 16 : array->m * 2;
while(1) {
char_array_resize(array, size);
buf = array->a + last_n;
buflen = size - last_n;
if (buf == NULL) return;
array->a[size-2] = '\0';
va_copy(cpy, args);
vsnprintf(buf, buflen, format, cpy);
if (array->a[size-2] != '\0') {
size *= 2;
continue;
} else {
array->n += strlen(buf);
}
break;
}
va_end(args);
}
cstring_array_t *cstring_array_new(void) {
cstring_array_t *array = malloc(sizeof(cstring_array_t));
if (array == NULL) return NULL;
array->indices = uint32_array_new();
if (array->indices == NULL) {
cstring_array_destroy(array);
return NULL;
}
array->str = char_array_new();
if (array->str == NULL) {
cstring_array_destroy(array);
return NULL;
}
return array;
}
void cstring_array_destroy(cstring_array_t *self) {
if (self == NULL) return;
if (self->indices) {
uint32_array_destroy(self->indices);
}
if (self->str) {
char_array_destroy(self->str);
}
free(self);
}
cstring_array_t *cstring_array_new_size(size_t size) {
cstring_array_t *array = cstring_array_new();
char_array_resize(array->str, size);
return array;
}
cstring_array_t *cstring_array_from_char_array(char_array *str) {
cstring_array_t *array = malloc(sizeof(cstring_array_t));
if (array == NULL) return NULL;
array->str = str;
array->indices = uint32_array_new_size(1);
uint32_array_push(array->indices, 0);
char *ptr = str->a;
uint32_t i = 0;
for (i = 0; i < str->n - 1; i++, ptr++) {
if (*ptr == '\0') {
uint32_array_push(array->indices, i + 1);
}
}
return array;
}
void cstring_array_start_token(cstring_array_t *self) {
uint32_array_push(self->indices, self->str->n);
}
void cstring_array_add_string(cstring_array_t *self, char *str) {
cstring_array_start_token(self);
char_array_append(self->str, str);
char_array_terminate(self->str);
}
void cstring_array_add_string_len(cstring_array_t *self, char *str, size_t len) {
cstring_array_start_token(self);
char_array_append_len(self->str, str, len);
char_array_terminate(self->str);
}
int32_t cstring_array_get_offset(cstring_array_t *self, uint32_t i) {
if (INVALID_INDEX(i, self->indices->n)) {
return -1;
}
return (int32_t)self->indices->a[i];
}
char *cstring_array_get_token(cstring_array_t *self, uint32_t i) {
int32_t data_index = cstring_array_get_offset(self, i);
return self->str->a + data_index;
}
cstring_array_t *cstring_array_split(char *str, const char *separator, size_t separator_len, int *count) {
*count = 0;
char_array *array = char_array_new_size(strlen(str));
uint32_t index = 0;
while (*str) {
if ((separator_len == 1 && *str == separator[0]) || (memcmp(str, separator, separator_len) == 0)) {
char_array_push(array, '\0');
str += separator_len;
} else {
char_array_push(array, *str);
str++;
}
}
char_array_push(array, '\0');
return cstring_array_from_char_array(array);
}

View File

@@ -8,14 +8,12 @@ extern "C" {
#include <ctype.h>
#include <string.h>
#include <stdbool.h>
#include <stdarg.h>
#include "collections.h"
#include "sds/sds.h"
#include "utf8proc/utf8proc.h"
#include "vector.h"
VECTOR_INIT_FREE_DATA(string_array, sds, sdsfree)
// NOTE: this particular implementation works only for ASCII strings
int string_compare_case_insensitive(const char *str1, const char *str2);
int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len);
@@ -27,20 +25,71 @@ void string_upper(char *str);
bool string_starts_with(const char *str, const char *start);
bool string_ends_with(const char *str, const char *ending);
uint string_translate(sds str, char *word_chars, char *word_repls, size_t trans_len);
uint string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len);
char *utf8_reversed_string(const char *s); // returns a copy, caller frees
bool utf8_is_letter(int32_t ch);
char *string_strip_whitespace(char *str);
void contiguous_string_array_add_string_unterminated(char_array *array, char *str);
void contiguous_string_array_add_string(char_array *array, char *str);
void contiguous_string_array_add_string_unterminated_len(char_array *array, char *str, size_t len);
void contiguous_string_array_add_string_len(char_array *array, char *str, size_t len);
/* Caller has to free the original string,
also keep in mind that after operating on a char array,
the pointer to the original string may get realloc'd and change
so need to set the char pointer to array.a when done.
Consider a macro which does this consistently
*/
char_array *char_array_from_string(char *str);
char *char_array_to_string(char_array *array, bool free_array);
int contiguous_string_array_next_index(char_array *string_array, int i, size_t n);
void char_array_append(char_array *array, char *str);
void char_array_append_len(char_array *array, char *str, size_t len);
void char_array_terminate(char_array *array);
// Similar to strcat, strips NUL-byte and guarantees 0-terminated
void char_array_cat(char_array *array, char *str);
void char_array_cat_len(char_array *array, char *str, size_t len);
// Strips NUL-byte but does not NUL-terminate
void char_array_cat_unterminated(char_array *array, char *str);
void char_array_cat_unterminated_len(char_array *array, char *str, size_t len);
// Cat with printf args
void char_array_cat_printf(char_array *array, char *format, ...);
void char_array_add_joined(char_array *array, char *separator, int count, ...);
void char_array_cat_joined(char_array *array, char *separator, int count, ...);
/*
cstring_arrays represent n strings stored contiguously, delimited by NUL-byte.
Instead of storing an array of char pointers (char **), cstring_arrays use this format:
array->indices = {0, 4, 9};
array->str = {'f', 'o', 'o', '\0', 'b', 'a', 'r', '\0', 'b', 'a', 'z', '\0'};
*/
typedef struct cstring_array {
uint32_array *indices;
char_array *str;
} cstring_array_t;
cstring_array_t *cstring_array_new(void);
cstring_array_t *cstring_array_new_size(size_t size);
cstring_array_t *cstring_array_from_char_array(char_array *str);
cstring_array_t *cstring_array_split(char *str, const char *separator, size_t separator_len, int *count);
void cstring_array_join_strings(cstring_array_t *self, char *separator, int count, ...);
void cstring_array_add_string(cstring_array_t *self, char *s);
void cstring_array_add_string_len(cstring_array_t *self, char *s, size_t len);
int32_t cstring_array_get_offset(cstring_array_t *self, uint32_t i);
char *cstring_array_get_token(cstring_array_t *self, uint32_t i);
void cstring_array_destroy(cstring_array_t *self);
#ifdef __cplusplus
}

View File

@@ -3,27 +3,41 @@
tokenized_string_t *tokenized_string_new(void) {
tokenized_string_t *self = malloc(sizeof(tokenized_string_t));
self->str = char_array_new();
self->str = cstring_array_new();
self->tokens = token_array_new();
return self;
}
void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, uint64_t position) {
void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, size_t position) {
char *ptr = (char *) (src + position);
size_t offset = self->str->n;
contiguous_string_array_add_string_len(self->str, ptr, len);
cstring_array_add_string_len(self->str, ptr, len);
token_t token = (token_t){offset, len, token_type, position};
token_t token = (token_t){position, len, token_type};
token_array_push(self->tokens, token);
}
char *tokenized_string_get_token(tokenized_string_t *self, uint64_t index) {
tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens) {
tokenized_string_t *self = malloc(sizeof(tokenized_string_t));
self->str = cstring_array_new_size(strlen(src));
self->tokens = tokens;
token_t token;
for (int i = 0; i < tokens->n; i++) {
token = tokens->a[i];
cstring_array_add_string_len(self->str, src + token.offset, token.len);
}
return self;
}
char *tokenized_string_get_token(tokenized_string_t *self, uint32_t index) {
if (index < self->tokens->n) {
uint64_t i = self->tokens->a[index].offset;
return (char *)self->str->a + i;
return cstring_array_get_token(self->str, index);
} else {
return NULL;
}
@@ -33,7 +47,7 @@ void tokenized_string_destroy(tokenized_string_t *self) {
if (!self)
return;
if (self->str)
char_array_destroy(self->str);
cstring_array_destroy(self->str);
if (self->tokens)
token_array_destroy(self->tokens);
free(self);

View File

@@ -18,19 +18,19 @@ typedef struct token {
size_t offset;
size_t len;
uint16_t type;
uint64_t src_position;
} token_t;
VECTOR_INIT(token_array, token_t)
typedef struct tokenized_string {
char_array *str;
cstring_array_t *str;
token_array *tokens;
} tokenized_string_t;
tokenized_string_t *tokenized_string_new(void);
void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, uint64_t src_position);
char *tokenized_string_get_token(tokenized_string_t *self, uint64_t index);
tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens);
void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, size_t position);
char *tokenized_string_get_token(tokenized_string_t *self, uint32_t index);
void tokenized_string_destroy(tokenized_string_t *self);