[utils] string utils, file utils, contiguous arrays of strings used for storing tokenized strings, klib for generic hashtables and vectors, antirez's sds for certain types of string building, utf8proc for iterating over utf-8 strings and unicode normalization

This commit is contained in:
Al
2015-03-03 12:27:19 -05:00
parent 27269e18ca
commit 5216aba1b6
16 changed files with 16961 additions and 0 deletions

90
src/file_utils.c Normal file
View File

@@ -0,0 +1,90 @@
#include "file_utils.h"
char *file_getline(FILE * f)
{
char buf[BUFSIZ];
char *ret = NULL;
size_t buf_len = 0;
size_t ret_size = 0;
while (fgets(buf, BUFSIZ, f) != NULL) {
buf_len = strlen(buf);
if (buf_len == 0) break;
ret = realloc(ret, ret_size + buf_len + 1);
memcpy(ret+ret_size, buf, buf_len);
ret_size += buf_len;
ret[ret_size] = '\0';
if (ret[ret_size - 1] == '\n') {
ret[ret_size - 1] = '\0';
// Handle carriage returns
if (ret_size > 1 && ret[ret_size-2] == '\r') {
ret[ret_size - 2] = '\0';
}
break;
}
}
if (ret_size == 0) {
return NULL;
}
return ret;
}
bool file_read_int32(FILE *file, int32_t *value) {
unsigned char buf[4];
if (fread(buf, 4, 1, file) == 1) {
*value = (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
return true;
}
return false;
}
bool file_write_int32(FILE *file, int32_t value) {
unsigned char buf[4];
buf[0] = (value >> 24) & 0xff;
buf[1] = (value >> 16) & 0xff;
buf[2] = (value >> 8) & 0xff;
buf[3] = value & 0xff;
return (fwrite(buf, 4, 1, file) == 1);
}
bool file_read_int16(FILE *file, int16_t *value) {
unsigned char buf[2];
if (fread(buf, 2, 1, file) == 1) {
*value = (buf[0] << 8) | buf[1];
return true;
}
return false;
}
bool file_write_int16(FILE *file, int16_t value) {
unsigned char buf[2];
buf[0] = value >> 8;
buf[1] = value & 0xff;
return (fwrite(buf, 2, 1, file) == 1);
}
bool file_read_int8(FILE *file, int8_t *value) {
return (fread(value, sizeof(int8_t), 1, file) == 1);
}
bool file_write_int8(FILE *file, int8_t value) {
return (fwrite(&value, sizeof(int8_t), 1, file) == 1);
}
bool file_read_chars(FILE *file, char *buf, size_t len) {
return (fread(buf, sizeof(char), len, file) == len);
}
bool file_write_chars(FILE *file, const char *buf, size_t len) {
return (fwrite(buf, sizeof(char), len, file) == len);
}

45
src/file_utils.h Normal file
View File

@@ -0,0 +1,45 @@
#ifndef FILE_UTILS_H
#define FILE_UTILS_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#ifndef BUFSIZ
#define BUFSIZ 4096
#endif
#ifdef _WIN32
#define PATH_SEPERATOR "\\"
#else
#define PATH_SEPERATOR "/"
#endif
#define PATH_SEPERATOR_LEN strlen(PATH_SEPERATOR)
char *file_getline(FILE * f);
bool file_read_int32(FILE *file, int32_t *value);
bool file_write_int32(FILE *file, int32_t value);
bool file_read_int16(FILE *file, int16_t *value);
bool file_write_int16(FILE *file, int16_t value);
bool file_read_int8(FILE *file, int8_t *value);
bool file_write_int8(FILE *file, int8_t value);
bool file_read_chars(FILE *file, char *buf, size_t len);
bool file_write_chars(FILE *file, const char *buf, size_t len);
#ifdef __cplusplus
}
#endif
#endif

587
src/klib/khash.h Normal file
View File

@@ -0,0 +1,587 @@
/* The MIT License
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "khash.h"
KHASH_MAP_INIT_INT(32, char)
int main() {
int ret, is_missing;
khiter_t k;
khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret);
kh_value(h, k) = 10;
k = kh_get(32, h, 10);
is_missing = (k == kh_end(h));
k = kh_get(32, h, 5);
kh_del(32, h, k);
for (k = kh_begin(h); k != kh_end(h); ++k)
if (kh_exist(h, k)) kh_value(h, k) = 1;
kh_destroy(32, h);
return 0;
}
*/
/*
2013-05-02 (0.2.8):
* Use quadratic probing. When the capacity is power of 2, stepping function
i*(i+1)/2 guarantees to traverse each bucket. It is better than double
hashing on cache performance and is more robust than linear probing.
In theory, double hashing should be more robust than quadratic probing.
However, my implementation is probably not for large hash tables, because
the second hash function is closely tied to the first hash function,
which reduce the effectiveness of double hashing.
Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
2011-12-29 (0.2.7):
* Minor code clean up; no actual effect.
2011-09-16 (0.2.6):
* The capacity is a power of 2. This seems to dramatically improve the
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
- http://code.google.com/p/ulib/
- http://nothings.org/computer/judy/
* Allow to optionally use linear probing which usually has better
performance for random input. Double hashing is still the default as it
is more robust to certain non-random input.
* Added Wang's integer hash function (not used by default). This hash
function is more robust to certain non-random input.
2011-02-14 (0.2.5):
* Allow to declare global functions.
2009-09-26 (0.2.4):
* Improve portability
2008-09-19 (0.2.3):
* Corrected the example
* Improved interfaces
2008-09-11 (0.2.2):
* Improved speed a little in kh_put()
2008-09-10 (0.2.1):
* Added kh_clear()
* Fixed a compiling error
2008-09-02 (0.2.0):
* Changed to token concatenation which increases flexibility.
2008-08-31 (0.1.2):
* Fixed a bug in kh_get(), which has not been tested previously.
2008-08-31 (0.1.1):
* Added destructor
*/
#ifndef __AC_KHASH_H
#define __AC_KHASH_H
/*!
@header
Generic hash table library.
*/
#define AC_VERSION_KHASH_H "0.2.8"
#include <stdlib.h>
#include <string.h>
#include <limits.h>
/* compiler specific configuration */
#if UINT_MAX == 0xffffffffu
typedef unsigned int khint32_t;
#elif ULONG_MAX == 0xffffffffu
typedef unsigned long khint32_t;
#endif
#if ULONG_MAX == ULLONG_MAX
typedef unsigned long khint64_t;
#else
typedef unsigned long long khint64_t;
#endif
#ifndef kh_inline
#ifdef _MSC_VER
#define kh_inline __inline
#else
#define kh_inline inline
#endif
#endif /* kh_inline */
typedef khint32_t khint_t;
typedef khint_t khiter_t;
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) malloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) realloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif
static const double __ac_HASH_UPPER = 0.77;
#define __KHASH_TYPE(name, khkey_t, khval_t) \
typedef struct { \
khint_t n_buckets, size, n_occupied, upper_bound; \
khint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t;
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
extern kh_##name##_t *kh_init_##name(void); \
extern void kh_destroy_##name(kh_##name##_t *h); \
extern void kh_clear_##name(kh_##name##_t *h); \
extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
extern void kh_del_##name(kh_##name##_t *h, khint_t x);
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
SCOPE kh_##name##_t *kh_init_##name(void) { \
return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
} \
SCOPE void kh_destroy_##name(kh_##name##_t *h) \
{ \
if (h) { \
kfree((void *)h->keys); kfree(h->flags); \
kfree((void *)h->vals); \
kfree(h); \
} \
} \
SCOPE void kh_clear_##name(kh_##name##_t *h) \
{ \
if (h && h->flags) { \
memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
h->size = h->n_occupied = 0; \
} \
} \
SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
khint_t k, i, last, mask, step = 0; \
mask = h->n_buckets - 1; \
k = __hash_func(key); i = k & mask; \
last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
i = (i + (++step)) & mask; \
if (i == last) return h->n_buckets; \
} \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \
} \
SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
kroundup32(new_n_buckets); \
if (new_n_buckets < 4) new_n_buckets = 4; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
else { /* hash table size to be changed (shrink or expand); rehash */ \
new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (!new_flags) return -1; \
memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (h->n_buckets < new_n_buckets) { /* expand */ \
khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (!new_keys) { kfree(new_flags); return -1; } \
h->keys = new_keys; \
if (kh_is_map) { \
khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
if (!new_vals) { kfree(new_flags); return -1; } \
h->vals = new_vals; \
} \
} /* otherwise shrink */ \
} \
} \
if (j) { /* rehashing is needed */ \
for (j = 0; j != h->n_buckets; ++j) { \
if (__ac_iseither(h->flags, j) == 0) { \
khkey_t key = h->keys[j]; \
khval_t val; \
khint_t new_mask; \
new_mask = new_n_buckets - 1; \
if (kh_is_map) val = h->vals[j]; \
__ac_set_isdel_true(h->flags, j); \
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
khint_t k, i, step = 0; \
k = __hash_func(key); \
i = k & new_mask; \
while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
__ac_set_isempty_false(new_flags, i); \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
} else { /* write the element and jump out of the loop */ \
h->keys[i] = key; \
if (kh_is_map) h->vals[i] = val; \
break; \
} \
} \
} \
} \
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
} \
kfree(h->flags); /* free the working space */ \
h->flags = new_flags; \
h->n_buckets = new_n_buckets; \
h->n_occupied = h->size; \
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
} \
return 0; \
} \
SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \
khint_t x; \
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
if (h->n_buckets > (h->size<<1)) { \
if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
*ret = -1; return h->n_buckets; \
} \
} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
*ret = -1; return h->n_buckets; \
} \
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
{ \
khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
else { \
last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \
i = (i + (++step)) & mask; \
if (i == last) { x = site; break; } \
} \
if (x == h->n_buckets) { \
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
else x = i; \
} \
} \
} \
if (__ac_isempty(h->flags, x)) { /* not present at all */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; ++h->n_occupied; \
*ret = 1; \
} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; \
*ret = 2; \
} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
return x; \
} \
SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
{ \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \
--h->size; \
} \
}
#define KHASH_DECLARE(name, khkey_t, khval_t) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_PROTOTYPES(name, khkey_t, khval_t)
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/* --- BEGIN OF HASH FUNCTIONS --- */
/*! @function
@abstract Integer hash function
@param key The integer [khint32_t]
@return The hash value [khint_t]
*/
#define kh_int_hash_func(key) (khint32_t)(key)
/*! @function
@abstract Integer comparison function
*/
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract 64-bit integer hash function
@param key The integer [khint64_t]
@return The hash value [khint_t]
*/
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
@abstract 64-bit integer comparison function
*/
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract const char* hash function
@param s Pointer to a null terminated string
@return The hash value
*/
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
khint_t h = (khint_t)*s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
return h;
}
/*! @function
@abstract Another interface to const char* hash function
@param key Pointer to a null terminated string [const char*]
@return The hash value [khint_t]
*/
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function
@abstract Const char* comparison function
*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
key += ~(key << 15);
key ^= (key >> 10);
key += (key << 3);
key ^= (key >> 6);
key += ~(key << 11);
key ^= (key >> 16);
return key;
}
#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key)
/* --- END OF HASH FUNCTIONS --- */
/* Other convenient macros... */
/*!
@abstract Type of the hash table.
@param name Name of the hash table [symbol]
*/
#define khash_t(name) kh_##name##_t
/*! @function
@abstract Initiate a hash table.
@param name Name of the hash table [symbol]
@return Pointer to the hash table [khash_t(name)*]
*/
#define kh_init(name) kh_init_##name()
/*! @function
@abstract Destroy a hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
*/
#define kh_destroy(name, h) kh_destroy_##name(h)
/*! @function
@abstract Reset a hash table without deallocating memory.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
*/
#define kh_clear(name, h) kh_clear_##name(h)
/*! @function
@abstract Resize a hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param s New size [khint_t]
*/
#define kh_resize(name, h, s) kh_resize_##name(h, s)
/*! @function
@abstract Insert a key to the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
@param r Extra return code: -1 if the operation failed;
0 if the key is present in the hash table;
1 if the bucket is empty (never used); 2 if the element in
the bucket has been deleted [int*]
@return Iterator to the inserted element [khint_t]
*/
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
/*! @function
@abstract Retrieve a key from the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
@return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
*/
#define kh_get(name, h, k) kh_get_##name(h, k)
/*! @function
@abstract Remove a key from the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Iterator to the element to be deleted [khint_t]
*/
#define kh_del(name, h, k) kh_del_##name(h, k)
/*! @function
@abstract Test whether a bucket contains data.
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return 1 if containing data; 0 otherwise [int]
*/
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
/*! @function
@abstract Get key given an iterator
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return Key [type of keys]
*/
#define kh_key(h, x) ((h)->keys[x])
/*! @function
@abstract Get value given an iterator
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return Value [type of values]
@discussion For hash sets, calling this results in segfault.
*/
#define kh_val(h, x) ((h)->vals[x])
/*! @function
@abstract Alias of kh_val()
*/
#define kh_value(h, x) ((h)->vals[x])
/*! @function
@abstract Get the start iterator
@param h Pointer to the hash table [khash_t(name)*]
@return The start iterator [khint_t]
*/
#define kh_begin(h) (khint_t)(0)
/*! @function
@abstract Get the end iterator
@param h Pointer to the hash table [khash_t(name)*]
@return The end iterator [khint_t]
*/
#define kh_end(h) ((h)->n_buckets)
/*! @function
@abstract Get the number of elements in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@return Number of elements in the hash table [khint_t]
*/
#define kh_size(h) ((h)->size)
/*! @function
@abstract Get the number of buckets in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@return Number of buckets in the hash table [khint_t]
*/
#define kh_n_buckets(h) ((h)->n_buckets)
/*! @function
@abstract Iterate over the entries in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param kvar Variable to which key will be assigned
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
if (!kh_exist(h,__i)) continue; \
(kvar) = kh_key(h,__i); \
(vvar) = kh_val(h,__i); \
code; \
} }
/*! @function
@abstract Iterate over the values in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
if (!kh_exist(h,__i)) continue; \
(vvar) = kh_val(h,__i); \
code; \
} }
/* More conenient interfaces */
/*! @function
@abstract Instantiate a hash set containing integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT(name) \
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT(name, khval_t) \
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT64(name) \
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT64(name, khval_t) \
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
typedef const char *kh_cstr_t;
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_STR(name) \
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_STR(name, khval_t) \
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
#endif /* __AC_KHASH_H */

117
src/klib/klist.h Normal file
View File

@@ -0,0 +1,117 @@
/* The MIT License
Copyright (c) 2008-2009, by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef _AC_KLIST_H
#define _AC_KLIST_H
#include <stdlib.h>
#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \
typedef struct { \
size_t cnt, n, max; \
kmptype_t **buf; \
} kmp_##name##_t; \
static inline kmp_##name##_t *kmp_init_##name(void) { \
return calloc(1, sizeof(kmp_##name##_t)); \
} \
static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \
size_t k; \
for (k = 0; k < mp->n; ++k) { \
kmpfree_f(mp->buf[k]); free(mp->buf[k]); \
} \
free(mp->buf); free(mp); \
} \
static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
++mp->cnt; \
if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \
return mp->buf[--mp->n]; \
} \
static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
--mp->cnt; \
if (mp->n == mp->max) { \
mp->max = mp->max? mp->max<<1 : 16; \
mp->buf = realloc(mp->buf, sizeof(kmptype_t *) * mp->max); \
} \
mp->buf[mp->n++] = p; \
}
#define kmempool_t(name) kmp_##name##_t
#define kmp_init(name) kmp_init_##name()
#define kmp_destroy(name, mp) kmp_destroy_##name(mp)
#define kmp_alloc(name, mp) kmp_alloc_##name(mp)
#define kmp_free(name, mp, p) kmp_free_##name(mp, p)
#define KLIST_INIT(name, kltype_t, kmpfree_t) \
struct __kl1_##name { \
kltype_t data; \
struct __kl1_##name *next; \
}; \
typedef struct __kl1_##name kl1_##name; \
KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \
typedef struct { \
kl1_##name *head, *tail; \
kmp_##name##_t *mp; \
size_t size; \
} kl_##name##_t; \
static inline kl_##name##_t *kl_init_##name(void) { \
kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \
kl->mp = kmp_init(name); \
kl->head = kl->tail = kmp_alloc(name, kl->mp); \
kl->head->next = 0; \
return kl; \
} \
static inline void kl_destroy_##name(kl_##name##_t *kl) { \
kl1_##name *p; \
for (p = kl->head; p != kl->tail; p = p->next) \
kmp_free(name, kl->mp, p); \
kmp_free(name, kl->mp, p); \
kmp_destroy(name, kl->mp); \
free(kl); \
} \
static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \
kl1_##name *q, *p = kmp_alloc(name, kl->mp); \
q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \
++kl->size; \
return &q->data; \
} \
static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \
kl1_##name *p; \
if (kl->head->next == 0) return -1; \
--kl->size; \
p = kl->head; kl->head = kl->head->next; \
if (d) *d = p->data; \
kmp_free(name, kl->mp, p); \
return 0; \
}
#define kliter_t(name) kl1_##name
#define klist_t(name) kl_##name##_t
#define kl_val(iter) ((iter)->data)
#define kl_next(iter) ((iter)->next)
#define kl_begin(kl) ((kl)->head)
#define kl_end(kl) ((kl)->tail)
#define kl_init(name) kl_init_##name()
#define kl_destroy(name, kl) kl_destroy_##name(kl)
#define kl_pushp(name, kl) kl_pushp_##name(kl)
#define kl_shift(name, kl, d) kl_shift_##name(kl, d)
#endif

278
src/klib/ksort.h Normal file
View File

@@ -0,0 +1,278 @@
/* The MIT License
Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
2011-04-10 (0.1.6):
* Added sample
2011-03 (0.1.5):
* Added shuffle/permutation
2008-11-16 (0.1.4):
* Fixed a bug in introsort() that happens in rare cases.
2008-11-05 (0.1.3):
* Fixed a bug in introsort() for complex comparisons.
* Fixed a bug in mergesort(). The previous version is not stable.
2008-09-15 (0.1.2):
* Accelerated introsort. On my Mac (not on another Linux machine),
my implementation is as fast as std::sort on random input.
* Added combsort and in introsort, switch to combsort if the
recursion is too deep.
2008-09-13 (0.1.1):
* Added k-small algorithm
2008-09-05 (0.1.0):
* Initial version
*/
#ifndef AC_KSORT_H
#define AC_KSORT_H
#include <stdlib.h>
#include <string.h>
typedef struct {
void *left, *right;
int depth;
} ks_isort_stack_t;
#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
#define KSORT_INIT(name, type_t, __sort_lt) \
void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
{ \
type_t *a2[2], *a, *b; \
int curr, shift; \
\
a2[0] = array; \
a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
a = a2[curr]; b = a2[1-curr]; \
if (shift == 0) { \
type_t *p = b, *i, *eb = a + n; \
for (i = a; i < eb; i += 2) { \
if (i == eb - 1) *p++ = *i; \
else { \
if (__sort_lt(*(i+1), *i)) { \
*p++ = *(i+1); *p++ = *i; \
} else { \
*p++ = *i; *p++ = *(i+1); \
} \
} \
} \
} else { \
size_t i, step = 1ul<<shift; \
for (i = 0; i < n; i += step<<1) { \
type_t *p, *j, *k, *ea, *eb; \
if (n < i + step) { \
ea = a + n; eb = a; \
} else { \
ea = a + i + step; \
eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
} \
j = a + i; k = a + i + step; p = b + i; \
while (j < ea && k < eb) { \
if (__sort_lt(*k, *j)) *p++ = *k++; \
else *p++ = *j++; \
} \
while (j < ea) *p++ = *j++; \
while (k < eb) *p++ = *k++; \
} \
} \
curr = 1 - curr; \
} \
if (curr == 1) { \
type_t *p = a2[0], *i = a2[1], *eb = array + n; \
for (; p < eb; ++i) *p++ = *i; \
} \
if (temp == 0) free(a2[1]); \
} \
void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
{ \
size_t k = i; \
type_t tmp = l[i]; \
while ((k = (k << 1) + 1) < n) { \
if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
if (__sort_lt(l[k], tmp)) break; \
l[i] = l[k]; i = k; \
} \
l[i] = tmp; \
} \
void ks_heapmake_##name(size_t lsize, type_t l[]) \
{ \
size_t i; \
for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
ks_heapadjust_##name(i, lsize, l); \
} \
void ks_heapsort_##name(size_t lsize, type_t l[]) \
{ \
size_t i; \
for (i = lsize - 1; i > 0; --i) { \
type_t tmp; \
tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
} \
} \
static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
{ \
type_t *i, *j, swap_tmp; \
for (i = s + 1; i < t; ++i) \
for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
} \
} \
void ks_combsort_##name(size_t n, type_t a[]) \
{ \
const double shrink_factor = 1.2473309501039786540366528676643; \
int do_swap; \
size_t gap = n; \
type_t tmp, *i, *j; \
do { \
if (gap > 2) { \
gap = (size_t)(gap / shrink_factor); \
if (gap == 9 || gap == 10) gap = 11; \
} \
do_swap = 0; \
for (i = a; i < a + n - gap; ++i) { \
j = i + gap; \
if (__sort_lt(*j, *i)) { \
tmp = *i; *i = *j; *j = tmp; \
do_swap = 1; \
} \
} \
} while (do_swap || gap > 2); \
if (gap != 1) __ks_insertsort_##name(a, a + n); \
} \
void ks_introsort_##name(size_t n, type_t a[]) \
{ \
int d; \
ks_isort_stack_t *top, *stack; \
type_t rp, swap_tmp; \
type_t *s, *t, *i, *j, *k; \
\
if (n < 1) return; \
else if (n == 2) { \
if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
return; \
} \
for (d = 2; 1ul<<d < n; ++d); \
stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
top = stack; s = a; t = a + (n-1); d <<= 1; \
while (1) { \
if (s < t) { \
if (--d == 0) { \
ks_combsort_##name(t - s + 1, s); \
t = s; \
continue; \
} \
i = s; j = t; k = i + ((j-i)>>1) + 1; \
if (__sort_lt(*k, *i)) { \
if (__sort_lt(*k, *j)) k = j; \
} else k = __sort_lt(*j, *i)? i : j; \
rp = *k; \
if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
for (;;) { \
do ++i; while (__sort_lt(*i, rp)); \
do --j; while (i <= j && __sort_lt(rp, *j)); \
if (j <= i) break; \
swap_tmp = *i; *i = *j; *j = swap_tmp; \
} \
swap_tmp = *i; *i = *t; *t = swap_tmp; \
if (i-s > t-i) { \
if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
s = t-i > 16? i+1 : t; \
} else { \
if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
t = i-s > 16? i-1 : s; \
} \
} else { \
if (top == stack) { \
free(stack); \
__ks_insertsort_##name(a, a+n); \
return; \
} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
} \
} \
} \
/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
/* 0 <= kk < n */ \
type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
{ \
type_t *low, *high, *k, *ll, *hh, *mid; \
low = arr; high = arr + n - 1; k = arr + kk; \
for (;;) { \
if (high <= low) return *k; \
if (high == low + 1) { \
if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
return *k; \
} \
mid = low + (high - low) / 2; \
if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
KSORT_SWAP(type_t, *mid, *(low+1)); \
ll = low + 1; hh = high; \
for (;;) { \
do ++ll; while (__sort_lt(*ll, *low)); \
do --hh; while (__sort_lt(*low, *hh)); \
if (hh < ll) break; \
KSORT_SWAP(type_t, *ll, *hh); \
} \
KSORT_SWAP(type_t, *low, *hh); \
if (hh <= k) low = ll; \
if (hh >= k) high = hh - 1; \
} \
} \
void ks_shuffle_##name(size_t n, type_t a[]) \
{ \
int i, j; \
for (i = n; i > 1; --i) { \
type_t tmp; \
j = (int)(drand48() * i); \
tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
} \
} \
void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \
{ /* reference: http://code.activestate.com/recipes/272884/ */ \
int i, k, pop = n; \
for (i = (int)r, k = 0; i >= 0; --i) { \
double z = 1., x = drand48(); \
type_t tmp; \
while (x < z) z -= z * i / (pop--); \
if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \
++k; \
} \
}
#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
#define ks_lt_generic(a, b) ((a) < (b))
#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
typedef const char *ksstr_t;
#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
#endif

86
src/klib/kvec.h Normal file
View File

@@ -0,0 +1,86 @@
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "kvec.h"
int main() {
kvec_t(int) array;
kv_init(array);
kv_push(int, array, 10); // append
kv_a(int, array, 20) = 5; // dynamic
kv_A(array, 20) = 4; // static
kv_destroy(array);
return 0;
}
*/
/*
2008-09-22 (0.1.0):
* The initial version.
*/
#ifndef AC_KVEC_H
#define AC_KVEC_H
#include <stdlib.h>
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
#define kv_A(v, i) ((v).a[(i)])
#define kv_pop(v) ((v).a[--(v).n])
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)
// `kv_clear` added by Al Barrentine on 2014-11-17
#define kv_clear(v) ((v).n = 0)
#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
#define kv_copy(type, v1, v0) do { \
if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
(v1).n = (v0).n; \
memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
} while (0) \
#define kv_push(type, v, x) do { \
if ((v).n == (v).m) { \
(v).m = (v).m? (v).m<<1 : 2; \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
} \
(v).a[(v).n++] = (x); \
} while (0)
#define kv_pushp(type, v) (((v).n == (v).m)? \
((v).m = ((v).m? (v).m<<1 : 2), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
: 0), ((v).a + ((v).n++))
#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
: 0), (v).a[(i)])
#endif

907
src/sds/sds.c Normal file
View File

@@ -0,0 +1,907 @@
/* SDS (Simple Dynamic Strings), A C dynamic strings library.
*
* Copyright (c) 2006-2014, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include "sds.h"
/* Create a new sds string with the content specified by the 'init' pointer
* and 'initlen'.
* If NULL is used for 'init' the string is initialized with zero bytes.
*
* The string is always null-termined (all the sds strings are, always) so
* even if you create an sds string with:
*
* mystring = sdsnewlen("abc",3");
*
* You can print the string with printf() as there is an implicit \0 at the
* end of the string. However the string is binary safe and can contain
* \0 characters in the middle, as the length is stored in the sds header. */
sds sdsnewlen(const void *init, size_t initlen) {
struct sdshdr *sh;
if (init) {
sh = malloc(sizeof *sh+initlen+1);
} else {
sh = calloc(sizeof *sh+initlen+1,1);
}
if (sh == NULL) return NULL;
sh->len = initlen;
sh->free = 0;
if (initlen && init)
memcpy(sh->buf, init, initlen);
sh->buf[initlen] = '\0';
return (char*)sh->buf;
}
/* Create an empty (zero length) sds string. Even in this case the string
* always has an implicit null term. */
sds sdsempty(void) {
return sdsnewlen("",0);
}
/* Create a new sds string starting from a null termined C string. */
sds sdsnew(const char *init) {
size_t initlen = (init == NULL) ? 0 : strlen(init);
return sdsnewlen(init, initlen);
}
/* Duplicate an sds string. */
sds sdsdup(const sds s) {
return sdsnewlen(s, sdslen(s));
}
/* Free an sds string. No operation is performed if 's' is NULL. */
void sdsfree(sds s) {
if (s == NULL) return;
free(s-sizeof(struct sdshdr));
}
/* Set the sds string length to the length as obtained with strlen(), so
* considering as content only up to the first null term character.
*
* This function is useful when the sds string is hacked manually in some
* way, like in the following example:
*
* s = sdsnew("foobar");
* s[2] = '\0';
* sdsupdatelen(s);
* printf("%d\n", sdslen(s));
*
* The output will be "2", but if we comment out the call to sdsupdatelen()
* the output will be "6" as the string was modified but the logical length
* remains 6 bytes. */
void sdsupdatelen(sds s) {
struct sdshdr *sh = (void*) (s-sizeof *sh);;
int reallen = strlen(s);
sh->free += (sh->len-reallen);
sh->len = reallen;
}
/* Modify an sds string on-place to make it empty (zero length).
* However all the existing buffer is not discarded but set as free space
* so that next append operations will not require allocations up to the
* number of bytes previously available. */
void sdsclear(sds s) {
struct sdshdr *sh = (void*) (s-sizeof *sh);;
sh->free += sh->len;
sh->len = 0;
sh->buf[0] = '\0';
}
/* Enlarge the free space at the end of the sds string so that the caller
* is sure that after calling this function can overwrite up to addlen
* bytes after the end of the string, plus one more byte for nul term.
*
* Note: this does not change the *length* of the sds string as returned
* by sdslen(), but only the free buffer space we have. */
sds sdsMakeRoomFor(sds s, size_t addlen) {
struct sdshdr *sh, *newsh;
size_t free = sdsavail(s);
size_t len, newlen;
if (free >= addlen) return s;
len = sdslen(s);
sh = (void*) (s-sizeof *sh);;
newlen = (len+addlen);
if (newlen < SDS_MAX_PREALLOC)
newlen *= 2;
else
newlen += SDS_MAX_PREALLOC;
newsh = realloc(sh, sizeof *newsh+newlen+1);
if (newsh == NULL) return NULL;
newsh->free = newlen - len;
return newsh->buf;
}
/* Reallocate the sds string so that it has no free space at the end. The
* contained string remains not altered, but next concatenation operations
* will require a reallocation.
*
* After the call, the passed sds string is no longer valid and all the
* references must be substituted with the new pointer returned by the call. */
sds sdsRemoveFreeSpace(sds s) {
struct sdshdr *sh;
sh = (void*) (s-sizeof *sh);;
sh = realloc(sh, sizeof *sh+sh->len+1);
sh->free = 0;
return sh->buf;
}
/* Return the total size of the allocation of the specifed sds string,
* including:
* 1) The sds header before the pointer.
* 2) The string.
* 3) The free buffer at the end if any.
* 4) The implicit null term.
*/
size_t sdsAllocSize(sds s) {
struct sdshdr *sh = (void*) (s-sizeof *sh);;
return sizeof(*sh)+sh->len+sh->free+1;
}
/* Increment the sds length and decrements the left free space at the
* end of the string according to 'incr'. Also set the null term
* in the new end of the string.
*
* This function is used in order to fix the string length after the
* user calls sdsMakeRoomFor(), writes something after the end of
* the current string, and finally needs to set the new length.
*
* Note: it is possible to use a negative increment in order to
* right-trim the string.
*
* Usage example:
*
* Using sdsIncrLen() and sdsMakeRoomFor() it is possible to mount the
* following schema, to cat bytes coming from the kernel to the end of an
* sds string without copying into an intermediate buffer:
*
* oldlen = sdslen(s);
* s = sdsMakeRoomFor(s, BUFFER_SIZE);
* nread = read(fd, s+oldlen, BUFFER_SIZE);
* ... check for nread <= 0 and handle it ...
* sdsIncrLen(s, nread);
*/
void sdsIncrLen(sds s, int incr) {
struct sdshdr *sh = (void*) (s-sizeof *sh);;
assert(sh->free >= incr);
sh->len += incr;
sh->free -= incr;
assert(sh->free >= 0);
s[sh->len] = '\0';
}
/* Grow the sds to have the specified length. Bytes that were not part of
* the original length of the sds will be set to zero.
*
* if the specified length is smaller than the current length, no operation
* is performed. */
sds sdsgrowzero(sds s, size_t len) {
struct sdshdr *sh = (void*) (s-sizeof *sh);
size_t totlen, curlen = sh->len;
if (len <= curlen) return s;
s = sdsMakeRoomFor(s,len-curlen);
if (s == NULL) return NULL;
/* Make sure added region doesn't contain garbage */
sh = (void*)(s-sizeof *sh);
memset(s+curlen,0,(len-curlen+1)); /* also set trailing \0 byte */
totlen = sh->len+sh->free;
sh->len = len;
sh->free = totlen-sh->len;
return s;
}
/* Append the specified binary-safe string pointed by 't' of 'len' bytes to the
* end of the specified sds string 's'.
*
* After the call, the passed sds string is no longer valid and all the
* references must be substituted with the new pointer returned by the call. */
sds sdscatlen(sds s, const void *t, size_t len) {
struct sdshdr *sh;
size_t curlen = sdslen(s);
s = sdsMakeRoomFor(s,len);
if (s == NULL) return NULL;
sh = (void*) (s-sizeof *sh);;
memcpy(s+curlen, t, len);
sh->len = curlen+len;
sh->free = sh->free-len;
s[curlen+len] = '\0';
return s;
}
/* Append the specified null termianted C string to the sds string 's'.
*
* After the call, the passed sds string is no longer valid and all the
* references must be substituted with the new pointer returned by the call. */
sds sdscat(sds s, const char *t) {
return sdscatlen(s, t, strlen(t));
}
/* Append the specified sds 't' to the existing sds 's'.
*
* After the call, the modified sds string is no longer valid and all the
* references must be substituted with the new pointer returned by the call. */
sds sdscatsds(sds s, const sds t) {
return sdscatlen(s, t, sdslen(t));
}
/* Destructively modify the sds string 's' to hold the specified binary
* safe string pointed by 't' of length 'len' bytes. */
sds sdscpylen(sds s, const char *t, size_t len) {
struct sdshdr *sh = (void*) (s-sizeof *sh);;
size_t totlen = sh->free+sh->len;
if (totlen < len) {
s = sdsMakeRoomFor(s,len-sh->len);
if (s == NULL) return NULL;
sh = (void*) (s-sizeof *sh);;
totlen = sh->free+sh->len;
}
memcpy(s, t, len);
s[len] = '\0';
sh->len = len;
sh->free = totlen-len;
return s;
}
/* Like sdscpylen() but 't' must be a null-termined string so that the length
* of the string is obtained with strlen(). */
sds sdscpy(sds s, const char *t) {
return sdscpylen(s, t, strlen(t));
}
/* Like sdscatpritf() but gets va_list instead of being variadic. */
sds sdscatvprintf(sds s, const char *fmt, va_list ap) {
va_list cpy;
char *buf, *t;
size_t buflen = 16;
while(1) {
buf = malloc(buflen);
if (buf == NULL) return NULL;
buf[buflen-2] = '\0';
va_copy(cpy,ap);
vsnprintf(buf, buflen, fmt, cpy);
if (buf[buflen-2] != '\0') {
free(buf);
buflen *= 2;
continue;
}
break;
}
t = sdscat(s, buf);
free(buf);
return t;
}
/* Append to the sds string 's' a string obtained using printf-alike format
* specifier.
*
* After the call, the modified sds string is no longer valid and all the
* references must be substituted with the new pointer returned by the call.
*
* Example:
*
* s = sdsempty("Sum is: ");
* s = sdscatprintf(s,"%d+%d = %d",a,b,a+b).
*
* Often you need to create a string from scratch with the printf-alike
* format. When this is the need, just use sdsempty() as the target string:
*
* s = sdscatprintf(sdsempty(), "... your format ...", args);
*/
sds sdscatprintf(sds s, const char *fmt, ...) {
va_list ap;
char *t;
va_start(ap, fmt);
t = sdscatvprintf(s,fmt,ap);
va_end(ap);
return t;
}
/* Remove the part of the string from left and from right composed just of
* contiguous characters found in 'cset', that is a null terminted C string.
*
* After the call, the modified sds string is no longer valid and all the
* references must be substituted with the new pointer returned by the call.
*
* Example:
*
* s = sdsnew("AA...AA.a.aa.aHelloWorld :::");
* s = sdstrim(s,"A. :");
* printf("%s\n", s);
*
* Output will be just "Hello World".
*/
void sdstrim(sds s, const char *cset) {
struct sdshdr *sh = (void*) (s-sizeof *sh);;
char *start, *end, *sp, *ep;
size_t len;
sp = start = s;
ep = end = s+sdslen(s)-1;
while(sp <= end && strchr(cset, *sp)) sp++;
while(ep > start && strchr(cset, *ep)) ep--;
len = (sp > ep) ? 0 : ((ep-sp)+1);
if (sh->buf != sp) memmove(sh->buf, sp, len);
sh->buf[len] = '\0';
sh->free = sh->free+(sh->len-len);
sh->len = len;
}
/* Turn the string into a smaller (or equal) string containing only the
* substring specified by the 'start' and 'end' indexes.
*
* start and end can be negative, where -1 means the last character of the
* string, -2 the penultimate character, and so forth.
*
* The interval is inclusive, so the start and end characters will be part
* of the resulting string.
*
* The string is modified in-place.
*
* Example:
*
* s = sdsnew("Hello World");
* sdsrange(s,1,-1); => "ello World"
*/
void sdsrange(sds s, int start, int end) {
struct sdshdr *sh = (void*) (s-sizeof *sh);;
size_t newlen, len = sdslen(s);
if (len == 0) return;
if (start < 0) {
start = len+start;
if (start < 0) start = 0;
}
if (end < 0) {
end = len+end;
if (end < 0) end = 0;
}
newlen = (start > end) ? 0 : (end-start)+1;
if (newlen != 0) {
if (start >= (signed)len) {
newlen = 0;
} else if (end >= (signed)len) {
end = len-1;
newlen = (start > end) ? 0 : (end-start)+1;
}
} else {
start = 0;
}
if (start && newlen) memmove(sh->buf, sh->buf+start, newlen);
sh->buf[newlen] = 0;
sh->free = sh->free+(sh->len-newlen);
sh->len = newlen;
}
/* Apply tolower() to every character of the sds string 's'. */
void sdstolower(sds s) {
int len = sdslen(s), j;
for (j = 0; j < len; j++) s[j] = tolower(s[j]);
}
/* Apply toupper() to every character of the sds string 's'. */
void sdstoupper(sds s) {
int len = sdslen(s), j;
for (j = 0; j < len; j++) s[j] = toupper(s[j]);
}
/* Compare two sds strings s1 and s2 with memcmp().
*
* Return value:
*
* 1 if s1 > s2.
* -1 if s1 < s2.
* 0 if s1 and s2 are exactly the same binary string.
*
* If two strings share exactly the same prefix, but one of the two has
* additional characters, the longer string is considered to be greater than
* the smaller one. */
int sdscmp(const sds s1, const sds s2) {
size_t l1, l2, minlen;
int cmp;
l1 = sdslen(s1);
l2 = sdslen(s2);
minlen = (l1 < l2) ? l1 : l2;
cmp = memcmp(s1,s2,minlen);
if (cmp == 0) return l1-l2;
return cmp;
}
/* Split 's' with separator in 'sep'. An array
* of sds strings is returned. *count will be set
* by reference to the number of tokens returned.
*
* On out of memory, zero length string, zero length
* separator, NULL is returned.
*
* Note that 'sep' is able to split a string using
* a multi-character separator. For example
* sdssplit("foo_-_bar","_-_"); will return two
* elements "foo" and "bar".
*
* This version of the function is binary-safe but
* requires length arguments. sdssplit() is just the
* same function but for zero-terminated strings.
*/
sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count) {
int elements = 0, slots = 5, start = 0, j;
sds *tokens;
if (seplen < 1 || len < 0) return NULL;
tokens = malloc(sizeof(sds)*slots);
if (tokens == NULL) return NULL;
if (len == 0) {
*count = 0;
return tokens;
}
for (j = 0; j < (len-(seplen-1)); j++) {
/* make sure there is room for the next element and the final one */
if (slots < elements+2) {
sds *newtokens;
slots *= 2;
newtokens = realloc(tokens,sizeof(sds)*slots);
if (newtokens == NULL) goto cleanup;
tokens = newtokens;
}
/* search the separator */
if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) {
tokens[elements] = sdsnewlen(s+start,j-start);
if (tokens[elements] == NULL) goto cleanup;
elements++;
start = j+seplen;
j = j+seplen-1; /* skip the separator */
}
}
/* Add the final element. We are sure there is room in the tokens array. */
tokens[elements] = sdsnewlen(s+start,len-start);
if (tokens[elements] == NULL) goto cleanup;
elements++;
*count = elements;
return tokens;
cleanup:
{
int i;
for (i = 0; i < elements; i++) sdsfree(tokens[i]);
free(tokens);
*count = 0;
return NULL;
}
}
/* Free the result returned by sdssplitlen(), or do nothing if 'tokens' is NULL. */
void sdsfreesplitres(sds *tokens, int count) {
if (!tokens) return;
while(count--)
sdsfree(tokens[count]);
free(tokens);
}
/* Create an sds string from a long long value. It is much faster than:
*
* sdscatprintf(sdsempty(),"%lld\n", value);
*/
sds sdsfromlonglong(long long value) {
char buf[32], *p;
unsigned long long v;
v = (value < 0) ? -value : value;
p = buf+31; /* point to the last character */
do {
*p-- = '0'+(v%10);
v /= 10;
} while(v);
if (value < 0) *p-- = '-';
p++;
return sdsnewlen(p,32-(p-buf));
}
/* Append to the sds string "s" an escaped string representation where
* all the non-printable characters (tested with isprint()) are turned into
* escapes in the form "\n\r\a...." or "\x<hex-number>".
*
* After the call, the modified sds string is no longer valid and all the
* references must be substituted with the new pointer returned by the call. */
sds sdscatrepr(sds s, const char *p, size_t len) {
s = sdscatlen(s,"\"",1);
while(len--) {
switch(*p) {
case '\\':
case '"':
s = sdscatprintf(s,"\\%c",*p);
break;
case '\n': s = sdscatlen(s,"\\n",2); break;
case '\r': s = sdscatlen(s,"\\r",2); break;
case '\t': s = sdscatlen(s,"\\t",2); break;
case '\a': s = sdscatlen(s,"\\a",2); break;
case '\b': s = sdscatlen(s,"\\b",2); break;
default:
if (isprint(*p))
s = sdscatprintf(s,"%c",*p);
else
s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
break;
}
p++;
}
return sdscatlen(s,"\"",1);
}
/* Helper function for sdssplitargs() that returns non zero if 'c'
* is a valid hex digit. */
int is_hex_digit(char c) {
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
(c >= 'A' && c <= 'F');
}
/* Helper function for sdssplitargs() that converts a hex digit into an
* integer from 0 to 15 */
int hex_digit_to_int(char c) {
switch(c) {
case '0': return 0;
case '1': return 1;
case '2': return 2;
case '3': return 3;
case '4': return 4;
case '5': return 5;
case '6': return 6;
case '7': return 7;
case '8': return 8;
case '9': return 9;
case 'a': case 'A': return 10;
case 'b': case 'B': return 11;
case 'c': case 'C': return 12;
case 'd': case 'D': return 13;
case 'e': case 'E': return 14;
case 'f': case 'F': return 15;
default: return 0;
}
}
/* Split a line into arguments, where every argument can be in the
* following programming-language REPL-alike form:
*
* foo bar "newline are supported\n" and "\xff\x00otherstuff"
*
* The number of arguments is stored into *argc, and an array
* of sds is returned.
*
* The caller should free the resulting array of sds strings with
* sdsfreesplitres().
*
* Note that sdscatrepr() is able to convert back a string into
* a quoted string in the same format sdssplitargs() is able to parse.
*
* The function returns the allocated tokens on success, even when the
* input string is empty, or NULL if the input contains unbalanced
* quotes or closed quotes followed by non space characters
* as in: "foo"bar or "foo'
*/
sds *sdssplitargs(const char *line, int *argc) {
const char *p = line;
char *current = NULL;
char **vector = NULL;
*argc = 0;
while(1) {
/* skip blanks */
while(*p && isspace(*p)) p++;
if (*p) {
/* get a token */
int inq=0; /* set to 1 if we are in "quotes" */
int insq=0; /* set to 1 if we are in 'single quotes' */
int done=0;
if (current == NULL) current = sdsempty();
while(!done) {
if (inq) {
if (*p == '\\' && *(p+1) == 'x' &&
is_hex_digit(*(p+2)) &&
is_hex_digit(*(p+3)))
{
unsigned char byte;
byte = (hex_digit_to_int(*(p+2))*16)+
hex_digit_to_int(*(p+3));
current = sdscatlen(current,(char*)&byte,1);
p += 3;
} else if (*p == '\\' && *(p+1)) {
char c;
p++;
switch(*p) {
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'b': c = '\b'; break;
case 'a': c = '\a'; break;
default: c = *p; break;
}
current = sdscatlen(current,&c,1);
} else if (*p == '"') {
/* closing quote must be followed by a space or
* nothing at all. */
if (*(p+1) && !isspace(*(p+1))) goto err;
done=1;
} else if (!*p) {
/* unterminated quotes */
goto err;
} else {
current = sdscatlen(current,p,1);
}
} else if (insq) {
if (*p == '\\' && *(p+1) == '\'') {
p++;
current = sdscatlen(current,"'",1);
} else if (*p == '\'') {
/* closing quote must be followed by a space or
* nothing at all. */
if (*(p+1) && !isspace(*(p+1))) goto err;
done=1;
} else if (!*p) {
/* unterminated quotes */
goto err;
} else {
current = sdscatlen(current,p,1);
}
} else {
switch(*p) {
case ' ':
case '\n':
case '\r':
case '\t':
case '\0':
done=1;
break;
case '"':
inq=1;
break;
case '\'':
insq=1;
break;
default:
current = sdscatlen(current,p,1);
break;
}
}
if (*p) p++;
}
/* add the token to the vector */
vector = realloc(vector,((*argc)+1)*sizeof(char*));
vector[*argc] = current;
(*argc)++;
current = NULL;
} else {
/* Even on empty input string return something not NULL. */
if (vector == NULL) vector = malloc(sizeof(void*));
return vector;
}
}
err:
while((*argc)--)
sdsfree(vector[*argc]);
free(vector);
if (current) sdsfree(current);
*argc = 0;
return NULL;
}
/* Modify the string substituting all the occurrences of the set of
* characters specified in the 'from' string to the corresponding character
* in the 'to' array.
*
* For instance: sdsmapchars(mystring, "ho", "01", 2)
* will have the effect of turning the string "hello" into "0ell1".
*
* The function returns the sds string pointer, that is always the same
* as the input pointer since no resize is needed. */
sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen) {
size_t j, i, l = sdslen(s);
for (j = 0; j < l; j++) {
for (i = 0; i < setlen; i++) {
if (s[j] == from[i]) {
s[j] = to[i];
break;
}
}
}
return s;
}
/* Join an array of C strings using the specified separator (also a C string).
* Returns the result as an sds string. */
sds sdsjoin(char **argv, int argc, char *sep, size_t seplen) {
sds join = sdsempty();
int j;
for (j = 0; j < argc; j++) {
join = sdscat(join, argv[j]);
if (j != argc-1) join = sdscatlen(join,sep,seplen);
}
return join;
}
/* Like sdsjoin, but joins an array of SDS strings. */
sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen) {
sds join = sdsempty();
int j;
for (j = 0; j < argc; j++) {
join = sdscatsds(join, argv[j]);
if (j != argc-1) join = sdscatlen(join,sep,seplen);
}
return join;
}
#ifdef SDS_TEST_MAIN
#include <stdio.h>
#include "testhelp.h"
int main(void) {
{
struct sdshdr *sh;
sds x = sdsnew("foo"), y;
test_cond("Create a string and obtain the length",
sdslen(x) == 3 && memcmp(x,"foo\0",4) == 0)
sdsfree(x);
x = sdsnewlen("foo",2);
test_cond("Create a string with specified length",
sdslen(x) == 2 && memcmp(x,"fo\0",3) == 0)
x = sdscat(x,"bar");
test_cond("Strings concatenation",
sdslen(x) == 5 && memcmp(x,"fobar\0",6) == 0);
x = sdscpy(x,"a");
test_cond("sdscpy() against an originally longer string",
sdslen(x) == 1 && memcmp(x,"a\0",2) == 0)
x = sdscpy(x,"xyzxxxxxxxxxxyyyyyyyyyykkkkkkkkkk");
test_cond("sdscpy() against an originally shorter string",
sdslen(x) == 33 &&
memcmp(x,"xyzxxxxxxxxxxyyyyyyyyyykkkkkkkkkk\0",33) == 0)
sdsfree(x);
x = sdscatprintf(sdsempty(),"%d",123);
test_cond("sdscatprintf() seems working in the base case",
sdslen(x) == 3 && memcmp(x,"123\0",4) ==0)
sdsfree(x);
x = sdsnew("xxciaoyyy");
sdstrim(x,"xy");
test_cond("sdstrim() correctly trims characters",
sdslen(x) == 4 && memcmp(x,"ciao\0",5) == 0)
y = sdsdup(x);
sdsrange(y,1,1);
test_cond("sdsrange(...,1,1)",
sdslen(y) == 1 && memcmp(y,"i\0",2) == 0)
sdsfree(y);
y = sdsdup(x);
sdsrange(y,1,-1);
test_cond("sdsrange(...,1,-1)",
sdslen(y) == 3 && memcmp(y,"iao\0",4) == 0)
sdsfree(y);
y = sdsdup(x);
sdsrange(y,-2,-1);
test_cond("sdsrange(...,-2,-1)",
sdslen(y) == 2 && memcmp(y,"ao\0",3) == 0)
sdsfree(y);
y = sdsdup(x);
sdsrange(y,2,1);
test_cond("sdsrange(...,2,1)",
sdslen(y) == 0 && memcmp(y,"\0",1) == 0)
sdsfree(y);
y = sdsdup(x);
sdsrange(y,1,100);
test_cond("sdsrange(...,1,100)",
sdslen(y) == 3 && memcmp(y,"iao\0",4) == 0)
sdsfree(y);
y = sdsdup(x);
sdsrange(y,100,100);
test_cond("sdsrange(...,100,100)",
sdslen(y) == 0 && memcmp(y,"\0",1) == 0)
sdsfree(y);
sdsfree(x);
x = sdsnew("foo");
y = sdsnew("foa");
test_cond("sdscmp(foo,foa)", sdscmp(x,y) > 0)
sdsfree(y);
sdsfree(x);
x = sdsnew("bar");
y = sdsnew("bar");
test_cond("sdscmp(bar,bar)", sdscmp(x,y) == 0)
sdsfree(y);
sdsfree(x);
x = sdsnew("aar");
y = sdsnew("bar");
test_cond("sdscmp(bar,bar)", sdscmp(x,y) < 0)
sdsfree(y);
sdsfree(x);
x = sdsnewlen("\a\n\0foo\r",7);
y = sdscatrepr(sdsempty(),x,sdslen(x));
test_cond("sdscatrepr(...data...)",
memcmp(y,"\"\\a\\n\\x00foo\\r\"",15) == 0)
{
int oldfree;
sdsfree(x);
x = sdsnew("0");
sh = (void*) (x-(sizeof(struct sdshdr)));
test_cond("sdsnew() free/len buffers", sh->len == 1 && sh->free == 0);
x = sdsMakeRoomFor(x,1);
sh = (void*) (x-(sizeof(struct sdshdr)));
test_cond("sdsMakeRoomFor()", sh->len == 1 && sh->free > 0);
oldfree = sh->free;
x[1] = '1';
sdsIncrLen(x,1);
test_cond("sdsIncrLen() -- content", x[0] == '0' && x[1] == '1');
test_cond("sdsIncrLen() -- len", sh->len == 2);
test_cond("sdsIncrLen() -- free", sh->free == oldfree-1);
}
}
test_report()
return 0;
}
#endif

101
src/sds/sds.h Normal file
View File

@@ -0,0 +1,101 @@
/* SDS (Simple Dynamic Strings), A C dynamic strings library.
*
* Copyright (c) 2006-2014, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SDS_H
#define __SDS_H
#define SDS_MAX_PREALLOC (1024*1024)
#include <sys/types.h>
#include <stdarg.h>
typedef char *sds;
struct sdshdr {
int len;
int free;
char buf[];
};
static inline size_t sdslen(const sds s) {
struct sdshdr *sh = (void*)(s-sizeof *sh);
return sh->len;
}
static inline size_t sdsavail(const sds s) {
struct sdshdr *sh = (void*)(s-sizeof *sh);
return sh->free;
}
sds sdsnewlen(const void *init, size_t initlen);
sds sdsnew(const char *init);
sds sdsempty(void);
size_t sdslen(const sds s);
sds sdsdup(const sds s);
void sdsfree(sds s);
size_t sdsavail(const sds s);
sds sdsgrowzero(sds s, size_t len);
sds sdscatlen(sds s, const void *t, size_t len);
sds sdscat(sds s, const char *t);
sds sdscatsds(sds s, const sds t);
sds sdscpylen(sds s, const char *t, size_t len);
sds sdscpy(sds s, const char *t);
sds sdscatvprintf(sds s, const char *fmt, va_list ap);
#ifdef __GNUC__
sds sdscatprintf(sds s, const char *fmt, ...)
__attribute__((format(printf, 2, 3)));
#else
sds sdscatprintf(sds s, const char *fmt, ...);
#endif
void sdstrim(sds s, const char *cset);
void sdsrange(sds s, int start, int end);
void sdsupdatelen(sds s);
void sdsclear(sds s);
int sdscmp(const sds s1, const sds s2);
sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count);
void sdsfreesplitres(sds *tokens, int count);
void sdstolower(sds s);
void sdstoupper(sds s);
sds sdsfromlonglong(long long value);
sds sdscatrepr(sds s, const char *p, size_t len);
sds *sdssplitargs(const char *line, int *argc);
sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen);
sds sdsjoin(char **argv, int argc, char *sep, size_t seplen);
sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen);
/* Low level functions exposed to the user API */
sds sdsMakeRoomFor(sds s, size_t addlen);
void sdsIncrLen(sds s, int incr);
sds sdsRemoveFreeSpace(sds s);
size_t sdsAllocSize(sds s);
#endif

198
src/string_utils.c Normal file
View File

@@ -0,0 +1,198 @@
#include "string_utils.h"
#include <stdio.h>
#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n) - 1)
int string_compare_case_insensitive(const char *str1, const char *str2) {
int c1, c2;
do {
c1 = tolower(*str1++);
c2 = tolower(*str2++);
} while (c1 == c2 && c1 != 0);
return c1 - c2;
}
int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len) {
register unsigned char *s1 = (unsigned char *) str1;
register unsigned char *s2 = (unsigned char *) str2;
unsigned char c1, c2;
if (!len)
return 0;
do {
c1 = *s1++;
c2 = *s2++;
if (!c1 || !c2)
break;
if (c1 == c2)
continue;
c1 = tolower(c1);
c2 = tolower(c2);
if (c1 != c2)
break;
} while (--len);
return (int)c1 - (int)c2;
}
int string_common_prefix(const char *str1, const char *str2) {
int common_prefix;
for (common_prefix = 0; *str1 && *str2 && *str1 == *str2; str1++, str2++)
common_prefix++;
return common_prefix;
}
int string_common_suffix(const char *str1, const char *str2) {
int common_suffix = 0;
int str1_len = strlen(str1);
int str2_len = strlen(str2);
int min_len = (str1_len < str2_len) ? str1_len : str2_len;
for (int i=1; i <= min_len && str1[str1_len-i] == str2[str2_len-i]; i++)
common_suffix++;
return common_suffix;
}
bool string_starts_with(const char *str, const char *start) {
for (; *start; str++, start++)
if (*str != *start)
return false;
return true;
}
bool string_ends_with(const char *str, const char *ending) {
int end_len = strlen(ending);
int str_len = strlen(str);
return str_len < end_len ? false : !strcmp(str + str_len - end_len, ending);
}
void string_upper(char *s) {
for (; *s; ++s) *s = toupper(*s);
}
void string_lower(char *s) {
for (; *s; ++s) *s = tolower(*s);
}
uint string_translate(sds str, char *word_chars, char *word_repls, size_t trans_len) {
uint num_replacements = 0;
size_t len = sdslen(str);
for (int i = 0; i < len; i++) {
for (int j = 0; j < trans_len; j++) {
if (str[i] == word_chars[j]) {
str[i] = word_repls[j];
num_replacements++;
break;
}
}
}
return num_replacements;
}
char *utf8_reversed_string(const char *s) {
int32_t unich;
ssize_t len, remaining;
size_t slen = strlen(s);
char *out = malloc(slen + 1);
uint8_t *ptr = (uint8_t *)s;
char *rev_ptr = out + slen;
while(1) {
len = utf8proc_iterate(ptr, -1, &unich);
remaining = len;
if (len <= 0 || unich == 0) break;
if (!(utf8proc_codepoint_valid(unich))) goto error_free_output;
rev_ptr -= len;
memcpy(rev_ptr, (char *)ptr, len);
ptr += len;
}
out[slen] = '\0';
return out;
error_free_output:
free(out);
return NULL;
}
bool utf8_is_letter(int32_t ch) {
const utf8proc_property_t *props = utf8proc_get_property(ch);
utf8proc_propval_t cat = props->category;
return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
|| cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO || \
cat == UTF8PROC_CATEGORY_LM;
}
char *string_strip_whitespace(char *str) {
char *end;
while (isspace(*str)) str++;
if (*str == '\0')
return str;
end = str + strlen(str) - 1;
while (end > str && isspace(*end)) end--;
*(end+1) = '\0';
return str;
}
void contiguous_string_array_add_string_unterminated(char_array *array, char *str) {
while (*str) {
char_array_push(array, *str++);
}
}
void contiguous_string_array_add_string(char_array *array, char *str) {
contiguous_string_array_add_string_unterminated(array, str);
char_array_push(array, '\0');
}
void contiguous_string_array_add_string_unterminated_len(char_array *array, char *str, size_t len) {
for (int i = 0; i < len; i++) {
char_array_push(array, *str++);
}
}
void contiguous_string_array_add_string_len(char_array *array, char *str, size_t len) {
contiguous_string_array_add_string_unterminated_len(array, str, len);
char_array_push(array, '\0');
}
// Designed for using the char_array and uchar_array to store lots of short strings
int contiguous_string_array_next_index(char_array *string_array, int i, size_t n) {
if (INVALID_INDEX(i, string_array->n)) {
return -1;
}
int len = 0;
char *array = string_array->a + i;
while (*array && i + len <= n - 1) {
array++;
len++;
}
if (len < n - 1) {
return len + 1;
}
return -1;
}

49
src/string_utils.h Normal file
View File

@@ -0,0 +1,49 @@
#ifndef STRING_UTILS_H
#define STRING_UTILS_H
#ifdef __cplusplus
extern "C" {
#endif
#include <ctype.h>
#include <string.h>
#include <stdbool.h>
#include "collections.h"
#include "sds/sds.h"
#include "utf8proc/utf8proc.h"
#include "vector.h"
VECTOR_INIT_FREE_DATA(string_array, sds, sdsfree)
// NOTE: this particular implementation works only for ASCII strings
int string_compare_case_insensitive(const char *str1, const char *str2);
int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len);
int string_common_prefix(const char *str1, const char *str2);
void string_lower(char *str);
void string_upper(char *str);
bool string_starts_with(const char *str, const char *start);
bool string_ends_with(const char *str, const char *ending);
uint string_translate(sds str, char *word_chars, char *word_repls, size_t trans_len);
char *utf8_reversed_string(const char *s); // returns a copy, caller frees
bool utf8_is_letter(int32_t ch);
char *string_strip_whitespace(char *str);
void contiguous_string_array_add_string_unterminated(char_array *array, char *str);
void contiguous_string_array_add_string(char_array *array, char *str);
void contiguous_string_array_add_string_unterminated_len(char_array *array, char *str, size_t len);
void contiguous_string_array_add_string_len(char_array *array, char *str, size_t len);
int contiguous_string_array_next_index(char_array *string_array, int i, size_t n);
#ifdef __cplusplus
}
#endif
#endif

67
src/token_types.h Normal file
View File

@@ -0,0 +1,67 @@
#ifndef TOKEN_TYPES_H
#define TOKEN_TYPES_H
#ifdef __cplusplus
extern "C" {
#endif
// Doing these as #defines so we can duplicate the values exactly in Python
#define END 0 // Null byte
// Word types
#define WORD 1 // Any letter-only word (includes all unicode letters)
#define ABBREVIATION 2 // Loose abbreviations (ending in ".")
// Numbers and numeric types
#define NUMBER 50 // All digits
#define NUMERIC 51 // Any sequence containing a digit
#define ORDINAL 52 // 1st, 2nd, etc.
#define NUMERIC_RANGE 53 // 2-3, Queens addresses, US ZIP+4 codes
#define ORDINAL_RANGE 54 // 1-2nd, 1st-2nd
#define ROMAN_NUMERAL 55 // II, III, VI, etc.
#define US_PHONE 56 // US phone number (with or without country code)
#define INTL_PHONE 57 // A non-US phone number (must have country code)
// Punctuation types, may separate a phrase
#define PERIOD 100
#define EXCLAMATION 101
#define QUESTION_MARK 102
#define COMMA 103
#define COLON 104
#define SEMICOLON 105
#define PLUS 106
#define AMPERSAND 107
#define AT_SIGN 108
#define POUND 109
#define ELLIPSIS 110
#define DASH 111
#define BREAKING_DASH 112
#define HYPHEN 113
#define LPAREN 114
#define RPAREN 115
#define LBSQUARE 116
#define RBSQUARE 117
#define DOUBLE_QUOTE 118
#define SINGLE_QUOTE 119
#define LEFT_DOUBLE_QUOTE 120
#define RIGHT_DOUBLE_QUOTE 121
#define LEFT_SINGLE_QUOTE 122
#define RIGHT_SINGLE_QUOTE 123
#define SLASH 124
#define BACKSLASH 125
#define GREATER_THAN 126
#define LESS_THAN 127
// Non-letters and whitespace
#define OTHER 200
#define WHITESPACE 300
#define NEWLINE 301
#ifdef __cplusplus
}
#endif
#endif

40
src/tokens.c Normal file
View File

@@ -0,0 +1,40 @@
#include "tokens.h"
tokenized_string_t *tokenized_string_new(void) {
tokenized_string_t *self = malloc(sizeof(tokenized_string_t));
self->str = char_array_new();
self->tokens = token_array_new();
return self;
}
void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, uint64_t position) {
char *ptr = (char *) (src + position);
size_t offset = self->str->n;
contiguous_string_array_add_string_len(self->str, ptr, len);
token_t token = (token_t){offset, len, token_type, position};
token_array_push(self->tokens, token);
}
char *tokenized_string_get_token(tokenized_string_t *self, uint64_t index) {
if (index < self->tokens->n) {
uint64_t i = self->tokens->a[index].offset;
return (char *)self->str->a + i;
} else {
return NULL;
}
}
void tokenized_string_destroy(tokenized_string_t *self) {
if (!self)
return;
if (self->str)
char_array_destroy(self->str);
if (self->tokens)
token_array_destroy(self->tokens);
free(self);
}

41
src/tokens.h Normal file
View File

@@ -0,0 +1,41 @@
#ifndef TOKENS_H
#define TOKENS_H
#ifdef __cplusplus
extern "C" {
#endif
#include "klib/khash.h"
#include "sds/sds.h"
#include "collections.h"
#include "string_utils.h"
#include "token_types.h"
#include "vector.h"
typedef struct token {
size_t offset;
size_t len;
uint16_t type;
uint64_t src_position;
} token_t;
VECTOR_INIT(token_array, token_t)
typedef struct tokenized_string {
char_array *str;
token_array *tokens;
} tokenized_string_t;
tokenized_string_t *tokenized_string_new(void);
void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_t len, uint16_t token_type, uint64_t src_position);
char *tokenized_string_get_token(tokenized_string_t *self, uint64_t index);
void tokenized_string_destroy(tokenized_string_t *self);
#ifdef __cplusplus
}
#endif
#endif

587
src/utf8proc/utf8proc.c Normal file
View File

@@ -0,0 +1,587 @@
/*
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
* This library contains derived data from a modified version of the
* Unicode data files.
*
* The original data files are available at
* http://www.unicode.org/Public/UNIDATA/
*
* Please notice the copyright statement in the file "utf8proc_data.c".
*/
/*
* File name: utf8proc.c
*
* Description:
* Implementation of libutf8proc.
*/
#include "utf8proc.h"
#include "utf8proc_data.c"
const int8_t utf8proc_utf8class[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172
/* END is exclusive */
#define UTF8PROC_HANGUL_L_START 0x1100
#define UTF8PROC_HANGUL_L_END 0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START 0x1160
#define UTF8PROC_HANGUL_V_END 0x11A3
#define UTF8PROC_HANGUL_T_START 0x11A8
#define UTF8PROC_HANGUL_T_END 0x11FA
#define UTF8PROC_HANGUL_S_START 0xAC00
#define UTF8PROC_HANGUL_S_END 0xD7A4
#define UTF8PROC_BOUNDCLASS_START 0
#define UTF8PROC_BOUNDCLASS_OTHER 1
#define UTF8PROC_BOUNDCLASS_CR 2
#define UTF8PROC_BOUNDCLASS_LF 3
#define UTF8PROC_BOUNDCLASS_CONTROL 4
#define UTF8PROC_BOUNDCLASS_EXTEND 5
#define UTF8PROC_BOUNDCLASS_L 6
#define UTF8PROC_BOUNDCLASS_V 7
#define UTF8PROC_BOUNDCLASS_T 8
#define UTF8PROC_BOUNDCLASS_LV 9
#define UTF8PROC_BOUNDCLASS_LVT 10
const char *utf8proc_version(void) {
return "1.1.6";
}
const char *utf8proc_errmsg(ssize_t errcode) {
switch (errcode) {
case UTF8PROC_ERROR_NOMEM:
return "Memory for processing UTF-8 data could not be allocated.";
case UTF8PROC_ERROR_OVERFLOW:
return "UTF-8 string is too long to be processed.";
case UTF8PROC_ERROR_INVALIDUTF8:
return "Invalid UTF-8 string";
case UTF8PROC_ERROR_NOTASSIGNED:
return "Unassigned Unicode code point found in UTF-8 string.";
case UTF8PROC_ERROR_INVALIDOPTS:
return "Invalid options for UTF-8 processing chosen.";
default:
return "An unknown error occured while processing UTF-8 data.";
}
}
ssize_t utf8proc_iterate(
const uint8_t *str, ssize_t strlen, int32_t *dst
) {
int length;
int i;
int32_t uc = -1;
*dst = -1;
if (!strlen) return 0;
length = utf8proc_utf8class[str[0]];
if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
for (i=1; i<length; i++) {
if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
}
switch (length) {
case 1:
uc = str[0];
break;
case 2:
uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
if (uc < 0x80) uc = -1;
break;
case 3:
uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
+ (str[2] & 0x3F);
if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
(uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
break;
case 4:
uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
+ ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
if (uc < 0x10000 || uc >= 0x110000) uc = -1;
break;
}
if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
return UTF8PROC_ERROR_INVALIDUTF8;
*dst = uc;
return length;
}
bool utf8proc_codepoint_valid(int32_t uc) {
if (uc < 0 || uc >= 0x110000 ||
((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
(uc >= 0xFDD0 && uc < 0xFDF0)) return false;
else return true;
}
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
if (uc < 0x00) {
return 0;
} else if (uc < 0x80) {
dst[0] = uc;
return 1;
} else if (uc < 0x800) {
dst[0] = 0xC0 + (uc >> 6);
dst[1] = 0x80 + (uc & 0x3F);
return 2;
} else if (uc == 0xFFFF) {
dst[0] = 0xFF;
return 1;
} else if (uc == 0xFFFE) {
dst[0] = 0xFE;
return 1;
} else if (uc < 0x10000) {
dst[0] = 0xE0 + (uc >> 12);
dst[1] = 0x80 + ((uc >> 6) & 0x3F);
dst[2] = 0x80 + (uc & 0x3F);
return 3;
} else if (uc < 0x110000) {
dst[0] = 0xF0 + (uc >> 18);
dst[1] = 0x80 + ((uc >> 12) & 0x3F);
dst[2] = 0x80 + ((uc >> 6) & 0x3F);
dst[3] = 0x80 + (uc & 0x3F);
return 4;
} else return 0;
}
const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
/* ASSERT: uc >= 0 && uc < 0x110000 */
return utf8proc_properties + (
utf8proc_stage2table[
utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
]
);
}
#define utf8proc_decompose_lump(replacement_uc) \
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
options & ~UTF8PROC_LUMP, last_boundclass)
ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
int options, int *last_boundclass) {
/* ASSERT: uc >= 0 && uc < 0x110000 */
const utf8proc_property_t *property;
utf8proc_propval_t category;
int32_t hangul_sindex;
property = utf8proc_get_property(uc);
category = property->category;
hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
int32_t hangul_tindex;
if (bufsize >= 1) {
dst[0] = UTF8PROC_HANGUL_LBASE +
hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
(hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
}
hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
if (!hangul_tindex) return 2;
if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
return 3;
}
}
if (options & UTF8PROC_REJECTNA) {
if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
}
if (options & UTF8PROC_IGNORE) {
if (property->ignorable) return 0;
}
if (options & UTF8PROC_LUMP) {
if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
utf8proc_decompose_lump(0x0027);
if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
utf8proc_decompose_lump(0x002D);
if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
utf8proc_decompose_lump(0x003C);
if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
utf8proc_decompose_lump(0x003E);
if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
utf8proc_decompose_lump(0x005E);
if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
utf8proc_decompose_lump(0x005F);
if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
if (category == UTF8PROC_CATEGORY_ZL ||
category == UTF8PROC_CATEGORY_ZP)
utf8proc_decompose_lump(0x000A);
}
}
if (options & UTF8PROC_STRIPMARK) {
if (category == UTF8PROC_CATEGORY_MN ||
category == UTF8PROC_CATEGORY_MC ||
category == UTF8PROC_CATEGORY_ME) return 0;
}
if (options & UTF8PROC_CASEFOLD) {
if (property->casefold_mapping) {
const int32_t *casefold_entry;
ssize_t written = 0;
for (casefold_entry = property->casefold_mapping;
*casefold_entry >= 0; casefold_entry++) {
written += utf8proc_decompose_char(*casefold_entry, dst+written,
(bufsize > written) ? (bufsize - written) : 0, options,
last_boundclass);
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
}
return written;
}
}
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
if (property->decomp_mapping &&
(!property->decomp_type || (options & UTF8PROC_COMPAT))) {
const int32_t *decomp_entry;
ssize_t written = 0;
for (decomp_entry = property->decomp_mapping;
*decomp_entry >= 0; decomp_entry++) {
written += utf8proc_decompose_char(*decomp_entry, dst+written,
(bufsize > written) ? (bufsize - written) : 0, options,
last_boundclass);
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
}
return written;
}
}
if (options & UTF8PROC_CHARBOUND) {
bool boundary;
int tbc, lbc;
tbc =
(uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
(uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
((category == UTF8PROC_CATEGORY_ZL ||
category == UTF8PROC_CATEGORY_ZP ||
category == UTF8PROC_CATEGORY_CC ||
category == UTF8PROC_CATEGORY_CF) &&
!(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
(uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
UTF8PROC_BOUNDCLASS_V :
(uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
UTF8PROC_BOUNDCLASS_T :
(uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
) :
UTF8PROC_BOUNDCLASS_OTHER;
lbc = *last_boundclass;
boundary =
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
(lbc == UTF8PROC_BOUNDCLASS_CR &&
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
(lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
(tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
(lbc == UTF8PROC_BOUNDCLASS_L &&
(tbc == UTF8PROC_BOUNDCLASS_L ||
tbc == UTF8PROC_BOUNDCLASS_V ||
tbc == UTF8PROC_BOUNDCLASS_LV ||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
((lbc == UTF8PROC_BOUNDCLASS_LV ||
lbc == UTF8PROC_BOUNDCLASS_V) &&
(tbc == UTF8PROC_BOUNDCLASS_V ||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
lbc == UTF8PROC_BOUNDCLASS_T) &&
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
true;
*last_boundclass = tbc;
if (boundary) {
if (bufsize >= 1) dst[0] = 0xFFFF;
if (bufsize >= 2) dst[1] = uc;
return 2;
}
}
if (bufsize >= 1) *dst = uc;
return 1;
}
ssize_t utf8proc_decompose(
const uint8_t *str, ssize_t strlen,
int32_t *buffer, ssize_t bufsize, int options
) {
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
ssize_t wpos = 0;
if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
return UTF8PROC_ERROR_INVALIDOPTS;
if ((options & UTF8PROC_STRIPMARK) &&
!(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
return UTF8PROC_ERROR_INVALIDOPTS;
{
int32_t uc;
ssize_t rpos = 0;
ssize_t decomp_result;
int boundclass = UTF8PROC_BOUNDCLASS_START;
while (1) {
if (options & UTF8PROC_NULLTERM) {
rpos += utf8proc_iterate(str + rpos, -1, &uc);
/* checking of return value is not neccessary,
as 'uc' is < 0 in case of error */
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
if (uc == 0) break;
} else {
if (rpos >= strlen) break;
rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
}
decomp_result = utf8proc_decompose_char(
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
&boundclass
);
if (decomp_result < 0) return decomp_result;
wpos += decomp_result;
/* prohibiting integer overflows due to too long strings: */
if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
return UTF8PROC_ERROR_OVERFLOW;
}
}
if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
ssize_t pos = 0;
while (pos < wpos-1) {
int32_t uc1, uc2;
const utf8proc_property_t *property1, *property2;
uc1 = buffer[pos];
uc2 = buffer[pos+1];
property1 = utf8proc_get_property(uc1);
property2 = utf8proc_get_property(uc2);
if (property1->combining_class > property2->combining_class &&
property2->combining_class > 0) {
buffer[pos] = uc2;
buffer[pos+1] = uc1;
if (pos > 0) pos--; else pos++;
} else {
pos++;
}
}
}
return wpos;
}
ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
ASSERT: 'buffer' has one spare byte of free space at the end! */
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
ssize_t rpos;
ssize_t wpos = 0;
int32_t uc;
for (rpos = 0; rpos < length; rpos++) {
uc = buffer[rpos];
if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
if (options & UTF8PROC_NLF2LS) {
if (options & UTF8PROC_NLF2PS) {
buffer[wpos++] = 0x000A;
} else {
buffer[wpos++] = 0x2028;
}
} else {
if (options & UTF8PROC_NLF2PS) {
buffer[wpos++] = 0x2029;
} else {
buffer[wpos++] = 0x0020;
}
}
} else if ((options & UTF8PROC_STRIPCC) &&
(uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
if (uc == 0x0009) buffer[wpos++] = 0x0020;
} else {
buffer[wpos++] = uc;
}
}
length = wpos;
}
if (options & UTF8PROC_COMPOSE) {
int32_t *starter = NULL;
int32_t current_char;
const utf8proc_property_t *starter_property = NULL, *current_property;
utf8proc_propval_t max_combining_class = -1;
ssize_t rpos;
ssize_t wpos = 0;
int32_t composition;
for (rpos = 0; rpos < length; rpos++) {
current_char = buffer[rpos];
current_property = utf8proc_get_property(current_char);
if (starter && current_property->combining_class > max_combining_class) {
/* combination perhaps possible */
int32_t hangul_lindex;
int32_t hangul_sindex;
hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
int32_t hangul_vindex;
hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
*starter = UTF8PROC_HANGUL_SBASE +
(hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
UTF8PROC_HANGUL_TCOUNT;
starter_property = NULL;
continue;
}
}
hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
int32_t hangul_tindex;
hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
*starter += hangul_tindex;
starter_property = NULL;
continue;
}
}
if (!starter_property) {
starter_property = utf8proc_get_property(*starter);
}
if (starter_property->comb1st_index >= 0 &&
current_property->comb2nd_index >= 0) {
composition = utf8proc_combinations[
starter_property->comb1st_index +
current_property->comb2nd_index
];
if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
!(utf8proc_get_property(composition)->comp_exclusion))) {
*starter = composition;
starter_property = NULL;
continue;
}
}
}
buffer[wpos] = current_char;
if (current_property->combining_class) {
if (current_property->combining_class > max_combining_class) {
max_combining_class = current_property->combining_class;
}
} else {
starter = buffer + wpos;
starter_property = NULL;
max_combining_class = -1;
}
wpos++;
}
length = wpos;
}
{
ssize_t rpos, wpos = 0;
int32_t uc;
for (rpos = 0; rpos < length; rpos++) {
uc = buffer[rpos];
wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
}
((uint8_t *)buffer)[wpos] = 0;
return wpos;
}
}
ssize_t utf8proc_map(
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
) {
int32_t *buffer;
ssize_t result;
*dstptr = NULL;
result = utf8proc_decompose(str, strlen, NULL, 0, options);
if (result < 0) return result;
buffer = malloc(result * sizeof(int32_t) + 1);
if (!buffer) return UTF8PROC_ERROR_NOMEM;
result = utf8proc_decompose(str, strlen, buffer, result, options);
if (result < 0) {
free(buffer);
return result;
}
result = utf8proc_reencode(buffer, result, options);
if (result < 0) {
free(buffer);
return result;
}
{
int32_t *newptr;
newptr = realloc(buffer, (size_t)result+1);
if (newptr) buffer = newptr;
}
*dstptr = (uint8_t *)buffer;
return result;
}
uint8_t *utf8proc_NFD(const uint8_t *str) {
uint8_t *retval;
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_DECOMPOSE);
return retval;
}
uint8_t *utf8proc_NFC(const uint8_t *str) {
uint8_t *retval;
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_COMPOSE);
return retval;
}
uint8_t *utf8proc_NFKD(const uint8_t *str) {
uint8_t *retval;
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
return retval;
}
uint8_t *utf8proc_NFKC(const uint8_t *str) {
uint8_t *retval;
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
return retval;
}

385
src/utf8proc/utf8proc.h Normal file
View File

@@ -0,0 +1,385 @@
/*
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
* File name: utf8proc.h
*
* Description:
* Header files for libutf8proc, which is a mapping tool for UTF-8 strings
* with following features:
* - decomposing and composing of strings
* - replacing compatibility characters with their equivalents
* - stripping of "default ignorable characters"
* like SOFT-HYPHEN or ZERO-WIDTH-SPACE
* - folding of certain characters for string comparison
* (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-")
* (see "LUMP" option)
* - optional rejection of strings containing non-assigned code points
* - stripping of control characters
* - stripping of character marks (accents, etc.)
* - transformation of LF, CRLF, CR and NEL to line-feed (LF)
* or to the unicode chararacters for paragraph separation (PS)
* or line separation (LS).
* - unicode case folding (for case insensitive string comparisons)
* - rejection of illegal UTF-8 data
* (i.e. UTF-8 encoded UTF-16 surrogates)
* - support for korean hangul characters
* Unicode Version 5.0.0 is supported.
*/
#ifndef UTF8PROC_H
#define UTF8PROC_H
#include <stdlib.h>
#include <sys/types.h>
#ifdef _MSC_VER
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef short int16_t;
typedef unsigned short uint16_t;
typedef int int32_t;
#ifdef _WIN64
#define ssize_t __int64
#else
#define ssize_t int
#endif
typedef unsigned char bool;
enum {false, true};
#else
#include <stdbool.h>
#include <inttypes.h>
#endif
#include <limits.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifndef SSIZE_MAX
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
#endif
#define UTF8PROC_NULLTERM (1<<0)
#define UTF8PROC_STABLE (1<<1)
#define UTF8PROC_COMPAT (1<<2)
#define UTF8PROC_COMPOSE (1<<3)
#define UTF8PROC_DECOMPOSE (1<<4)
#define UTF8PROC_IGNORE (1<<5)
#define UTF8PROC_REJECTNA (1<<6)
#define UTF8PROC_NLF2LS (1<<7)
#define UTF8PROC_NLF2PS (1<<8)
#define UTF8PROC_NLF2LF (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
#define UTF8PROC_STRIPCC (1<<9)
#define UTF8PROC_CASEFOLD (1<<10)
#define UTF8PROC_CHARBOUND (1<<11)
#define UTF8PROC_LUMP (1<<12)
#define UTF8PROC_STRIPMARK (1<<13)
/*
* Flags being regarded by several functions in the library:
* NULLTERM: The given UTF-8 input is NULL terminated.
* STABLE: Unicode Versioning Stability has to be respected.
* COMPAT: Compatiblity decomposition
* (i.e. formatting information is lost)
* COMPOSE: Return a result with composed characters.
* DECOMPOSE: Return a result with decomposed characters.
* IGNORE: Strip "default ignorable characters"
* REJECTNA: Return an error, if the input contains unassigned
* code points.
* NLF2LS: Indicating that NLF-sequences (LF, CRLF, CR, NEL) are
* representing a line break, and should be converted to the
* unicode character for line separation (LS).
* NLF2PS: Indicating that NLF-sequences are representing a paragraph
* break, and should be converted to the unicode character for
* paragraph separation (PS).
* NLF2LF: Indicating that the meaning of NLF-sequences is unknown.
* STRIPCC: Strips and/or convers control characters.
* NLF-sequences are transformed into space, except if one of
* the NLF2LS/PS/LF options is given.
* HorizontalTab (HT) and FormFeed (FF) are treated as a
* NLF-sequence in this case.
* All other control characters are simply removed.
* CASEFOLD: Performs unicode case folding, to be able to do a
* case-insensitive string comparison.
* CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which
* is representing a single grapheme cluster (see UAX#29).
* LUMP: Lumps certain characters together
* (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
* (See lump.txt for details.)
* If NLF2LF is set, this includes a transformation of
* paragraph and line separators to ASCII line-feed (LF).
* STRIPMARK: Strips all character markings
* (non-spacing, spacing and enclosing) (i.e. accents)
* NOTE: this option works only with COMPOSE or DECOMPOSE
*/
#define UTF8PROC_ERROR_NOMEM -1
#define UTF8PROC_ERROR_OVERFLOW -2
#define UTF8PROC_ERROR_INVALIDUTF8 -3
#define UTF8PROC_ERROR_NOTASSIGNED -4
#define UTF8PROC_ERROR_INVALIDOPTS -5
/*
* Error codes being returned by almost all functions:
* ERROR_NOMEM: Memory could not be allocated.
* ERROR_OVERFLOW: The given string is too long to be processed.
* ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string.
* ERROR_NOTASSIGNED: The REJECTNA flag was set,
* and an unassigned code point was found.
* ERROR_INVALIDOPTS: Invalid options have been used.
*/
typedef int16_t utf8proc_propval_t;
typedef struct utf8proc_property_struct {
utf8proc_propval_t category;
utf8proc_propval_t combining_class;
utf8proc_propval_t bidi_class;
utf8proc_propval_t decomp_type;
const int32_t *decomp_mapping;
unsigned bidi_mirrored:1;
int32_t uppercase_mapping;
int32_t lowercase_mapping;
int32_t titlecase_mapping;
int32_t comb1st_index;
int32_t comb2nd_index;
unsigned comp_exclusion:1;
unsigned ignorable:1;
unsigned control_boundary:1;
unsigned extend:1;
const int32_t *casefold_mapping;
} utf8proc_property_t;
#define UTF8PROC_CATEGORY_LU 1
#define UTF8PROC_CATEGORY_LL 2
#define UTF8PROC_CATEGORY_LT 3
#define UTF8PROC_CATEGORY_LM 4
#define UTF8PROC_CATEGORY_LO 5
#define UTF8PROC_CATEGORY_MN 6
#define UTF8PROC_CATEGORY_MC 7
#define UTF8PROC_CATEGORY_ME 8
#define UTF8PROC_CATEGORY_ND 9
#define UTF8PROC_CATEGORY_NL 10
#define UTF8PROC_CATEGORY_NO 11
#define UTF8PROC_CATEGORY_PC 12
#define UTF8PROC_CATEGORY_PD 13
#define UTF8PROC_CATEGORY_PS 14
#define UTF8PROC_CATEGORY_PE 15
#define UTF8PROC_CATEGORY_PI 16
#define UTF8PROC_CATEGORY_PF 17
#define UTF8PROC_CATEGORY_PO 18
#define UTF8PROC_CATEGORY_SM 19
#define UTF8PROC_CATEGORY_SC 20
#define UTF8PROC_CATEGORY_SK 21
#define UTF8PROC_CATEGORY_SO 22
#define UTF8PROC_CATEGORY_ZS 23
#define UTF8PROC_CATEGORY_ZL 24
#define UTF8PROC_CATEGORY_ZP 25
#define UTF8PROC_CATEGORY_CC 26
#define UTF8PROC_CATEGORY_CF 27
#define UTF8PROC_CATEGORY_CS 28
#define UTF8PROC_CATEGORY_CO 29
#define UTF8PROC_CATEGORY_CN 30
#define UTF8PROC_BIDI_CLASS_L 1
#define UTF8PROC_BIDI_CLASS_LRE 2
#define UTF8PROC_BIDI_CLASS_LRO 3
#define UTF8PROC_BIDI_CLASS_R 4
#define UTF8PROC_BIDI_CLASS_AL 5
#define UTF8PROC_BIDI_CLASS_RLE 6
#define UTF8PROC_BIDI_CLASS_RLO 7
#define UTF8PROC_BIDI_CLASS_PDF 8
#define UTF8PROC_BIDI_CLASS_EN 9
#define UTF8PROC_BIDI_CLASS_ES 10
#define UTF8PROC_BIDI_CLASS_ET 11
#define UTF8PROC_BIDI_CLASS_AN 12
#define UTF8PROC_BIDI_CLASS_CS 13
#define UTF8PROC_BIDI_CLASS_NSM 14
#define UTF8PROC_BIDI_CLASS_BN 15
#define UTF8PROC_BIDI_CLASS_B 16
#define UTF8PROC_BIDI_CLASS_S 17
#define UTF8PROC_BIDI_CLASS_WS 18
#define UTF8PROC_BIDI_CLASS_ON 19
#define UTF8PROC_DECOMP_TYPE_FONT 1
#define UTF8PROC_DECOMP_TYPE_NOBREAK 2
#define UTF8PROC_DECOMP_TYPE_INITIAL 3
#define UTF8PROC_DECOMP_TYPE_MEDIAL 4
#define UTF8PROC_DECOMP_TYPE_FINAL 5
#define UTF8PROC_DECOMP_TYPE_ISOLATED 6
#define UTF8PROC_DECOMP_TYPE_CIRCLE 7
#define UTF8PROC_DECOMP_TYPE_SUPER 8
#define UTF8PROC_DECOMP_TYPE_SUB 9
#define UTF8PROC_DECOMP_TYPE_VERTICAL 10
#define UTF8PROC_DECOMP_TYPE_WIDE 11
#define UTF8PROC_DECOMP_TYPE_NARROW 12
#define UTF8PROC_DECOMP_TYPE_SMALL 13
#define UTF8PROC_DECOMP_TYPE_SQUARE 14
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
#define UTF8PROC_DECOMP_TYPE_COMPAT 16
extern const int8_t utf8proc_utf8class[256];
const char *utf8proc_version(void);
const char *utf8proc_errmsg(ssize_t errcode);
/*
* Returns a static error string for the given error code.
*/
ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
/*
* Reads a single char from the UTF-8 sequence being pointed to by 'str'.
* The maximum number of bytes read is 'strlen', unless 'strlen' is
* negative.
* If a valid unicode char could be read, it is stored in the variable
* being pointed to by 'dst', otherwise that variable will be set to -1.
* In case of success the number of bytes read is returned, otherwise a
* negative error code is returned.
*/
bool utf8proc_codepoint_valid(int32_t uc);
/*
* Returns 1, if the given unicode code-point is valid, otherwise 0.
*/
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
/*
* Encodes the unicode char with the code point 'uc' as an UTF-8 string in
* the byte array being pointed to by 'dst'. This array has to be at least
* 4 bytes long.
* In case of success the number of bytes written is returned,
* otherwise 0.
* This function does not check if 'uc' is a valid unicode code point.
*/
const utf8proc_property_t *utf8proc_get_property(int32_t uc);
/*
* Returns a pointer to a (constant) struct containing information about
* the unicode char with the given code point 'uc'.
* If the character is not existent a pointer to a special struct is
* returned, where 'category' is a NULL pointer.
* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
* 0x10FFFF, otherwise the program might crash!
*/
ssize_t utf8proc_decompose_char(
int32_t uc, int32_t *dst, ssize_t bufsize,
int options, int *last_boundclass
);
/*
* Writes a decomposition of the unicode char 'uc' into the array being
* pointed to by 'dst'.
* Following flags in the 'options' field are regarded:
* REJECTNA: an unassigned unicode code point leads to an error
* IGNORE: "default ignorable" chars are stripped
* CASEFOLD: unicode casefolding is applied
* COMPAT: replace certain characters with their
* compatibility decomposition
* CHARBOUND: Inserts 0xFF bytes before each grapheme cluster
* LUMP: lumps certain different characters together
* STRIPMARK: removes all character marks
* The pointer 'last_boundclass' has to point to an integer variable which
* is storing the last character boundary class, if the CHARBOUND option
* is used.
* In case of success the number of chars written is returned,
* in case of an error, a negative error code is returned.
* If the number of written chars would be bigger than 'bufsize',
* the buffer (up to 'bufsize') has inpredictable data, and the needed
* buffer size is returned.
* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
* 0x10FFFF, otherwise the program might crash!
*/
ssize_t utf8proc_decompose(
const uint8_t *str, ssize_t strlen,
int32_t *buffer, ssize_t bufsize, int options
);
/*
* Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8
* string, and orders the decomposed sequences correctly.
* If the NULLTERM flag in 'options' is set, processing will be stopped,
* when a NULL byte is encounted, otherwise 'strlen' bytes are processed.
* The result in form of unicode code points is written into the buffer
* being pointed to by 'buffer', having the length of 'bufsize' entries.
* In case of success the number of chars written is returned,
* in case of an error, a negative error code is returned.
* If the number of written chars would be bigger than 'bufsize',
* the buffer (up to 'bufsize') has inpredictable data, and the needed
* buffer size is returned.
*/
ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
/*
* Reencodes the sequence of unicode characters given by the pointer
* 'buffer' and 'length' as UTF-8.
* The result is stored in the same memory area where the data is read.
* Following flags in the 'options' field are regarded:
* NLF2LS: converts LF, CRLF, CR and NEL into LS
* NLF2PS: converts LF, CRLF, CR and NEL into PS
* NLF2LF: converts LF, CRLF, CR and NEL into LF
* STRIPCC: strips or converts all non-affected control characters
* COMPOSE: tries to combine decomposed characters into composite
* characters
* STABLE: prohibits combining characters which would violate
* the unicode versioning stability
* In case of success the length of the resulting UTF-8 string is
* returned, otherwise a negative error code is returned.
* WARNING: The amount of free space being pointed to by 'buffer', has to
* exceed the amount of the input data by one byte, and the
* entries of the array pointed to by 'str' have to be in the
* range of 0x0000 to 0x10FFFF, otherwise the program might
* crash!
*/
ssize_t utf8proc_map(
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
);
/*
* Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8
* string, which is allocated dynamically, and afterwards pointed to by
* the pointer being pointed to by 'dstptr'.
* If the NULLTERM flag in the 'options' field is set, the length is
* determined by a NULL terminator, otherwise the parameter 'strlen' is
* evaluated to determine the string length, but in any case the result
* will be NULL terminated (though it might contain NULL characters
* before). Other flags in the 'options' field are passed to the functions
* defined above, and regarded as described.
* In case of success the length of the new string is returned,
* otherwise a negative error code is returned.
* NOTICE: The memory of the new UTF-8 string will have been allocated with
* 'malloc', and has theirfore to be freed with 'free'.
*/
uint8_t *utf8proc_NFD(const uint8_t *str);
uint8_t *utf8proc_NFC(const uint8_t *str);
uint8_t *utf8proc_NFKD(const uint8_t *str);
uint8_t *utf8proc_NFKC(const uint8_t *str);
/*
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
* normalized version of the null-terminated string 'str'.
*/
#ifdef __cplusplus
}
#endif
#endif

13383
src/utf8proc/utf8proc_data.c Normal file

File diff suppressed because it is too large Load Diff