[strings] adding latest utf8proc, new functions for utf8_lower (instead of case folding) and utf8_upper, and a utf8_is_whitespace that takes things like tabs into account

2016-12-31 00:52:12 -05:00
parent db16e656ca
commit 8978000320
4 changed files with 13218 additions and 15322 deletions
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -174,14 +174,63 @@ error_free_output:
    return NULL;
 }

-char *utf8_lower(const char *s) {
+/*
+ * Returns a newly allocated, NUL-terminated UTF-8 copy of `s` in which every
+ * code point has been passed through utf8proc_tolower() (casing == UTF8_LOWER)
+ * or utf8proc_toupper() (casing == UTF8_UPPER). Any other `casing` value
+ * leaves the code points untouched by the casing step. `options` is forwarded
+ * unchanged to utf8proc_decompose() and utf8proc_reencode().
+ *
+ * Returns NULL if `s` is NULL, if allocation fails, or if utf8proc reports an
+ * error. The caller owns the returned buffer and must free() it.
+ */
+char *utf8_case(const char *s, casing_option_t casing, utf8proc_option_t options) {
+    if (s == NULL) return NULL;
+
    ssize_t len = (ssize_t)strlen(s);
-    uint8_t *dest;
+    utf8proc_uint8_t *str = (utf8proc_uint8_t *)s;

-    utf8proc_map((const uint8_t *)s, len, &dest, UTF8PROC_OPTIONS_LOWERCASE);
-    return (char *)dest;
+    // Sizing pass: with no output buffer the call only reports the code point count.
+    utf8proc_ssize_t result = utf8proc_decompose(str, len, NULL, 0, options);
+    if (result < 0) return NULL;
+
+    // One extra byte leaves room for the terminator written when re-encoding in place.
+    utf8proc_int32_t *buffer = malloc(result * sizeof(utf8proc_int32_t) + 1);
+    if (buffer == NULL) return NULL;
+
+    result = utf8proc_decompose(str, len, buffer, result, options);
+    if (result < 0) {
+        free(buffer);
+        return NULL;
+    }
+
+    for (utf8proc_ssize_t i = 0; i < result; i++) {
+        utf8proc_int32_t uc = buffer[i];
+
+        if (casing == UTF8_LOWER) {
+            uc = utf8proc_tolower(uc);
+        } else if (casing == UTF8_UPPER) {
+            uc = utf8proc_toupper(uc);
+        }
+
+        // Store the mapped code point; without this the casing has no effect.
+        buffer[i] = uc;
+    }
+
+    // Re-encodes the code points as UTF-8 bytes inside the same buffer.
+    result = utf8proc_reencode(buffer, result, options);
+    if (result < 0) {
+        free(buffer);
+        return NULL;
+    }
+
+    // Shrink to the encoded size. If realloc fails the original (larger)
+    // buffer is still valid, so it is returned as-is.
+    void *shrunk = realloc(buffer, (size_t)result + 1);
+    if (shrunk != NULL) buffer = shrunk;
+
+    // Ownership passes to the caller, so the buffer must not be freed here.
+    return (char *)buffer;
 }

+// Lowercases `s` using caller-supplied utf8proc options; forwards to
+// utf8_case() and returns its result unchanged.
+inline char *utf8_lower_options(const char *s, utf8proc_option_t options) {
+    char *lowered = utf8_case(s, UTF8_LOWER, options);
+    return lowered;
+}
+
+// Lowercases `s` with the UTF8PROC_OPTIONS_NFC option set; forwards to
+// utf8_case() and returns its result unchanged.
+inline char *utf8_lower(const char *s) {
+    char *lowered = utf8_case(s, UTF8_LOWER, UTF8PROC_OPTIONS_NFC);
+    return lowered;
+}
+
+// Uppercases `s` using caller-supplied utf8proc options; forwards to
+// utf8_case() and returns its result unchanged.
+inline char *utf8_upper_options(const char *s, utf8proc_option_t options) {
+    char *uppered = utf8_case(s, UTF8_UPPER, options);
+    return uppered;
+}
+
+// Uppercases `s` with the UTF8PROC_OPTIONS_NFC option set; forwards to
+// utf8_case() and returns its result unchanged.
+inline char *utf8_upper(const char *s) {
+    char *uppered = utf8_case(s, UTF8_UPPER, UTF8PROC_OPTIONS_NFC);
+    return uppered;
+}
+
+
 inline bool utf8_is_letter(int cat) {
    return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU        \
            || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO    \
@@ -219,6 +268,18 @@ inline bool utf8_is_separator(int cat) {
    return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
 }

+// Reports whether code point `ch` counts as whitespace: either it is one of
+// the control characters listed below, or its utf8proc category is accepted
+// by utf8_is_separator().
+inline bool utf8_is_whitespace(int32_t ch) {
+    switch (ch) {
+        case 0x09: // character tabulation
+        case 0x0A: // line feed
+        case 0x0B: // line tabulation
+        case 0x0C: // form feed
+        case 0x0D: // carriage return
+        case 0x85: // next line
+            return true;
+        default:
+            return utf8_is_separator(utf8proc_category(ch));
+    }
+}
+
 int utf8_compare_len(const char *str1, const char *str2, size_t len) {
    if (len == 0) return 0;

@@ -415,11 +476,10 @@ inline bool string_contains_hyphen(char *str) {
    return string_next_hyphen_index(str, strlen(str)) >= 0;
 }

-size_t string_right_spaces(char *str) {
+size_t string_right_spaces_len(char *str, size_t len) {
    size_t spaces = 0;

    uint8_t *ptr = (uint8_t *)str;
-    ssize_t len = strlen(str);
    int32_t ch = 0;
    ssize_t index = len;

@@ -428,24 +488,22 @@ size_t string_right_spaces(char *str) {

        if (ch <= 0) break;

-        int cat = utf8proc_category(ch);
-        if (!utf8_is_separator(cat)) {
+        if (!utf8_is_whitespace(ch)) {
            break;
        }

        index -= char_len;
-        spaces++;
+        spaces += char_len;
    }

    return spaces;

 }

-size_t string_left_spaces(char *str) {
+size_t string_left_spaces_len(char *str, size_t len) {
    size_t spaces = 0;

    uint8_t *ptr = (uint8_t *)str;
-    size_t len = strlen(str);
    int32_t ch = 0;
    ssize_t index = 0;

@@ -454,22 +512,21 @@ size_t string_left_spaces(char *str) {

        if (ch <= 0) break;

-        int cat = utf8proc_category(ch);
-        if (!utf8_is_separator(cat)) {
+        if (!utf8_is_whitespace(ch)) {
            break;
        }
        index += char_len;
        ptr += char_len;
-        spaces++;
+        spaces += char_len;
    }

    return spaces;
 }

 char *string_trim(char *str) {
-    size_t left_spaces = string_left_spaces(str);
-    size_t right_spaces = string_right_spaces(str);
    size_t len = strlen(str);
+    size_t left_spaces = string_left_spaces_len(str, len);
+    size_t right_spaces = string_right_spaces_len(str, len);
    char *ret = strndup(str + left_spaces, len - left_spaces - right_spaces);
    return ret;
 }