[utils] UTF-8 lowercasing and string comparison, including a version which ignores dashes/spaces

2015-06-10 18:26:52 -04:00
parent cb603562e0
commit aad5f3edd3
2 changed files with 133 additions and 3 deletions
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -15,7 +15,7 @@ int string_compare_case_insensitive(const char *str1, const char *str2) {
    return c1 - c2;
 }

-int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len) {
+int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len) {
    register unsigned char *s1 = (unsigned char *) str1;
    register unsigned char *s2 = (unsigned char *) str2;

@@ -142,6 +142,14 @@ error_free_output:
    return NULL;
 }

+/*
+ * Return a newly allocated, case-folded copy of the NUL-terminated UTF-8
+ * string `s`, produced by utf8proc_map() with UTF8PROC_OPTIONS_LOWERCASE
+ * (which requests Unicode case folding, not a plain per-character tolower).
+ *
+ * Returns NULL if `s` is NULL or if utf8proc_map() reports an error
+ * (negative return value). The caller owns the result and must free() it.
+ */
+char *utf8_lower(const char *s) {
+    if (s == NULL) return NULL;
+
+    ssize_t len = (ssize_t)strlen(s);
+    /* Start from NULL and check the result explicitly instead of relying on
+     * what utf8proc_map() leaves in `dest` when it fails. */
+    uint8_t *dest = NULL;
+
+    ssize_t dest_len = utf8proc_map((const uint8_t *)s, len, &dest, UTF8PROC_OPTIONS_LOWERCASE);
+    if (dest_len < 0) return NULL;
+
+    return (char *)dest;
+}
+
 inline bool utf8_is_letter(int cat) {
    return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU        \
            || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO    \
@@ -179,6 +187,105 @@ inline bool utf8_is_separator(int cat) {
    return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
 }

+/*
+ * Compare two NUL-terminated UTF-8 strings code point by code point,
+ * examining at most the first `len` bytes of str1.
+ *
+ * Returns 0 when no difference is found within the limit, otherwise the
+ * difference (c1 - c2) between the first pair of differing code points; the
+ * end of a string counts as code point 0.
+ *
+ * A character that starts inside the first `len` bytes of str1 is compared in
+ * full even if its encoding extends past the limit. If either string contains
+ * a byte sequence utf8proc_iterate() cannot decode, the rest of the comparison
+ * falls back to a byte-wise strncmp() over the bytes still within the limit.
+ */
+int utf8_compare_len(const char *str1, const char *str2, size_t len) {
+    if (len == 0) return 0;
+
+    int32_t c1, c2;
+    ssize_t len1, len2;
+
+    const uint8_t *ptr1 = (const uint8_t *)str1;
+    const uint8_t *ptr2 = (const uint8_t *)str2;
+
+    size_t remaining = len;
+
+    while (1) {
+        len1 = utf8proc_iterate(ptr1, -1, &c1);
+        len2 = utf8proc_iterate(ptr2, -1, &c2);
+
+        /* A negative length is a decode error; advancing by it would move the
+         * pointers backwards, so switch to a plain byte comparison instead. */
+        if (len1 < 0 || len2 < 0) {
+            return strncmp((const char *)ptr1, (const char *)ptr2, remaining);
+        }
+
+        if (c1 == 0 || c2 == 0 || c1 != c2) break;
+
+        ptr1 += len1;
+        ptr2 += len2;
+
+        /* Stop once the byte budget is used up. Comparing before subtracting
+         * keeps `remaining` (unsigned) from wrapping when a multi-byte
+         * character straddles the limit. */
+        if ((size_t)len1 >= remaining) break;
+        remaining -= (size_t)len1;
+    }
+
+    return (int)(c1 - c2);
+}
+
+
+/*
+ * Like utf8_compare_len(), but a hyphen (utf8_is_hyphen) or a separator
+ * (utf8_is_separator on the code point's category) that does not match the
+ * other string's current character is skipped instead of being treated as a
+ * difference.
+ *
+ * `len` limits the number of bytes consumed from str1; separators skipped in
+ * str1 count against it, bytes consumed from str2 do not.
+ *
+ * Returns 0 when no difference is found within the limit, otherwise the
+ * difference (c1 - c2) of the code points at which the comparison stopped.
+ * Separators are only skipped while both strings still have characters left:
+ * once either string reaches its terminating NUL the comparison stops, so a
+ * trailing separator in the longer string yields a non-zero result.
+ *
+ * If either string contains a byte sequence utf8proc_iterate() cannot decode,
+ * the rest of the comparison falls back to a byte-wise strncmp() over the
+ * bytes still within the limit (separators are not skipped in the fallback).
+ */
+int utf8_compare_len_ignore_separators(const char *str1, const char *str2, size_t len) {
+    if (len == 0) return 0;
+
+    int32_t c1, c2;
+    ssize_t len1, len2;
+
+    const uint8_t *ptr1 = (const uint8_t *)str1;
+    const uint8_t *ptr2 = (const uint8_t *)str2;
+
+    size_t remaining = len;
+
+    while (1) {
+        len1 = utf8proc_iterate(ptr1, -1, &c1);
+        len2 = utf8proc_iterate(ptr2, -1, &c2);
+
+        /* A negative length is a decode error; advancing by it would move the
+         * pointers backwards, so switch to a plain byte comparison instead. */
+        if (len1 < 0 || len2 < 0) {
+            return strncmp((const char *)ptr1, (const char *)ptr2, remaining);
+        }
+
+        if (c1 == 0 || c2 == 0) break;
+
+        /* Bytes of str1 consumed by this step (0 when only str2 advances). */
+        size_t consumed = 0;
+
+        if (c1 == c2) {
+            ptr1 += len1;
+            ptr2 += len2;
+            consumed = (size_t)len1;
+        } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) {
+            ptr1 += len1;
+            consumed = (size_t)len1;
+        } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
+            ptr2 += len2;
+        } else {
+            break;
+        }
+
+        /* Comparing before subtracting keeps `remaining` (unsigned) from
+         * wrapping when a multi-byte character straddles the limit. */
+        if (consumed >= remaining) break;
+        remaining -= consumed;
+    }
+
+    return (int)(c1 - c2);
+}
+
+/*
+ * Compare two NUL-terminated UTF-8 strings code point by code point, skipping
+ * a hyphen (utf8_is_hyphen) or separator (utf8_is_separator on the code
+ * point's category) in either string when it does not match the other
+ * string's current character.
+ *
+ * Returns 0 if both strings end together with no difference found, otherwise
+ * the difference (c1 - c2) of the code points at which the comparison stopped;
+ * the end of a string counts as code point 0. Separators are only skipped
+ * while both strings still have characters left, so a trailing separator in
+ * the longer string yields a non-zero result.
+ *
+ * If either string contains a byte sequence utf8proc_iterate() cannot decode,
+ * the rest of the comparison falls back to a byte-wise strcmp() (separators
+ * are not skipped in the fallback).
+ */
+int utf8_compare_ignore_separators(const char *str1, const char *str2) {
+    int32_t c1, c2;
+    ssize_t len1, len2;
+
+    const uint8_t *ptr1 = (const uint8_t *)str1;
+    const uint8_t *ptr2 = (const uint8_t *)str2;
+
+    while (1) {
+        len1 = utf8proc_iterate(ptr1, -1, &c1);
+        len2 = utf8proc_iterate(ptr2, -1, &c2);
+
+        /* A negative length is a decode error; advancing by it would move the
+         * pointers backwards, so switch to a plain byte comparison instead. */
+        if (len1 < 0 || len2 < 0) {
+            return strcmp((const char *)ptr1, (const char *)ptr2);
+        }
+
+        if (c1 == 0 || c2 == 0) break;
+
+        if (c1 == c2) {
+            ptr1 += len1;
+            ptr2 += len2;
+        } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) {
+            ptr1 += len1;
+        } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
+            ptr2 += len2;
+        } else {
+            break;
+        }
+    }
+
+    return (int)(c1 - c2);
+}
+

 size_t string_rtrim(char *str) {
    size_t spaces = 0;
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -13,11 +13,27 @@ extern "C" {
 #include "utf8proc/utf8proc.h"
 #include "vector.h"

-  #define MAX_UTF8_CHAR_SIZE 4
+#define MAX_UTF8_CHAR_SIZE 4
+
+
+// Option sets passed to utf8proc. Every expansion is parenthesized so the
+// macro acts as a single operand next to operators that bind tighter than |
+// (for example `opts & UTF8PROC_OPTIONS_NFD`).
+#define UTF8PROC_OPTIONS_BASE (UTF8PROC_NULLTERM | UTF8PROC_STABLE)
+
+// Unicode normalization forms
+#define UTF8PROC_OPTIONS_NFD (UTF8PROC_OPTIONS_BASE | UTF8PROC_DECOMPOSE)
+#define UTF8PROC_OPTIONS_NFC (UTF8PROC_OPTIONS_BASE | UTF8PROC_COMPOSE)
+#define UTF8PROC_OPTIONS_NFKD (UTF8PROC_OPTIONS_NFD | UTF8PROC_COMPAT)
+#define UTF8PROC_OPTIONS_NFKC (UTF8PROC_OPTIONS_NFC | UTF8PROC_COMPAT)
+
+// Strip accents
+// NOTE(review): utf8proc documents UTF8PROC_STRIPMARK as working only together
+// with UTF8PROC_COMPOSE or UTF8PROC_DECOMPOSE - confirm callers OR this set
+// with one of the normalization sets above.
+#define UTF8PROC_OPTIONS_STRIP_ACCENTS (UTF8PROC_OPTIONS_BASE | UTF8PROC_STRIPMARK)
+
+// Lowercase (implemented via Unicode case folding)
+#define UTF8PROC_OPTIONS_LOWERCASE (UTF8PROC_OPTIONS_BASE | UTF8PROC_CASEFOLD)
+

 // NOTE: this particular implementation works only for ASCII strings
 int string_compare_case_insensitive(const char *str1, const char *str2);
-int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len);
+int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len);
 int string_common_prefix(const char *str1, const char *str2);

 void string_lower(char *str);
@@ -30,6 +46,13 @@ uint string_translate(char *str, size_t len, char *word_chars, char *word_repls,

 char *utf8_reversed_string(const char *s); // returns a copy, caller frees
 ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst);
+
+// Case-folded copy of a NUL-terminated UTF-8 string (see UTF8PROC_OPTIONS_LOWERCASE).
+char *utf8_lower(const char *s); // returns a copy, caller frees
+// NOTE(review): no definition of utf8_compare is visible alongside this
+// declaration - confirm it is implemented in string_utils.c.
+int utf8_compare(const char *str1, const char *str2);
+// Code-point-wise comparison limited by `len`, a byte count measured on str1.
+// Returns 0 if no difference is found, otherwise the difference of the code
+// points where the comparison stopped.
+int utf8_compare_len(const char *str1, const char *str2, size_t len);
+// Same comparisons, except that a hyphen or separator character that does not
+// match the other string's current character is skipped rather than compared.
+int utf8_compare_ignore_separators(const char *str1, const char *str2);
+int utf8_compare_len_ignore_separators(const char *str1, const char *str2, size_t len);
+
 bool utf8_is_hyphen(int32_t ch);
 bool utf8_is_letter(int cat);
 bool utf8_is_number(int cat);