[utils] UTF-8 lowercasing and string comparison, including a version which ignores dashes/spaces
This commit is contained in:
@@ -15,7 +15,7 @@ int string_compare_case_insensitive(const char *str1, const char *str2) {
|
||||
return c1 - c2;
|
||||
}
|
||||
|
||||
int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len) {
|
||||
int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len) {
|
||||
register unsigned char *s1 = (unsigned char *) str1;
|
||||
register unsigned char *s2 = (unsigned char *) str2;
|
||||
|
||||
@@ -142,6 +142,14 @@ error_free_output:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char *utf8_lower(const char *s) {
|
||||
ssize_t len = (ssize_t)strlen(s);
|
||||
uint8_t *dest;
|
||||
|
||||
ssize_t dest_len = utf8proc_map((const uint8_t *)s, len, &dest, UTF8PROC_OPTIONS_LOWERCASE);
|
||||
return (char *)dest;
|
||||
}
|
||||
|
||||
inline bool utf8_is_letter(int cat) {
|
||||
return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
|
||||
|| cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
|
||||
@@ -179,6 +187,105 @@ inline bool utf8_is_separator(int cat) {
|
||||
return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
|
||||
}
|
||||
|
||||
/**
 * Compare two NUL-terminated UTF-8 strings code point by code point, looking
 * at no more than `len` bytes of `str1`.
 *
 * @param str1  first string; the `len` budget is counted in bytes of this one
 * @param str2  second string
 * @param len   maximum number of bytes of `str1` to compare
 * @return 0 if no difference was found within the budget; otherwise the
 *         difference between the code points at which comparison stopped.
 *         An undecodable sequence stops the comparison and counts as code
 *         point -1 on that side.
 */
int utf8_compare_len(const char *str1, const char *str2, size_t len) {
    if (len == 0) return 0;

    int32_t c1 = 0, c2 = 0;

    const uint8_t *ptr1 = (const uint8_t *)str1;
    const uint8_t *ptr2 = (const uint8_t *)str2;

    size_t remaining = len;

    while (remaining > 0) {
        ssize_t len1 = utf8proc_iterate(ptr1, -1, &c1);
        ssize_t len2 = utf8proc_iterate(ptr2, -1, &c2);

        // Invalid UTF-8 yields a negative length; advancing by it would walk
        // the pointers backwards, so stop here instead.
        if (len1 <= 0 || len2 <= 0) break;

        // End of either string.
        if (c1 == 0 || c2 == 0) break;

        if (c1 != c2) break;

        ptr1 += len1;
        ptr2 += len2;

        // Saturate at zero: a multi-byte character may straddle the end of
        // the budget, and an unsigned subtraction would wrap around.
        remaining = ((size_t)len1 < remaining) ? remaining - (size_t)len1 : 0;
    }

    return (int)(c1 - c2);
}
|
||||
|
||||
|
||||
/**
 * Like utf8_compare_len(), but hyphens and Unicode separator characters that
 * do not match the other string are skipped instead of treated as a mismatch.
 *
 * @param str1  first NUL-terminated UTF-8 string; the `len` budget is counted
 *              in bytes of this one (skipped separators in `str1` consume it,
 *              skipped separators in `str2` do not)
 * @param str2  second NUL-terminated UTF-8 string
 * @param len   maximum number of bytes of `str1` to examine
 * @return 0 if the budget was used up without finding a difference;
 *         otherwise the difference between the code points at which
 *         comparison stopped. An undecodable sequence stops the comparison
 *         and counts as code point -1 on that side.
 */
int utf8_compare_len_ignore_separators(const char *str1, const char *str2, size_t len) {
    if (len == 0) return 0;

    int32_t c1 = 0, c2 = 0;

    const uint8_t *ptr1 = (const uint8_t *)str1;
    const uint8_t *ptr2 = (const uint8_t *)str2;

    size_t remaining = len;

    while (remaining > 0) {
        ssize_t len1 = utf8proc_iterate(ptr1, -1, &c1);
        ssize_t len2 = utf8proc_iterate(ptr2, -1, &c2);

        // Invalid UTF-8 yields a negative length; never advance by it.
        if (len1 <= 0 || len2 <= 0) break;

        // End of either string.
        if (c1 == 0 || c2 == 0) break;

        if (c1 == c2) {
            ptr1 += len1;
            ptr2 += len2;
            // Saturating subtraction: avoid size_t wrap-around when a
            // multi-byte character straddles the end of the budget.
            remaining = ((size_t)len1 < remaining) ? remaining - (size_t)len1 : 0;
        } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) {
            // Skip an ignorable character in str1 only.
            ptr1 += len1;
            remaining = ((size_t)len1 < remaining) ? remaining - (size_t)len1 : 0;
        } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
            // Skip an ignorable character in str2 only; budget is untouched.
            ptr2 += len2;
        } else {
            break;
        }
    }

    // Every mismatch leaves the loop early with budget left over, so an
    // exhausted budget means nothing differed - even when the last thing
    // consumed was a skipped separator (where c1 != c2).
    if (remaining == 0) return 0;

    return (int)(c1 - c2);
}
|
||||
|
||||
/**
 * Compare two NUL-terminated UTF-8 strings code point by code point, skipping
 * hyphens and Unicode separator characters that do not match the other
 * string.
 *
 * @param str1  first string
 * @param str2  second string
 * @return the difference between the code points at which comparison
 *         stopped: 0 when both strings ended together, non-zero at the first
 *         real mismatch or when only one string has ended. An undecodable
 *         sequence stops the comparison and counts as code point -1 on that
 *         side.
 */
int utf8_compare_ignore_separators(const char *str1, const char *str2) {
    int32_t c1 = 0, c2 = 0;

    const uint8_t *ptr1 = (const uint8_t *)str1;
    const uint8_t *ptr2 = (const uint8_t *)str2;

    for (;;) {
        ssize_t len1 = utf8proc_iterate(ptr1, -1, &c1);
        ssize_t len2 = utf8proc_iterate(ptr2, -1, &c2);

        // Invalid UTF-8 yields a negative length (and code point -1 on both
        // sides would compare equal); advancing by it would move the
        // pointers backwards and never terminate.
        if (len1 <= 0 || len2 <= 0) break;

        // End of either string.
        if (c1 == 0 || c2 == 0) break;

        if (c1 == c2) {
            ptr1 += len1;
            ptr2 += len2;
        } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) {
            // Skip an ignorable character in str1 only.
            ptr1 += len1;
        } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) {
            // Skip an ignorable character in str2 only.
            ptr2 += len2;
        } else {
            break;
        }
    }

    return (int)(c1 - c2);
}
|
||||
|
||||
|
||||
size_t string_rtrim(char *str) {
|
||||
size_t spaces = 0;
|
||||
|
||||
@@ -13,11 +13,27 @@ extern "C" {
|
||||
#include "utf8proc/utf8proc.h"
|
||||
#include "vector.h"
|
||||
|
||||
#define MAX_UTF8_CHAR_SIZE 4
|
||||
#define MAX_UTF8_CHAR_SIZE 4
|
||||
|
||||
|
||||
// Option sets passed to utf8proc mapping functions. Every expansion is
// parenthesized so the macros keep their meaning inside larger expressions
// (e.g. `opts & UTF8PROC_OPTIONS_BASE` or `~UTF8PROC_OPTIONS_NFC`), where a
// bare `A | B` would bind incorrectly.
#define UTF8PROC_OPTIONS_BASE (UTF8PROC_NULLTERM | UTF8PROC_STABLE)

// Unicode normalization forms
#define UTF8PROC_OPTIONS_NFD (UTF8PROC_OPTIONS_BASE | UTF8PROC_DECOMPOSE)
#define UTF8PROC_OPTIONS_NFC (UTF8PROC_OPTIONS_BASE | UTF8PROC_COMPOSE)
#define UTF8PROC_OPTIONS_NFKD (UTF8PROC_OPTIONS_NFD | UTF8PROC_COMPAT)
#define UTF8PROC_OPTIONS_NFKC (UTF8PROC_OPTIONS_NFC | UTF8PROC_COMPAT)

// Strip accents
#define UTF8PROC_OPTIONS_STRIP_ACCENTS (UTF8PROC_OPTIONS_BASE | UTF8PROC_STRIPMARK)

// Lowercase
#define UTF8PROC_OPTIONS_LOWERCASE (UTF8PROC_OPTIONS_BASE | UTF8PROC_CASEFOLD)
|
||||
|
||||
|
||||
// NOTE: this particular implementation works only for ASCII strings
|
||||
int string_compare_case_insensitive(const char *str1, const char *str2);
|
||||
int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len);
|
||||
int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len);
|
||||
int string_common_prefix(const char *str1, const char *str2);
|
||||
|
||||
void string_lower(char *str);
|
||||
@@ -30,6 +46,13 @@ uint string_translate(char *str, size_t len, char *word_chars, char *word_repls,
|
||||
|
||||
char *utf8_reversed_string(const char *s); // returns a copy, caller frees
|
||||
ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst);
|
||||
|
||||
// Maps `s` through utf8proc with UTF8PROC_OPTIONS_LOWERCASE (case folding).
char *utf8_lower(const char *s); // returns a copy, caller frees
|
||||
// NOTE(review): definition not visible here - presumably compares two
// NUL-terminated UTF-8 strings code point by code point; confirm.
int utf8_compare(const char *str1, const char *str2);
|
||||
// Compares code point by code point; `len` is a byte budget counted against
// str1. Returns 0 when no difference was found, otherwise the difference
// between the code points at which comparison stopped.
int utf8_compare_len(const char *str1, const char *str2, size_t len);
|
||||
// Same return convention, but hyphens and Unicode separator characters that
// do not match the other string are skipped rather than treated as a mismatch.
int utf8_compare_ignore_separators(const char *str1, const char *str2);
|
||||
// Length-limited variant of utf8_compare_ignore_separators; `len` is a byte
// budget counted against str1.
int utf8_compare_len_ignore_separators(const char *str1, const char *str2, size_t len);
|
||||
|
||||
bool utf8_is_hyphen(int32_t ch);
|
||||
bool utf8_is_letter(int cat);
|
||||
bool utf8_is_number(int cat);
|
||||
|
||||
Reference in New Issue
Block a user