[utils] UTF-8 lowercasing and string comparison, including a version which ignores dashes/spaces

This commit is contained in:
Al
2015-06-10 18:26:52 -04:00
parent cb603562e0
commit aad5f3edd3
2 changed files with 133 additions and 3 deletions

View File

@@ -15,7 +15,7 @@ int string_compare_case_insensitive(const char *str1, const char *str2) {
return c1 - c2;
}
int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len) {
int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len) {
register unsigned char *s1 = (unsigned char *) str1;
register unsigned char *s2 = (unsigned char *) str2;
@@ -142,6 +142,14 @@ error_free_output:
return NULL;
}
/**
 * Produce a lowercased copy of a UTF-8 string.
 *
 * The conversion is delegated to utf8proc_map() with the
 * UTF8PROC_OPTIONS_LOWERCASE option set.
 *
 * @param s  NUL-terminated input string. NULL is accepted and yields NULL.
 * @return   The buffer handed back by utf8proc_map() (per the utf8proc
 *           documentation this is heap-allocated and owned by the caller),
 *           or NULL when utf8proc_map() reports an error.
 */
char *utf8_lower(const char *s) {
    if (s == NULL) {
        return NULL;
    }

    /* Start from NULL so a failed mapping can never leak an indeterminate pointer. */
    uint8_t *dest = NULL;
    ssize_t dest_len = utf8proc_map((const uint8_t *)s, (ssize_t)strlen(s), &dest, UTF8PROC_OPTIONS_LOWERCASE);

    /* utf8proc_map() signals failure with a negative return value. */
    if (dest_len < 0) {
        return NULL;
    }

    return (char *)dest;
}
inline bool utf8_is_letter(int cat) {
return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
|| cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
@@ -179,6 +187,105 @@ inline bool utf8_is_separator(int cat) {
return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
}
/**
 * Compare two NUL-terminated UTF-8 strings code point by code point,
 * consuming at most len bytes of str1.
 *
 * Decoding stops at the first differing code point, at the end of either
 * string, or once the byte budget is used up. A character that starts inside
 * the budget is still compared even if it extends past it, but nothing after
 * it is examined.
 *
 * A byte that utf8proc_iterate() rejects is treated as a one-byte unit with a
 * negative pseudo code point derived from the byte value. It therefore only
 * matches the same raw byte in the other string and never a valid code point.
 *
 * @param str1  first string
 * @param str2  second string
 * @param len   maximum number of bytes of str1 to compare; 0 returns 0
 * @return      0 when everything compared is equal, otherwise the difference
 *              between the first pair of differing code points
 */
int utf8_compare_len(const char *str1, const char *str2, size_t len) {
    if (len == 0) return 0;

    const uint8_t *ptr1 = (const uint8_t *)str1;
    const uint8_t *ptr2 = (const uint8_t *)str2;
    size_t remaining = len;
    int32_t c1 = 0;
    int32_t c2 = 0;

    while (1) {
        ssize_t len1 = utf8proc_iterate(ptr1, -1, &c1);
        ssize_t len2 = utf8proc_iterate(ptr2, -1, &c2);

        /* Negative length means malformed input: never advance by it. */
        if (len1 < 0) {
            c1 = -1 - (int32_t)*ptr1;
            len1 = 1;
        }
        if (len2 < 0) {
            c2 = -1 - (int32_t)*ptr2;
            len2 = 1;
        }

        if (c1 == 0 || c2 == 0 || c1 != c2) break;

        /* remaining is unsigned: stop before the subtraction could wrap. */
        if ((size_t)len1 >= remaining) break;

        ptr1 += len1;
        ptr2 += len2;
        remaining -= (size_t)len1;
    }

    return (int)(c1 - c2);
}
/**
 * Compare two NUL-terminated UTF-8 strings code point by code point while
 * skipping hyphens and separator-category characters that appear in only one
 * of them, consuming at most len bytes of str1.
 *
 * Only bytes consumed from str1 (matched or skipped) count against len;
 * characters skipped in str2 do not.
 *
 * A byte that utf8proc_iterate() rejects is treated as a one-byte unit with a
 * negative pseudo code point derived from the byte value. It only matches the
 * same raw byte in the other string and is never considered a separator.
 *
 * The end-of-string check runs before the separator check, so a separator
 * that trails only one of the strings is not skipped.
 * NOTE(review): when the budget runs out on a separator skipped in str1, the
 * result is the (non-zero) difference of the current pair, as before; confirm
 * whether callers would rather get 0 here.
 *
 * @param str1  first string
 * @param str2  second string
 * @param len   maximum number of bytes of str1 to consume; 0 returns 0
 * @return      0 when everything compared is equal, otherwise the difference
 *              between the code points at which comparison stopped
 */
int utf8_compare_len_ignore_separators(const char *str1, const char *str2, size_t len) {
    if (len == 0) return 0;

    const uint8_t *ptr1 = (const uint8_t *)str1;
    const uint8_t *ptr2 = (const uint8_t *)str2;
    size_t remaining = len;
    int32_t c1 = 0;
    int32_t c2 = 0;

    while (1) {
        ssize_t len1 = utf8proc_iterate(ptr1, -1, &c1);
        ssize_t len2 = utf8proc_iterate(ptr2, -1, &c2);

        /* Negative length means malformed input: never advance by it. */
        if (len1 < 0) {
            c1 = -1 - (int32_t)*ptr1;
            len1 = 1;
        }
        if (len2 < 0) {
            c2 = -1 - (int32_t)*ptr2;
            len2 = 1;
        }

        if (c1 == 0 || c2 == 0) break;

        if (c1 == c2) {
            /* remaining is unsigned: stop before the subtraction could wrap. */
            if ((size_t)len1 >= remaining) break;
            ptr1 += len1;
            ptr2 += len2;
            remaining -= (size_t)len1;
        } else if (c1 > 0 && (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1)))) {
            /* Skipping in str1 also spends budget, with the same wrap guard. */
            if ((size_t)len1 >= remaining) break;
            ptr1 += len1;
            remaining -= (size_t)len1;
        } else if (c2 > 0 && (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2)))) {
            ptr2 += len2;
        } else {
            break;
        }
    }

    return (int)(c1 - c2);
}
/**
 * Compare two NUL-terminated UTF-8 strings code point by code point while
 * skipping hyphens and separator-category characters that appear in only one
 * of them.
 *
 * A byte that utf8proc_iterate() rejects is treated as a one-byte unit with a
 * negative pseudo code point derived from the byte value. It only matches the
 * same raw byte in the other string and is never considered a separator.
 *
 * The end-of-string check runs before the separator check, so a separator
 * that trails only one of the strings is not skipped.
 *
 * @param str1  first string
 * @param str2  second string
 * @return      the difference between the code points at which comparison
 *              stopped (first unskippable mismatch, or the end of either string)
 */
int utf8_compare_ignore_separators(const char *str1, const char *str2) {
    const uint8_t *ptr1 = (const uint8_t *)str1;
    const uint8_t *ptr2 = (const uint8_t *)str2;
    int32_t c1 = 0;
    int32_t c2 = 0;

    while (1) {
        ssize_t len1 = utf8proc_iterate(ptr1, -1, &c1);
        ssize_t len2 = utf8proc_iterate(ptr2, -1, &c2);

        /* Negative length means malformed input: never advance by it. */
        if (len1 < 0) {
            c1 = -1 - (int32_t)*ptr1;
            len1 = 1;
        }
        if (len2 < 0) {
            c2 = -1 - (int32_t)*ptr2;
            len2 = 1;
        }

        if (c1 == 0 || c2 == 0) break;

        if (c1 == c2) {
            ptr1 += len1;
            ptr2 += len2;
        } else if (c1 > 0 && (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1)))) {
            ptr1 += len1;
        } else if (c2 > 0 && (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2)))) {
            ptr2 += len2;
        } else {
            break;
        }
    }

    return (int)(c1 - c2);
}
size_t string_rtrim(char *str) {
size_t spaces = 0;