[utils] UTF-8 comparison can handle an invalid UTF-8 sequence, e.g. for trie suffix comparison, where we may be in the middle of a multi-byte character. Also adds a standard utf8_common_prefix function.
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include "log/log.h"
|
||||||
#include "string_utils.h"
|
#include "string_utils.h"
|
||||||
|
|
||||||
#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n))
|
#define INVALID_INDEX(i, n) ((i) < 0 || (i) >= (n))
|
||||||
@@ -40,15 +41,15 @@ int string_compare_len_case_insensitive(const char *str1, const char *str2, size
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int string_common_prefix(const char *str1, const char *str2) {
|
size_t string_common_prefix(const char *str1, const char *str2) {
|
||||||
int common_prefix;
|
size_t common_prefix;
|
||||||
for (common_prefix = 0; *str1 && *str2 && *str1 == *str2; str1++, str2++)
|
for (common_prefix = 0; *str1 && *str2 && *str1 == *str2; str1++, str2++)
|
||||||
common_prefix++;
|
common_prefix++;
|
||||||
return common_prefix;
|
return common_prefix;
|
||||||
}
|
}
|
||||||
|
|
||||||
int string_common_suffix(const char *str1, const char *str2) {
|
size_t string_common_suffix(const char *str1, const char *str2) {
|
||||||
int common_suffix = 0;
|
size_t common_suffix = 0;
|
||||||
size_t str1_len = strlen(str1);
|
size_t str1_len = strlen(str1);
|
||||||
size_t str2_len = strlen(str2);
|
size_t str2_len = strlen(str2);
|
||||||
size_t min_len = (str1_len < str2_len) ? str1_len : str2_len;
|
size_t min_len = (str1_len < str2_len) ? str1_len : str2_len;
|
||||||
@@ -222,11 +223,41 @@ inline int utf8_compare(const char *str1, const char *str2) {
|
|||||||
return utf8_compare_len(str1, str2, strlen(str1));
|
return utf8_compare_len(str1, str2, strlen(str1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Length, in bytes, of the longest common prefix of two strings, compared
 * one decoded code point at a time.
 *
 * str1, str2: NUL-terminated strings to compare.
 *
 * Returns the number of leading bytes of str1 that were matched. The scan
 * stops at the first position where either decoded code point is <= 0
 * (which covers the terminating NUL, decoded as 0, and any negative value
 * left in the output by a failed decode), or where the two code points
 * differ.
 */
size_t utf8_common_prefix(const char *str1, const char *str2) {
    size_t common_prefix = 0;

    int32_t c1 = 0;
    int32_t c2 = 0;
    /* utf8proc_iterate returns a signed length, so keep these signed
       rather than storing a possible negative error code in a size_t. */
    ssize_t char_len1, char_len2;

    const uint8_t *ptr1 = (const uint8_t *)str1;
    const uint8_t *ptr2 = (const uint8_t *)str2;

    while (1) {
        char_len1 = utf8proc_iterate(ptr1, -1, &c1);
        char_len2 = utf8proc_iterate(ptr2, -1, &c2);

        /* End of either string, or a sequence that did not decode to a
           positive code point: nothing further can be matched. */
        if (c1 <= 0 || c2 <= 0) break;
        /* Defensive: never advance by a non-positive length. */
        if (char_len1 <= 0 || char_len2 <= 0) break;
        if (c1 != c2) break;

        ptr1 += char_len1;
        ptr2 += char_len2;
        /* Counted in bytes of str1; presumably equal code points have
           equal encoded lengths, so this also matches str2 -- TODO confirm. */
        common_prefix += (size_t)char_len1;
    }

    return common_prefix;
}
|
||||||
|
|
||||||
|
|
||||||
size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len) {
|
size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len) {
|
||||||
if (len == 0) return 0;
|
if (len == 0) return 0;
|
||||||
|
|
||||||
int32_t c1, c2;
|
int32_t c1 = -1, c2 = -1;
|
||||||
ssize_t len1, len2;
|
ssize_t len1, len2;
|
||||||
|
|
||||||
uint8_t *ptr1 = (uint8_t *)str1;
|
uint8_t *ptr1 = (uint8_t *)str1;
|
||||||
@@ -242,6 +273,16 @@ size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *st
|
|||||||
len1 = utf8proc_iterate(ptr1, -1, &c1);
|
len1 = utf8proc_iterate(ptr1, -1, &c1);
|
||||||
len2 = utf8proc_iterate(ptr2, -1, &c2);
|
len2 = utf8proc_iterate(ptr2, -1, &c2);
|
||||||
|
|
||||||
|
if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) {
|
||||||
|
ptr1++;
|
||||||
|
ptr2++;
|
||||||
|
remaining--;
|
||||||
|
match_len++;
|
||||||
|
one_char_match = true;
|
||||||
|
if (remaining == 0) break;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (c1 == 0 || c2 == 0) break;
|
if (c1 == 0 || c2 == 0) break;
|
||||||
|
|
||||||
if (c1 == c2) {
|
if (c1 == c2) {
|
||||||
|
|||||||
@@ -34,7 +34,8 @@ extern "C" {
|
|||||||
// NOTE: this particular implementation works only for ASCII strings
|
// NOTE: this particular implementation works only for ASCII strings
|
||||||
int string_compare_case_insensitive(const char *str1, const char *str2);
|
int string_compare_case_insensitive(const char *str1, const char *str2);
|
||||||
int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len);
|
int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len);
|
||||||
int string_common_prefix(const char *str1, const char *str2);
|
size_t string_common_prefix(const char *str1, const char *str2);
|
||||||
|
size_t string_common_suffix(const char *str1, const char *str2);
|
||||||
|
|
||||||
void string_lower(char *str);
|
void string_lower(char *str);
|
||||||
void string_upper(char *str);
|
void string_upper(char *str);
|
||||||
@@ -50,6 +51,7 @@ ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *ds
|
|||||||
char *utf8_lower(const char *s); // returns a copy, caller frees
|
char *utf8_lower(const char *s); // returns a copy, caller frees
|
||||||
int utf8_compare(const char *str1, const char *str2);
|
int utf8_compare(const char *str1, const char *str2);
|
||||||
int utf8_compare_len(const char *str1, const char *str2, size_t len);
|
int utf8_compare_len(const char *str1, const char *str2, size_t len);
|
||||||
|
size_t utf8_common_prefix(const char *str1, const char *str2);
|
||||||
size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2);
|
size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2);
|
||||||
size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len);
|
size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user