[utils] Add a function that builds an array of uint32_t codepoints from a UTF-8 string; fix a few bugs in string_utils
This commit is contained in:
@@ -361,12 +361,35 @@ ssize_t utf8_len(const char *str, size_t len) {
|
|||||||
if (remaining == 0) break;
|
if (remaining == 0) break;
|
||||||
|
|
||||||
ptr += char_len;
|
ptr += char_len;
|
||||||
num_utf8_chars += char_len;
|
num_utf8_chars++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return num_utf8_chars;
|
return num_utf8_chars;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Decode a NUL-terminated UTF-8 string into an array of Unicode codepoints.
 *
 * @param str  NUL-terminated UTF-8 input; may be NULL.
 * @return     NULL if str is NULL or the array could not be created.
 *             Otherwise an array obtained from uint32_array_new() holding
 *             one element per decoded codepoint, in input order (empty for
 *             an empty string). If malformed UTF-8 is encountered, decoding
 *             stops there and the codepoints decoded so far are returned.
 *             NOTE(review): the caller presumably releases the array with
 *             the matching uint32_array destructor -- confirm.
 */
uint32_array *unicode_codepoints(const char *str) {
    if (str == NULL) return NULL;

    uint32_array *a = uint32_array_new();
    /* Assumes uint32_array_new() reports allocation failure with NULL. */
    if (a == NULL) return NULL;

    const uint8_t *ptr = (const uint8_t *)str;
    int32_t ch = 0;

    while (1) {
        /* A length of -1 tells utf8proc the input is NUL-terminated. */
        ssize_t char_len = utf8proc_iterate(ptr, -1, &ch);

        /*
         * utf8proc_iterate returns a negative error code on invalid input.
         * Advancing by it would walk the pointer backwards and push a bogus
         * codepoint, so stop decoding instead.
         */
        if (char_len <= 0) break;

        /* The terminating NUL decodes to codepoint 0: end of string. */
        if (ch == 0) break;

        uint32_array_push(a, (uint32_t)ch);
        ptr += char_len;
    }

    return a;
}
|
||||||
|
|
||||||
|
|
||||||
int utf8_compare_len(const char *str1, const char *str2, size_t len) {
|
int utf8_compare_len(const char *str1, const char *str2, size_t len) {
|
||||||
if (len == 0) return 0;
|
if (len == 0) return 0;
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *ds
|
|||||||
char *utf8_lower_options(const char *s, utf8proc_option_t options);
|
char *utf8_lower_options(const char *s, utf8proc_option_t options);
|
||||||
char *utf8_lower(const char *s);
|
char *utf8_lower(const char *s);
|
||||||
char *utf8_upper_options(const char *s, utf8proc_option_t options);
|
char *utf8_upper_options(const char *s, utf8proc_option_t options);
|
||||||
char *utf8_lower(const char *s);
|
char *utf8_upper(const char *s);
|
||||||
|
|
||||||
int utf8_compare(const char *str1, const char *str2);
|
int utf8_compare(const char *str1, const char *str2);
|
||||||
int utf8_compare_len(const char *str1, const char *str2, size_t len);
|
int utf8_compare_len(const char *str1, const char *str2, size_t len);
|
||||||
@@ -87,6 +87,8 @@ bool utf8_equal_ignore_separators(const char *str1, const char *str2);
|
|||||||
|
|
||||||
ssize_t utf8_len(const char *str, size_t len);
|
ssize_t utf8_len(const char *str, size_t len);
|
||||||
|
|
||||||
|
uint32_array *unicode_codepoints(const char *str);
|
||||||
|
|
||||||
bool utf8_is_hyphen(int32_t ch);
|
bool utf8_is_hyphen(int32_t ch);
|
||||||
bool utf8_is_letter(int cat);
|
bool utf8_is_letter(int cat);
|
||||||
bool utf8_is_number(int cat);
|
bool utf8_is_number(int cat);
|
||||||
|
|||||||
Reference in New Issue
Block a user