From 245aa226e087fd847947c9fa0c1953e7a12b43eb Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 19 Oct 2017 04:48:50 -0400
Subject: [PATCH] [utils] function to create an array of uint32_t codepoints from a UTF-8 string, a few bug fixes to string_utils

---
 src/string_utils.c | 34 +++++++++++++++++++++++++++++++++-
 src/string_utils.h |  4 +++-
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/src/string_utils.c b/src/string_utils.c
index 9d27cc37..f1155001 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -361,12 +361,44 @@ ssize_t utf8_len(const char *str, size_t len) {
         if (remaining == 0) break;
 
         ptr += char_len;
-        num_utf8_chars += char_len;
+        num_utf8_chars++;
     }
 
     return num_utf8_chars;
 }
 
+/*
+ * Decode a NUL-terminated UTF-8 string into a newly allocated array of
+ * Unicode codepoints. Returns NULL if str is NULL or the array cannot be
+ * allocated. Decoding stops at the terminating NUL or at the first byte
+ * sequence utf8proc_iterate rejects, so invalid input yields the codepoints
+ * decoded up to that point. The array is not freed here; the caller does that.
+ */
+uint32_array *unicode_codepoints(const char *str) {
+    if (str == NULL) return NULL;
+
+    uint32_array *a = uint32_array_new();
+    if (a == NULL) return NULL;
+
+    int32_t ch = 0;
+    ssize_t char_len;
+
+    const uint8_t *ptr = (const uint8_t *)str;
+
+    while (1) {
+        char_len = utf8proc_iterate(ptr, -1, &ch);
+
+        /* utf8proc_iterate reports errors as a negative length; advancing
+           ptr by it would walk backwards out of the string. */
+        if (char_len <= 0 || ch == 0) break;
+
+        uint32_array_push(a, (uint32_t)ch);
+        ptr += char_len;
+    }
+
+    return a;
+}
+
 int utf8_compare_len(const char *str1, const char *str2, size_t len) {
     if (len == 0) return 0;
 
diff --git a/src/string_utils.h b/src/string_utils.h
index 852f1813..e7760e45 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -74,7 +74,7 @@ ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *ds
 char *utf8_lower_options(const char *s, utf8proc_option_t options);
 char *utf8_lower(const char *s);
 char *utf8_upper_options(const char *s, utf8proc_option_t options);
-char *utf8_lower(const char *s);
+char *utf8_upper(const char *s);
 
 int utf8_compare(const char *str1, const char *str2);
 int utf8_compare_len(const char *str1, const char *str2, size_t len);
@@ -87,6 +87,8 @@ bool utf8_equal_ignore_separators(const char *str1, const char *str2);
 
 ssize_t utf8_len(const char *str, size_t len);
 
+uint32_array *unicode_codepoints(const char *str);
+
 bool utf8_is_hyphen(int32_t ch);
 bool utf8_is_letter(int cat);
 bool utf8_is_number(int cat);