From f8a808e25426f7c29c6ad8d9420be5e0a219b6d8 Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 12 Oct 2017 11:16:53 -0400
Subject: [PATCH] [utils] adding utf8_len function for strings, and
 utf8_is_digit

---
 src/string_utils.c | 40 ++++++++++++++++++++++++++++++++++++++++
 src/string_utils.h |  2 ++
 2 files changed, 42 insertions(+)

diff --git a/src/string_utils.c b/src/string_utils.c
index b337de47..b2dc2bbd 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -293,6 +293,10 @@ inline bool utf8_is_letter(int cat) {
             || cat == UTF8PROC_CATEGORY_LM;
 }
 
+inline bool utf8_is_digit(int cat) {
+    return cat == UTF8PROC_CATEGORY_ND;
+}
+
 inline bool utf8_is_number(int cat) {
     return cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO;
 }
@@ -336,6 +340,42 @@ inline bool utf8_is_whitespace(int32_t ch) {
            ;
 }
 
+
+/*
+ * Count the Unicode code points in the first len bytes of the UTF-8 string str.
+ *
+ * Decoding stops early at a NUL code point, so len may exceed the actual
+ * length of a NUL-terminated string.
+ *
+ * Returns the number of code points, or -1 if str is NULL or the bytes are
+ * not valid UTF-8 (including a multi-byte sequence cut off by len).
+ */
+ssize_t utf8_len(const char *str, size_t len) {
+    if (str == NULL) return -1;
+
+    const uint8_t *ptr = (const uint8_t *)str;
+    size_t remaining = len;
+    ssize_t num_utf8_chars = 0;
+
+    while (remaining > 0) {
+        int32_t ch = 0;
+        // Bound the decoder by the bytes left so it never reads past len
+        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)remaining, &ch);
+
+        // Negative means a decoding error; zero would never advance
+        if (char_len <= 0) return -1;
+        if (ch == 0) break;
+
+        ptr += char_len;
+        remaining -= (size_t)char_len;
+        // One code point per iteration, regardless of its byte width
+        num_utf8_chars++;
+    }
+
+    return num_utf8_chars;
+}
+
+
 int utf8_compare_len(const char *str1, const char *str2, size_t len) {
     if (len == 0) return 0;
 
diff --git a/src/string_utils.h b/src/string_utils.h
index 0e7dd235..0cf0382c 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -83,6 +83,8 @@ size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len);
 size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2);
 size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len);
 
+ssize_t utf8_len(const char *str, size_t len);
+
 bool utf8_is_hyphen(int32_t ch);
 bool utf8_is_letter(int cat);
 bool utf8_is_number(int cat);