From ca746304e3e14795401733d8358f75e586b1a737 Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 4 Jun 2015 13:20:14 -0400
Subject: [PATCH] [utils] Adding a few methods to string_utils for finding utf8proc category groups

---
 src/string_utils.c | 39 +++++++++++++++++++++++++++++++++------
 src/string_utils.h |  5 +++++
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/src/string_utils.c b/src/string_utils.c
index 836c9f63..bd63b056 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -142,15 +142,42 @@ error_free_output:
     return NULL;
 }
 
-bool utf8_is_letter(int32_t ch) {
-    const utf8proc_property_t *props = utf8proc_get_property(ch);
-    utf8proc_propval_t cat = props->category;
+inline bool utf8_is_letter(int32_t ch) {
+    int cat = utf8proc_category(ch);
     return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
-            || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO || \
-            cat == UTF8PROC_CATEGORY_LM;
-
+            || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
+            || cat == UTF8PROC_CATEGORY_LM;
 }
 
+inline bool utf8_is_number(int32_t ch) {
+    int cat = utf8proc_category(ch);
+    return cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO;
+}
+
+inline bool utf8_is_hyphen(int32_t ch) {
+    int cat = utf8proc_category(ch);
+    return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212; // U+2212 MINUS SIGN is category Sm, not Pd, so match it by code point
+}
+
+inline bool utf8_is_punctuation(int32_t ch) {
+    int cat = utf8proc_category(ch);
+    return cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PE \
+            || cat == UTF8PROC_CATEGORY_PF || cat == UTF8PROC_CATEGORY_PI \
+            || cat == UTF8PROC_CATEGORY_PO || cat == UTF8PROC_CATEGORY_PS;
+}
+
+inline bool utf8_is_symbol(int32_t ch) {
+    int cat = utf8proc_category(ch);
+    return cat == UTF8PROC_CATEGORY_SK || cat == UTF8PROC_CATEGORY_SC \
+            || cat == UTF8PROC_CATEGORY_SM || cat == UTF8PROC_CATEGORY_SO;
+}
+
+inline bool utf8_is_separator(int32_t ch) {
+    int cat = utf8proc_category(ch);
+    return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
+}
+
+
 size_t string_rtrim(char *str) {
     size_t spaces = 0;
 
diff --git a/src/string_utils.h b/src/string_utils.h
index f244811b..d1bc634c 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -31,6 +31,11 @@ uint string_translate(char *str, size_t len, char *word_chars, char *word_repls,
 char *utf8_reversed_string(const char *s); // returns a copy, caller frees
 ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst);
 bool utf8_is_letter(int32_t ch);
+bool utf8_is_number(int32_t ch);
+bool utf8_is_hyphen(int32_t ch);
+bool utf8_is_punctuation(int32_t ch);
+bool utf8_is_symbol(int32_t ch);
+bool utf8_is_separator(int32_t ch);
 
 size_t string_ltrim(char *str);
 size_t string_rtrim(char *str);