From e4e84f0147a1ebd588eb29959af7773527857912 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 8 Dec 2017 14:28:30 -0500
Subject: [PATCH] [utils] adding unicode_common_prefix/unicode_common_suffix,
 string_hyphen_prefix_len and string_hyphen_suffix_len to string_utils

---
Reviewer notes (this section is ignored when the patch is applied):
* Hunks are laid out one source line per diff line.
* Doc comments were added to the four new functions.
* The bare `inline` on the two hyphen helpers was dropped; string_utils.h
  declares them as ordinary external functions.
* A non-positive decode length now yields 0 instead of being cast to size_t.
* Hunk line counts were recomputed for the added lines; the blob ids on the
  string_utils.c index line still name the original post-image.

 src/string_utils.c | 105 +++++++++++++++++++++++++++++++++++++++++++++
 src/string_utils.h |   5 ++
 2 files changed, 110 insertions(+)

diff --git a/src/string_utils.c b/src/string_utils.c
index 567c2213..7045dd25 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -410,6 +410,69 @@ bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array) {
     return true;
 }
 
+/**
+ * Count the leading elements shared by two code point arrays.
+ *
+ * Elements are compared pairwise from index 0; counting stops at the first
+ * mismatch or at the end of the shorter array.
+ *
+ * @param u1_array first array (its n and a fields are read)
+ * @param u2_array second array (its n and a fields are read)
+ * @return number of matching leading elements, at most the shorter length
+ */
+size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array) {
+    size_t len1 = u1_array->n;
+    size_t len2 = u2_array->n;
+
+    size_t min_len = len1 <= len2 ? len1 : len2;
+
+    uint32_t *u1 = u1_array->a;
+    uint32_t *u2 = u2_array->a;
+    size_t common_prefix = 0;
+
+    for (size_t i = 0; i < min_len; i++) {
+        if (u1[i] == u2[i]) {
+            common_prefix++;
+        } else {
+            break;
+        }
+    }
+    return common_prefix;
+}
+
+/**
+ * Count the trailing elements shared by two code point arrays.
+ *
+ * Elements are compared pairwise from the last element of each array
+ * backwards; counting stops at the first mismatch or once the shorter array
+ * is exhausted.
+ *
+ * @param u1_array first array (its n and a fields are read)
+ * @param u2_array second array (its n and a fields are read)
+ * @return number of matching trailing elements, at most the shorter length
+ */
+size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array) {
+    size_t len1 = u1_array->n;
+    size_t len2 = u2_array->n;
+
+    size_t min_len = len1 <= len2 ? len1 : len2;
+
+    uint32_t *u1 = u1_array->a;
+    uint32_t *u2 = u2_array->a;
+    size_t common_suffix = 0;
+
+    for (size_t i = 0; i < min_len; i++) {
+        // i < min_len, so len - i - 1 never wraps below zero for either array
+        if (u1[len1 - i - 1] == u2[len2 - i - 1]) {
+            common_suffix++;
+        } else {
+            break;
+        }
+    }
+    return common_suffix;
+}
+
+
 int utf8_compare_len(const char *str1, const char *str2, size_t len) {
     if (len == 0) return 0;
 
@@ -749,6 +812,48 @@ size_t string_right_spaces_len(char *str, size_t len) {
 
 }
 
+/**
+ * Byte length of a single leading hyphen in a UTF-8 string.
+ *
+ * Only the first character is examined; a run of hyphens is not consumed.
+ *
+ * @param str UTF-8 encoded bytes
+ * @param len number of bytes of str to consider
+ * @return encoded length in bytes of the first character when
+ *         utf8_is_hyphen accepts it, otherwise 0
+ */
+size_t string_hyphen_prefix_len(char *str, size_t len) {
+    int32_t unichr = 0;
+    uint8_t *ptr = (uint8_t *)str;
+    ssize_t char_len = utf8proc_iterate(ptr, len, &unichr);
+    // utf8proc_iterate reports failure as a negative length; never cast that
+    if (char_len > 0 && utf8_is_hyphen(unichr)) {
+        return (size_t)char_len;
+    }
+    return 0;
+}
+
+/**
+ * Byte length of a single trailing hyphen in a UTF-8 string.
+ *
+ * Only the last character is examined; a run of hyphens is not consumed.
+ *
+ * @param str UTF-8 encoded bytes
+ * @param len number of bytes of str to consider
+ * @return encoded length in bytes of the last character when
+ *         utf8_is_hyphen accepts it, otherwise 0
+ */
+size_t string_hyphen_suffix_len(char *str, size_t len) {
+    int32_t unichr = 0;
+    uint8_t *ptr = (uint8_t *)str;
+    ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr);
+    // Only a positive length is a decoded character; never cast anything else
+    if (char_len > 0 && utf8_is_hyphen(unichr)) {
+        return (size_t)char_len;
+    }
+    return 0;
+}
+
 size_t string_left_spaces_len(char *str, size_t len) {
     size_t spaces = 0;
 
diff --git a/src/string_utils.h b/src/string_utils.h
index 86a018d8..a94f1d93 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -89,6 +89,8 @@ ssize_t utf8_len(const char *str, size_t len);
 uint32_array *unicode_codepoints(const char *str);
 
 bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array);
+size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array);
+size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array);
 
 bool utf8_is_hyphen(int32_t ch);
 bool utf8_is_period(int32_t ch);
@@ -119,6 +121,9 @@ bool string_contains_period(char *str);
 
 char *string_trim(char *str);
 
+size_t string_hyphen_prefix_len(char *str, size_t len);
+size_t string_hyphen_suffix_len(char *str, size_t len);
+
 /* char_array is a dynamic character array defined in collections.h
 but has a few additional methods related to string manipulation.
 