From 98c395d34c1af96fdf0a09144fec4f611cf2c74b Mon Sep 17 00:00:00 2001
From: Al
Date: Wed, 10 Feb 2016 09:21:31 -0500
Subject: [PATCH] [numex] Concatenating a string of numeric expressions with no intervening tokens like Seventeen Eighty or Ten Oh Four

---
 src/numex.c        |  9 ++++++---
 src/string_utils.c | 32 ++++++++++++++++++++++++++++++++
 src/string_utils.h |  2 ++
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/src/numex.c b/src/numex.c
index 06ae9608..661b1b4a 100644
--- a/src/numex.c
+++ b/src/numex.c
@@ -984,8 +984,8 @@ char *replace_numeric_expressions(char *str, char *lang) {
     size_t len = strlen(str);
 
     char_array *replacement = char_array_new_size(len);
-    int start = 0;
-    int end = 0;
+    size_t start = 0;
+    size_t end = 0;
 
     for (int i = 0; i < results->n; i++) {
         numex_result_t result = results->a[i];
@@ -1001,7 +1001,10 @@ char *replace_numeric_expressions(char *str, char *lang) {
         char numeric_string[INT64_MAX_STRING_SIZE] = {0};
         sprintf(numeric_string, "%" PRId64, result.value);
 
-        char_array_append_len(replacement, str + start, end - start);
+        if (!string_is_ignorable(str + start, end - start)) {
+            char_array_append_len(replacement, str + start, end - start);
+        }
+
         char_array_append(replacement, numeric_string);
 
         if (result.is_ordinal) {
diff --git a/src/string_utils.c b/src/string_utils.c
index 5fa8dbb1..70b2f304 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -361,6 +361,38 @@ inline size_t utf8_common_prefix_ignore_separators(const char *str1, const char
     return utf8_common_prefix_len_ignore_separators(str1, str2, strlen(str2));
 }
 
+/*
+ * Returns true if every code point in the first len bytes of str is a
+ * separator (utf8_is_separator on its category) or a hyphen (utf8_is_hyphen),
+ * including when len is 0. Returns false at the first other code point or
+ * one rejected by utf8proc_codepoint_valid. Scanning stops early, still
+ * returning true, at a NUL code point or when utf8proc_iterate returns a
+ * non-positive length.
+ */
+bool string_is_ignorable(char *str, size_t len) {
+    uint8_t *ptr = (uint8_t *)str;
+    size_t idx = 0;
+
+    while (idx < len) {
+        int32_t ch;
+        /* Only the bytes left in the span may be decoded, not the full len */
+        ssize_t char_len = utf8proc_iterate(ptr, len - idx, &ch);
+
+        if (char_len <= 0) break;
+        if (ch == 0) break;
+        if (!(utf8proc_codepoint_valid(ch))) return false;
+
+        int cat = utf8proc_category(ch);
+        if (!utf8_is_separator(cat) && !utf8_is_hyphen(ch)) {
+            return false;
+        }
+
+        ptr += char_len;
+        idx += char_len;
+    }
+
+    return true;
+}
 
 bool string_contains_hyphen_len(char *str, size_t len) {
     uint8_t *ptr = (uint8_t *)str;
diff --git a/src/string_utils.h b/src/string_utils.h
index 8253989c..76c12a15 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -84,6 +84,8 @@ bool utf8_is_punctuation(int cat);
 bool utf8_is_symbol(int cat);
 bool utf8_is_separator(int cat);
 
+bool string_is_ignorable(char *str, size_t len);
+
 bool string_contains_hyphen(char *str);
 bool string_contains_hyphen_len(char *str, size_t len);
 