[numex] Concatenating a string of numeric expressions with no intervening tokens like Seventeen Eighty or Ten Oh Four
This commit is contained in:
@@ -984,8 +984,8 @@ char *replace_numeric_expressions(char *str, char *lang) {
|
|||||||
size_t len = strlen(str);
|
size_t len = strlen(str);
|
||||||
|
|
||||||
char_array *replacement = char_array_new_size(len);
|
char_array *replacement = char_array_new_size(len);
|
||||||
int start = 0;
|
size_t start = 0;
|
||||||
int end = 0;
|
size_t end = 0;
|
||||||
|
|
||||||
for (int i = 0; i < results->n; i++) {
|
for (int i = 0; i < results->n; i++) {
|
||||||
numex_result_t result = results->a[i];
|
numex_result_t result = results->a[i];
|
||||||
@@ -1001,7 +1001,10 @@ char *replace_numeric_expressions(char *str, char *lang) {
|
|||||||
char numeric_string[INT64_MAX_STRING_SIZE] = {0};
|
char numeric_string[INT64_MAX_STRING_SIZE] = {0};
|
||||||
sprintf(numeric_string, "%" PRId64, result.value);
|
sprintf(numeric_string, "%" PRId64, result.value);
|
||||||
|
|
||||||
char_array_append_len(replacement, str + start, end - start);
|
if (!string_is_ignorable(str + start, end - start)) {
|
||||||
|
char_array_append_len(replacement, str + start, end - start);
|
||||||
|
}
|
||||||
|
|
||||||
char_array_append(replacement, numeric_string);
|
char_array_append(replacement, numeric_string);
|
||||||
|
|
||||||
if (result.is_ordinal) {
|
if (result.is_ordinal) {
|
||||||
|
|||||||
@@ -361,6 +361,31 @@ inline size_t utf8_common_prefix_ignore_separators(const char *str1, const char
|
|||||||
return utf8_common_prefix_len_ignore_separators(str1, str2, strlen(str2));
|
return utf8_common_prefix_len_ignore_separators(str1, str2, strlen(str2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Check whether a byte span contains nothing but separator and hyphen
 * characters.
 *
 * The span is decoded as UTF-8 one code point at a time. Each code point is
 * accepted if utf8_is_separator() holds for its utf8proc category or
 * utf8_is_hyphen() holds for the code point itself.
 *
 * @param str  Start of the span to inspect.
 * @param len  Number of bytes in the span.
 * @return false as soon as a code point is invalid or is neither a separator
 *         nor a hyphen; true otherwise. An empty span yields true. Scanning
 *         also stops (and the result is true) at a NUL code point or when
 *         the decoder reports an error or a zero-length read.
 */
bool string_is_ignorable(char *str, size_t len) {
    uint8_t *ptr = (uint8_t *)str;
    size_t idx = 0;

    while (idx < len) {
        int32_t ch;
        // Bound the decoder by the bytes still remaining in the span; ptr has
        // already advanced by idx, so passing the full len would let it read
        // past the end of the caller's span.
        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        if (char_len <= 0) break;
        if (ch == 0) break;
        if (!(utf8proc_codepoint_valid(ch))) return false;

        int cat = utf8proc_category(ch);
        if (!utf8_is_separator(cat) && !utf8_is_hyphen(ch)) {
            return false;
        }

        ptr += char_len;
        idx += (size_t)char_len;
    }

    return true;
}
|
||||||
|
|
||||||
bool string_contains_hyphen_len(char *str, size_t len) {
|
bool string_contains_hyphen_len(char *str, size_t len) {
|
||||||
uint8_t *ptr = (uint8_t *)str;
|
uint8_t *ptr = (uint8_t *)str;
|
||||||
|
|||||||
@@ -84,6 +84,8 @@ bool utf8_is_punctuation(int cat);
|
|||||||
bool utf8_is_symbol(int cat);
|
bool utf8_is_symbol(int cat);
|
||||||
bool utf8_is_separator(int cat);
|
bool utf8_is_separator(int cat);
|
||||||
|
|
||||||
|
bool string_is_ignorable(char *str, size_t len);
|
||||||
|
|
||||||
bool string_contains_hyphen(char *str);
|
bool string_contains_hyphen(char *str);
|
||||||
bool string_contains_hyphen_len(char *str, size_t len);
|
bool string_contains_hyphen_len(char *str, size_t len);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user