[numex] helper function to retrieve ordinal suffix lengths from a tokenized string for use in deduping
This commit is contained in:
46
src/numex.c
46
src/numex.c
@@ -1137,6 +1137,52 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Returns the length reported by ordinal_suffix_len() for `token`, but only
 * when the text immediately before that suffix looks like a number.
 *
 * str:        the full string that the token offsets index into
 * token:      the token being inspected
 * prev_token: the token preceding `token`; consulted only when the suffix
 *             spans the whole of `token`
 * lang:       passed through unchanged to ordinal_suffix_len()
 *
 * The result is 0 when no suffix is found, when no character can be read
 * before the suffix, or when that character is not a digit and the candidate
 * number span is not a likely Roman numeral.
 */
size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) {
    size_t suffix_len = ordinal_suffix_len(str + token.offset, token.len, lang);
    if (suffix_len == 0) {
        return 0;
    }

    /*
     * Locate the "stem": the span expected to hold the number. If the suffix
     * covers the entire token, the stem is the previous token; otherwise it
     * is the leading part of this token.
     */
    size_t stem_offset;
    size_t stem_len;
    if (suffix_len >= token.len) {
        stem_offset = prev_token.offset;
        stem_len = prev_token.len;
    } else {
        stem_offset = token.offset;
        stem_len = token.len - suffix_len;
    }

    /* The suffix begins exactly where the stem ends. */
    ssize_t stem_end = stem_offset + stem_len;

    /* Read the character that sits right before the suffix. */
    int32_t last_stem_char = 0;
    ssize_t last_stem_char_len = utf8proc_iterate_reversed((const uint8_t *)str, stem_end, &last_stem_char);
    if (last_stem_char_len <= 0) {
        return 0;
    }

    if (utf8_is_digit(utf8proc_category(last_stem_char))
        || is_likely_roman_numeral_len(str + stem_offset, stem_len)) {
        return suffix_len;
    }

    return 0;
}
|
||||
|
||||
/*
 * Appends one entry to `suffixes` for every token in `tokens_array`: the
 * value returned by valid_ordinal_suffix_len() for that token (0 when the
 * token has no valid ordinal suffix).
 *
 * The first token is checked against NULL_TOKEN as its predecessor; every
 * later token is checked against the token before it.
 *
 * Returns false without touching `suffixes` if `suffixes`, `str` or
 * `tokens_array` is NULL, and true otherwise.
 */
bool add_ordinal_suffix_lengths(uint32_array *suffixes, char *str, token_array *tokens_array, char *lang) {
    if (!suffixes || !str || !tokens_array) {
        return false;
    }

    size_t num_tokens = tokens_array->n;
    token_t *items = tokens_array->a;
    token_t previous = NULL_TOKEN;

    size_t idx = 0;
    while (idx < num_tokens) {
        token_t current = items[idx];
        uint32_array_push(suffixes, (uint32_t)valid_ordinal_suffix_len(str, current, previous, lang));
        previous = current;
        idx++;
    }

    return true;
}
|
||||
|
||||
|
||||
|
||||
static inline bool is_roman_numeral_char(char c) {
|
||||
|
||||
Reference in New Issue
Block a user