[normalize] Adding hyphen elimination as a string option (changes tokenization)
This commit is contained in:
@@ -29,12 +29,21 @@ char *normalize_string_utf8(char *str, uint64_t options) {
|
||||
utf8proc_options |= UTF8PROC_OPTIONS_LOWERCASE;
|
||||
}
|
||||
|
||||
char *normalized = NULL;
|
||||
|
||||
if (have_utf8proc_options) {
|
||||
utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
|
||||
return (char *)utf8proc_normalized;
|
||||
|
||||
normalized = (char *)utf8proc_normalized;
|
||||
str = normalized;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
if (options & NORMALIZE_STRING_REPLACE_HYPHENS) {
|
||||
string_replace(str, '-', ' ');
|
||||
normalized = str;
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ As well as normalizations for individual string tokens:
|
||||
#define NORMALIZE_STRING_DECOMPOSE 1 << 3
|
||||
#define NORMALIZE_STRING_LOWERCASE 1 << 4
|
||||
#define NORMALIZE_STRING_TRIM 1 << 5
|
||||
#define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6
|
||||
|
||||
#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0
|
||||
#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1
|
||||
|
||||
@@ -75,6 +75,12 @@ inline void string_upper(char *s) {
|
||||
for (; *s; ++s) *s = toupper(*s);
|
||||
}
|
||||
|
||||
/*
 * string_replace - overwrite, in place, every byte equal to c1 in the
 * NUL-terminated string s with c2.
 *
 * s   string to modify; it is written through, so it must be writable.
 *     A NULL pointer is accepted and treated as "nothing to do".
 * c1  byte value to search for.
 * c2  byte value written over each match.
 *
 * Operates on individual bytes, not on characters. The scan stops at the
 * first NUL, so c1 == '\0' never matches, and c2 == '\0' shortens the
 * string at the first match.
 *
 * NOTE(review): the NULL guard is there because callers may hand over a
 * buffer produced by another library call that presumably yields NULL on
 * failure -- confirm against the call sites.
 *
 * Defined without `inline`: the header declares this as an ordinary
 * external function, and a bare C99 `inline` definition only produces a
 * linkable symbol when that prototype is visible in the same translation
 * unit.
 */
void string_replace(char *s, char c1, char c2) {
    if (s == NULL) {
        return;
    }

    for (; *s; ++s) {
        if (*s == c1) {
            *s = c2;
        }
    }
}
|
||||
|
||||
inline bool string_is_upper(char *s) {
|
||||
for (; *s; ++s) {
|
||||
if (*s != toupper(*s)) return false;
|
||||
@@ -93,6 +99,7 @@ inline bool string_is_lower(char *s) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_repls, size_t trans_len) {
|
||||
uint32_t num_replacements = 0;
|
||||
|
||||
|
||||
@@ -55,6 +55,8 @@ void string_lower(char *s);
|
||||
bool string_is_upper(char *s);
|
||||
void string_upper(char *s);
|
||||
|
||||
void string_replace(char *s, char c1, char c2);
|
||||
|
||||
bool string_starts_with(const char *str, const char *start);
|
||||
bool string_ends_with(const char *str, const char *ending);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user