[utils] UTF-8 lowercasing and string comparison, including a version which ignores dashes/spaces

This commit is contained in:
Al
2015-06-10 18:26:52 -04:00
parent cb603562e0
commit aad5f3edd3
2 changed files with 133 additions and 3 deletions

View File

@@ -13,11 +13,27 @@ extern "C" {
#include "utf8proc/utf8proc.h"
#include "vector.h"
// Upper bound, in bytes, on the encoded size of one UTF-8 character
// (UTF-8 encodes a code point in at most 4 bytes).
#define MAX_UTF8_CHAR_SIZE 4
// Option sets passed to utf8proc. Every expansion is parenthesized so that a
// macro behaves as a single value inside a larger expression, e.g.
// `(opts & UTF8PROC_OPTIONS_NFD)` or `~UTF8PROC_OPTIONS_BASE`; without the
// parentheses `&` would bind to the last flag only.

// Flags common to all option sets below
#define UTF8PROC_OPTIONS_BASE (UTF8PROC_NULLTERM | UTF8PROC_STABLE)
// Unicode normalization forms
#define UTF8PROC_OPTIONS_NFD (UTF8PROC_OPTIONS_BASE | UTF8PROC_DECOMPOSE)
#define UTF8PROC_OPTIONS_NFC (UTF8PROC_OPTIONS_BASE | UTF8PROC_COMPOSE)
#define UTF8PROC_OPTIONS_NFKD (UTF8PROC_OPTIONS_NFD | UTF8PROC_COMPAT)
#define UTF8PROC_OPTIONS_NFKC (UTF8PROC_OPTIONS_NFC | UTF8PROC_COMPAT)
// Strip accents
// utf8proc rejects UTF8PROC_STRIPMARK (UTF8PROC_ERROR_INVALIDOPTS) unless it is
// combined with UTF8PROC_COMPOSE or UTF8PROC_DECOMPOSE, so decompose first to
// separate the combining marks from their base characters.
#define UTF8PROC_OPTIONS_STRIP_ACCENTS (UTF8PROC_OPTIONS_BASE | UTF8PROC_DECOMPOSE | UTF8PROC_STRIPMARK)
// Lowercase
#define UTF8PROC_OPTIONS_LOWERCASE (UTF8PROC_OPTIONS_BASE | UTF8PROC_CASEFOLD)
// NOTE: this particular implementation works only for ASCII strings
// NOTE(review): only the prototypes are visible in this header; the result
// conventions described below are inferred from the names and signatures —
// confirm against the definitions.
// Compares str1 and str2 ignoring case; presumably a strcmp-style
// negative/zero/positive result — TODO confirm.
int string_compare_case_insensitive(const char *str1, const char *str2);
// Case-insensitive comparison bounded by len; presumably strncmp-like — TODO confirm.
int string_compare_n_case_insensitive(const char *str1, const char *str2, size_t len);
// Case-insensitive comparison taking a length; how it differs from the _n_
// variant above is not visible from the declaration — see the definition.
int string_compare_len_case_insensitive(const char *str1, const char *str2, size_t len);
// Presumably returns the length of the prefix shared by str1 and str2 — TODO confirm.
int string_common_prefix(const char *str1, const char *str2);
// Takes a mutable buffer, so presumably lowercases str in place — TODO confirm.
void string_lower(char *str);
@@ -30,6 +46,13 @@ uint string_translate(char *str, size_t len, char *word_chars, char *word_repls,
// Builds a reversed copy of the UTF-8 string s; the input is const and is not
// modified through this pointer. Ownership of the result passes to the caller.
char *utf8_reversed_string(const char *s); // returns a copy, caller frees
// Backwards counterpart of utf8proc_iterate, judging by the name and matching
// signature: presumably decodes the code point that ends at byte offset
// `start` of str into *dst and returns the number of bytes consumed (negative
// on error) — TODO confirm against the definition.
ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst);
// Builds a lowercased copy of the UTF-8 string s; the input is const and is
// not modified through this pointer. Ownership of the result passes to the caller.
char *utf8_lower(const char *s); // returns a copy, caller frees
// UTF-8 string comparison.
// NOTE(review): only the prototypes are visible here; the result convention
// (presumably strcmp-style negative/zero/positive) and whether `len` counts
// bytes or characters must be confirmed against the definitions.
// Compares two NUL-terminated UTF-8 strings.
int utf8_compare(const char *str1, const char *str2);
// Same comparison, limited by len.
int utf8_compare_len(const char *str1, const char *str2, size_t len);
// Comparison that ignores separators (dashes/spaces) in both strings.
int utf8_compare_ignore_separators(const char *str1, const char *str2);
// Separator-ignoring comparison, limited by len.
int utf8_compare_len_ignore_separators(const char *str1, const char *str2, size_t len);
// Character-class predicates.
// Returns whether ch is a hyphen; ch is presumably a decoded Unicode code
// point (the int32_t type matches utf8proc's code point type) — TODO confirm
// which dash characters are covered.
bool utf8_is_hyphen(int32_t ch);
// Returns whether cat denotes a letter; cat is presumably a utf8proc general
// category value (UTF8PROC_CATEGORY_*), not a code point — TODO confirm.
bool utf8_is_letter(int cat);
// Returns whether cat denotes a number; same assumption about cat as above.
bool utf8_is_number(int cat);