[strings] adding latest utf8proc, new functions for utf8_lower (instead of case folding) and utf8_upper, and a utf8_is_whitespace that takes things like tabs into account
This commit is contained in:
@@ -174,14 +174,63 @@ error_free_output:
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *utf8_lower(const char *s) {
|
/*
 * Return a newly allocated copy of the UTF-8 string s in which every code
 * point has been mapped to lower case (casing == UTF8_LOWER) or upper case
 * (casing == UTF8_UPPER). Any other casing value leaves the code points
 * untouched.
 *
 * s        NUL-terminated input string (must not be NULL; strlen is called).
 * casing   which single-code-point mapping to apply.
 * options  utf8proc options forwarded to utf8proc_decompose and
 *          utf8proc_reencode.
 *
 * Returns a heap pointer the caller must free(), or NULL when decomposition,
 * re-encoding or allocation fails.
 */
char *utf8_case(const char *s, casing_option_t casing, utf8proc_option_t options) {
    ssize_t len = (ssize_t)strlen(s);
    const utf8proc_uint8_t *str = (const utf8proc_uint8_t *)s;

    /* First pass with a NULL buffer only reports how many code points are needed. */
    utf8proc_ssize_t result = utf8proc_decompose(str, len, NULL, 0, options);
    if (result < 0) return NULL;

    /* utf8proc_reencode requires one spare byte of free space at the end of the buffer. */
    utf8proc_int32_t *buffer = (utf8proc_int32_t *) malloc((size_t)result * sizeof(utf8proc_int32_t) + 1);
    if (buffer == NULL) return NULL;

    result = utf8proc_decompose(str, len, buffer, result, options);
    if (result < 0) {
        free(buffer);
        return NULL;
    }

    /* Apply the case mapping in place; the mapped value must be stored back
       or the re-encoded output would be identical to the input. */
    for (utf8proc_ssize_t i = 0; i < result; i++) {
        utf8proc_int32_t uc = buffer[i];

        if (casing == UTF8_LOWER) {
            buffer[i] = utf8proc_tolower(uc);
        } else if (casing == UTF8_UPPER) {
            buffer[i] = utf8proc_toupper(uc);
        }
    }

    /* NOTE(review): utf8proc_reencode is expected to rewrite the buffer in
       place as UTF-8 and return its byte length; its body is only partly
       visible from here. A negative return is an error code. */
    result = utf8proc_reencode(buffer, result, options);
    if (result < 0) {
        free(buffer);
        return NULL;
    }

    /* Shrink to the encoded size plus one byte. If the shrink fails the
       original, larger allocation is still valid, so keep using it. */
    utf8proc_int32_t *newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result + 1);
    if (newptr != NULL) buffer = newptr;

    /* Ownership passes to the caller: the buffer must NOT be freed here. */
    return (char *)buffer;
}
|
||||||
|
|
||||||
|
/* Lowercase s using caller-supplied utf8proc options.
   Thin wrapper: the result is whatever utf8_case() returns for UTF8_LOWER. */
inline char *utf8_lower_options(const char *s, utf8proc_option_t options) {
    char *lowered = utf8_case(s, UTF8_LOWER, options);
    return lowered;
}
|
||||||
|
|
||||||
|
inline char *utf8_lower(const char *s) {
|
||||||
|
return utf8_case(s, UTF8_LOWER, UTF8PROC_OPTIONS_NFC);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Uppercase s using caller-supplied utf8proc options.
   Thin wrapper: the result is whatever utf8_case() returns for UTF8_UPPER. */
inline char *utf8_upper_options(const char *s, utf8proc_option_t options) {
    char *uppered = utf8_case(s, UTF8_UPPER, options);
    return uppered;
}
|
||||||
|
|
||||||
|
inline char *utf8_upper(const char *s) {
|
||||||
|
return utf8_case(s, UTF8_UPPER, UTF8PROC_OPTIONS_NFC);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
inline bool utf8_is_letter(int cat) {
|
inline bool utf8_is_letter(int cat) {
|
||||||
return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
|
return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
|
||||||
|| cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
|
|| cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
|
||||||
@@ -219,6 +268,18 @@ inline bool utf8_is_separator(int cat) {
|
|||||||
return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
|
return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* True when code point ch is whitespace: either its utf8proc category is a
   separator (as decided by utf8_is_separator), or it is one of the control
   characters listed below, which the separator categories do not cover. */
inline bool utf8_is_whitespace(int32_t ch) {
    int cat = utf8proc_category(ch);

    if (utf8_is_separator(cat)) {
        return true;
    }

    switch (ch) {
        case 9:     // character tabulation
        case 10:    // line feed
        case 11:    // line tabulation
        case 12:    // form feed
        case 13:    // carriage return
        case 133:   // next line
            return true;
        default:
            return false;
    }
}
|
||||||
|
|
||||||
int utf8_compare_len(const char *str1, const char *str2, size_t len) {
|
int utf8_compare_len(const char *str1, const char *str2, size_t len) {
|
||||||
if (len == 0) return 0;
|
if (len == 0) return 0;
|
||||||
|
|
||||||
@@ -415,11 +476,10 @@ inline bool string_contains_hyphen(char *str) {
|
|||||||
return string_next_hyphen_index(str, strlen(str)) >= 0;
|
return string_next_hyphen_index(str, strlen(str)) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t string_right_spaces(char *str) {
|
size_t string_right_spaces_len(char *str, size_t len) {
|
||||||
size_t spaces = 0;
|
size_t spaces = 0;
|
||||||
|
|
||||||
uint8_t *ptr = (uint8_t *)str;
|
uint8_t *ptr = (uint8_t *)str;
|
||||||
ssize_t len = strlen(str);
|
|
||||||
int32_t ch = 0;
|
int32_t ch = 0;
|
||||||
ssize_t index = len;
|
ssize_t index = len;
|
||||||
|
|
||||||
@@ -428,24 +488,22 @@ size_t string_right_spaces(char *str) {
|
|||||||
|
|
||||||
if (ch <= 0) break;
|
if (ch <= 0) break;
|
||||||
|
|
||||||
int cat = utf8proc_category(ch);
|
if (!utf8_is_whitespace(ch)) {
|
||||||
if (!utf8_is_separator(cat)) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
index -= char_len;
|
index -= char_len;
|
||||||
spaces++;
|
spaces += char_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
return spaces;
|
return spaces;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t string_left_spaces(char *str) {
|
size_t string_left_spaces_len(char *str, size_t len) {
|
||||||
size_t spaces = 0;
|
size_t spaces = 0;
|
||||||
|
|
||||||
uint8_t *ptr = (uint8_t *)str;
|
uint8_t *ptr = (uint8_t *)str;
|
||||||
size_t len = strlen(str);
|
|
||||||
int32_t ch = 0;
|
int32_t ch = 0;
|
||||||
ssize_t index = 0;
|
ssize_t index = 0;
|
||||||
|
|
||||||
@@ -454,22 +512,21 @@ size_t string_left_spaces(char *str) {
|
|||||||
|
|
||||||
if (ch <= 0) break;
|
if (ch <= 0) break;
|
||||||
|
|
||||||
int cat = utf8proc_category(ch);
|
if (!utf8_is_whitespace(ch)) {
|
||||||
if (!utf8_is_separator(cat)) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
index += char_len;
|
index += char_len;
|
||||||
ptr += char_len;
|
ptr += char_len;
|
||||||
spaces++;
|
spaces += char_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
return spaces;
|
return spaces;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *string_trim(char *str) {
|
/*
 * Return a newly allocated copy of str with leading and trailing whitespace
 * removed, where the amount to remove is measured in bytes by
 * string_left_spaces_len() and string_right_spaces_len().
 *
 * str  NUL-terminated input (must not be NULL; strlen is called). Not modified.
 *
 * Returns whatever strndup() returns: a heap string the caller must free(),
 * or NULL on allocation failure.
 */
char *string_trim(char *str) {
    size_t len = strlen(str);
    size_t left_spaces = string_left_spaces_len(str, len);
    size_t right_spaces = string_right_spaces_len(str, len);

    /* For an all-whitespace string both counts can equal len, so the plain
       subtraction len - left - right would wrap around in size_t and hand
       strndup a huge length. Clamp to an empty result instead. */
    size_t trimmed = left_spaces + right_spaces;
    size_t keep = (trimmed < len) ? (len - trimmed) : 0;

    return strndup(str + left_spaces, keep);
}
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
|
|||||||
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
|
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||||
if (uc < 0xe0) { // 2-byte sequence
|
if (uc < 0xe0) { // 2-byte sequence
|
||||||
// Must have valid continuation character
|
// Must have valid continuation character
|
||||||
if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
|
if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||||
*dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
|
*dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
@@ -166,24 +166,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
|
|||||||
if (uc < 0x00) {
|
if (uc < 0x00) {
|
||||||
return 0;
|
return 0;
|
||||||
} else if (uc < 0x80) {
|
} else if (uc < 0x80) {
|
||||||
dst[0] = uc;
|
dst[0] = (utf8proc_uint8_t) uc;
|
||||||
return 1;
|
return 1;
|
||||||
} else if (uc < 0x800) {
|
} else if (uc < 0x800) {
|
||||||
dst[0] = 0xC0 + (uc >> 6);
|
dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
|
||||||
dst[1] = 0x80 + (uc & 0x3F);
|
dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||||
return 2;
|
return 2;
|
||||||
// Note: we allow encoding 0xd800-0xdfff here, so as not to change
|
// Note: we allow encoding 0xd800-0xdfff here, so as not to change
|
||||||
// the API, however, these are actually invalid in UTF-8
|
// the API, however, these are actually invalid in UTF-8
|
||||||
} else if (uc < 0x10000) {
|
} else if (uc < 0x10000) {
|
||||||
dst[0] = 0xE0 + (uc >> 12);
|
dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
|
||||||
dst[1] = 0x80 + ((uc >> 6) & 0x3F);
|
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
||||||
dst[2] = 0x80 + (uc & 0x3F);
|
dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||||
return 3;
|
return 3;
|
||||||
} else if (uc < 0x110000) {
|
} else if (uc < 0x110000) {
|
||||||
dst[0] = 0xF0 + (uc >> 18);
|
dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
|
||||||
dst[1] = 0x80 + ((uc >> 12) & 0x3F);
|
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
|
||||||
dst[2] = 0x80 + ((uc >> 6) & 0x3F);
|
dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
||||||
dst[3] = 0x80 + (uc & 0x3F);
|
dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||||
return 4;
|
return 4;
|
||||||
} else return 0;
|
} else return 0;
|
||||||
}
|
}
|
||||||
@@ -193,28 +193,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
|
|||||||
if (uc < 0x00) {
|
if (uc < 0x00) {
|
||||||
return 0;
|
return 0;
|
||||||
} else if (uc < 0x80) {
|
} else if (uc < 0x80) {
|
||||||
dst[0] = uc;
|
dst[0] = (utf8proc_uint8_t)uc;
|
||||||
return 1;
|
return 1;
|
||||||
} else if (uc < 0x800) {
|
} else if (uc < 0x800) {
|
||||||
dst[0] = 0xC0 + (uc >> 6);
|
dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
|
||||||
dst[1] = 0x80 + (uc & 0x3F);
|
dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||||
return 2;
|
return 2;
|
||||||
} else if (uc == 0xFFFF) {
|
} else if (uc == 0xFFFF) {
|
||||||
dst[0] = 0xFF;
|
dst[0] = (utf8proc_uint8_t)0xFF;
|
||||||
return 1;
|
return 1;
|
||||||
} else if (uc == 0xFFFE) {
|
} else if (uc == 0xFFFE) {
|
||||||
dst[0] = 0xFE;
|
dst[0] = (utf8proc_uint8_t)0xFE;
|
||||||
return 1;
|
return 1;
|
||||||
} else if (uc < 0x10000) {
|
} else if (uc < 0x10000) {
|
||||||
dst[0] = 0xE0 + (uc >> 12);
|
dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
|
||||||
dst[1] = 0x80 + ((uc >> 6) & 0x3F);
|
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
||||||
dst[2] = 0x80 + (uc & 0x3F);
|
dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||||
return 3;
|
return 3;
|
||||||
} else if (uc < 0x110000) {
|
} else if (uc < 0x110000) {
|
||||||
dst[0] = 0xF0 + (uc >> 18);
|
dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
|
||||||
dst[1] = 0x80 + ((uc >> 12) & 0x3F);
|
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
|
||||||
dst[2] = 0x80 + ((uc >> 6) & 0x3F);
|
dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
||||||
dst[3] = 0x80 + (uc & 0x3F);
|
dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||||
return 4;
|
return 4;
|
||||||
} else return 0;
|
} else return 0;
|
||||||
}
|
}
|
||||||
@@ -233,48 +233,144 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
|
|||||||
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
|
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* return whether there is a grapheme break between boundclasses lbc and tbc */
|
/* return whether there is a grapheme break between boundclasses lbc and tbc
|
||||||
static utf8proc_bool grapheme_break(int lbc, int tbc) {
|
(according to the definition of extended grapheme clusters)
|
||||||
|
Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
|
||||||
|
http://www.unicode.org/reports/tr29/tr29-29.html
|
||||||
|
CAVEATS:
|
||||||
|
Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
|
||||||
|
and GB 12/13 (regional indicator code points) require knowledge of previous characters
|
||||||
|
and are thus not handled by this function. This may result in an incorrect break before
|
||||||
|
an E_Modifier class codepoint and an incorrectly missing break between two
|
||||||
|
REGIONAL_INDICATOR class code points if such support does not exist in the caller.
|
||||||
|
See the special support in grapheme_break_extended, for required bookkeeping by the caller.
|
||||||
|
*/
|
||||||
|
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
||||||
return
|
return
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_CR &&
|
(lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
|
tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
|
||||||
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
|
||||||
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
|
(lbc == UTF8PROC_BOUNDCLASS_L && // GB6
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_L &&
|
(tbc == UTF8PROC_BOUNDCLASS_L || // ---
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_L ||
|
tbc == UTF8PROC_BOUNDCLASS_V || // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_V ||
|
tbc == UTF8PROC_BOUNDCLASS_LV || // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LV ||
|
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
|
((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_LV ||
|
lbc == UTF8PROC_BOUNDCLASS_V) && // ---
|
||||||
lbc == UTF8PROC_BOUNDCLASS_V) &&
|
(tbc == UTF8PROC_BOUNDCLASS_V || // ---
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_V ||
|
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
|
((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
|
lbc == UTF8PROC_BOUNDCLASS_T) && // ---
|
||||||
lbc == UTF8PROC_BOUNDCLASS_T) &&
|
tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
|
(tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
|
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
|
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
|
||||||
(tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
|
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
|
||||||
|
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
|
||||||
|
true; // GB999
|
||||||
}
|
}
|
||||||
|
|
||||||
/* return whether there is a grapheme break between codepoints c1 and c2 */
|
/* Stateful variant of grapheme_break_simple.

   lbc, tbc  boundclasses of the left and right code points.
   state     optional bookkeeping slot owned by the caller; may be NULL.

   With a NULL state this is exactly grapheme_break_simple(lbc, tbc).
   Otherwise, when *state is not UTF8PROC_BOUNDCLASS_START it replaces lbc
   for the break decision, and *state is then updated for the next call.

   Returns whether a break is permitted between the two code points. */
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
  utf8proc_int32_t prev;
  utf8proc_bool break_permitted;

  if (!state)
    return grapheme_break_simple(lbc, tbc);

  prev = *state;
  break_permitted = grapheme_break_simple(
    (prev != UTF8PROC_BOUNDCLASS_START) ? (int)prev : lbc, tbc);

  if (prev == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) {
    // Special support for GB 12/13 made possible by GB999. After two RI
    // class codepoints we want to force a break. Recording the second RI
    // as UTF8PROC_BOUNDCLASS_OTHER forces a break after that character
    // according to GB999 (unless such a break is forbidden by a different
    // rule such as GB9).
    *state = UTF8PROC_BOUNDCLASS_OTHER;
  } else if ((prev == UTF8PROC_BOUNDCLASS_E_BASE ||
              prev == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
             tbc == UTF8PROC_BOUNDCLASS_EXTEND) {
    // Special support for GB10. Fold any EXTEND codepoints into the previous
    // boundclass if we're dealing with an emoji base boundclass.
    *state = UTF8PROC_BOUNDCLASS_E_BASE;
  } else {
    *state = tbc;
  }

  return break_permitted;
}
|
||||||
|
|
||||||
|
/* Report whether a grapheme break is permitted between code points c1 and c2.
   The boundclass of each is looked up via utf8proc_get_property and the
   decision is delegated to grapheme_break_extended, which reads and updates
   *state when state is non-NULL. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  int left_class = utf8proc_get_property(c1)->boundclass;
  int right_class = utf8proc_get_property(c2)->boundclass;

  return grapheme_break_extended(left_class, right_class, state);
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Stateless form of utf8proc_grapheme_break_stateful: the same decision for
   c1 and c2, with no state pointer supplied. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
  utf8proc_int32_t *no_state = NULL;

  return utf8proc_grapheme_break_stateful(c1, c2, no_state);
}
|
||||||
|
|
||||||
|
/* Decode one code point from a table of 16-bit words.

   entry  in/out cursor into the table. A word whose top five bits are
          0xD800 is combined with the following word (10 bits from each,
          plus 0x10000); in that case *entry is advanced by one so it points
          at the second word. Otherwise the word itself is the code point
          and *entry is left unchanged.

   Returns the decoded code point. The caller still has to step past the
   last word consumed. */
static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
{
  const utf8proc_uint16_t *words = *entry;
  utf8proc_int32_t first = words[0];

  if ((first & 0xF800) != 0xD800)
    return first;

  *entry = words + 1;
  return (((first & 0x03FF) << 10) | (words[1] & 0x03FF)) + 0x10000;
}
|
||||||
|
|
||||||
|
static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
|
||||||
|
{
|
||||||
|
const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
|
||||||
|
return seqindex_decode_entry(&entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Expand the sequence referenced by seqindex and decompose each of its code
   points into dst via utf8proc_decompose_char.

   seqindex         low 13 bits: start offset into utf8proc_sequences;
                    high 3 bits: entry count minus one. A value of 7 or more
                    in the high bits means that number is instead stored in
                    the first table word.
   dst, bufsize     output buffer and its capacity in code points. Once the
                    running total reaches bufsize, a remaining capacity of 0
                    is passed on.
   options          forwarded to utf8proc_decompose_char.
   last_boundclass  forwarded to utf8proc_decompose_char.

   Returns the running total of the utf8proc_decompose_char results, or
   UTF8PROC_ERROR_OVERFLOW as soon as that total becomes negative. */
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  const utf8proc_uint16_t *cursor = &utf8proc_sequences[seqindex & 0x1FFF];
  int remaining = seqindex >> 13;
  utf8proc_ssize_t total = 0;

  if (remaining >= 7) {
    remaining = *cursor;
    cursor++;
  }

  /* 'remaining' counts down to zero inclusive, so remaining+1 entries are read. */
  while (remaining >= 0) {
    utf8proc_int32_t cp = seqindex_decode_entry(&cursor);
    utf8proc_ssize_t room = (bufsize > total) ? (bufsize - total) : 0;

    total += utf8proc_decompose_char(cp, dst + total, room, options,
                                     last_boundclass);
    if (total < 0) return UTF8PROC_ERROR_OVERFLOW;

    cursor++;
    remaining--;
  }

  return total;
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
|
/* Map code point c through its lowercase_seqindex property. When that
   property equals UINT16_MAX, c is returned unchanged; otherwise the code
   point decoded from that sequence index is returned. */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
  utf8proc_int32_t seqindex = utf8proc_get_property(c)->lowercase_seqindex;

  if (seqindex == UINT16_MAX) {
    return c;
  }
  return seqindex_decode_index(seqindex);
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
|
/* Map code point c through its uppercase_seqindex property. When that
   property equals UINT16_MAX, c is returned unchanged; otherwise the code
   point decoded from that sequence index is returned. */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
  utf8proc_int32_t seqindex = utf8proc_get_property(c)->uppercase_seqindex;

  if (seqindex == UINT16_MAX) {
    return c;
  }
  return seqindex_decode_index(seqindex);
}
|
||||||
|
|
||||||
|
/* Map code point c through its titlecase_seqindex property. When that
   property equals UINT16_MAX, c is returned unchanged; otherwise the code
   point decoded from that sequence index is returned. */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
{
  utf8proc_int32_t seqindex = utf8proc_get_property(c)->titlecase_seqindex;

  if (seqindex == UINT16_MAX) {
    return c;
  }
  return seqindex_decode_index(seqindex);
}
|
||||||
|
|
||||||
/* return a character width analogous to wcwidth (except portable and
|
/* return a character width analogous to wcwidth (except portable and
|
||||||
@@ -357,39 +453,20 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
|
|||||||
category == UTF8PROC_CATEGORY_ME) return 0;
|
category == UTF8PROC_CATEGORY_ME) return 0;
|
||||||
}
|
}
|
||||||
if (options & UTF8PROC_CASEFOLD) {
|
if (options & UTF8PROC_CASEFOLD) {
|
||||||
if (property->casefold_mapping != UINT16_MAX) {
|
if (property->casefold_seqindex != UINT16_MAX) {
|
||||||
const utf8proc_int32_t *casefold_entry;
|
return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
|
||||||
utf8proc_ssize_t written = 0;
|
|
||||||
for (casefold_entry = &utf8proc_sequences[property->casefold_mapping];
|
|
||||||
*casefold_entry >= 0; casefold_entry++) {
|
|
||||||
written += utf8proc_decompose_char(*casefold_entry, dst+written,
|
|
||||||
(bufsize > written) ? (bufsize - written) : 0, options,
|
|
||||||
last_boundclass);
|
|
||||||
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
|
||||||
}
|
|
||||||
return written;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||||
if (property->decomp_mapping != UINT16_MAX &&
|
if (property->decomp_seqindex != UINT16_MAX &&
|
||||||
(!property->decomp_type || (options & UTF8PROC_COMPAT))) {
|
(!property->decomp_type || (options & UTF8PROC_COMPAT))) {
|
||||||
const utf8proc_int32_t *decomp_entry;
|
return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
|
||||||
utf8proc_ssize_t written = 0;
|
|
||||||
for (decomp_entry = &utf8proc_sequences[property->decomp_mapping];
|
|
||||||
*decomp_entry >= 0; decomp_entry++) {
|
|
||||||
written += utf8proc_decompose_char(*decomp_entry, dst+written,
|
|
||||||
(bufsize > written) ? (bufsize - written) : 0, options,
|
|
||||||
last_boundclass);
|
|
||||||
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
|
||||||
}
|
|
||||||
return written;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (options & UTF8PROC_CHARBOUND) {
|
if (options & UTF8PROC_CHARBOUND) {
|
||||||
utf8proc_bool boundary;
|
utf8proc_bool boundary;
|
||||||
int tbc = property->boundclass;
|
int tbc = property->boundclass;
|
||||||
boundary = grapheme_break(*last_boundclass, tbc);
|
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
|
||||||
*last_boundclass = tbc;
|
|
||||||
if (boundary) {
|
if (boundary) {
|
||||||
if (bufsize >= 1) dst[0] = 0xFFFF;
|
if (bufsize >= 1) dst[0] = 0xFFFF;
|
||||||
if (bufsize >= 2) dst[1] = uc;
|
if (bufsize >= 2) dst[1] = uc;
|
||||||
@@ -403,6 +480,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
|
|||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
/* Decompose the UTF-8 string str into buffer with no custom mapping:
   forwards every argument to utf8proc_decompose_custom and passes NULL for
   both the custom function and its data. The return value is that call's
   result, unchanged. */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) {
  utf8proc_ssize_t outcome = utf8proc_decompose_custom(
    str, strlen, buffer, bufsize, options, NULL, NULL);

  return outcome;
}
|
||||||
|
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
|
||||||
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||||
|
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
|
||||||
|
utf8proc_custom_func custom_func, void *custom_data
|
||||||
) {
|
) {
|
||||||
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
|
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
|
||||||
utf8proc_ssize_t wpos = 0;
|
utf8proc_ssize_t wpos = 0;
|
||||||
@@ -429,6 +514,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
|
rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
|
||||||
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||||
}
|
}
|
||||||
|
if (custom_func != NULL) {
|
||||||
|
uc = custom_func(uc, custom_data); /* user-specified custom mapping */
|
||||||
|
}
|
||||||
decomp_result = utf8proc_decompose_char(
|
decomp_result = utf8proc_decompose_char(
|
||||||
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
|
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
|
||||||
&boundclass
|
&boundclass
|
||||||
@@ -463,9 +551,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
return wpos;
|
return wpos;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
||||||
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
|
||||||
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
|
||||||
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
||||||
utf8proc_ssize_t rpos;
|
utf8proc_ssize_t rpos;
|
||||||
utf8proc_ssize_t wpos = 0;
|
utf8proc_ssize_t wpos = 0;
|
||||||
@@ -538,17 +625,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
if (!starter_property) {
|
if (!starter_property) {
|
||||||
starter_property = unsafe_get_property(*starter);
|
starter_property = unsafe_get_property(*starter);
|
||||||
}
|
}
|
||||||
if (starter_property->comb1st_index >= 0 &&
|
if (starter_property->comb_index < 0x8000 &&
|
||||||
current_property->comb2nd_index >= 0) {
|
current_property->comb_index != UINT16_MAX &&
|
||||||
composition = utf8proc_combinations[
|
current_property->comb_index >= 0x8000) {
|
||||||
starter_property->comb1st_index +
|
int sidx = starter_property->comb_index;
|
||||||
current_property->comb2nd_index
|
int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
|
||||||
];
|
if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
|
||||||
if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
|
idx += sidx + 2;
|
||||||
!(unsafe_get_property(composition)->comp_exclusion))) {
|
if (current_property->comb_index & 0x4000) {
|
||||||
*starter = composition;
|
composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
|
||||||
starter_property = NULL;
|
} else
|
||||||
continue;
|
composition = utf8proc_combinations[idx];
|
||||||
|
|
||||||
|
if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
|
||||||
|
!(unsafe_get_property(composition)->comp_exclusion))) {
|
||||||
|
*starter = composition;
|
||||||
|
starter_property = NULL;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -566,6 +660,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
}
|
}
|
||||||
length = wpos;
|
length = wpos;
|
||||||
}
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
||||||
|
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
||||||
|
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
||||||
|
length = utf8proc_normalize_utf32(buffer, length, options);
|
||||||
|
if (length < 0) return length;
|
||||||
{
|
{
|
||||||
utf8proc_ssize_t rpos, wpos = 0;
|
utf8proc_ssize_t rpos, wpos = 0;
|
||||||
utf8proc_int32_t uc;
|
utf8proc_int32_t uc;
|
||||||
@@ -587,15 +689,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
/* Map the UTF-8 string str according to options with no custom mapping:
   forwards every argument to utf8proc_map_custom and passes NULL for both
   the custom function and its data. The return value is that call's result,
   unchanged. */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
  utf8proc_ssize_t outcome = utf8proc_map_custom(
    str, strlen, dstptr, options, NULL, NULL);

  return outcome;
}
|
||||||
|
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
|
||||||
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
|
||||||
|
utf8proc_custom_func custom_func, void *custom_data
|
||||||
) {
|
) {
|
||||||
utf8proc_int32_t *buffer;
|
utf8proc_int32_t *buffer;
|
||||||
utf8proc_ssize_t result;
|
utf8proc_ssize_t result;
|
||||||
*dstptr = NULL;
|
*dstptr = NULL;
|
||||||
result = utf8proc_decompose(str, strlen, NULL, 0, options);
|
result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
|
||||||
if (result < 0) return result;
|
if (result < 0) return result;
|
||||||
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
|
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
|
||||||
if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
||||||
result = utf8proc_decompose(str, strlen, buffer, result, options);
|
result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
|
||||||
if (result < 0) {
|
if (result < 0) {
|
||||||
free(buffer);
|
free(buffer);
|
||||||
return result;
|
return result;
|
||||||
@@ -641,4 +750,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
|
|||||||
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -63,21 +63,23 @@
|
|||||||
* runtime version may append a string like "-dev" to the version number
|
* runtime version may append a string like "-dev" to the version number
|
||||||
* for prerelease versions.
|
* for prerelease versions.
|
||||||
*
|
*
|
||||||
* @note The shared-library version number in the Makefile may be different,
|
* @note The shared-library version number in the Makefile
|
||||||
|
* (and CMakeLists.txt, and MANIFEST) may be different,
|
||||||
* being based on ABI compatibility rather than API compatibility.
|
* being based on ABI compatibility rather than API compatibility.
|
||||||
*/
|
*/
|
||||||
/** @{ */
|
/** @{ */
|
||||||
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
||||||
#define UTF8PROC_VERSION_MAJOR 1
|
#define UTF8PROC_VERSION_MAJOR 2
|
||||||
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
||||||
#define UTF8PROC_VERSION_MINOR 3
|
#define UTF8PROC_VERSION_MINOR 1
|
||||||
/** The PATCH version (increased for fixes that do not change the API). */
|
/** The PATCH version (increased for fixes that do not change the API). */
|
||||||
#define UTF8PROC_VERSION_PATCH 0
|
#define UTF8PROC_VERSION_PATCH 0
|
||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#ifdef _MSC_VER
|
#if defined(_MSC_VER) && _MSC_VER < 1800
|
||||||
|
// MSVC prior to 2013 lacked stdbool.h and inttypes.h
|
||||||
typedef signed char utf8proc_int8_t;
|
typedef signed char utf8proc_int8_t;
|
||||||
typedef unsigned char utf8proc_uint8_t;
|
typedef unsigned char utf8proc_uint8_t;
|
||||||
typedef short utf8proc_int16_t;
|
typedef short utf8proc_int16_t;
|
||||||
@@ -92,12 +94,18 @@ typedef int utf8proc_ssize_t;
|
|||||||
typedef unsigned int utf8proc_size_t;
|
typedef unsigned int utf8proc_size_t;
|
||||||
# endif
|
# endif
|
||||||
# ifndef __cplusplus
|
# ifndef __cplusplus
|
||||||
|
// emulate C99 bool
|
||||||
typedef unsigned char utf8proc_bool;
|
typedef unsigned char utf8proc_bool;
|
||||||
enum {false, true};
|
# ifndef __bool_true_false_are_defined
|
||||||
|
# define false 0
|
||||||
|
# define true 1
|
||||||
|
# define __bool_true_false_are_defined 1
|
||||||
|
# endif
|
||||||
# else
|
# else
|
||||||
typedef bool utf8proc_bool;
|
typedef bool utf8proc_bool;
|
||||||
# endif
|
# endif
|
||||||
#else
|
#else
|
||||||
|
# include <stddef.h>
|
||||||
# include <stdbool.h>
|
# include <stdbool.h>
|
||||||
# include <inttypes.h>
|
# include <inttypes.h>
|
||||||
typedef int8_t utf8proc_int8_t;
|
typedef int8_t utf8proc_int8_t;
|
||||||
@@ -107,7 +115,7 @@ typedef uint16_t utf8proc_uint16_t;
|
|||||||
typedef int32_t utf8proc_int32_t;
|
typedef int32_t utf8proc_int32_t;
|
||||||
typedef uint32_t utf8proc_uint32_t;
|
typedef uint32_t utf8proc_uint32_t;
|
||||||
typedef size_t utf8proc_size_t;
|
typedef size_t utf8proc_size_t;
|
||||||
typedef ssize_t utf8proc_ssize_t;
|
typedef ptrdiff_t utf8proc_ssize_t;
|
||||||
typedef bool utf8proc_bool;
|
typedef bool utf8proc_bool;
|
||||||
#endif
|
#endif
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
@@ -133,7 +141,7 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef UINT16_MAX
|
#ifndef UINT16_MAX
|
||||||
# define UINT16_MAX ~(utf8proc_uint16_t)0
|
# define UINT16_MAX 65535U
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -242,13 +250,12 @@ typedef struct utf8proc_property_struct {
|
|||||||
* @see utf8proc_decomp_type_t.
|
* @see utf8proc_decomp_type_t.
|
||||||
*/
|
*/
|
||||||
utf8proc_propval_t decomp_type;
|
utf8proc_propval_t decomp_type;
|
||||||
utf8proc_uint16_t decomp_mapping;
|
utf8proc_uint16_t decomp_seqindex;
|
||||||
utf8proc_uint16_t casefold_mapping;
|
utf8proc_uint16_t casefold_seqindex;
|
||||||
utf8proc_int32_t uppercase_mapping;
|
utf8proc_uint16_t uppercase_seqindex;
|
||||||
utf8proc_int32_t lowercase_mapping;
|
utf8proc_uint16_t lowercase_seqindex;
|
||||||
utf8proc_int32_t titlecase_mapping;
|
utf8proc_uint16_t titlecase_seqindex;
|
||||||
utf8proc_int32_t comb1st_index;
|
utf8proc_uint16_t comb_index;
|
||||||
utf8proc_int32_t comb2nd_index;
|
|
||||||
unsigned bidi_mirrored:1;
|
unsigned bidi_mirrored:1;
|
||||||
unsigned comp_exclusion:1;
|
unsigned comp_exclusion:1;
|
||||||
/**
|
/**
|
||||||
@@ -259,13 +266,14 @@ typedef struct utf8proc_property_struct {
|
|||||||
*/
|
*/
|
||||||
unsigned ignorable:1;
|
unsigned ignorable:1;
|
||||||
unsigned control_boundary:1;
|
unsigned control_boundary:1;
|
||||||
|
/** The width of the codepoint. */
|
||||||
|
unsigned charwidth:2;
|
||||||
|
unsigned pad:2;
|
||||||
/**
|
/**
|
||||||
* Boundclass.
|
* Boundclass.
|
||||||
* @see utf8proc_boundclass_t.
|
* @see utf8proc_boundclass_t.
|
||||||
*/
|
*/
|
||||||
unsigned boundclass:4;
|
unsigned boundclass:8;
|
||||||
/** The width of the codepoint. */
|
|
||||||
unsigned charwidth:2;
|
|
||||||
} utf8proc_property_t;
|
} utf8proc_property_t;
|
||||||
|
|
||||||
/** Unicode categories. */
|
/** Unicode categories. */
|
||||||
@@ -349,7 +357,7 @@ typedef enum {
|
|||||||
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
|
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
|
||||||
} utf8proc_decomp_type_t;
|
} utf8proc_decomp_type_t;
|
||||||
|
|
||||||
/** Boundclass property. */
|
/** Boundclass property. (TR29) */
|
||||||
typedef enum {
|
typedef enum {
|
||||||
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
|
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
|
||||||
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
|
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
|
||||||
@@ -364,8 +372,21 @@ typedef enum {
|
|||||||
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
|
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
|
||||||
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
|
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
|
||||||
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
||||||
|
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
|
||||||
|
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
|
||||||
|
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
|
||||||
|
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
|
||||||
|
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
|
||||||
|
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
|
||||||
} utf8proc_boundclass_t;
|
} utf8proc_boundclass_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function pointer type passed to @ref utf8proc_map_custom and
|
||||||
|
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
|
||||||
|
* mapping of codepoints to be applied in conjunction with other mappings.
|
||||||
|
*/
|
||||||
|
typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
||||||
* on the first byte.
|
* on the first byte.
|
||||||
@@ -473,6 +494,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
|
|||||||
* `buffer` (which must contain at least `bufsize` entries). In case of
|
* `buffer` (which must contain at least `bufsize` entries). In case of
|
||||||
* success, the number of codepoints written is returned; in case of an
|
* success, the number of codepoints written is returned; in case of an
|
||||||
* error, a negative error code is returned (@ref utf8proc_errmsg).
|
* error, a negative error code is returned (@ref utf8proc_errmsg).
|
||||||
|
* See @ref utf8proc_decompose_custom to supply additional transformations.
|
||||||
*
|
*
|
||||||
* If the number of written codepoints would be bigger than `bufsize`, the
|
* If the number of written codepoints would be bigger than `bufsize`, the
|
||||||
* required buffer size is returned, while the buffer will be overwritten with
|
* required buffer size is returned, while the buffer will be overwritten with
|
||||||
@@ -484,8 +506,20 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
);
|
);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reencodes the sequence of `length` codepoints pointed to by `buffer`
|
* The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
|
||||||
* UTF-8 data in-place (i.e., the result is also stored in `buffer`).
|
* that is called on each codepoint in `str` before any other transformations
|
||||||
|
* (along with a `custom_data` pointer that is passed through to `custom_func`).
|
||||||
|
* The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom.
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
|
||||||
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||||
|
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
|
||||||
|
utf8proc_custom_func custom_func, void *custom_data
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalizes the sequence of `length` codepoints pointed to by `buffer`
|
||||||
|
* in-place (i.e., the result is also stored in `buffer`).
|
||||||
*
|
*
|
||||||
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
||||||
* @param length the length (in codepoints) of the buffer.
|
* @param length the length (in codepoints) of the buffer.
|
||||||
@@ -500,9 +534,37 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
* the unicode versioning stability
|
* the unicode versioning stability
|
||||||
*
|
*
|
||||||
* @return
|
* @return
|
||||||
* In case of success, the length (in bytes) of the resulting UTF-8 string is
|
* In case of success, the length (in codepoints) of the normalized UTF-32 string is
|
||||||
* returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
|
* returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
|
||||||
*
|
*
|
||||||
|
* @warning The entries of the array pointed to by `buffer` have to be in the
|
||||||
|
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reencodes the sequence of `length` codepoints pointed to by `buffer`
|
||||||
|
* as UTF-8 data in-place (i.e., the result is also stored in `buffer`).
|
||||||
|
* Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
|
||||||
|
*
|
||||||
|
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
||||||
|
* @param length the length (in codepoints) of the buffer.
|
||||||
|
* @param options a bitwise or (`|`) of one or more of the following flags:
|
||||||
|
* - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
|
||||||
|
* - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
|
||||||
|
* - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
|
||||||
|
* - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
|
||||||
|
* - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
|
||||||
|
* codepoints
|
||||||
|
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
|
||||||
|
* the unicode versioning stability
|
||||||
|
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* In case of success, the length (in bytes) of the resulting nul-terminated
|
||||||
|
* UTF-8 string is returned; otherwise, a negative error code is returned
|
||||||
|
* (@ref utf8proc_errmsg).
|
||||||
|
*
|
||||||
* @warning The amount of free space pointed to by `buffer` must
|
* @warning The amount of free space pointed to by `buffer` must
|
||||||
* exceed the amount of the input data by one byte, and the
|
* exceed the amount of the input data by one byte, and the
|
||||||
* entries of the array pointed to by `str` have to be in the
|
* entries of the array pointed to by `str` have to be in the
|
||||||
@@ -513,8 +575,26 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
/**
|
/**
|
||||||
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
||||||
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||||
|
*
|
||||||
|
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
|
||||||
|
* state to break graphemes. This state can be passed in as a pointer
|
||||||
|
* in the `state` argument and should initially be set to 0. If the
|
||||||
|
* state is not passed in (i.e. a null pointer is passed), UAX#29 rules
|
||||||
|
* GB10/12/13 which require this state will not be applied, essentially
|
||||||
|
* matching the rules in Unicode 8.0.0.
|
||||||
|
*
|
||||||
|
* @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
|
||||||
|
* be called IN ORDER on ALL potential breaks in a string.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
|
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
|
||||||
|
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Same as @ref utf8proc_grapheme_break_stateful, except without support for the
|
||||||
|
* Unicode 9 additions to the algorithm. Supported for legacy reasons.
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
|
||||||
|
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -531,6 +611,13 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
|
|||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||||
|
* title-case character, if any; otherwise (if there is no title-case
|
||||||
|
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
||||||
* except that a width of 0 is returned for non-printable codepoints
|
* except that a width of 0 is returned for non-printable codepoints
|
||||||
@@ -563,7 +650,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
|
|||||||
* in any case the result will be NULL terminated (though it might
|
* in any case the result will be NULL terminated (though it might
|
||||||
* contain NULL characters with the string if `str` contained NULL
|
* contain NULL characters with the string if `str` contained NULL
|
||||||
* characters). Other flags in the `options` field are passed to the
|
* characters). Other flags in the `options` field are passed to the
|
||||||
* functions defined above, and regarded as described.
|
* functions defined above, and regarded as described. See also
|
||||||
|
* @ref utf8proc_map_custom to supply a custom codepoint transformation.
|
||||||
*
|
*
|
||||||
* In case of success the length of the new string is returned,
|
* In case of success the length of the new string is returned,
|
||||||
* otherwise a negative error code is returned.
|
* otherwise a negative error code is returned.
|
||||||
@@ -575,6 +663,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
|||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Like @ref utf8proc_map, but also takes a `custom_func` mapping function
|
||||||
|
* that is called on each codepoint in `str` before any other transformations
|
||||||
|
* (along with a `custom_data` pointer that is passed through to `custom_func`).
|
||||||
|
* The `custom_func` argument is ignored if it is `NULL`.
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
|
||||||
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
|
||||||
|
utf8proc_custom_func custom_func, void *custom_data
|
||||||
|
);
|
||||||
|
|
||||||
/** @name Unicode normalization
|
/** @name Unicode normalization
|
||||||
*
|
*
|
||||||
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
||||||
@@ -587,9 +686,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
|||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
|
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
|
||||||
/** NFC normalization (@ref UTF8PROC_COMPOSE). */
|
/** NFC normalization (@ref UTF8PROC_COMPOSE). */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
|
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
|
||||||
/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
|
/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
||||||
/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
||||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
@@ -598,4 +697,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user