[strings] adding latest utf8proc, new functions for utf8_lower (instead of case folding) and utf8_upper, and a utf8_is_whitespace that takes things like tabs into account

This commit is contained in:
Al
2016-12-31 00:52:12 -05:00
parent db16e656ca
commit 8978000320
4 changed files with 13218 additions and 15322 deletions

View File

@@ -174,14 +174,63 @@ error_free_output:
return NULL; return NULL;
} }
/**
 * Return a newly allocated, NUL-terminated copy of the UTF-8 string `s` in
 * which every codepoint has been mapped to lower or upper case.
 *
 * The input is first decomposed/normalized into UTF-32 codepoints according
 * to `options`, each codepoint is case-mapped, and the result is re-encoded
 * to UTF-8 in place (again honoring `options`).
 *
 * @param s       NUL-terminated UTF-8 input; not modified.
 * @param casing  UTF8_LOWER or UTF8_UPPER; any other value leaves the
 *                codepoints unchanged (only normalization is applied).
 * @param options utf8proc option flags passed to decompose and reencode.
 * @return heap-allocated string owned by the caller (release with free()),
 *         or NULL on invalid input or allocation failure.
 */
char *utf8_case(const char *s, casing_option_t casing, utf8proc_option_t options) {
    ssize_t len = (ssize_t)strlen(s);
    const utf8proc_uint8_t *str = (const utf8proc_uint8_t *)s;

    // Sizing pass: with a NULL buffer only the required codepoint count is returned.
    utf8proc_ssize_t result = utf8proc_decompose(str, len, NULL, 0, options);
    if (result < 0) return NULL;

    // utf8proc_reencode requires one spare byte at the end of the buffer.
    utf8proc_int32_t *buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
    if (buffer == NULL) return NULL;

    result = utf8proc_decompose(str, len, buffer, result, options);
    if (result < 0) {
        free(buffer);
        return NULL;
    }

    for (utf8proc_ssize_t i = 0; i < result; i++) {
        utf8proc_int32_t uc = buffer[i];
        // Default to the unmapped codepoint so an unknown casing is a no-op.
        utf8proc_int32_t norm = uc;

        if (casing == UTF8_LOWER) {
            norm = utf8proc_tolower(uc);
        } else if (casing == UTF8_UPPER) {
            norm = utf8proc_toupper(uc);
        }

        // Store the mapped codepoint; without this the casing has no effect.
        buffer[i] = norm;
    }

    // Converts the UTF-32 buffer to UTF-8 in place; returns the byte length.
    result = utf8proc_reencode(buffer, result, options);
    if (result < 0) {
        free(buffer);
        return NULL;
    }

    // Shrink to the encoded size plus terminator. If shrinking fails the
    // original allocation is still valid and already holds the full string.
    utf8proc_int32_t *newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result + 1);
    if (newptr != NULL) buffer = newptr;

    // Ownership passes to the caller: the buffer must not be freed here.
    return (char *)buffer;
}
/* Lowercase `s` using caller-supplied utf8proc options.
 * The result is passed through unchanged from utf8_case. */
inline char *utf8_lower_options(const char *s, utf8proc_option_t options) {
    char *lowered = utf8_case(s, UTF8_LOWER, options);
    return lowered;
}
/* Lowercase `s` with the UTF8PROC_OPTIONS_NFC option set.
 * The result is passed through unchanged from utf8_case. */
inline char *utf8_lower(const char *s) {
    char *lowered = utf8_case(s, UTF8_LOWER, UTF8PROC_OPTIONS_NFC);
    return lowered;
}
/* Uppercase `s` using caller-supplied utf8proc options.
 * The result is passed through unchanged from utf8_case. */
inline char *utf8_upper_options(const char *s, utf8proc_option_t options) {
    char *uppered = utf8_case(s, UTF8_UPPER, options);
    return uppered;
}
/* Uppercase `s` with the UTF8PROC_OPTIONS_NFC option set.
 * The result is passed through unchanged from utf8_case. */
inline char *utf8_upper(const char *s) {
    char *uppered = utf8_case(s, UTF8_UPPER, UTF8PROC_OPTIONS_NFC);
    return uppered;
}
inline bool utf8_is_letter(int cat) { inline bool utf8_is_letter(int cat) {
return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \ return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
|| cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \ || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
@@ -219,6 +268,18 @@ inline bool utf8_is_separator(int cat) {
return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP; return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
} }
/* Report whether codepoint `ch` counts as whitespace: either one of the
 * listed control characters, or any codepoint whose Unicode category is
 * accepted by utf8_is_separator. */
inline bool utf8_is_whitespace(int32_t ch) {
    switch (ch) {
        case 9:    // character tabulation
        case 10:   // line feed
        case 11:   // line tabulation
        case 12:   // form feed
        case 13:   // carriage return
        case 133:  // next line
            return true;
        default:
            break;
    }

    return utf8_is_separator(utf8proc_category(ch));
}
int utf8_compare_len(const char *str1, const char *str2, size_t len) { int utf8_compare_len(const char *str1, const char *str2, size_t len) {
if (len == 0) return 0; if (len == 0) return 0;
@@ -415,11 +476,10 @@ inline bool string_contains_hyphen(char *str) {
return string_next_hyphen_index(str, strlen(str)) >= 0; return string_next_hyphen_index(str, strlen(str)) >= 0;
} }
size_t string_right_spaces(char *str) { size_t string_right_spaces_len(char *str, size_t len) {
size_t spaces = 0; size_t spaces = 0;
uint8_t *ptr = (uint8_t *)str; uint8_t *ptr = (uint8_t *)str;
ssize_t len = strlen(str);
int32_t ch = 0; int32_t ch = 0;
ssize_t index = len; ssize_t index = len;
@@ -428,24 +488,22 @@ size_t string_right_spaces(char *str) {
if (ch <= 0) break; if (ch <= 0) break;
int cat = utf8proc_category(ch); if (!utf8_is_whitespace(ch)) {
if (!utf8_is_separator(cat)) {
break; break;
} }
index -= char_len; index -= char_len;
spaces++; spaces += char_len;
} }
return spaces; return spaces;
} }
size_t string_left_spaces(char *str) { size_t string_left_spaces_len(char *str, size_t len) {
size_t spaces = 0; size_t spaces = 0;
uint8_t *ptr = (uint8_t *)str; uint8_t *ptr = (uint8_t *)str;
size_t len = strlen(str);
int32_t ch = 0; int32_t ch = 0;
ssize_t index = 0; ssize_t index = 0;
@@ -454,22 +512,21 @@ size_t string_left_spaces(char *str) {
if (ch <= 0) break; if (ch <= 0) break;
int cat = utf8proc_category(ch); if (!utf8_is_whitespace(ch)) {
if (!utf8_is_separator(cat)) {
break; break;
} }
index += char_len; index += char_len;
ptr += char_len; ptr += char_len;
spaces++; spaces += char_len;
} }
return spaces; return spaces;
} }
/**
 * Return a newly allocated copy of `str` with leading and trailing
 * whitespace removed.
 *
 * @param str NUL-terminated UTF-8 string; not modified.
 * @return heap-allocated trimmed copy (release with free()), or NULL if the
 *         allocation inside strndup fails.
 */
char *string_trim(char *str) {
    size_t len = strlen(str);
    // Both helpers report byte counts, so they can be used as offsets directly.
    size_t left_spaces = string_left_spaces_len(str, len);
    size_t right_spaces = string_right_spaces_len(str, len);

    // When the string is entirely whitespace both scans cover the same bytes,
    // so subtracting both counts from len would wrap around as size_t.
    if (left_spaces >= len || right_spaces >= len - left_spaces) {
        return strndup(str + len, 0);
    }

    return strndup(str + left_spaces, len - left_spaces - right_spaces);
}

View File

@@ -128,7 +128,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
if (uc < 0xe0) { // 2-byte sequence if (uc < 0xe0) { // 2-byte sequence
// Must have valid continuation character // Must have valid continuation character
if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
*dst = ((uc & 0x1f)<<6) | (*str & 0x3f); *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
return 2; return 2;
} }
@@ -166,24 +166,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
if (uc < 0x00) { if (uc < 0x00) {
return 0; return 0;
} else if (uc < 0x80) { } else if (uc < 0x80) {
dst[0] = uc; dst[0] = (utf8proc_uint8_t) uc;
return 1; return 1;
} else if (uc < 0x800) { } else if (uc < 0x800) {
dst[0] = 0xC0 + (uc >> 6); dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
dst[1] = 0x80 + (uc & 0x3F); dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 2; return 2;
// Note: we allow encoding 0xd800-0xdfff here, so as not to change // Note: we allow encoding 0xd800-0xdfff here, so as not to change
// the API, however, these are actually invalid in UTF-8 // the API, however, these are actually invalid in UTF-8
} else if (uc < 0x10000) { } else if (uc < 0x10000) {
dst[0] = 0xE0 + (uc >> 12); dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
dst[1] = 0x80 + ((uc >> 6) & 0x3F); dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
dst[2] = 0x80 + (uc & 0x3F); dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 3; return 3;
} else if (uc < 0x110000) { } else if (uc < 0x110000) {
dst[0] = 0xF0 + (uc >> 18); dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
dst[1] = 0x80 + ((uc >> 12) & 0x3F); dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
dst[2] = 0x80 + ((uc >> 6) & 0x3F); dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
dst[3] = 0x80 + (uc & 0x3F); dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 4; return 4;
} else return 0; } else return 0;
} }
@@ -193,28 +193,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
if (uc < 0x00) { if (uc < 0x00) {
return 0; return 0;
} else if (uc < 0x80) { } else if (uc < 0x80) {
dst[0] = uc; dst[0] = (utf8proc_uint8_t)uc;
return 1; return 1;
} else if (uc < 0x800) { } else if (uc < 0x800) {
dst[0] = 0xC0 + (uc >> 6); dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
dst[1] = 0x80 + (uc & 0x3F); dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 2; return 2;
} else if (uc == 0xFFFF) { } else if (uc == 0xFFFF) {
dst[0] = 0xFF; dst[0] = (utf8proc_uint8_t)0xFF;
return 1; return 1;
} else if (uc == 0xFFFE) { } else if (uc == 0xFFFE) {
dst[0] = 0xFE; dst[0] = (utf8proc_uint8_t)0xFE;
return 1; return 1;
} else if (uc < 0x10000) { } else if (uc < 0x10000) {
dst[0] = 0xE0 + (uc >> 12); dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
dst[1] = 0x80 + ((uc >> 6) & 0x3F); dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
dst[2] = 0x80 + (uc & 0x3F); dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 3; return 3;
} else if (uc < 0x110000) { } else if (uc < 0x110000) {
dst[0] = 0xF0 + (uc >> 18); dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
dst[1] = 0x80 + ((uc >> 12) & 0x3F); dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
dst[2] = 0x80 + ((uc >> 6) & 0x3F); dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
dst[3] = 0x80 + (uc & 0x3F); dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 4; return 4;
} else return 0; } else return 0;
} }
@@ -233,48 +233,144 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc); return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
} }
/* return whether there is a grapheme break between boundclasses lbc and tbc */ /* return whether there is a grapheme break between boundclasses lbc and tbc
static utf8proc_bool grapheme_break(int lbc, int tbc) { (according to the definition of extended grapheme clusters)
return Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : http://www.unicode.org/reports/tr29/tr29-29.html
(lbc == UTF8PROC_BOUNDCLASS_CR && CAVEATS:
tbc == UTF8PROC_BOUNDCLASS_LF) ? false : Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : and GB 12/13 (regional indicator code points) require knowledge of previous characters
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : and are thus not handled by this function. This may result in an incorrect break before
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : an E_Modifier class codepoint and an incorrectly missing break between two
(lbc == UTF8PROC_BOUNDCLASS_L && REGIONAL_INDICATOR class code points if such support does not exist in the caller.
(tbc == UTF8PROC_BOUNDCLASS_L || See the special support in grapheme_break_extended, for required bookkeeping by the caller.
tbc == UTF8PROC_BOUNDCLASS_V || */
tbc == UTF8PROC_BOUNDCLASS_LV || static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : return
((lbc == UTF8PROC_BOUNDCLASS_LV || (lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
lbc == UTF8PROC_BOUNDCLASS_V) && (lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
(tbc == UTF8PROC_BOUNDCLASS_V || tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
((lbc == UTF8PROC_BOUNDCLASS_LVT || (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
lbc == UTF8PROC_BOUNDCLASS_T) && (lbc == UTF8PROC_BOUNDCLASS_L && // GB6
tbc == UTF8PROC_BOUNDCLASS_T) ? false : (tbc == UTF8PROC_BOUNDCLASS_L || // ---
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && tbc == UTF8PROC_BOUNDCLASS_V || // ---
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : tbc == UTF8PROC_BOUNDCLASS_LV || // ---
(tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK); tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
lbc == UTF8PROC_BOUNDCLASS_V) && // ---
(tbc == UTF8PROC_BOUNDCLASS_V || // ---
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
lbc == UTF8PROC_BOUNDCLASS_T) && // ---
tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
(tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
true; // GB999
} }
/* return whether there is a grapheme break between codepoints c1 and c2 */ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) { {
return grapheme_break(utf8proc_get_property(c1)->boundclass, int lbc_override = lbc;
utf8proc_get_property(c2)->boundclass); if (state && *state != UTF8PROC_BOUNDCLASS_START)
lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
if (state) {
// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
// after that character according to GB999 (unless of course such a break is
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB10. Fold any EXTEND codepoints into the previous
// boundclass if we're dealing with an emoji base boundclass.
else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
tbc == UTF8PROC_BOUNDCLASS_EXTEND)
*state = UTF8PROC_BOUNDCLASS_E_BASE;
else
*state = tbc;
}
return break_permitted;
}
/* Return whether a grapheme break is permitted between codepoints c1 and c2.
 * Looks up the boundclass of each codepoint and defers to
 * grapheme_break_extended; `state` is forwarded unchanged and may be NULL. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  const int left_class = utf8proc_get_property(c1)->boundclass;
  const int right_class = utf8proc_get_property(c2)->boundclass;
  return grapheme_break_extended(left_class, right_class, state);
}
/* Stateless variant of utf8proc_grapheme_break_stateful: passes a NULL state
 * pointer, so no bookkeeping is carried between calls. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
  utf8proc_int32_t *no_state = NULL;
  return utf8proc_grapheme_break_stateful(c1, c2, no_state);
}
/* Decode one codepoint from a table of 16-bit units.
 * A unit outside 0xD800-0xDFFF is the codepoint itself. Otherwise it is
 * combined with the following unit as a surrogate pair, and *entry is
 * advanced to that second unit. */
static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
{
  const utf8proc_uint16_t *cursor = *entry;
  utf8proc_int32_t first = *cursor;

  if ((first & 0xF800) != 0xD800) {
    return first;
  }

  /* Surrogate pair: consume the second unit and leave *entry pointing at it. */
  cursor++;
  *entry = cursor;

  utf8proc_int32_t high_bits = (first & 0x03FF) << 10;
  utf8proc_int32_t low_bits = *cursor & 0x03FF;
  return (high_bits | low_bits) + 0x10000;
}
/* Decode the single codepoint stored at position `seqindex` of
 * utf8proc_sequences. */
static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
{
  const utf8proc_uint16_t *cursor = utf8proc_sequences + seqindex;
  return seqindex_decode_entry(&cursor);
}
/* Expand the codepoint sequence referenced by `seqindex`, recursively
 * decomposing each codepoint into `dst`.
 *
 * The low 13 bits of `seqindex` are the offset into utf8proc_sequences; the
 * top 3 bits hold the sequence length minus one. A top-bits value of 7 means
 * the length-minus-one is stored in the first table unit instead.
 *
 * Returns the accumulated count reported by utf8proc_decompose_char, or
 * UTF8PROC_ERROR_OVERFLOW if that running total becomes negative. */
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  const utf8proc_uint16_t *cursor = utf8proc_sequences + (seqindex & 0x1FFF);
  int remaining = seqindex >> 13;
  utf8proc_ssize_t total = 0;

  if (remaining >= 7) {
    remaining = *cursor;
    cursor++;
  }

  while (remaining >= 0) {
    utf8proc_int32_t cp = seqindex_decode_entry(&cursor);
    /* Once the buffer is exhausted keep counting with a zero-sized window. */
    utf8proc_ssize_t room = (bufsize > total) ? (bufsize - total) : 0;
    total += utf8proc_decompose_char(cp, dst + total, room, options, last_boundclass);
    if (total < 0) return UTF8PROC_ERROR_OVERFLOW;
    cursor++;
    remaining--;
  }

  return total;
}
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{ {
utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping; utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
return cl >= 0 ? cl : c; return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
} }
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{ {
utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping; utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
return cu >= 0 ? cu : c; return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
}
/* Return the titlecase mapping of codepoint `c`, or `c` itself when its
 * property record has no titlecase entry (index equal to UINT16_MAX). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
{
  utf8proc_int32_t title_index = utf8proc_get_property(c)->titlecase_seqindex;
  if (title_index != UINT16_MAX) {
    return seqindex_decode_index(title_index);
  }
  return c;
}
/* return a character width analogous to wcwidth (except portable and /* return a character width analogous to wcwidth (except portable and
@@ -357,39 +453,20 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
category == UTF8PROC_CATEGORY_ME) return 0; category == UTF8PROC_CATEGORY_ME) return 0;
} }
if (options & UTF8PROC_CASEFOLD) { if (options & UTF8PROC_CASEFOLD) {
if (property->casefold_mapping != UINT16_MAX) { if (property->casefold_seqindex != UINT16_MAX) {
const utf8proc_int32_t *casefold_entry; return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
utf8proc_ssize_t written = 0;
for (casefold_entry = &utf8proc_sequences[property->casefold_mapping];
*casefold_entry >= 0; casefold_entry++) {
written += utf8proc_decompose_char(*casefold_entry, dst+written,
(bufsize > written) ? (bufsize - written) : 0, options,
last_boundclass);
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
}
return written;
} }
} }
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
if (property->decomp_mapping != UINT16_MAX && if (property->decomp_seqindex != UINT16_MAX &&
(!property->decomp_type || (options & UTF8PROC_COMPAT))) { (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
const utf8proc_int32_t *decomp_entry; return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
utf8proc_ssize_t written = 0;
for (decomp_entry = &utf8proc_sequences[property->decomp_mapping];
*decomp_entry >= 0; decomp_entry++) {
written += utf8proc_decompose_char(*decomp_entry, dst+written,
(bufsize > written) ? (bufsize - written) : 0, options,
last_boundclass);
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
}
return written;
} }
} }
if (options & UTF8PROC_CHARBOUND) { if (options & UTF8PROC_CHARBOUND) {
utf8proc_bool boundary; utf8proc_bool boundary;
int tbc = property->boundclass; int tbc = property->boundclass;
boundary = grapheme_break(*last_boundclass, tbc); boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
*last_boundclass = tbc;
if (boundary) { if (boundary) {
if (bufsize >= 1) dst[0] = 0xFFFF; if (bufsize >= 1) dst[0] = 0xFFFF;
if (bufsize >= 2) dst[1] = uc; if (bufsize >= 2) dst[1] = uc;
@@ -403,6 +480,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) {
return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
}
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
utf8proc_custom_func custom_func, void *custom_data
) { ) {
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
utf8proc_ssize_t wpos = 0; utf8proc_ssize_t wpos = 0;
@@ -429,6 +514,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
} }
if (custom_func != NULL) {
uc = custom_func(uc, custom_data); /* user-specified custom mapping */
}
decomp_result = utf8proc_decompose_char( decomp_result = utf8proc_decompose_char(
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
&boundclass &boundclass
@@ -463,9 +551,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
return wpos; return wpos;
} }
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
ASSERT: 'buffer' has one spare byte of free space at the end! */
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
utf8proc_ssize_t rpos; utf8proc_ssize_t rpos;
utf8proc_ssize_t wpos = 0; utf8proc_ssize_t wpos = 0;
@@ -538,17 +625,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
if (!starter_property) { if (!starter_property) {
starter_property = unsafe_get_property(*starter); starter_property = unsafe_get_property(*starter);
} }
if (starter_property->comb1st_index >= 0 && if (starter_property->comb_index < 0x8000 &&
current_property->comb2nd_index >= 0) { current_property->comb_index != UINT16_MAX &&
composition = utf8proc_combinations[ current_property->comb_index >= 0x8000) {
starter_property->comb1st_index + int sidx = starter_property->comb_index;
current_property->comb2nd_index int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
]; if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
if (composition >= 0 && (!(options & UTF8PROC_STABLE) || idx += sidx + 2;
!(unsafe_get_property(composition)->comp_exclusion))) { if (current_property->comb_index & 0x4000) {
*starter = composition; composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
starter_property = NULL; } else
continue; composition = utf8proc_combinations[idx];
if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
!(unsafe_get_property(composition)->comp_exclusion))) {
*starter = composition;
starter_property = NULL;
continue;
}
} }
} }
} }
@@ -566,6 +660,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
} }
length = wpos; length = wpos;
} }
return length;
}
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
ASSERT: 'buffer' has one spare byte of free space at the end! */
length = utf8proc_normalize_utf32(buffer, length, options);
if (length < 0) return length;
{ {
utf8proc_ssize_t rpos, wpos = 0; utf8proc_ssize_t rpos, wpos = 0;
utf8proc_int32_t uc; utf8proc_int32_t uc;
@@ -587,15 +689,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
}
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
utf8proc_custom_func custom_func, void *custom_data
) { ) {
utf8proc_int32_t *buffer; utf8proc_int32_t *buffer;
utf8proc_ssize_t result; utf8proc_ssize_t result;
*dstptr = NULL; *dstptr = NULL;
result = utf8proc_decompose(str, strlen, NULL, 0, options); result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
if (result < 0) return result; if (result < 0) return result;
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
if (!buffer) return UTF8PROC_ERROR_NOMEM; if (!buffer) return UTF8PROC_ERROR_NOMEM;
result = utf8proc_decompose(str, strlen, buffer, result, options); result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
if (result < 0) { if (result < 0) {
free(buffer); free(buffer);
return result; return result;
@@ -640,5 +749,4 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_COMPOSE | UTF8PROC_COMPAT); UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
return retval; return retval;
} }

View File

@@ -22,7 +22,7 @@
*/ */
/** /**
* @mainpage * @mainpage
* *
* utf8proc is a free/open-source (MIT/expat licensed) C library * utf8proc is a free/open-source (MIT/expat licensed) C library
@@ -54,7 +54,7 @@
#define UTF8PROC_H #define UTF8PROC_H
/** @name API version /** @name API version
* *
* The utf8proc API version MAJOR.MINOR.PATCH, following * The utf8proc API version MAJOR.MINOR.PATCH, following
* semantic-versioning rules (http://semver.org) based on API * semantic-versioning rules (http://semver.org) based on API
* compatibility. * compatibility.
@@ -63,21 +63,23 @@
* runtime version may append a string like "-dev" to the version number * runtime version may append a string like "-dev" to the version number
* for prerelease versions. * for prerelease versions.
* *
* @note The shared-library version number in the Makefile may be different, * @note The shared-library version number in the Makefile
* (and CMakeLists.txt, and MANIFEST) may be different,
* being based on ABI compatibility rather than API compatibility. * being based on ABI compatibility rather than API compatibility.
*/ */
/** @{ */ /** @{ */
/** The MAJOR version number (increased when backwards API compatibility is broken). */ /** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 1 #define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 3 #define UTF8PROC_VERSION_MINOR 1
/** The PATCH version (increased for fixes that do not change the API). */ /** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 0 #define UTF8PROC_VERSION_PATCH 0
/** @} */ /** @} */
#include <stdlib.h> #include <stdlib.h>
#include <sys/types.h> #include <sys/types.h>
#ifdef _MSC_VER #if defined(_MSC_VER) && _MSC_VER < 1800
// MSVC prior to 2013 lacked stdbool.h and inttypes.h
typedef signed char utf8proc_int8_t; typedef signed char utf8proc_int8_t;
typedef unsigned char utf8proc_uint8_t; typedef unsigned char utf8proc_uint8_t;
typedef short utf8proc_int16_t; typedef short utf8proc_int16_t;
@@ -92,12 +94,18 @@ typedef int utf8proc_ssize_t;
typedef unsigned int utf8proc_size_t; typedef unsigned int utf8proc_size_t;
# endif # endif
# ifndef __cplusplus # ifndef __cplusplus
// emulate C99 bool
typedef unsigned char utf8proc_bool; typedef unsigned char utf8proc_bool;
enum {false, true}; # ifndef __bool_true_false_are_defined
# define false 0
# define true 1
# define __bool_true_false_are_defined 1
# endif
# else # else
typedef bool utf8proc_bool; typedef bool utf8proc_bool;
# endif # endif
#else #else
# include <stddef.h>
# include <stdbool.h> # include <stdbool.h>
# include <inttypes.h> # include <inttypes.h>
typedef int8_t utf8proc_int8_t; typedef int8_t utf8proc_int8_t;
@@ -107,7 +115,7 @@ typedef uint16_t utf8proc_uint16_t;
typedef int32_t utf8proc_int32_t; typedef int32_t utf8proc_int32_t;
typedef uint32_t utf8proc_uint32_t; typedef uint32_t utf8proc_uint32_t;
typedef size_t utf8proc_size_t; typedef size_t utf8proc_size_t;
typedef ssize_t utf8proc_ssize_t; typedef ptrdiff_t utf8proc_ssize_t;
typedef bool utf8proc_bool; typedef bool utf8proc_bool;
#endif #endif
#include <limits.h> #include <limits.h>
@@ -133,7 +141,7 @@ extern "C" {
#endif #endif
#ifndef UINT16_MAX #ifndef UINT16_MAX
# define UINT16_MAX ~(utf8proc_uint16_t)0 # define UINT16_MAX 65535U
#endif #endif
/** /**
@@ -242,13 +250,12 @@ typedef struct utf8proc_property_struct {
* @see utf8proc_decomp_type_t. * @see utf8proc_decomp_type_t.
*/ */
utf8proc_propval_t decomp_type; utf8proc_propval_t decomp_type;
utf8proc_uint16_t decomp_mapping; utf8proc_uint16_t decomp_seqindex;
utf8proc_uint16_t casefold_mapping; utf8proc_uint16_t casefold_seqindex;
utf8proc_int32_t uppercase_mapping; utf8proc_uint16_t uppercase_seqindex;
utf8proc_int32_t lowercase_mapping; utf8proc_uint16_t lowercase_seqindex;
utf8proc_int32_t titlecase_mapping; utf8proc_uint16_t titlecase_seqindex;
utf8proc_int32_t comb1st_index; utf8proc_uint16_t comb_index;
utf8proc_int32_t comb2nd_index;
unsigned bidi_mirrored:1; unsigned bidi_mirrored:1;
unsigned comp_exclusion:1; unsigned comp_exclusion:1;
/** /**
@@ -259,13 +266,14 @@ typedef struct utf8proc_property_struct {
*/ */
unsigned ignorable:1; unsigned ignorable:1;
unsigned control_boundary:1; unsigned control_boundary:1;
/** The width of the codepoint. */
unsigned charwidth:2;
unsigned pad:2;
/** /**
* Boundclass. * Boundclass.
* @see utf8proc_boundclass_t. * @see utf8proc_boundclass_t.
*/ */
unsigned boundclass:4; unsigned boundclass:8;
/** The width of the codepoint. */
unsigned charwidth:2;
} utf8proc_property_t; } utf8proc_property_t;
/** Unicode categories. */ /** Unicode categories. */
@@ -349,7 +357,7 @@ typedef enum {
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */ UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
} utf8proc_decomp_type_t; } utf8proc_decomp_type_t;
/** Boundclass property. */ /** Boundclass property. (TR29) */
typedef enum { typedef enum {
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */ UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */ UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
@@ -364,8 +372,21 @@ typedef enum {
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */ UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */ UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */ UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */
} utf8proc_boundclass_t; } utf8proc_boundclass_t;
/**
* Function pointer type passed to @ref utf8proc_map_custom and
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
* mapping of codepoints to be applied in conjunction with other mappings.
*/
typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
/** /**
* Array containing the byte lengths of a UTF-8 encoded codepoint based * Array containing the byte lengths of a UTF-8 encoded codepoint based
* on the first byte. * on the first byte.
@@ -473,6 +494,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
* `buffer` (which must contain at least `bufsize` entries). In case of * `buffer` (which must contain at least `bufsize` entries). In case of
* success, the number of codepoints written is returned; in case of an * success, the number of codepoints written is returned; in case of an
* error, a negative error code is returned (@ref utf8proc_errmsg). * error, a negative error code is returned (@ref utf8proc_errmsg).
* See @ref utf8proc_decompose_custom to supply additional transformations.
* *
* If the number of written codepoints would be bigger than `bufsize`, the * If the number of written codepoints would be bigger than `bufsize`, the
* required buffer size is returned, while the buffer will be overwritten with * required buffer size is returned, while the buffer will be overwritten with
@@ -484,8 +506,20 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
); );
/** /**
* Reencodes the sequence of `length` codepoints pointed to by `buffer` * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
* UTF-8 data in-place (i.e., the result is also stored in `buffer`). * that is called on each codepoint in `str` before any other transformations
* (along with a `custom_data` pointer that is passed through to `custom_func`).
* The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom.
*/
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, /* input string whose codepoints are passed to custom_func */
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, /* buffer must hold at least bufsize entries */
utf8proc_custom_func custom_func, void *custom_data /* custom_func is ignored if NULL; custom_data is passed through to it */
);
/**
* Normalizes the sequence of `length` codepoints pointed to by `buffer`
* in-place (i.e., the result is also stored in `buffer`).
* *
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode. * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
* @param length the length (in codepoints) of the buffer. * @param length the length (in codepoints) of the buffer.
@@ -500,9 +534,37 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
* the unicode versioning stability * the unicode versioning stability
* *
* @return * @return
* In case of success, the length (in bytes) of the resulting UTF-8 string is * In case of success, the length (in codepoints) of the normalized UTF-32 string is
* returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg). * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
* *
* @warning The entries of the array pointed to by `buffer` have to be in the
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
*/
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
/**
* Reencodes the sequence of `length` codepoints pointed to by `buffer`
* UTF-8 data in-place (i.e., the result is also stored in `buffer`).
* Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
*
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
* @param length the length (in codepoints) of the buffer.
* @param options a bitwise or (`|`) of one or more of the following flags:
* - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
* - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
* - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
* - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
* - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
* codepoints
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
* the unicode versioning stability
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
*
* @return
* In case of success, the length (in bytes) of the resulting nul-terminated
* UTF-8 string is returned; otherwise, a negative error code is returned
* (@ref utf8proc_errmsg).
*
* @warning The amount of free space pointed to by `buffer` must * @warning The amount of free space pointed to by `buffer` must
* exceed the amount of the input data by one byte, and the * exceed the amount of the input data by one byte, and the
* entries of the array pointed to by `str` have to be in the * entries of the array pointed to by `str` have to be in the
@@ -513,8 +575,26 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
/** /**
* Given a pair of consecutive codepoints, return whether a grapheme break is * Given a pair of consecutive codepoints, return whether a grapheme break is
* permitted between them (as defined by the extended grapheme clusters in UAX#29). * permitted between them (as defined by the extended grapheme clusters in UAX#29).
*
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
* state to break graphemes. This state can be passed in as a pointer
* in the `state` argument and should initially be set to 0. If the
* state is not passed in (i.e. a null pointer is passed), UAX#29 rules
* GB10/12/13 which require this state will not be applied, essentially
* matching the rules in Unicode 8.0.0.
*
* @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
* be called IN ORDER on ALL potential breaks in a string.
*/ */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
/**
* Same as @ref utf8proc_grapheme_break_stateful, except without support for the
* Unicode 9 additions to the algorithm. Supported for legacy reasons.
*
* No state is carried between calls, so the UAX#29 rules that need state
* (GB10/12/13) are not applied.
*
* @param codepoint1 the first codepoint of the consecutive pair.
* @param codepoint2 the second codepoint of the consecutive pair.
*
* @return whether a grapheme break is permitted between `codepoint1` and
* `codepoint2`.
*/
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
/** /**
@@ -531,11 +611,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
*/ */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c); UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
/**
* Given a codepoint `c`, return the codepoint of the corresponding
* title-case character, if any; otherwise (if there is no title-case
* variant, or if `c` is not a valid codepoint) return `c`.
*
* @param c the codepoint to convert.
*
* @return the title-case codepoint for `c`, or `c` itself when no title-case
* variant exists or `c` is not a valid codepoint.
*/
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
/** /**
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
* except that a width of 0 is returned for non-printable codepoints * except that a width of 0 is returned for non-printable codepoints
* instead of -1 as in `wcwidth`. * instead of -1 as in `wcwidth`.
* *
* @note * @note
* If you want to check for particular types of non-printable characters, * If you want to check for particular types of non-printable characters,
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */ * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
@@ -563,7 +650,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
* in any case the result will be NULL terminated (though it might * in any case the result will be NULL terminated (though it might
* contain NULL characters with the string if `str` contained NULL * contain NULL characters with the string if `str` contained NULL
* characters). Other flags in the `options` field are passed to the * characters). Other flags in the `options` field are passed to the
* functions defined above, and regarded as described. * functions defined above, and regarded as described. See also
* @ref utf8proc_map_custom to supply a custom codepoint transformation.
* *
* In case of success the length of the new string is returned, * In case of success the length of the new string is returned,
* otherwise a negative error code is returned. * otherwise a negative error code is returned.
@@ -575,6 +663,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
); );
/**
* Like @ref utf8proc_map, but also takes a `custom_func` mapping function
* that is called on each codepoint in `str` before any other transformations
* (along with a `custom_data` pointer that is passed through to `custom_func`).
* The `custom_func` argument is ignored if it is `NULL`.
*
* @param custom_func optional per-codepoint mapping function
* (see @ref utf8proc_custom_func); may be `NULL`.
* @param custom_data pointer passed through to each `custom_func` call.
*
* NOTE(review): the remaining parameters and the return value are presumably
* those of @ref utf8proc_map -- confirm against its documentation.
*/
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
utf8proc_custom_func custom_func, void *custom_data
);
/** @name Unicode normalization /** @name Unicode normalization
* *
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
@@ -587,9 +686,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
/** NFC normalization (@ref UTF8PROC_COMPOSE). */ /** NFC normalization (@ref UTF8PROC_COMPOSE). */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ /** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
/** @} */ /** @} */
@@ -597,5 +696,4 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
} }
#endif #endif
#endif #endif

File diff suppressed because it is too large Load Diff