[unicode] Upgrading to JuliaLang's utf8proc (Unicode 7, maintained)
This commit is contained in:
@@ -43,7 +43,7 @@
|
|||||||
#include "utf8proc_data.c"
|
#include "utf8proc_data.c"
|
||||||
|
|
||||||
|
|
||||||
const int8_t utf8proc_utf8class[256] = {
|
DLLEXPORT const int8_t utf8proc_utf8class[256] = {
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
@@ -81,25 +81,14 @@ const int8_t utf8proc_utf8class[256] = {
|
|||||||
#define UTF8PROC_HANGUL_S_START 0xAC00
|
#define UTF8PROC_HANGUL_S_START 0xAC00
|
||||||
#define UTF8PROC_HANGUL_S_END 0xD7A4
|
#define UTF8PROC_HANGUL_S_END 0xD7A4
|
||||||
|
|
||||||
|
/* Should follow semantic-versioning rules (semver.org) based on API
|
||||||
#define UTF8PROC_BOUNDCLASS_START 0
|
compatibility. (Note that the shared-library version number will
|
||||||
#define UTF8PROC_BOUNDCLASS_OTHER 1
|
be different, being based on ABI compatibility.): */
|
||||||
#define UTF8PROC_BOUNDCLASS_CR 2
|
DLLEXPORT const char *utf8proc_version(void) {
|
||||||
#define UTF8PROC_BOUNDCLASS_LF 3
|
return "1.2-dev";
|
||||||
#define UTF8PROC_BOUNDCLASS_CONTROL 4
|
|
||||||
#define UTF8PROC_BOUNDCLASS_EXTEND 5
|
|
||||||
#define UTF8PROC_BOUNDCLASS_L 6
|
|
||||||
#define UTF8PROC_BOUNDCLASS_V 7
|
|
||||||
#define UTF8PROC_BOUNDCLASS_T 8
|
|
||||||
#define UTF8PROC_BOUNDCLASS_LV 9
|
|
||||||
#define UTF8PROC_BOUNDCLASS_LVT 10
|
|
||||||
|
|
||||||
|
|
||||||
const char *utf8proc_version(void) {
|
|
||||||
return "1.1.6";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *utf8proc_errmsg(ssize_t errcode) {
|
DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode) {
|
||||||
switch (errcode) {
|
switch (errcode) {
|
||||||
case UTF8PROC_ERROR_NOMEM:
|
case UTF8PROC_ERROR_NOMEM:
|
||||||
return "Memory for processing UTF-8 data could not be allocated.";
|
return "Memory for processing UTF-8 data could not be allocated.";
|
||||||
@@ -112,11 +101,11 @@ const char *utf8proc_errmsg(ssize_t errcode) {
|
|||||||
case UTF8PROC_ERROR_INVALIDOPTS:
|
case UTF8PROC_ERROR_INVALIDOPTS:
|
||||||
return "Invalid options for UTF-8 processing chosen.";
|
return "Invalid options for UTF-8 processing chosen.";
|
||||||
default:
|
default:
|
||||||
return "An unknown error occured while processing UTF-8 data.";
|
return "An unknown error occurred while processing UTF-8 data.";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ssize_t utf8proc_iterate(
|
DLLEXPORT ssize_t utf8proc_iterate(
|
||||||
const uint8_t *str, ssize_t strlen, int32_t *dst
|
const uint8_t *str, ssize_t strlen, int32_t *dst
|
||||||
) {
|
) {
|
||||||
int length;
|
int length;
|
||||||
@@ -156,14 +145,14 @@ ssize_t utf8proc_iterate(
|
|||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool utf8proc_codepoint_valid(int32_t uc) {
|
DLLEXPORT bool utf8proc_codepoint_valid(int32_t uc) {
|
||||||
if (uc < 0 || uc >= 0x110000 ||
|
if (uc < 0 || uc >= 0x110000 ||
|
||||||
((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
|
((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
|
||||||
(uc >= 0xFDD0 && uc < 0xFDF0)) return false;
|
(uc >= 0xFDD0 && uc < 0xFDF0)) return false;
|
||||||
else return true;
|
else return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
|
DLLEXPORT ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
|
||||||
if (uc < 0x00) {
|
if (uc < 0x00) {
|
||||||
return 0;
|
return 0;
|
||||||
} else if (uc < 0x80) {
|
} else if (uc < 0x80) {
|
||||||
@@ -193,7 +182,8 @@ ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
|
|||||||
} else return 0;
|
} else return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
|
/* internal "unsafe" version that does not check whether uc is in range */
|
||||||
|
static const utf8proc_property_t *get_property(int32_t uc) {
|
||||||
/* ASSERT: uc >= 0 && uc < 0x110000 */
|
/* ASSERT: uc >= 0 && uc < 0x110000 */
|
||||||
return utf8proc_properties + (
|
return utf8proc_properties + (
|
||||||
utf8proc_stage2table[
|
utf8proc_stage2table[
|
||||||
@@ -202,17 +192,67 @@ const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
|
||||||
|
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* return whether there is a grapheme break between boundclasses lbc and tbc */
|
||||||
|
static bool grapheme_break(int lbc, int tbc) {
|
||||||
|
return
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_CR &&
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
|
||||||
|
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
||||||
|
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_L &&
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_L ||
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_V ||
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_LV ||
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
|
||||||
|
((lbc == UTF8PROC_BOUNDCLASS_LV ||
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_V) &&
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_V ||
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
|
||||||
|
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_T) &&
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
|
||||||
|
(tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* return whether there is a grapheme break between codepoints c1 and c2 */
|
||||||
|
DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
|
||||||
|
return grapheme_break(utf8proc_get_property(c1)->boundclass,
|
||||||
|
utf8proc_get_property(c2)->boundclass);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* return a character width analogous to wcwidth (except portable and
|
||||||
|
hopefully less buggy than most system wcwidth functions). */
|
||||||
|
DLLEXPORT int utf8proc_charwidth(int32_t c) {
|
||||||
|
return utf8proc_get_property(c)->charwidth;
|
||||||
|
}
|
||||||
|
|
||||||
|
DLLEXPORT int utf8proc_category(int32_t c) {
|
||||||
|
return utf8proc_get_property(c)->category;
|
||||||
|
}
|
||||||
|
|
||||||
|
DLLEXPORT const char *utf8proc_category_string(int32_t c) {
|
||||||
|
static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
|
||||||
|
return s[utf8proc_category(c)];
|
||||||
|
}
|
||||||
|
|
||||||
#define utf8proc_decompose_lump(replacement_uc) \
|
#define utf8proc_decompose_lump(replacement_uc) \
|
||||||
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
||||||
options & ~UTF8PROC_LUMP, last_boundclass)
|
options & ~UTF8PROC_LUMP, last_boundclass)
|
||||||
|
|
||||||
ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
|
DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, int options, int *last_boundclass) {
|
||||||
int options, int *last_boundclass) {
|
|
||||||
/* ASSERT: uc >= 0 && uc < 0x110000 */
|
|
||||||
const utf8proc_property_t *property;
|
const utf8proc_property_t *property;
|
||||||
utf8proc_propval_t category;
|
utf8proc_propval_t category;
|
||||||
int32_t hangul_sindex;
|
int32_t hangul_sindex;
|
||||||
property = utf8proc_get_property(uc);
|
if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
|
||||||
|
property = get_property(uc);
|
||||||
category = property->category;
|
category = property->category;
|
||||||
hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
|
hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
|
||||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||||
@@ -298,48 +338,8 @@ ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
|
|||||||
}
|
}
|
||||||
if (options & UTF8PROC_CHARBOUND) {
|
if (options & UTF8PROC_CHARBOUND) {
|
||||||
bool boundary;
|
bool boundary;
|
||||||
int tbc, lbc;
|
int tbc = property->boundclass;
|
||||||
tbc =
|
boundary = grapheme_break(*last_boundclass, tbc);
|
||||||
(uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
|
|
||||||
(uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
|
|
||||||
((category == UTF8PROC_CATEGORY_ZL ||
|
|
||||||
category == UTF8PROC_CATEGORY_ZP ||
|
|
||||||
category == UTF8PROC_CATEGORY_CC ||
|
|
||||||
category == UTF8PROC_CATEGORY_CF) &&
|
|
||||||
!(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
|
|
||||||
property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
|
|
||||||
((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
|
|
||||||
uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
|
|
||||||
(uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
|
|
||||||
UTF8PROC_BOUNDCLASS_V :
|
|
||||||
(uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
|
|
||||||
UTF8PROC_BOUNDCLASS_T :
|
|
||||||
(uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
|
|
||||||
((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
|
|
||||||
UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
|
|
||||||
) :
|
|
||||||
UTF8PROC_BOUNDCLASS_OTHER;
|
|
||||||
lbc = *last_boundclass;
|
|
||||||
boundary =
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_CR &&
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_L &&
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_L ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_V ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LV ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
|
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_LV ||
|
|
||||||
lbc == UTF8PROC_BOUNDCLASS_V) &&
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_V ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
|
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
|
|
||||||
lbc == UTF8PROC_BOUNDCLASS_T) &&
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
|
|
||||||
true;
|
|
||||||
*last_boundclass = tbc;
|
*last_boundclass = tbc;
|
||||||
if (boundary) {
|
if (boundary) {
|
||||||
if (bufsize >= 1) dst[0] = 0xFFFF;
|
if (bufsize >= 1) dst[0] = 0xFFFF;
|
||||||
@@ -351,7 +351,7 @@ ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ssize_t utf8proc_decompose(
|
DLLEXPORT ssize_t utf8proc_decompose(
|
||||||
const uint8_t *str, ssize_t strlen,
|
const uint8_t *str, ssize_t strlen,
|
||||||
int32_t *buffer, ssize_t bufsize, int options
|
int32_t *buffer, ssize_t bufsize, int options
|
||||||
) {
|
) {
|
||||||
@@ -370,7 +370,7 @@ ssize_t utf8proc_decompose(
|
|||||||
while (1) {
|
while (1) {
|
||||||
if (options & UTF8PROC_NULLTERM) {
|
if (options & UTF8PROC_NULLTERM) {
|
||||||
rpos += utf8proc_iterate(str + rpos, -1, &uc);
|
rpos += utf8proc_iterate(str + rpos, -1, &uc);
|
||||||
/* checking of return value is not neccessary,
|
/* checking of return value is not necessary,
|
||||||
as 'uc' is < 0 in case of error */
|
as 'uc' is < 0 in case of error */
|
||||||
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||||
if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
|
if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
|
||||||
@@ -398,8 +398,8 @@ ssize_t utf8proc_decompose(
|
|||||||
const utf8proc_property_t *property1, *property2;
|
const utf8proc_property_t *property1, *property2;
|
||||||
uc1 = buffer[pos];
|
uc1 = buffer[pos];
|
||||||
uc2 = buffer[pos+1];
|
uc2 = buffer[pos+1];
|
||||||
property1 = utf8proc_get_property(uc1);
|
property1 = get_property(uc1);
|
||||||
property2 = utf8proc_get_property(uc2);
|
property2 = get_property(uc2);
|
||||||
if (property1->combining_class > property2->combining_class &&
|
if (property1->combining_class > property2->combining_class &&
|
||||||
property2->combining_class > 0) {
|
property2->combining_class > 0) {
|
||||||
buffer[pos] = uc2;
|
buffer[pos] = uc2;
|
||||||
@@ -413,7 +413,7 @@ ssize_t utf8proc_decompose(
|
|||||||
return wpos;
|
return wpos;
|
||||||
}
|
}
|
||||||
|
|
||||||
ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
|
DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
|
||||||
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
||||||
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
||||||
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
||||||
@@ -457,7 +457,7 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
|
|||||||
int32_t composition;
|
int32_t composition;
|
||||||
for (rpos = 0; rpos < length; rpos++) {
|
for (rpos = 0; rpos < length; rpos++) {
|
||||||
current_char = buffer[rpos];
|
current_char = buffer[rpos];
|
||||||
current_property = utf8proc_get_property(current_char);
|
current_property = get_property(current_char);
|
||||||
if (starter && current_property->combining_class > max_combining_class) {
|
if (starter && current_property->combining_class > max_combining_class) {
|
||||||
/* combination perhaps possible */
|
/* combination perhaps possible */
|
||||||
int32_t hangul_lindex;
|
int32_t hangul_lindex;
|
||||||
@@ -486,7 +486,7 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!starter_property) {
|
if (!starter_property) {
|
||||||
starter_property = utf8proc_get_property(*starter);
|
starter_property = get_property(*starter);
|
||||||
}
|
}
|
||||||
if (starter_property->comb1st_index >= 0 &&
|
if (starter_property->comb1st_index >= 0 &&
|
||||||
current_property->comb2nd_index >= 0) {
|
current_property->comb2nd_index >= 0) {
|
||||||
@@ -495,7 +495,7 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
|
|||||||
current_property->comb2nd_index
|
current_property->comb2nd_index
|
||||||
];
|
];
|
||||||
if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
|
if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
|
||||||
!(utf8proc_get_property(composition)->comp_exclusion))) {
|
!(get_property(composition)->comp_exclusion))) {
|
||||||
*starter = composition;
|
*starter = composition;
|
||||||
starter_property = NULL;
|
starter_property = NULL;
|
||||||
continue;
|
continue;
|
||||||
@@ -528,7 +528,7 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ssize_t utf8proc_map(
|
DLLEXPORT ssize_t utf8proc_map(
|
||||||
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
|
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
|
||||||
) {
|
) {
|
||||||
int32_t *buffer;
|
int32_t *buffer;
|
||||||
@@ -536,7 +536,7 @@ ssize_t utf8proc_map(
|
|||||||
*dstptr = NULL;
|
*dstptr = NULL;
|
||||||
result = utf8proc_decompose(str, strlen, NULL, 0, options);
|
result = utf8proc_decompose(str, strlen, NULL, 0, options);
|
||||||
if (result < 0) return result;
|
if (result < 0) return result;
|
||||||
buffer = malloc(result * sizeof(int32_t) + 1);
|
buffer = (int32_t *) malloc(result * sizeof(int32_t) + 1);
|
||||||
if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
||||||
result = utf8proc_decompose(str, strlen, buffer, result, options);
|
result = utf8proc_decompose(str, strlen, buffer, result, options);
|
||||||
if (result < 0) {
|
if (result < 0) {
|
||||||
@@ -550,38 +550,37 @@ ssize_t utf8proc_map(
|
|||||||
}
|
}
|
||||||
{
|
{
|
||||||
int32_t *newptr;
|
int32_t *newptr;
|
||||||
newptr = realloc(buffer, (size_t)result+1);
|
newptr = (int32_t *) realloc(buffer, (size_t)result+1);
|
||||||
if (newptr) buffer = newptr;
|
if (newptr) buffer = newptr;
|
||||||
}
|
}
|
||||||
*dstptr = (uint8_t *)buffer;
|
*dstptr = (uint8_t *)buffer;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t *utf8proc_NFD(const uint8_t *str) {
|
DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str) {
|
||||||
uint8_t *retval;
|
uint8_t *retval;
|
||||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
UTF8PROC_DECOMPOSE);
|
UTF8PROC_DECOMPOSE);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t *utf8proc_NFC(const uint8_t *str) {
|
DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str) {
|
||||||
uint8_t *retval;
|
uint8_t *retval;
|
||||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
UTF8PROC_COMPOSE);
|
UTF8PROC_COMPOSE);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t *utf8proc_NFKD(const uint8_t *str) {
|
DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str) {
|
||||||
uint8_t *retval;
|
uint8_t *retval;
|
||||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t *utf8proc_NFKC(const uint8_t *str) {
|
DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str) {
|
||||||
uint8_t *retval;
|
uint8_t *retval;
|
||||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||||
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -25,7 +25,7 @@
|
|||||||
* File name: utf8proc.h
|
* File name: utf8proc.h
|
||||||
*
|
*
|
||||||
* Description:
|
* Description:
|
||||||
* Header files for libutf8proc, which is a mapping tool for UTF-8 strings
|
* Header files for utf8proc, which is a mapping tool for UTF-8 strings
|
||||||
* with following features:
|
* with following features:
|
||||||
* - decomposing and composing of strings
|
* - decomposing and composing of strings
|
||||||
* - replacing compatibility characters with their equivalents
|
* - replacing compatibility characters with their equivalents
|
||||||
@@ -44,7 +44,7 @@
|
|||||||
* - rejection of illegal UTF-8 data
|
* - rejection of illegal UTF-8 data
|
||||||
* (i.e. UTF-8 encoded UTF-16 surrogates)
|
* (i.e. UTF-8 encoded UTF-16 surrogates)
|
||||||
* - support for Korean Hangul characters
|
* - support for Korean Hangul characters
|
||||||
* Unicode Version 5.0.0 is supported.
|
* Unicode Version 7.0.0 is supported.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
@@ -60,19 +60,33 @@ typedef unsigned char uint8_t;
|
|||||||
typedef short int16_t;
|
typedef short int16_t;
|
||||||
typedef unsigned short uint16_t;
|
typedef unsigned short uint16_t;
|
||||||
typedef int int32_t;
|
typedef int int32_t;
|
||||||
#ifdef _WIN64
|
# ifdef _WIN64
|
||||||
#define ssize_t __int64
|
# define ssize_t __int64
|
||||||
#else
|
# else
|
||||||
#define ssize_t int
|
# define ssize_t int
|
||||||
#endif
|
# endif
|
||||||
|
# ifndef __cplusplus
|
||||||
typedef unsigned char bool;
|
typedef unsigned char bool;
|
||||||
enum {false, true};
|
enum {false, true};
|
||||||
|
# endif
|
||||||
#else
|
#else
|
||||||
#include <stdbool.h>
|
# include <stdbool.h>
|
||||||
#include <inttypes.h>
|
# include <inttypes.h>
|
||||||
#endif
|
#endif
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
# ifdef UTF8PROC_EXPORTS
|
||||||
|
# define DLLEXPORT __declspec(dllexport)
|
||||||
|
# else
|
||||||
|
# define DLLEXPORT __declspec(dllimport)
|
||||||
|
# endif
|
||||||
|
#elif __GNUC__ >= 4
|
||||||
|
# define DLLEXPORT __attribute__ ((visibility("default")))
|
||||||
|
#else
|
||||||
|
# define DLLEXPORT
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
@@ -100,7 +114,7 @@ extern "C" {
|
|||||||
* Flags being regarded by several functions in the library:
|
* Flags being regarded by several functions in the library:
|
||||||
* NULLTERM: The given UTF-8 input is null-terminated.
|
* NULLTERM: The given UTF-8 input is null-terminated.
|
||||||
* STABLE: Unicode Versioning Stability has to be respected.
|
* STABLE: Unicode Versioning Stability has to be respected.
|
||||||
* COMPAT: Compatiblity decomposition
|
* COMPAT: Compatibility decomposition
|
||||||
* (i.e. formatting information is lost)
|
* (i.e. formatting information is lost)
|
||||||
* COMPOSE: Return a result with composed characters.
|
* COMPOSE: Return a result with composed characters.
|
||||||
* DECOMPOSE: Return a result with decomposed characters.
|
* DECOMPOSE: Return a result with decomposed characters.
|
||||||
@@ -126,7 +140,7 @@ extern "C" {
|
|||||||
* is representing a single grapheme cluster (see UAX#29).
|
* is representing a single grapheme cluster (see UAX#29).
|
||||||
* LUMP: Lumps certain characters together
|
* LUMP: Lumps certain characters together
|
||||||
* (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
|
* (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
|
||||||
* (See lump.txt for details.)
|
* (See lump.md for details.)
|
||||||
* If NLF2LF is set, this includes a transformation of
|
* If NLF2LF is set, this includes a transformation of
|
||||||
* paragraph and line separators to ASCII line-feed (LF).
|
* paragraph and line separators to ASCII line-feed (LF).
|
||||||
* STRIPMARK: Strips all character markings
|
* STRIPMARK: Strips all character markings
|
||||||
@@ -156,19 +170,21 @@ typedef struct utf8proc_property_struct {
|
|||||||
utf8proc_propval_t bidi_class;
|
utf8proc_propval_t bidi_class;
|
||||||
utf8proc_propval_t decomp_type;
|
utf8proc_propval_t decomp_type;
|
||||||
const int32_t *decomp_mapping;
|
const int32_t *decomp_mapping;
|
||||||
unsigned bidi_mirrored:1;
|
const int32_t *casefold_mapping;
|
||||||
int32_t uppercase_mapping;
|
int32_t uppercase_mapping;
|
||||||
int32_t lowercase_mapping;
|
int32_t lowercase_mapping;
|
||||||
int32_t titlecase_mapping;
|
int32_t titlecase_mapping;
|
||||||
int32_t comb1st_index;
|
int32_t comb1st_index;
|
||||||
int32_t comb2nd_index;
|
int32_t comb2nd_index;
|
||||||
|
unsigned bidi_mirrored:1;
|
||||||
unsigned comp_exclusion:1;
|
unsigned comp_exclusion:1;
|
||||||
unsigned ignorable:1;
|
unsigned ignorable:1;
|
||||||
unsigned control_boundary:1;
|
unsigned control_boundary:1;
|
||||||
unsigned extend:1;
|
unsigned boundclass:4;
|
||||||
const int32_t *casefold_mapping;
|
unsigned charwidth:2;
|
||||||
} utf8proc_property_t;
|
} utf8proc_property_t;
|
||||||
|
|
||||||
|
#define UTF8PROC_CATEGORY_CN 0
|
||||||
#define UTF8PROC_CATEGORY_LU 1
|
#define UTF8PROC_CATEGORY_LU 1
|
||||||
#define UTF8PROC_CATEGORY_LL 2
|
#define UTF8PROC_CATEGORY_LL 2
|
||||||
#define UTF8PROC_CATEGORY_LT 3
|
#define UTF8PROC_CATEGORY_LT 3
|
||||||
@@ -198,7 +214,6 @@ typedef struct utf8proc_property_struct {
|
|||||||
#define UTF8PROC_CATEGORY_CF 27
|
#define UTF8PROC_CATEGORY_CF 27
|
||||||
#define UTF8PROC_CATEGORY_CS 28
|
#define UTF8PROC_CATEGORY_CS 28
|
||||||
#define UTF8PROC_CATEGORY_CO 29
|
#define UTF8PROC_CATEGORY_CO 29
|
||||||
#define UTF8PROC_CATEGORY_CN 30
|
|
||||||
#define UTF8PROC_BIDI_CLASS_L 1
|
#define UTF8PROC_BIDI_CLASS_L 1
|
||||||
#define UTF8PROC_BIDI_CLASS_LRE 2
|
#define UTF8PROC_BIDI_CLASS_LRE 2
|
||||||
#define UTF8PROC_BIDI_CLASS_LRO 3
|
#define UTF8PROC_BIDI_CLASS_LRO 3
|
||||||
@@ -218,6 +233,10 @@ typedef struct utf8proc_property_struct {
|
|||||||
#define UTF8PROC_BIDI_CLASS_S 17
|
#define UTF8PROC_BIDI_CLASS_S 17
|
||||||
#define UTF8PROC_BIDI_CLASS_WS 18
|
#define UTF8PROC_BIDI_CLASS_WS 18
|
||||||
#define UTF8PROC_BIDI_CLASS_ON 19
|
#define UTF8PROC_BIDI_CLASS_ON 19
|
||||||
|
#define UTF8PROC_BIDI_CLASS_LRI 20 /* new in Unicode 6.3 */
|
||||||
|
#define UTF8PROC_BIDI_CLASS_RLI 21 /* new in Unicode 6.3 */
|
||||||
|
#define UTF8PROC_BIDI_CLASS_FSI 22 /* new in Unicode 6.3 */
|
||||||
|
#define UTF8PROC_BIDI_CLASS_PDI 23 /* new in Unicode 6.3 */
|
||||||
#define UTF8PROC_DECOMP_TYPE_FONT 1
|
#define UTF8PROC_DECOMP_TYPE_FONT 1
|
||||||
#define UTF8PROC_DECOMP_TYPE_NOBREAK 2
|
#define UTF8PROC_DECOMP_TYPE_NOBREAK 2
|
||||||
#define UTF8PROC_DECOMP_TYPE_INITIAL 3
|
#define UTF8PROC_DECOMP_TYPE_INITIAL 3
|
||||||
@@ -235,16 +254,31 @@ typedef struct utf8proc_property_struct {
|
|||||||
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
|
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
|
||||||
#define UTF8PROC_DECOMP_TYPE_COMPAT 16
|
#define UTF8PROC_DECOMP_TYPE_COMPAT 16
|
||||||
|
|
||||||
extern const int8_t utf8proc_utf8class[256];
|
/* values for boundclass property: */
|
||||||
|
#define UTF8PROC_BOUNDCLASS_START 0
|
||||||
|
#define UTF8PROC_BOUNDCLASS_OTHER 1
|
||||||
|
#define UTF8PROC_BOUNDCLASS_CR 2
|
||||||
|
#define UTF8PROC_BOUNDCLASS_LF 3
|
||||||
|
#define UTF8PROC_BOUNDCLASS_CONTROL 4
|
||||||
|
#define UTF8PROC_BOUNDCLASS_EXTEND 5
|
||||||
|
#define UTF8PROC_BOUNDCLASS_L 6
|
||||||
|
#define UTF8PROC_BOUNDCLASS_V 7
|
||||||
|
#define UTF8PROC_BOUNDCLASS_T 8
|
||||||
|
#define UTF8PROC_BOUNDCLASS_LV 9
|
||||||
|
#define UTF8PROC_BOUNDCLASS_LVT 10
|
||||||
|
#define UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 11
|
||||||
|
#define UTF8PROC_BOUNDCLASS_SPACINGMARK 12
|
||||||
|
|
||||||
const char *utf8proc_version(void);
|
DLLEXPORT extern const int8_t utf8proc_utf8class[256];
|
||||||
|
|
||||||
const char *utf8proc_errmsg(ssize_t errcode);
|
DLLEXPORT const char *utf8proc_version(void);
|
||||||
|
|
||||||
|
DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode);
|
||||||
/*
|
/*
|
||||||
* Returns a static error string for the given error code.
|
* Returns a static error string for the given error code.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
|
DLLEXPORT ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
|
||||||
/*
|
/*
|
||||||
* Reads a single char from the UTF-8 sequence being pointed to by 'str'.
|
* Reads a single char from the UTF-8 sequence being pointed to by 'str'.
|
||||||
* The maximum number of bytes read is 'strlen', unless 'strlen' is
|
* The maximum number of bytes read is 'strlen', unless 'strlen' is
|
||||||
@@ -255,12 +289,12 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
|
|||||||
* negative error code is returned.
|
* negative error code is returned.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
bool utf8proc_codepoint_valid(int32_t uc);
|
DLLEXPORT bool utf8proc_codepoint_valid(int32_t uc);
|
||||||
/*
|
/*
|
||||||
* Returns 1, if the given unicode code-point is valid, otherwise 0.
|
* Returns 1, if the given unicode code-point is valid, otherwise 0.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
|
DLLEXPORT ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
|
||||||
/*
|
/*
|
||||||
* Encodes the Unicode char with the code point 'uc' as a UTF-8 string in
|
* Encodes the Unicode char with the code point 'uc' as a UTF-8 string in
|
||||||
* the byte array being pointed to by 'dst'. This array has to be at least
|
* the byte array being pointed to by 'dst'. This array has to be at least
|
||||||
@@ -270,17 +304,15 @@ ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
|
|||||||
* This function does not check if 'uc' is a valid unicode code point.
|
* This function does not check if 'uc' is a valid unicode code point.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const utf8proc_property_t *utf8proc_get_property(int32_t uc);
|
DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc);
|
||||||
/*
|
/*
|
||||||
* Returns a pointer to a (constant) struct containing information about
|
* Returns a pointer to a (constant) struct containing information about
|
||||||
* the unicode char with the given code point 'uc'.
|
* the unicode char with the given code point 'uc'.
|
||||||
* If the character does not exist, a pointer to a special struct is
|
* If the character does not exist, a pointer to a special struct is
|
||||||
* returned, where 'category' is a NULL pointer.
|
* returned, where 'category' is 0 (UTF8PROC_CATEGORY_CN).
|
||||||
* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
|
|
||||||
* 0x10FFFF, otherwise the program might crash!
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ssize_t utf8proc_decompose_char(
|
DLLEXPORT ssize_t utf8proc_decompose_char(
|
||||||
int32_t uc, int32_t *dst, ssize_t bufsize,
|
int32_t uc, int32_t *dst, ssize_t bufsize,
|
||||||
int options, int *last_boundclass
|
int options, int *last_boundclass
|
||||||
);
|
);
|
||||||
@@ -304,11 +336,9 @@ ssize_t utf8proc_decompose_char(
|
|||||||
* If the number of written chars would be bigger than 'bufsize',
|
* If the number of written chars would be bigger than 'bufsize',
|
||||||
* the buffer (up to 'bufsize') has unpredictable data, and the needed
|
* the buffer (up to 'bufsize') has unpredictable data, and the needed
|
||||||
* buffer size is returned.
|
* buffer size is returned.
|
||||||
* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
|
|
||||||
* 0x10FFFF, otherwise the program might crash!
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ssize_t utf8proc_decompose(
|
DLLEXPORT ssize_t utf8proc_decompose(
|
||||||
const uint8_t *str, ssize_t strlen,
|
const uint8_t *str, ssize_t strlen,
|
||||||
int32_t *buffer, ssize_t bufsize, int options
|
int32_t *buffer, ssize_t bufsize, int options
|
||||||
);
|
);
|
||||||
@@ -326,7 +356,7 @@ ssize_t utf8proc_decompose(
|
|||||||
* buffer size is returned.
|
* buffer size is returned.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
|
DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
|
||||||
/*
|
/*
|
||||||
* Reencodes the sequence of unicode characters given by the pointer
|
* Reencodes the sequence of unicode characters given by the pointer
|
||||||
* 'buffer' and 'length' as UTF-8.
|
* 'buffer' and 'length' as UTF-8.
|
||||||
@@ -349,7 +379,28 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
|
|||||||
* crash!
|
* crash!
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ssize_t utf8proc_map(
|
DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
|
||||||
|
/*
|
||||||
|
* Given a pair of consecutive codepoints (c1,c2), return whether a grapheme break is
|
||||||
|
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||||
|
*/
|
||||||
|
|
||||||
|
DLLEXPORT int utf8proc_charwidth(int32_t c);
|
||||||
|
/* Given a codepoint c, return a character width analogous to wcwidth(c),
|
||||||
|
except that a width of 0 is returned for non-printable characters
|
||||||
|
instead of -1 as in wcwidth.
|
||||||
|
If you want to check for particular types of non-printable characters,
|
||||||
|
(analogous to isprint or iscntrl), use utf8proc_category(c). */
|
||||||
|
|
||||||
|
DLLEXPORT int utf8proc_category(int32_t c);
|
||||||
|
/* Return the Unicode character category for c (one of the
|
||||||
|
UTF8PROC_CATEGORY_* constants.) */
|
||||||
|
|
||||||
|
DLLEXPORT const char *utf8proc_category_string(int32_t c);
|
||||||
|
/* Return the two-letter (nul-terminated) Unicode category string for
|
||||||
|
c (e.g. "Lu" or "Co"). */
|
||||||
|
|
||||||
|
DLLEXPORT ssize_t utf8proc_map(
|
||||||
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
|
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
|
||||||
);
|
);
|
||||||
/*
|
/*
|
||||||
@@ -368,10 +419,10 @@ ssize_t utf8proc_map(
|
|||||||
* 'malloc', and therefore has to be freed with 'free'.
|
* 'malloc', and therefore has to be freed with 'free'.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
uint8_t *utf8proc_NFD(const uint8_t *str);
|
DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str);
|
||||||
uint8_t *utf8proc_NFC(const uint8_t *str);
|
DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str);
|
||||||
uint8_t *utf8proc_NFKD(const uint8_t *str);
|
DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str);
|
||||||
uint8_t *utf8proc_NFKC(const uint8_t *str);
|
DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str);
|
||||||
/*
|
/*
|
||||||
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
||||||
* normalized version of the null-terminated string 'str'.
|
* normalized version of the null-terminated string 'str'.
|
||||||
@@ -381,5 +432,4 @@ uint8_t *utf8proc_NFKC(const uint8_t *str);
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user