[unicode] Upgrading to JuliaLang's utf8proc (Unicode 7, maintained)

2015-03-17 12:20:08 -04:00
parent 621b25c964
commit f787851754
3 changed files with 13170 additions and 10233 deletions
--- a/src/utf8proc/utf8proc.c
+++ b/src/utf8proc/utf8proc.c
@@ -43,7 +43,7 @@
 #include "utf8proc_data.c"


-const int8_t utf8proc_utf8class[256] = {
+DLLEXPORT const int8_t utf8proc_utf8class[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -81,25 +81,14 @@ const int8_t utf8proc_utf8class[256] = {
 #define UTF8PROC_HANGUL_S_START  0xAC00
 #define UTF8PROC_HANGUL_S_END    0xD7A4

-
-#define UTF8PROC_BOUNDCLASS_START    0
-#define UTF8PROC_BOUNDCLASS_OTHER    1
-#define UTF8PROC_BOUNDCLASS_CR       2
-#define UTF8PROC_BOUNDCLASS_LF       3
-#define UTF8PROC_BOUNDCLASS_CONTROL  4
-#define UTF8PROC_BOUNDCLASS_EXTEND   5
-#define UTF8PROC_BOUNDCLASS_L        6
-#define UTF8PROC_BOUNDCLASS_V        7
-#define UTF8PROC_BOUNDCLASS_T        8
-#define UTF8PROC_BOUNDCLASS_LV       9
-#define UTF8PROC_BOUNDCLASS_LVT     10
-
-
-const char *utf8proc_version(void) {
-  return "1.1.6";
+/* Should follow semantic-versioning rules (semver.org) based on API
+   compatibility.  (Note that the shared-library version number will
+   be different, being based on ABI compatibility.): */
+DLLEXPORT const char *utf8proc_version(void) {
+  return "1.2-dev";
 }

-const char *utf8proc_errmsg(ssize_t errcode) {
+DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode) {
  switch (errcode) {
    case UTF8PROC_ERROR_NOMEM:
    return "Memory for processing UTF-8 data could not be allocated.";
@@ -112,11 +101,11 @@ const char *utf8proc_errmsg(ssize_t errcode) {
    case UTF8PROC_ERROR_INVALIDOPTS:
    return "Invalid options for UTF-8 processing chosen.";
    default:
-    return "An unknown error occured while processing UTF-8 data.";
+    return "An unknown error occurred while processing UTF-8 data.";
  }
 }

-ssize_t utf8proc_iterate(
+DLLEXPORT ssize_t utf8proc_iterate(
  const uint8_t *str, ssize_t strlen, int32_t *dst
 ) {
  int length;
@@ -156,14 +145,14 @@ ssize_t utf8proc_iterate(
  return length;
 }

-bool utf8proc_codepoint_valid(int32_t uc) {
+DLLEXPORT bool utf8proc_codepoint_valid(int32_t uc) {
  if (uc < 0 || uc >= 0x110000 ||
    ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
    (uc >= 0xFDD0 && uc < 0xFDF0)) return false;
  else return true;
 }

-ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
+DLLEXPORT ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00) {
    return 0;
  } else if (uc < 0x80) {
@@ -193,7 +182,8 @@ ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  } else return 0;
 }

-const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
+/* internal "unsafe" version that does not check whether uc is in range */
+static const utf8proc_property_t *get_property(int32_t uc) {
  /* ASSERT: uc >= 0 && uc < 0x110000 */
  return utf8proc_properties + (
    utf8proc_stage2table[
@@ -202,17 +192,67 @@ const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
  );
 }

+DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
+     return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc);
+}
+
+/* return whether there is a grapheme break between boundclasses lbc and tbc */
+static bool grapheme_break(int lbc, int tbc) {
+     return 
+          (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
+          (lbc == UTF8PROC_BOUNDCLASS_CR &&
+           tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
+          (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
+          (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
+          (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
+          (lbc == UTF8PROC_BOUNDCLASS_L &&
+           (tbc == UTF8PROC_BOUNDCLASS_L ||
+            tbc == UTF8PROC_BOUNDCLASS_V ||
+            tbc == UTF8PROC_BOUNDCLASS_LV ||
+            tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
+          ((lbc == UTF8PROC_BOUNDCLASS_LV ||
+            lbc == UTF8PROC_BOUNDCLASS_V) &&
+           (tbc == UTF8PROC_BOUNDCLASS_V ||
+            tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
+          ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
+            lbc == UTF8PROC_BOUNDCLASS_T) &&
+           tbc == UTF8PROC_BOUNDCLASS_T) ? false :
+          (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
+           tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
+          (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
+}
+
+/* return whether there is a grapheme break between codepoints c1 and c2 */
+DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
+     return grapheme_break(utf8proc_get_property(c1)->boundclass,
+                           utf8proc_get_property(c2)->boundclass);
+}
+
+/* return a character width analogous to wcwidth (except portable and
+   hopefully less buggy than most system wcwidth functions). */
+DLLEXPORT int utf8proc_charwidth(int32_t c) {
+     return utf8proc_get_property(c)->charwidth;
+}
+
+DLLEXPORT int utf8proc_category(int32_t c) {
+     return utf8proc_get_property(c)->category;
+}
+
+DLLEXPORT const char *utf8proc_category_string(int32_t c) {
+     static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
+     return s[utf8proc_category(c)];
+}
+
 #define utf8proc_decompose_lump(replacement_uc) \
  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
  options & ~UTF8PROC_LUMP, last_boundclass)

-ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
-    int options, int *last_boundclass) {
-  /* ASSERT: uc >= 0 && uc < 0x110000 */
+DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, int options, int *last_boundclass) {
  const utf8proc_property_t *property;
  utf8proc_propval_t category;
  int32_t hangul_sindex;
-  property = utf8proc_get_property(uc);
+  if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
+  property = get_property(uc);
  category = property->category;
  hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
@@ -298,48 +338,8 @@ ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
  }
  if (options & UTF8PROC_CHARBOUND) {
    bool boundary;
-    int tbc, lbc;
-    tbc =
-      (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
-      (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
-      ((category == UTF8PROC_CATEGORY_ZL ||
-        category == UTF8PROC_CATEGORY_ZP ||
-        category == UTF8PROC_CATEGORY_CC ||
-        category == UTF8PROC_CATEGORY_CF) &&
-        !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
-      property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
-      ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
-        uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
-      (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
-        UTF8PROC_BOUNDCLASS_V :
-      (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
-        UTF8PROC_BOUNDCLASS_T :
-      (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
-        ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
-          UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
-      ) :
-      UTF8PROC_BOUNDCLASS_OTHER;
-    lbc = *last_boundclass;
-    boundary =
-      (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
-      (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
-      (lbc == UTF8PROC_BOUNDCLASS_CR &&
-       tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
-      (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-      (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-      (lbc == UTF8PROC_BOUNDCLASS_L &&
-       (tbc == UTF8PROC_BOUNDCLASS_L ||
-        tbc == UTF8PROC_BOUNDCLASS_V ||
-        tbc == UTF8PROC_BOUNDCLASS_LV ||
-        tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
-      ((lbc == UTF8PROC_BOUNDCLASS_LV ||
-        lbc == UTF8PROC_BOUNDCLASS_V) &&
-       (tbc == UTF8PROC_BOUNDCLASS_V ||
-        tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
-      ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
-        lbc == UTF8PROC_BOUNDCLASS_T) &&
-       tbc == UTF8PROC_BOUNDCLASS_T) ? false :
-       true;
+    int tbc = property->boundclass;
+    boundary = grapheme_break(*last_boundclass, tbc);
    *last_boundclass = tbc;
    if (boundary) {
      if (bufsize >= 1) dst[0] = 0xFFFF;
@@ -351,7 +351,7 @@ ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
  return 1;
 }

-ssize_t utf8proc_decompose(
+DLLEXPORT ssize_t utf8proc_decompose(
  const uint8_t *str, ssize_t strlen,
  int32_t *buffer, ssize_t bufsize, int options
 ) {
@@ -370,7 +370,7 @@ ssize_t utf8proc_decompose(
    while (1) {
      if (options & UTF8PROC_NULLTERM) {
        rpos += utf8proc_iterate(str + rpos, -1, &uc);
-        /* checking of return value is not neccessary,
+        /* checking of return value is not necessary,
           as 'uc' is < 0 in case of error */
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
        if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
@@ -398,8 +398,8 @@ ssize_t utf8proc_decompose(
      const utf8proc_property_t *property1, *property2;
      uc1 = buffer[pos];
      uc2 = buffer[pos+1];
-      property1 = utf8proc_get_property(uc1);
-      property2 = utf8proc_get_property(uc2);
+      property1 = get_property(uc1);
+      property2 = get_property(uc2);
      if (property1->combining_class > property2->combining_class &&
          property2->combining_class > 0) {
        buffer[pos] = uc2;
@@ -413,7 +413,7 @@ ssize_t utf8proc_decompose(
  return wpos;
 }

-ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
+DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
     ASSERT: 'buffer' has one spare byte of free space at the end! */
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
@@ -457,7 +457,7 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
    int32_t composition;
    for (rpos = 0; rpos < length; rpos++) {
      current_char = buffer[rpos];
-      current_property = utf8proc_get_property(current_char);
+      current_property = get_property(current_char);
      if (starter && current_property->combining_class > max_combining_class) {
        /* combination perhaps possible */
        int32_t hangul_lindex;
@@ -486,7 +486,7 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
          }
        }
        if (!starter_property) {
-          starter_property = utf8proc_get_property(*starter);
+          starter_property = get_property(*starter);
        }
        if (starter_property->comb1st_index >= 0 &&
            current_property->comb2nd_index >= 0) {
@@ -495,7 +495,7 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
            current_property->comb2nd_index
          ];
          if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
-              !(utf8proc_get_property(composition)->comp_exclusion))) {
+              !(get_property(composition)->comp_exclusion))) {
            *starter = composition;
            starter_property = NULL;
            continue;
@@ -528,7 +528,7 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
  }
 }

-ssize_t utf8proc_map(
+DLLEXPORT ssize_t utf8proc_map(
  const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
 ) {
  int32_t *buffer;
@@ -536,7 +536,7 @@ ssize_t utf8proc_map(
  *dstptr = NULL;
  result = utf8proc_decompose(str, strlen, NULL, 0, options);
  if (result < 0) return result;
-  buffer = malloc(result * sizeof(int32_t) + 1);
+  buffer = (int32_t *) malloc(result * sizeof(int32_t) + 1);
  if (!buffer) return UTF8PROC_ERROR_NOMEM;
  result = utf8proc_decompose(str, strlen, buffer, result, options);
  if (result < 0) {
@@ -550,38 +550,37 @@ ssize_t utf8proc_map(
  }
  {
    int32_t *newptr;
-    newptr = realloc(buffer, (size_t)result+1);
+    newptr = (int32_t *) realloc(buffer, (size_t)result+1);
    if (newptr) buffer = newptr;
  }
  *dstptr = (uint8_t *)buffer;
  return result;
 }

-uint8_t *utf8proc_NFD(const uint8_t *str) {
+DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str) {
  uint8_t *retval;
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_DECOMPOSE);
  return retval;
 }

-uint8_t *utf8proc_NFC(const uint8_t *str) {
+DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str) {
  uint8_t *retval;
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE);
  return retval;
 }

-uint8_t *utf8proc_NFKD(const uint8_t *str) {
+DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str) {
  uint8_t *retval;
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
  return retval;
 }

-uint8_t *utf8proc_NFKC(const uint8_t *str) {
+DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str) {
  uint8_t *retval;
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  return retval;
 }
-
--- a/src/utf8proc/utf8proc.h
+++ b/src/utf8proc/utf8proc.h
@@ -25,7 +25,7 @@
 *  File name:    utf8proc.h
 *
 *  Description:
- *  Header files for libutf8proc, which is a mapping tool for UTF-8 strings
+ *  Header files for utf8proc, which is a mapping tool for UTF-8 strings
 *  with following features:
 *  - decomposing and composing of strings
 *  - replacing compatibility characters with their equivalents
@@ -44,7 +44,7 @@
 *  - rejection of illegal UTF-8 data
 *    (i.e. UTF-8 encoded UTF-16 surrogates)
 *  - support for korean hangul characters
- *  Unicode Version 5.0.0 is supported.
+ *  Unicode Version 7.0.0 is supported.
 */


@@ -60,19 +60,33 @@ typedef unsigned char uint8_t;
 typedef short int16_t;
 typedef unsigned short uint16_t;
 typedef int int32_t;
-#ifdef _WIN64
-#define ssize_t __int64
-#else
-#define ssize_t int
-#endif
+#  ifdef _WIN64
+#    define ssize_t __int64
+#  else
+#    define ssize_t int
+#  endif
+#  ifndef __cplusplus
 typedef unsigned char bool;
 enum {false, true};
+#  endif
 #else
-#include <stdbool.h>
-#include <inttypes.h>
+#  include <stdbool.h>
+#  include <inttypes.h>
 #endif
 #include <limits.h>

+#ifdef _WIN32
+#  ifdef UTF8PROC_EXPORTS
+#    define DLLEXPORT __declspec(dllexport)
+#  else
+#    define DLLEXPORT __declspec(dllimport)
+#  endif
+#elif __GNUC__ >= 4
+#  define DLLEXPORT __attribute__ ((visibility("default")))
+#else
+#  define DLLEXPORT
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -100,7 +114,7 @@ extern "C" {
 *  Flags being regarded by several functions in the library:
 *  NULLTERM:  The given UTF-8 input is NULL terminated.
 *  STABLE:    Unicode Versioning Stability has to be respected.
- *  COMPAT:    Compatiblity decomposition
+ *  COMPAT:    Compatibility decomposition
 *             (i.e. formatting information is lost)
 *  COMPOSE:   Return a result with composed characters.
 *  DECOMPOSE: Return a result with decomposed characters.
@@ -126,7 +140,7 @@ extern "C" {
 *             is representing a single grapheme cluster (see UAX#29).
 *  LUMP:      Lumps certain characters together
 *             (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
- *             (See lump.txt for details.)
+ *             (See lump.md for details.)
 *             If NLF2LF is set, this includes a transformation of
 *             paragraph and line separators to ASCII line-feed (LF).
 *  STRIPMARK: Strips all character markings
@@ -156,19 +170,21 @@ typedef struct utf8proc_property_struct {
  utf8proc_propval_t bidi_class;
  utf8proc_propval_t decomp_type;
  const int32_t *decomp_mapping;
-  unsigned bidi_mirrored:1;
+  const int32_t *casefold_mapping;
  int32_t uppercase_mapping;
  int32_t lowercase_mapping;
  int32_t titlecase_mapping;
  int32_t comb1st_index;
  int32_t comb2nd_index;
+  unsigned bidi_mirrored:1;
  unsigned comp_exclusion:1;
  unsigned ignorable:1;
  unsigned control_boundary:1;
-  unsigned extend:1;
-  const int32_t *casefold_mapping;
+  unsigned boundclass:4;
+  unsigned charwidth:2;
 } utf8proc_property_t;

+#define UTF8PROC_CATEGORY_CN  0
 #define UTF8PROC_CATEGORY_LU  1
 #define UTF8PROC_CATEGORY_LL  2
 #define UTF8PROC_CATEGORY_LT  3
@@ -198,7 +214,6 @@ typedef struct utf8proc_property_struct {
 #define UTF8PROC_CATEGORY_CF 27
 #define UTF8PROC_CATEGORY_CS 28
 #define UTF8PROC_CATEGORY_CO 29
-#define UTF8PROC_CATEGORY_CN 30
 #define UTF8PROC_BIDI_CLASS_L    1
 #define UTF8PROC_BIDI_CLASS_LRE  2
 #define UTF8PROC_BIDI_CLASS_LRO  3
@@ -218,6 +233,10 @@ typedef struct utf8proc_property_struct {
 #define UTF8PROC_BIDI_CLASS_S   17
 #define UTF8PROC_BIDI_CLASS_WS  18
 #define UTF8PROC_BIDI_CLASS_ON  19
+#define UTF8PROC_BIDI_CLASS_LRI  20 /* new in Unicode 6.3 */
+#define UTF8PROC_BIDI_CLASS_RLI  21 /* new in Unicode 6.3 */
+#define UTF8PROC_BIDI_CLASS_FSI  22 /* new in Unicode 6.3 */
+#define UTF8PROC_BIDI_CLASS_PDI  23 /* new in Unicode 6.3 */
 #define UTF8PROC_DECOMP_TYPE_FONT      1
 #define UTF8PROC_DECOMP_TYPE_NOBREAK   2
 #define UTF8PROC_DECOMP_TYPE_INITIAL   3
@@ -235,16 +254,31 @@ typedef struct utf8proc_property_struct {
 #define UTF8PROC_DECOMP_TYPE_FRACTION 15
 #define UTF8PROC_DECOMP_TYPE_COMPAT   16

-extern const int8_t utf8proc_utf8class[256];
+/* values for boundclass property: */
+#define UTF8PROC_BOUNDCLASS_START    0
+#define UTF8PROC_BOUNDCLASS_OTHER    1
+#define UTF8PROC_BOUNDCLASS_CR       2
+#define UTF8PROC_BOUNDCLASS_LF       3
+#define UTF8PROC_BOUNDCLASS_CONTROL  4
+#define UTF8PROC_BOUNDCLASS_EXTEND   5
+#define UTF8PROC_BOUNDCLASS_L        6
+#define UTF8PROC_BOUNDCLASS_V        7
+#define UTF8PROC_BOUNDCLASS_T        8
+#define UTF8PROC_BOUNDCLASS_LV       9
+#define UTF8PROC_BOUNDCLASS_LVT     10
+#define UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 11
+#define UTF8PROC_BOUNDCLASS_SPACINGMARK 12

-const char *utf8proc_version(void);
+DLLEXPORT extern const int8_t utf8proc_utf8class[256];

-const char *utf8proc_errmsg(ssize_t errcode);
+DLLEXPORT const char *utf8proc_version(void);
+
+DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode);
 /*
 *  Returns a static error string for the given error code.
 */

-ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
+DLLEXPORT ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
 /*
 *  Reads a single char from the UTF-8 sequence being pointed to by 'str'.
 *  The maximum number of bytes read is 'strlen', unless 'strlen' is
@@ -255,12 +289,12 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
 *  negative error code is returned.
 */

-bool utf8proc_codepoint_valid(int32_t uc);
+DLLEXPORT bool utf8proc_codepoint_valid(int32_t uc);
 /*
 *  Returns 1, if the given unicode code-point is valid, otherwise 0.
 */

-ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
+DLLEXPORT ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
 /*
 *  Encodes the unicode char with the code point 'uc' as an UTF-8 string in
 *  the byte array being pointed to by 'dst'. This array has to be at least
@@ -270,17 +304,15 @@ ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
 *  This function does not check if 'uc' is a valid unicode code point.
 */

-const utf8proc_property_t *utf8proc_get_property(int32_t uc);
+DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc);
 /*
 *  Returns a pointer to a (constant) struct containing information about
 *  the unicode char with the given code point 'uc'.
 *  If the character is not existent a pointer to a special struct is
- *  returned, where 'category' is a NULL pointer.
- *  WARNING: The parameter 'uc' has to be in the range of 0x0000 to
- *           0x10FFFF, otherwise the program might crash!
+ *  returned, where 'category' is 0 (UTF8PROC_CATEGORY_CN).
 */

-ssize_t utf8proc_decompose_char(
+DLLEXPORT ssize_t utf8proc_decompose_char(
  int32_t uc, int32_t *dst, ssize_t bufsize,
  int options, int *last_boundclass
 );
@@ -304,11 +336,9 @@ ssize_t utf8proc_decompose_char(
 *  If the number of written chars would be bigger than 'bufsize',
 *  the buffer (up to 'bufsize') has inpredictable data, and the needed
 *  buffer size is returned.
- *  WARNING: The parameter 'uc' has to be in the range of 0x0000 to
- *           0x10FFFF, otherwise the program might crash!
 */

-ssize_t utf8proc_decompose(
+DLLEXPORT ssize_t utf8proc_decompose(
  const uint8_t *str, ssize_t strlen,
  int32_t *buffer, ssize_t bufsize, int options
 );
@@ -326,7 +356,7 @@ ssize_t utf8proc_decompose(
 *  buffer size is returned.
 */

-ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
+DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
 /*
 *  Reencodes the sequence of unicode characters given by the pointer
 *  'buffer' and 'length' as UTF-8.
@@ -349,7 +379,28 @@ ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
 *           crash!
 */

-ssize_t utf8proc_map(
+DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
+/*
+ * Given a pair of consecutive codepoints (c1,c2), return whether a grapheme break is
+ * permitted between them (as defined by the extended grapheme clusters in UAX#29).
+ */
+
+DLLEXPORT int utf8proc_charwidth(int32_t c);
+/* Given a codepoint c, return a character width analogous to wcwidth(c),
+   except that a width of 0 is returned for non-printable characters
+   instead of -1 as in wcwidth.
+   If you want to check for particular types of non-printable characters,
+   (analogous to isprint or iscntrl), use utf8proc_category(c). */
+
+DLLEXPORT int utf8proc_category(int32_t c);
+/* Return the Unicode character category for c (one of the
+   UTF8PROC_CATEGORY_* constants.) */
+
+DLLEXPORT const char *utf8proc_category_string(int32_t c);
+/* Return the two-letter (nul-terminated) Unicode category string for
+   c (e.g. "Lu" or "Co"). */
+
+DLLEXPORT ssize_t utf8proc_map(
  const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
 );
 /*
@@ -368,10 +419,10 @@ ssize_t utf8proc_map(
 *          'malloc', and has theirfore to be freed with 'free'.
 */

-uint8_t *utf8proc_NFD(const uint8_t *str);
-uint8_t *utf8proc_NFC(const uint8_t *str);
-uint8_t *utf8proc_NFKD(const uint8_t *str);
-uint8_t *utf8proc_NFKC(const uint8_t *str);
+DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str);
+DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str);
+DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str);
+DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str);
 /*
 *  Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
 *  normalized version of the null-terminated string 'str'.
@@ -382,4 +433,3 @@ uint8_t *utf8proc_NFKC(const uint8_t *str);
 #endif

 #endif
-
--- a/src/utf8proc/utf8proc_data.c
+++ b/src/utf8proc/utf8proc_data.c