[strings] adding latest utf8proc, new functions for utf8_lower (instead of case folding) and utf8_upper, and a utf8_is_whitespace that takes things like tabs into account

2016-12-31 00:52:12 -05:00
parent db16e656ca
commit 8978000320
4 changed files with 13218 additions and 15322 deletions
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -174,14 +174,63 @@ error_free_output:
    return NULL;
 }
-char *utf8_lower(const char *s) {
+char *utf8_case(const char *s, casing_option_t casing, utf8proc_option_t options) {
    ssize_t len = (ssize_t)strlen(s);
-    uint8_t *dest;
+    utf8proc_uint8_t *str = (utf8proc_uint8_t *)s;
    utf8proc_uint8_t *dest = NULL;
-    utf8proc_map((const uint8_t *)s, len, &dest, UTF8PROC_OPTIONS_LOWERCASE);
+    utf8proc_ssize_t result;
-    return (char *)dest;
+    result = utf8proc_decompose(str, len, NULL, 0, options);
    if (result < 0) return NULL;
    utf8proc_int32_t *buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
    if (buffer == NULL) return NULL;
    result = utf8proc_decompose(str, len, buffer, result, options);
    if (result < 0) {
        free(buffer);
        return NULL;
    }
    for (utf8proc_ssize_t i = 0; i < result; i++) {
        utf8proc_int32_t uc = buffer[i];
        utf8proc_int32_t norm;
        if (casing == UTF8_LOWER) {
            norm = utf8proc_tolower(uc);
        } else if (casing == UTF8_UPPER) {
            norm = utf8proc_toupper(uc);
        }
    }
    result = utf8proc_reencode(buffer, result, options);
    utf8proc_int32_t *newptr;
    newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
    if (newptr) buffer = newptr;
    free(buffer);
    return (char *)buffer;
 }
 /* Lowercases `s` through utf8_case, forwarding the caller-supplied utf8proc options. */
 inline char *utf8_lower_options(const char *s, utf8proc_option_t options) {
    char *lowered = utf8_case(s, UTF8_LOWER, options);
    return lowered;
 }
 /* Lowercases `s` through utf8_case using the UTF8PROC_OPTIONS_NFC option set. */
 inline char *utf8_lower(const char *s) {
    char *lowered = utf8_case(s, UTF8_LOWER, UTF8PROC_OPTIONS_NFC);
    return lowered;
 }
 /* Uppercases `s` through utf8_case, forwarding the caller-supplied utf8proc options. */
 inline char *utf8_upper_options(const char *s, utf8proc_option_t options) {
    char *uppered = utf8_case(s, UTF8_UPPER, options);
    return uppered;
 }
 /* Uppercases `s` through utf8_case using the UTF8PROC_OPTIONS_NFC option set. */
 inline char *utf8_upper(const char *s) {
    char *uppered = utf8_case(s, UTF8_UPPER, UTF8PROC_OPTIONS_NFC);
    return uppered;
 }
 inline bool utf8_is_letter(int cat) {
    return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU        \
            || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO    \
@@ -219,6 +268,18 @@ inline bool utf8_is_separator(int cat) {
    return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
 }
 /*
  * Reports whether codepoint `ch` counts as whitespace: either its utf8proc
  * category is one of the separator categories accepted by utf8_is_separator,
  * or it is one of the listed control characters (tab, line feed, line
  * tabulation, form feed, carriage return, next line).
  */
 inline bool utf8_is_whitespace(int32_t ch) {
    if (utf8_is_separator(utf8proc_category(ch))) {
        return true;
    }

    switch (ch) {
        case 9:   // character tabulation
        case 10:  // line feed
        case 11:  // line tabulation
        case 12:  // form feed
        case 13:  // carriage return
        case 133: // next line
            return true;
        default:
            return false;
    }
 }
 int utf8_compare_len(const char *str1, const char *str2, size_t len) {
    if (len == 0) return 0;
@@ -415,11 +476,10 @@ inline bool string_contains_hyphen(char *str) {
    return string_next_hyphen_index(str, strlen(str)) >= 0;
 }
-size_t string_right_spaces(char *str) {
+size_t string_right_spaces_len(char *str, size_t len) {
    size_t spaces = 0;
    uint8_t *ptr = (uint8_t *)str;
    ssize_t len = strlen(str);
    int32_t ch = 0;
    ssize_t index = len;
@@ -428,24 +488,22 @@ size_t string_right_spaces(char *str) {
        if (ch <= 0) break;
-        int cat = utf8proc_category(ch);
+        if (!utf8_is_whitespace(ch)) {
        if (!utf8_is_separator(cat)) {
            break;
        }
        index -= char_len;
-        spaces++;
+        spaces += char_len;
    }
    return spaces;
 }
-size_t string_left_spaces(char *str) {
+size_t string_left_spaces_len(char *str, size_t len) {
    size_t spaces = 0;
    uint8_t *ptr = (uint8_t *)str;
    size_t len = strlen(str);
    int32_t ch = 0;
    ssize_t index = 0;
@@ -454,22 +512,21 @@ size_t string_left_spaces(char *str) {
        if (ch <= 0) break;
-        int cat = utf8proc_category(ch);
+        if (!utf8_is_whitespace(ch)) {
        if (!utf8_is_separator(cat)) {
            break;
        }
        index += char_len;
        ptr += char_len;
-        spaces++;
+        spaces += char_len;
    }
    return spaces;
 }
 /**
  * Returns a newly allocated copy of `str` with leading and trailing
  * whitespace (as counted in bytes by string_left_spaces_len and
  * string_right_spaces_len) removed. Returns NULL if `str` is NULL or the
  * allocation fails. The caller owns the result and must free() it.
  */
 char *string_trim(char *str) {
    if (str == NULL) return NULL;

    size_t len = strlen(str);
    size_t left_spaces = string_left_spaces_len(str, len);
    size_t right_spaces = string_right_spaces_len(str, len);

    /* Never step past the terminator, whatever the helpers report. */
    if (left_spaces > len) left_spaces = len;

    /* If the two counts together cover the whole string (e.g. it is all
       whitespace), nothing remains; avoid an unsigned wrap-around in the
       subtraction below. */
    size_t remaining = len - left_spaces;
    size_t keep = (right_spaces < remaining) ? remaining - right_spaces : 0;

    return strndup(str + left_spaces, keep);
 }
--- a/src/utf8proc/utf8proc.c
+++ b/src/utf8proc/utf8proc.c
@@ -128,7 +128,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
  if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
  if (uc < 0xe0) {         // 2-byte sequence
     // Must have valid continuation character
-     if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
+     if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
     *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
     return 2;
  }
@@ -166,24 +166,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
  if (uc < 0x00) {
    return 0;
  } else if (uc < 0x80) {
-    dst[0] = uc;
+    dst[0] = (utf8proc_uint8_t) uc;
    return 1;
  } else if (uc < 0x800) {
-    dst[0] = 0xC0 + (uc >> 6);
+    dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
-    dst[1] = 0x80 + (uc & 0x3F);
+    dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 2;
  // Note: we allow encoding 0xd800-0xdfff here, so as not to change
  // the API, however, these are actually invalid in UTF-8
  } else if (uc < 0x10000) {
-    dst[0] = 0xE0 + (uc >> 12);
+    dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
-    dst[1] = 0x80 + ((uc >> 6) & 0x3F);
+    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
-    dst[2] = 0x80 + (uc & 0x3F);
+    dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 3;
  } else if (uc < 0x110000) {
-    dst[0] = 0xF0 + (uc >> 18);
+    dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
-    dst[1] = 0x80 + ((uc >> 12) & 0x3F);
+    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
-    dst[2] = 0x80 + ((uc >> 6) & 0x3F);
+    dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
-    dst[3] = 0x80 + (uc & 0x3F);
+    dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 4;
  } else return 0;
 }
@@ -193,28 +193,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
   if (uc < 0x00) {
      return 0;
   } else if (uc < 0x80) {
-      dst[0] = uc;
+      dst[0] = (utf8proc_uint8_t)uc;
      return 1;
   } else if (uc < 0x800) {
-      dst[0] = 0xC0 + (uc >> 6);
+      dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
-      dst[1] = 0x80 + (uc & 0x3F);
+      dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
      return 2;
   } else if (uc == 0xFFFF) {
-       dst[0] = 0xFF;
+       dst[0] = (utf8proc_uint8_t)0xFF;
       return 1;
   } else if (uc == 0xFFFE) {
-       dst[0] = 0xFE;
+       dst[0] = (utf8proc_uint8_t)0xFE;
       return 1;
   } else if (uc < 0x10000) {
-      dst[0] = 0xE0 + (uc >> 12);
+      dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
-      dst[1] = 0x80 + ((uc >> 6) & 0x3F);
+      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
-      dst[2] = 0x80 + (uc & 0x3F);
+      dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
      return 3;
   } else if (uc < 0x110000) {
-      dst[0] = 0xF0 + (uc >> 18);
+      dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
-      dst[1] = 0x80 + ((uc >> 12) & 0x3F);
+      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
-      dst[2] = 0x80 + ((uc >> 6) & 0x3F);
+      dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
-      dst[3] = 0x80 + (uc & 0x3F);
+      dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
      return 4;
   } else return 0;
 }
@@ -233,48 +233,144 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
 }
-/* return whether there is a grapheme break between boundclasses lbc and tbc */
+/* return whether there is a grapheme break between boundclasses lbc and tbc
-static utf8proc_bool grapheme_break(int lbc, int tbc) {
+   (according to the definition of extended grapheme clusters)
-  return 
+  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
-    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
+  http://www.unicode.org/reports/tr29/tr29-29.html
-    (lbc == UTF8PROC_BOUNDCLASS_CR &&
+  CAVEATS:
-     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
+   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
-    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
+   and GB 12/13 (regional indicator code points) require knowledge of previous characters
-    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
+   and are thus not handled by this function. This may result in an incorrect break before
-    (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
+   an E_Modifier class codepoint and an incorrectly missing break between two
-    (lbc == UTF8PROC_BOUNDCLASS_L &&
+   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
-     (tbc == UTF8PROC_BOUNDCLASS_L ||
+   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
-      tbc == UTF8PROC_BOUNDCLASS_V ||
+*/
-      tbc == UTF8PROC_BOUNDCLASS_LV ||
+static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
-      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
+  return
-    ((lbc == UTF8PROC_BOUNDCLASS_LV ||
+    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       // GB1
-      lbc == UTF8PROC_BOUNDCLASS_V) &&
+    (lbc == UTF8PROC_BOUNDCLASS_CR &&                 // GB3
-     (tbc == UTF8PROC_BOUNDCLASS_V ||
+     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         // ---
-      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
+    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4
-    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
+    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5
-      lbc == UTF8PROC_BOUNDCLASS_T) &&
+    (lbc == UTF8PROC_BOUNDCLASS_L &&                  // GB6
-     tbc == UTF8PROC_BOUNDCLASS_T) ? false :
+     (tbc == UTF8PROC_BOUNDCLASS_L ||                 // ---
-    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
+      tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
-     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
+      tbc == UTF8PROC_BOUNDCLASS_LV ||                // ---
-    (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
+      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      // ---
    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                // GB7
      lbc == UTF8PROC_BOUNDCLASS_V) &&                // ---
     (tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        // ---
    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               // GB8
      lbc == UTF8PROC_BOUNDCLASS_T) &&                // ---
     tbc == UTF8PROC_BOUNDCLASS_T) ? false :          // ---
    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             // GB9
     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // ---
     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a
     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b
    ((lbc == UTF8PROC_BOUNDCLASS_E_BASE ||            // GB10 (requires additional handling below)
      lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&       // ----
     tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
    (lbc == UTF8PROC_BOUNDCLASS_ZWJ &&                         // GB11
     (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ ||             // ----
      tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false :        // ----
    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below)
     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ----
    true; // GB999
 }
-/* return whether there is a grapheme break between codepoints c1 and c2 */
+static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
+{
-  return grapheme_break(utf8proc_get_property(c1)->boundclass,
+  int lbc_override = lbc;
-                        utf8proc_get_property(c2)->boundclass);
+  if (state && *state != UTF8PROC_BOUNDCLASS_START)
    lbc_override = *state;
  utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
  if (state) {
    // Special support for GB 12/13 made possible by GB999. After two RI
    // class codepoints we want to force a break. Do this by resetting the
    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
    // after that character according to GB999 (unless of course such a break is
    // forbidden by a different rule such as GB9).
    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
      *state = UTF8PROC_BOUNDCLASS_OTHER;
    // Special support for GB10. Fold any EXTEND codepoints into the previous
    // boundclass if we're dealing with an emoji base boundclass.
    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE      ||
              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
             tbc == UTF8PROC_BOUNDCLASS_EXTEND)
      *state = UTF8PROC_BOUNDCLASS_E_BASE;
    else
      *state = tbc;
  }
  return break_permitted;
 }
 /* Looks up the boundclass of each codepoint and asks grapheme_break_extended
    whether a break is permitted between them, passing `state` straight through. */
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  const int left_class = utf8proc_get_property(c1)->boundclass;
  const int right_class = utf8proc_get_property(c2)->boundclass;
  return grapheme_break_extended(left_class, right_class, state);
 }
 /* Stateless variant: forwards to utf8proc_grapheme_break_stateful with a
    NULL state pointer, so no state is read or updated between calls. */
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
  utf8proc_int32_t *no_state = NULL;
  return utf8proc_grapheme_break_stateful(c1, c2, no_state);
 }
 /* Decodes one codepoint from the 16-bit sequence table at `*entry`.
    A unit in the range 0xD800-0xDFFF is combined with the following unit
    into a codepoint at or above 0x10000, and `*entry` is advanced to that
    second unit; any other unit is returned as-is with `*entry` untouched. */
 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
 {
  const utf8proc_uint16_t *cursor = *entry;
  utf8proc_int32_t first = *cursor;

  if ((first & 0xF800) != 0xD800)
    return first;

  cursor++;
  *entry = cursor;
  return (((first & 0x03FF) << 10) | (*cursor & 0x03FF)) + 0x10000;
 }
 /* Decodes the single codepoint stored at offset `seqindex` of utf8proc_sequences. */
 static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
 {
  const utf8proc_uint16_t *cursor = utf8proc_sequences + seqindex;
  return seqindex_decode_entry(&cursor);
 }
 /* Expands the codepoint sequence identified by `seqindex` and writes the
    decomposition of each of its codepoints into `dst`.

    `seqindex` packs two fields: the low 13 bits are an offset into
    utf8proc_sequences and the top 3 bits are a length field. `bufsize`,
    `options` and `last_boundclass` are forwarded to utf8proc_decompose_char.

    Returns the accumulated count reported by utf8proc_decompose_char, or
    UTF8PROC_ERROR_OVERFLOW if the running total becomes negative. */
 static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  utf8proc_ssize_t written = 0;
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
  int len = seqindex >> 13;
  /* A length field of 7 means the real length is stored in the first table
     entry, ahead of the codepoints themselves. */
  if (len >= 7) {
    len = *entry;
    entry++;
  }
  /* The loop runs while len >= 0, i.e. len + 1 times: the length field holds
     the number of codepoints minus one. */
  for (; len >= 0; entry++, len--) {
    /* May advance `entry` by one extra unit when the codepoint occupies two
       table entries; the loop's own entry++ then steps past it. */
    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
    /* Recurse on each codepoint; once the buffer is exhausted pass a size of
       0 so the callee only counts. */
    written += utf8proc_decompose_char(entry_cp, dst+written,
      (bufsize > written) ? (bufsize - written) : 0, options,
    last_boundclass);
    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
  }
  return written;
 }
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
 {
-  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
+  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
-  return cl >= 0 ? cl : c;
+  return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
 }
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
 {
-  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
+  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
-  return cu >= 0 ? cu : c;
+  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
 }
 /* Returns the title-case codepoint for `c`, or `c` itself when its property
    record has no title-case sequence index (UINT16_MAX marks "none"). */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
 {
  utf8proc_int32_t title_index = utf8proc_get_property(c)->titlecase_seqindex;
  if (title_index == UINT16_MAX)
    return c;
  return seqindex_decode_index(title_index);
 }
 /* return a character width analogous to wcwidth (except portable and
@@ -357,39 +453,20 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
      category == UTF8PROC_CATEGORY_ME) return 0;
  }
  if (options & UTF8PROC_CASEFOLD) {
-    if (property->casefold_mapping != UINT16_MAX) {
+    if (property->casefold_seqindex != UINT16_MAX) {
-      const utf8proc_int32_t *casefold_entry;
+      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
      utf8proc_ssize_t written = 0;
      for (casefold_entry = &utf8proc_sequences[property->casefold_mapping];
          *casefold_entry >= 0; casefold_entry++) {
        written += utf8proc_decompose_char(*casefold_entry, dst+written,
          (bufsize > written) ? (bufsize - written) : 0, options,
          last_boundclass);
        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
      }
      return written;
    }
  }
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
-    if (property->decomp_mapping != UINT16_MAX &&
+    if (property->decomp_seqindex != UINT16_MAX &&
        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
-      const utf8proc_int32_t *decomp_entry;
+      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
      utf8proc_ssize_t written = 0;
      for (decomp_entry = &utf8proc_sequences[property->decomp_mapping];
          *decomp_entry >= 0; decomp_entry++) {
        written += utf8proc_decompose_char(*decomp_entry, dst+written,
          (bufsize > written) ? (bufsize - written) : 0, options,
        last_boundclass);
        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
      }
      return written;
    }
  }
  if (options & UTF8PROC_CHARBOUND) {
    utf8proc_bool boundary;
    int tbc = property->boundclass;
-    boundary = grapheme_break(*last_boundclass, tbc);
+    boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
    *last_boundclass = tbc;
    if (boundary) {
      if (bufsize >= 1) dst[0] = 0xFFFF;
      if (bufsize >= 2) dst[1] = uc;
@@ -403,6 +480,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
 /* Plain decomposition: delegates to utf8proc_decompose_custom with no
    custom mapping function and no custom data. */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
 ) {
    utf8proc_custom_func no_custom_func = NULL;
    void *no_custom_data = NULL;
    return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options,
                                     no_custom_func, no_custom_data);
 }
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
 ) {
  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
  utf8proc_ssize_t wpos = 0;
@@ -429,6 +514,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
      }
      if (custom_func != NULL) {
        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
      }
      decomp_result = utf8proc_decompose_char(
        uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
        &boundclass
@@ -463,9 +551,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  return wpos;
 }
-UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
-  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
+  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
     ASSERT: 'buffer' has one spare byte of free space at the end! */
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
    utf8proc_ssize_t rpos;
    utf8proc_ssize_t wpos = 0;
@@ -538,17 +625,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
        if (!starter_property) {
          starter_property = unsafe_get_property(*starter);
        }
-        if (starter_property->comb1st_index >= 0 &&
+        if (starter_property->comb_index < 0x8000 &&
-            current_property->comb2nd_index >= 0) {
+            current_property->comb_index != UINT16_MAX &&
-          composition = utf8proc_combinations[
+            current_property->comb_index >= 0x8000) {
-            starter_property->comb1st_index +
+          int sidx = starter_property->comb_index;
-            current_property->comb2nd_index
+          int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
-          ];
+          if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
-          if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
+            idx += sidx + 2;
-              !(unsafe_get_property(composition)->comp_exclusion))) {
+            if (current_property->comb_index & 0x4000) {
-            *starter = composition;
+              composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
-            starter_property = NULL;
+            } else
-            continue;
+              composition = utf8proc_combinations[idx];
            if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
                !(unsafe_get_property(composition)->comp_exclusion))) {
              *starter = composition;
              starter_property = NULL;
              continue;
            }
          }
        }
      }
@@ -566,6 +660,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
    }
    length = wpos;
  }
  return length;
 }
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
     ASSERT: 'buffer' has one spare byte of free space at the end! */
  length = utf8proc_normalize_utf32(buffer, length, options);
  if (length < 0) return length;
  {
    utf8proc_ssize_t rpos, wpos = 0;
    utf8proc_int32_t uc;
@@ -587,15 +689,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 /* Plain mapping: delegates to utf8proc_map_custom with no custom mapping
    function and no custom data. */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
 ) {
    utf8proc_custom_func no_custom_func = NULL;
    void *no_custom_data = NULL;
    return utf8proc_map_custom(str, strlen, dstptr, options,
                               no_custom_func, no_custom_data);
 }
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
 ) {
  utf8proc_int32_t *buffer;
  utf8proc_ssize_t result;
  *dstptr = NULL;
-  result = utf8proc_decompose(str, strlen, NULL, 0, options);
+  result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
  if (result < 0) return result;
  buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
  if (!buffer) return UTF8PROC_ERROR_NOMEM;
-  result = utf8proc_decompose(str, strlen, buffer, result, options);
+  result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
  if (result < 0) {
    free(buffer);
    return result;
@@ -640,5 +749,4 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  return retval;
-}
+}
--- a/src/utf8proc/utf8proc.h
+++ b/src/utf8proc/utf8proc.h
@@ -22,7 +22,7 @@
 */
-/** 
+/**
 * @mainpage
 *
 * utf8proc is a free/open-source (MIT/expat licensed) C library
@@ -54,7 +54,7 @@
 #define UTF8PROC_H
 /** @name API version
- *  
+ *
 * The utf8proc API version MAJOR.MINOR.PATCH, following
 * semantic-versioning rules (http://semver.org) based on API
 * compatibility.
@@ -63,21 +63,23 @@
 * runtime version may append a string like "-dev" to the version number
 * for prerelease versions.
 *
- * @note The shared-library version number in the Makefile may be different,
+ * @note The shared-library version number in the Makefile
 *       (and CMakeLists.txt, and MANIFEST) may be different,
 *       being based on ABI compatibility rather than API compatibility.
 */
 /** @{ */
 /** The MAJOR version number (increased when backwards API compatibility is broken). */
-#define UTF8PROC_VERSION_MAJOR 1
+#define UTF8PROC_VERSION_MAJOR 2
 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 3
+#define UTF8PROC_VERSION_MINOR 1
 /** The PATCH version (increased for fixes that do not change the API). */
 #define UTF8PROC_VERSION_PATCH 0
 /** @} */
 #include <stdlib.h>
 #include <sys/types.h>
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && _MSC_VER < 1800
 // MSVC prior to 2013 lacked stdbool.h and inttypes.h
 typedef signed char utf8proc_int8_t;
 typedef unsigned char utf8proc_uint8_t;
 typedef short utf8proc_int16_t;
@@ -92,12 +94,18 @@ typedef int utf8proc_ssize_t;
 typedef unsigned int utf8proc_size_t;
 #  endif
 #  ifndef __cplusplus
 // emulate C99 bool
 typedef unsigned char utf8proc_bool;
-enum {false, true};
+#    ifndef __bool_true_false_are_defined
 #      define false 0
 #      define true 1
 #      define __bool_true_false_are_defined 1
 #    endif
 #  else
 typedef bool utf8proc_bool;
 #  endif
 #else
 #  include <stddef.h>
 #  include <stdbool.h>
 #  include <inttypes.h>
 typedef int8_t utf8proc_int8_t;
@@ -107,7 +115,7 @@ typedef uint16_t utf8proc_uint16_t;
 typedef int32_t utf8proc_int32_t;
 typedef uint32_t utf8proc_uint32_t;
 typedef size_t utf8proc_size_t;
-typedef ssize_t utf8proc_ssize_t;
+typedef ptrdiff_t utf8proc_ssize_t;
 typedef bool utf8proc_bool;
 #endif
 #include <limits.h>
@@ -133,7 +141,7 @@ extern "C" {
 #endif
 #ifndef UINT16_MAX
-#  define UINT16_MAX ~(utf8proc_uint16_t)0
+#  define UINT16_MAX 65535U
 #endif
 /**
@@ -242,13 +250,12 @@ typedef struct utf8proc_property_struct {
   * @see utf8proc_decomp_type_t.
   */
  utf8proc_propval_t decomp_type;
-  utf8proc_uint16_t decomp_mapping;
+  utf8proc_uint16_t decomp_seqindex;
-  utf8proc_uint16_t casefold_mapping;
+  utf8proc_uint16_t casefold_seqindex;
-  utf8proc_int32_t uppercase_mapping;
+  utf8proc_uint16_t uppercase_seqindex;
-  utf8proc_int32_t lowercase_mapping;
+  utf8proc_uint16_t lowercase_seqindex;
-  utf8proc_int32_t titlecase_mapping;
+  utf8proc_uint16_t titlecase_seqindex;
-  utf8proc_int32_t comb1st_index;
+  utf8proc_uint16_t comb_index;
  utf8proc_int32_t comb2nd_index;
  unsigned bidi_mirrored:1;
  unsigned comp_exclusion:1;
  /**
@@ -259,13 +266,14 @@ typedef struct utf8proc_property_struct {
   */
  unsigned ignorable:1;
  unsigned control_boundary:1;
  /** The width of the codepoint. */
  unsigned charwidth:2;
  unsigned pad:2;
  /**
   * Boundclass.
   * @see utf8proc_boundclass_t.
   */
-  unsigned boundclass:4;
+  unsigned boundclass:8;
  /** The width of the codepoint. */
  unsigned charwidth:2;
 } utf8proc_property_t;
 /** Unicode categories. */
@@ -349,7 +357,7 @@ typedef enum {
  UTF8PROC_DECOMP_TYPE_COMPAT   = 16, /**< Compat */
 } utf8proc_decomp_type_t;
-/** Boundclass property. */
+/** Boundclass property. (TR29) */
 typedef enum {
  UTF8PROC_BOUNDCLASS_START              =  0, /**< Start */
  UTF8PROC_BOUNDCLASS_OTHER              =  1, /**< Other */
@@ -364,8 +372,21 @@ typedef enum {
  UTF8PROC_BOUNDCLASS_LVT                = 10, /**< Lvt */
  UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
  UTF8PROC_BOUNDCLASS_SPACINGMARK        = 12, /**< Spacingmark */
  UTF8PROC_BOUNDCLASS_PREPEND            = 13, /**< Prepend */
  UTF8PROC_BOUNDCLASS_ZWJ                = 14, /**< Zero Width Joiner */
  UTF8PROC_BOUNDCLASS_E_BASE             = 15, /**< Emoji Base */
  UTF8PROC_BOUNDCLASS_E_MODIFIER         = 16, /**< Emoji Modifier */
  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ     = 17, /**< Glue_After_ZWJ */
  UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
 } utf8proc_boundclass_t;
 /**
 * Function pointer type passed to @ref utf8proc_map_custom and
 * @ref utf8proc_decompose_custom, which is used to specify a user-defined
 * mapping of codepoints to be applied in conjunction with other mappings.
 */
 typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
 /**
 * Array containing the byte lengths of a UTF-8 encoded codepoint based
 * on the first byte.
@@ -473,6 +494,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
 * `buffer` (which must contain at least `bufsize` entries).  In case of
 * success, the number of codepoints written is returned; in case of an
 * error, a negative error code is returned (@ref utf8proc_errmsg).
 * See @ref utf8proc_decompose_custom to supply additional transformations.
 *
 * If the number of written codepoints would be bigger than `bufsize`, the
 * required buffer size is returned, while the buffer will be overwritten with
@@ -484,8 +506,20 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
 );
 /**
- * Reencodes the sequence of `length` codepoints pointed to by `buffer`
+ * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
- * UTF-8 data in-place (i.e., the result is also stored in `buffer`).
+ * that is called on each codepoint in `str` before any other transformations
 * (along with a `custom_data` pointer that is passed through to `custom_func`).
 * The `custom_func` argument is ignored if it is `NULL`.  See also @ref utf8proc_map_custom.
 */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
 );
 /**
 * Normalizes the sequence of `length` codepoints pointed to by `buffer`
 * in-place (i.e., the result is also stored in `buffer`).
 *
 * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
 * @param length the length (in codepoints) of the buffer.
@@ -500,9 +534,37 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
 *                           the unicode versioning stability
 *
 * @return
- * In case of success, the length (in bytes) of the resulting UTF-8 string is
+ * In case of success, the length (in codepoints) of the normalized UTF-32 string is
 * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
 *
 * @warning The entries of the array pointed to by `str` have to be in the
 *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
 */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
 /**
 * Reencodes the sequence of `length` codepoints pointed to by `buffer`
 * as UTF-8 data in-place (i.e., the result is also stored in `buffer`).
 * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
 *
 * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
 * @param length the length (in codepoints) of the buffer.
 * @param options a bitwise or (`|`) of one or more of the following flags:
 * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
 * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
 * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
 * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
 * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
 *                           codepoints
 * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
 *                           the unicode versioning stability
 * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
 *
 * @return
 * In case of success, the length (in bytes) of the resulting nul-terminated
 * UTF-8 string is returned; otherwise, a negative error code is returned
 * (@ref utf8proc_errmsg).
 *
 * @warning The amount of free space pointed to by `buffer` must
 *          exceed the amount of the input data by one byte, and the
 *          entries of the array pointed to by `str` have to be in the
@@ -513,8 +575,26 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 /**
 * Given a pair of consecutive codepoints, return whether a grapheme break is
 * permitted between them (as defined by the extended grapheme clusters in UAX#29).
 *
 * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
 *              state to break graphemes. This state can be passed in as a pointer
 *              in the `state` argument and should initially be set to 0. If the
 *              state is not passed in (i.e. a null pointer is passed), UAX#29 rules
 *              GB10/12/13 which require this state will not be applied, essentially
 *              matching the rules in Unicode 8.0.0.
 *
 * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
 *          be called IN ORDER on ALL potential breaks in a string.
 */
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
 /**
 * Same as @ref utf8proc_grapheme_break_stateful, except without support for the
 * Unicode 9 additions to the algorithm. Supported for legacy reasons.
 */
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
 /**
@@ -531,11 +611,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
 */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
 /**
 * Given a codepoint `c`, return the codepoint of the corresponding
 * title-case character, if any; otherwise (if there is no title-case
 * variant, or if `c` is not a valid codepoint) return `c`.
 */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
 /**
 * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
 * except that a width of 0 is returned for non-printable codepoints
 * instead of -1 as in `wcwidth`.
- * 
+ *
 * @note
 * If you want to check for particular types of non-printable characters,
 * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
@@ -563,7 +650,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
 * in any case the result will be NUL-terminated (though it might
 * contain NUL characters within the string if `str` contained NUL
 * characters). Other flags in the `options` field are passed to the
- * functions defined above, and regarded as described.
+ * functions defined above, and regarded as described.  See also
 * @ref utf8proc_map_custom to supply a custom codepoint transformation.
 *
 * In case of success the length of the new string is returned,
 * otherwise a negative error code is returned.
@@ -575,6 +663,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
 );
 /**
 * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
 * that is called on each codepoint in `str` before any other transformations
 * (along with a `custom_data` pointer that is passed through to `custom_func`).
 * The `custom_func` argument is ignored if it is `NULL`.
 */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
 );
 /** @name Unicode normalization
 *
 * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
@@ -587,9 +686,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
 /** NFC normalization (@ref UTF8PROC_COMPOSE). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 /** @} */
@@ -597,5 +696,4 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 }
 #endif
-#endif
+#endif
--- a/src/utf8proc/utf8proc_data.c
+++ b/src/utf8proc/utf8proc_data.c