[strings] adding latest utf8proc, new functions for utf8_lower (instead of case folding) and utf8_upper, and a utf8_is_whitespace that takes things like tabs into account

2016-12-31 00:52:12 -05:00
parent db16e656ca
commit 8978000320
4 changed files with 13218 additions and 15322 deletions
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -174,14 +174,63 @@ error_free_output:
    return NULL;
 }

-char *utf8_lower(const char *s) {
+char *utf8_case(const char *s, casing_option_t casing, utf8proc_option_t options) {
    ssize_t len = (ssize_t)strlen(s);
-    uint8_t *dest;
+    utf8proc_uint8_t *str = (utf8proc_uint8_t *)s;
+    utf8proc_uint8_t *dest = NULL;

-    utf8proc_map((const uint8_t *)s, len, &dest, UTF8PROC_OPTIONS_LOWERCASE);
-    return (char *)dest;
+    utf8proc_ssize_t result;
+    result = utf8proc_decompose(str, len, NULL, 0, options);
+
+    if (result < 0) return NULL;
+    utf8proc_int32_t *buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
+    if (buffer == NULL) return NULL;
+
+    result = utf8proc_decompose(str, len, buffer, result, options);
+    if (result < 0) {
+        free(buffer);
+        return NULL;
+    }
+
+    for (utf8proc_ssize_t i = 0; i < result; i++) {
+        utf8proc_int32_t uc = buffer[i];
+        utf8proc_int32_t norm;
+
+        if (casing == UTF8_LOWER) {
+            norm = utf8proc_tolower(uc);
+        } else if (casing == UTF8_UPPER) {
+            norm = utf8proc_toupper(uc);
+        }
+    }
+
+    result = utf8proc_reencode(buffer, result, options);
+
+    utf8proc_int32_t *newptr;
+    newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
+    if (newptr) buffer = newptr;
+
+    free(buffer);
+
+    return (char *)buffer;
 }

+inline char *utf8_lower_options(const char *s, utf8proc_option_t options) {
+    return utf8_case(s, UTF8_LOWER, options);
+}
+
+inline char *utf8_lower(const char *s) {
+    return utf8_case(s, UTF8_LOWER, UTF8PROC_OPTIONS_NFC);
+}
+
+inline char *utf8_upper_options(const char *s, utf8proc_option_t options) {
+    return utf8_case(s, UTF8_UPPER, options);
+}
+
+inline char *utf8_upper(const char *s) {
+    return utf8_case(s, UTF8_UPPER, UTF8PROC_OPTIONS_NFC);
+}
+
+
 inline bool utf8_is_letter(int cat) {
    return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU        \
            || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO    \
@@ -219,6 +268,18 @@ inline bool utf8_is_separator(int cat) {
    return cat == UTF8PROC_CATEGORY_ZS || cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP;
 }

+inline bool utf8_is_whitespace(int32_t ch) {
+    int cat = utf8proc_category(ch);
+    return utf8_is_separator(cat) || 
+           ch == 9 || // character tabulation
+           ch == 10 || // line feed
+           ch == 11 || // line tabulation
+           ch == 12 || // form feed
+           ch == 13 || // carriage return
+           ch == 133 // next line
+           ;
+}
+
 int utf8_compare_len(const char *str1, const char *str2, size_t len) {
    if (len == 0) return 0;

@@ -415,11 +476,10 @@ inline bool string_contains_hyphen(char *str) {
    return string_next_hyphen_index(str, strlen(str)) >= 0;
 }

-size_t string_right_spaces(char *str) {
+size_t string_right_spaces_len(char *str, size_t len) {
    size_t spaces = 0;

    uint8_t *ptr = (uint8_t *)str;
-    ssize_t len = strlen(str);
    int32_t ch = 0;
    ssize_t index = len;

@@ -428,24 +488,22 @@ size_t string_right_spaces(char *str) {

        if (ch <= 0) break;

-        int cat = utf8proc_category(ch);
-        if (!utf8_is_separator(cat)) {
+        if (!utf8_is_whitespace(ch)) {
            break;
        }

        index -= char_len;
-        spaces++;
+        spaces += char_len;
    }

    return spaces;

 }

-size_t string_left_spaces(char *str) {
+size_t string_left_spaces_len(char *str, size_t len) {
    size_t spaces = 0;

    uint8_t *ptr = (uint8_t *)str;
-    size_t len = strlen(str);
    int32_t ch = 0;
    ssize_t index = 0;

@@ -454,22 +512,21 @@ size_t string_left_spaces(char *str) {

        if (ch <= 0) break;

-        int cat = utf8proc_category(ch);
-        if (!utf8_is_separator(cat)) {
+        if (!utf8_is_whitespace(ch)) {
            break;
        }
        index += char_len;
        ptr += char_len;
-        spaces++;
+        spaces += char_len;
    }

    return spaces;
 }

 char *string_trim(char *str) {
-    size_t left_spaces = string_left_spaces(str);
-    size_t right_spaces = string_right_spaces(str);
    size_t len = strlen(str);
+    size_t left_spaces = string_left_spaces_len(str, len);
+    size_t right_spaces = string_right_spaces_len(str, len);
    char *ret = strndup(str + left_spaces, len - left_spaces - right_spaces);
    return ret;
 }
--- a/src/utf8proc/utf8proc.c
+++ b/src/utf8proc/utf8proc.c
@@ -128,7 +128,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
  if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
  if (uc < 0xe0) {         // 2-byte sequence
     // Must have valid continuation character
-     if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
+     if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
     *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
     return 2;
  }
@@ -166,24 +166,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
  if (uc < 0x00) {
    return 0;
  } else if (uc < 0x80) {
-    dst[0] = uc;
+    dst[0] = (utf8proc_uint8_t) uc;
    return 1;
  } else if (uc < 0x800) {
-    dst[0] = 0xC0 + (uc >> 6);
-    dst[1] = 0x80 + (uc & 0x3F);
+    dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
+    dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 2;
  // Note: we allow encoding 0xd800-0xdfff here, so as not to change
  // the API, however, these are actually invalid in UTF-8
  } else if (uc < 0x10000) {
-    dst[0] = 0xE0 + (uc >> 12);
-    dst[1] = 0x80 + ((uc >> 6) & 0x3F);
-    dst[2] = 0x80 + (uc & 0x3F);
+    dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
+    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+    dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 3;
  } else if (uc < 0x110000) {
-    dst[0] = 0xF0 + (uc >> 18);
-    dst[1] = 0x80 + ((uc >> 12) & 0x3F);
-    dst[2] = 0x80 + ((uc >> 6) & 0x3F);
-    dst[3] = 0x80 + (uc & 0x3F);
+    dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
+    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
+    dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+    dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 4;
  } else return 0;
 }
@@ -193,28 +193,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
   if (uc < 0x00) {
      return 0;
   } else if (uc < 0x80) {
-      dst[0] = uc;
+      dst[0] = (utf8proc_uint8_t)uc;
      return 1;
   } else if (uc < 0x800) {
-      dst[0] = 0xC0 + (uc >> 6);
-      dst[1] = 0x80 + (uc & 0x3F);
+      dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
+      dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
      return 2;
   } else if (uc == 0xFFFF) {
-       dst[0] = 0xFF;
+       dst[0] = (utf8proc_uint8_t)0xFF;
       return 1;
   } else if (uc == 0xFFFE) {
-       dst[0] = 0xFE;
+       dst[0] = (utf8proc_uint8_t)0xFE;
       return 1;
   } else if (uc < 0x10000) {
-      dst[0] = 0xE0 + (uc >> 12);
-      dst[1] = 0x80 + ((uc >> 6) & 0x3F);
-      dst[2] = 0x80 + (uc & 0x3F);
+      dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
+      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+      dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
      return 3;
   } else if (uc < 0x110000) {
-      dst[0] = 0xF0 + (uc >> 18);
-      dst[1] = 0x80 + ((uc >> 12) & 0x3F);
-      dst[2] = 0x80 + ((uc >> 6) & 0x3F);
-      dst[3] = 0x80 + (uc & 0x3F);
+      dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
+      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
+      dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+      dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
      return 4;
   } else return 0;
 }
@@ -233,48 +233,144 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
 }

-/* return whether there is a grapheme break between boundclasses lbc and tbc */
-static utf8proc_bool grapheme_break(int lbc, int tbc) {
-  return 
-    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
-    (lbc == UTF8PROC_BOUNDCLASS_CR &&
-     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
-    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-    (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
-    (lbc == UTF8PROC_BOUNDCLASS_L &&
-     (tbc == UTF8PROC_BOUNDCLASS_L ||
-      tbc == UTF8PROC_BOUNDCLASS_V ||
-      tbc == UTF8PROC_BOUNDCLASS_LV ||
-      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
-    ((lbc == UTF8PROC_BOUNDCLASS_LV ||
-      lbc == UTF8PROC_BOUNDCLASS_V) &&
-     (tbc == UTF8PROC_BOUNDCLASS_V ||
-      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
-    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
-      lbc == UTF8PROC_BOUNDCLASS_T) &&
-     tbc == UTF8PROC_BOUNDCLASS_T) ? false :
-    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
-     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
-    (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
+/* return whether there is a grapheme break between boundclasses lbc and tbc
+   (according to the definition of extended grapheme clusters)
+  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
+  http://www.unicode.org/reports/tr29/tr29-29.html
+  CAVEATS:
+   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
+   and GB 12/13 (regional indicator code points) require knowledge of previous characters
+   and are thus not handled by this function. This may result in an incorrect break before
+   an E_Modifier class codepoint and an incorrectly missing break between two
+   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
+   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
+*/
+static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
+  return
+    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       // GB1
+    (lbc == UTF8PROC_BOUNDCLASS_CR &&                 // GB3
+     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         // ---
+    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4
+    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5
+    (lbc == UTF8PROC_BOUNDCLASS_L &&                  // GB6
+     (tbc == UTF8PROC_BOUNDCLASS_L ||                 // ---
+      tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
+      tbc == UTF8PROC_BOUNDCLASS_LV ||                // ---
+      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      // ---
+    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                // GB7
+      lbc == UTF8PROC_BOUNDCLASS_V) &&                // ---
+     (tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
+      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        // ---
+    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               // GB8
+      lbc == UTF8PROC_BOUNDCLASS_T) &&                // ---
+     tbc == UTF8PROC_BOUNDCLASS_T) ? false :          // ---
+    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             // GB9
+     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // ---
+     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a
+     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b
+    ((lbc == UTF8PROC_BOUNDCLASS_E_BASE ||            // GB10 (requires additional handling below)
+      lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&       // ----
+     tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
+    (lbc == UTF8PROC_BOUNDCLASS_ZWJ &&                         // GB11
+     (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ ||             // ----
+      tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false :        // ----
+    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below)
+     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ----
+    true; // GB999
 }

-/* return whether there is a grapheme break between codepoints c1 and c2 */
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
-  return grapheme_break(utf8proc_get_property(c1)->boundclass,
-                        utf8proc_get_property(c2)->boundclass);
+static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
+{
+  int lbc_override = lbc;
+  if (state && *state != UTF8PROC_BOUNDCLASS_START)
+    lbc_override = *state;
+  utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
+  if (state) {
+    // Special support for GB 12/13 made possible by GB999. After two RI
+    // class codepoints we want to force a break. Do this by resetting the
+    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
+    // after that character according to GB999 (unless of course such a break is
+    // forbidden by a different rule such as GB9).
+    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
+      *state = UTF8PROC_BOUNDCLASS_OTHER;
+    // Special support for GB10. Fold any EXTEND codepoints into the previous
+    // boundclass if we're dealing with an emoji base boundclass.
+    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE      ||
+              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
+             tbc == UTF8PROC_BOUNDCLASS_EXTEND)
+      *state = UTF8PROC_BOUNDCLASS_E_BASE;
+    else
+      *state = tbc;
+  }
+  return break_permitted;
+}
+
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
+    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
+
+  return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
+                                 utf8proc_get_property(c2)->boundclass,
+                                 state);
+}
+
+
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
+    utf8proc_int32_t c1, utf8proc_int32_t c2) {
+  return utf8proc_grapheme_break_stateful(c1, c2, NULL);
+}
+
+static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
+{
+  utf8proc_int32_t entry_cp = **entry;
+  if ((entry_cp & 0xF800) == 0xD800) {
+    *entry = *entry + 1;
+    entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
+    entry_cp += 0x10000;
+  }
+  return entry_cp;
+}
+
+static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
+{
+  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
+  return seqindex_decode_entry(&entry);
+}
+
+static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
+  utf8proc_ssize_t written = 0;
+  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
+  int len = seqindex >> 13;
+  if (len >= 7) {
+    len = *entry;
+    entry++;
+  }
+  for (; len >= 0; entry++, len--) {
+    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
+
+    written += utf8proc_decompose_char(entry_cp, dst+written,
+      (bufsize > written) ? (bufsize - written) : 0, options,
+    last_boundclass);
+    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
+  }
+  return written;
 }

 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
 {
-  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
-  return cl >= 0 ? cl : c;
+  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
+  return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
 }

 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
 {
-  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
-  return cu >= 0 ? cu : c;
+  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
+  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
+}
+
+UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
+{
+  utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
+  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
 }

 /* return a character width analogous to wcwidth (except portable and
@@ -357,39 +453,20 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
      category == UTF8PROC_CATEGORY_ME) return 0;
  }
  if (options & UTF8PROC_CASEFOLD) {
-    if (property->casefold_mapping != UINT16_MAX) {
-      const utf8proc_int32_t *casefold_entry;
-      utf8proc_ssize_t written = 0;
-      for (casefold_entry = &utf8proc_sequences[property->casefold_mapping];
-          *casefold_entry >= 0; casefold_entry++) {
-        written += utf8proc_decompose_char(*casefold_entry, dst+written,
-          (bufsize > written) ? (bufsize - written) : 0, options,
-          last_boundclass);
-        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
-      }
-      return written;
+    if (property->casefold_seqindex != UINT16_MAX) {
+      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
    }
  }
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
-    if (property->decomp_mapping != UINT16_MAX &&
+    if (property->decomp_seqindex != UINT16_MAX &&
        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
-      const utf8proc_int32_t *decomp_entry;
-      utf8proc_ssize_t written = 0;
-      for (decomp_entry = &utf8proc_sequences[property->decomp_mapping];
-          *decomp_entry >= 0; decomp_entry++) {
-        written += utf8proc_decompose_char(*decomp_entry, dst+written,
-          (bufsize > written) ? (bufsize - written) : 0, options,
-        last_boundclass);
-        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
-      }
-      return written;
+      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
    }
  }
  if (options & UTF8PROC_CHARBOUND) {
    utf8proc_bool boundary;
    int tbc = property->boundclass;
-    boundary = grapheme_break(*last_boundclass, tbc);
-    *last_boundclass = tbc;
+    boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
    if (boundary) {
      if (bufsize >= 1) dst[0] = 0xFFFF;
      if (bufsize >= 2) dst[1] = uc;
@@ -403,6 +480,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
+) {
+    return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
 ) {
  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
  utf8proc_ssize_t wpos = 0;
@@ -429,6 +514,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
      }
+      if (custom_func != NULL) {
+        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
+      }
      decomp_result = utf8proc_decompose_char(
        uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
        &boundclass
@@ -463,9 +551,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  return wpos;
 }

-UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
-  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
-     ASSERT: 'buffer' has one spare byte of free space at the end! */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
    utf8proc_ssize_t rpos;
    utf8proc_ssize_t wpos = 0;
@@ -538,17 +625,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
        if (!starter_property) {
          starter_property = unsafe_get_property(*starter);
        }
-        if (starter_property->comb1st_index >= 0 &&
-            current_property->comb2nd_index >= 0) {
-          composition = utf8proc_combinations[
-            starter_property->comb1st_index +
-            current_property->comb2nd_index
-          ];
-          if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
-              !(unsafe_get_property(composition)->comp_exclusion))) {
-            *starter = composition;
-            starter_property = NULL;
-            continue;
+        if (starter_property->comb_index < 0x8000 &&
+            current_property->comb_index != UINT16_MAX &&
+            current_property->comb_index >= 0x8000) {
+          int sidx = starter_property->comb_index;
+          int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
+          if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
+            idx += sidx + 2;
+            if (current_property->comb_index & 0x4000) {
+              composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
+            } else
+              composition = utf8proc_combinations[idx];
+
+            if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
+                !(unsafe_get_property(composition)->comp_exclusion))) {
+              *starter = composition;
+              starter_property = NULL;
+              continue;
+            }
          }
        }
      }
@@ -566,6 +660,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
    }
    length = wpos;
  }
+  return length;
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
+     ASSERT: 'buffer' has one spare byte of free space at the end! */
+  length = utf8proc_normalize_utf32(buffer, length, options);
+  if (length < 0) return length;
  {
    utf8proc_ssize_t rpos, wpos = 0;
    utf8proc_int32_t uc;
@@ -587,15 +689,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,

 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
+) {
+    return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
 ) {
  utf8proc_int32_t *buffer;
  utf8proc_ssize_t result;
  *dstptr = NULL;
-  result = utf8proc_decompose(str, strlen, NULL, 0, options);
+  result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
  if (result < 0) return result;
  buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
  if (!buffer) return UTF8PROC_ERROR_NOMEM;
-  result = utf8proc_decompose(str, strlen, buffer, result, options);
+  result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
  if (result < 0) {
    free(buffer);
    return result;
@@ -640,5 +749,4 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  return retval;
-}
-
+}
--- a/src/utf8proc/utf8proc.h
+++ b/src/utf8proc/utf8proc.h
@@ -22,7 +22,7 @@
 */


-/** 
+/**
 * @mainpage
 *
 * utf8proc is a free/open-source (MIT/expat licensed) C library
@@ -54,7 +54,7 @@
 #define UTF8PROC_H

 /** @name API version
- *  
+ *
 * The utf8proc API version MAJOR.MINOR.PATCH, following
 * semantic-versioning rules (http://semver.org) based on API
 * compatibility.
@@ -63,21 +63,23 @@
 * runtime version may append a string like "-dev" to the version number
 * for prerelease versions.
 *
- * @note The shared-library version number in the Makefile may be different,
+ * @note The shared-library version number in the Makefile
+ *       (and CMakeLists.txt, and MANIFEST) may be different,
 *       being based on ABI compatibility rather than API compatibility.
 */
 /** @{ */
 /** The MAJOR version number (increased when backwards API compatibility is broken). */
-#define UTF8PROC_VERSION_MAJOR 1
+#define UTF8PROC_VERSION_MAJOR 2
 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 3
+#define UTF8PROC_VERSION_MINOR 1
 /** The PATCH version (increased for fixes that do not change the API). */
 #define UTF8PROC_VERSION_PATCH 0
 /** @} */

 #include <stdlib.h>
 #include <sys/types.h>
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// MSVC prior to 2013 lacked stdbool.h and inttypes.h
 typedef signed char utf8proc_int8_t;
 typedef unsigned char utf8proc_uint8_t;
 typedef short utf8proc_int16_t;
@@ -92,12 +94,18 @@ typedef int utf8proc_ssize_t;
 typedef unsigned int utf8proc_size_t;
 #  endif
 #  ifndef __cplusplus
+// emulate C99 bool
 typedef unsigned char utf8proc_bool;
-enum {false, true};
+#    ifndef __bool_true_false_are_defined
+#      define false 0
+#      define true 1
+#      define __bool_true_false_are_defined 1
+#    endif
 #  else
 typedef bool utf8proc_bool;
 #  endif
 #else
+#  include <stddef.h>
 #  include <stdbool.h>
 #  include <inttypes.h>
 typedef int8_t utf8proc_int8_t;
@@ -107,7 +115,7 @@ typedef uint16_t utf8proc_uint16_t;
 typedef int32_t utf8proc_int32_t;
 typedef uint32_t utf8proc_uint32_t;
 typedef size_t utf8proc_size_t;
-typedef ssize_t utf8proc_ssize_t;
+typedef ptrdiff_t utf8proc_ssize_t;
 typedef bool utf8proc_bool;
 #endif
 #include <limits.h>
@@ -133,7 +141,7 @@ extern "C" {
 #endif

 #ifndef UINT16_MAX
-#  define UINT16_MAX ~(utf8proc_uint16_t)0
+#  define UINT16_MAX 65535U
 #endif

 /**
@@ -242,13 +250,12 @@ typedef struct utf8proc_property_struct {
   * @see utf8proc_decomp_type_t.
   */
  utf8proc_propval_t decomp_type;
-  utf8proc_uint16_t decomp_mapping;
-  utf8proc_uint16_t casefold_mapping;
-  utf8proc_int32_t uppercase_mapping;
-  utf8proc_int32_t lowercase_mapping;
-  utf8proc_int32_t titlecase_mapping;
-  utf8proc_int32_t comb1st_index;
-  utf8proc_int32_t comb2nd_index;
+  utf8proc_uint16_t decomp_seqindex;
+  utf8proc_uint16_t casefold_seqindex;
+  utf8proc_uint16_t uppercase_seqindex;
+  utf8proc_uint16_t lowercase_seqindex;
+  utf8proc_uint16_t titlecase_seqindex;
+  utf8proc_uint16_t comb_index;
  unsigned bidi_mirrored:1;
  unsigned comp_exclusion:1;
  /**
@@ -259,13 +266,14 @@ typedef struct utf8proc_property_struct {
   */
  unsigned ignorable:1;
  unsigned control_boundary:1;
+  /** The width of the codepoint. */
+  unsigned charwidth:2;
+  unsigned pad:2;
  /**
   * Boundclass.
   * @see utf8proc_boundclass_t.
   */
-  unsigned boundclass:4;
-  /** The width of the codepoint. */
-  unsigned charwidth:2;
+  unsigned boundclass:8;
 } utf8proc_property_t;

 /** Unicode categories. */
@@ -349,7 +357,7 @@ typedef enum {
  UTF8PROC_DECOMP_TYPE_COMPAT   = 16, /**< Compat */
 } utf8proc_decomp_type_t;

-/** Boundclass property. */
+/** Boundclass property. (TR29) */
 typedef enum {
  UTF8PROC_BOUNDCLASS_START              =  0, /**< Start */
  UTF8PROC_BOUNDCLASS_OTHER              =  1, /**< Other */
@@ -364,8 +372,21 @@ typedef enum {
  UTF8PROC_BOUNDCLASS_LVT                = 10, /**< Lvt */
  UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
  UTF8PROC_BOUNDCLASS_SPACINGMARK        = 12, /**< Spacingmark */
+  UTF8PROC_BOUNDCLASS_PREPEND            = 13, /**< Prepend */
+  UTF8PROC_BOUNDCLASS_ZWJ                = 14, /**< Zero Width Joiner */
+  UTF8PROC_BOUNDCLASS_E_BASE             = 15, /**< Emoji Base */
+  UTF8PROC_BOUNDCLASS_E_MODIFIER         = 16, /**< Emoji Modifier */
+  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ     = 17, /**< Glue_After_ZWJ */
+  UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZJW */
 } utf8proc_boundclass_t;

+/**
+ * Function pointer type passed to @ref utf8proc_map_custom and
+ * @ref utf8proc_decompose_custom, which is used to specify a user-defined
+ * mapping of codepoints to be applied in conjunction with other mappings.
+ */
+typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
+
 /**
 * Array containing the byte lengths of a UTF-8 encoded codepoint based
 * on the first byte.
@@ -473,6 +494,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
 * `buffer` (which must contain at least `bufsize` entries).  In case of
 * success, the number of codepoints written is returned; in case of an
 * error, a negative error code is returned (@ref utf8proc_errmsg).
+ * See @ref utf8proc_decompose_custom to supply additional transformations.
 *
 * If the number of written codepoints would be bigger than `bufsize`, the
 * required buffer size is returned, while the buffer will be overwritten with
@@ -484,8 +506,20 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
 );

 /**
- * Reencodes the sequence of `length` codepoints pointed to by `buffer`
- * UTF-8 data in-place (i.e., the result is also stored in `buffer`).
+ * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.  See also @ref utf8proc_map_custom.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
+/**
+ * Normalizes the sequence of `length` codepoints pointed to by `buffer`
+ * in-place (i.e., the result is also stored in `buffer`).
 *
 * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
 * @param length the length (in codepoints) of the buffer.
@@ -500,9 +534,37 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
 *                           the unicode versioning stability
 *
 * @return
- * In case of success, the length (in bytes) of the resulting UTF-8 string is
+ * In case of success, the length (in codepoints) of the normalized UTF-32 string is
 * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
 *
+ * @warning The entries of the array pointed to by `str` have to be in the
+ *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
+
+/**
+ * Reencodes the sequence of `length` codepoints pointed to by `buffer`
+ * UTF-8 data in-place (i.e., the result is also stored in `buffer`).
+ * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
+ *
+ * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
+ * @param length the length (in codepoints) of the buffer.
+ * @param options a bitwise or (`|`) of one or more of the following flags:
+ * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
+ * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
+ * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
+ * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
+ * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
+ *                           codepoints
+ * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
+ *                           the unicode versioning stability
+ * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
+ *
+ * @return
+ * In case of success, the length (in bytes) of the resulting nul-terminated
+ * UTF-8 string is returned; otherwise, a negative error code is returned
+ * (@ref utf8proc_errmsg).
+ *
 * @warning The amount of free space pointed to by `buffer` must
 *          exceed the amount of the input data by one byte, and the
 *          entries of the array pointed to by `str` have to be in the
@@ -513,8 +575,26 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 /**
 * Given a pair of consecutive codepoints, return whether a grapheme break is
 * permitted between them (as defined by the extended grapheme clusters in UAX#29).
+ *
+ * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
+ *              state to break graphemes. This state can be passed in as a pointer
+ *              in the `state` argument and should initially be set to 0. If the
+ *              state is not passed in (i.e. a null pointer is passed), UAX#29 rules
+ *              GB10/12/13 which require this state will not be applied, essentially
+ *              matching the rules in Unicode 8.0.0.
+ *
+ * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
+ *          be called IN ORDER on ALL potential breaks in a string.
 */
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
+    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
+
+/**
+ * Same as @ref utf8proc_grapheme_break_stateful, except without support for the
+ * Unicode 9 additions to the algorithm. Supported for legacy reasons.
+ */
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
+    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);


 /**
@@ -531,11 +611,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
 */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);

+/**
+ * Given a codepoint `c`, return the codepoint of the corresponding
+ * title-case character, if any; otherwise (if there is no title-case
+ * variant, or if `c` is not a valid codepoint) return `c`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
+
 /**
 * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
 * except that a width of 0 is returned for non-printable codepoints
 * instead of -1 as in `wcwidth`.
- * 
+ *
 * @note
 * If you want to check for particular types of non-printable characters,
 * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
@@ -563,7 +650,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
 * in any case the result will be NULL terminated (though it might
 * contain NULL characters with the string if `str` contained NULL
 * characters). Other flags in the `options` field are passed to the
- * functions defined above, and regarded as described.
+ * functions defined above, and regarded as described.  See also
+ * @ref utfproc_map_custom to supply a custom codepoint transformation.
 *
 * In case of success the length of the new string is returned,
 * otherwise a negative error code is returned.
@@ -575,6 +663,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
 );

+/**
+ * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
 /** @name Unicode normalization
 *
 * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
@@ -587,9 +686,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
 /** NFC normalization (@ref UTF8PROC_COMPOSE). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 /** @} */

@@ -597,5 +696,4 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 }
 #endif

-#endif
-
+#endif
--- a/src/utf8proc/utf8proc_data.c
+++ b/src/utf8proc/utf8proc_data.c