[strings] reverting to utf8proc v1.3.1, as 2.0 and above can chop off certain sequences

2017-01-01 20:03:23 -05:00
parent fe88630f78
commit 5c56a44faa
3 changed files with 15340 additions and 13186 deletions
--- a/src/utf8proc/utf8proc.c
+++ b/src/utf8proc/utf8proc.c
@@ -1,6 +1,5 @@
 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
 /*
 *  Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
 *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a
@@ -128,7 +127,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
  if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
  if (uc < 0xe0) {         // 2-byte sequence
     // Must have valid continuation character
-     if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
+     if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
     *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
     return 2;
  }
@@ -166,24 +165,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
  if (uc < 0x00) {
    return 0;
  } else if (uc < 0x80) {
-    dst[0] = (utf8proc_uint8_t) uc;
+    dst[0] = uc;
    return 1;
  } else if (uc < 0x800) {
-    dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
+    dst[0] = 0xC0 + (uc >> 6);
-    dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
+    dst[1] = 0x80 + (uc & 0x3F);
    return 2;
  // Note: we allow encoding 0xd800-0xdfff here, so as not to change
  // the API, however, these are actually invalid in UTF-8
  } else if (uc < 0x10000) {
-    dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
+    dst[0] = 0xE0 + (uc >> 12);
-    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+    dst[1] = 0x80 + ((uc >> 6) & 0x3F);
-    dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
+    dst[2] = 0x80 + (uc & 0x3F);
    return 3;
  } else if (uc < 0x110000) {
-    dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
+    dst[0] = 0xF0 + (uc >> 18);
-    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
+    dst[1] = 0x80 + ((uc >> 12) & 0x3F);
-    dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+    dst[2] = 0x80 + ((uc >> 6) & 0x3F);
-    dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
+    dst[3] = 0x80 + (uc & 0x3F);
    return 4;
  } else return 0;
 }
@@ -193,28 +192,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
   if (uc < 0x00) {
      return 0;
   } else if (uc < 0x80) {
-      dst[0] = (utf8proc_uint8_t)uc;
+      dst[0] = uc;
      return 1;
   } else if (uc < 0x800) {
-      dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
+      dst[0] = 0xC0 + (uc >> 6);
-      dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
+      dst[1] = 0x80 + (uc & 0x3F);
      return 2;
   } else if (uc == 0xFFFF) {
-       dst[0] = (utf8proc_uint8_t)0xFF;
+       dst[0] = 0xFF;
       return 1;
   } else if (uc == 0xFFFE) {
-       dst[0] = (utf8proc_uint8_t)0xFE;
+       dst[0] = 0xFE;
       return 1;
   } else if (uc < 0x10000) {
-      dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
+      dst[0] = 0xE0 + (uc >> 12);
-      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+      dst[1] = 0x80 + ((uc >> 6) & 0x3F);
-      dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
+      dst[2] = 0x80 + (uc & 0x3F);
      return 3;
   } else if (uc < 0x110000) {
-      dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
+      dst[0] = 0xF0 + (uc >> 18);
-      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
+      dst[1] = 0x80 + ((uc >> 12) & 0x3F);
-      dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+      dst[2] = 0x80 + ((uc >> 6) & 0x3F);
-      dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
+      dst[3] = 0x80 + (uc & 0x3F);
      return 4;
   } else return 0;
 }
@@ -233,144 +232,48 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
 }
-/* return whether there is a grapheme break between boundclasses lbc and tbc
+/* return whether there is a grapheme break between boundclasses lbc and tbc */
-   (according to the definition of extended grapheme clusters)
+static utf8proc_bool grapheme_break(int lbc, int tbc) {
  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
  http://www.unicode.org/reports/tr29/tr29-29.html
  CAVEATS:
   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
   and GB 12/13 (regional indicator code points) require knowledge of previous characters
   and are thus not handled by this function. This may result in an incorrect break before
   an E_Modifier class codepoint and an incorrectly missing break between two
   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
 */
 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
  return 
-    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       // GB1
+    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
-    (lbc == UTF8PROC_BOUNDCLASS_CR &&                 // GB3
+    (lbc == UTF8PROC_BOUNDCLASS_CR &&
-     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         // ---
+     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
-    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4
+    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5
+    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-    (lbc == UTF8PROC_BOUNDCLASS_L &&                  // GB6
+    (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
-     (tbc == UTF8PROC_BOUNDCLASS_L ||                 // ---
+    (lbc == UTF8PROC_BOUNDCLASS_L &&
-      tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
+     (tbc == UTF8PROC_BOUNDCLASS_L ||
-      tbc == UTF8PROC_BOUNDCLASS_LV ||                // ---
+      tbc == UTF8PROC_BOUNDCLASS_V ||
-      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      // ---
+      tbc == UTF8PROC_BOUNDCLASS_LV ||
-    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                // GB7
+      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
-      lbc == UTF8PROC_BOUNDCLASS_V) &&                // ---
+    ((lbc == UTF8PROC_BOUNDCLASS_LV ||
-     (tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
+      lbc == UTF8PROC_BOUNDCLASS_V) &&
-      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        // ---
+     (tbc == UTF8PROC_BOUNDCLASS_V ||
-    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               // GB8
+      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
-      lbc == UTF8PROC_BOUNDCLASS_T) &&                // ---
+    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
-     tbc == UTF8PROC_BOUNDCLASS_T) ? false :          // ---
+      lbc == UTF8PROC_BOUNDCLASS_T) &&
-    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             // GB9
+     tbc == UTF8PROC_BOUNDCLASS_T) ? false :
-     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // ---
+    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
-     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a
+     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
-     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b
+    (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
    ((lbc == UTF8PROC_BOUNDCLASS_E_BASE ||            // GB10 (requires additional handling below)
      lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&       // ----
     tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
    (lbc == UTF8PROC_BOUNDCLASS_ZWJ &&                         // GB11
     (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ ||             // ----
      tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false :        // ----
    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below)
     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ----
    true; // GB999
 }
-static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
+/* return whether there is a grapheme break between codepoints c1 and c2 */
-{
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
-  int lbc_override = lbc;
+  return grapheme_break(utf8proc_get_property(c1)->boundclass,
-  if (state && *state != UTF8PROC_BOUNDCLASS_START)
+                        utf8proc_get_property(c2)->boundclass);
    lbc_override = *state;
  utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
  if (state) {
    // Special support for GB 12/13 made possible by GB999. After two RI
    // class codepoints we want to force a break. Do this by resetting the
    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
    // after that character according to GB999 (unless of course such a break is
    // forbidden by a different rule such as GB9).
    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
      *state = UTF8PROC_BOUNDCLASS_OTHER;
    // Special support for GB10. Fold any EXTEND codepoints into the previous
    // boundclass if we're dealing with an emoji base boundclass.
    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE      ||
              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
             tbc == UTF8PROC_BOUNDCLASS_EXTEND)
      *state = UTF8PROC_BOUNDCLASS_E_BASE;
    else
      *state = tbc;
  }
  return break_permitted;
 }
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
                                 utf8proc_get_property(c2)->boundclass,
                                 state);
 }
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
  return utf8proc_grapheme_break_stateful(c1, c2, NULL);
 }
 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
 {
  utf8proc_int32_t entry_cp = **entry;
  if ((entry_cp & 0xF800) == 0xD800) {
    *entry = *entry + 1;
    entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
    entry_cp += 0x10000;
  }
  return entry_cp;
 }
 static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
 {
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
  return seqindex_decode_entry(&entry);
 }
 static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  utf8proc_ssize_t written = 0;
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
  int len = seqindex >> 13;
  if (len >= 7) {
    len = *entry;
    entry++;
  }
  for (; len >= 0; entry++, len--) {
    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
    written += utf8proc_decompose_char(entry_cp, dst+written,
      (bufsize > written) ? (bufsize - written) : 0, options,
    last_boundclass);
    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
  }
  return written;
 }
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
 {
-  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
+  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
-  return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
+  return cl >= 0 ? cl : c;
 }
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
 {
-  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
+  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
-  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
+  return cu >= 0 ? cu : c;
 }
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
 {
  utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
 }
 /* return a character width analogous to wcwidth (except portable and
@@ -453,20 +356,39 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
      category == UTF8PROC_CATEGORY_ME) return 0;
  }
  if (options & UTF8PROC_CASEFOLD) {
-    if (property->casefold_seqindex != UINT16_MAX) {
+    if (property->casefold_mapping) {
-      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
+      const utf8proc_int32_t *casefold_entry;
      utf8proc_ssize_t written = 0;
      for (casefold_entry = property->casefold_mapping;
          *casefold_entry >= 0; casefold_entry++) {
        written += utf8proc_decompose_char(*casefold_entry, dst+written,
          (bufsize > written) ? (bufsize - written) : 0, options,
          last_boundclass);
        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
      }
      return written;
    }
  }
  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
-    if (property->decomp_seqindex != UINT16_MAX &&
+    if (property->decomp_mapping &&
        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
-      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
+      const utf8proc_int32_t *decomp_entry;
      utf8proc_ssize_t written = 0;
      for (decomp_entry = property->decomp_mapping;
          *decomp_entry >= 0; decomp_entry++) {
        written += utf8proc_decompose_char(*decomp_entry, dst+written,
          (bufsize > written) ? (bufsize - written) : 0, options,
        last_boundclass);
        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
      }
      return written;
    }
  }
  if (options & UTF8PROC_CHARBOUND) {
    utf8proc_bool boundary;
    int tbc = property->boundclass;
-    boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
+    boundary = grapheme_break(*last_boundclass, tbc);
    *last_boundclass = tbc;
    if (boundary) {
      if (bufsize >= 1) dst[0] = 0xFFFF;
      if (bufsize >= 2) dst[1] = uc;
@@ -480,14 +402,6 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
 ) {
    return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
 }
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
 ) {
  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
  utf8proc_ssize_t wpos = 0;
@@ -514,9 +428,6 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
      }
      if (custom_func != NULL) {
        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
      }
      decomp_result = utf8proc_decompose_char(
        uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
        &boundclass
@@ -524,8 +435,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
      if (decomp_result < 0) return decomp_result;
      wpos += decomp_result;
      /* prohibiting integer overflows due to too long strings: */
-      if (wpos < 0 ||
+      if (wpos < 0 || wpos > SSIZE_MAX/sizeof(utf8proc_int32_t)/2)
          wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
        return UTF8PROC_ERROR_OVERFLOW;
    }
  }
@@ -551,8 +461,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  return wpos;
 }
-UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
-  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
+  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
     ASSERT: 'buffer' has one spare byte of free space at the end! */
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
    utf8proc_ssize_t rpos;
    utf8proc_ssize_t wpos = 0;
@@ -625,19 +536,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
        if (!starter_property) {
          starter_property = unsafe_get_property(*starter);
        }
-        if (starter_property->comb_index < 0x8000 &&
+        if (starter_property->comb1st_index >= 0 &&
-            current_property->comb_index != UINT16_MAX &&
+            current_property->comb2nd_index >= 0) {
-            current_property->comb_index >= 0x8000) {
+          composition = utf8proc_combinations[
-          int sidx = starter_property->comb_index;
+            starter_property->comb1st_index +
-          int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
+            current_property->comb2nd_index
-          if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
+          ];
-            idx += sidx + 2;
+          if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
            if (current_property->comb_index & 0x4000) {
              composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
            } else
              composition = utf8proc_combinations[idx];
            if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
              !(unsafe_get_property(composition)->comp_exclusion))) {
            *starter = composition;
            starter_property = NULL;
@@ -645,7 +550,6 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
          }
        }
      }
      }
      buffer[wpos] = current_char;
      if (current_property->combining_class) {
        if (current_property->combining_class > max_combining_class) {
@@ -660,14 +564,6 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
    }
    length = wpos;
  }
  return length;
 }
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
     ASSERT: 'buffer' has one spare byte of free space at the end! */
  length = utf8proc_normalize_utf32(buffer, length, options);
  if (length < 0) return length;
  {
    utf8proc_ssize_t rpos, wpos = 0;
    utf8proc_int32_t uc;
@@ -689,22 +585,15 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
 ) {
    return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
 }
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
 ) {
  utf8proc_int32_t *buffer;
  utf8proc_ssize_t result;
  *dstptr = NULL;
-  result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
+  result = utf8proc_decompose(str, strlen, NULL, 0, options);
  if (result < 0) return result;
  buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
  if (!buffer) return UTF8PROC_ERROR_NOMEM;
-  result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
+  result = utf8proc_decompose(str, strlen, buffer, result, options);
  if (result < 0) {
    free(buffer);
    return result;
@@ -750,3 +639,4 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  return retval;
 }
--- a/src/utf8proc/utf8proc.h
+++ b/src/utf8proc/utf8proc.h
@@ -1,5 +1,4 @@
 /*
 * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@@ -28,7 +27,7 @@
 * utf8proc is a free/open-source (MIT/expat licensed) C library
 * providing Unicode normalization, case-folding, and other operations
 * for strings in the UTF-8 encoding, supporting Unicode version
- * 8.0.0.  See the utf8proc home page (http://julialang.org/utf8proc/)
+ * 7.0.0.  See the utf8proc home page (http://julialang.org/utf8proc/)
 * for downloads and other information, or the source code on github
 * (https://github.com/JuliaLang/utf8proc).
 *
@@ -63,23 +62,21 @@
 * runtime version may append a string like "-dev" to the version number
 * for prerelease versions.
 *
- * @note The shared-library version number in the Makefile
+ * @note The shared-library version number in the Makefile may be different,
 *       (and CMakeLists.txt, and MANIFEST) may be different,
 *       being based on ABI compatibility rather than API compatibility.
 */
 /** @{ */
 /** The MAJOR version number (increased when backwards API compatibility is broken). */
-#define UTF8PROC_VERSION_MAJOR 2
+#define UTF8PROC_VERSION_MAJOR 1
 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 1
+#define UTF8PROC_VERSION_MINOR 3
 /** The PATCH version (increased for fixes that do not change the API). */
 #define UTF8PROC_VERSION_PATCH 0
 /** @} */
 #include <stdlib.h>
 #include <sys/types.h>
-#if defined(_MSC_VER) && _MSC_VER < 1800
+#ifdef _MSC_VER
 // MSVC prior to 2013 lacked stdbool.h and inttypes.h
 typedef signed char utf8proc_int8_t;
 typedef unsigned char utf8proc_uint8_t;
 typedef short utf8proc_int16_t;
@@ -94,18 +91,12 @@ typedef int utf8proc_ssize_t;
 typedef unsigned int utf8proc_size_t;
 #  endif
 #  ifndef __cplusplus
 // emulate C99 bool
 typedef unsigned char utf8proc_bool;
-#    ifndef __bool_true_false_are_defined
+enum {false, true};
 #      define false 0
 #      define true 1
 #      define __bool_true_false_are_defined 1
 #    endif
 #  else
 typedef bool utf8proc_bool;
 #  endif
 #else
 #  include <stddef.h>
 #  include <stdbool.h>
 #  include <inttypes.h>
 typedef int8_t utf8proc_int8_t;
@@ -115,7 +106,7 @@ typedef uint16_t utf8proc_uint16_t;
 typedef int32_t utf8proc_int32_t;
 typedef uint32_t utf8proc_uint32_t;
 typedef size_t utf8proc_size_t;
-typedef ptrdiff_t utf8proc_ssize_t;
+typedef ssize_t utf8proc_ssize_t;
 typedef bool utf8proc_bool;
 #endif
 #include <limits.h>
@@ -140,10 +131,6 @@ extern "C" {
 #define SSIZE_MAX ((size_t)SIZE_MAX/2)
 #endif
 #ifndef UINT16_MAX
 #  define UINT16_MAX 65535U
 #endif
 /**
 * Option flags used by several functions in the library.
 */
@@ -250,12 +237,13 @@ typedef struct utf8proc_property_struct {
   * @see utf8proc_decomp_type_t.
   */
  utf8proc_propval_t decomp_type;
-  utf8proc_uint16_t decomp_seqindex;
+  const utf8proc_int32_t *decomp_mapping;
-  utf8proc_uint16_t casefold_seqindex;
+  const utf8proc_int32_t *casefold_mapping;
-  utf8proc_uint16_t uppercase_seqindex;
+  utf8proc_int32_t uppercase_mapping;
-  utf8proc_uint16_t lowercase_seqindex;
+  utf8proc_int32_t lowercase_mapping;
-  utf8proc_uint16_t titlecase_seqindex;
+  utf8proc_int32_t titlecase_mapping;
-  utf8proc_uint16_t comb_index;
+  utf8proc_int32_t comb1st_index;
  utf8proc_int32_t comb2nd_index;
  unsigned bidi_mirrored:1;
  unsigned comp_exclusion:1;
  /**
@@ -266,14 +254,13 @@ typedef struct utf8proc_property_struct {
   */
  unsigned ignorable:1;
  unsigned control_boundary:1;
  /** The width of the codepoint. */
  unsigned charwidth:2;
  unsigned pad:2;
  /**
   * Boundclass.
   * @see utf8proc_boundclass_t.
   */
-  unsigned boundclass:8;
+  unsigned boundclass:4;
  /** The width of the codepoint. */
  unsigned charwidth:2;
 } utf8proc_property_t;
 /** Unicode categories. */
@@ -357,7 +344,7 @@ typedef enum {
  UTF8PROC_DECOMP_TYPE_COMPAT   = 16, /**< Compat */
 } utf8proc_decomp_type_t;
-/** Boundclass property. (TR29) */
+/** Boundclass property. */
 typedef enum {
  UTF8PROC_BOUNDCLASS_START              =  0, /**< Start */
  UTF8PROC_BOUNDCLASS_OTHER              =  1, /**< Other */
@@ -372,21 +359,8 @@ typedef enum {
  UTF8PROC_BOUNDCLASS_LVT                = 10, /**< Lvt */
  UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
  UTF8PROC_BOUNDCLASS_SPACINGMARK        = 12, /**< Spacingmark */
  UTF8PROC_BOUNDCLASS_PREPEND            = 13, /**< Prepend */
  UTF8PROC_BOUNDCLASS_ZWJ                = 14, /**< Zero Width Joiner */
  UTF8PROC_BOUNDCLASS_E_BASE             = 15, /**< Emoji Base */
  UTF8PROC_BOUNDCLASS_E_MODIFIER         = 16, /**< Emoji Modifier */
  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ     = 17, /**< Glue_After_ZWJ */
  UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZJW */
 } utf8proc_boundclass_t;
 /**
 * Function pointer type passed to @ref utf8proc_map_custom and
 * @ref utf8proc_decompose_custom, which is used to specify a user-defined
 * mapping of codepoints to be applied in conjunction with other mappings.
 */
 typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
 /**
 * Array containing the byte lengths of a UTF-8 encoded codepoint based
 * on the first byte.
@@ -494,7 +468,6 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
 * `buffer` (which must contain at least `bufsize` entries).  In case of
 * success, the number of codepoints written is returned; in case of an
 * error, a negative error code is returned (@ref utf8proc_errmsg).
 * See @ref utf8proc_decompose_custom to supply additional transformations.
 *
 * If the number of written codepoints would be bigger than `bufsize`, the
 * required buffer size is returned, while the buffer will be overwritten with
@@ -505,47 +478,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
 );
 /**
 * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
 * that is called on each codepoint in `str` before any other transformations
 * (along with a `custom_data` pointer that is passed through to `custom_func`).
 * The `custom_func` argument is ignored if it is `NULL`.  See also @ref utf8proc_map_custom.
 */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
 );
 /**
 * Normalizes the sequence of `length` codepoints pointed to by `buffer`
 * in-place (i.e., the result is also stored in `buffer`).
 *
 * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
 * @param length the length (in codepoints) of the buffer.
 * @param options a bitwise or (`|`) of one or more of the following flags:
 * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
 * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
 * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
 * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
 * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
 *                           codepoints
 * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
 *                           the unicode versioning stability
 *
 * @return
 * In case of success, the length (in codepoints) of the normalized UTF-32 string is
 * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
 *
 * @warning The entries of the array pointed to by `str` have to be in the
 *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
 */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
 /**
 * Reencodes the sequence of `length` codepoints pointed to by `buffer`
 * UTF-8 data in-place (i.e., the result is also stored in `buffer`).
 * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
 *
 * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
 * @param length the length (in codepoints) of the buffer.
@@ -558,12 +493,10 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
 *                           codepoints
 * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
 *                           the unicode versioning stability
 * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
 *
 * @return
- * In case of success, the length (in bytes) of the resulting nul-terminated
+ * In case of success, the length (in bytes) of the resulting UTF-8 string is
- * UTF-8 string is returned; otherwise, a negative error code is returned
+ * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
 * (@ref utf8proc_errmsg).
 *
 * @warning The amount of free space pointed to by `buffer` must
 *          exceed the amount of the input data by one byte, and the
@@ -575,26 +508,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 /**
 * Given a pair of consecutive codepoints, return whether a grapheme break is
 * permitted between them (as defined by the extended grapheme clusters in UAX#29).
 *
 * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
 *              state to break graphemes. This state can be passed in as a pointer
 *              in the `state` argument and should initially be set to 0. If the
 *              state is not passed in (i.e. a null pointer is passed), UAX#29 rules
 *              GB10/12/13 which require this state will not be applied, essentially
 *              matching the rules in Unicode 8.0.0.
 *
 * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
 *          be called IN ORDER on ALL potential breaks in a string.
 */
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
 /**
 * Same as @ref utf8proc_grapheme_break_stateful, except without support for the
 * Unicode 9 additions to the algorithm. Supported for legacy reasons.
 */
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
 /**
@@ -611,13 +526,6 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
 */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
 /**
 * Given a codepoint `c`, return the codepoint of the corresponding
 * title-case character, if any; otherwise (if there is no title-case
 * variant, or if `c` is not a valid codepoint) return `c`.
 */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
 /**
 * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
 * except that a width of 0 is returned for non-printable codepoints
@@ -650,8 +558,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
 * in any case the result will be NULL terminated (though it might
 * contain NULL characters with the string if `str` contained NULL
 * characters). Other flags in the `options` field are passed to the
- * functions defined above, and regarded as described.  See also
+ * functions defined above, and regarded as described.
 * @ref utfproc_map_custom to supply a custom codepoint transformation.
 *
 * In case of success the length of the new string is returned,
 * otherwise a negative error code is returned.
@@ -663,17 +570,6 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
 );
 /**
 * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
 * that is called on each codepoint in `str` before any other transformations
 * (along with a `custom_data` pointer that is passed through to `custom_func`).
 * The `custom_func` argument is ignored if it is `NULL`.
 */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
 );
 /** @name Unicode normalization
 *
 * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
@@ -686,9 +582,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
 /** NFC normalization (@ref UTF8PROC_COMPOSE). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
-/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
-/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 /** @} */
@@ -697,3 +593,4 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 #endif
 #endif
--- a/src/utf8proc/utf8proc_data.c
+++ b/src/utf8proc/utf8proc_data.c