Merge branch 'master' of https://github.com/JuliaLang/utf8proc - utf8proc - A clean C library for processing UTF-8 Unicode data

	utf8proc A clean C library for processing UTF-8 Unicode data
	git clone https://git.sinitax.com/juliastrings/utf8proc
	Log | Files | Refs | README | LICENSE | sfeed.txt

commit 94395db2821f15a97adaa18ea4dd10f84023af44
parent 3637d518558553f098697e2164d721aed575a075
Author: Steven G. Johnson <stevenj@alum.mit.edu>
Date:   Sat, 30 Mar 2019 16:04:48 -0400

Merge branch 'master' of https://github.com/JuliaLang/utf8proc

Diffstat:
M test/graphemetest.c  | 22 +++++++++++++++++-----
M utf8proc.c  | 18 ++++++++----------

2 files changed, 25 insertions(+), 15 deletions(-)
diff --git a/test/graphemetest.c b/test/graphemetest.c
@@ -7,17 +7,17 @@ int main(int argc, char **argv)
     FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
     utf8proc_uint8_t src[1024];
     int len;
-    
+
     check(f != NULL, "error opening GraphemeBreakTest.txt");
     while (getline(&buf, &bufsize, f) > 0) {
         size_t bi = 0, si = 0;
         lineno += 1;
-        
+
         if (lineno % 100 == 0)
             printf("checking line %zd...\n", lineno);
-        
+
         if (buf[0] == '#') continue;
-        
+
         while (buf[bi]) {
             bi = skipspaces(buf, bi);
             if (buf[bi] == '/') { /* grapheme break */
@@ -39,7 +39,7 @@ int main(int argc, char **argv)
         if (si && src[si-1] == '/')
             --si; /* no break after final grapheme */
         src[si] = 0; /* NUL-terminate */
-        
+
         if (si) {
             utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
             size_t i = 0, j = 0;
@@ -70,5 +70,17 @@ int main(int argc, char **argv)
     }
     fclose(f);
     printf("Passed tests after %zd lines!\n", lineno);
+
+    /* issue 144 */
+    {
+        utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
+        utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */
+        utf8proc_ssize_t glen;
+        utf8proc_uint8_t *g;
+        glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND);
+        check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks");
+        free(g);
+    };
+
     return 0;
 }
diff --git a/utf8proc.c b/utf8proc.c
@@ -196,9 +196,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
   } else return 0;
 }
 
-/* internal "unsafe" version that does not check whether uc is in range */
-static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
+/* internal version used for inserting 0xff bytes between graphemes */
+static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
    if (uc < 0x00) {
+      if (uc == -1) { /* internal value used for grapheme breaks */
+        dst[0] = (utf8proc_uint8_t)0xFF;
+        return 1;
+      }
       return 0;
    } else if (uc < 0x80) {
       dst[0] = (utf8proc_uint8_t)uc;
@@ -207,12 +211,6 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
       dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
       dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
       return 2;
-   } else if (uc == 0xFFFF) {
-       dst[0] = (utf8proc_uint8_t)0xFF;
-       return 1;
-   } else if (uc == 0xFFFE) {
-       dst[0] = (utf8proc_uint8_t)0xFE;
-       return 1;
    } else if (uc < 0x10000) {
       dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
       dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
@@ -480,7 +478,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
     int tbc = property->boundclass;
     boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
     if (boundary) {
-      if (bufsize >= 1) dst[0] = 0xFFFF;
+      if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
       if (bufsize >= 2) dst[1] = uc;
       return 2;
     }
@@ -686,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, 
     if (options & UTF8PROC_CHARBOUND) {
         for (rpos = 0; rpos < length; rpos++) {
             uc = buffer[rpos];
-            wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
+            wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
         }
     } else {
         for (rpos = 0; rpos < length; rpos++) {

M	test/graphemetest.c	|	22	+++++++++++++++++-----
M	utf8proc.c	|	18	++++++++----------