utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 4603e00cfc72c58056a18962ea43ebffaf89ec30
parent e76cebb784028d33e3bfd9fd0170e6835b9522b8
Author: Steven G. Johnson <stevenj@mit.edu>
Date:   Sat, 30 Mar 2019 15:22:25 -0400

fix CHARBOUND option for non-characters (#149)


Diffstat:
M test/graphemetest.c | 22 +++++++++++++++++-----
M utf8proc.c | 18 ++++++++----------
2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/test/graphemetest.c b/test/graphemetest.c @@ -7,17 +7,17 @@ int main(int argc, char **argv) FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL; utf8proc_uint8_t src[1024]; int len; - + check(f != NULL, "error opening GraphemeBreakTest.txt"); while (getline(&buf, &bufsize, f) > 0) { size_t bi = 0, si = 0; lineno += 1; - + if (lineno % 100 == 0) printf("checking line %zd...\n", lineno); - + if (buf[0] == '#') continue; - + while (buf[bi]) { bi = skipspaces(buf, bi); if (buf[bi] == '/') { /* grapheme break */ @@ -39,7 +39,7 @@ int main(int argc, char **argv) if (si && src[si-1] == '/') --si; /* no break after final grapheme */ src[si] = 0; /* NUL-terminate */ - + if (si) { utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ size_t i = 0, j = 0; @@ -70,5 +70,17 @@ int main(int argc, char **argv) } fclose(f); printf("Passed tests after %zd lines!\n", lineno); + + /* issue 144 */ + { + utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */ + utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */ + utf8proc_ssize_t glen; + utf8proc_uint8_t *g; + glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND); + check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks"); + free(g); + }; + return 0; } diff --git a/utf8proc.c b/utf8proc.c @@ -196,9 +196,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut } else return 0; } -/* internal "unsafe" version that does not check whether uc is in range */ -static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { +/* internal version used for inserting 0xff bytes between graphemes */ +static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { if (uc < 0x00) { + if (uc == -1) { /* internal value used for grapheme breaks */ + dst[0] = (utf8proc_uint8_t)0xFF; + return 1; + } return 0; } else if (uc < 
0x80) { dst[0] = (utf8proc_uint8_t)uc; @@ -207,12 +211,6 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 2; - } else if (uc == 0xFFFF) { - dst[0] = (utf8proc_uint8_t)0xFF; - return 1; - } else if (uc == 0xFFFE) { - dst[0] = (utf8proc_uint8_t)0xFE; - return 1; } else if (uc < 0x10000) { dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); @@ -480,7 +478,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, int tbc = property->boundclass; boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass); if (boundary) { - if (bufsize >= 1) dst[0] = 0xFFFF; + if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */ if (bufsize >= 2) dst[1] = uc; return 2; } @@ -686,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, if (options & UTF8PROC_CHARBOUND) { for (rpos = 0; rpos < length; rpos++) { uc = buffer[rpos]; - wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); + wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); } } else { for (rpos = 0; rpos < length; rpos++) {