commit 0643a64479958151a822827e38a058b0198e8cee
parent 6f7d73071afc272dd07b06e72cf2c489402c13e7
Author: Steven G. Johnson <stevenj@mit.edu>
Date: Mon, 23 Nov 2020 14:10:29 -0500
Fix grapheme breaks on string-initial (#205)
* Fix extended emoji + zwj combo
* Patch initial repeated regional flags and extended+zwj emoji
* Merge conditions for setting breaks bt region
* updated fix
* perform tests for both utf8proc_map and manual calls to utf8proc_grapheme_break_stateful
* consolidate tests
Co-authored-by: Thomas Marks <marksta@umich.edu>
Diffstat:
2 files changed, 101 insertions(+), 58 deletions(-)
diff --git a/test/graphemetest.c b/test/graphemetest.c
@@ -1,74 +1,107 @@
#include "tests.h"
+/* check one line in the format of GraphemeBreakTest.txt: parse it into src as UTF-8 with '/' marking each expected grapheme break, then verify those breaks via utf8proc_map and via utf8proc_grapheme_break_stateful; if verbose, print the "passed" message at the end */
+void checkline(const char *_buf, bool verbose) {
+ size_t bi = 0, si = 0; /* bi indexes buf (input text), si indexes src (parsed output) */
+ utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests (writes are not bounds-checked) */
+ const unsigned char *buf = (const unsigned char *) _buf;
+
+ while (buf[bi]) {
+ bi = skipspaces(buf, bi);
+ if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
+ src[si++] = '/';
+ bi += 2;
+ }
+ else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
+ bi += 2;
+ }
+ else if (buf[bi] == '#') { /* start of comments */
+ break;
+ }
+ else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */
+ src[si++] = '/';
+ bi += 1;
+ }
+ else { /* hex-encoded codepoint */
+ size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
+ while (src[si]) ++si; /* advance to NUL termination */
+ bi += len;
+ }
+ }
+ if (si && src[si-1] == '/')
+ --si; /* no break after final grapheme */
+ src[si] = 0; /* NUL-terminate */
+
+ if (si) { /* test utf8proc_map */
+ utf8proc_uint8_t utf8[1024]; /* copy of src without the '/' grapheme separators */
+ size_t i = 0, j = 0;
+ utf8proc_ssize_t glen, k;
+ utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
+ while (i < si) {
+ if (src[i] != '/')
+ utf8[j++] = src[i++];
+ else
+ i++;
+ }
+ glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
+ if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
+ /* the test file contains surrogate codepoints, which are only for UTF-16 */
+ printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
+ }
+ else {
+ check(glen >= 0, "utf8proc_map error = %s",
+ utf8proc_errmsg(glen));
+ for (k = 0; k <= glen; ++k)
+ if (g[k] == 0xff)
+ g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
+ check(!strcmp((char*)g, (char*)src),
+ "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
+ }
+ free(g);
+ }
+
+ if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
+ utf8proc_int32_t state = 0, prev_codepoint = 0; /* prev_codepoint == 0 means no codepoint has been seen yet */
+ size_t i = 0;
+ utf8proc_bool expectbreak = false;
+ do {
+ utf8proc_int32_t codepoint;
+ i += utf8proc_iterate(src + i, si - i, &codepoint);
+ check(codepoint >= 0, "invalid UTF-8 data");
+ if (codepoint == 0x002F) /* '/' = expected-break marker written into src by the parsing loop above */
+ expectbreak = true;
+ else {
+ if (prev_codepoint != 0) {
+ check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
+ "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
+ }
+ expectbreak = false;
+ prev_codepoint = codepoint;
+ }
+ } while (i < si);
+ }
+
+ if (verbose)
+ printf("passed grapheme test: \"%s\"\n", (char*) src);
+}
+
int main(int argc, char **argv)
{
unsigned char buf[8192];
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
- utf8proc_uint8_t src[1024];
check(f != NULL, "error opening GraphemeBreakTest.txt");
while (simple_getline(buf, f) > 0) {
- size_t bi = 0, si = 0;
- lineno += 1;
-
- if (lineno % 100 == 0)
+ if ((++lineno) % 100 == 0)
printf("checking line %zd...\n", lineno);
-
if (buf[0] == '#') continue;
-
- while (buf[bi]) {
- bi = skipspaces(buf, bi);
- if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
- src[si++] = '/';
- bi += 2;
- }
- else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
- bi += 2;
- }
- else if (buf[bi] == '#') { /* start of comments */
- break;
- }
- else { /* hex-encoded codepoint */
- size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
- while (src[si]) ++si; /* advance to NUL termination */
- bi += len;
- }
- }
- if (si && src[si-1] == '/')
- --si; /* no break after final grapheme */
- src[si] = 0; /* NUL-terminate */
-
- if (si) {
- utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
- size_t i = 0, j = 0;
- utf8proc_ssize_t glen, k;
- utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
- while (i < si) {
- if (src[i] != '/')
- utf8[j++] = src[i++];
- else
- i++;
- }
- glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
- if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
- /* the test file contains surrogate codepoints, which are only for UTF-16 */
- printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
- }
- else {
- check(glen >= 0, "utf8proc_map error = %s",
- utf8proc_errmsg(glen));
- for (k = 0; k <= glen; ++k)
- if (g[k] == 0xff)
- g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
- check(!strcmp((char*)g, (char*)src),
- "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
- }
- free(g);
- }
+ checkline((char *) buf, false);
}
fclose(f);
printf("Passed tests after %zd lines!\n", lineno);
+ printf("Performing regression tests...\n");
+
/* issue 144 */
{
utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
@@ -80,5 +113,12 @@ int main(int argc, char **argv)
free(g);
};
+ /* https://github.com/JuliaLang/julia/issues/37680 */
+ checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */
+ checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
+ checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
+
+ printf("Passed regression tests!\n");
+
return 0;
}
diff --git a/utf8proc.c b/utf8proc.c
@@ -290,8 +290,11 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
- int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
- ? *state : lbc);
+ int lbc_override;
+ if (*state == UTF8PROC_BOUNDCLASS_START)
+ *state = lbc_override = lbc;
+ else
+ lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
if (state) {
// Special support for GB 12/13 made possible by GB999. After two RI