utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

graphemetest.c (5339B)


      1#include "tests.h"
      2
      3/* check one line in the format of GraphemeBreakTest.txt */
      4void checkline(const char *_buf, bool verbose) {
      5    size_t bi = 0, si = 0;
      6    utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests */
      7    const unsigned char *buf = (const unsigned char *) _buf;
      8
      9    while (buf[bi]) {
     10        bi = skipspaces(buf, bi);
     11        if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
     12            src[si++] = '/';
     13            bi += 2;
     14        }
     15        else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
     16            bi += 2;
     17        }
     18        else if (buf[bi] == '#') { /* start of comments */
     19            break;
     20        }
     21        else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */
     22            src[si++] = '/';
     23            bi += 1;
     24        }
     25        else { /* hex-encoded codepoint */
     26            size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
     27            while (src[si]) ++si; /* advance to NUL termination */
     28            bi += len;
     29        }
     30    }
     31    if (si && src[si-1] == '/')
     32        --si; /* no break after final grapheme */
     33    src[si] = 0; /* NUL-terminate */
     34
     35    if (si) { /* test utf8proc_map */
     36        utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
     37        size_t i = 0, j = 0;
     38        utf8proc_ssize_t glen, k;
     39        utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
     40        while (i < si) {
     41            if (src[i] != '/')
     42                utf8[j++] = src[i++];
     43            else
     44                i++;
     45        }
     46        glen = utf8proc_map(utf8, (utf8proc_ssize_t)j, &g, UTF8PROC_CHARBOUND);
     47        if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
     48            /* the test file contains surrogate codepoints, which are only for UTF-16 */
     49            printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
     50        }
     51        else {
     52            check(glen >= 0, "utf8proc_map error = %s",
     53                utf8proc_errmsg(glen));
     54            for (k = 0; k <= glen; ++k)
     55                if (g[k] == 0xff)
     56                    g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
     57            check(!strcmp((char*)g, (char*)src),
     58                "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
     59        }
     60        free(g);
     61    }
     62
     63    if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
     64        utf8proc_int32_t state = 0, prev_codepoint = 0;
     65        size_t i = 0;
     66        utf8proc_bool expectbreak = false;
     67        do {
     68            utf8proc_int32_t codepoint;
     69            i += (size_t)utf8proc_iterate(src + i, (utf8proc_ssize_t)(si - i), &codepoint);
     70            check(codepoint >= 0, "invalid UTF-8 data");
     71            if (codepoint == 0x002F)
     72                expectbreak = true;
     73            else {
     74                if (prev_codepoint != 0) {
     75                    check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
     76                          "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
     77                }
     78                expectbreak = false;
     79                prev_codepoint = codepoint;
     80            }
     81        } while (i < si);
     82    }
     83
     84    if (verbose)
     85        printf("passed grapheme test: \"%s\"\n", (char*) src);
     86}
     87
     88int main(int argc, char **argv)
     89{
     90    unsigned char buf[8192];
     91    FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
     92
     93    check(f != NULL, "error opening GraphemeBreakTest.txt");
     94    while (simple_getline(buf, f) > 0) {
     95        if ((++lineno) % 100 == 0)
     96            printf("checking line %zd...\n", lineno);
     97        if (buf[0] == '#') continue;
     98        checkline((char *) buf, false);
     99    }
    100    fclose(f);
    101    printf("Passed tests after %zd lines!\n", lineno);
    102
    103    printf("Performing regression tests...\n");
    104
    105    /* issue 144 */
    106    {
    107        utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
    108        utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */
    109        utf8proc_ssize_t glen;
    110        utf8proc_uint8_t *g;
    111        glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND);
    112        check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks");
    113        check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks");
    114        free(g);
    115    };
    116
    117    /* https://github.com/JuliaLang/julia/issues/37680 */
    118    checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */
    119    checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
    120    checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
    121
    122    /* more GB9c tests */
    123    checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true);
    124    checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true);
    125    checkline("/ 0915 0300 0300 / 0924 / 0915 /", true);
    126    checkline("/ 0915 0300 094d 0300 / 0078 /", true);
    127    checkline("/ 0300 094d 0300 / 0924 / 0915 /", true);
    128
    129    check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
    130    check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");
    131
    132    printf("Passed regression tests!\n");
    133
    134    return 0;
    135}