libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 009498ac0fc3744a7bc5cc1afb5f601e445442be
parent d74e91e355c37eff0ac64b8ce0e18ef587a1d333
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 18 Oct 2020 22:20:31 +0200

Split test/test.c into three separate tests

The test-infrastructure needed a bit of preparation, but now it makes
sense to split the single test.c into its three parts, making it easier
to handle and reason about.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 10 +++++++---
A test/grapheme_break.c | 41 +++++++++++++++++++++++++++++++++++++++++
D test/test.c | 374 -------------------------------------------------------------------------------
A test/utf8-decode.c | 275 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/utf8-encode.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 415 insertions(+), 377 deletions(-)

diff --git a/Makefile b/Makefile @@ -5,7 +5,7 @@ include config.mk LIB = src/boundary src/codepoint src/grapheme -TEST = test/test +TEST = test/grapheme_break test/utf8-decode test/utf8-encode DATA = data/gbp data/emo data/gbt MAN3 = man/grapheme_bytelen.3 @@ -24,12 +24,16 @@ data/util.o: data/util.c config.mk data/util.h src/boundary.o: src/boundary.c config.mk data/emo.h data/gbp.h grapheme.h src/codepoint.o: src/codepoint.c config.mk grapheme.h src/grapheme.o: src/grapheme.c config.mk grapheme.h -test/test.o: test/test.c config.mk data/gbt.h grapheme.h +test/grapheme_break.o: test/grapheme_break.c config.mk data/gbt.h grapheme.h +test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h +test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h data/gbp: data/gbp.o data/util.o data/emo: data/emo.o data/util.o data/gbt: data/gbt.o data/util.o -test/test: test/test.o $(LIB:=.o) +test/grapheme_break: test/grapheme_break.o $(LIB:=.o) +test/utf8-encode: test/utf8-encode.o $(LIB:=.o) +test/utf8-decode: test/utf8-decode.o $(LIB:=.o) data/gbp.txt: wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt diff --git a/test/grapheme_break.c b/test/grapheme_break.c @@ -0,0 +1,41 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "../grapheme.h" +#include "../data/gbt.h" + +#define LEN(x) (sizeof(x) / sizeof(*x)) + +int +main(void) +{ + int state; + size_t i, j, k, len, failed; + + /* grapheme break test */ + for (i = 0, failed = 0; i < LEN(t); i++) { + for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { + if ((j + 1) == t[i].cplen || + grapheme_boundary(t[i].cp[j], t[i].cp[j + 1], + &state)) { + /* check if our resulting length matches */ + if (k == t[i].lenlen || len != t[i].len[k++]) { + fprintf(stderr, "Failed \"%s\"\n", + t[i].descr); + failed++; + break; + } + len = 1; + } else { + len++; + } + } + } + printf("Grapheme break test: Passed %zu out of %zu tests.\n", + LEN(t) - failed, LEN(t)); + + return (failed > 0) ? 1 : 0; +} diff --git a/test/test.c b/test/test.c @@ -1,374 +0,0 @@ -/* See LICENSE file for copyright and license details. */ -#include <stddef.h> -#include <stdint.h> -#include <stdio.h> -#include <string.h> - -#include "../grapheme.h" -#include "../data/gbt.h" - -#define LEN(x) (sizeof(x) / sizeof(*x)) - -static const struct { - uint32_t cp; /* input code point */ - uint8_t *exp_arr; /* expected UTF-8 byte sequence */ - size_t exp_len; /* expected length of UTF-8 sequence */ -} enc_test[] = { - { - /* invalid code point (UTF-16 surrogate half) */ - .cp = UINT32_C(0xD800), - .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, - .exp_len = 3, - }, - { - /* invalid code point (UTF-16-unrepresentable) */ - .cp = UINT32_C(0x110000), - .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, - .exp_len = 3, - }, - { - /* code point encoded to a 1-byte sequence */ - .cp = 0x01, - .exp_arr = (uint8_t[]){ 0x01 }, - .exp_len = 1, - }, - { - /* code point encoded to a 2-byte sequence */ - .cp = 0xFF, - .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, - .exp_len = 2, - }, - { - /* code point encoded to a 3-byte sequence */ - .cp = 0xFFF, - .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, - .exp_len = 3, - }, - { 
- /* code point encoded to a 4-byte sequence */ - .cp = UINT32_C(0xFFFFF), - .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, - .exp_len = 4, - }, -}; - -static const struct { - uint8_t *arr; /* UTF-8 byte sequence */ - size_t len; /* length of UTF-8 byte sequence */ - size_t exp_len; /* expected length returned */ - uint32_t exp_cp; /* expected code point returned */ -} dec_test[] = { - { - /* empty sequence - * [ ] -> - * INVALID - */ - .arr = NULL, - .len = 0, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid lead byte - * [ 11111101 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xFD }, - .len = 1, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* valid 1-byte sequence - * [ 00000001 ] -> - * 0000001 - */ - .arr = (uint8_t[]){ 0x01 }, - .len = 1, - .exp_len = 1, - .exp_cp = 0x1, - }, - { - /* valid 2-byte sequence - * [ 11000011 10111111 ] -> - * 00011111111 - */ - .arr = (uint8_t[]){ 0xC3, 0xBF }, - .len = 2, - .exp_len = 2, - .exp_cp = 0xFF, - }, - { - /* invalid 2-byte sequence (second byte missing) - * [ 11000011 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xC3 }, - .len = 1, - .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 2-byte sequence (second byte malformed) - * [ 11000011 11111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xC3, 0xFF }, - .len = 2, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 2-byte sequence (overlong encoded) - * [ 11000001 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xC1, 0xBF }, - .len = 2, - .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* valid 3-byte sequence - * [ 11100000 10111111 10111111 ] -> - * 0000111111111111 - */ - .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, - .len = 3, - .exp_len = 3, - .exp_cp = 0xFFF, - }, - { - /* invalid 3-byte sequence (second byte missing) - * [ 11100000 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0 }, - .len = 1, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte 
sequence (second byte malformed) - * [ 11100000 01111111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, - .len = 3, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (third byte missing) - * [ 11100000 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0, 0xBF }, - .len = 2, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (third byte malformed) - * [ 11100000 10111111 01111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, - .len = 3, - .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (overlong encoded) - * [ 11100000 10011111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, - .len = 3, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (UTF-16 surrogate half) - * [ 11101101 10100000 10000000 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, - .len = 3, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* valid 4-byte sequence - * [ 11110011 10111111 10111111 10111111 ] -> - * 011111111111111111111 - */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, - .len = 4, - .exp_len = 4, - .exp_cp = UINT32_C(0xFFFFF), - }, - { - /* invalid 4-byte sequence (second byte missing) - * [ 11110011 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3 }, - .len = 1, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (second byte malformed) - * [ 11110011 01111111 10111111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, - .len = 4, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (third byte missing) - * [ 11110011 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0xBF }, - .len = 2, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (third byte malformed) - * [ 11110011 10111111 01111111 10111111 ] 
-> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, - .len = 4, - .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (fourth byte missing) - * [ 11110011 10111111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, - .len = 3, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (fourth byte malformed) - * [ 11110011 10111111 10111111 01111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, - .len = 4, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (overlong encoded) - * [ 11110000 10000000 10000001 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, - .len = 4, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (UTF-16-unrepresentable) - * [ 11110100 10010000 10000000 10000000 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, - .len = 4, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, -}; - -int -main(void) -{ - int state; - size_t i, j, k, len, failed; - - /* UTF-8 encoder test */ - for (i = 0, failed = 0; i < LEN(enc_test); i++) { - uint8_t arr[4]; - size_t len; - - len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr)); - - if (len != enc_test[i].exp_len || - memcmp(arr, enc_test[i].exp_arr, len)) { - fprintf(stderr, "Failed UTF-8-encoder test %zu: " - "Expected (", i); - for (j = 0; j < enc_test[i].exp_len; j++) { - fprintf(stderr, "0x%x", - enc_test[i].exp_arr[j]); - if (j + 1 < enc_test[i].exp_len) { - fprintf(stderr, " "); - } - } - fprintf(stderr, "), but got ("); - for (j = 0; j < len; j++) { - fprintf(stderr, "0x%x", arr[j]); - if (j + 1 < len) { - fprintf(stderr, " "); - } - } - fprintf(stderr, ")\n"); - failed++; - } - } - printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n", - LEN(enc_test) - failed, LEN(enc_test)); - - /* UTF-8 decoder test */ - for (i = 0, failed = 0; i < LEN(dec_test); 
i++) { - size_t len; - uint32_t cp; - - len = grapheme_cp_decode(&cp, dec_test[i].arr, - dec_test[i].len); - - if (len != dec_test[i].exp_len || - cp != dec_test[i].exp_cp) { - fprintf(stderr, "Failed UTF-8-decoder test %zu: " - "Expected (%zx,%u), but got (%zx,%u)\n", - i, dec_test[i].exp_len, - dec_test[i].exp_cp, len, cp); - failed++; - } - } - printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n", - LEN(dec_test) - failed, LEN(dec_test)); - - /* grapheme break test */ - for (i = 0, failed = 0; i < LEN(t); i++) { - for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { - if ((j + 1) == t[i].cplen || - grapheme_boundary(t[i].cp[j], t[i].cp[j + 1], - &state)) { - /* check if our resulting length matches */ - if (k == t[i].lenlen || len != t[i].len[k++]) { - fprintf(stderr, "Failed \"%s\"\n", - t[i].descr); - failed++; - break; - } - len = 1; - } else { - len++; - } - } - } - printf("Grapheme break test: Passed %zu out of %zu tests.\n", - LEN(t) - failed, LEN(t)); - - return (failed > 0) ? 1 : 0; -} diff --git a/test/utf8-decode.c b/test/utf8-decode.c @@ -0,0 +1,275 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "../grapheme.h" + +#define LEN(x) (sizeof(x) / sizeof(*x)) + +static const struct { + uint8_t *arr; /* UTF-8 byte sequence */ + size_t len; /* length of UTF-8 byte sequence */ + size_t exp_len; /* expected length returned */ + uint32_t exp_cp; /* expected code point returned */ +} dec_test[] = { + { + /* empty sequence + * [ ] -> + * INVALID + */ + .arr = NULL, + .len = 0, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid lead byte + * [ 11111101 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xFD }, + .len = 1, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* valid 1-byte sequence + * [ 00000001 ] -> + * 0000001 + */ + .arr = (uint8_t[]){ 0x01 }, + .len = 1, + .exp_len = 1, + .exp_cp = 0x1, + }, + { + /* valid 2-byte sequence + * [ 11000011 10111111 ] -> + * 00011111111 + */ + .arr = (uint8_t[]){ 0xC3, 0xBF }, + .len = 2, + .exp_len = 2, + .exp_cp = 0xFF, + }, + { + /* invalid 2-byte sequence (second byte missing) + * [ 11000011 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC3 }, + .len = 1, + .exp_len = 2, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 2-byte sequence (second byte malformed) + * [ 11000011 11111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC3, 0xFF }, + .len = 2, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 2-byte sequence (overlong encoded) + * [ 11000001 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC1, 0xBF }, + .len = 2, + .exp_len = 2, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* valid 3-byte sequence + * [ 11100000 10111111 10111111 ] -> + * 0000111111111111 + */ + .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, + .len = 3, + .exp_len = 3, + .exp_cp = 0xFFF, + }, + { + /* invalid 3-byte sequence (second byte missing) + * [ 11100000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0 }, + .len = 1, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte 
sequence (second byte malformed) + * [ 11100000 01111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, + .len = 3, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (third byte missing) + * [ 11100000 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0xBF }, + .len = 2, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (third byte malformed) + * [ 11100000 10111111 01111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, + .len = 3, + .exp_len = 2, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (overlong encoded) + * [ 11100000 10011111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, + .len = 3, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (UTF-16 surrogate half) + * [ 11101101 10100000 10000000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, + .len = 3, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* valid 4-byte sequence + * [ 11110011 10111111 10111111 10111111 ] -> + * 011111111111111111111 + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, + .len = 4, + .exp_len = 4, + .exp_cp = UINT32_C(0xFFFFF), + }, + { + /* invalid 4-byte sequence (second byte missing) + * [ 11110011 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3 }, + .len = 1, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (second byte malformed) + * [ 11110011 01111111 10111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, + .len = 4, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (third byte missing) + * [ 11110011 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF }, + .len = 2, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (third byte malformed) + * [ 11110011 10111111 01111111 10111111 ] 
-> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, + .len = 4, + .exp_len = 2, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (fourth byte missing) + * [ 11110011 10111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, + .len = 3, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (fourth byte malformed) + * [ 11110011 10111111 10111111 01111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, + .len = 4, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (overlong encoded) + * [ 11110000 10000000 10000001 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, + .len = 4, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (UTF-16-unrepresentable) + * [ 11110100 10010000 10000000 10000000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, + .len = 4, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, +}; + +int +main(void) +{ + size_t i, failed; + + /* UTF-8 decoder test */ + for (i = 0, failed = 0; i < LEN(dec_test); i++) { + size_t len; + uint32_t cp; + + len = grapheme_cp_decode(&cp, dec_test[i].arr, + dec_test[i].len); + + if (len != dec_test[i].exp_len || + cp != dec_test[i].exp_cp) { + fprintf(stderr, "Failed UTF-8-decoder test %zu: " + "Expected (%zx,%u), but got (%zx,%u)\n", + i, dec_test[i].exp_len, + dec_test[i].exp_cp, len, cp); + failed++; + } + } + printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n", + LEN(dec_test) - failed, LEN(dec_test)); + + return (failed > 0) ? 1 : 0; +} diff --git a/test/utf8-encode.c b/test/utf8-encode.c @@ -0,0 +1,92 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "../grapheme.h" + +#define LEN(x) (sizeof(x) / sizeof(*x)) + +static const struct { + uint32_t cp; /* input code point */ + uint8_t *exp_arr; /* expected UTF-8 byte sequence */ + size_t exp_len; /* expected length of UTF-8 sequence */ +} enc_test[] = { + { + /* invalid code point (UTF-16 surrogate half) */ + .cp = UINT32_C(0xD800), + .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, + .exp_len = 3, + }, + { + /* invalid code point (UTF-16-unrepresentable) */ + .cp = UINT32_C(0x110000), + .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, + .exp_len = 3, + }, + { + /* code point encoded to a 1-byte sequence */ + .cp = 0x01, + .exp_arr = (uint8_t[]){ 0x01 }, + .exp_len = 1, + }, + { + /* code point encoded to a 2-byte sequence */ + .cp = 0xFF, + .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, + .exp_len = 2, + }, + { + /* code point encoded to a 3-byte sequence */ + .cp = 0xFFF, + .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, + .exp_len = 3, + }, + { + /* code point encoded to a 4-byte sequence */ + .cp = UINT32_C(0xFFFFF), + .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, + .exp_len = 4, + }, +}; + +int +main(void) +{ + size_t i, j, failed; + + /* UTF-8 encoder test */ + for (i = 0, failed = 0; i < LEN(enc_test); i++) { + uint8_t arr[4]; + size_t len; + + len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr)); + + if (len != enc_test[i].exp_len || + memcmp(arr, enc_test[i].exp_arr, len)) { + fprintf(stderr, "Failed UTF-8-encoder test %zu: " + "Expected (", i); + for (j = 0; j < enc_test[i].exp_len; j++) { + fprintf(stderr, "0x%x", + enc_test[i].exp_arr[j]); + if (j + 1 < enc_test[i].exp_len) { + fprintf(stderr, " "); + } + } + fprintf(stderr, "), but got ("); + for (j = 0; j < len; j++) { + fprintf(stderr, "0x%x", arr[j]); + if (j + 1 < len) { + fprintf(stderr, " "); + } + } + fprintf(stderr, ")\n"); + failed++; + } + } + printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n", + 
LEN(enc_test) - failed, LEN(enc_test)); + + return (failed > 0) ? 1 : 0; +}