commit 009498ac0fc3744a7bc5cc1afb5f601e445442be
parent d74e91e355c37eff0ac64b8ce0e18ef587a1d333
Author: Laslo Hunhold <dev@frign.de>
Date: Sun, 18 Oct 2020 22:20:31 +0200
Split test/test.c into three separate tests
The test-infrastructure needed a bit of preparation, but now it makes
sense to split the single test.c into its three parts, making it easier
to handle and reason about.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
M | Makefile | | | 10 | +++++++--- |
A | test/grapheme_break.c | | | 41 | +++++++++++++++++++++++++++++++++++++++++ |
D | test/test.c | | | 374 | ------------------------------------------------------------------------------- |
A | test/utf8-decode.c | | | 275 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | test/utf8-encode.c | | | 92 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
5 files changed, 415 insertions(+), 377 deletions(-)
diff --git a/Makefile b/Makefile
@@ -5,7 +5,7 @@
include config.mk
LIB = src/boundary src/codepoint src/grapheme
-TEST = test/test
+TEST = test/grapheme_break test/utf8-decode test/utf8-encode
DATA = data/gbp data/emo data/gbt
MAN3 = man/grapheme_bytelen.3
@@ -24,12 +24,16 @@ data/util.o: data/util.c config.mk data/util.h
src/boundary.o: src/boundary.c config.mk data/emo.h data/gbp.h grapheme.h
src/codepoint.o: src/codepoint.c config.mk grapheme.h
src/grapheme.o: src/grapheme.c config.mk grapheme.h
-test/test.o: test/test.c config.mk data/gbt.h grapheme.h
+test/grapheme_break.o: test/grapheme_break.c config.mk data/gbt.h grapheme.h
+test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h
+test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h
data/gbp: data/gbp.o data/util.o
data/emo: data/emo.o data/util.o
data/gbt: data/gbt.o data/util.o
-test/test: test/test.o $(LIB:=.o)
+test/grapheme_break: test/grapheme_break.o $(LIB:=.o)
+test/utf8-encode: test/utf8-encode.o $(LIB:=.o)
+test/utf8-decode: test/utf8-decode.o $(LIB:=.o)
data/gbp.txt:
wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
diff --git a/test/grapheme_break.c b/test/grapheme_break.c
@@ -0,0 +1,41 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "../grapheme.h"
+#include "../data/gbt.h"
+
+#define LEN(x) (sizeof(x) / sizeof(*x))
+
+int
+main(void)
+{
+ int state;
+ size_t i, j, k, len, failed;
+
+ /* grapheme break test */
+ for (i = 0, failed = 0; i < LEN(t); i++) {
+ for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
+ if ((j + 1) == t[i].cplen ||
+ grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
+ &state)) {
+ /* check if our resulting length matches */
+ if (k == t[i].lenlen || len != t[i].len[k++]) {
+ fprintf(stderr, "Failed \"%s\"\n",
+ t[i].descr);
+ failed++;
+ break;
+ }
+ len = 1;
+ } else {
+ len++;
+ }
+ }
+ }
+ printf("Grapheme break test: Passed %zu out of %zu tests.\n",
+ LEN(t) - failed, LEN(t));
+
+ return (failed > 0) ? 1 : 0;
+}
diff --git a/test/test.c b/test/test.c
@@ -1,374 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "../grapheme.h"
-#include "../data/gbt.h"
-
-#define LEN(x) (sizeof(x) / sizeof(*x))
-
-static const struct {
- uint32_t cp; /* input code point */
- uint8_t *exp_arr; /* expected UTF-8 byte sequence */
- size_t exp_len; /* expected length of UTF-8 sequence */
-} enc_test[] = {
- {
- /* invalid code point (UTF-16 surrogate half) */
- .cp = UINT32_C(0xD800),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
- .exp_len = 3,
- },
- {
- /* invalid code point (UTF-16-unrepresentable) */
- .cp = UINT32_C(0x110000),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
- .exp_len = 3,
- },
- {
- /* code point encoded to a 1-byte sequence */
- .cp = 0x01,
- .exp_arr = (uint8_t[]){ 0x01 },
- .exp_len = 1,
- },
- {
- /* code point encoded to a 2-byte sequence */
- .cp = 0xFF,
- .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
- .exp_len = 2,
- },
- {
- /* code point encoded to a 3-byte sequence */
- .cp = 0xFFF,
- .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
- .exp_len = 3,
- },
- {
- /* code point encoded to a 4-byte sequence */
- .cp = UINT32_C(0xFFFFF),
- .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
- .exp_len = 4,
- },
-};
-
-static const struct {
- uint8_t *arr; /* UTF-8 byte sequence */
- size_t len; /* length of UTF-8 byte sequence */
- size_t exp_len; /* expected length returned */
- uint32_t exp_cp; /* expected code point returned */
-} dec_test[] = {
- {
- /* empty sequence
- * [ ] ->
- * INVALID
- */
- .arr = NULL,
- .len = 0,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid lead byte
- * [ 11111101 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xFD },
- .len = 1,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* valid 1-byte sequence
- * [ 00000001 ] ->
- * 0000001
- */
- .arr = (uint8_t[]){ 0x01 },
- .len = 1,
- .exp_len = 1,
- .exp_cp = 0x1,
- },
- {
- /* valid 2-byte sequence
- * [ 11000011 10111111 ] ->
- * 00011111111
- */
- .arr = (uint8_t[]){ 0xC3, 0xBF },
- .len = 2,
- .exp_len = 2,
- .exp_cp = 0xFF,
- },
- {
- /* invalid 2-byte sequence (second byte missing)
- * [ 11000011 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xC3 },
- .len = 1,
- .exp_len = 2,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 2-byte sequence (second byte malformed)
- * [ 11000011 11111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xC3, 0xFF },
- .len = 2,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 2-byte sequence (overlong encoded)
- * [ 11000001 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xC1, 0xBF },
- .len = 2,
- .exp_len = 2,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* valid 3-byte sequence
- * [ 11100000 10111111 10111111 ] ->
- * 0000111111111111
- */
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
- .len = 3,
- .exp_len = 3,
- .exp_cp = 0xFFF,
- },
- {
- /* invalid 3-byte sequence (second byte missing)
- * [ 11100000 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0 },
- .len = 1,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (second byte malformed)
- * [ 11100000 01111111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
- .len = 3,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (third byte missing)
- * [ 11100000 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0, 0xBF },
- .len = 2,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (third byte malformed)
- * [ 11100000 10111111 01111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
- .len = 3,
- .exp_len = 2,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (overlong encoded)
- * [ 11100000 10011111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
- .len = 3,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (UTF-16 surrogate half)
- * [ 11101101 10100000 10000000 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
- .len = 3,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* valid 4-byte sequence
- * [ 11110011 10111111 10111111 10111111 ] ->
- * 011111111111111111111
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
- .len = 4,
- .exp_len = 4,
- .exp_cp = UINT32_C(0xFFFFF),
- },
- {
- /* invalid 4-byte sequence (second byte missing)
- * [ 11110011 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3 },
- .len = 1,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (second byte malformed)
- * [ 11110011 01111111 10111111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
- .len = 4,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (third byte missing)
- * [ 11110011 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF },
- .len = 2,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (third byte malformed)
- * [ 11110011 10111111 01111111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
- .len = 4,
- .exp_len = 2,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (fourth byte missing)
- * [ 11110011 10111111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
- .len = 3,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (fourth byte malformed)
- * [ 11110011 10111111 10111111 01111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
- .len = 4,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (overlong encoded)
- * [ 11110000 10000000 10000001 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
- .len = 4,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (UTF-16-unrepresentable)
- * [ 11110100 10010000 10000000 10000000 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
- .len = 4,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
-};
-
-int
-main(void)
-{
- int state;
- size_t i, j, k, len, failed;
-
- /* UTF-8 encoder test */
- for (i = 0, failed = 0; i < LEN(enc_test); i++) {
- uint8_t arr[4];
- size_t len;
-
- len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
-
- if (len != enc_test[i].exp_len ||
- memcmp(arr, enc_test[i].exp_arr, len)) {
- fprintf(stderr, "Failed UTF-8-encoder test %zu: "
- "Expected (", i);
- for (j = 0; j < enc_test[i].exp_len; j++) {
- fprintf(stderr, "0x%x",
- enc_test[i].exp_arr[j]);
- if (j + 1 < enc_test[i].exp_len) {
- fprintf(stderr, " ");
- }
- }
- fprintf(stderr, "), but got (");
- for (j = 0; j < len; j++) {
- fprintf(stderr, "0x%x", arr[j]);
- if (j + 1 < len) {
- fprintf(stderr, " ");
- }
- }
- fprintf(stderr, ")\n");
- failed++;
- }
- }
- printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n",
- LEN(enc_test) - failed, LEN(enc_test));
-
- /* UTF-8 decoder test */
- for (i = 0, failed = 0; i < LEN(dec_test); i++) {
- size_t len;
- uint32_t cp;
-
- len = grapheme_cp_decode(&cp, dec_test[i].arr,
- dec_test[i].len);
-
- if (len != dec_test[i].exp_len ||
- cp != dec_test[i].exp_cp) {
- fprintf(stderr, "Failed UTF-8-decoder test %zu: "
- "Expected (%zx,%u), but got (%zx,%u)\n",
- i, dec_test[i].exp_len,
- dec_test[i].exp_cp, len, cp);
- failed++;
- }
- }
- printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n",
- LEN(dec_test) - failed, LEN(dec_test));
-
- /* grapheme break test */
- for (i = 0, failed = 0; i < LEN(t); i++) {
- for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
- if ((j + 1) == t[i].cplen ||
- grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
- &state)) {
- /* check if our resulting length matches */
- if (k == t[i].lenlen || len != t[i].len[k++]) {
- fprintf(stderr, "Failed \"%s\"\n",
- t[i].descr);
- failed++;
- break;
- }
- len = 1;
- } else {
- len++;
- }
- }
- }
- printf("Grapheme break test: Passed %zu out of %zu tests.\n",
- LEN(t) - failed, LEN(t));
-
- return (failed > 0) ? 1 : 0;
-}
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -0,0 +1,275 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "../grapheme.h"
+
+#define LEN(x) (sizeof(x) / sizeof(*x))
+
+static const struct {
+ uint8_t *arr; /* UTF-8 byte sequence */
+ size_t len; /* length of UTF-8 byte sequence */
+ size_t exp_len; /* expected length returned */
+ uint32_t exp_cp; /* expected code point returned */
+} dec_test[] = {
+ {
+ /* empty sequence
+ * [ ] ->
+ * INVALID
+ */
+ .arr = NULL,
+ .len = 0,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid lead byte
+ * [ 11111101 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xFD },
+ .len = 1,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* valid 1-byte sequence
+ * [ 00000001 ] ->
+ * 0000001
+ */
+ .arr = (uint8_t[]){ 0x01 },
+ .len = 1,
+ .exp_len = 1,
+ .exp_cp = 0x1,
+ },
+ {
+ /* valid 2-byte sequence
+ * [ 11000011 10111111 ] ->
+ * 00011111111
+ */
+ .arr = (uint8_t[]){ 0xC3, 0xBF },
+ .len = 2,
+ .exp_len = 2,
+ .exp_cp = 0xFF,
+ },
+ {
+ /* invalid 2-byte sequence (second byte missing)
+ * [ 11000011 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC3 },
+ .len = 1,
+ .exp_len = 2,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 2-byte sequence (second byte malformed)
+ * [ 11000011 11111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC3, 0xFF },
+ .len = 2,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 2-byte sequence (overlong encoded)
+ * [ 11000001 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC1, 0xBF },
+ .len = 2,
+ .exp_len = 2,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* valid 3-byte sequence
+ * [ 11100000 10111111 10111111 ] ->
+ * 0000111111111111
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = 0xFFF,
+ },
+ {
+ /* invalid 3-byte sequence (second byte missing)
+ * [ 11100000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0 },
+ .len = 1,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (second byte malformed)
+ * [ 11100000 01111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+ .len = 3,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (third byte missing)
+ * [ 11100000 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF },
+ .len = 2,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (third byte malformed)
+ * [ 11100000 10111111 01111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+ .len = 3,
+ .exp_len = 2,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (overlong encoded)
+ * [ 11100000 10011111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (UTF-16 surrogate half)
+ * [ 11101101 10100000 10000000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* valid 4-byte sequence
+ * [ 11110011 10111111 10111111 10111111 ] ->
+ * 011111111111111111111
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = UINT32_C(0xFFFFF),
+ },
+ {
+ /* invalid 4-byte sequence (second byte missing)
+ * [ 11110011 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3 },
+ .len = 1,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (second byte malformed)
+ * [ 11110011 01111111 10111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+ .len = 4,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (third byte missing)
+ * [ 11110011 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF },
+ .len = 2,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (third byte malformed)
+ * [ 11110011 10111111 01111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+ .len = 4,
+ .exp_len = 2,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (fourth byte missing)
+ * [ 11110011 10111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+ .len = 3,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (fourth byte malformed)
+ * [ 11110011 10111111 10111111 01111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+ .len = 4,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (overlong encoded)
+ * [ 11110000 10000000 10000001 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (UTF-16-unrepresentable)
+ * [ 11110100 10010000 10000000 10000000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+};
+
+int
+main(void)
+{
+ size_t i, failed;
+
+ /* UTF-8 decoder test */
+ for (i = 0, failed = 0; i < LEN(dec_test); i++) {
+ size_t len;
+ uint32_t cp;
+
+ len = grapheme_cp_decode(&cp, dec_test[i].arr,
+ dec_test[i].len);
+
+ if (len != dec_test[i].exp_len ||
+ cp != dec_test[i].exp_cp) {
+ fprintf(stderr, "Failed UTF-8-decoder test %zu: "
+ "Expected (%zx,%u), but got (%zx,%u)\n",
+ i, dec_test[i].exp_len,
+ dec_test[i].exp_cp, len, cp);
+ failed++;
+ }
+ }
+ printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n",
+ LEN(dec_test) - failed, LEN(dec_test));
+
+ return (failed > 0) ? 1 : 0;
+}
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
@@ -0,0 +1,92 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "../grapheme.h"
+
+#define LEN(x) (sizeof(x) / sizeof(*x))
+
+static const struct {
+ uint32_t cp; /* input code point */
+ uint8_t *exp_arr; /* expected UTF-8 byte sequence */
+ size_t exp_len; /* expected length of UTF-8 sequence */
+} enc_test[] = {
+ {
+ /* invalid code point (UTF-16 surrogate half) */
+ .cp = UINT32_C(0xD800),
+ .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_len = 3,
+ },
+ {
+ /* invalid code point (UTF-16-unrepresentable) */
+ .cp = UINT32_C(0x110000),
+ .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_len = 3,
+ },
+ {
+ /* code point encoded to a 1-byte sequence */
+ .cp = 0x01,
+ .exp_arr = (uint8_t[]){ 0x01 },
+ .exp_len = 1,
+ },
+ {
+ /* code point encoded to a 2-byte sequence */
+ .cp = 0xFF,
+ .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+ .exp_len = 2,
+ },
+ {
+ /* code point encoded to a 3-byte sequence */
+ .cp = 0xFFF,
+ .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .exp_len = 3,
+ },
+ {
+ /* code point encoded to a 4-byte sequence */
+ .cp = UINT32_C(0xFFFFF),
+ .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .exp_len = 4,
+ },
+};
+
+int
+main(void)
+{
+ size_t i, j, failed;
+
+ /* UTF-8 encoder test */
+ for (i = 0, failed = 0; i < LEN(enc_test); i++) {
+ uint8_t arr[4];
+ size_t len;
+
+ len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
+
+ if (len != enc_test[i].exp_len ||
+ memcmp(arr, enc_test[i].exp_arr, len)) {
+ fprintf(stderr, "Failed UTF-8-encoder test %zu: "
+ "Expected (", i);
+ for (j = 0; j < enc_test[i].exp_len; j++) {
+ fprintf(stderr, "0x%x",
+ enc_test[i].exp_arr[j]);
+ if (j + 1 < enc_test[i].exp_len) {
+ fprintf(stderr, " ");
+ }
+ }
+ fprintf(stderr, "), but got (");
+ for (j = 0; j < len; j++) {
+ fprintf(stderr, "0x%x", arr[j]);
+ if (j + 1 < len) {
+ fprintf(stderr, " ");
+ }
+ }
+ fprintf(stderr, ")\n");
+ failed++;
+ }
+ }
+ printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n",
+ LEN(enc_test) - failed, LEN(enc_test));
+
+ return (failed > 0) ? 1 : 0;
+}