libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit d2b53cb080b8c75b140bb1a3347b409c118e882d
parent 21b6f66acc659e8c515d4685a11fa534a289af14
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 31 May 2020 22:49:30 +0200

Add UTF-8-encoder tests

This should cover all the edge cases and provide a regression test
for the encoder.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/test_body.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/src/test_body.c b/src/test_body.c @@ -1,14 +1,55 @@ /* See LICENSE file for copyright and license details. */ #include <stddef.h> #include <stdio.h> +#include <string.h> #include "boundary.h" #include "codepoint.h" #define LEN(x) (sizeof(x) / sizeof(*x)) -/* all types valid/invalid, overencoded, surrogate, over 10FFFF w/e - * expected return value and return cp */ +static const struct { + uint32_t cp; /* input code point */ + uint8_t *exp_arr; /* expected UTF-8 byte sequence */ + size_t exp_len; /* expected length of UTF-8 sequence */ +} enc_test[] = { + { + /* invalid code point (UTF-16 surrogate half) */ + .cp = UINT32_C(0xD800), + .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, + .exp_len = 3, + }, + { + /* invalid code point (UTF-16-unrepresentable) */ + .cp = UINT32_C(0x110000), + .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, + .exp_len = 3, + }, + { + /* code point encoded to a 1-byte sequence */ + .cp = 0x01, + .exp_arr = (uint8_t[]){ 0x01 }, + .exp_len = 1, + }, + { + /* code point encoded to a 2-byte sequence */ + .cp = 0xFF, + .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, + .exp_len = 2, + }, + { + /* code point encoded to a 3-byte sequence */ + .cp = 0xFFF, + .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, + .exp_len = 3, + }, + { + /* code point encoded to a 4-byte sequence */ + .cp = UINT32_C(0xFFFFF), + .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, + .exp_len = 4, + }, +}; static const struct { uint8_t *arr; /* byte array */ @@ -253,6 +294,38 @@ int main(void) int state; size_t i, j, k, len, failed; + /* UTF-8 encoder test */ + for (i = 0, failed = 0; i < LEN(enc_test); i++) { + uint8_t arr[4]; + size_t len; + + len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr)); + + if (len != enc_test[i].exp_len || + memcmp(arr, enc_test[i].exp_arr, len)) { + fprintf(stderr, "Failed UTF-8-encoder test %zu: " + "Expected (", i); + for (j = 0; j < enc_test[i].exp_len; j++) { + fprintf(stderr, "0x%x", + enc_test[i].exp_arr[j]); + if (j != enc_test[i].exp_len - 1) { 
+ fprintf(stderr, " "); + } + } + fprintf(stderr, "), but got ("); + for (j = 0; j < len; j++) { + fprintf(stderr, "0x%x", arr[j]); + if (j != len - 1) { + fprintf(stderr, " "); + } + } + fprintf(stderr, ")\n"); + failed++; + } + } + printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n", + LEN(enc_test) - failed, LEN(enc_test)); + /* UTF-8 decoder test */ for (i = 0, failed = 0; i < LEN(dec_test); i++) { size_t len;