libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 04bab2a4c09816c37c8e06aa38dfc7f2cab8c680
parent 7034bddcc4bc2262cec8a47d47272c1a2cd9bf9d
Author: Laslo Hunhold <dev@frign.de>
Date:   Thu, 28 May 2020 12:57:37 +0200

Add automatic UTF-8-decoder-tests

The 23 tests should cover all cases and provide safety against any
possible regressions.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mdata/gbt.awk | 2+-
Msrc/test_body.c | 270+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 267 insertions(+), 5 deletions(-)

diff --git a/data/gbt.awk b/data/gbt.awk @@ -6,7 +6,7 @@ BEGIN { printf("struct test {\n\tCodepoint *cp;\n\tsize_t cplen;\n"); printf("\tsize_t *len;\n\tsize_t lenlen;\n\tchar *descr;\n};\n\n"); - printf("struct test t[] = {\n"); + printf("static const struct test t[] = {\n"); } $0 ~ /^#/ || $0 ~ /^\s*$/ { next } diff --git a/src/test_body.c b/src/test_body.c @@ -3,15 +3,277 @@ #include <stdio.h> #include "boundary.h" +#include "codepoint.h" #define LEN(x) (sizeof(x) / sizeof(*x)) +/* all types valid/invalid, overencoded, surrogate, over 10FFFF w/e + * expected return value and return cp */ + +static const struct { + uint8_t *arr; /* byte array */ + size_t len; /* number of bytes in array */ + size_t exp_len; /* expected length returned */ + uint32_t exp_cp; /* expected codepoint returned */ +} dec_test[] = { + { + /* empty sequence + * [ ] -> + * INVALID + */ + .arr = NULL, + .len = 0, + .exp_len = 1, + .exp_cp = CP_INVALID, + }, + { + /* invalid lead byte + * [ 11111101 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xFD }, + .len = 1, + .exp_len = 1, + .exp_cp = CP_INVALID, + }, + { + /* valid 1-byte sequence + * [ 00000001 ] -> + * 0000001 + */ + .arr = (uint8_t[]){ 0x01 }, + .len = 1, + .exp_len = 1, + .exp_cp = 0x1, + }, + { + /* valid 2-byte sequence + * [ 11000011 10111111 ] -> + * 00011111111 + */ + .arr = (uint8_t[]){ 0xC3, 0xBF }, + .len = 2, + .exp_len = 2, + .exp_cp = 0xff, + }, + { + /* invalid 2-byte sequence (second byte missing) + * [ 11000011 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC3 }, + .len = 1, + .exp_len = 2, + .exp_cp = CP_INVALID, + }, + { + /* invalid 2-byte sequence (second byte malformed) + * [ 11000011 11111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC3, 0xFF }, + .len = 2, + .exp_len = 1, + .exp_cp = CP_INVALID, + }, + { + /* invalid 2-byte sequence (overlong encoded) + * [ 11000001 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC1, 0xBF }, + .len = 2, + .exp_len = 2, + .exp_cp = CP_INVALID, + }, + { + /* valid 3-byte 
sequence + * [ 11100000 10111111 10111111 ] -> + * 0000111111111111 + */ + .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, + .len = 3, + .exp_len = 3, + .exp_cp = 0xfff, + }, + { + /* invalid 3-byte sequence (second byte missing) + * [ 11100000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0 }, + .len = 1, + .exp_len = 3, + .exp_cp = CP_INVALID, + }, + { + /* invalid 3-byte sequence (second byte malformed) + * [ 11100000 01111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, + .len = 3, + .exp_len = 1, + .exp_cp = CP_INVALID, + }, + { + /* invalid 3-byte sequence (third byte missing) + * [ 11100000 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0xBF }, + .len = 2, + .exp_len = 3, + .exp_cp = CP_INVALID, + }, + { + /* invalid 3-byte sequence (third byte malformed) + * [ 11100000 10111111 01111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, + .len = 3, + .exp_len = 2, + .exp_cp = CP_INVALID, + }, + { + /* invalid 3-byte sequence (overlong encoded) + * [ 11100000 10011111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, + .len = 3, + .exp_len = 3, + .exp_cp = CP_INVALID, + }, + { + /* invalid 3-byte sequence (UTF-16 surrogate half) + * [ 11101101 10100000 10000000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, + .len = 3, + .exp_len = 3, + .exp_cp = CP_INVALID, + }, + { + /* valid 4-byte sequence + * [ 11110011 10111111 10111111 10111111 ] -> + * 011111111111111111111 + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, + .len = 4, + .exp_len = 4, + .exp_cp = 0xfffff, + }, + { + /* invalid 4-byte sequence (second byte missing) + * [ 11110011 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3 }, + .len = 1, + .exp_len = 4, + .exp_cp = CP_INVALID, + }, + { + /* invalid 4-byte sequence (second byte malformed) + * [ 11110011 01111111 10111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, + .len = 4, + .exp_len = 1, + .exp_cp = CP_INVALID, + }, + { + 
/* invalid 4-byte sequence (third byte missing) + * [ 11110011 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF }, + .len = 2, + .exp_len = 4, + .exp_cp = CP_INVALID, + }, + { + /* invalid 4-byte sequence (third byte malformed) + * [ 11110011 10111111 01111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, + .len = 4, + .exp_len = 2, + .exp_cp = CP_INVALID, + }, + { + /* invalid 4-byte sequence (fourth byte missing) + * [ 11110011 10111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, + .len = 3, + .exp_len = 4, + .exp_cp = CP_INVALID, + }, + { + /* invalid 4-byte sequence (fourth byte malformed) + * [ 11110011 10111111 10111111 01111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, + .len = 4, + .exp_len = 3, + .exp_cp = CP_INVALID, + }, + { + /* invalid 4-byte sequence (overlong encoded) + * [ 11110000 10000000 10000001 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, + .len = 4, + .exp_len = 4, + .exp_cp = CP_INVALID, + }, + { + /* invalid 4-byte sequence (UTF-16-unrepresentable) + * [ 11110100 10010000 10000000 10000000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, + .len = 4, + .exp_len = 4, + .exp_cp = CP_INVALID, + }, +}; + int main(void) { int state; - size_t i, j, k, len, failed = 0; + size_t i, j, k, len, failed; + + /* UTF-8 decoder test */ + for (i = 0, failed = 0; i < LEN(dec_test); i++) { + size_t len; + uint32_t cp; + + len = grapheme_cp_decode(&cp, dec_test[i].arr, + dec_test[i].len); - for (i = 0; i < LEN(t); i++) { + if (len != dec_test[i].exp_len || + cp != dec_test[i].exp_cp) { + fprintf(stderr, "Failed UTF-8-decoder test %zu: " + "Expected (%zx,%u), but got (%zx,%u)\n", + i, dec_test[i].exp_len, + dec_test[i].exp_cp, len, cp); + } + } + printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n", + LEN(dec_test) - failed, LEN(dec_test)); + + /* grapheme break test */ + for (i = 0, failed = 0; i 
< LEN(t); i++) { for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { if ((j + 1) == t[i].cplen || boundary(t[i].cp[j], t[i].cp[j + 1], &state)) { @@ -28,8 +290,8 @@ int main(void) } } } - - printf("Passed %zu out of %zu tests.\n", LEN(t) - failed, LEN(t)); + printf("Grapheme break test: Passed %zu out of %zu tests.\n", + LEN(t) - failed, LEN(t)); return (failed > 0) ? 1 : 0; }