libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit a815be4b5de7f7df2da664049fdb04874d37016a
parent 5ea8d87a9a0fb9c6dda827cc55d43c637cd4086d
Author: Laslo Hunhold <dev@frign.de>
Date:   Mon,  3 Oct 2022 21:18:52 +0200

Add unit tests for all segmentation functions

Now all functions in the library are covered by exhaustive unit tests
which supplement the already present conformance tests to make sure
that the thin layer between API and implementation is also working as
expected.

At this point I would assess that libgrapheme is a stable foundation
for real-world use, and preparations for the release of version 2
can now get underway.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mtest/character.c | 113+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Mtest/line.c | 112++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mtest/sentence.c | 112++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mtest/utf8-decode.c | 2+-
Mtest/utf8-encode.c | 2+-
Mtest/util.c | 47+++++++++++++++++++++++++++++++++++++++++++++--
Mtest/util.h | 34++++++++++++++++++++++++++++++++--
Mtest/word.c | 112++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
8 files changed, 523 insertions(+), 11 deletions(-)

diff --git a/test/character.c b/test/character.c @@ -6,12 +6,121 @@ #include "../grapheme.h" #include "util.h" +static const struct unit_test_next_break next_character_break[] = { + { + .description = "NULL input", + .input = { + .src = NULL, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 }, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input, null-terminated", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 }, + .srclen = SIZE_MAX, + }, + .output = { 0 }, + }, + { + .description = "one character", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2A }, + .srclen = 3, + }, + .output = { 2 }, + }, + { + .description = "one character, null-terminated", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x0 }, + .srclen = SIZE_MAX, + }, + .output = { 2 }, + }, +}; + +static const struct unit_test_next_break_utf8 next_character_break_utf8[] = { + { + .description = "NULL input", + .input = { + .src = NULL, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input", + .input = { "", 0 }, + .output = { 0 }, + }, + { + .description = "empty input, NUL-terminated", + .input = { "", SIZE_MAX }, + .output = { 0 }, + }, + { + .description = "one character", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA*", 9 }, + .output = { 8 }, + }, + { + .description = "one character, fragment", + .input = { "\xF0\x9F\x87\xA9\xF0", 5 }, + .output = { 4 }, + }, + { + .description = "one character, NUL-terminated", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA", SIZE_MAX }, + .output = { 8 }, + }, + { + .description = "one character, fragment, NUL-terminated", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX }, + .output = { 4 }, + }, +}; + +static int +unit_test_callback_next_character_break(const void *t, size_t off, + const char *name, + const char *argv0) +{ + return 
unit_test_callback_next_break(t, off, + grapheme_next_character_break, + name, argv0); +} + +static int +unit_test_callback_next_character_break_utf8(const void *t, size_t off, + const char *name, + const char *argv0) +{ + return unit_test_callback_next_break_utf8(t, off, + grapheme_next_character_break_utf8, + name, argv0); +} + int main(int argc, char *argv[]) { (void)argc; return run_break_tests(grapheme_next_character_break, - character_break_test, - LEN(character_break_test), argv[0]); + character_break_test, LEN(character_break_test), argv[0]) + + run_unit_tests(unit_test_callback_next_character_break, + next_character_break, LEN(next_character_break), + "grapheme_next_character_break", argv[0]) + + run_unit_tests(unit_test_callback_next_character_break_utf8, + next_character_break_utf8, LEN(next_character_break_utf8), + "grapheme_next_character_break_utf8", argv[0]); } diff --git a/test/line.c b/test/line.c @@ -6,6 +6,110 @@ #include "../grapheme.h" #include "util.h" +static const struct unit_test_next_break next_line_break[] = { + { + .description = "NULL input", + .input = { + .src = NULL, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 }, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input, null-terminated", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 }, + .srclen = SIZE_MAX, + }, + .output = { 0 }, + }, + { + .description = "one opportunity", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A }, + .srclen = 4, + }, + .output = { 3 }, + }, + { + .description = "one opportunity, null-terminated", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A, 0x0 }, + .srclen = SIZE_MAX, + }, + .output = { 3 }, + }, +}; + +static const struct unit_test_next_break_utf8 next_line_break_utf8[] = { + { + .description = "NULL input", + .input = { + .src = 
NULL, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input", + .input = { "", 0 }, + .output = { 0 }, + }, + { + .description = "empty input, NUL-terminated", + .input = { "", SIZE_MAX }, + .output = { 0 }, + }, + { + .description = "one opportunity", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA *", 10 }, + .output = { 9 }, + }, + { + .description = "one opportunity, fragment", + .input = { "\xF0\x9F\x87\xA9\xF0", 5 }, + .output = { 4 }, + }, + { + .description = "one opportunity, NUL-terminated", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA A", SIZE_MAX }, + .output = { 9 }, + }, + { + .description = "one opportunity, fragment, NUL-terminated", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX }, + .output = { 4 }, + }, +}; + +static int +unit_test_callback_next_line_break(const void *t, size_t off, + const char *name, + const char *argv0) +{ + return unit_test_callback_next_break(t, off, + grapheme_next_line_break, + name, argv0); +} + +static int +unit_test_callback_next_line_break_utf8(const void *t, size_t off, + const char *name, + const char *argv0) +{ + return unit_test_callback_next_break_utf8(t, off, + grapheme_next_line_break_utf8, + name, argv0); +} + int main(int argc, char *argv[]) { @@ -13,5 +117,11 @@ main(int argc, char *argv[]) return run_break_tests(grapheme_next_line_break, line_break_test, LEN(line_break_test), - argv[0]); + argv[0]) + + run_unit_tests(unit_test_callback_next_line_break, + next_line_break, LEN(next_line_break), + "grapheme_next_line_break", argv[0]) + + run_unit_tests(unit_test_callback_next_line_break_utf8, + next_line_break_utf8, LEN(next_line_break_utf8), + "grapheme_next_line_break_utf8", argv[0]); } diff --git a/test/sentence.c b/test/sentence.c @@ -6,6 +6,110 @@ #include "../grapheme.h" #include "util.h" +static const struct unit_test_next_break next_sentence_break[] = { + { + .description = "NULL input", + .input = { + .src = NULL, + .srclen = 0, + }, + .output = { 0 }, + }, + { + 
.description = "empty input", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 }, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input, null-terminated", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 }, + .srclen = SIZE_MAX, + }, + .output = { 0 }, + }, + { + .description = "one sentence", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2E, 0x20, 0x2A }, + .srclen = 5, + }, + .output = { 4 }, + }, + { + .description = "one sentence, null-terminated", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2E, 0x20, 0x2A, 0x0 }, + .srclen = SIZE_MAX, + }, + .output = { 4 }, + }, +}; + +static const struct unit_test_next_break_utf8 next_sentence_break_utf8[] = { + { + .description = "NULL input", + .input = { + .src = NULL, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input", + .input = { "", 0 }, + .output = { 0 }, + }, + { + .description = "empty input, NUL-terminated", + .input = { "", SIZE_MAX }, + .output = { 0 }, + }, + { + .description = "one sentence", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is the flag of Germany. It", 36 }, + .output = { 34 }, + }, + { + .description = "one sentence, fragment", + .input = { "\xF0\x9F\x87\xA9\xF0", 5 }, + .output = { 4 }, + }, + { + .description = "one sentence, NUL-terminated", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is the flag of Germany. 
It", SIZE_MAX }, + .output = { 34 }, + }, + { + .description = "one sentence, fragment, NUL-terminated", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX }, + .output = { 6 }, + }, +}; + +static int +unit_test_callback_next_sentence_break(const void *t, size_t off, + const char *name, + const char *argv0) +{ + return unit_test_callback_next_break(t, off, + grapheme_next_sentence_break, + name, argv0); +} + +static int +unit_test_callback_next_sentence_break_utf8(const void *t, size_t off, + const char *name, + const char *argv0) +{ + return unit_test_callback_next_break_utf8(t, off, + grapheme_next_sentence_break_utf8, + name, argv0); +} + int main(int argc, char *argv[]) { @@ -13,5 +117,11 @@ main(int argc, char *argv[]) return run_break_tests(grapheme_next_sentence_break, sentence_break_test, - LEN(sentence_break_test), argv[0]); + LEN(sentence_break_test), argv[0]) + + run_unit_tests(unit_test_callback_next_sentence_break, + next_sentence_break, LEN(next_sentence_break), + "grapheme_next_sentence_break", argv[0]) + + run_unit_tests(unit_test_callback_next_sentence_break_utf8, + next_sentence_break_utf8, LEN(next_sentence_break_utf8), + "grapheme_next_character_break_utf8", argv[0]); } diff --git a/test/utf8-decode.c b/test/utf8-decode.c @@ -310,7 +310,7 @@ main(int argc, char *argv[]) failed++; } } - printf("%s: %zu/%zu tests passed.\n", argv[0], + printf("%s: %zu/%zu unit tests passed.\n", argv[0], LEN(dec_test) - failed, LEN(dec_test)); return (failed > 0) ? 1 : 0; diff --git a/test/utf8-encode.c b/test/utf8-encode.c @@ -86,7 +86,7 @@ main(int argc, char *argv[]) failed++; } } - printf("%s: %zu/%zu tests passed.\n", argv[0], + printf("%s: %zu/%zu unit tests passed.\n", argv[0], LEN(enc_test) - failed, LEN(enc_test)); return (failed > 0) ? 
1 : 0; diff --git a/test/util.c b/test/util.c @@ -38,8 +38,8 @@ run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t), } int -run_unit_tests(int (*unit_test_callback)(void *, size_t, const char *, - const char *), void *test, size_t testlen, const char *name, +run_unit_tests(int (*unit_test_callback)(const void *, size_t, const char *, + const char *), const void *test, size_t testlen, const char *name, const char *argv0) { size_t i, failed; @@ -53,3 +53,46 @@ run_unit_tests(int (*unit_test_callback)(void *, size_t, const char *, return (failed > 0) ? 1 : 0; } + +int +unit_test_callback_next_break(const struct unit_test_next_break *t, size_t off, + size_t (*next_break)(const uint_least32_t *, size_t), + const char *name, const char *argv0) +{ + const struct unit_test_next_break *test = t + off; + + size_t ret = next_break(test->input.src, test->input.srclen); + + if (ret != test->output.ret) { + goto err; + } + + return 0; +err: + fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" " + "(returned %zu instead of %zu).\n", argv0, + name, off, test->description, ret, test->output.ret); + return 1; +} + +int +unit_test_callback_next_break_utf8(const struct unit_test_next_break_utf8 *t, + size_t off, + size_t (*next_break_utf8)(const char *, size_t), + const char *name, const char *argv0) +{ + const struct unit_test_next_break_utf8 *test = t + off; + + size_t ret = next_break_utf8(test->input.src, test->input.srclen); + + if (ret != test->output.ret) { + goto err; + } + + return 0; +err: + fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" " + "(returned %zu instead of %zu).\n", argv0, + name, off, test->description, ret, test->output.ret); + return 1; +} diff --git a/test/util.h b/test/util.h @@ -10,10 +10,40 @@ #undef LEN #define LEN(x) (sizeof(x) / sizeof(*(x))) +struct unit_test_next_break { + const char *description; + struct { + const uint_least32_t *src; + size_t srclen; + } input; + struct { + size_t ret; + } output; +}; + +struct 
unit_test_next_break_utf8 { + const char *description; + struct { + const char *src; + size_t srclen; + } input; + struct { + size_t ret; + } output; +}; + int run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t), const struct break_test *test, size_t testlen, const char *); -int run_unit_tests(int (*unit_test_callback)(void *, size_t, const char *, - const char *), void *, size_t, const char *, const char *); +int run_unit_tests(int (*unit_test_callback)(const void *, size_t, const char *, + const char *), const void *, size_t, const char *, const char *); + +int unit_test_callback_next_break(const struct unit_test_next_break *, size_t, + size_t (*next_break)(const uint_least32_t *, size_t), + const char *, const char *); +int unit_test_callback_next_break_utf8(const struct unit_test_next_break_utf8 *, + size_t, + size_t (*next_break_utf8)(const char *, size_t), + const char *, const char *); #endif /* UTIL_H */ diff --git a/test/word.c b/test/word.c @@ -6,11 +6,121 @@ #include "../grapheme.h" #include "util.h" +static const struct unit_test_next_break next_word_break[] = { + { + .description = "NULL input", + .input = { + .src = NULL, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 }, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input, null-terminated", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x0 }, + .srclen = SIZE_MAX, + }, + .output = { 0 }, + }, + { + .description = "one word", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A }, + .srclen = 4, + }, + .output = { 2 }, + }, + { + .description = "one word, null-terminated", + .input = { + .src = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A, 0x0 }, + .srclen = SIZE_MAX, + }, + .output = { 2 }, + }, +}; + +static const struct unit_test_next_break_utf8 next_word_break_utf8[] = { + { + .description 
= "NULL input", + .input = { + .src = NULL, + .srclen = 0, + }, + .output = { 0 }, + }, + { + .description = "empty input", + .input = { "", 0 }, + .output = { 0 }, + }, + { + .description = "empty input, NUL-terminated", + .input = { "", SIZE_MAX }, + .output = { 0 }, + }, + { + .description = "one word", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is", 11 }, + .output = { 8 }, + }, + { + .description = "one word, fragment", + .input = { "\xF0\x9F\x87\xA9\xF0", 5 }, + .output = { 4 }, + }, + { + .description = "one word, NUL-terminated", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is", SIZE_MAX }, + .output = { 8 }, + }, + { + .description = "one word, fragment, NUL-terminated", + .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX }, + .output = { 4 }, + }, +}; + +static int +unit_test_callback_next_word_break(const void *t, size_t off, + const char *name, + const char *argv0) +{ + return unit_test_callback_next_break(t, off, + grapheme_next_word_break, + name, argv0); +} + +static int +unit_test_callback_next_word_break_utf8(const void *t, size_t off, + const char *name, + const char *argv0) +{ + return unit_test_callback_next_break_utf8(t, off, + grapheme_next_word_break_utf8, + name, argv0); +} + int main(int argc, char *argv[]) { (void)argc; return run_break_tests(grapheme_next_word_break, word_break_test, - LEN(word_break_test), argv[0]); + LEN(word_break_test), argv[0]) + + run_unit_tests(unit_test_callback_next_word_break, + next_word_break, LEN(next_word_break), + "grapheme_next_word_break", argv[0]) + + run_unit_tests(unit_test_callback_next_word_break_utf8, + next_word_break_utf8, LEN(next_word_break_utf8), + "grapheme_next_word_break_utf8", argv[0]); }