libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit da46b2648d2846dc23e310b7ac0cc3ddebb7ccd3
parent 0f8eb87382b2953b6c4b62c6f4c42616ce74003c
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun,  9 Jan 2022 17:30:53 +0100

Refactor benchmark code

Rename some variables for more consistent naming, add a function
to explicitly generate a UTF-8-test-buffer and move some things into
benchmark/util.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M benchmark/character.c | 41 +++++++++++++++++------------------------
M benchmark/utf8-decode.c | 78 ++++++++++++++++++++++++++----------------------------------------------------
M benchmark/util.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
M benchmark/util.h | 13 +++++++++++--
4 files changed, 97 insertions(+), 84 deletions(-)

diff --git a/benchmark/character.c b/benchmark/character.c @@ -14,27 +14,20 @@ #define NUM_ITERATIONS 1000000 -#ifdef __has_attribute - #if __has_attribute(optnone) - void libgrapheme(const void *) __attribute__((optnone)); - void libutf8proc(const void *) __attribute__((optnone)); - #endif -#endif - -struct payload { +struct break_benchmark_payload { uint_least32_t *buf; - utf8proc_int32_t *buf_int32; - size_t bufsiz; + utf8proc_int32_t *buf_utf8proc; + size_t buflen; }; void libgrapheme(const void *payload) { GRAPHEME_STATE state = { 0 }; - const struct payload *p = payload; + const struct break_benchmark_payload *p = payload; size_t i; - for (i = 0; i + 1 < p->bufsiz; i++) { + for (i = 0; i + 1 < p->buflen; i++) { (void)grapheme_is_character_break(p->buf[i], p->buf[i+1], &state); } @@ -44,12 +37,12 @@ void libutf8proc(const void *payload) { utf8proc_int32_t state = 0; - const struct payload *p = payload; + const struct break_benchmark_payload *p = payload; size_t i; - for (i = 0; i + 1 < p->bufsiz; i++) { - (void)utf8proc_grapheme_break_stateful(p->buf_int32[i], - p->buf_int32[i+1], + for (i = 0; i + 1 < p->buflen; i++) { + (void)utf8proc_grapheme_break_stateful(p->buf_utf8proc[i], + p->buf_utf8proc[i+1], &state); } } @@ -57,33 +50,33 @@ libutf8proc(const void *payload) int main(int argc, char *argv[]) { - struct payload p; + struct break_benchmark_payload p; double baseline = (double)NAN; size_t i; (void)argc; - if ((p.buf = generate_test_buffer(character_test, LEN(character_test), - &(p.bufsiz))) == NULL) { + if ((p.buf = generate_cp_test_buffer(character_test, LEN(character_test), + &(p.buflen))) == NULL) { return 1; } - if ((p.buf_int32 = malloc(p.bufsiz * sizeof(*(p.buf_int32)))) == NULL) { + if ((p.buf_utf8proc = malloc(p.buflen * sizeof(*(p.buf_utf8proc)))) == NULL) { fprintf(stderr, "malloc: %s\n", strerror(errno)); exit(1); } - for (i = 0; i < p.bufsiz; i++) { + for (i = 0; i < p.buflen; i++) { /* * there is no overflow, as we know that the maximum * 
codepoint is 0x10FFFF, which is way below 2^31 */ - p.buf_int32[i] = (utf8proc_int32_t)p.buf[i]; + p.buf_utf8proc[i] = (utf8proc_int32_t)p.buf[i]; } printf("%s\n", argv[0]); run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "comparison", - &baseline, NUM_ITERATIONS, p.bufsiz - 1); + &baseline, NUM_ITERATIONS, p.buflen - 1); run_benchmark(libutf8proc, &p, "libutf8proc ", NULL, "comparison", - &baseline, NUM_ITERATIONS, p.bufsiz - 1); + &baseline, NUM_ITERATIONS, p.buflen - 1); free(p.buf); diff --git a/benchmark/utf8-decode.c b/benchmark/utf8-decode.c @@ -14,30 +14,23 @@ #define NUM_ITERATIONS 100000 -#ifdef __has_attribute - #if __has_attribute(optnone) - void libgrapheme(const void *) __attribute__((optnone)); - void libutf8proc(const void *) __attribute__((optnone)); - #endif -#endif - -struct payload { - char *buf_char; - utf8proc_uint8_t *buf_uint8; - size_t bufsiz; +struct utf8_benchmark_payload { + char *buf; + utf8proc_uint8_t *buf_utf8proc; + size_t buflen; }; void libgrapheme(const void *payload) { - const struct payload *p = payload; + const struct utf8_benchmark_payload *p = payload; uint_least32_t cp; size_t ret, off; - for (off = 0; off < p->bufsiz; off += ret) { - if ((ret = grapheme_decode_utf8(p->buf_char + off, - p->bufsiz - off, &cp)) > - (p->bufsiz - off)) { + for (off = 0; off < p->buflen; off += ret) { + if ((ret = grapheme_decode_utf8(p->buf + off, + p->buflen - off, &cp)) > + (p->buflen - off)) { break; } (void)cp; @@ -47,14 +40,14 @@ libgrapheme(const void *payload) void libutf8proc(const void *payload) { - const struct payload *p = payload; + const struct utf8_benchmark_payload *p = payload; utf8proc_int32_t cp; utf8proc_ssize_t ret; size_t off; - for (off = 0; off < p->bufsiz; off += (size_t)ret) { - if ((ret = utf8proc_iterate(p->buf_uint8 + off, - (utf8proc_ssize_t)(p->bufsiz - off), + for (off = 0; off < p->buflen; off += (size_t)ret) { + if ((ret = utf8proc_iterate(p->buf_utf8proc + off, + (utf8proc_ssize_t)(p->buflen - off), &cp)) 
< 0) { break; } @@ -65,57 +58,38 @@ libutf8proc(const void *payload) int main(int argc, char *argv[]) { - struct payload p; - size_t cpbufsiz, i, off, ret; - uint_least32_t *cpbuf; + struct utf8_benchmark_payload p; + size_t i; double baseline = (double)NAN; (void)argc; - if ((cpbuf = generate_test_buffer(character_test, LEN(character_test), - &cpbufsiz)) == NULL) { - return 1; - } + p.buf = generate_utf8_test_buffer(character_test, + LEN(character_test), + &(p.buflen)); - /* convert cp-buffer to utf8-data (both as char and custom uint8-type) */ - for (i = 0, p.bufsiz = 0; i < cpbufsiz; i++) { - p.bufsiz += grapheme_encode_utf8(cpbuf[i], NULL, 0); - } - if ((p.buf_char = malloc(p.bufsiz)) == NULL) { - fprintf(stderr, "malloc: %s\n", strerror(errno)); - exit(1); - } - for (i = 0, off = 0; i < cpbufsiz; i++, off += ret) { - if ((ret = grapheme_encode_utf8(cpbuf[i], p.buf_char + off, - p.bufsiz - off)) > - (p.bufsiz - off)) { - /* shouldn't happen */ - fprintf(stderr, "Error while converting buffer.\n"); - exit(1); - } - } - if ((p.buf_uint8 = malloc(p.bufsiz)) == NULL) { + /* convert cp-buffer to stupid custom libutf8proc-uint8-type */ + if ((p.buf_utf8proc = malloc(p.buflen)) == NULL) { fprintf(stderr, "malloc: %s\n", strerror(errno)); exit(1); } - for (i = 0; i < p.bufsiz; i++) { + for (i = 0; i < p.buflen; i++) { /* * even if char is larger than 8 bit, it will only have * any of the first 8 bits set (by construction). 
*/ - p.buf_uint8[i] = (utf8proc_uint8_t)p.buf_char[i]; + p.buf_utf8proc[i] = (utf8proc_uint8_t)p.buf[i]; } printf("%s\n", argv[0]); run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, - "byte", &baseline, NUM_ITERATIONS, p.bufsiz); + "byte", &baseline, NUM_ITERATIONS, p.buflen); run_benchmark(libutf8proc, &p, "libutf8proc ", "but unsafe (does not detect overlong encodings)", - "byte", &baseline, NUM_ITERATIONS, p.bufsiz); + "byte", &baseline, NUM_ITERATIONS, p.buflen); - free(cpbuf); - free(p.buf_char); - free(p.buf_uint8); + free(p.buf); + free(p.buf_utf8proc); return 0; } diff --git a/benchmark/util.c b/benchmark/util.c @@ -5,22 +5,23 @@ #include <time.h> #include "../gen/types.h" +#include "../grapheme.h" #include "util.h" uint_least32_t * -generate_test_buffer(const struct break_test *test, size_t testlen, - size_t *bufsiz) +generate_cp_test_buffer(const struct break_test *test, size_t testlen, + size_t *buflen) { size_t i, j, off; uint_least32_t *buf; /* allocate and generate buffer */ - for (i = 0, *bufsiz = 0; i < testlen; i++) { - *bufsiz += test[i].cplen; + for (i = 0, *buflen = 0; i < testlen; i++) { + *buflen += test[i].cplen; } - if (!(buf = calloc(*bufsiz, sizeof(*buf)))) { + if (!(buf = calloc(*buflen, sizeof(*buf)))) { fprintf(stderr, "generate_test_buffer: calloc: Out of memory.\n"); - return NULL; + exit(1); } for (i = 0, off = 0; i < testlen; i++) { for (j = 0; j < test[i].cplen; j++) { @@ -32,6 +33,42 @@ generate_test_buffer(const struct break_test *test, size_t testlen, return buf; } +char * +generate_utf8_test_buffer(const struct break_test *test, size_t testlen, + size_t *buflen) +{ + size_t i, j, off, ret; + char *buf; + + /* allocate and generate buffer */ + for (i = 0, *buflen = 0; i < testlen; i++) { + for (j = 0; j < test[i].cplen; j++) { + *buflen += grapheme_encode_utf8(test[i].cp[j], NULL, 0); + } + } + (*buflen)++; /* terminating NUL-byte */ + if (!(buf = malloc(*buflen))) { + fprintf(stderr, "generate_test_buffer: malloc: Out of 
memory.\n"); + exit(1); + } + for (i = 0, off = 0; i < testlen; i++) { + for (j = 0; j < test[i].cplen; j++, off += ret) { + if ((ret = grapheme_encode_utf8(test[i].cp[j], + buf + off, + *buflen - off)) > + (*buflen - off)) { + /* shouldn't happen */ + fprintf(stderr, "generate_utf8_test_buffer: " + "Buffer too small.\n"); + exit(1); + } + } + } + buf[*buflen - 1] = '\0'; + + return buf; +} + static double time_diff(struct timespec *a, struct timespec *b) { diff --git a/benchmark/util.h b/benchmark/util.h @@ -6,8 +6,17 @@ #define LEN(x) (sizeof(x) / sizeof(*(x))) -uint_least32_t *generate_test_buffer(const struct break_test *, size_t, - size_t *); +#ifdef __has_attribute + #if __has_attribute(optnone) + void libgrapheme(const void *) __attribute__((optnone)); + void libutf8proc(const void *) __attribute__((optnone)); + #endif +#endif + +uint_least32_t *generate_cp_test_buffer(const struct break_test *, size_t, + size_t *); +char *generate_utf8_test_buffer(const struct break_test *, size_t, size_t *); + void run_benchmark(void (*func)(const void *), const void *, const char *, const char *, const char *, double *, size_t, size_t);