libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit b91df388fd392e4feb024462da6feff4ed304814
parent 759f02fbe74f11f64ba451b3a18090cbbb10add2
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun,  2 Jan 2022 12:24:43 +0100

Create separate comparative benchmark/ folder

Benchmarks shouldn't really be part of the functional test and are
only meaningful in direct comparison with other implementations, which
obviously should not be a install-time-dependency. Instead, benchmarks
are meant as a development-tool, and thus deserve their own separate
folder.

So, which library should we compare ourselves to? There are many
options, like libunistring, libicu and libutf8proc, but at the end only
the latter remains, for the following reasons: libunistring's
implementation is incorrect, as it is not stateful and thus probably
fell victim to the semantic change of the standard a few years ago,
which makes any benchmark against it meaningless. libicu does not offer
a way to simply check two code points with a manual state. libutf8proc
is a great comparison, because it's the backbone of the modern Julia
language and received a lot of performance optimization.

Measuring ourselves against it makes a lot of sense.

Regarding the implementation: Given we don't use the return value of the
break check, I added function-level-optimization attributes to the
function. However, they are behind the feature-check "__has_attribute"
ensuring maximum portability even for compilers which don't offer that.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Abenchmark/character.c | 68++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Abenchmark/util.c | 72++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Abenchmark/util.h | 13+++++++++++++
3 files changed, 153 insertions(+), 0 deletions(-)

diff --git a/benchmark/character.c b/benchmark/character.c @@ -0,0 +1,68 @@ +/* See LICENSE file for copyright and license details. */ +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "../grapheme.h" +#include "../gen/character-test.h" +#include "util.h" + +#include <unigbrk.h> +#include <utf8proc.h> + +#define NUM_ITERATIONS 10000 + +#if defined __has_attribute + #if __has_attribute(optnone) + void libgrapheme(const uint32_t *, size_t) __attribute__((optnone)); + void libutf8proc(const uint32_t *, size_t) __attribute__((optnone)); + #endif +#endif + +void +libgrapheme(const uint32_t *buf, size_t bufsiz) +{ + GRAPHEME_STATE state = { 0 }; + size_t i; + + for (i = 0; i + 1 < bufsiz; i++) { + (void)grapheme_is_character_break(buf[i], buf[i+1], &state); + } +} + +void +libutf8proc(const uint32_t *buf, size_t bufsiz) +{ + utf8proc_int32_t state = 0; + size_t i; + + for (i = 0; i + 1 < bufsiz; i++) { + (void)utf8proc_grapheme_break_stateful(buf[i], buf[i+1], &state); + } +} + +int +main(int argc, char *argv[]) +{ + size_t bufsiz; + uint32_t *buf; + double baseline = NAN; + + (void)argc; + + if ((buf = generate_test_buffer(character_test, LEN(character_test), + &bufsiz)) == NULL) { + return 1; + } + + printf("%s\n", argv[0]); + run_benchmark(libgrapheme, "libgrapheme ", &baseline, buf, bufsiz, + NUM_ITERATIONS); + run_benchmark(libutf8proc, "libutf8proc ", &baseline, buf, bufsiz, + NUM_ITERATIONS); + + free(buf); + + return 0; +} diff --git a/benchmark/util.c b/benchmark/util.c @@ -0,0 +1,72 @@ +/* See LICENSE file for copyright and license details. */ +#include <math.h> +#include <stdlib.h> +#include <stdio.h> +#include <time.h> + +#include "util.h" + +uint32_t * +generate_test_buffer(const struct test *t, size_t tlen, size_t *bufsiz) +{ + size_t i, j, off; + uint32_t *buf; + + /* allocate and generate buffer */ + for (i = 0, *bufsiz = 0; i < tlen; i++) { + *bufsiz += t[i].cplen; + } + if (!(buf = calloc(*bufsiz, sizeof(*buf)))) { + fprintf(stderr, "generate_test_buffer: calloc: Out of memory.\n"); + return NULL; + } + for (i = 0, off = 0; i < tlen; i++) { + for (j = 0; j < t[i].cplen; j++) { + buf[off + j] = t[i].cp[j]; + } + off += t[i].cplen; + } + + return buf; +} + +static double +time_diff(struct timespec *a, struct timespec *b) +{ + return (double)(b->tv_sec - a->tv_sec) + + (double)(b->tv_nsec - a->tv_nsec) * 1E-9; +} + +void +run_benchmark(void (*func)(const uint32_t *, size_t), const char *name, + double *baseline, const uint32_t *buf, size_t bufsiz, + uint32_t num_iterations) +{ + struct timespec start, end; + size_t i; + double diff; + + printf("\t%s ", name); + fflush(stdout); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (i = 0; i < num_iterations; i++) { + func(buf, bufsiz); + + if (i % (num_iterations / 10) == 0) { + printf("."); + fflush(stdout); + } + } + clock_gettime(CLOCK_MONOTONIC, &end); + diff = time_diff(&start, &end); + + if (isnan(*baseline)) { + *baseline = diff; + printf(" %.3fs (baseline)\n", diff); + } else { + printf(" %.3fs (%.2f%% %s)\n", diff, + fabs(1.0 - diff / *baseline) * 100, + (diff < *baseline) ? "faster" : "slower"); + } +} diff --git a/benchmark/util.h b/benchmark/util.h @@ -0,0 +1,13 @@ +/* See LICENSE file for copyright and license details. */ +#ifndef UTIL_H +#define UTIL_H + +#include "../gen/types.h" + +#define LEN(x) (sizeof(x) / sizeof(*(x))) + +uint32_t *generate_test_buffer(const struct test *, size_t, size_t *); +void run_benchmark(void (*func)(const uint32_t *, size_t), const char *, + double *, const uint32_t *, size_t, uint32_t); + +#endif /* UTIL_H */