libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 06013743a38729c531a67a63bbfd55b50badddfe
parent 602ae9b2041df6c7e2b1d9f9da2b5ae57eb94b64
Author: Laslo Hunhold <dev@frign.de>
Date:   Tue,  4 Jan 2022 18:11:02 +0100

Add UTF-8 decoder benchmark

Here we can also see the trouble with the custom types in libutf8proc.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 5 ++++-
A benchmark/utf8-decode.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
@@ -6,6 +6,7 @@ include config.mk
 
 BENCHMARK =\
 	benchmark/character\
+	benchmark/utf8-decode\
 
 DATA =\
 	data/emoji-data.txt\
@@ -37,6 +38,7 @@ MAN7 = man/libgrapheme.7
 all: libgrapheme.a libgrapheme.so
 
 benchmark/character.o: benchmark/character.c config.mk gen/character-test.h grapheme.h benchmark/util.h
+benchmark/utf8-decode.o: benchmark/utf8-decode.c config.mk gen/character-test.h grapheme.h benchmark/util.h
 benchmark/util.o: benchmark/util.c config.mk benchmark/util.h
 gen/character-prop.o: gen/character-prop.c config.mk gen/util.h
 gen/character-test.o: gen/character-test.c config.mk gen/util.h
@@ -51,6 +53,7 @@ test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
 test/util.o: test/util.c config.mk test/util.h
 
 benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a
+benchmark/utf8-decode: benchmark/utf8-decode.o benchmark/util.o libgrapheme.a
 gen/character-test: gen/character-test.o gen/util.o
 gen/properties: gen/properties.o gen/util.o
 test/character: test/character.o test/util.o libgrapheme.a
@@ -139,4 +142,4 @@ dist:
 	tar -cf - "libgrapheme-$(VERSION)" | gzip -c > "libgrapheme-$(VERSION).tar.gz"
 	rm -rf "libgrapheme-$(VERSION)"
 
-.PHONY: all test install uninstall clean clean-data dist
+.PHONY: all benchmark test install uninstall clean clean-data dist
diff --git a/benchmark/utf8-decode.c b/benchmark/utf8-decode.c
@@ -0,0 +1,120 @@
+/* See LICENSE file for copyright and license details. */
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../grapheme.h"
+#include "../gen/character-test.h"
+#include "util.h"
+
+#include <utf8proc.h>
+
+#define NUM_ITERATIONS 100000
+
+#if defined __has_attribute
+	#if __has_attribute(optnone)
+		void libgrapheme(const void *) __attribute__((optnone));
+		void libutf8proc(const void *) __attribute__((optnone));
+	#endif
+#endif
+
+struct payload {
+	char *buf_char;
+	utf8proc_uint8_t *buf_uint8;
+	size_t bufsiz;
+};
+
+void
+libgrapheme(const void *payload)
+{
+	const struct payload *p = payload;
+	uint_least32_t cp;
+	size_t ret, off;
+
+	for (off = 0; off < p->bufsiz; off += ret) {
+		if ((ret = grapheme_decode_utf8(p->buf_char + off,
+		                                p->bufsiz - off, &cp)) >
+		    (p->bufsiz - off)) {
+			break;
+		}
+		(void)cp;
+	}
+}
+
+void
+libutf8proc(const void *payload)
+{
+	const struct payload *p = payload;
+	utf8proc_int32_t cp;
+	utf8proc_ssize_t ret;
+	size_t off;
+
+	for (off = 0; off < p->bufsiz; off += (size_t)ret) {
+		if ((ret = utf8proc_iterate(p->buf_uint8 + off,
+		                            (utf8proc_ssize_t)(p->bufsiz - off),
+		                            &cp)) < 0) {
+			break;
+		}
+		(void)cp;
+	}
+}
+
+int
+main(int argc, char *argv[])
+{
+	struct payload p;
+	size_t cpbufsiz, i, off, ret;
+	uint32_t *cpbuf;
+	double baseline = (double)NAN;
+
+	(void)argc;
+
+	if ((cpbuf = generate_test_buffer(character_test, LEN(character_test),
+	                                  &cpbufsiz)) == NULL) {
+		return 1;
+	}
+
+	/* convert cp-buffer to utf8-data (both as char and custom uint8-type) */
+	for (i = 0, p.bufsiz = 0; i < cpbufsiz; i++) {
+		p.bufsiz += grapheme_encode_utf8(cpbuf[i], NULL, 0);
+	}
+	if ((p.buf_char = malloc(p.bufsiz)) == NULL) {
+		fprintf(stderr, "malloc: %s\n", strerror(errno));
+		exit(1);
+	}
+	for (i = 0, off = 0; i < cpbufsiz; i++, off += ret) {
+		if ((ret = grapheme_encode_utf8(cpbuf[i], p.buf_char + off,
+		                                p.bufsiz - off)) >
+		    (p.bufsiz - off)) {
+			/* shouldn't happen */
+			fprintf(stderr, "Error while converting buffer.\n");
+			exit(1);
+		}
+	}
+	if ((p.buf_uint8 = malloc(p.bufsiz)) == NULL) {
+		fprintf(stderr, "malloc: %s\n", strerror(errno));
+		exit(1);
+	}
+	for (i = 0; i < p.bufsiz; i++) {
+		/*
+		 * even if char is larger than 8 bit, it will only have
+		 * any of the first 8 bits set (by construction).
+		 */
+		p.buf_uint8[i] = (utf8proc_uint8_t)p.buf_char[i];
+	}
+
+	printf("%s\n", argv[0]);
+	run_benchmark(libgrapheme, &p, "libgrapheme ", &baseline,
+	              NUM_ITERATIONS);
+	run_benchmark(libutf8proc, &p, "libutf8proc ", &baseline,
+	              NUM_ITERATIONS);
+
+	free(cpbuf);
+	free(p.buf_char);
+	free(p.buf_uint8);
+
+	return 0;
+}