libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 7981a5db713073992d00ee2231b88558977671aa
parent 17c56be9f79741c521dc76cf256d6dc2a7428b62
Author: Laslo Hunhold <dev@frign.de>
Date:   Fri, 10 Dec 2021 20:30:21 +0100

Severely speed up grapheme cluster break detection

Instead of discarding all properties we determined for two adjacent
codepoints, we use the given state (we have to have for the flags) to
shift the properties of the right character and move them to the left
character. This change leads to a speedup of ~25%.

On the API, it changes the "int *" state argument to an opaque type
"LG_SEGMENTATION_STATE *". Any normal library would just define an
internal struct and export a function "init_state" or something and
be done with it, but I didn't like the idea of having heap-allocations
in libgrapheme and wanted to make it possible to just do

   LG_SEGMENTATION_STATE state;
   lg_grapheme_isbreak(a, b, &state);

but have also the choice to do

   LG_SEGMENTATION_STATE *state;
   state = malloc(sizeof(*state));
   lg_grapheme_isbreak(a, b, &state);

whatever is preferred or makes sense in the context.

Doing this is pretty difficult, because allowing C to know the storage
size of a type through the API is only possible if it knows the
implementation details. My first attempt was to define a struct with
identical structure in grapheme.h (only with "anonymous" structs
rather than the internal heisenstate-structs), but after consulting
the standard this is undefined behaviour and breaks strict aliasing
rules. So I bit the bullet and gave the definition in the header,
indicating with "internal" that these types should not be used.

In most cases, users will probably never get in contact with this data
structure, as lg_grapheme_nextbreak() is still as clean as before.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mgrapheme.h | 18+++++++++++++++---
Msrc/grapheme.c | 182++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Msrc/utf8.c | 5+++--
Msrc/util.c | 31++++++++++++++++++++-----------
Msrc/util.h | 16++++++----------
Mtest/grapheme-performance.c | 6++++--
Mtest/grapheme.c | 7++++---
Mtest/utf8-decode.c | 2+-
Mtest/utf8-encode.c | 2+-
9 files changed, 156 insertions(+), 113 deletions(-)

diff --git a/grapheme.h b/grapheme.h @@ -5,12 +5,24 @@ #include <stddef.h> #include <stdint.h> +struct lg_internal_heisenstate { + uint_least64_t determined; + uint_least64_t state; +}; + +typedef struct lg_internal_segmentation_state { + struct lg_internal_heisenstate a; + struct lg_internal_heisenstate b; + uint_least16_t flags; +} LG_SEGMENTATION_STATE; + #define LG_CODEPOINT_INVALID UINT32_C(0xFFFD) +size_t lg_grapheme_nextbreak(const char *); + +int lg_grapheme_isbreak(uint32_t, uint32_t, LG_SEGMENTATION_STATE *); + size_t lg_utf8_decode(const uint8_t *, size_t, uint32_t *); size_t lg_utf8_encode(uint32_t, uint8_t *, size_t); -size_t lg_grapheme_nextbreak(const char *); -int lg_grapheme_isbreak(uint32_t, uint32_t, int *); - #endif /* GRAPHEME_H */ diff --git a/src/grapheme.c b/src/grapheme.c @@ -1,73 +1,79 @@ /* See LICENSE file for copyright and license details. */ #include <stddef.h> #include <stdlib.h> +#include <string.h> #include "../gen/grapheme.h" #include "../grapheme.h" +#include "util.h" enum { - GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ - GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ + GRAPHEME_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ + GRAPHEME_FLAG_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ }; int -lg_grapheme_isbreak(uint32_t a, uint32_t b, int *state) +lg_grapheme_isbreak(uint32_t a, uint32_t b, LG_SEGMENTATION_STATE *state) { - struct heisenstate prop[2] = { 0 }; - int s; + struct lg_internal_heisenstate *p[2] = { 0 }; + int ret = 1, flags = 0; + + /* set state depending on state pointer */ + if (state != NULL) { + p[0] = &(state->a); + p[1] = &(state->b); + flags = state->flags; + } /* skip printable ASCII */ if ((a >= 0x20 && a <= 0x7E) && (b >= 0x20 && b <= 0x7E)) { - return 1; + goto hasbreak; } - /* set internal state based on given state-pointer */ - s = (state != NULL) ? *state : 0; - /* * Apply grapheme cluster breaking algorithm (UAX #29), see * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules */ /* - * update state + * update flags, if state-pointer given */ - if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { - if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { - /* one more RI is on the left side of the seam */ - s ^= GRAPHEME_STATE_RI_ODD; + if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { + if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { + /* one more RI is on the left side of the seam, flip state */ + flags ^= GRAPHEME_FLAG_RI_ODD; } else { /* an RI appeared on the right side but the left - side is not an RI, reset state (0 is even) */ - s &= ~GRAPHEME_STATE_RI_ODD; + side is not an RI, reset state (number 0 is even) */ + flags &= ~GRAPHEME_FLAG_RI_ODD; } } - if (!(*state & GRAPHEME_STATE_EMOJI) && - ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || - (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { - s |= GRAPHEME_STATE_EMOJI; - } else if ((*state & GRAPHEME_STATE_EMOJI) && - ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) || - (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) || - (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || - (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || - (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { - /* GRAPHEME_STATE_EMOJI remains */ + if (!(flags & GRAPHEME_FLAG_EMOJI) && + ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || + (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { + flags |= GRAPHEME_FLAG_EMOJI; + } else if ((flags & GRAPHEME_FLAG_EMOJI) && + ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) || + (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) || + (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || + (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || + (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { + /* GRAPHEME_FLAG_EMOJI remains */ } else { - s &= ~GRAPHEME_STATE_EMOJI; + flags &= ~GRAPHEME_FLAG_EMOJI; } - /* write updated state to state-pointer, if given */ + /* write updated flags to state, if given */ if (state != NULL) { - *state = s; + state->flags = flags; } /* @@ -77,81 +83,97 @@ lg_grapheme_isbreak(uint32_t a, uint32_t b, int *state) /* skip GB1 and GB2, as they are never satisfied here */ /* GB3 */ - if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) { - return 0; + if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) { + goto nobreak; } /* GB4 */ - if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CONTROL) || - has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) || - has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_LF)) { - return 1; + if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CONTROL) || + has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) || + has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_LF)) { + goto hasbreak; } /* GB5 */ - if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CONTROL) || - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CR) || - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) { - return 1; + if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CONTROL) || + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CR) || + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) { + goto hasbreak; } /* GB6 */ - if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) && - (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) || - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) { - return 0; + if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) && + (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) || + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || + + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) { + goto nobreak; } /* GB7 */ - if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || - has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) && - (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) { - return 0; + if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || + has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) && + (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) { + goto nobreak; } /* GB8 */ - if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) || - has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) { - return 0; + if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) || + has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) { + goto nobreak; } /* GB9 */ - if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND) || - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) { - return 0; + if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND) || + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) { + goto nobreak; } /* GB9a */ - if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) { - return 0; + if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) { + goto nobreak; } /* GB9b */ - if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) { - return 0; + if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) { + goto nobreak; } /* GB11 */ - if ((s & GRAPHEME_STATE_EMOJI) && - has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) { - return 0; + if ((flags & GRAPHEME_FLAG_EMOJI) && + has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) { + goto nobreak; } /* GB12/GB13 */ - if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && - has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && - (s & GRAPHEME_STATE_RI_ODD)) { - return 0; + if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && + has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && + (flags & GRAPHEME_FLAG_RI_ODD)) { + goto nobreak; } /* GB999 */ - return 1; + goto hasbreak; +nobreak: + ret = 0; +hasbreak: + if (state != NULL) { + /* move b-state to a-state, discard b-state */ + memcpy(&(state->a), &(state->b), sizeof(state->a)); + memset(&(state->b), 0, sizeof(state->b)); + + /* reset flags */ + if (ret == 1) { + state->flags = 0; + } + } + + return ret; } size_t @@ -159,7 +181,7 @@ lg_grapheme_nextbreak(const char *str) { uint32_t cp0, cp1; size_t ret, len = 0; - int state = 0; + LG_SEGMENTATION_STATE state = { 0 }; if (str == NULL) { return 0; diff --git a/src/utf8.c b/src/utf8.c @@ -1,9 +1,10 @@ /* See LICENSE file for copyright and license details. */ -#include "../grapheme.h" #include <stdio.h> +#include "../grapheme.h" +#include "util.h" + #define BETWEEN(c, l, u) (c >= l && c <= u) -#define LEN(x) (sizeof(x) / sizeof(*x)) /* lookup-table for the types of sequence first bytes */ static const struct { diff --git a/src/util.c b/src/util.c @@ -2,10 +2,13 @@ #include <stdint.h> #include <stdlib.h> +#include "../grapheme.h" #include "util.h" +/* 64-slot (0,...,63) optionally undetermined binary state */ + int -heisenstate_get(struct heisenstate *h, int slot) +heisenstate_get(struct lg_internal_heisenstate *h, int slot) { if (h == NULL || slot >= 64 || slot < 0 || !(h->determined & (1 << slot))) { @@ -18,7 +21,7 @@ heisenstate_get(struct heisenstate *h, int slot) } int -heisenstate_set(struct heisenstate *h, int slot, int state) +heisenstate_set(struct lg_internal_heisenstate *h, int slot, int state) { if (h == NULL || slot >= 64 || slot < 0) { /* no state given or slot out of range */ @@ -45,17 +48,23 @@ cp_cmp(const void *a, const void *b) } int -has_property(uint32_t cp, struct heisenstate *cpstate, +has_property(uint32_t cp, struct lg_internal_heisenstate *cpstate, const struct range_list *proptable, int property) { - if (heisenstate_get(cpstate, property) == -1) { - /* state undetermined, make a lookup and set it */ - heisenstate_set(cpstate, property, bsearch(&cp, - proptable[property].data, - proptable[property].len, - sizeof(*proptable[property].data), - cp_cmp) ? 1 : 0); + int res; + + if (cpstate == NULL || + (res = heisenstate_get(cpstate, property)) == -1) { + /* make a lookup */ + res = bsearch(&cp, proptable[property].data, + proptable[property].len, + sizeof(*proptable[property].data), cp_cmp) ? + 1 : 0; + + if (cpstate != NULL) { + heisenstate_set(cpstate, property, res); + } } - return heisenstate_get(cpstate, property); + return res; } diff --git a/src/util.h b/src/util.h @@ -5,7 +5,9 @@ #include <stddef.h> #include <stdint.h> -#define LEN(x) (sizeof (x) / sizeof *(x)) +#include "../grapheme.h" + +#define LEN(x) (sizeof(x) / sizeof(*(x))) struct range { uint32_t lower; @@ -17,16 +19,10 @@ struct range_list { size_t len; }; -/* 64-slot (0,...,63) optionally undetermined binary state */ -struct heisenstate { - uint_least64_t determined; - uint_least64_t state; -}; - -int heisenstate_get(struct heisenstate *, int); -int heisenstate_set(struct heisenstate *, int, int); +int heisenstate_get(struct lg_internal_heisenstate *, int); +int heisenstate_set(struct lg_internal_heisenstate *, int, int); -int has_property(uint32_t, struct heisenstate *, +int has_property(uint32_t, struct lg_internal_heisenstate *, const struct range_list *, int); #endif /* UTIL_H */ diff --git a/test/grapheme-performance.c b/test/grapheme-performance.c @@ -2,12 +2,13 @@ #include <stdint.h> #include <stdio.h> #include <stdlib.h> +#include <string.h> #include <time.h> #include "../grapheme.h" #include "../gen/grapheme-test.h" -#define LEN(x) (sizeof(x) / sizeof(*x)) +#define LEN(x) (sizeof(x) / sizeof(*(x))) #define NUM_ITERATIONS 10000 int64_t time_diff(struct timespec *a, struct timespec *b) @@ -22,7 +23,7 @@ main(void) struct timespec start, end; size_t i, j, bufsiz, off; uint32_t *buf; - int state; + LG_SEGMENTATION_STATE state; double cp_per_sec; /* allocate and generate buffer */ @@ -46,6 +47,7 @@ main(void) clock_gettime(CLOCK_MONOTONIC, &start); for (i = 0; i < NUM_ITERATIONS; i++) { + memset(&state, 0, sizeof(state)); for (j = 0; j < bufsiz - 1; j++) { (void)lg_grapheme_isbreak(buf[j], buf[j+1], &state); } diff --git a/test/grapheme.c b/test/grapheme.c @@ -7,17 +7,18 @@ #include "../grapheme.h" #include "../gen/grapheme-test.h" -#define LEN(x) (sizeof(x) / sizeof(*x)) +#define LEN(x) (sizeof(x) / sizeof(*(x))) int main(void) { - int state; + LG_SEGMENTATION_STATE state; size_t i, j, k, len, failed; /* grapheme break test */ for (i = 0, failed = 0; i < LEN(grapheme_test); i++) { - for (j = 0, k = 0, state = 0, len = 1; j < grapheme_test[i].cplen; j++) { + memset(&state, 0, sizeof(state)); + for (j = 0, k = 0, len = 1; j < grapheme_test[i].cplen; j++) { if ((j + 1) == grapheme_test[i].cplen || lg_grapheme_isbreak(grapheme_test[i].cp[j], grapheme_test[i].cp[j + 1], diff --git a/test/utf8-decode.c b/test/utf8-decode.c @@ -6,7 +6,7 @@ #include "../grapheme.h" -#define LEN(x) (sizeof(x) / sizeof(*x)) +#define LEN(x) (sizeof(x) / sizeof(*(x))) static const struct { uint8_t *arr; /* UTF-8 byte sequence */ diff --git a/test/utf8-encode.c b/test/utf8-encode.c @@ -6,7 +6,7 @@ #include "../grapheme.h" -#define LEN(x) (sizeof(x) / sizeof(*x)) +#define LEN(x) (sizeof(x) / sizeof(*(x))) static const struct { uint32_t cp; /* input code point */