libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit fc73d06fed76dd7cde37d3704949d01391ea0032
parent a815be4b5de7f7df2da664049fdb04874d37016a
Author: Laslo Hunhold <dev@frign.de>
Date:   Mon,  3 Oct 2022 23:13:26 +0200

Convert GRAPHEME_STATE to uint_least16_t and remove it

I was never quite happy with the fact that the internal state-struct
was visible in the grapheme.h-header, given the declaration of the
fields only namely served internal purposes and were useless noise to
the reader.

To keep it in was merely a choice made because I had always hoped
to be able to implement maybe a few more state-based pairwise
segmentation check functions and use the GRAPHEME_STATE type in more
places, but now after implementing the algorithms it becomes clear that
they all do not satisfy these pairwise semantics.

The first logical step was to convert the struct to an uint_least16_t,
which provides enough space (at least 16 bits) to store all the complete
state, and have internal deserialiation and serialization functions.

The remaining question was if the

	typedef uint_least16_t GRAPHEME_STATE

should be removed. I took inspiration from the Linux kernel coding
style[0], which in section 5b lays out the exact case of typedeffing
an integer that is meant to store flags (just like in our case).
It is argued there that there needs to be a good reason to typedef an
integer (e.g. given it might change by architecture or maybe change in
later versions). Both cases are not given here (we will _never_ need
more than 16 bits to store the grapheme cluster break state and you can
even reduce more wastage, e.g. for storing the prop which never exceeds
4 bits given NUM_CHAR_BREAK_PROPS == 14 < 15 == 2^4-1), and I must admit
that it improves readability a bit given you finally know what you're
dealing with.

The expression

	GRAPHEME_STATE state = 0;

admittedly looks a little fishy, given you don't really know what happens
behind the scenes unless you look in the header, and I want all of the
semantics to be crystal clear to the end-user.

[0]:https://www.kernel.org/doc/html/latest/process/coding-style.html#typedefs

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mbenchmark/character.c | 2+-
Mgrapheme.h | 9+--------
Mman/grapheme_is_character_break.sh | 4++--
Msrc/character.c | 60+++++++++++++++++++++++++++++++++++++++++++++---------------
4 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/benchmark/character.c b/benchmark/character.c @@ -23,7 +23,7 @@ struct break_benchmark_payload { void libgrapheme(const void *payload) { - GRAPHEME_STATE state = { 0 }; + uint_least16_t state = 0; const struct break_benchmark_payload *p = payload; size_t i; diff --git a/grapheme.h b/grapheme.h @@ -6,19 +6,12 @@ #include <stddef.h> #include <stdint.h> -typedef struct grapheme_internal_segmentation_state { - uint_least8_t prop; - bool prop_set; - bool gb11_flag; - bool gb12_13_flag; -} GRAPHEME_STATE; - #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD) size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *); size_t grapheme_encode_utf8(uint_least32_t, char *, size_t); -bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *); +bool grapheme_is_character_break(uint_least32_t, uint_least32_t, uint_least16_t *); bool grapheme_is_lowercase(const uint_least32_t *, size_t, size_t *); bool grapheme_is_uppercase(const uint_least32_t *, size_t, size_t *); diff --git a/man/grapheme_is_character_break.sh b/man/grapheme_is_character_break.sh @@ -8,7 +8,7 @@ cat << EOF .Sh SYNOPSIS .In grapheme.h .Ft size_t -.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "GRAPHEME_STATE *state" +.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state" .Sh DESCRIPTION The .Fn grapheme_is_character_break @@ -52,7 +52,7 @@ if there is not. int main(void) { - GRAPHEME_STATE state = { 0 }; + uint_least16_t state = 0; uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */ size_t i; diff --git a/src/character.c b/src/character.c @@ -7,6 +7,13 @@ #include "../grapheme.h" #include "util.h" +struct character_break_state { + uint_least8_t prop; + bool prop_set; + bool gb11_flag; + bool gb12_13_flag; +}; + static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_OTHER] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ @@ -106,38 +113,59 @@ get_break_prop(uint_least32_t cp) { if (likely(cp <= 0x10FFFF)) { return (enum char_break_property) - char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)]; + char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)]; } else { return CHAR_BREAK_PROP_OTHER; } } +static inline void +state_serialize(const struct character_break_state *in, uint_least16_t *out) +{ + *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */ + (uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | /* 9th bit */ + (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */ + (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); /* 11th bit */ +} + +static inline void +state_deserialize(uint_least16_t in, struct character_break_state *out) +{ + out->prop = in & UINT8_C(0xFF); + out->prop_set = in & (((uint_least16_t)(1)) << 8); + out->gb11_flag = in & (((uint_least16_t)(1)) << 9); + out->gb12_13_flag = in & (((uint_least16_t)(1)) << 10); +} + bool -grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state) +grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s) { + struct character_break_state state; enum char_break_property cp0_prop, cp1_prop; bool notbreak = false; - if (likely(state)) { - if (likely(state->prop_set)) { - cp0_prop = state->prop; + if (likely(s)) { + state_deserialize(*s, &state); + + if (likely(state.prop_set)) { + cp0_prop = state.prop; } else { cp0_prop = get_break_prop(cp0); } cp1_prop = get_break_prop(cp1); /* preserve prop of right codepoint for next iteration */ - state->prop = (uint_least8_t)cp1_prop; - state->prop_set = true; + state.prop = (uint_least8_t)cp1_prop; + state.prop_set = true; /* update flags */ - state->gb11_flag = + state.gb11_flag = flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS * - state->gb11_flag] & + state.gb11_flag] & UINT16_C(1) << cp1_prop; - state->gb12_13_flag = + state.gb12_13_flag = flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS * - state->gb12_13_flag] & + state.gb12_13_flag] & UINT16_C(1) << cp1_prop; /* @@ -145,17 +173,19 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules */ notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || - (dont_break_gb11[cp0_prop + state->gb11_flag * + (dont_break_gb11[cp0_prop + state.gb11_flag * NUM_CHAR_BREAK_PROPS] & (UINT16_C(1) << cp1_prop)) || - (dont_break_gb12_13[cp0_prop + state->gb12_13_flag * + (dont_break_gb12_13[cp0_prop + state.gb12_13_flag * NUM_CHAR_BREAK_PROPS] & (UINT16_C(1) << cp1_prop)); /* update or reset flags (when we have a break) */ if (likely(!notbreak)) { - state->gb11_flag = state->gb12_13_flag = false; + state.gb11_flag = state.gb12_13_flag = false; } + + state_serialize(&state, s); } else { cp0_prop = get_break_prop(cp0); cp1_prop = get_break_prop(cp1); @@ -178,7 +208,7 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA static size_t next_character_break(HERODOTUS_READER *r) { - GRAPHEME_STATE state = { 0 }; + uint_least16_t state = 0; uint_least32_t cp0 = 0, cp1 = 0; for (herodotus_read_codepoint(r, true, &cp0);