libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 94fb8dc7588eb4ee563f5a4c888fcd65a8497834
parent 97b556d67245215e201fef717082b0156f161eed
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 17 Oct 2020 18:17:04 +0200

Refactor grapheme_boundary()

Get rid of the macro-mess, make the code more readable with better
comments and formatting and allow the state-pointer to be NULL, which
is as if the value state is pointing to contained 0, only with the
difference that the state is not updated.

I would, actually, prefer to return some kind of error, but I like
the fact that the function can't really error out as is, which is why
I've chosen this compromise.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Msrc/boundary_body.c | 210++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
1 file changed, 129 insertions(+), 81 deletions(-)

diff --git a/src/boundary_body.c b/src/boundary_body.c @@ -10,37 +10,27 @@ enum { GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ }; -static int -cp_cmp(const void *a, const void *b) -{ - uint32_t cp = *(uint32_t *)a; - uint32_t *range = (uint32_t *)b; - - return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]); -} - -enum property { - PROP_CR, /* carriage return */ - PROP_LF, /* line feed */ - PROP_CONTROL, /* control character */ - PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */ - PROP_ZWJ, /* zero width joiner */ - PROP_RI, /* regional indicator */ - PROP_PREPEND, /* prepend character */ - PROP_SPACINGMARK, /* spacing mark */ - PROP_L, /* hangul syllable type L */ - PROP_V, /* hangul syllable type V */ - PROP_T, /* hangul syllable type T */ - PROP_LV, /* hangul syllable type LV */ - PROP_LVT, /* hangul syllable type LVT */ - PROP_EXTPICT, /* extended pictographic */ - NUM_PROPS, +enum cp_property { + PROP_CR, /* carriage return */ + PROP_LF, /* line feed */ + PROP_CONTROL, /* control character */ + PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */ + PROP_ZWJ, /* zero width joiner */ + PROP_RI, /* regional indicator */ + PROP_PREPEND, /* prepend character */ + PROP_SPACINGMARK, /* spacing mark */ + PROP_L, /* hangul syllable type L */ + PROP_V, /* hangul syllable type V */ + PROP_T, /* hangul syllable type T */ + PROP_LV, /* hangul syllable type LV */ + PROP_LVT, /* hangul syllable type LVT */ + PROP_EXTPICT, /* extended pictographic */ }; struct { const uint32_t (*table)[2]; size_t tablelen; -} tables[] = { +} cp_property_tables[] = { [PROP_CR] = { .table = cr_table, .tablelen = LEN(cr_table), @@ -99,128 +89,186 @@ struct { }, }; +struct cp_properties { + uint32_t cp; + int_least16_t determined; + int_least16_t state; +}; + static int -is(uint32_t cp[2], char (*props)[2], int index, enum property p) +cp_cmp(const void *a, const void *b) { - if (props[p][index] == 2) { - /* need to determine property */ - props[p][index] = (bsearch(&(cp[index]), - tables[p].table, - tables[p].tablelen, - sizeof(*(tables[p].table)), - cp_cmp) == NULL) ? 0 : 1; - } + uint32_t cp = *(uint32_t *)a; + uint32_t *range = (uint32_t *)b; - return props[p][index]; + return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]); } -#define IS(I, PROP) (is(cp, props, I, PROP)) +static int +has_property(struct cp_properties *props, enum cp_property p) +{ + if (!(props->determined & (1 << p))) { + /* not determined yet, do a lookup and set the state */ + if (bsearch(&props->cp, cp_property_tables[p].table, + cp_property_tables[p].tablelen, + sizeof(*cp_property_tables[p].table), + cp_cmp)) { + props->state |= (1 << p); + } else { + props->state &= ~(1 << p); + } + + /* now it's determined */ + props->determined |= (1 << p); + } + + return (props->state & (1 << p)); +} int -grapheme_boundary(uint32_t cp0, uint32_t cp1, int *state) +grapheme_boundary(uint32_t a, uint32_t b, int *state) { - uint32_t cp[2] = { cp0, cp1 }; - char props[NUM_PROPS][2]; - size_t i; + struct cp_properties props[] = { + { + .cp = a, + }, + { + .cp = b, + }, + }; + int s; - if ((cp0 >= 0x20 && cp0 <= 0x7E) && - (cp1 >= 0x20 && cp1 <= 0x7E)) { - /* skip printable ascii */ + /* skip printable ASCII */ + if ((a >= 0x20 && a <= 0x7E) && + (b >= 0x20 && b <= 0x7E)) { return 1; } - /* set all properties to undetermined (2) */ - for (i = 0; i < NUM_PROPS; i++) { - props[i][0] = props[i][1] = 2; - } + /* set internal state based on given state-pointer */ + s = (state != NULL) ? *state : 0; - /* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules */ + /* + * Apply grapheme cluster breaking algorithm (UAX #29), see + * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules + */ - /* update state machine */ - if (IS(1, PROP_RI)) { - if (IS(0, PROP_RI)) { + /* + * update state + */ + if (has_property(&props[1], PROP_RI)) { + if (has_property(&props[0], PROP_RI)) { /* one more RI is on the left side of the seam */ - *state ^= GRAPHEME_STATE_RI_ODD; + s ^= GRAPHEME_STATE_RI_ODD; } else { /* an RI appeared on the right side but the left side is not an RI, reset state (0 is even) */ - *state &= ~GRAPHEME_STATE_RI_ODD; + s &= ~GRAPHEME_STATE_RI_ODD; } } if (!(*state & GRAPHEME_STATE_EMOJI) && - ((IS(0, PROP_EXTPICT) && IS(1, PROP_ZWJ)) || - (IS(0, PROP_EXTPICT) && IS(1, PROP_EXTEND)))) { - *state |= GRAPHEME_STATE_EMOJI; + ((has_property(&props[0], PROP_EXTPICT) && + has_property(&props[1], PROP_ZWJ)) || + (has_property(&props[0], PROP_EXTPICT) && + has_property(&props[1], PROP_EXTEND)))) { + s |= GRAPHEME_STATE_EMOJI; } else if ((*state & GRAPHEME_STATE_EMOJI) && - ( - (IS(0, PROP_ZWJ) && IS(1, PROP_EXTPICT)) || - (IS(0, PROP_EXTEND) && IS(1, PROP_EXTEND)) || - (IS(0, PROP_EXTEND) && IS(1, PROP_ZWJ)) || - (IS(0, PROP_EXTPICT) && IS(1, PROP_ZWJ)) || - (IS(0, PROP_EXTPICT) && IS(1, PROP_EXTEND)) - ) - ) { + ((has_property(&props[0], PROP_ZWJ) && + has_property(&props[1], PROP_EXTPICT)) || + (has_property(&props[0], PROP_EXTEND) && + has_property(&props[1], PROP_EXTEND)) || + (has_property(&props[0], PROP_EXTEND) && + has_property(&props[1], PROP_ZWJ)) || + (has_property(&props[0], PROP_EXTPICT) && + has_property(&props[1], PROP_ZWJ)) || + (has_property(&props[0], PROP_EXTPICT) && + has_property(&props[1], PROP_EXTEND)))) { /* GRAPHEME_STATE_EMOJI remains */ } else { - *state &= ~GRAPHEME_STATE_EMOJI; + s &= ~GRAPHEME_STATE_EMOJI; } + /* write updated state to state-pointer, if given */ + if (state != NULL) { + *state = s; + } + + /* + * apply rules + */ + + /* skip GB1 and GB2, as they are never satisfied here */ + /* GB3 */ - if (IS(0, PROP_CR) && IS(1, PROP_LF)) { + if (has_property(&props[0], PROP_CR) && + has_property(&props[1], PROP_LF)) { return 0; } /* GB4 */ - if (IS(0, PROP_CONTROL) || IS(0, PROP_CR) || IS(0, PROP_LF)) { + if (has_property(&props[0], PROP_CONTROL) || + has_property(&props[0], PROP_CR) || + has_property(&props[0], PROP_LF)) { return 1; } /* GB5 */ - if (IS(1, PROP_CONTROL) || IS(1, PROP_CR) || IS(1, PROP_LF)) { + if (has_property(&props[1], PROP_CONTROL) || + has_property(&props[1], PROP_CR) || + has_property(&props[1], PROP_LF)) { return 1; } /* GB6 */ - if (IS(0, PROP_L) && (IS(1, PROP_L) || IS(1, PROP_V) || - IS(1, PROP_LV) || IS(1, PROP_LVT))) { + if (has_property(&props[0], PROP_L) && + (has_property(&props[1], PROP_L) || + has_property(&props[1], PROP_V) || + has_property(&props[1], PROP_LV) || + has_property(&props[1], PROP_LVT))) { return 0; } /* GB7 */ - if ((IS(0, PROP_LV) || IS(0, PROP_V)) && (IS(1, PROP_V) || - IS(1, PROP_T))) { + if ((has_property(&props[0], PROP_LV) || + has_property(&props[0], PROP_V)) && + (has_property(&props[1], PROP_V) || + has_property(&props[1], PROP_T))) { return 0; } /* GB8 */ - if ((IS(0, PROP_LVT) || IS(0, PROP_T)) && IS(1, PROP_T)) { + if ((has_property(&props[0], PROP_LVT) || + has_property(&props[0], PROP_T)) && + has_property(&props[1], PROP_T)) { return 0; } /* GB9 */ - if (IS(1, PROP_EXTEND) || IS(1, PROP_ZWJ)) { + if (has_property(&props[1], PROP_EXTEND) || + has_property(&props[1], PROP_ZWJ)) { return 0; } /* GB9a */ - if (IS(1, PROP_SPACINGMARK)) { + if (has_property(&props[1], PROP_SPACINGMARK)) { return 0; } /* GB9b */ - if (IS(0, PROP_PREPEND)) { + if (has_property(&props[0], PROP_PREPEND)) { return 0; } /* GB11 */ - if ((*state & GRAPHEME_STATE_EMOJI) && IS(0, PROP_ZWJ) && - IS(1, PROP_EXTPICT)) { + if ((s & GRAPHEME_STATE_EMOJI) && + has_property(&props[0], PROP_ZWJ) && + has_property(&props[1], PROP_EXTPICT)) { return 0; } /* GB12/GB13 */ - if (IS(0, PROP_RI) && IS(1, PROP_RI) && - (*state & GRAPHEME_STATE_RI_ODD)) { + if (has_property(&props[0], PROP_RI) && + has_property(&props[1], PROP_RI) && + (s & GRAPHEME_STATE_RI_ODD)) { return 0; }