libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 65785f699be45dd77bdcbfc1d3aded39151f3205
parent b13acfd6cd5114fcddbffaf9855664a95f966403
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 24 Sep 2022 11:45:20 +0200

Refactor character-functions with Herodotus

This also unifies the code and drops a lot of complicated state
handling.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/character.c | 60 +++++++++++++++++++-----------------------------------------
M src/util.c | 6 +++++-
2 files changed, 24 insertions(+), 42 deletions(-)

diff --git a/src/character.c b/src/character.c @@ -175,61 +175,39 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA return !notbreak; } -size_t -grapheme_next_character_break(const uint_least32_t *str, size_t len) +static size_t +next_character_break(HERODOTUS_READER *r) { GRAPHEME_STATE state = { 0 }; - size_t off; - - if (str == NULL || len == 0) { - return 0; - } + uint_least32_t cp0 = 0, cp1 = 0; - for (off = 1; off < len; off++) { - if (grapheme_is_character_break(str[off - 1], str[off], &state)) { + for (herodotus_read_codepoint(r, true, &cp0); + herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS; + herodotus_read_codepoint(r, true, &cp0)) { + if (grapheme_is_character_break(cp0, cp1, &state)) { break; } } - return off; + return herodotus_reader_number_read(r); } size_t -grapheme_next_character_break_utf8(const char *str, size_t len) +grapheme_next_character_break(const uint_least32_t *str, size_t len) { - GRAPHEME_STATE state = { 0 }; - uint_least32_t cp0 = 0, cp1 = 0; - size_t off, ret; - - if (str == NULL || len == 0) { - return 0; - } + HERODOTUS_READER r; - for (off = 0; (len == SIZE_MAX) || off < len; off += ret) { - cp0 = cp1; - ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ? 
- SIZE_MAX : len - off, &cp1); + herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); - if (len != SIZE_MAX && ret > (len - off)) { - /* string ended abruptly, simply accept cropping */ - ret = len - off; - } + return next_character_break(&r); +} - if (len == SIZE_MAX && cp1 == 0) { - /* we hit a NUL-byte and are done */ - break; - } +size_t +grapheme_next_character_break_utf8(const char *str, size_t len) +{ + HERODOTUS_READER r; - if (off == 0) { - /* - * we skip the first round, as we need both - * cp0 and cp1 to be initialized - */ - continue; - } else if (grapheme_is_character_break(cp0, cp1, &state)) { - break; - } - } + herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); - return off; + return next_character_break(&r); } diff --git a/src/util.c b/src/util.c @@ -111,7 +111,11 @@ herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp) } if (r->type == HERODOTUS_TYPE_CODEPOINT) { - *cp = ((const uint_least32_t *)(r->src))[r->off++]; + *cp = ((const uint_least32_t *)(r->src))[r->off]; + + if (advance) { + r->off++; + } } else { /* r->type == HERODOTUS_TYPE_UTF8 */ ret = grapheme_decode_utf8((const char *)r->src + r->off, MIN(r->srclen, r->soft_limit[0]) -