libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit a4d42053f13e8471ee3903522f964fc0a1d3161a
parent 65785f699be45dd77bdcbfc1d3aded39151f3205
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 24 Sep 2022 12:26:19 +0200

Refactor line-functions with Herodotus

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/line.c | 59 +++++++++++++++++++++++------------------------------------
1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/src/line.c b/src/line.c
@@ -18,22 +18,15 @@ get_break_prop(uint_least32_t cp)
 }

 static size_t
-next_line_break(const void *str, size_t len, size_t (*get_codepoint)
-                (const void *, size_t, size_t, uint_least32_t *))
+next_line_break(HERODOTUS_READER *r)
 {
+	HERODOTUS_READER tmp;
 	enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
 	                         last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
-	enum line_break_property res;
 	uint_least32_t cp;
 	uint_least8_t lb25_level = 0;
-	size_t off, new_off;
 	bool lb21a_flag = false, ri_even = true;

-	/* check degenerate cases */
-	if (str == NULL || len == 0) {
-		return 0;
-	}
-
 	/*
 	 * Apply line breaking algorithm (UAX #14), see
 	 * https://unicode.org/reports/tr14/#Algorithm and tailoring
@@ -47,28 +40,14 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
 	 * Initialize the different properties such that we have
 	 * a good state after the state-update in the loop
 	 */
-	cp0_prop = NUM_LINE_BREAK_PROPS;
-	if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
-		/*
-		 * A line is at least one codepoint long, so we can
-		 * safely return here
-		 */
-		return len;
-	}
-	cp1_prop = get_break_prop(cp);
 	last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
 	last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;

-	for (; off < len; off = new_off) {
-		/* update state */
-		cp0_prop = cp1_prop;
-		if ((new_off = off + get_codepoint(str, len, off, &cp)) <= len) {
-			get_codepoint(str, len, off, &cp);
-			cp1_prop = get_break_prop(cp);
-		} else {
-			/* LB3 */
-			break;
-		}
+	for (herodotus_read_codepoint(r, true, &cp), cp0_prop = get_break_prop(cp);
+	     herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;
+	     herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
+		/* get property of the right codepoint */
+		cp1_prop = get_break_prop(cp);

 		/* update retention-states */

@@ -380,14 +359,14 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
 		 * two adjacent codepoints as we have it with
 		 * characters.
 		 */
-		if (new_off < len &&
+		herodotus_reader_copy(r, &tmp);
+		herodotus_read_codepoint(&tmp, true, &cp);
+		if (herodotus_read_codepoint(&tmp, true, &cp) ==
+		    HERODOTUS_STATUS_SUCCESS &&
 		    (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
 		     cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
 		     cp1_prop == LINE_BREAK_PROP_HY)) {
-			get_codepoint(str, len, new_off, &cp);
-			res = get_break_prop(cp);
-
-			if (res == LINE_BREAK_PROP_NU) {
+			if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
 				continue;
 			}
 		}
@@ -507,17 +486,25 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
 		break;
 	}

-	return off;
+	return herodotus_reader_number_read(r);
 }

 size_t
 grapheme_next_line_break(const uint_least32_t *str, size_t len)
 {
-	return next_line_break(str, len, get_codepoint);
+	HERODOTUS_READER r;
+
+	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+	return next_line_break(&r);
 }

 size_t
 grapheme_next_line_break_utf8(const char *str, size_t len)
 {
-	return next_line_break(str, len, get_codepoint_utf8);
+	HERODOTUS_READER r;
+
+	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+	return next_line_break(&r);
 }