libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit a5b1b0c0c7bc1576b5893175b27585fa963f4433
parent 52b0e29e02068d6a8123042ef901f73e37b2f38f
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun,  2 Oct 2022 22:05:11 +0200

Refactor sentence-functions with Proper (using Herodotus in the background)

This refactor was a breeze and it passed all conformance tests on the
first try. This, just like with the word-functions, leads to a massive
simplification and separation of concerns in the code. And as with the
word functions, this fixes some known quirks.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Msrc/sentence.c | 426++++++++++++++++++++++++++++++++++---------------------------------------------
1 file changed, 181 insertions(+), 245 deletions(-)

diff --git a/src/sentence.c b/src/sentence.c @@ -6,11 +6,17 @@ #include "../grapheme.h" #include "util.h" -static inline enum sentence_break_property -get_break_prop(uint_least32_t cp) +struct sentence_break_state +{ + uint_least8_t aterm_close_sp_level; + uint_least8_t saterm_close_sp_parasep_level; +}; + +static inline uint_least8_t +get_sentence_break_prop(uint_least32_t cp) { if (likely(cp <= 0x10FFFF)) { - return (enum sentence_break_property) + return (uint_least8_t) sentence_break_minor[sentence_break_major[cp >> 8] + (cp & 0xff)]; } else { @@ -18,243 +24,157 @@ get_break_prop(uint_least32_t cp) } } -static size_t -next_sentence_break(const void *str, size_t len, size_t (*get_codepoint) - (const void *, size_t, size_t, uint_least32_t *)) +static bool +is_skippable_sentence_prop(uint_least8_t prop) { - struct { - enum sentence_break_property a, b, c, d; - } raw, skip; - enum sentence_break_property res; - uint_least32_t cp; - uint_least8_t aterm_close_sp_level = 0, - saterm_close_sp_parasep_level = 0; - size_t off, tmp, new_off; + return prop == SENTENCE_BREAK_PROP_EXTEND || + prop == SENTENCE_BREAK_PROP_FORMAT; +} - /* check degenerate cases */ - if (str == NULL || len == 0) { - return 0; - } +static void +sentence_skip_shift_callback(uint_least8_t prop, void *s) +{ + struct sentence_break_state *state = (struct sentence_break_state *)s; /* - * Apply sentence breaking algorithm (UAX #29), see - * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules + * Here comes a bit of magic. The rules + * SB8, SB8a, SB9 and SB10 have very complicated + * left-hand-side-rules of the form * - * There are 4 slots (a, b, c, d) of "break" properties and - * we check if there is a break in the middle between b and c. + * ATerm Close* Sp* + * SATerm Close* + * SATerm Close* Sp* + * SATerm Close* Sp* ParaSep? * - * The position of this middle spot is determined by off, - * which gives the offset of the first element on the right - * hand side of said spot, or, in other words, gives the number - * of elements on the left hand side. + * but instead of backtracking, we keep the + * state as some kind of "power level" in + * two state-variables * - * It is further complicated by the fact that the algorithm - * expects you to skip certain characters for the second - * half of the rules (after SB5). Thus, we do not only have - * the "raw" properties as described above, but also the "skip" - * properties, where the skip.a and skip.b, for instance, - * give the two preceding character properties behind the - * currently investigated breakpoint. + * aterm_close_sp_level + * saterm_close_sp_parasep_level + * + * that go from 0 to 3/4: + * + * 0: we are not in the sequence + * 1: we have one ATerm/SATerm to the left of + * the middle spot + * 2: we have one ATerm/SATerm and one or more + * Close to the left of the middle spot + * 3: we have one ATerm/SATerm, zero or more + * Close and one or more Sp to the left of + * the middle spot. + * 4: we have one SATerm, zero or more Close, + * zero or more Sp and one ParaSep to the + * left of the middle spot. * */ - - /* - * Initialize the different properties such that we have - * a good state after the state-update in the loop - */ - raw.b = NUM_SENTENCE_BREAK_PROPS; - if ((off = get_codepoint(str, len, 0, &cp)) >= len) { - /* - * A line is at least one codepoint long, so we can - * safely return here - */ - return len; + if ((state->aterm_close_sp_level == 0 || + state->aterm_close_sp_level == 1) && + prop == SENTENCE_BREAK_PROP_ATERM) { + /* sequence has begun */ + state->aterm_close_sp_level = 1; + } else if ((state->aterm_close_sp_level == 1 || + state->aterm_close_sp_level == 2) && + prop == SENTENCE_BREAK_PROP_CLOSE) { + /* close-sequence begins or continued */ + state->aterm_close_sp_level = 2; + } else if ((state->aterm_close_sp_level == 1 || + state->aterm_close_sp_level == 2 || + state->aterm_close_sp_level == 3) && + prop == SENTENCE_BREAK_PROP_SP) { + /* sp-sequence begins or continued */ + state->aterm_close_sp_level = 3; + } else { + /* sequence broke */ + state->aterm_close_sp_level = 0; } - raw.c = get_break_prop(cp); - (void)get_codepoint(str, len, off, &cp); - raw.d = get_break_prop(cp); - skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS; - - for (; off < len; off = new_off) { - /* - * Update left side (a and b) of the skip state by - * "shifting in" the raw.c property as long as it is - * not one of the "ignored" character properties. - * While at it, update the RI-counter. - * - */ - if (raw.c != SENTENCE_BREAK_PROP_EXTEND && - raw.c != SENTENCE_BREAK_PROP_FORMAT) { - skip.a = skip.b; - skip.b = raw.c; - - /* - * Here comes a bit of magic. The rules - * SB8, SB8a, SB9 and SB10 have very complicated - * left-hand-side-rules of the form - * - * ATerm Close* Sp* - * SATerm Close* - * SATerm Close* Sp* - * SATerm Close* Sp* ParaSep? - * - * but instead of backtracking, we keep the - * state as some kind of "power level" in - * two variables - * - * aterm_close_sp_level - * saterm_close_sp_parasep_level - * - * that go from 0 to 3/4: - * - * 0: we are not in the sequence - * 1: we have one ATerm/SATerm to the left of - * the middle spot - * 2: we have one ATerm/SATerm and one or more - * Close to the left of the middle spot - * 3: we have one ATerm/SATerm, zero or more - * Close and one or more Sp to the left of - * the middle spot. - * 4: we have one SATerm, zero or more Close, - * zero or more Sp and one ParaSep to the - * left of the middle spot. - * - */ - if ((aterm_close_sp_level == 0 || - aterm_close_sp_level == 1) && - skip.b == SENTENCE_BREAK_PROP_ATERM) { - /* sequence has begun */ - aterm_close_sp_level = 1; - } else if ((aterm_close_sp_level == 1 || - aterm_close_sp_level == 2) && - skip.b == SENTENCE_BREAK_PROP_CLOSE) { - /* close-sequence begins or continued */ - aterm_close_sp_level = 2; - } else if ((aterm_close_sp_level == 1 || - aterm_close_sp_level == 2 || - aterm_close_sp_level == 3) && - skip.b == SENTENCE_BREAK_PROP_SP) { - /* sp-sequence begins or continued */ - aterm_close_sp_level = 3; - } else { - /* sequence broke */ - aterm_close_sp_level = 0; - } - if ((saterm_close_sp_parasep_level == 0 || - saterm_close_sp_parasep_level == 1) && - (skip.b == SENTENCE_BREAK_PROP_STERM || - skip.b == SENTENCE_BREAK_PROP_ATERM)) { - /* sequence has begun */ - saterm_close_sp_parasep_level = 1; - } else if ((saterm_close_sp_parasep_level == 1 || - saterm_close_sp_parasep_level == 2) && - skip.b == SENTENCE_BREAK_PROP_CLOSE) { - /* close-sequence begins or continued */ - saterm_close_sp_parasep_level = 2; - } else if ((saterm_close_sp_parasep_level == 1 || - saterm_close_sp_parasep_level == 2 || - saterm_close_sp_parasep_level == 3) && - skip.b == SENTENCE_BREAK_PROP_SP) { - /* sp-sequence begins or continued */ - saterm_close_sp_parasep_level = 3; - } else if ((saterm_close_sp_parasep_level == 1 || - saterm_close_sp_parasep_level == 2 || - saterm_close_sp_parasep_level == 3) && - (skip.b == SENTENCE_BREAK_PROP_SEP || - skip.b == SENTENCE_BREAK_PROP_CR || - skip.b == SENTENCE_BREAK_PROP_LF)) { - /* ParaSep at the end of the sequence */ - saterm_close_sp_parasep_level = 4; - } else { - /* sequence broke */ - saterm_close_sp_parasep_level = 0; - } - } - - /* - * Update right side (b and c) of the skip state by - * starting at the breakpoint and detecting the two - * following non-ignored character classes - * - */ - skip.c = NUM_SENTENCE_BREAK_PROPS; - for (tmp = off; tmp < len; ) { - tmp += get_codepoint(str, len, tmp, &cp); - res = get_break_prop(cp); - - if (res != SENTENCE_BREAK_PROP_EXTEND && - res != SENTENCE_BREAK_PROP_FORMAT) { - skip.c = res; - break; - } - } - skip.d = NUM_SENTENCE_BREAK_PROPS; - for (; tmp < len; ) { - tmp += get_codepoint(str, len, tmp, &cp); - res = get_break_prop(cp); + if ((state->saterm_close_sp_parasep_level == 0 || + state->saterm_close_sp_parasep_level == 1) && + (prop == SENTENCE_BREAK_PROP_STERM || + prop == SENTENCE_BREAK_PROP_ATERM)) { + /* sequence has begun */ + state->saterm_close_sp_parasep_level = 1; + } else if ((state->saterm_close_sp_parasep_level == 1 || + state->saterm_close_sp_parasep_level == 2) && + prop == SENTENCE_BREAK_PROP_CLOSE) { + /* close-sequence begins or continued */ + state->saterm_close_sp_parasep_level = 2; + } else if ((state->saterm_close_sp_parasep_level == 1 || + state->saterm_close_sp_parasep_level == 2 || + state->saterm_close_sp_parasep_level == 3) && + prop == SENTENCE_BREAK_PROP_SP) { + /* sp-sequence begins or continued */ + state->saterm_close_sp_parasep_level = 3; + } else if ((state->saterm_close_sp_parasep_level == 1 || + state->saterm_close_sp_parasep_level == 2 || + state->saterm_close_sp_parasep_level == 3) && + (prop == SENTENCE_BREAK_PROP_SEP || + prop == SENTENCE_BREAK_PROP_CR || + prop == SENTENCE_BREAK_PROP_LF)) { + /* ParaSep at the end of the sequence */ + state->saterm_close_sp_parasep_level = 4; + } else { + /* sequence broke */ + state->saterm_close_sp_parasep_level = 0; + } +} - if (res != SENTENCE_BREAK_PROP_EXTEND && - res != SENTENCE_BREAK_PROP_FORMAT) { - skip.d = res; - break; - } - } +static size_t +next_sentence_break(HERODOTUS_READER *r) +{ + HERODOTUS_READER tmp; + enum sentence_break_property prop; + struct proper p; + struct sentence_break_state state = { 0 }; + uint_least32_t cp; - /* - * Update the raw state by simply shifting everything - * in and, if we still have data left, determining - * the character class of the next codepoint. - * - */ - raw.a = raw.b; - raw.b = raw.c; - raw.c = raw.d; - if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) { - get_codepoint(str, len, new_off, &cp); - raw.d = get_break_prop(cp); - } else { - raw.d = NUM_SENTENCE_BREAK_PROPS; - } + /* + * Apply sentence breaking algorithm (UAX #29), see + * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules + */ + proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS, + get_sentence_break_prop, is_skippable_sentence_prop, + sentence_skip_shift_callback, &p); + while (!proper_advance(&p)) { /* SB3 */ - if (raw.b == SENTENCE_BREAK_PROP_CR && - raw.c == SENTENCE_BREAK_PROP_LF) { + if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR && + p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) { continue; } /* SB4 */ - if (raw.b == SENTENCE_BREAK_PROP_SEP || - raw.b == SENTENCE_BREAK_PROP_CR || - raw.b == SENTENCE_BREAK_PROP_LF) { + if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP || + p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR || + p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) { break; } /* SB5 */ - if (raw.c == SENTENCE_BREAK_PROP_EXTEND || - raw.c == SENTENCE_BREAK_PROP_FORMAT) { + if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND || + p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) { continue; } /* SB6 */ - if (skip.b == SENTENCE_BREAK_PROP_ATERM && - skip.c == SENTENCE_BREAK_PROP_NUMERIC) { + if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) { continue; } /* SB7 */ - if (off > 1 && - (skip.a == SENTENCE_BREAK_PROP_UPPER || - skip.a == SENTENCE_BREAK_PROP_LOWER) && - skip.b == SENTENCE_BREAK_PROP_ATERM && - skip.c == SENTENCE_BREAK_PROP_UPPER) { + if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER || + p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) && + p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) { continue; } /* SB8 */ - if (aterm_close_sp_level == 1 || - aterm_close_sp_level == 2 || - aterm_close_sp_level == 3) { + if (state.aterm_close_sp_level == 1 || + state.aterm_close_sp_level == 2 || + state.aterm_close_sp_level == 3) { /* * This is the most complicated rule, requiring * the right-hand-side to satisfy the regular expression @@ -262,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint) * ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower * * which we simply check "manually" given LUT-lookups - * are very cheap. + * are very cheap by starting at the mid_reader. * */ - for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < len; ) { - tmp += get_codepoint(str, len, tmp, &cp); - res = get_break_prop(cp); + herodotus_reader_copy(&(p.mid_reader), &tmp); + + prop = NUM_SENTENCE_BREAK_PROPS; + while (herodotus_read_codepoint(&tmp, true, &cp) == + HERODOTUS_STATUS_SUCCESS) { + prop = get_sentence_break_prop(cp); - if (res == SENTENCE_BREAK_PROP_OLETTER || - res == SENTENCE_BREAK_PROP_UPPER || - res == SENTENCE_BREAK_PROP_LOWER || - res == SENTENCE_BREAK_PROP_SEP || - res == SENTENCE_BREAK_PROP_CR || - res == SENTENCE_BREAK_PROP_LF || - res == SENTENCE_BREAK_PROP_STERM || - res == SENTENCE_BREAK_PROP_ATERM) { + /* + * the skippable properties are ignored + * automatically here given they do not + * match the following condition + */ + if (prop == SENTENCE_BREAK_PROP_OLETTER || + prop == SENTENCE_BREAK_PROP_UPPER || + prop == SENTENCE_BREAK_PROP_LOWER || + prop == SENTENCE_BREAK_PROP_SEP || + prop == SENTENCE_BREAK_PROP_CR || + prop == SENTENCE_BREAK_PROP_LF || + prop == SENTENCE_BREAK_PROP_STERM || + prop == SENTENCE_BREAK_PROP_ATERM) { break; } } - if (res == SENTENCE_BREAK_PROP_LOWER) { + if (prop == SENTENCE_BREAK_PROP_LOWER) { continue; } } /* SB8a */ - if ((saterm_close_sp_parasep_level == 1 || - saterm_close_sp_parasep_level == 2 || - saterm_close_sp_parasep_level == 3) && - (skip.c == SENTENCE_BREAK_PROP_SCONTINUE || - skip.c == SENTENCE_BREAK_PROP_STERM || - skip.c == SENTENCE_BREAK_PROP_ATERM)) { + if ((state.saterm_close_sp_parasep_level == 1 || + state.saterm_close_sp_parasep_level == 2 || + state.saterm_close_sp_parasep_level == 3) && + (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) { continue; } /* SB9 */ - if ((saterm_close_sp_parasep_level == 1 || - saterm_close_sp_parasep_level == 2) && - (skip.c == SENTENCE_BREAK_PROP_CLOSE || - skip.c == SENTENCE_BREAK_PROP_SP || - skip.c == SENTENCE_BREAK_PROP_SEP || - skip.c == SENTENCE_BREAK_PROP_CR || - skip.c == SENTENCE_BREAK_PROP_LF)) { + if ((state.saterm_close_sp_parasep_level == 1 || + state.saterm_close_sp_parasep_level == 2) && + (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { continue; } /* SB10 */ - if ((saterm_close_sp_parasep_level == 1 || - saterm_close_sp_parasep_level == 2 || - saterm_close_sp_parasep_level == 3) && - (skip.c == SENTENCE_BREAK_PROP_SP || - skip.c == SENTENCE_BREAK_PROP_SEP || - skip.c == SENTENCE_BREAK_PROP_CR || - skip.c == SENTENCE_BREAK_PROP_LF)) { + if ((state.saterm_close_sp_parasep_level == 1 || + state.saterm_close_sp_parasep_level == 2 || + state.saterm_close_sp_parasep_level == 3) && + (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || + p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { continue; } /* SB11 */ - if (saterm_close_sp_parasep_level == 1 || - saterm_close_sp_parasep_level == 2 || - saterm_close_sp_parasep_level == 3 || - saterm_close_sp_parasep_level == 4) { + if (state.saterm_close_sp_parasep_level == 1 || + state.saterm_close_sp_parasep_level == 2 || + state.saterm_close_sp_parasep_level == 3 || + state.saterm_close_sp_parasep_level == 4) { break; } @@ -330,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint) continue; } - return off; + return herodotus_reader_number_read(&(p.mid_reader)); } size_t grapheme_next_sentence_break(const uint_least32_t *str, size_t len) { - return next_sentence_break(str, len, get_codepoint); + HERODOTUS_READER r; + + herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); + + return next_sentence_break(&r); } size_t grapheme_next_sentence_break_utf8(const char *str, size_t len) { - return next_sentence_break(str, len, get_codepoint_utf8); + HERODOTUS_READER r; + + herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); + + return next_sentence_break(&r); }