libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 07ba2622e073850bbdd6acd8dff88b391cc5ad5c
parent aafe6c300e59ed1b4407c71917fb2034fdc7798a
Author: Laslo Hunhold <dev@frign.de>
Date:   Mon, 21 Nov 2022 08:53:14 +0100

Split bidi-level-processing into preprocessing and line step

The bidirectional algorithm is a bit convoluted in this regard,
but the canonical choice for the implementation is to do
preprocessing on all paragraphs first (applying all rules up to
L1.3) and to apply rule L1.4 separately.

The reason for this is that rule L1.4 requires knowledge of the
line break positions, which we don't have (yet). We could take
them as a parameter for the preprocessing function; however,
line breaks may change often (think of an ncurses context with
window resizes), making constant complete reprocessing very
wasteful.

Thus, the line-specific processing is put into a separate
function. This way, the user passes each individual line together
with its preprocessing data.

Rule L1.4 will be implemented in a later commit.

Diffstat:
M grapheme.h | 18 ++++++++++++++----
M src/bidirectional.c | 64 ++++++++++++++++++++++++++++++++++++++--------------------------
M test/bidirectional.c | 15 +++++++++------
3 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/grapheme.h b/grapheme.h @@ -15,16 +15,26 @@ enum grapheme_bidirectional_override { GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL, }; -size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *); -size_t grapheme_encode_utf8(uint_least32_t, char *, size_t); +void grapheme_bidirectional_get_line_embedding_levels( + const int_least32_t *, size_t, int_least8_t *); -size_t grapheme_get_bidirectional_embedding_levels( +size_t grapheme_bidirectional_preprocess( const uint_least32_t *, size_t, enum grapheme_bidirectional_override, int_least32_t *, size_t); -size_t grapheme_get_bidirectional_embedding_levels_utf8( +size_t grapheme_bidirectional_preprocess_utf8( const char *, size_t, enum grapheme_bidirectional_override, int_least32_t *, size_t); +size_t grapheme_bidirectional_reorder_line( + const uint_least32_t *, const int_least8_t *, size_t, + uint_least32_t *, size_t); +size_t grapheme_bidirectional_reorder_line_utf8( + const char *, const int_least8_t *, size_t, + char *, size_t); + +size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *); +size_t grapheme_encode_utf8(uint_least32_t, char *, size_t); + bool grapheme_is_character_break(uint_least32_t, uint_least32_t, uint_least16_t *); diff --git a/src/bidirectional.c b/src/bidirectional.c @@ -385,8 +385,8 @@ ir_advance(struct isolate_runner *ir) } static size_t -process_isolating_run_sequence(int_least32_t *buf, size_t buflen, size_t off, - uint_least8_t paragraph_level) +preprocess_isolating_run_sequence(int_least32_t *buf, size_t buflen, size_t off, + uint_least8_t paragraph_level) { enum bidi_property sequence_prop, prop; struct isolate_runner ir, tmp; @@ -652,8 +652,8 @@ get_paragraph_level(enum grapheme_bidirectional_override override, } static void -get_paragraph_embedding_levels(enum grapheme_bidirectional_override override, - int_least32_t *buf, size_t buflen) +preprocess_paragraph(enum grapheme_bidirectional_override override, + int_least32_t *buf, size_t buflen) { enum bidi_property 
prop; int_least8_t level; @@ -920,7 +920,7 @@ again: for (bufoff = 0; bufoff < buflen; bufoff++) { if (get_state(STATE_VISITED, buf[bufoff]) == 0 && get_state(STATE_LEVEL, buf[bufoff]) != -1) { - bufoff += process_isolating_run_sequence( + bufoff += preprocess_isolating_run_sequence( buf, buflen, bufoff, paragraph_level); } } @@ -964,6 +964,12 @@ again: continue; } + /* rules 1 and 2 */ + if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) { + set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff])); + } + + /* rule 3 */ if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) { @@ -971,8 +977,12 @@ again: /* a new run has begun */ runsince = bufoff; } - } else if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) { - /* L1.4 -- ignored for now, < beachten! */ + } else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) && + runsince != SIZE_MAX) { + /* + * we hit a segment or paragraph separator in a + * sequence, reset sequence-levels + */ for (i = runsince; i < bufoff; i++) { if (get_state(STATE_LEVEL, buf[i]) != -1) { set_state(STATE_LEVEL, paragraph_level, @@ -984,11 +994,6 @@ again: /* sequence ended */ runsince = SIZE_MAX; } - - if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) { - set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff])); - } - continue; } if (runsince != SIZE_MAX) { /* @@ -1027,9 +1032,9 @@ get_bidi_bracket_off(uint_least32_t cp) } static size_t -get_embedding_levels(HERODOTUS_READER *r, - enum grapheme_bidirectional_override override, - int_least32_t *buf, size_t buflen) +preprocess(HERODOTUS_READER *r, + enum grapheme_bidirectional_override override, + int_least32_t *buf, size_t buflen) { size_t bufoff, bufsize, lastparoff; uint_least32_t cp; @@ -1086,16 +1091,11 @@ get_embedding_levels(HERODOTUS_READER *r, * the terminating character or last character of the * string respectively */ - get_paragraph_embedding_levels(override, buf + lastparoff, - bufoff + 1 - lastparoff); + 
preprocess_paragraph(override, buf + lastparoff, + bufoff + 1 - lastparoff); lastparoff = bufoff + 1; } - /* bake the levels into the buffer, discarding the metadata */ - for (bufoff = 0; bufoff < bufsize; bufoff++) { - buf[bufoff] = get_state(STATE_LEVEL, buf[bufoff]); - } - /* * we return the number of total bytes read, as the function * should indicate if the given level-buffer is too small @@ -1104,7 +1104,7 @@ get_embedding_levels(HERODOTUS_READER *r, } size_t -grapheme_get_bidirectional_embedding_levels( +grapheme_bidirectional_preprocess( const uint_least32_t *src, size_t srclen, enum grapheme_bidirectional_override override, int_least32_t *dest, size_t destlen) @@ -1113,11 +1113,11 @@ grapheme_get_bidirectional_embedding_levels( herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); - return get_embedding_levels(&r, override, dest, destlen); + return preprocess(&r, override, dest, destlen); } size_t -grapheme_get_bidirectional_embedding_levels_utf8( +grapheme_bidirectional_preprocess_utf8( const char *src, size_t srclen, enum grapheme_bidirectional_override override, int_least32_t *dest, size_t destlen) @@ -1126,5 +1126,17 @@ grapheme_get_bidirectional_embedding_levels_utf8( herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); - return get_embedding_levels(&r, override, dest, destlen); + return preprocess(&r, override, dest, destlen); +} + +void +grapheme_bidirectional_get_line_embedding_levels( + const int_least32_t *linedata, size_t linelen, int_least8_t *linelevel) +{ + size_t i; + + /* write the levels into the level-array */ + for (i = 0; i < linelen; i++) { + linelevel[i] = get_state(STATE_LEVEL, linedata[i]); + } } diff --git a/test/bidirectional.c b/test/bidirectional.c @@ -12,10 +12,11 @@ int main(int argc, char *argv[]) { - int_least32_t lev[512]; /* TODO iterate and get max, allocate */ - size_t i, num_tests, failed, levlen, ret, j, m; + int_least32_t data[512]; /* TODO iterate and get max, allocate */ + int_least8_t lev[512]; 
+ size_t i, num_tests, failed, datalen, ret, j, m; - levlen = LEN(lev); + datalen = LEN(data); (void)argc; @@ -28,13 +29,15 @@ main(int argc, char *argv[]) continue;*/ for (m = 0; m < bidirectional_test[i].modelen; m++) { - ret = grapheme_get_bidirectional_embedding_levels( + ret = grapheme_bidirectional_preprocess( bidirectional_test[i].cp, bidirectional_test[i].cplen, - bidirectional_test[i].mode[m], lev, levlen); + bidirectional_test[i].mode[m], data, datalen); + grapheme_bidirectional_get_line_embedding_levels( + data, datalen, lev); if (ret != bidirectional_test[i].cplen || - ret > levlen) { + ret > datalen) { goto err; }