libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 52ee78ea80d51b163f7fc85e9387389266d2331b
parent 77e30a69ce0807fbee01d43eebedda34b54f41af
Author: Laslo Hunhold <dev@frign.de>
Date:   Fri, 26 May 2023 09:40:10 +0200

Refactor bidi and add reordering function

- Rename bidi-override enum to bidi-direction, including entries. This
  better reflects the general nature of it.
- Remove UTF-8-related bidi-functions, given it would be too complicated
  to reflect in an API and opens up some very difficult challenges.
- Rename *_preprocess to *_preprocess_paragraph and return the resolved
  paragraph embedding level as an optional out-parameter. This is the
  only way to meaningfully handle large chunks of text with paragraphs
  of different embedding levels.
- Separate the get_paragraph_level() function into two for
  isolated-paragraphs and whole paragraphs. This simplifies it a lot, as
  we don't have the crazy bool-flag-mess any more.
- Add a grapheme_bidirectional_reorder_line function that directly
  operates on preprocessed data and returns the reordered string without
  any additionally necessary buffering. For this the
  get_line_embedding_levels had to be made a bit more general to allow
  different ways of writing the levels into the output.
  This function makes use of the mirror-LUT and has a small section
  still commented out regarding the proper inversion of grapheme
  clusters that will need more investigation.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mgen/bidirectional-test.c | 44+++++++++++++++++++++++---------------------
Mgrapheme.h | 24++++++++----------------
Msrc/bidirectional.c | 432+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
3 files changed, 366 insertions(+), 134 deletions(-)

diff --git a/gen/bidirectional-test.c b/gen/bidirectional-test.c @@ -12,7 +12,7 @@ struct bidirectional_test { uint_least32_t *cp; size_t cplen; - enum grapheme_bidirectional_override mode[3]; + enum grapheme_bidirectional_direction mode[3]; size_t modelen; int_least8_t *level; int_least8_t *reorder; @@ -210,7 +210,7 @@ bidirectional_test_list_print(const struct bidirectional_test *test, printf("static const struct {\n" "\tuint_least32_t *cp;\n" "\tsize_t cplen;\n" - "\tenum grapheme_bidirectional_override *mode;\n" + "\tenum grapheme_bidirectional_direction *mode;\n" "\tsize_t modelen;\n" "\tint_least8_t *level;\n" "\tint_least8_t *reorder;\n" @@ -230,18 +230,18 @@ bidirectional_test_list_print(const struct bidirectional_test *test, printf("\t\t.cplen = %zu,\n", test[i].cplen); printf("\t\t.mode = (enum " - "grapheme_bidirectional_override[]){"); + "grapheme_bidirectional_direction[]){"); for (j = 0; j < test[i].modelen; j++) { if (test[i].mode[j] == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL) { - printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_" + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL) { + printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_" "NEUTRAL"); } else if (test[i].mode[j] == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { - printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR"); + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { + printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR"); } else if (test[i].mode[j] == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { - printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL"); + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { + printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL"); } if (j + 1 < test[i].modelen) { putchar(','); @@ -374,32 +374,32 @@ test_callback(const char *file, char **field, size_t nfields, char *comment, exit(1); } else if (field[1][0] == '2') { test[testlen - 1].mode[0] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; test[testlen - 1].modelen = 1; } else if (field[1][0] == '3') { /* auto=0 and LTR=1 */ test[testlen - 1].mode[0] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; test[testlen - 1].mode[1] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; test[testlen - 1].modelen = 2; } else if (field[1][0] == '4') { test[testlen - 1].mode[0] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; test[testlen - 1].modelen = 1; } else if (field[1][0] == '5') { test[testlen - 1].mode[0] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; test[testlen - 1].mode[1] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; test[testlen - 1].modelen = 2; } else if (field[1][0] == '7') { test[testlen - 1].mode[0] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; test[testlen - 1].mode[1] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; test[testlen - 1].mode[2] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; test[testlen - 1].modelen = 3; } else { fprintf(stderr, @@ -445,12 +445,14 @@ character_test_callback(const char *file, char **field, size_t nfields, fprintf(stderr, "malformed paragraph-level-setting.\n"); exit(1); } else if (field[1][0] == '0') { - test[testlen - 1].mode[0] = GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; + test[testlen - 1].mode[0] = + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; } else if (field[1][0] == '1') { - test[testlen - 1].mode[0] = GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; + test[testlen - 1].mode[0] = + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; } else if (field[1][0] == '2') { test[testlen - 1].mode[0] = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; } else { fprintf(stderr, "unhandled paragraph-level-setting.\n"); exit(1); diff --git a/grapheme.h b/grapheme.h @@ -8,31 +8,23 @@ #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD) -/* TODO call it simply "direction" without override */ -enum grapheme_bidirectional_override { - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL, - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR, - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL, +enum grapheme_bidirectional_direction { + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL, + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR, + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL, }; size_t grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *, size_t, int_least8_t *, size_t); -size_t grapheme_bidirectional_preprocess(const uint_least32_t *, size_t, - enum grapheme_bidirectional_override, - uint_least32_t *, size_t); -size_t -grapheme_bidirectional_preprocess_utf8(const char *, size_t, - enum grapheme_bidirectional_override, - uint_least32_t *, size_t); +size_t grapheme_bidirectional_preprocess_paragraph( + const uint_least32_t *, size_t, enum grapheme_bidirectional_direction, + uint_least32_t *, size_t, enum grapheme_bidirectional_direction *); size_t grapheme_bidirectional_reorder_line(const uint_least32_t *, - const int_least8_t *, size_t, + const uint_least32_t *, size_t, uint_least32_t *, size_t); -size_t grapheme_bidirectional_reorder_line_utf8(const char *, - const int_least8_t *, size_t, - char *, size_t); size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *); size_t grapheme_encode_utf8(uint_least32_t, char *, size_t); diff --git a/src/bidirectional.c b/src/bidirectional.c @@ -895,28 +895,17 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, } static uint_least8_t -get_paragraph_level(enum grapheme_bidirectional_override override, - bool terminate_on_pdi, const uint_least32_t *buf, - size_t buflen) +get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen) { enum bidi_property prop; int_least8_t isolate_level; - size_t bufoff; + size_t stateoff; - /* check overrides first according to rule HL1 */ - if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { - return 0; - } else if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { - return 1; - } - - /* determine paragraph level (rules P1-P3) */ + /* determine paragraph level (rules P1-P3) and terminate on PDI */ + for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++) { + prop = get_state(STATE_PROP, state[stateoff]); - for (bufoff = 0, isolate_level = 0; bufoff < buflen; bufoff++) { - prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]); - - if (prop == BIDI_PROP_PDI && isolate_level == 0 && - terminate_on_pdi) { + if (prop == BIDI_PROP_PDI && isolate_level == 0) { /* * we are in a FSI-subsection of a paragraph and * matched with the terminating PDI @@ -950,28 +939,86 @@ get_paragraph_level(enum grapheme_bidirectional_override override, return 0; } +static inline uint_least8_t +get_bidi_property(uint_least32_t cp) +{ + if (likely(cp <= 0x10FFFF)) { + return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) & + 0x1F /* 00011111 */; + } else { + return BIDI_PROP_L; + } +} + +static uint_least8_t +get_paragraph_level(enum grapheme_bidirectional_direction override, + const HERODOTUS_READER *r) +{ + HERODOTUS_READER tmp; + enum bidi_property prop; + int_least8_t isolate_level; + uint_least32_t cp; + + /* check overrides first according to rule HL1 */ + if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { + return 0; + } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { + return 1; + } + + /* copy reader into temporary reader */ + herodotus_reader_copy(r, &tmp); + + /* determine paragraph level (rules P1-P3) */ + for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) == + HERODOTUS_STATUS_SUCCESS;) { + prop = get_bidi_property(cp); + + /* BD8/BD9 */ + if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || + prop == BIDI_PROP_FSI) && + isolate_level < MAX_DEPTH) { + /* we hit an isolate initiator, increment counter */ + isolate_level++; + } else if (prop == BIDI_PROP_PDI && isolate_level > 0) { + isolate_level--; + } + + /* P2 */ + if (isolate_level > 0) { + continue; + } + + /* P3 */ + if (prop == BIDI_PROP_L) { + return 0; + } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) { + return 1; + } + } + + return 0; +} + static void -preprocess_paragraph(enum grapheme_bidirectional_override override, - uint_least32_t *buf, size_t buflen) +preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf, + size_t buflen) { enum bidi_property prop; int_least8_t level; struct { int_least8_t level; - enum grapheme_bidirectional_override override; + enum grapheme_bidirectional_direction override; bool directional_isolate; } directional_status[MAX_DEPTH + 2], *dirstat = directional_status; size_t overflow_isolate_count, overflow_embedding_count, valid_isolate_count, bufoff, i, runsince; - uint_least8_t paragraph_level; - - paragraph_level = get_paragraph_level(override, false, buf, buflen); /* X1 */ dirstat->level = (int_least8_t)paragraph_level; - dirstat->override = GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; dirstat->directional_isolate = false; overflow_isolate_count = overflow_embedding_count = valid_isolate_count = 0; @@ -995,7 +1042,7 @@ again: (dirstat - 1)->level + ((dirstat - 1)->level % 2 != 0) + 1; dirstat->override = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; dirstat->directional_isolate = false; } else { /* overflow RLE */ @@ -1014,7 +1061,7 @@ again: (dirstat - 1)->level + ((dirstat - 1)->level % 2 == 0) + 1; dirstat->override = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; dirstat->directional_isolate = false; } else { /* overflow LRE */ @@ -1033,7 +1080,7 @@ again: (dirstat - 1)->level + ((dirstat - 1)->level % 2 != 0) + 1; dirstat->override = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; dirstat->directional_isolate = false; } else { /* overflow RLO */ @@ -1052,7 +1099,7 @@ again: (dirstat - 1)->level + ((dirstat - 1)->level % 2 == 0) + 1; dirstat->override = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; dirstat->directional_isolate = false; } else { /* overflow LRO */ @@ -1063,11 +1110,11 @@ again: /* X5a */ set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); if (dirstat->override == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { set_state(STATE_PROP, BIDI_PROP_L, &(buf[bufoff])); } else if (dirstat->override == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { set_state(STATE_PROP, BIDI_PROP_R, &(buf[bufoff])); } @@ -1084,7 +1131,7 @@ again: (dirstat - 1)->level + ((dirstat - 1)->level % 2 != 0) + 1; dirstat->override = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; dirstat->directional_isolate = true; } else { /* overflow RLI */ @@ -1094,11 +1141,11 @@ again: /* X5b */ set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); if (dirstat->override == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { set_state(STATE_PROP, BIDI_PROP_L, &(buf[bufoff])); } else if (dirstat->override == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { set_state(STATE_PROP, BIDI_PROP_R, &(buf[bufoff])); } @@ -1115,7 +1162,7 @@ again: (dirstat - 1)->level + ((dirstat - 1)->level % 2 == 0) + 1; dirstat->override = - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; + GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; dirstat->directional_isolate = true; } else { /* overflow LRI */ @@ -1123,9 +1170,8 @@ again: } } else if (prop == BIDI_PROP_FSI) { /* X5c */ - if (get_paragraph_level( - GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL, - true, buf + (bufoff + 1), + if (get_isolated_paragraph_level( + buf + (bufoff + 1), buflen - (bufoff + 1)) == 1) { prop = BIDI_PROP_RLI; goto again; @@ -1138,11 +1184,11 @@ again: /* X6 */ set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); if (dirstat->override == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { set_state(STATE_PROP, BIDI_PROP_L, &(buf[bufoff])); } else if (dirstat->override == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { set_state(STATE_PROP, BIDI_PROP_R, &(buf[bufoff])); } @@ -1190,11 +1236,11 @@ again: set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); if (dirstat->override == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { + GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { set_state(STATE_PROP, BIDI_PROP_L, &(buf[bufoff])); } else if (dirstat->override == - GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { + GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { set_state(STATE_PROP, BIDI_PROP_R, &(buf[bufoff])); } @@ -1317,17 +1363,6 @@ again: } static inline uint_least8_t -get_bidi_property(uint_least32_t cp) -{ - if (likely(cp <= 0x10FFFF)) { - return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) & - 0x1F /* 00011111 */; - } else { - return BIDI_PROP_L; - } -} - -static inline uint_least8_t get_bidi_bracket_off(uint_least32_t cp) { if (likely(cp <= 0x10FFFF)) { @@ -1340,20 +1375,35 @@ get_bidi_bracket_off(uint_least32_t cp) } static size_t -preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override, - uint_least32_t *buf, size_t buflen) +preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override, + uint_least32_t *buf, size_t buflen, + enum grapheme_bidirectional_direction *resolved) { - size_t bufoff, bufsize, lastparoff; + HERODOTUS_READER tmp; + size_t bufoff, bufsize, paragraph_len; uint_least32_t cp; + uint_least8_t paragraph_level; - if (buf == NULL) { - for (; herodotus_read_codepoint(r, true, &cp) == - HERODOTUS_STATUS_SUCCESS;) { - ; + /* determine length and level of the paragraph */ + herodotus_reader_copy(r, &tmp); + for (; herodotus_read_codepoint(&tmp, true, &cp) == + HERODOTUS_STATUS_SUCCESS;) { + /* break on paragraph separator */ + if (get_bidi_property(cp) == BIDI_PROP_B) { + break; } + } + paragraph_len = herodotus_reader_number_read(&tmp); + paragraph_level = get_paragraph_level(override, r); + + if (resolved != NULL) { + /* store resolved paragraph level in output variable */ + *resolved = paragraph_level; + } + if (buf == NULL) { /* see below for return value reasoning */ - return herodotus_reader_number_read(r); + return paragraph_len; } /* @@ -1361,6 +1411,7 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override, * and store them in the buffer */ for (bufoff = 0; + bufoff < paragraph_len && herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS; bufoff++) { if (bufoff < buflen) { @@ -1385,7 +1436,7 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override, } bufsize = herodotus_reader_number_read(r); - for (bufoff = 0, lastparoff = 0; bufoff < bufsize; bufoff++) { + for (bufoff = 0; bufoff < bufsize; bufoff++) { if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B && bufoff != bufsize - 1) { continue; @@ -1398,9 +1449,8 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override, * the terminating character or last character of the * string respectively */ - preprocess_paragraph(override, buf + lastparoff, - bufoff + 1 - lastparoff); - lastparoff = bufoff + 1; + preprocess_paragraph(paragraph_level, buf, bufoff + 1); + break; } /* @@ -1411,50 +1461,41 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override, } size_t -grapheme_bidirectional_preprocess(const uint_least32_t *src, size_t srclen, - enum grapheme_bidirectional_override override, - uint_least32_t *dest, size_t destlen) +grapheme_bidirectional_preprocess_paragraph( + const uint_least32_t *src, size_t srclen, + enum grapheme_bidirectional_direction override, uint_least32_t *dest, + size_t destlen, enum grapheme_bidirectional_direction *resolved) { HERODOTUS_READER r; herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); - return preprocess(&r, override, dest, destlen); + return preprocess(&r, override, dest, destlen, resolved); } -size_t -grapheme_bidirectional_preprocess_utf8( - const char *src, size_t srclen, - enum grapheme_bidirectional_override override, uint_least32_t *dest, - size_t destlen) -{ - HERODOTUS_READER r; - - herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); - - return preprocess(&r, override, dest, destlen); -} - -size_t -grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata, - size_t linelen, - int_least8_t *lev, - size_t levlen) +static inline size_t +get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen, + int_least8_t (*get_level)(const void *, size_t), + void (*set_level)(void *, size_t, int_least8_t), + void *lev, size_t levsize, bool skipignored) { enum bidi_property prop; - size_t i, runsince; - int_least8_t level; + size_t i, levlen, runsince; + int_least8_t level, runlevel; /* rule L1.4 */ runsince = SIZE_MAX; - for (i = 0; i < linelen; i++) { + for (i = 0, levlen = 0; i < linelen; i++) { level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]); prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP, linedata[i]); /* write level into level array if we still have space */ - if (i < levlen) { - lev[i] = level; + if (level != -1 || skipignored == false) { + if (levlen <= levsize) { + set_level(lev, levlen, level); + } + levlen++; } if (level == -1) { @@ -1467,11 +1508,14 @@ grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata, prop == BIDI_PROP_PDI) { if (runsince == SIZE_MAX) { /* a new run has begun */ - runsince = i; + runsince = levlen - 1; /* levlen > 0 */ + runlevel = get_state(STATE_PARAGRAPH_LEVEL, + linedata[i]); } } else { /* sequence ended */ runsince = SIZE_MAX; + runlevel = -1; } } if (runsince != SIZE_MAX) { @@ -1479,13 +1523,207 @@ grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata, * we hit the end of the line but were in a run; * reset the line levels to the paragraph level */ - for (i = runsince; i < MIN(linelen, levlen); i++) { - if (lev[i] != -1) { - lev[i] = (int_least8_t)get_state( - STATE_PARAGRAPH_LEVEL, linedata[i]); + for (i = runsince; i < MIN(linelen, levsize); i++) { + if (get_level(lev, i) != -1) { + set_level(lev, i, runlevel); + } + } + } + + return levlen; +} + +static inline int_least8_t +get_level_int8(const void *lev, size_t off) +{ + return ((int_least8_t *)lev)[off]; +} + +static inline void +set_level_int8(void *lev, size_t off, int_least8_t value) +{ + ((int_least8_t *)lev)[off] = value; +} + +size_t +grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata, + size_t linelen, + int_least8_t *lev, + size_t levlen) +{ + return get_line_embedding_levels(linedata, linelen, get_level_int8, + set_level_int8, lev, levlen, false); +} + +static inline int_least8_t +get_level_uint32(const void *lev, size_t off) +{ + return (int_least8_t)((((uint_least32_t *)lev)[off] & + UINT32_C(0x1FE00000)) >> + 21) - + 1; +} + +static inline void +set_level_uint32(void *lev, size_t off, int_least8_t value) +{ + ((uint_least32_t *)lev)[off] ^= + ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000); + ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21; +} + +static inline int_least16_t +get_mirror_offset(uint_least32_t cp) +{ + if (cp <= UINT32_C(0x10FFFF)) { + return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)]; + } else { + return 0; + } +} + +size_t +grapheme_bidirectional_reorder_line(const uint_least32_t *line, + const uint_least32_t *linedata, + size_t linelen, uint_least32_t *output, + size_t outputsize) +{ + size_t i, outputlen, first, last, j, k, l, laststart; + int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0; + uint_least32_t tmp; + + /* write output characters */ + for (i = 0, outputlen = 0; i < linelen; i++) { + if (get_state(STATE_LEVEL, linedata[i]) != -1) { + if (outputlen < outputsize) { + output[outputlen] = line[i]; + } + outputlen++; + } + } + if (outputlen >= outputsize) { + /* clear output buffer */ + for (i = 0; i < outputsize; i++) { + output[i] = GRAPHEME_INVALID_CODEPOINT; + } + + /* return required size */ + return outputlen; + } + + /* + * write line embedding levels as metadata and codepoints into the + * output + */ + get_line_embedding_levels(linedata, linelen, get_level_uint32, + set_level_uint32, output, linelen, true); + + /* determine level range */ + for (i = 0; i < outputlen; i++) { + level = get_level_uint32(output, i); + + if (level == -1) { + /* ignored character */ + continue; + } + + if (level % 2 == 1 && level < min_odd_level) { + min_odd_level = level; + } + if (level > max_level) { + max_level = level; + } + } + + for (level = max_level; level >= min_odd_level /* > 0 */; level--) { + for (i = 0; i < outputlen; i++) { + if (get_level_uint32(output, i) >= level) { + /* + * the current character has the desired level + */ + first = last = i; + + /* find the end of the level-sequence */ + for (i++; i < outputlen; i++) { + if (get_level_uint32(output, i) >= + level) { + /* the sequence continues */ + last = i; + } else { + break; + } + } + + /* invert the sequence first..last respecting + * grapheme clusters + * + * The standard only speaks of combining marks + * inversion, but we should in the perfect case + * respect _all_ grapheme clusters, which we do + * here! + */ + + /* mark grapheme cluster breaks */ + for (j = first; j <= last; + j += grapheme_next_character_break( + line + j, outputlen - j)) { + /* + * we use a special trick here: The + * first 21 bits of the state are filled + * with the codepoint, the next 8 bits + * are used for the level, so we can use + * the 30th bit to mark the grapheme + * cluster breaks. This allows us to + * reinvert the grapheme clusters into + * the proper direction later. + */ + output[j] |= UINT32_C(1) << 29; + } + + /* global inversion */ + for (k = first, l = last; k < l; k++, l--) { + /* swap */ + tmp = output[k]; + output[k] = output[l]; + output[l] = tmp; + } + + /* grapheme cluster reinversion */ +#if 0 + for (j = first, laststart = first; j <= last; + j++) { + if (output[j] & (UINT32_C(1) << 29)) { + /* we hit a mark! given the + * grapheme cluster is inverted, + * this means that the cluster + * ended and we now reinvert it + * again + */ + for (k = laststart, l = j; + k < l; k++, l--) { + /* swap */ + tmp = output[k]; + output[k] = output[l]; + output[l] = tmp; + } + laststart = j + 1; + } + } +#endif + + /* unmark grapheme cluster breaks */ + for (j = first; j <= last; j++) { + output[j] ^= output[j] & + UINT32_C(0x20000000); + } } } } - return linelen; + /* remove embedding level metadata */ + for (i = 0; i < outputlen; i++) { + output[i] ^= output[i] & UINT32_C(0x1FE00000); + } + + return outputlen; }