libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit bbbc72cba69445535dd035dfe1ee10d473655629
parent b9e1d4bbd4ce6a539999560c1cc863b645a080cd
Author: Laslo Hunhold <dev@frign.de>
Date:   Tue, 29 Nov 2022 23:23:53 +0100

Implement bidirectional bracket support

The single rule N0 in the Unicode Bidirectional Algorithm may not
sound like much, but it packs quite a punch and required some deep
work.

It wasn't exactly made simpler by the fact that the document is very
convoluted and not easy to follow. However, it helps to have experience
from the other algorithms and the automatic tests allow very broad
confirmation of proper function.

In particular, the following changes needed to be made: The generator
had to be modified to

	- Implement a decompositon to match canonically equivalent
	  brackets. This requires us to have UnicodeData.txt present,
	  but what matters is that the end result is fast and small.
	- The LUT-printing automatically detects type, because it's just
	  too fragile otherwise.

The implementation of the algorithm itself had the following changes:

	- The last strong type property of an isolate runner has been
	  refactored to be stateless. Otherwise, you can end up with
	  subtle bugs where strong types are added beforehand, yielding
	  a TOCTOU-problem.
	- The bracket parsing makes use of a novel FIFO structure that
	  combines the best of both worlds between a stack and naive
	  implementation.

As an end result, we now pass all ~900k bidi tests from the Unicode
standard.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
MMakefile | 2+-
Mgen/bidirectional-test.c | 4++--
Mgen/bidirectional.c | 209+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Mgen/case.c | 18+++++++++---------
Mgen/util.c | 44+++++++++++++++++++++++++++++++++++++-------
Mgen/util.h | 4++--
Msrc/bidirectional.c | 410++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mtest/bidirectional.c | 3---
8 files changed, 628 insertions(+), 66 deletions(-)

diff --git a/Makefile b/Makefile @@ -233,7 +233,7 @@ test/utf8-encode: test/utf8-encode.o test/util.o libgrapheme.a test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a test/word: test/word.o test/util.o libgrapheme.a -gen/bidirectional.h: data/BidiBrackets.txt data/BidiMirroring.txt data/DerivedBidiClass.txt gen/bidirectional +gen/bidirectional.h: data/BidiBrackets.txt data/BidiMirroring.txt data/DerivedBidiClass.txt data/UnicodeData.txt gen/bidirectional gen/bidirectional-test.h: data/BidiCharacterTest.txt data/BidiTest.txt gen/bidirectional-test gen/case.h: data/DerivedCoreProperties.txt data/UnicodeData.txt data/SpecialCasing.txt gen/case gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character diff --git a/gen/bidirectional-test.c b/gen/bidirectional-test.c @@ -471,8 +471,8 @@ main(int argc, char *argv[]) (void)argc; parse_file_with_callback("data/BidiTest.txt", test_callback, NULL); - /*parse_file_with_callback("data/BidiCharacterTest.txt", - character_test_callback, NULL);*/ + parse_file_with_callback("data/BidiCharacterTest.txt", + character_test_callback, NULL); bidirectional_test_list_print(test, testlen, "bidirectional_test", argv[0]); diff --git a/gen/bidirectional.c b/gen/bidirectional.c @@ -1,5 +1,6 @@ /* See LICENSE file for copyright and license details. */ #include <errno.h> +#include <inttypes.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> @@ -10,6 +11,9 @@ #define FILE_BIDI_BRACKETS "data/BidiBrackets.txt" #define FILE_BIDI_CLASS "data/DerivedBidiClass.txt" #define FILE_BIDI_MIRRORING "data/BidiMirroring.txt" +#define FILE_UNICODE_DATA "data/UnicodeData.txt" + +#define NUM_BRACKET_ALIASES 20 static const struct property_spec bidi_property[] = { { @@ -130,36 +134,189 @@ static const struct property_spec bidi_property[] = { }, }; +struct decomposition_payload { + uint_least32_t cp; + uint_least32_t decomposition; +}; + +static int +decomposition_callback(const char *file, char **field, size_t nfields, + char *comment, void *payload) +{ + char *p; + struct decomposition_payload *decomp = + (struct decomposition_payload *)payload; + uint_least32_t cp; + + (void)file; + (void)comment; + + if (nfields < 6) { + /* we have fewer than 6 fields, discard the line */ + return 0; + } + + hextocp(field[0], strlen(field[0]), &cp); + + if (decomp->cp == cp) { + /* we hit the line that contains our decomposition target */ + if (strlen(field[5]) > 0) { + p = field[5]; + if (*p == '<') { + /* + * the decomposition contains some metadata + * <...> we skip + */ + for (; *p != '\0'; p++) { + if (*p == '>') { + p++; + while (*p == ' ') { + p++; + } + break; + } + } + } + hextocp(p, strlen(p), &(decomp->decomposition)); + } else { + decomp->decomposition = decomp->cp; + } + } + + return 0; +} + static struct { - uint_least32_t cp_base; - uint_least32_t cp_pair; + uint_least32_t base[NUM_BRACKET_ALIASES]; + size_t baselen; + uint_least32_t pair[NUM_BRACKET_ALIASES]; + size_t pairlen; + uint_least8_t class; char type; } *b = NULL; static size_t blen; +static uint_least8_t bracket_class_count = 1; static int bracket_callback(const char *file, char **field, size_t nfields, char *comment, void *payload) { + size_t i, j; + struct decomposition_payload decomp_base, decomp_pair; + uint_least32_t cp_base, cp_pair; + (void)file; (void)comment; (void)payload; if (nfields < 3) { - /* we have less than 3 fields, discard the line */ + /* we have fewer than 3 fields, discard the line */ return 0; } - /* extend bracket pair array */ + /* parse field data */ + hextocp(field[0], strlen(field[0]), &cp_base); + hextocp(field[1], strlen(field[1]), &cp_pair); + + /* determine decomposition of the base and pair codepoints */ + decomp_base.cp = cp_base; + decomp_pair.cp = cp_pair; + parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback, + &decomp_base); + parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback, + &decomp_pair); + + /* + * check if we already have the canonical form in the bracket array, + * per convention the canonical form is the first element of the alias + * array + */ + for (i = 0; i < blen; i++) { + if (decomp_base.decomposition == b[i].base[0]) { + /* we have a match, check type */ + if (strlen(field[2]) != 1 || + (field[2][0] != 'o' && field[2][0] != 'c')) { + /* malformed line */ + return 1; + } else if (b[i].type != field[2][0]) { + /* mismatching types */ + return 1; + } + + /* + * add our base alias to the base array unless it isn't + * already in it + */ + for (j = 0; j < b[i].baselen; j++) { + if (cp_base == b[i].base[j]) { + /* already in array, do nothing */ + break; + } + } + if (j == b[i].baselen) { + /* + * the base alias is not already in the array, + * add it + */ + if (b[i].baselen == NUM_BRACKET_ALIASES) { + fprintf(stderr, "too many aliases\n"); + return 1; + } + b[i].baselen++; + b[i].base[b[i].baselen - 1] = cp_base; + } + + /* + * also add our pair alias to the pair array unless + * it isn't already in it + */ + for (j = 0; j < b[i].pairlen; j++) { + if (cp_pair == b[i].pair[j]) { + /* already in array, do nothing */ + break; + } + } + if (j == b[i].pairlen) { + /* + * the pair alias is not already in the array, + * add it + */ + if (b[i].pairlen == NUM_BRACKET_ALIASES) { + fprintf(stderr, "too many aliases\n"); + return 1; + } + b[i].pairlen++; + b[i].pair[b[i].pairlen - 1] = cp_pair; + } + + return 0; + } + } + + /* extend bracket pair array, as this is a new bracket type */ if (!(b = realloc(b, (++blen) * sizeof(*b)))) { fprintf(stderr, "realloc: %s\n", strerror(errno)); exit(1); } - /* parse field data */ - hextocp(field[0], strlen(field[0]), &(b[blen - 1].cp_base)); - hextocp(field[1], strlen(field[1]), &(b[blen - 1].cp_pair)); + /* fill field data by adding the canonical form first */ + b[blen - 1].base[0] = decomp_base.decomposition; + b[blen - 1].baselen = 1; + b[blen - 1].pair[0] = decomp_pair.decomposition; + b[blen - 1].pairlen = 1; + + /* add alias if it differs from the canonical form */ + if (cp_base != decomp_base.decomposition) { + b[blen - 1].base[1] = cp_base; + b[blen - 1].baselen = 2; + } + if (cp_pair != decomp_pair.decomposition) { + b[blen - 1].pair[1] = cp_pair; + b[blen - 1].pairlen = 2; + } + + /* add bracket type */ if (strlen(field[2]) != 1 || (field[2][0] != 'o' && field[2][0] != 'c')) { /* malformed line */ @@ -168,13 +325,32 @@ bracket_callback(const char *file, char **field, size_t nfields, char *comment, b[blen - 1].type = field[2][0]; } + /* + * determine bracket class by iterating over the bracket-array + * and seeing if our current canonical cp already has a matching pair. + * We only need to check the first entry in each bracket alias + * list, as this is, per convention, the canonical form. + * If not, add a new class. + */ + for (i = 0; i + 1 < blen; i++) { + if (b[i].pair[0] == b[blen - 1].base[0]) { + /* matched class */ + b[blen - 1].class = b[i].class; + break; + } + } + if (i + 1 == blen) { + /* no match, assign a new class */ + b[blen - 1].class = bracket_class_count++; + } + return 0; } static void post_process(struct properties *prop) { - size_t i; + size_t i, j; for (i = 0; i < blen; i++) { /* @@ -185,7 +361,9 @@ post_process(struct properties *prop) * have offset 0, which we prepared to contain a stub * for a character that is not a bracket. */ - prop[b[i].cp_base].property |= (i << 5); + for (j = 0; j < b[i].baselen; j++) { + prop[b[i].base[j]].property |= (i << 5); + } } } @@ -236,7 +414,7 @@ main(int argc, char *argv[]) * all-zeros, as we use the implicit 0-offset for all those * codepoints that are not a bracket */ - if (!(b = calloc(1, sizeof(*b)))) { + if (!(b = calloc((blen = 1), sizeof(*b)))) { fprintf(stderr, "calloc: %s\n", strerror(errno)); exit(1); } @@ -248,16 +426,15 @@ main(int argc, char *argv[]) printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t" "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n" - "struct bracket {\n\tenum bracket_type type;\n" - "\tuint_least32_t pair;\n};\n\n" - "static const struct bracket bidi_bracket[] = {\n"); + "static const struct bracket {\n\tenum bracket_type type;\n" + "\tuint_least8_t class;\n} bidi_bracket[] = {\n"); for (i = 0; i < blen; i++) { - printf("\t{\n\t\t.type = %s,\n\t\t.pair = " - "UINT32_C(0x%06X),\n\t},\n", + printf("\t{\n\t\t.type = %s,\n\t\t.class = " + "%" PRIuLEAST8 ",\n\t},\n", (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" : (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" : "BIDI_BRACKET_NONE", - b[i].cp_pair); + b[i].class); } printf("};\n"); diff --git a/gen/case.c b/gen/case.c @@ -218,21 +218,21 @@ main(int argc, char *argv[]) properties_print_lookup_table("upper_major", mm_upper.major, 0x1100); printf("\n"); - properties_print_derived_lookup_table("upper_minor", "int_least32_t", - mm_upper.minor, mm_upper.minorlen, - get_value, comp_upper.data); + properties_print_derived_lookup_table("upper_minor", mm_upper.minor, + mm_upper.minorlen, get_value, + comp_upper.data); printf("\n"); properties_print_lookup_table("lower_major", mm_lower.major, 0x1100); printf("\n"); - properties_print_derived_lookup_table("lower_minor", "int_least32_t", - mm_lower.minor, mm_lower.minorlen, - get_value, comp_lower.data); + properties_print_derived_lookup_table("lower_minor", mm_lower.minor, + mm_lower.minorlen, get_value, + comp_lower.data); printf("\n"); properties_print_lookup_table("title_major", mm_title.major, 0x1100); printf("\n"); - properties_print_derived_lookup_table("title_minor", "int_least32_t", - mm_title.minor, mm_title.minorlen, - get_value, comp_title.data); + properties_print_derived_lookup_table("title_minor", mm_title.minor, + mm_title.minorlen, get_value, + comp_title.data); printf("\n"); printf("static const struct special_case upper_special[] = {\n"); diff --git a/gen/util.c b/gen/util.c @@ -405,9 +405,10 @@ properties_get_major_minor(const struct properties_compressed *comp, } void -properties_print_lookup_table(char *name, size_t *data, size_t datalen) +properties_print_lookup_table(const char *name, const size_t *data, + size_t datalen) { - char *type; + const char *type; size_t i, maxval; for (i = 0, maxval = 0; i < datalen; i++) { @@ -437,11 +438,41 @@ properties_print_lookup_table(char *name, size_t *data, size_t datalen) void properties_print_derived_lookup_table( - char *name, char *type, size_t *offset, size_t offsetlen, + char *name, size_t *offset, size_t offsetlen, int_least64_t (*get_value)(const struct properties *, size_t), const void *payload) { + const char *type; size_t i; + int_least64_t minval, maxval; + + for (i = 0, minval = INT_LEAST64_MAX, maxval = INT_LEAST64_MIN; + i < offsetlen; i++) { + if (get_value(payload, offset[i]) > maxval) { + maxval = get_value(payload, offset[i]); + } else if (get_value(payload, offset[i]) < minval) { + minval = get_value(payload, offset[i]); + } + } + + if (minval < 0) { + /* we need a signed type */ + type = (minval >= INT_LEAST8_MIN && maxval <= INT_LEAST8_MAX) ? + "int_least8_t" : + (minval >= INT_LEAST16_MIN && + maxval <= INT_LEAST16_MAX) ? + "int_least16_t" : + (minval >= INT_LEAST32_MIN && + maxval <= INT_LEAST32_MAX) ? + "int_least32_t" : + "int_least64_t"; + } else { + /* we are fine with an unsigned type */ + type = (maxval <= UINT_LEAST8_MAX) ? "uint_least8_t" : + (maxval <= UINT_LEAST16_MAX) ? "uint_least16_t" : + (maxval <= UINT_LEAST32_MAX) ? "uint_least32_t" : + "uint_least64_t"; + } printf("static const %s %s[] = {\n\t", type, name); for (i = 0; i < offsetlen; i++) { @@ -499,7 +530,7 @@ set_value_bp(struct properties_payload *payload, uint_least32_t cp, static int_least64_t get_value_bp(const struct properties *prop, size_t offset) { - return (uint_least8_t)prop[offset].property; + return prop[offset].property; } void @@ -606,9 +637,8 @@ properties_generate_break_property( properties_print_enum(spec, speclen, buf1, buf2); properties_print_lookup_table(buf3, mm.major, 0x1100); printf("\n"); - properties_print_derived_lookup_table(buf4, "uint_least8_t", mm.minor, - mm.minorlen, get_value_bp, - comp.data); + properties_print_derived_lookup_table(buf4, mm.minor, mm.minorlen, + get_value_bp, comp.data); /* free data */ free(prop); diff --git a/gen/util.h b/gen/util.h @@ -43,9 +43,9 @@ void properties_compress(const struct properties *, struct properties_compressed *comp); double properties_get_major_minor(const struct properties_compressed *, struct properties_major_minor *); -void properties_print_lookup_table(char *, size_t *, size_t); +void properties_print_lookup_table(const char *, const size_t *, size_t); void properties_print_derived_lookup_table( - char *, char *, size_t *, size_t, + char *, size_t *, size_t, int_least64_t (*get_value)(const struct properties *, size_t), const void *); diff --git a/src/bidirectional.c b/src/bidirectional.c @@ -74,6 +74,7 @@ set_state(enum state_type t, int_least16_t value, uint_least32_t *output) struct isolate_runner { uint_least32_t *buf; size_t buflen; + size_t start; struct { size_t off; @@ -83,7 +84,6 @@ struct isolate_runner { uint_least8_t paragraph_level; int_least8_t isolating_run_level; - enum bidi_property last_strong_type; }; static inline enum bidi_property @@ -110,14 +110,28 @@ ir_get_next_prop(const struct isolate_runner *ir) ir->buf[ir->next.off]); } +static inline enum bidi_property +ir_get_current_preserved_prop(const struct isolate_runner *ir) +{ + return (uint_least8_t)get_state(STATE_PRESERVED_PROP, + ir->buf[ir->cur.off]); +} + static inline int_least8_t ir_get_current_level(const struct isolate_runner *ir) { return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]); } +static inline const struct bracket * +ir_get_current_bracket_prop(const struct isolate_runner *ir) +{ + return bidi_bracket + + (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cur.off]); +} + static void -ir_set_current_prop(struct isolate_runner *ir, enum bidi_property prop) +ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property prop) { set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off])); } @@ -133,6 +147,7 @@ ir_init(uint_least32_t *buf, size_t buflen, size_t off, ir->buf = buf; ir->buflen = buflen; ir->paragraph_level = paragraph_level; + ir->start = off; /* advance off until we are at a non-removed character */ for (; off < buflen; off++) { @@ -225,18 +240,6 @@ ir_advance(struct isolate_runner *ir) /* mark as visited */ set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off])); - /* - * update last strong type, which is guaranteed to work properly - * on the first advancement as the prev.off is SIZE_T and the - * implied sos type can only be either R or L, which are both - * strong types - */ - if (ir_get_previous_prop(ir) == BIDI_PROP_R || - ir_get_previous_prop(ir) == BIDI_PROP_L || - ir_get_previous_prop(ir) == BIDI_PROP_AL) { - ir->last_strong_type = ir_get_previous_prop(ir); - } - /* initialize next state by going to the next character in the sequence */ ir->next.off = SIZE_MAX; @@ -328,6 +331,355 @@ ir_advance(struct isolate_runner *ir) return 0; } +static enum bidi_property +ir_get_last_strong_prop(const struct isolate_runner *ir) +{ + struct isolate_runner tmp; + enum bidi_property last_strong_prop = ir->sos, prop; + + ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false, + &tmp); + for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) { + prop = ir_get_current_prop(&tmp); + + if (prop == BIDI_PROP_R || prop == BIDI_PROP_L || + prop == BIDI_PROP_AL) { + last_strong_prop = prop; + } + } + + return last_strong_prop; +} + +static enum bidi_property +ir_get_last_strong_or_number_prop(const struct isolate_runner *ir) +{ + struct isolate_runner tmp; + enum bidi_property last_strong_or_number_prop = ir->sos, prop; + + ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false, + &tmp); + for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) { + prop = ir_get_current_prop(&tmp); + + if (prop == BIDI_PROP_R || prop == BIDI_PROP_L || + prop == BIDI_PROP_AL || prop == BIDI_PROP_EN || + prop == BIDI_PROP_AN) { + last_strong_or_number_prop = prop; + } + } + + return last_strong_or_number_prop; +} + +static void +preprocess_bracket_pair(const struct isolate_runner *start, + const struct isolate_runner *end) +{ + enum bidi_property prop, bracket_prop, last_strong_or_number_prop; + struct isolate_runner ir; + size_t strong_type_off; + + /* + * check if the bracket contains a strong type (L or R|EN|AN) + */ + for (ir = *start, strong_type_off = SIZE_MAX, + bracket_prop = NUM_BIDI_PROPS; + !ir_advance(&ir) && ir.cur.off < end->cur.off;) { + prop = ir_get_current_prop(&ir); + + if (prop == BIDI_PROP_L) { + strong_type_off = ir.cur.off; + if (ir.isolating_run_level % 2 == 0) { + /* + * set the type for both brackets to L (so they + * match the strong type they contain) + */ + bracket_prop = BIDI_PROP_L; + } + } else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN || + prop == BIDI_PROP_AN) { + strong_type_off = ir.cur.off; + if (ir.isolating_run_level % 2 != 0) { + /* + * set the type for both brackets to R (so they + * match the strong type they contain) + */ + bracket_prop = BIDI_PROP_R; + } + } + } + if (strong_type_off == SIZE_MAX) { + /* + * there are no strong types within the brackets and we just + * leave the brackets as is + */ + return; + } + + if (bracket_prop == NUM_BIDI_PROPS) { + /* + * We encountered a strong type, but it was opposite + * to the embedding direction. + * Check the previous strong type before the opening + * bracket + */ + last_strong_or_number_prop = + ir_get_last_strong_or_number_prop(start); + if (last_strong_or_number_prop == BIDI_PROP_L && + ir.isolating_run_level % 2 != 0) { + /* + * the previous strong type is also opposite + * to the embedding direction, so the context + * was established and we set the brackets + * accordingly. + */ + bracket_prop = BIDI_PROP_L; + } else if ((last_strong_or_number_prop == BIDI_PROP_R || + last_strong_or_number_prop == BIDI_PROP_EN || + last_strong_or_number_prop == BIDI_PROP_AN) && + ir.isolating_run_level % 2 == 0) { + /* + * the previous strong type is also opposite + * to the embedding direction, so the context + * was established and we set the brackets + * accordingly. + */ + bracket_prop = BIDI_PROP_R; + } else { + /* set brackets to the embedding direction */ + if (ir.isolating_run_level % 2 == 0) { + bracket_prop = BIDI_PROP_L; + } else { + bracket_prop = BIDI_PROP_R; + } + } + } + + ir_set_current_prop(start, bracket_prop); + ir_set_current_prop(end, bracket_prop); + + /* + * any sequence of NSMs after opening or closing brackets get + * the same property as the one we set on the brackets + */ + for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_prop( + &ir) == BIDI_PROP_NSM;) { + ir_set_current_prop(&ir, bracket_prop); + } + for (ir = *end; !ir_advance(&ir) && + ir_get_current_preserved_prop(&ir) == BIDI_PROP_NSM;) { + ir_set_current_prop(&ir, bracket_prop); + } +} + +static void +preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off, + uint_least8_t paragraph_level) +{ + /* + * The N0-rule deals with bracket pairs that shall be determined + * with the rule BD16. This is specified as an algorithm with a + * stack of 63 bracket openings that are used to resolve into a + * separate list of pairs, which is then to be sorted by opening + * position. Thus, even though the bracketing-depth is limited + * by 63, the algorithm, as is, requires dynamic memory + * management. + * + * A naive approach (used by Fribidi) would be to screw the + * stack-approach and simply directly determine the + * corresponding closing bracket offset for a given opening + * bracket, leading to O(n²) time complexity in the worst case + * with a lot of brackets. While many brackets are not common, + * it is still possible to find a middle ground where you obtain + * strongly linear time complexity in most common cases: + * + * Instead of a stack, we use a FIFO data structure which is + * filled with bracket openings in the order of appearance (thus + * yielding an implicit sorting!) at the top. If the + * corresponding closing bracket is encountered, it is added to + * the respective entry, making it ready to "move out" at the + * bottom (i.e. passed to the bracket processing). Due to the + * nature of the specified pair detection algorithm, which only + * cares about the bracket type and nothing else (bidi class, + * level, etc.), we can mix processing and bracket detection. + * + * Obviously, if you, for instance, have one big bracket pair at + * the bottom that has not been closed yet, it will block the + * bracket processing and the FIFO might hit its capacity limit. + * At this point, the blockage is manually resolved using the + * naive quadratic approach. + * + * To remain within the specified standard behaviour, which + * mandates that processing of brackets should stop when the + * bracketing-depth is at 63, we simply check in an "overflow" + * scenario if all 63 elements in the LIFO are unfinished, which + * corresponds with such a bracketing depth. + */ + enum bidi_property prop; + + struct { + bool complete; + size_t bracket_class; + struct isolate_runner start; + struct isolate_runner end; + } fifo[63]; + const struct bracket *bracket_prop, *tmp_bracket_prop; + struct isolate_runner ir, tmp_ir; + size_t fifo_len = 0, i, blevel, j, k; + + ir_init(buf, buflen, off, paragraph_level, false, &ir); + while (!ir_advance(&ir)) { + prop = ir_get_current_prop(&ir); + bracket_prop = ir_get_current_bracket_prop(&ir); + if (prop == BIDI_PROP_ON && + bracket_prop->type == BIDI_BRACKET_OPEN) { + if (fifo_len == LEN(fifo)) { + /* + * The FIFO is full, check first if it's + * completely blocked (i.e. no finished + * bracket pairs, triggering the standard + * that mandates to abort in such a case + */ + for (i = 0; i < fifo_len; i++) { + if (fifo[i].complete) { + break; + } + } + if (i == fifo_len) { + /* abort processing */ + return; + } + + /* + * by construction, the bottom entry + * in the FIFO is guaranteed to be + * unfinished (given we "consume" all + * finished bottom entries after each + * iteration). + * + * iterate, starting after the opening + * bracket, and find the corresponding + * closing bracket. + * + * if we find none, just drop the FIFO + * entry silently + */ + for (tmp_ir = fifo[0].start, blevel = 0; + !ir_advance(&tmp_ir);) { + tmp_bracket_prop = + ir_get_current_bracket_prop( + &tmp_ir); + + if (tmp_bracket_prop->type == + BIDI_BRACKET_OPEN && + tmp_bracket_prop->class == + bracket_prop->class) { + /* we encountered another + * opening bracket of the + * same class */ + blevel++; + + } else if (tmp_bracket_prop->type == + BIDI_BRACKET_CLOSE && + tmp_bracket_prop->class == + bracket_prop + ->class) { + /* we encountered a closing + * bracket of the same class + */ + if (blevel == 0) { + /* this is the + * corresponding + * closing bracket + */ + fifo[0].complete = true; + fifo[0].end = ir; + } else { + blevel--; + } + } + } + if (fifo[0].complete) { + /* we found the matching bracket */ + preprocess_bracket_pair( + &(fifo[i].start), + &(fifo[i].end)); + } + + /* shift FIFO one to the left */ + for (i = 1; i < fifo_len; i++) { + fifo[i - 1] = fifo[i]; + } + fifo_len--; + } + + /* add element to the FIFO */ + fifo_len++; + fifo[fifo_len - 1].complete = false; + fifo[fifo_len - 1].bracket_class = bracket_prop->class; + fifo[fifo_len - 1].start = ir; + } else if (prop == BIDI_PROP_ON && + bracket_prop->type == BIDI_BRACKET_CLOSE) { + /* + * go backwards in the FIFO, skip finished entries + * and simply ignore (do nothing) the closing + * bracket if we do not match anything + */ + for (i = fifo_len; i > 0; i--) { + if (bracket_prop->class == + fifo[i - 1].bracket_class && + !fifo[i - 1].complete) { + /* we have found a pair */ + fifo[i - 1].complete = true; + fifo[i - 1].end = ir; + + /* remove all uncompleted FIFO elements + * above i - 1 */ + for (j = i; j < fifo_len; j++) { + if (fifo[j].complete) { + continue; + } + + /* shift FIFO one to the left */ + for (k = j + 1; k < fifo_len; + k++) { + fifo[k - 1] = fifo[k]; + } + fifo_len--; + } + break; + } + } + } + + /* process all ready bracket pairs from the FIFO bottom */ + while (fifo_len > 0 && fifo[0].complete) { + preprocess_bracket_pair(&(fifo[0].start), + &(fifo[0].end)); + + /* shift FIFO one to the left */ + for (j = 0; j + 1 < fifo_len; j++) { + fifo[j] = fifo[j + 1]; + } + fifo_len--; + } + } + + /* + * afterwards, we still might have unfinished bracket pairs + * that will remain as such, but the subsequent finished pairs + * also need to be taken into account, so we go through the + * FIFO once more and process all finished pairs + */ + for (i = 0; i < fifo_len; i++) { + if (fifo[i].complete) { + preprocess_bracket_pair(&(fifo[i].start), + &(fifo[i].end)); + } + } +} + static size_t preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, size_t off, uint_least8_t paragraph_level) @@ -355,7 +707,7 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, ir_init(buf, buflen, off, paragraph_level, false, &ir); while (!ir_advance(&ir)) { if (ir_get_current_prop(&ir) == BIDI_PROP_EN && - ir.last_strong_type == BIDI_PROP_AL) { + ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) { ir_set_current_prop(&ir, BIDI_PROP_AN); } } @@ -438,12 +790,13 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, ir_init(buf, buflen, off, paragraph_level, false, &ir); while (!ir_advance(&ir)) { if (ir_get_current_prop(&ir) == BIDI_PROP_EN && - ir.last_strong_type == BIDI_PROP_L) { + ir_get_last_strong_prop(&ir) == BIDI_PROP_L) { ir_set_current_prop(&ir, BIDI_PROP_L); } } /* N0 */ + preprocess_bracket_pairs(buf, buflen, off, paragraph_level); /* N1 */ sequence_end = SIZE_MAX; @@ -457,10 +810,11 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, prop == BIDI_PROP_WS || prop == BIDI_PROP_ON || prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) { - /* the current character is an NI (neutral or - * isolate) */ + /* the current character is an NI (neutral + * or isolate) */ - /* scan ahead to the end of the NI-sequence */ + /* scan ahead to the end of the NI-sequence + */ ir_init(buf, buflen, ir.cur.off, paragraph_level, (ir.cur.off > off), &tmp); @@ -480,8 +834,8 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, } /* - * check what follows and see if the text has - * the same direction on both sides + * check what follows and see if the text + * has the same direction on both sides */ if (ir_get_previous_prop(&ir) == BIDI_PROP_L && ir_get_next_prop(&tmp) == BIDI_PROP_L) { @@ -794,7 +1148,8 @@ again: } else if (prop == BIDI_PROP_PDI) { /* X6a */ if (overflow_isolate_count > 0) { - /* PDI matches an overflow isolate initiator */ + /* PDI matches an overflow isolate initiator + */ overflow_isolate_count--; } else if (valid_isolate_count > 0) { /* PDI matches a normal isolate initiator */ @@ -812,9 +1167,10 @@ again: * * POSSIBLE OPTIMIZATION: Whenever * we push on the stack, check if it - * has the directional isolate status - * true and store a pointer to it - * so we can jump to it very quickly. + * has the directional isolate + * status true and store a pointer + * to it so we can jump to it very + * quickly. */ dirstat--; } @@ -974,7 +1330,9 @@ static inline uint_least8_t get_bidi_bracket_off(uint_least32_t cp) { if (likely(cp <= 0x10FFFF)) { - return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) >> 5; + return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] + + (cp & 0xff)]) >> + 5); } else { return 0; } diff --git a/test/bidirectional.c b/test/bidirectional.c @@ -25,9 +25,6 @@ main(int argc, char *argv[]) } for (i = 0, failed = 0; i < LEN(bidirectional_test); i++) { - /*if (i != 490798) - continue;*/ - for (m = 0; m < bidirectional_test[i].modelen; m++) { ret = grapheme_bidirectional_preprocess( bidirectional_test[i].cp,