libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

bidirectional.c (48331B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stdbool.h>
      3#include <stddef.h>
      4
      5#include "../gen/bidirectional.h"
      6#include "../grapheme.h"
      7#include "util.h"
      8
      9#define MAX_DEPTH 125
     10
     11enum state_type {
     12	STATE_PROP,            /* in 0..23, bidi_property */
     13	STATE_PRESERVED_PROP,  /* in 0..23, preserved bidi_prop for L1-rule */
     14	STATE_BRACKET_OFF,     /* in 0..255, offset in bidi_bracket */
     15	STATE_LEVEL,           /* in 0..MAX_DEPTH+1=126, embedding level */
     16	STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */
     17	STATE_VISITED,         /* in 0..1, visited within isolating run */
     18};
     19
     20static struct {
     21	uint_least32_t filter_mask;
     22	size_t mask_shift;
     23	int_least16_t value_offset;
     24} state_lut[] = {
     25	[STATE_PROP] = {
     26		.filter_mask  = 0x000001F, /* 00000000 00000000 00000000 00011111 */
     27		.mask_shift   = 0,
     28		.value_offset = 0,
     29	},
     30	[STATE_PRESERVED_PROP] = {
     31		.filter_mask  = 0x00003E0, /* 00000000 00000000 00000011 11100000 */
     32		.mask_shift   = 5,
     33		.value_offset = 0,
     34	},
     35	[STATE_BRACKET_OFF] = {
     36		.filter_mask  = 0x003FC00, /* 00000000 00000011 11111100 00000000 */
     37		.mask_shift   = 10,
     38		.value_offset = 0,
     39	},
     40	[STATE_LEVEL] = {
     41		.filter_mask  = 0x1FC0000, /* 00000001 11111100 00000000 00000000 */
     42		.mask_shift   = 18,
     43		.value_offset = -1,
     44	},
     45	[STATE_PARAGRAPH_LEVEL] = {
     46		.filter_mask  = 0x2000000, /* 00000010 00000000 00000000 00000000 */
     47		.mask_shift   = 25,
     48		.value_offset = 0,
     49	},
     50	[STATE_VISITED] = {
     51		.filter_mask  = 0x4000000, /* 00000100 00000000 00000000 00000000 */
     52		.mask_shift   = 26,
     53		.value_offset = 0,
     54	},
     55};
     56
     57static inline int_least16_t
     58get_state(enum state_type t, uint_least32_t input)
     59{
     60	return (int_least16_t)((input & state_lut[t].filter_mask) >>
     61	                       state_lut[t].mask_shift) +
     62	       state_lut[t].value_offset;
     63}
     64
     65static inline void
     66set_state(enum state_type t, int_least16_t value, uint_least32_t *output)
     67{
     68	*output &= ~state_lut[t].filter_mask;
     69	*output |= ((uint_least32_t)(value - state_lut[t].value_offset)
     70	            << state_lut[t].mask_shift) &
     71	           state_lut[t].filter_mask;
     72}
     73
     74struct isolate_runner {
     75	uint_least32_t *buf;
     76	size_t buflen;
     77	size_t start;
     78
     79	struct {
     80		size_t off;
     81	} prev, cur, next;
     82
     83	enum bidi_property sos, eos;
     84
     85	uint_least8_t paragraph_level;
     86	int_least8_t isolating_run_level;
     87};
     88
     89static inline enum bidi_property
     90ir_get_previous_prop(const struct isolate_runner *ir)
     91{
     92	return (ir->prev.off == SIZE_MAX) ?
     93	               ir->sos :
     94	               (uint_least8_t)get_state(STATE_PROP,
     95	                                        ir->buf[ir->prev.off]);
     96}
     97
     98static inline enum bidi_property
     99ir_get_current_prop(const struct isolate_runner *ir)
    100{
    101	return (uint_least8_t)get_state(STATE_PROP, ir->buf[ir->cur.off]);
    102}
    103
    104static inline enum bidi_property
    105ir_get_next_prop(const struct isolate_runner *ir)
    106{
    107	return (ir->next.off == SIZE_MAX) ?
    108	               ir->eos :
    109	               (uint_least8_t)get_state(STATE_PROP,
    110	                                        ir->buf[ir->next.off]);
    111}
    112
    113static inline enum bidi_property
    114ir_get_current_preserved_prop(const struct isolate_runner *ir)
    115{
    116	return (uint_least8_t)get_state(STATE_PRESERVED_PROP,
    117	                                ir->buf[ir->cur.off]);
    118}
    119
    120static inline int_least8_t
    121ir_get_current_level(const struct isolate_runner *ir)
    122{
    123	return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]);
    124}
    125
    126static inline const struct bracket *
    127ir_get_current_bracket_prop(const struct isolate_runner *ir)
    128{
    129	return bidi_bracket +
    130	       (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cur.off]);
    131}
    132
    133static void
    134ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property prop)
    135{
    136	set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off]));
    137}
    138
    139static void
    140ir_init(uint_least32_t *buf, size_t buflen, size_t off,
    141        uint_least8_t paragraph_level, bool within, struct isolate_runner *ir)
    142{
    143	size_t i;
    144	int_least8_t sos_level;
    145
    146	/* initialize invariants */
    147	ir->buf = buf;
    148	ir->buflen = buflen;
    149	ir->paragraph_level = paragraph_level;
    150	ir->start = off;
    151
    152	/* advance off until we are at a non-removed character */
    153	for (; off < buflen; off++) {
    154		if (get_state(STATE_LEVEL, buf[off]) != -1) {
    155			break;
    156		}
    157	}
    158	if (off == buflen) {
    159		/* we encountered no more non-removed character, terminate */
    160		ir->next.off = SIZE_MAX;
    161		return;
    162	}
    163
    164	/* set the isolating run level to that of the current offset */
    165	ir->isolating_run_level =
    166		(int_least8_t)get_state(STATE_LEVEL, buf[off]);
    167
    168	/* initialize sos and eos to dummy values */
    169	ir->sos = ir->eos = NUM_BIDI_PROPS;
    170
    171	/*
    172	 * we write the information of the "current" state into next,
    173	 * so that the shift-in at the first advancement moves it in
    174	 * cur, as desired.
    175	 */
    176	ir->next.off = off;
    177
    178	/*
    179	 * determine the previous state but store its offset in cur.off,
    180	 * given it's shifted in on the first advancement
    181	 */
    182	ir->cur.off = SIZE_MAX;
    183	for (i = off, sos_level = -1; i >= 1; i--) {
    184		if (get_state(STATE_LEVEL, buf[i - 1]) != -1) {
    185			/*
    186			 * we found a character that has not been
    187			 * removed in X9
    188			 */
    189			sos_level = (int_least8_t)get_state(STATE_LEVEL,
    190			                                    buf[i - 1]);
    191
    192			if (within) {
    193				/* we just take it */
    194				ir->cur.off = i;
    195			}
    196
    197			break;
    198		}
    199	}
    200	if (sos_level == -1) {
    201		/*
    202		 * there were no preceding non-removed characters, set
    203		 * sos-level to paragraph embedding level
    204		 */
    205		sos_level = (int_least8_t)paragraph_level;
    206	}
    207
    208	if (!within || ir->cur.off == SIZE_MAX) {
    209		/*
    210		 * we are at the beginning of the sequence; initialize
    211		 * it faithfully according to the algorithm by looking
    212		 * at the sos-level
    213		 */
    214		if (MAX(sos_level, ir->isolating_run_level) % 2 == 0) {
    215			/* the higher level is even, set sos to L */
    216			ir->sos = BIDI_PROP_L;
    217		} else {
    218			/* the higher level is odd, set sos to R */
    219			ir->sos = BIDI_PROP_R;
    220		}
    221	}
    222}
    223
    224static int
    225ir_advance(struct isolate_runner *ir)
    226{
    227	enum bidi_property prop;
    228	int_least8_t level, isolate_level, last_isolate_level;
    229	size_t i;
    230
    231	if (ir->next.off == SIZE_MAX) {
    232		/* the sequence is over */
    233		return 1;
    234	}
    235
    236	/* shift in */
    237	ir->prev.off = ir->cur.off;
    238	ir->cur.off = ir->next.off;
    239
    240	/* mark as visited */
    241	set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off]));
    242
    243	/* initialize next state by going to the next character in the sequence
    244	 */
    245	ir->next.off = SIZE_MAX;
    246
    247	last_isolate_level = -1;
    248	for (i = ir->cur.off, isolate_level = 0; i < ir->buflen; i++) {
    249		level = (int_least8_t)get_state(STATE_LEVEL, ir->buf[i]);
    250		prop = (uint_least8_t)get_state(STATE_PROP, ir->buf[i]);
    251
    252		if (level == -1) {
    253			/* this is one of the ignored characters, skip */
    254			continue;
    255		} else if (level == ir->isolating_run_level) {
    256			last_isolate_level = level;
    257		}
    258
    259		/* follow BD8/BD9 and P2 to traverse the current sequence */
    260		if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
    261		    prop == BIDI_PROP_FSI) {
    262			/*
    263			 * we encountered an isolate initiator, increment
    264			 * counter, but go into processing when we
    265			 * were not isolated before
    266			 */
    267			if (isolate_level < MAX_DEPTH) {
    268				isolate_level++;
    269			}
    270			if (isolate_level != 1) {
    271				continue;
    272			}
    273		} else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
    274			isolate_level--;
    275
    276			/*
    277			 * if the current PDI dropped the isolate-level
    278			 * to zero, it is itself part of the isolating
    279			 * run sequence; otherwise we simply continue.
    280			 */
    281			if (isolate_level > 0) {
    282				continue;
    283			}
    284		} else if (isolate_level > 0) {
    285			/* we are in an isolating sequence */
    286			continue;
    287		}
    288
    289		/*
    290		 * now we either still are in our sequence or we hit
    291		 * the eos-case as we left the sequence and hit the
    292		 * first non-isolating-sequence character.
    293		 */
    294		if (i == ir->cur.off) {
    295			/* we were in the first initializing round */
    296			continue;
    297		} else if (level == ir->isolating_run_level) {
    298			/* isolate_level-skips have been handled before, we're
    299			 * good */
    300			/* still in the sequence */
    301			ir->next.off = i;
    302		} else {
    303			/* out of sequence or isolated, compare levels via eos
    304			 */
    305			ir->next.off = SIZE_MAX;
    306			if (MAX(last_isolate_level, level) % 2 == 0) {
    307				ir->eos = BIDI_PROP_L;
    308			} else {
    309				ir->eos = BIDI_PROP_R;
    310			}
    311		}
    312		break;
    313	}
    314	if (i == ir->buflen) {
    315		/*
    316		 * the sequence ended before we could grab an offset.
    317		 * we need to determine the eos-prop by comparing the
    318		 * level of the last element in the isolating run sequence
    319		 * with the paragraph level.
    320		 */
    321		ir->next.off = SIZE_MAX;
    322		if (MAX(last_isolate_level, ir->paragraph_level) % 2 == 0) {
    323			/* the higher level is even, set eos to L */
    324			ir->eos = BIDI_PROP_L;
    325		} else {
    326			/* the higher level is odd, set eos to R */
    327			ir->eos = BIDI_PROP_R;
    328		}
    329	}
    330
    331	return 0;
    332}
    333
    334static enum bidi_property
    335ir_get_last_strong_prop(const struct isolate_runner *ir)
    336{
    337	struct isolate_runner tmp;
    338	enum bidi_property last_strong_prop = ir->sos, prop;
    339
    340	ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
    341	        &tmp);
    342	for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
    343		prop = ir_get_current_prop(&tmp);
    344
    345		if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
    346		    prop == BIDI_PROP_AL) {
    347			last_strong_prop = prop;
    348		}
    349	}
    350
    351	return last_strong_prop;
    352}
    353
    354static enum bidi_property
    355ir_get_last_strong_or_number_prop(const struct isolate_runner *ir)
    356{
    357	struct isolate_runner tmp;
    358	enum bidi_property last_strong_or_number_prop = ir->sos, prop;
    359
    360	ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
    361	        &tmp);
    362	for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
    363		prop = ir_get_current_prop(&tmp);
    364
    365		if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
    366		    prop == BIDI_PROP_AL || prop == BIDI_PROP_EN ||
    367		    prop == BIDI_PROP_AN) {
    368			last_strong_or_number_prop = prop;
    369		}
    370	}
    371
    372	return last_strong_or_number_prop;
    373}
    374
    375static void
    376preprocess_bracket_pair(const struct isolate_runner *start,
    377                        const struct isolate_runner *end)
    378{
    379	enum bidi_property prop, bracket_prop, last_strong_or_number_prop;
    380	struct isolate_runner ir;
    381	size_t strong_type_off;
    382
    383	/*
    384	 * check if the bracket contains a strong type (L or R|EN|AN)
    385	 */
    386	for (ir = *start, strong_type_off = SIZE_MAX,
    387	    bracket_prop = NUM_BIDI_PROPS;
    388	     !ir_advance(&ir) && ir.cur.off < end->cur.off;) {
    389		prop = ir_get_current_prop(&ir);
    390
    391		if (prop == BIDI_PROP_L) {
    392			strong_type_off = ir.cur.off;
    393			if (ir.isolating_run_level % 2 == 0) {
    394				/*
    395				 * set the type for both brackets to L (so they
    396				 * match the strong type they contain)
    397				 */
    398				bracket_prop = BIDI_PROP_L;
    399			}
    400		} else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN ||
    401		           prop == BIDI_PROP_AN) {
    402			strong_type_off = ir.cur.off;
    403			if (ir.isolating_run_level % 2 != 0) {
    404				/*
    405				 * set the type for both brackets to R (so they
    406				 * match the strong type they contain)
    407				 */
    408				bracket_prop = BIDI_PROP_R;
    409			}
    410		}
    411	}
    412	if (strong_type_off == SIZE_MAX) {
    413		/*
    414		 * there are no strong types within the brackets and we just
    415		 * leave the brackets as is
    416		 */
    417		return;
    418	}
    419
    420	if (bracket_prop == NUM_BIDI_PROPS) {
    421		/*
    422		 * We encountered a strong type, but it was opposite
    423		 * to the embedding direction.
    424		 * Check the previous strong type before the opening
    425		 * bracket
    426		 */
    427		last_strong_or_number_prop =
    428			ir_get_last_strong_or_number_prop(start);
    429		if (last_strong_or_number_prop == BIDI_PROP_L &&
    430		    ir.isolating_run_level % 2 != 0) {
    431			/*
    432			 * the previous strong type is also opposite
    433			 * to the embedding direction, so the context
    434			 * was established and we set the brackets
    435			 * accordingly.
    436			 */
    437			bracket_prop = BIDI_PROP_L;
    438		} else if ((last_strong_or_number_prop == BIDI_PROP_R ||
    439		            last_strong_or_number_prop == BIDI_PROP_EN ||
    440		            last_strong_or_number_prop == BIDI_PROP_AN) &&
    441		           ir.isolating_run_level % 2 == 0) {
    442			/*
    443			 * the previous strong type is also opposite
    444			 * to the embedding direction, so the context
    445			 * was established and we set the brackets
    446			 * accordingly.
    447			 */
    448			bracket_prop = BIDI_PROP_R;
    449		} else {
    450			/* set brackets to the embedding direction */
    451			if (ir.isolating_run_level % 2 == 0) {
    452				bracket_prop = BIDI_PROP_L;
    453			} else {
    454				bracket_prop = BIDI_PROP_R;
    455			}
    456		}
    457	}
    458
    459	ir_set_current_prop(start, bracket_prop);
    460	ir_set_current_prop(end, bracket_prop);
    461
    462	/*
    463	 * any sequence of NSMs after opening or closing brackets get
    464	 * the same property as the one we set on the brackets
    465	 */
    466	for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_prop(
    467						      &ir) == BIDI_PROP_NSM;) {
    468		ir_set_current_prop(&ir, bracket_prop);
    469	}
    470	for (ir = *end; !ir_advance(&ir) &&
    471	                ir_get_current_preserved_prop(&ir) == BIDI_PROP_NSM;) {
    472		ir_set_current_prop(&ir, bracket_prop);
    473	}
    474}
    475
    476static void
    477preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off,
    478                         uint_least8_t paragraph_level)
    479{
    480	/*
    481	 * The N0-rule deals with bracket pairs that shall be determined
    482	 * with the rule BD16. This is specified as an algorithm with a
    483	 * stack of 63 bracket openings that are used to resolve into a
    484	 * separate list of pairs, which is then to be sorted by opening
    485	 * position. Thus, even though the bracketing-depth is limited
    486	 * by 63, the algorithm, as is, requires dynamic memory
    487	 * management.
    488	 *
    489	 * A naive approach (used by Fribidi) would be to screw the
    490	 * stack-approach and simply directly determine the
    491	 * corresponding closing bracket offset for a given opening
    492	 * bracket, leading to O(n²) time complexity in the worst case
    493	 * with a lot of brackets. While many brackets are not common,
    494	 * it is still possible to find a middle ground where you obtain
    495	 * strongly linear time complexity in most common cases:
    496	 *
    497	 * Instead of a stack, we use a FIFO data structure which is
    498	 * filled with bracket openings in the order of appearance (thus
    499	 * yielding an implicit sorting!) at the top. If the
    500	 * corresponding closing bracket is encountered, it is added to
    501	 * the respective entry, making it ready to "move out" at the
    502	 * bottom (i.e. passed to the bracket processing). Due to the
    503	 * nature of the specified pair detection algorithm, which only
    504	 * cares about the bracket type and nothing else (bidi class,
    505	 * level, etc.), we can mix processing and bracket detection.
    506	 *
    507	 * Obviously, if you, for instance, have one big bracket pair at
    508	 * the bottom that has not been closed yet, it will block the
    509	 * bracket processing and the FIFO might hit its capacity limit.
    510	 * At this point, the blockage is manually resolved using the
    511	 * naive quadratic approach.
    512	 *
    513	 * To remain within the specified standard behaviour, which
    514	 * mandates that processing of brackets should stop when the
    515	 * bracketing-depth is at 63, we simply check in an "overflow"
    516	 * scenario if all 63 elements in the LIFO are unfinished, which
    517	 * corresponds with such a bracketing depth.
    518	 */
    519	enum bidi_property prop;
    520
    521	struct {
    522		bool complete;
    523		size_t bracket_class;
    524		struct isolate_runner start;
    525		struct isolate_runner end;
    526	} fifo[63];
    527	const struct bracket *bracket_prop, *tmp_bracket_prop;
    528	struct isolate_runner ir, tmp_ir;
    529	size_t fifo_len = 0, i, blevel, j, k;
    530
    531	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    532	while (!ir_advance(&ir)) {
    533		prop = ir_get_current_prop(&ir);
    534		bracket_prop = ir_get_current_bracket_prop(&ir);
    535		if (prop == BIDI_PROP_ON &&
    536		    bracket_prop->type == BIDI_BRACKET_OPEN) {
    537			if (fifo_len == LEN(fifo)) {
    538				/*
    539				 * The FIFO is full, check first if it's
    540				 * completely blocked (i.e. no finished
    541				 * bracket pairs, triggering the standard
    542				 * that mandates to abort in such a case
    543				 */
    544				for (i = 0; i < fifo_len; i++) {
    545					if (fifo[i].complete) {
    546						break;
    547					}
    548				}
    549				if (i == fifo_len) {
    550					/* abort processing */
    551					return;
    552				}
    553
    554				/*
    555				 * by construction, the bottom entry
    556				 * in the FIFO is guaranteed to be
    557				 * unfinished (given we "consume" all
    558				 * finished bottom entries after each
    559				 * iteration).
    560				 *
    561				 * iterate, starting after the opening
    562				 * bracket, and find the corresponding
    563				 * closing bracket.
    564				 *
    565				 * if we find none, just drop the FIFO
    566				 * entry silently
    567				 */
    568				for (tmp_ir = fifo[0].start, blevel = 0;
    569				     !ir_advance(&tmp_ir);) {
    570					tmp_bracket_prop =
    571						ir_get_current_bracket_prop(
    572							&tmp_ir);
    573
    574					if (tmp_bracket_prop->type ==
    575					            BIDI_BRACKET_OPEN &&
    576					    tmp_bracket_prop->class ==
    577					            bracket_prop->class) {
    578						/* we encountered another
    579						 * opening bracket of the
    580						 * same class */
    581						blevel++;
    582
    583					} else if (tmp_bracket_prop->type ==
    584					                   BIDI_BRACKET_CLOSE &&
    585					           tmp_bracket_prop->class ==
    586					                   bracket_prop
    587					                           ->class) {
    588						/* we encountered a closing
    589						 * bracket of the same class
    590						 */
    591						if (blevel == 0) {
    592							/* this is the
    593							 * corresponding
    594							 * closing bracket
    595							 */
    596							fifo[0].complete = true;
    597							fifo[0].end = ir;
    598						} else {
    599							blevel--;
    600						}
    601					}
    602				}
    603				if (fifo[0].complete) {
    604					/* we found the matching bracket */
    605					preprocess_bracket_pair(
    606						&(fifo[i].start),
    607						&(fifo[i].end));
    608				}
    609
    610				/* shift FIFO one to the left */
    611				for (i = 1; i < fifo_len; i++) {
    612					fifo[i - 1] = fifo[i];
    613				}
    614				fifo_len--;
    615			}
    616
    617			/* add element to the FIFO */
    618			fifo_len++;
    619			fifo[fifo_len - 1].complete = false;
    620			fifo[fifo_len - 1].bracket_class = bracket_prop->class;
    621			fifo[fifo_len - 1].start = ir;
    622		} else if (prop == BIDI_PROP_ON &&
    623		           bracket_prop->type == BIDI_BRACKET_CLOSE) {
    624			/*
    625			 * go backwards in the FIFO, skip finished entries
    626			 * and simply ignore (do nothing) the closing
    627			 * bracket if we do not match anything
    628			 */
    629			for (i = fifo_len; i > 0; i--) {
    630				if (bracket_prop->class ==
    631				            fifo[i - 1].bracket_class &&
    632				    !fifo[i - 1].complete) {
    633					/* we have found a pair */
    634					fifo[i - 1].complete = true;
    635					fifo[i - 1].end = ir;
    636
    637					/* remove all uncompleted FIFO elements
    638					 * above i - 1 */
    639					for (j = i; j < fifo_len;) {
    640						if (fifo[j].complete) {
    641							j++;
    642							continue;
    643						}
    644
    645						/* shift FIFO one to the left */
    646						for (k = j + 1; k < fifo_len;
    647						     k++) {
    648							fifo[k - 1] = fifo[k];
    649						}
    650						fifo_len--;
    651					}
    652					break;
    653				}
    654			}
    655		}
    656
    657		/* process all ready bracket pairs from the FIFO bottom */
    658		while (fifo_len > 0 && fifo[0].complete) {
    659			preprocess_bracket_pair(&(fifo[0].start),
    660			                        &(fifo[0].end));
    661
    662			/* shift FIFO one to the left */
    663			for (j = 0; j + 1 < fifo_len; j++) {
    664				fifo[j] = fifo[j + 1];
    665			}
    666			fifo_len--;
    667		}
    668	}
    669
    670	/*
    671	 * afterwards, we still might have unfinished bracket pairs
    672	 * that will remain as such, but the subsequent finished pairs
    673	 * also need to be taken into account, so we go through the
    674	 * FIFO once more and process all finished pairs
    675	 */
    676	for (i = 0; i < fifo_len; i++) {
    677		if (fifo[i].complete) {
    678			preprocess_bracket_pair(&(fifo[i].start),
    679			                        &(fifo[i].end));
    680		}
    681	}
    682}
    683
    684static size_t
    685preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen,
    686                                  size_t off, uint_least8_t paragraph_level)
    687{
    688	enum bidi_property sequence_prop, prop;
    689	struct isolate_runner ir, tmp;
    690	size_t runsince, sequence_end;
    691
    692	/* W1 */
    693	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    694	while (!ir_advance(&ir)) {
    695		if (ir_get_current_prop(&ir) == BIDI_PROP_NSM) {
    696			prop = ir_get_previous_prop(&ir);
    697
    698			if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
    699			    prop == BIDI_PROP_FSI || prop == BIDI_PROP_PDI) {
    700				ir_set_current_prop(&ir, BIDI_PROP_ON);
    701			} else {
    702				ir_set_current_prop(&ir, prop);
    703			}
    704		}
    705	}
    706
    707	/* W2 */
    708	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    709	while (!ir_advance(&ir)) {
    710		if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
    711		    ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) {
    712			ir_set_current_prop(&ir, BIDI_PROP_AN);
    713		}
    714	}
    715
    716	/* W3 */
    717	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    718	while (!ir_advance(&ir)) {
    719		if (ir_get_current_prop(&ir) == BIDI_PROP_AL) {
    720			ir_set_current_prop(&ir, BIDI_PROP_R);
    721		}
    722	}
    723
    724	/* W4 */
    725	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    726	while (!ir_advance(&ir)) {
    727		if (ir_get_previous_prop(&ir) == BIDI_PROP_EN &&
    728		    (ir_get_current_prop(&ir) == BIDI_PROP_ES ||
    729		     ir_get_current_prop(&ir) == BIDI_PROP_CS) &&
    730		    ir_get_next_prop(&ir) == BIDI_PROP_EN) {
    731			ir_set_current_prop(&ir, BIDI_PROP_EN);
    732		}
    733
    734		if (ir_get_previous_prop(&ir) == BIDI_PROP_AN &&
    735		    ir_get_current_prop(&ir) == BIDI_PROP_CS &&
    736		    ir_get_next_prop(&ir) == BIDI_PROP_AN) {
    737			ir_set_current_prop(&ir, BIDI_PROP_AN);
    738		}
    739	}
    740
    741	/* W5 */
    742	runsince = SIZE_MAX;
    743	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    744	while (!ir_advance(&ir)) {
    745		if (ir_get_current_prop(&ir) == BIDI_PROP_ET) {
    746			if (runsince == SIZE_MAX) {
    747				/* a new run has begun */
    748				runsince = ir.cur.off;
    749			}
    750		} else if (ir_get_current_prop(&ir) == BIDI_PROP_EN) {
    751			/* set the preceding sequence */
    752			if (runsince != SIZE_MAX) {
    753				ir_init(buf, buflen, runsince, paragraph_level,
    754				        (runsince > off), &tmp);
    755				while (!ir_advance(&tmp) &&
    756				       tmp.cur.off < ir.cur.off) {
    757					ir_set_current_prop(&tmp, BIDI_PROP_EN);
    758				}
    759				runsince = SIZE_MAX;
    760			} else {
    761				ir_init(buf, buflen, ir.cur.off,
    762				        paragraph_level, (ir.cur.off > off),
    763				        &tmp);
    764				ir_advance(&tmp);
    765			}
    766			/* follow the succeeding sequence */
    767			while (!ir_advance(&tmp)) {
    768				if (ir_get_current_prop(&tmp) != BIDI_PROP_ET) {
    769					break;
    770				}
    771				ir_set_current_prop(&tmp, BIDI_PROP_EN);
    772			}
    773		} else {
    774			/* sequence ended */
    775			runsince = SIZE_MAX;
    776		}
    777	}
    778
    779	/* W6 */
    780	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    781	while (!ir_advance(&ir)) {
    782		prop = ir_get_current_prop(&ir);
    783
    784		if (prop == BIDI_PROP_ES || prop == BIDI_PROP_ET ||
    785		    prop == BIDI_PROP_CS) {
    786			ir_set_current_prop(&ir, BIDI_PROP_ON);
    787		}
    788	}
    789
    790	/* W7 */
    791	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    792	while (!ir_advance(&ir)) {
    793		if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
    794		    ir_get_last_strong_prop(&ir) == BIDI_PROP_L) {
    795			ir_set_current_prop(&ir, BIDI_PROP_L);
    796		}
    797	}
    798
    799	/* N0 */
    800	preprocess_bracket_pairs(buf, buflen, off, paragraph_level);
    801
    802	/* N1 */
    803	sequence_end = SIZE_MAX;
    804	sequence_prop = NUM_BIDI_PROPS;
    805	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    806	while (!ir_advance(&ir)) {
    807		if (sequence_end == SIZE_MAX) {
    808			prop = ir_get_current_prop(&ir);
    809
    810			if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
    811			    prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
    812			    prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
    813			    prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
    814				/* the current character is an NI (neutral
    815				 * or isolate) */
    816
    817				/* scan ahead to the end of the NI-sequence
    818				 */
    819				ir_init(buf, buflen, ir.cur.off,
    820				        paragraph_level, (ir.cur.off > off),
    821				        &tmp);
    822				while (!ir_advance(&tmp)) {
    823					prop = ir_get_next_prop(&tmp);
    824
    825					if (prop != BIDI_PROP_B &&
    826					    prop != BIDI_PROP_S &&
    827					    prop != BIDI_PROP_WS &&
    828					    prop != BIDI_PROP_ON &&
    829					    prop != BIDI_PROP_FSI &&
    830					    prop != BIDI_PROP_LRI &&
    831					    prop != BIDI_PROP_RLI &&
    832					    prop != BIDI_PROP_PDI) {
    833						break;
    834					}
    835				}
    836
    837				/*
    838				 * check what follows and see if the text
    839				 * has the same direction on both sides
    840				 */
    841				if (ir_get_previous_prop(&ir) == BIDI_PROP_L &&
    842				    ir_get_next_prop(&tmp) == BIDI_PROP_L) {
    843					sequence_end = tmp.cur.off;
    844					sequence_prop = BIDI_PROP_L;
    845				} else if ((ir_get_previous_prop(&ir) ==
    846				                    BIDI_PROP_R ||
    847				            ir_get_previous_prop(&ir) ==
    848				                    BIDI_PROP_EN ||
    849				            ir_get_previous_prop(&ir) ==
    850				                    BIDI_PROP_AN) &&
    851				           (ir_get_next_prop(&tmp) ==
    852				                    BIDI_PROP_R ||
    853				            ir_get_next_prop(&tmp) ==
    854				                    BIDI_PROP_EN ||
    855				            ir_get_next_prop(&tmp) ==
    856				                    BIDI_PROP_AN)) {
    857					sequence_end = tmp.cur.off;
    858					sequence_prop = BIDI_PROP_R;
    859				}
    860			}
    861		}
    862
    863		if (sequence_end != SIZE_MAX) {
    864			if (ir.cur.off <= sequence_end) {
    865				ir_set_current_prop(&ir, sequence_prop);
    866			} else {
    867				/* end of sequence, reset */
    868				sequence_end = SIZE_MAX;
    869				sequence_prop = NUM_BIDI_PROPS;
    870			}
    871		}
    872	}
    873
    874	/* N2 */
    875	ir_init(buf, buflen, off, paragraph_level, false, &ir);
    876	while (!ir_advance(&ir)) {
    877		prop = ir_get_current_prop(&ir);
    878
    879		if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
    880		    prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
    881		    prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
    882		    prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
    883			/* N2 */
    884			if (ir_get_current_level(&ir) % 2 == 0) {
    885				/* even embedding level */
    886				ir_set_current_prop(&ir, BIDI_PROP_L);
    887			} else {
    888				/* odd embedding level */
    889				ir_set_current_prop(&ir, BIDI_PROP_R);
    890			}
    891		}
    892	}
    893
    894	return 0;
    895}
    896
    897static uint_least8_t
    898get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen)
    899{
    900	enum bidi_property prop;
    901	int_least8_t isolate_level;
    902	size_t stateoff;
    903
    904	/* determine paragraph level (rules P1-P3) and terminate on PDI */
    905	for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++) {
    906		prop = (uint_least8_t)get_state(STATE_PROP, state[stateoff]);
    907
    908		if (prop == BIDI_PROP_PDI && isolate_level == 0) {
    909			/*
    910			 * we are in a FSI-subsection of a paragraph and
    911			 * matched with the terminating PDI
    912			 */
    913			break;
    914		}
    915
    916		/* BD8/BD9 */
    917		if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
    918		     prop == BIDI_PROP_FSI) &&
    919		    isolate_level < MAX_DEPTH) {
    920			/* we hit an isolate initiator, increment counter */
    921			isolate_level++;
    922		} else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
    923			isolate_level--;
    924		}
    925
    926		/* P2 */
    927		if (isolate_level > 0) {
    928			continue;
    929		}
    930
    931		/* P3 */
    932		if (prop == BIDI_PROP_L) {
    933			return 0;
    934		} else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
    935			return 1;
    936		}
    937	}
    938
    939	return 0;
    940}
    941
    942static inline uint_least8_t
    943get_bidi_property(uint_least32_t cp)
    944{
    945	if (likely(cp <= 0x10FFFF)) {
    946		return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
    947		       0x1F /* 00011111 */;
    948	} else {
    949		return BIDI_PROP_L;
    950	}
    951}
    952
    953static uint_least8_t
    954get_paragraph_level(enum grapheme_bidirectional_direction override,
    955                    const HERODOTUS_READER *r)
    956{
    957	HERODOTUS_READER tmp;
    958	enum bidi_property prop;
    959	int_least8_t isolate_level;
    960	uint_least32_t cp;
    961
    962	/* check overrides first according to rule HL1 */
    963	if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
    964		return 0;
    965	} else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
    966		return 1;
    967	}
    968
    969	/* copy reader into temporary reader */
    970	herodotus_reader_copy(r, &tmp);
    971
    972	/* determine paragraph level (rules P1-P3) */
    973	for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) ==
    974	                        HERODOTUS_STATUS_SUCCESS;) {
    975		prop = get_bidi_property(cp);
    976
    977		/* BD8/BD9 */
    978		if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
    979		     prop == BIDI_PROP_FSI) &&
    980		    isolate_level < MAX_DEPTH) {
    981			/* we hit an isolate initiator, increment counter */
    982			isolate_level++;
    983		} else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
    984			isolate_level--;
    985		}
    986
    987		/* P2 */
    988		if (isolate_level > 0) {
    989			continue;
    990		}
    991
    992		/* P3 */
    993		if (prop == BIDI_PROP_L) {
    994			return 0;
    995		} else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
    996			return 1;
    997		}
    998	}
    999
   1000	return 0;
   1001}
   1002
   1003static void
   1004preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf,
   1005                     size_t buflen)
   1006{
   1007	enum bidi_property prop;
   1008	int_least8_t level;
   1009
   1010	struct {
   1011		int_least8_t level;
   1012		enum grapheme_bidirectional_direction override;
   1013		bool directional_isolate;
   1014	} directional_status[MAX_DEPTH + 2], *dirstat = directional_status;
   1015
   1016	size_t overflow_isolate_count, overflow_embedding_count,
   1017		valid_isolate_count, bufoff, i, runsince;
   1018
   1019	/* X1 */
   1020	dirstat->level = (int_least8_t)paragraph_level;
   1021	dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1022	dirstat->directional_isolate = false;
   1023	overflow_isolate_count = overflow_embedding_count =
   1024		valid_isolate_count = 0;
   1025
   1026	for (bufoff = 0; bufoff < buflen; bufoff++) {
   1027		prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
   1028
   1029		/* set paragraph level we need for line-level-processing */
   1030		set_state(STATE_PARAGRAPH_LEVEL, paragraph_level,
   1031		          &(buf[bufoff]));
   1032again:
   1033		if (prop == BIDI_PROP_RLE) {
   1034			/* X2 */
   1035			if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
   1036			            MAX_DEPTH &&
   1037			    overflow_isolate_count == 0 &&
   1038			    overflow_embedding_count == 0) {
   1039				/* valid RLE */
   1040				dirstat++;
   1041				dirstat->level =
   1042					(dirstat - 1)->level +
   1043					((dirstat - 1)->level % 2 != 0) + 1;
   1044				dirstat->override =
   1045					GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1046				dirstat->directional_isolate = false;
   1047			} else {
   1048				/* overflow RLE */
   1049				overflow_embedding_count +=
   1050					(overflow_isolate_count == 0);
   1051			}
   1052		} else if (prop == BIDI_PROP_LRE) {
   1053			/* X3 */
   1054			if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
   1055			            MAX_DEPTH &&
   1056			    overflow_isolate_count == 0 &&
   1057			    overflow_embedding_count == 0) {
   1058				/* valid LRE */
   1059				dirstat++;
   1060				dirstat->level =
   1061					(dirstat - 1)->level +
   1062					((dirstat - 1)->level % 2 == 0) + 1;
   1063				dirstat->override =
   1064					GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1065				dirstat->directional_isolate = false;
   1066			} else {
   1067				/* overflow LRE */
   1068				overflow_embedding_count +=
   1069					(overflow_isolate_count == 0);
   1070			}
   1071		} else if (prop == BIDI_PROP_RLO) {
   1072			/* X4 */
   1073			if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
   1074			            MAX_DEPTH &&
   1075			    overflow_isolate_count == 0 &&
   1076			    overflow_embedding_count == 0) {
   1077				/* valid RLO */
   1078				dirstat++;
   1079				dirstat->level =
   1080					(dirstat - 1)->level +
   1081					((dirstat - 1)->level % 2 != 0) + 1;
   1082				dirstat->override =
   1083					GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
   1084				dirstat->directional_isolate = false;
   1085			} else {
   1086				/* overflow RLO */
   1087				overflow_embedding_count +=
   1088					(overflow_isolate_count == 0);
   1089			}
   1090		} else if (prop == BIDI_PROP_LRO) {
   1091			/* X5 */
   1092			if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
   1093			            MAX_DEPTH &&
   1094			    overflow_isolate_count == 0 &&
   1095			    overflow_embedding_count == 0) {
   1096				/* valid LRE */
   1097				dirstat++;
   1098				dirstat->level =
   1099					(dirstat - 1)->level +
   1100					((dirstat - 1)->level % 2 == 0) + 1;
   1101				dirstat->override =
   1102					GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
   1103				dirstat->directional_isolate = false;
   1104			} else {
   1105				/* overflow LRO */
   1106				overflow_embedding_count +=
   1107					(overflow_isolate_count == 0);
   1108			}
   1109		} else if (prop == BIDI_PROP_RLI) {
   1110			/* X5a */
   1111			set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
   1112			if (dirstat->override ==
   1113			    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
   1114				set_state(STATE_PROP, BIDI_PROP_L,
   1115				          &(buf[bufoff]));
   1116			} else if (dirstat->override ==
   1117			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
   1118				set_state(STATE_PROP, BIDI_PROP_R,
   1119				          &(buf[bufoff]));
   1120			}
   1121
   1122			if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
   1123			            MAX_DEPTH &&
   1124			    overflow_isolate_count == 0 &&
   1125			    overflow_embedding_count == 0) {
   1126				/* valid RLI */
   1127				valid_isolate_count++;
   1128
   1129				dirstat++;
   1130				dirstat->level =
   1131					(dirstat - 1)->level +
   1132					((dirstat - 1)->level % 2 != 0) + 1;
   1133				dirstat->override =
   1134					GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1135				dirstat->directional_isolate = true;
   1136			} else {
   1137				/* overflow RLI */
   1138				overflow_isolate_count++;
   1139			}
   1140		} else if (prop == BIDI_PROP_LRI) {
   1141			/* X5b */
   1142			set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
   1143			if (dirstat->override ==
   1144			    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
   1145				set_state(STATE_PROP, BIDI_PROP_L,
   1146				          &(buf[bufoff]));
   1147			} else if (dirstat->override ==
   1148			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
   1149				set_state(STATE_PROP, BIDI_PROP_R,
   1150				          &(buf[bufoff]));
   1151			}
   1152
   1153			if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
   1154			            MAX_DEPTH &&
   1155			    overflow_isolate_count == 0 &&
   1156			    overflow_embedding_count == 0) {
   1157				/* valid LRI */
   1158				valid_isolate_count++;
   1159
   1160				dirstat++;
   1161				dirstat->level =
   1162					(dirstat - 1)->level +
   1163					((dirstat - 1)->level % 2 == 0) + 1;
   1164				dirstat->override =
   1165					GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
   1166				dirstat->directional_isolate = true;
   1167			} else {
   1168				/* overflow LRI */
   1169				overflow_isolate_count++;
   1170			}
   1171		} else if (prop == BIDI_PROP_FSI) {
   1172			/* X5c */
   1173			if (get_isolated_paragraph_level(
   1174				    buf + (bufoff + 1),
   1175				    buflen - (bufoff + 1)) == 1) {
   1176				prop = BIDI_PROP_RLI;
   1177				goto again;
   1178			} else { /* ... == 0 */
   1179				prop = BIDI_PROP_LRI;
   1180				goto again;
   1181			}
   1182		} else if (prop != BIDI_PROP_B && prop != BIDI_PROP_BN &&
   1183		           prop != BIDI_PROP_PDF && prop != BIDI_PROP_PDI) {
   1184			/* X6 */
   1185			set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
   1186			if (dirstat->override ==
   1187			    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
   1188				set_state(STATE_PROP, BIDI_PROP_L,
   1189				          &(buf[bufoff]));
   1190			} else if (dirstat->override ==
   1191			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
   1192				set_state(STATE_PROP, BIDI_PROP_R,
   1193				          &(buf[bufoff]));
   1194			}
   1195		} else if (prop == BIDI_PROP_PDI) {
   1196			/* X6a */
   1197			if (overflow_isolate_count > 0) {
   1198				/* PDI matches an overflow isolate initiator
   1199				 */
   1200				overflow_isolate_count--;
   1201			} else if (valid_isolate_count > 0) {
   1202				/* PDI matches a normal isolate initiator */
   1203				overflow_embedding_count = 0;
   1204				while (dirstat->directional_isolate == false &&
   1205				       dirstat > directional_status) {
   1206					/*
   1207					 * we are safe here as given the
   1208					 * valid isolate count is positive
   1209					 * there must be a stack-entry
   1210					 * with positive directional
   1211					 * isolate status, but we take
   1212					 * no chances and include an
   1213					 * explicit check
   1214					 *
   1215					 * POSSIBLE OPTIMIZATION: Whenever
   1216					 * we push on the stack, check if it
   1217					 * has the directional isolate
   1218					 * status true and store a pointer
   1219					 * to it so we can jump to it very
   1220					 * quickly.
   1221					 */
   1222					dirstat--;
   1223				}
   1224
   1225				/*
   1226				 * as above, the following check is not
   1227				 * necessary, given we are guaranteed to
   1228				 * have at least one stack entry left,
   1229				 * but it's better to be safe
   1230				 */
   1231				if (dirstat > directional_status) {
   1232					dirstat--;
   1233				}
   1234				valid_isolate_count--;
   1235			}
   1236
   1237			set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
   1238			if (dirstat->override ==
   1239			    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
   1240				set_state(STATE_PROP, BIDI_PROP_L,
   1241				          &(buf[bufoff]));
   1242			} else if (dirstat->override ==
   1243			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
   1244				set_state(STATE_PROP, BIDI_PROP_R,
   1245				          &(buf[bufoff]));
   1246			}
   1247		} else if (prop == BIDI_PROP_PDF) {
   1248			/* X7 */
   1249			if (overflow_isolate_count > 0) {
   1250				/* do nothing */
   1251			} else if (overflow_embedding_count > 0) {
   1252				overflow_embedding_count--;
   1253			} else if (dirstat->directional_isolate == false &&
   1254			           dirstat > directional_status) {
   1255				dirstat--;
   1256			}
   1257		} else if (prop == BIDI_PROP_B) {
   1258			/* X8 */
   1259			set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
   1260		}
   1261
   1262		/* X9 */
   1263		if (prop == BIDI_PROP_RLE || prop == BIDI_PROP_LRE ||
   1264		    prop == BIDI_PROP_RLO || prop == BIDI_PROP_LRO ||
   1265		    prop == BIDI_PROP_PDF || prop == BIDI_PROP_BN) {
   1266			set_state(STATE_LEVEL, -1, &(buf[bufoff]));
   1267		}
   1268	}
   1269
   1270	/* X10 (W1-W7, N0-N2) */
   1271	for (bufoff = 0; bufoff < buflen; bufoff++) {
   1272		if (get_state(STATE_VISITED, buf[bufoff]) == 0 &&
   1273		    get_state(STATE_LEVEL, buf[bufoff]) != -1) {
   1274			bufoff += preprocess_isolating_run_sequence(
   1275				buf, buflen, bufoff, paragraph_level);
   1276		}
   1277	}
   1278
   1279	/*
   1280	 * I1-I2 (given our sequential approach to processing the
   1281	 * isolating run sequences, we apply this rule separately)
   1282	 */
   1283	for (bufoff = 0; bufoff < buflen; bufoff++) {
   1284		level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
   1285		prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
   1286
   1287		if (level % 2 == 0) {
   1288			/* even level */
   1289			if (prop == BIDI_PROP_R) {
   1290				set_state(STATE_LEVEL, level + 1,
   1291				          &(buf[bufoff]));
   1292			} else if (prop == BIDI_PROP_AN ||
   1293			           prop == BIDI_PROP_EN) {
   1294				set_state(STATE_LEVEL, level + 2,
   1295				          &(buf[bufoff]));
   1296			}
   1297		} else {
   1298			/* odd level */
   1299			if (prop == BIDI_PROP_L || prop == BIDI_PROP_EN ||
   1300			    prop == BIDI_PROP_AN) {
   1301				set_state(STATE_LEVEL, level + 1,
   1302				          &(buf[bufoff]));
   1303			}
   1304		}
   1305	}
   1306
   1307	/* L1 (rules 1-3) */
   1308	runsince = SIZE_MAX;
   1309	for (bufoff = 0; bufoff < buflen; bufoff++) {
   1310		level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
   1311		prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
   1312		                                buf[bufoff]);
   1313
   1314		if (level == -1) {
   1315			/* ignored character */
   1316			continue;
   1317		}
   1318
   1319		/* rules 1 and 2 */
   1320		if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
   1321			set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
   1322		}
   1323
   1324		/* rule 3 */
   1325		if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
   1326		    prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
   1327		    prop == BIDI_PROP_PDI) {
   1328			if (runsince == SIZE_MAX) {
   1329				/* a new run has begun */
   1330				runsince = bufoff;
   1331			}
   1332		} else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) &&
   1333		           runsince != SIZE_MAX) {
   1334			/*
   1335			 * we hit a segment or paragraph separator in a
   1336			 * sequence, reset sequence-levels
   1337			 */
   1338			for (i = runsince; i < bufoff; i++) {
   1339				if (get_state(STATE_LEVEL, buf[i]) != -1) {
   1340					set_state(STATE_LEVEL, paragraph_level,
   1341					          &(buf[i]));
   1342				}
   1343			}
   1344			runsince = SIZE_MAX;
   1345		} else {
   1346			/* sequence ended */
   1347			runsince = SIZE_MAX;
   1348		}
   1349	}
   1350	if (runsince != SIZE_MAX) {
   1351		/*
   1352		 * this is the end of the paragraph and we
   1353		 * are in a run
   1354		 */
   1355		for (i = runsince; i < buflen; i++) {
   1356			if (get_state(STATE_LEVEL, buf[i]) != -1) {
   1357				set_state(STATE_LEVEL, paragraph_level,
   1358				          &(buf[i]));
   1359			}
   1360		}
   1361		runsince = SIZE_MAX;
   1362	}
   1363}
   1364
   1365static inline uint_least8_t
   1366get_bidi_bracket_off(uint_least32_t cp)
   1367{
   1368	if (likely(cp <= 0x10FFFF)) {
   1369		return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] +
   1370		                                   (cp & 0xff)]) >>
   1371		                       5);
   1372	} else {
   1373		return 0;
   1374	}
   1375}
   1376
   1377static size_t
   1378preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override,
   1379           uint_least32_t *buf, size_t buflen,
   1380           enum grapheme_bidirectional_direction *resolved)
   1381{
   1382	HERODOTUS_READER tmp;
   1383	size_t bufoff, bufsize, paragraph_len;
   1384	uint_least32_t cp;
   1385	uint_least8_t paragraph_level;
   1386
   1387	/* determine length and level of the paragraph */
   1388	herodotus_reader_copy(r, &tmp);
   1389	for (; herodotus_read_codepoint(&tmp, true, &cp) ==
   1390	       HERODOTUS_STATUS_SUCCESS;) {
   1391		/* break on paragraph separator */
   1392		if (get_bidi_property(cp) == BIDI_PROP_B) {
   1393			break;
   1394		}
   1395	}
   1396	paragraph_len = herodotus_reader_number_read(&tmp);
   1397	paragraph_level = get_paragraph_level(override, r);
   1398
   1399	if (resolved != NULL) {
   1400		/* store resolved paragraph level in output variable */
   1401		/* TODO use enum-type */
   1402		*resolved = (paragraph_level == 0) ?
   1403		                    GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR :
   1404		                    GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
   1405	}
   1406
   1407	if (buf == NULL) {
   1408		/* see below for return value reasoning */
   1409		return paragraph_len;
   1410	}
   1411
   1412	/*
   1413	 * the first step is to determine the bidirectional properties
   1414	 * and store them in the buffer
   1415	 */
   1416	for (bufoff = 0;
   1417	     bufoff < paragraph_len &&
   1418	     herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;
   1419	     bufoff++) {
   1420		if (bufoff < buflen) {
   1421			/*
   1422			 * actually only do something when we have
   1423			 * space in the level-buffer. We continue
   1424			 * the iteration to be able to give a good
   1425			 * return value
   1426			 */
   1427			set_state(STATE_PROP,
   1428			          (uint_least8_t)get_bidi_property(cp),
   1429			          &(buf[bufoff]));
   1430			set_state(STATE_BRACKET_OFF, get_bidi_bracket_off(cp),
   1431			          &(buf[bufoff]));
   1432			set_state(STATE_LEVEL, 0, &(buf[bufoff]));
   1433			set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff]));
   1434			set_state(STATE_VISITED, 0, &(buf[bufoff]));
   1435			set_state(STATE_PRESERVED_PROP,
   1436			          (uint_least8_t)get_bidi_property(cp),
   1437			          &(buf[bufoff]));
   1438		}
   1439	}
   1440	bufsize = herodotus_reader_number_read(r);
   1441
   1442	for (bufoff = 0; bufoff < bufsize; bufoff++) {
   1443		if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B &&
   1444		    bufoff != bufsize - 1) {
   1445			continue;
   1446		}
   1447
   1448		/*
   1449		 * we either encountered a paragraph terminator or this
   1450		 * is the last character in the string.
   1451		 * Call the paragraph handler on the paragraph, including
   1452		 * the terminating character or last character of the
   1453		 * string respectively
   1454		 */
   1455		preprocess_paragraph(paragraph_level, buf, bufoff + 1);
   1456		break;
   1457	}
   1458
   1459	/*
   1460	 * we return the number of total bytes read, as the function
   1461	 * should indicate if the given level-buffer is too small
   1462	 */
   1463	return herodotus_reader_number_read(r);
   1464}
   1465
   1466size_t
   1467grapheme_bidirectional_preprocess_paragraph(
   1468	const uint_least32_t *src, size_t srclen,
   1469	enum grapheme_bidirectional_direction override, uint_least32_t *dest,
   1470	size_t destlen, enum grapheme_bidirectional_direction *resolved)
   1471{
   1472	HERODOTUS_READER r;
   1473
   1474	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
   1475
   1476	return preprocess(&r, override, dest, destlen, resolved);
   1477}
   1478
   1479static inline size_t
   1480get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen,
   1481                          int_least8_t (*get_level)(const void *, size_t),
   1482                          void (*set_level)(void *, size_t, int_least8_t),
   1483                          void *lev, size_t levsize, bool skipignored)
   1484{
   1485	enum bidi_property prop;
   1486	size_t i, levlen, runsince;
   1487	int_least8_t level, runlevel;
   1488
   1489	/* rule L1.4 */
   1490	runsince = SIZE_MAX;
   1491	runlevel = -1;
   1492	for (i = 0, levlen = 0; i < linelen; i++) {
   1493		level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]);
   1494		prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
   1495		                                linedata[i]);
   1496
   1497		/* write level into level array if we still have space */
   1498		if (level != -1 || skipignored == false) {
   1499			if (levlen <= levsize) {
   1500				set_level(lev, levlen, level);
   1501			}
   1502			levlen++;
   1503		}
   1504
   1505		if (level == -1) {
   1506			/* ignored character */
   1507			continue;
   1508		}
   1509
   1510		if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
   1511		    prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
   1512		    prop == BIDI_PROP_PDI) {
   1513			if (runsince == SIZE_MAX) {
   1514				/* a new run has begun */
   1515				runsince = levlen - 1; /* levlen > 0 */
   1516				runlevel = (int_least8_t)get_state(
   1517					STATE_PARAGRAPH_LEVEL, linedata[i]);
   1518			}
   1519		} else {
   1520			/* sequence ended */
   1521			runsince = SIZE_MAX;
   1522			runlevel = -1;
   1523		}
   1524	}
   1525	if (runsince != SIZE_MAX) {
   1526		/*
   1527		 * we hit the end of the line but were in a run;
   1528		 * reset the line levels to the paragraph level
   1529		 */
   1530		for (i = runsince; i < MIN(linelen, levlen); i++) {
   1531			if (get_level(lev, i) != -1) {
   1532				set_level(lev, i, runlevel);
   1533			}
   1534		}
   1535	}
   1536
   1537	return levlen;
   1538}
   1539
   1540static inline int_least8_t
   1541get_level_int8(const void *lev, size_t off)
   1542{
   1543	return ((const int_least8_t *)lev)[off];
   1544}
   1545
   1546static inline void
   1547set_level_int8(void *lev, size_t off, int_least8_t value)
   1548{
   1549	((int_least8_t *)lev)[off] = value;
   1550}
   1551
   1552size_t
   1553grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
   1554                                                 size_t linelen,
   1555                                                 int_least8_t *lev,
   1556                                                 size_t levlen)
   1557{
   1558	return get_line_embedding_levels(linedata, linelen, get_level_int8,
   1559	                                 set_level_int8, lev, levlen, false);
   1560}
   1561
   1562static inline int_least8_t
   1563get_level_uint32(const void *lev, size_t off)
   1564{
   1565	return (int_least8_t)((((const uint_least32_t *)lev)[off] &
   1566	                       UINT32_C(0x1FE00000)) >>
   1567	                      21) -
   1568	       1;
   1569}
   1570
   1571static inline void
   1572set_level_uint32(void *lev, size_t off, int_least8_t value)
   1573{
   1574	((uint_least32_t *)lev)[off] ^=
   1575		((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000);
   1576	((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21;
   1577}
   1578
   1579static inline int_least16_t
   1580get_mirror_offset(uint_least32_t cp)
   1581{
   1582	if (cp <= UINT32_C(0x10FFFF)) {
   1583		return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)];
   1584	} else {
   1585		return 0;
   1586	}
   1587}
   1588
   1589size_t
   1590grapheme_bidirectional_reorder_line(const uint_least32_t *line,
   1591                                    const uint_least32_t *linedata,
   1592                                    size_t linelen, uint_least32_t *output,
   1593                                    size_t outputsize)
   1594{
   1595	size_t i, outputlen, first, last, j, k, l /*, laststart*/;
   1596	int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0;
   1597	uint_least32_t tmp;
   1598
   1599	/* write output characters (and apply possible mirroring) */
   1600	for (i = 0, outputlen = 0; i < linelen; i++) {
   1601		if (get_state(STATE_LEVEL, linedata[i]) != -1) {
   1602			if (outputlen < outputsize) {
   1603				output[outputlen] =
   1604					(uint_least32_t)((int_least32_t)
   1605				                                 line[i] +
   1606				                         get_mirror_offset(
   1607								 line[i]));
   1608			}
   1609			outputlen++;
   1610		}
   1611	}
   1612	if (outputlen >= outputsize) {
   1613		/* clear output buffer */
   1614		for (i = 0; i < outputsize; i++) {
   1615			output[i] = GRAPHEME_INVALID_CODEPOINT;
   1616		}
   1617
   1618		/* return required size */
   1619		return outputlen;
   1620	}
   1621
   1622	/*
   1623	 * write line embedding levels as metadata and codepoints into the
   1624	 * output
   1625	 */
   1626	get_line_embedding_levels(linedata, linelen, get_level_uint32,
   1627	                          set_level_uint32, output, outputsize, true);
   1628
   1629	/* determine level range */
   1630	for (i = 0; i < outputlen; i++) {
   1631		level = get_level_uint32(output, i);
   1632
   1633		if (level == -1) {
   1634			/* ignored character */
   1635			continue;
   1636		}
   1637
   1638		if (level % 2 == 1 && level < min_odd_level) {
   1639			min_odd_level = level;
   1640		}
   1641		if (level > max_level) {
   1642			max_level = level;
   1643		}
   1644	}
   1645
   1646	for (level = max_level; level >= min_odd_level /* > 0 */; level--) {
   1647		for (i = 0; i < outputlen; i++) {
   1648			if (get_level_uint32(output, i) >= level) {
   1649				/*
   1650				 * the current character has the desired level
   1651				 */
   1652				first = last = i;
   1653
   1654				/* find the end of the level-sequence */
   1655				for (i++; i < outputlen; i++) {
   1656					if (get_level_uint32(output, i) >=
   1657					    level) {
   1658						/* the sequence continues */
   1659						last = i;
   1660					} else {
   1661						break;
   1662					}
   1663				}
   1664
   1665				/* invert the sequence first..last respecting
   1666				 * grapheme clusters
   1667				 *
   1668				 * The standard only speaks of combining marks
   1669				 * inversion, but we should in the perfect case
   1670				 * respect _all_ grapheme clusters, which we do
   1671				 * here!
   1672				 */
   1673
   1674				/* mark grapheme cluster breaks */
   1675				for (j = first; j <= last;
   1676				     j += grapheme_next_character_break(
   1677					     line + j, outputlen - j)) {
   1678					/*
   1679					 * we use a special trick here: The
   1680					 * first 21 bits of the state are filled
   1681					 * with the codepoint, the next 8 bits
   1682					 * are used for the level, so we can use
   1683					 * the 30th bit to mark the grapheme
   1684					 * cluster breaks. This allows us to
   1685					 * reinvert the grapheme clusters into
   1686					 * the proper direction later.
   1687					 */
   1688					output[j] |= UINT32_C(1) << 29;
   1689				}
   1690
   1691				/* global inversion */
   1692				for (k = first, l = last; k < l; k++, l--) {
   1693					/* swap */
   1694					tmp = output[k];
   1695					output[k] = output[l];
   1696					output[l] = tmp;
   1697				}
   1698
   1699				/* grapheme cluster reinversion */
   1700#if 0
   1701				for (j = first, laststart = first; j <= last;
   1702				     j++) {
   1703					if (output[j] & (UINT32_C(1) << 29)) {
   1704						/* we hit a mark! given the
   1705						 * grapheme cluster is inverted,
   1706						 * this means that the cluster
   1707						 * ended and we now reinvert it
   1708						 * again
   1709						 */
   1710						for (k = laststart, l = j;
   1711						     k < l; k++, l--) {
   1712							/* swap */
   1713							tmp = output[k];
   1714							output[k] = output[l];
   1715							output[l] = tmp;
   1716						}
   1717						laststart = j + 1;
   1718					}
   1719				}
   1720#endif
   1721
   1722				/* unmark grapheme cluster breaks */
   1723				for (j = first; j <= last; j++) {
   1724					output[j] ^= output[j] &
   1725					             UINT32_C(0x20000000);
   1726				}
   1727			}
   1728		}
   1729	}
   1730
   1731	/* remove embedding level metadata */
   1732	for (i = 0; i < outputlen; i++) {
   1733		output[i] ^= output[i] & UINT32_C(0x1FE00000);
   1734	}
   1735
   1736	return outputlen;
   1737}