libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

line.c (14397B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stdbool.h>
      3#include <stddef.h>
      4
      5#include "../gen/line.h"
      6#include "../grapheme.h"
      7#include "util.h"
      8
      9static inline enum line_break_property
     10get_break_prop(uint_least32_t cp)
     11{
     12	if (likely(cp <= UINT32_C(0x10FFFF))) {
     13		return (enum line_break_property)
     14			line_break_minor[line_break_major[cp >> 8] +
     15		                         (cp & 0xff)];
     16	} else {
     17		return LINE_BREAK_PROP_AL;
     18	}
     19}
     20
     21static size_t
     22next_line_break(HERODOTUS_READER *r)
     23{
     24	HERODOTUS_READER tmp;
     25	enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
     26		last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
     27	uint_least32_t cp;
     28	uint_least8_t lb25_level = 0;
     29	bool lb21a_flag = false, ri_even = true;
     30
     31	/*
     32	 * Apply line breaking algorithm (UAX #14), see
     33	 * https://unicode.org/reports/tr14/#Algorithm and tailoring
     34	 * https://unicode.org/reports/tr14/#Examples (example 7),
     35	 * given the automatic test-cases implement this example for
     36	 * better number handling.
     37	 *
     38	 */
     39
     40	/*
     41	 * Initialize the different properties such that we have
     42	 * a good state after the state-update in the loop
     43	 */
     44	last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
     45	last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
     46
     47	for (herodotus_read_codepoint(r, true, &cp),
     48	     cp0_prop = get_break_prop(cp);
     49	     herodotus_read_codepoint(r, false, &cp) ==
     50	     HERODOTUS_STATUS_SUCCESS;
     51	     herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
     52		/* get property of the right codepoint */
     53		cp1_prop = get_break_prop(cp);
     54
     55		/* update retention-states */
     56
     57		/*
     58		 * store the last observed non-CM-or-ZWJ-property for
     59		 * LB9 and following.
     60		 */
     61		if (cp0_prop != LINE_BREAK_PROP_CM &&
     62		    cp0_prop != LINE_BREAK_PROP_ZWJ) {
     63			/*
     64			 * check if the property we are overwriting now is an
     65			 * HL. If so, we set the LB21a-flag which depends on
     66			 * this knowledge.
     67			 */
     68			lb21a_flag =
     69				(last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
     70
     71			/* check regional indicator state */
     72			if (cp0_prop == LINE_BREAK_PROP_RI) {
     73				/*
     74				 * The property we just shifted in is
     75				 * a regional indicator, increasing the
     76				 * number of consecutive RIs on the left
     77				 * side of the breakpoint by one, changing
     78				 * the oddness.
     79				 *
     80				 */
     81				ri_even = !ri_even;
     82			} else {
     83				/*
     84				 * We saw no regional indicator, so the
     85				 * number of consecutive RIs on the left
     86				 * side of the breakpoint is zero, which
     87				 * is an even number.
     88				 *
     89				 */
     90				ri_even = true;
     91			}
     92
     93			/*
     94			 * Here comes a bit of magic. The tailored rule
     95			 * LB25 (using example 7) has a very complicated
     96			 * left-hand-side-rule of the form
     97			 *
     98			 *  NU (NU | SY | IS)* (CL | CP)?
     99			 *
    100			 * but instead of backtracking, we keep the state
    101			 * as some kind of "power level" in the variable
    102			 *
    103			 *  lb25_level
    104			 *
    105			 * that goes from 0 to 3
    106			 *
    107			 *  0: we are not in the sequence
    108			 *  1: we have one NU to the left of the middle
    109			 *     spot
    110			 *  2: we have one NU and one or more (NU | SY | IS)
    111			 *     to the left of the middle spot
    112			 *  3: we have one NU, zero or more (NU | SY | IS)
    113			 *     and one (CL | CP) to the left of the middle
    114			 *     spot
    115			 */
    116			if ((lb25_level == 0 || lb25_level == 1) &&
    117			    cp0_prop == LINE_BREAK_PROP_NU) {
    118				/* sequence has begun */
    119				lb25_level = 1;
    120			} else if ((lb25_level == 1 || lb25_level == 2) &&
    121			           (cp0_prop == LINE_BREAK_PROP_NU ||
    122			            cp0_prop == LINE_BREAK_PROP_SY ||
    123			            cp0_prop == LINE_BREAK_PROP_IS)) {
    124				/* (NU | SY | IS) sequence begins or continued
    125				 */
    126				lb25_level = 2;
    127			} else if (
    128				(lb25_level == 1 || lb25_level == 2) &&
    129				(cp0_prop == LINE_BREAK_PROP_CL ||
    130			         cp0_prop ==
    131			                 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
    132			         cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
    133				/* CL or CP at the end of the sequence */
    134				lb25_level = 3;
    135			} else {
    136				/* sequence broke */
    137				lb25_level = 0;
    138			}
    139
    140			last_non_cm_or_zwj_prop = cp0_prop;
    141		}
    142
    143		/*
    144		 * store the last observed non-SP-property for LB8, LB14,
    145		 * LB15, LB16 and LB17. LB8 gets its own unskipped property,
    146		 * whereas the others build on top of the CM-ZWJ-skipped
    147		 * properties as they come after LB9
    148		 */
    149		if (cp0_prop != LINE_BREAK_PROP_SP) {
    150			last_non_sp_prop = cp0_prop;
    151		}
    152		if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
    153			last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
    154		}
    155
    156		/* apply the algorithm */
    157
    158		/* LB4 */
    159		if (cp0_prop == LINE_BREAK_PROP_BK) {
    160			break;
    161		}
    162
    163		/* LB5 */
    164		if (cp0_prop == LINE_BREAK_PROP_CR &&
    165		    cp1_prop == LINE_BREAK_PROP_LF) {
    166			continue;
    167		}
    168		if (cp0_prop == LINE_BREAK_PROP_CR ||
    169		    cp0_prop == LINE_BREAK_PROP_LF ||
    170		    cp0_prop == LINE_BREAK_PROP_NL) {
    171			break;
    172		}
    173
    174		/* LB6 */
    175		if (cp1_prop == LINE_BREAK_PROP_BK ||
    176		    cp1_prop == LINE_BREAK_PROP_CR ||
    177		    cp1_prop == LINE_BREAK_PROP_LF ||
    178		    cp1_prop == LINE_BREAK_PROP_NL) {
    179			continue;
    180		}
    181
    182		/* LB7 */
    183		if (cp1_prop == LINE_BREAK_PROP_SP ||
    184		    cp1_prop == LINE_BREAK_PROP_ZW) {
    185			continue;
    186		}
    187
    188		/* LB8 */
    189		if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
    190			break;
    191		}
    192
    193		/* LB8a */
    194		if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
    195			continue;
    196		}
    197
    198		/* LB9 */
    199		if ((cp0_prop != LINE_BREAK_PROP_BK &&
    200		     cp0_prop != LINE_BREAK_PROP_CR &&
    201		     cp0_prop != LINE_BREAK_PROP_LF &&
    202		     cp0_prop != LINE_BREAK_PROP_NL &&
    203		     cp0_prop != LINE_BREAK_PROP_SP &&
    204		     cp0_prop != LINE_BREAK_PROP_ZW) &&
    205		    (cp1_prop == LINE_BREAK_PROP_CM ||
    206		     cp1_prop == LINE_BREAK_PROP_ZWJ)) {
    207			/*
    208			 * given we skip them, we don't break in such
    209			 * a sequence
    210			 */
    211			continue;
    212		}
    213
    214		/* LB10 is baked into the following rules */
    215
    216		/* LB11 */
    217		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
    218		    cp1_prop == LINE_BREAK_PROP_WJ) {
    219			continue;
    220		}
    221
    222		/* LB12 */
    223		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
    224			continue;
    225		}
    226
    227		/* LB12a */
    228		if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
    229		     last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
    230		     last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
    231		    cp1_prop == LINE_BREAK_PROP_GL) {
    232			continue;
    233		}
    234
    235		/* LB13 (affected by tailoring for LB25, see example 7) */
    236		if (cp1_prop == LINE_BREAK_PROP_EX ||
    237		    (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
    238		     (cp1_prop == LINE_BREAK_PROP_CL ||
    239		      cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
    240		      cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
    241		      cp1_prop == LINE_BREAK_PROP_IS ||
    242		      cp1_prop == LINE_BREAK_PROP_SY))) {
    243			continue;
    244		}
    245
    246		/* LB14 */
    247		if (last_non_sp_cm_or_zwj_prop ==
    248		            LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
    249		    last_non_sp_cm_or_zwj_prop ==
    250		            LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
    251			continue;
    252		}
    253
    254		/* LB15 */
    255		if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
    256		    (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
    257		     cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
    258			continue;
    259		}
    260
    261		/* LB16 */
    262		if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
    263		     last_non_sp_cm_or_zwj_prop ==
    264		             LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
    265		     last_non_sp_cm_or_zwj_prop ==
    266		             LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
    267		    cp1_prop == LINE_BREAK_PROP_NS) {
    268			continue;
    269		}
    270
    271		/* LB17 */
    272		if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
    273		    cp1_prop == LINE_BREAK_PROP_B2) {
    274			continue;
    275		}
    276
    277		/* LB18 */
    278		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
    279			break;
    280		}
    281
    282		/* LB19 */
    283		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
    284		    cp1_prop == LINE_BREAK_PROP_QU) {
    285			continue;
    286		}
    287
    288		/* LB20 */
    289		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
    290		    cp1_prop == LINE_BREAK_PROP_CB) {
    291			break;
    292		}
    293
    294		/* LB21 */
    295		if (cp1_prop == LINE_BREAK_PROP_BA ||
    296		    cp1_prop == LINE_BREAK_PROP_HY ||
    297		    cp1_prop == LINE_BREAK_PROP_NS ||
    298		    last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
    299			continue;
    300		}
    301
    302		/* LB21a */
    303		if (lb21a_flag &&
    304		    (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
    305		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
    306			continue;
    307		}
    308
    309		/* LB21b */
    310		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
    311		    cp1_prop == LINE_BREAK_PROP_HL) {
    312			continue;
    313		}
    314
    315		/* LB22 */
    316		if (cp1_prop == LINE_BREAK_PROP_IN) {
    317			continue;
    318		}
    319
    320		/* LB23 */
    321		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
    322		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
    323		    cp1_prop == LINE_BREAK_PROP_NU) {
    324			continue;
    325		}
    326		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
    327		    (cp1_prop == LINE_BREAK_PROP_AL ||
    328		     cp1_prop == LINE_BREAK_PROP_HL)) {
    329			continue;
    330		}
    331
    332		/* LB23a */
    333		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
    334		    (cp1_prop == LINE_BREAK_PROP_ID ||
    335		     cp1_prop == LINE_BREAK_PROP_EB ||
    336		     cp1_prop == LINE_BREAK_PROP_EM)) {
    337			continue;
    338		}
    339		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
    340		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
    341		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
    342		    cp1_prop == LINE_BREAK_PROP_PO) {
    343			continue;
    344		}
    345
    346		/* LB24 */
    347		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
    348		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
    349		    (cp1_prop == LINE_BREAK_PROP_AL ||
    350		     cp1_prop == LINE_BREAK_PROP_HL)) {
    351			continue;
    352		}
    353		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
    354		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
    355		    (cp1_prop == LINE_BREAK_PROP_PR ||
    356		     cp1_prop == LINE_BREAK_PROP_PO)) {
    357			continue;
    358		}
    359
    360		/* LB25 (tailored with example 7) */
    361		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
    362		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
    363			if (cp1_prop == LINE_BREAK_PROP_NU) {
    364				continue;
    365			}
    366
    367			/* this stupid rule is the reason why we cannot
    368			 * simply have a stateful break-detection between
    369			 * two adjacent codepoints as we have it with
    370			 * characters.
    371			 */
    372			herodotus_reader_copy(r, &tmp);
    373			herodotus_read_codepoint(&tmp, true, &cp);
    374			if (herodotus_read_codepoint(&tmp, true, &cp) ==
    375			            HERODOTUS_STATUS_SUCCESS &&
    376			    (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
    377			     cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
    378			     cp1_prop == LINE_BREAK_PROP_HY)) {
    379				if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
    380					continue;
    381				}
    382			}
    383		}
    384		if ((last_non_cm_or_zwj_prop ==
    385		             LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
    386		     last_non_cm_or_zwj_prop ==
    387		             LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
    388		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
    389		    cp1_prop == LINE_BREAK_PROP_NU) {
    390			continue;
    391		}
    392		if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU ||
    393		                        cp1_prop == LINE_BREAK_PROP_SY ||
    394		                        cp1_prop == LINE_BREAK_PROP_IS)) {
    395			continue;
    396		}
    397		if ((lb25_level == 1 || lb25_level == 2) &&
    398		    (cp1_prop == LINE_BREAK_PROP_NU ||
    399		     cp1_prop == LINE_BREAK_PROP_SY ||
    400		     cp1_prop == LINE_BREAK_PROP_IS ||
    401		     cp1_prop == LINE_BREAK_PROP_CL ||
    402		     cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
    403		     cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
    404			continue;
    405		}
    406		if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
    407		    (cp1_prop == LINE_BREAK_PROP_PO ||
    408		     cp1_prop == LINE_BREAK_PROP_PR)) {
    409			continue;
    410		}
    411
    412		/* LB26 */
    413		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
    414		    (cp1_prop == LINE_BREAK_PROP_JL ||
    415		     cp1_prop == LINE_BREAK_PROP_JV ||
    416		     cp1_prop == LINE_BREAK_PROP_H2 ||
    417		     cp1_prop == LINE_BREAK_PROP_H3)) {
    418			continue;
    419		}
    420		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
    421		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
    422		    (cp1_prop == LINE_BREAK_PROP_JV ||
    423		     cp1_prop == LINE_BREAK_PROP_JT)) {
    424			continue;
    425		}
    426		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
    427		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
    428		    cp1_prop == LINE_BREAK_PROP_JT) {
    429			continue;
    430		}
    431
    432		/* LB27 */
    433		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
    434		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
    435		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
    436		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
    437		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
    438		    cp1_prop == LINE_BREAK_PROP_PO) {
    439			continue;
    440		}
    441		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
    442		    (cp1_prop == LINE_BREAK_PROP_JL ||
    443		     cp1_prop == LINE_BREAK_PROP_JV ||
    444		     cp1_prop == LINE_BREAK_PROP_JT ||
    445		     cp1_prop == LINE_BREAK_PROP_H2 ||
    446		     cp1_prop == LINE_BREAK_PROP_H3)) {
    447			continue;
    448		}
    449
    450		/* LB28 */
    451		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
    452		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
    453		    (cp1_prop == LINE_BREAK_PROP_AL ||
    454		     cp1_prop == LINE_BREAK_PROP_HL)) {
    455			continue;
    456		}
    457
    458		/* LB29 */
    459		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
    460		    (cp1_prop == LINE_BREAK_PROP_AL ||
    461		     cp1_prop == LINE_BREAK_PROP_HL)) {
    462			continue;
    463		}
    464
    465		/* LB30 */
    466		if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
    467		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
    468		     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
    469		    cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
    470			continue;
    471		}
    472		if (last_non_cm_or_zwj_prop ==
    473		            LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
    474		    (cp1_prop == LINE_BREAK_PROP_AL ||
    475		     cp1_prop == LINE_BREAK_PROP_HL ||
    476		     cp1_prop == LINE_BREAK_PROP_NU)) {
    477			continue;
    478		}
    479
    480		/* LB30a */
    481		if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
    482		    cp1_prop == LINE_BREAK_PROP_RI) {
    483			continue;
    484		}
    485
    486		/* LB30b */
    487		if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
    488		    cp1_prop == LINE_BREAK_PROP_EM) {
    489			continue;
    490		}
    491		if (last_non_cm_or_zwj_prop ==
    492		            LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
    493		    cp1_prop == LINE_BREAK_PROP_EM) {
    494			continue;
    495		}
    496
    497		/* LB31 */
    498		break;
    499	}
    500
    501	return herodotus_reader_number_read(r);
    502}
    503
    504size_t
    505grapheme_next_line_break(const uint_least32_t *str, size_t len)
    506{
    507	HERODOTUS_READER r;
    508
    509	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    510
    511	return next_line_break(&r);
    512}
    513
    514size_t
    515grapheme_next_line_break_utf8(const char *str, size_t len)
    516{
    517	HERODOTUS_READER r;
    518
    519	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    520
    521	return next_line_break(&r);
    522}