line.c (14397B)
1/* See LICENSE file for copyright and license details. */ 2#include <stdbool.h> 3#include <stddef.h> 4 5#include "../gen/line.h" 6#include "../grapheme.h" 7#include "util.h" 8 9static inline enum line_break_property 10get_break_prop(uint_least32_t cp) 11{ 12 if (likely(cp <= UINT32_C(0x10FFFF))) { 13 return (enum line_break_property) 14 line_break_minor[line_break_major[cp >> 8] + 15 (cp & 0xff)]; 16 } else { 17 return LINE_BREAK_PROP_AL; 18 } 19} 20 21static size_t 22next_line_break(HERODOTUS_READER *r) 23{ 24 HERODOTUS_READER tmp; 25 enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop, 26 last_non_sp_prop, last_non_sp_cm_or_zwj_prop; 27 uint_least32_t cp; 28 uint_least8_t lb25_level = 0; 29 bool lb21a_flag = false, ri_even = true; 30 31 /* 32 * Apply line breaking algorithm (UAX #14), see 33 * https://unicode.org/reports/tr14/#Algorithm and tailoring 34 * https://unicode.org/reports/tr14/#Examples (example 7), 35 * given the automatic test-cases implement this example for 36 * better number handling. 37 * 38 */ 39 40 /* 41 * Initialize the different properties such that we have 42 * a good state after the state-update in the loop 43 */ 44 last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */ 45 last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS; 46 47 for (herodotus_read_codepoint(r, true, &cp), 48 cp0_prop = get_break_prop(cp); 49 herodotus_read_codepoint(r, false, &cp) == 50 HERODOTUS_STATUS_SUCCESS; 51 herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) { 52 /* get property of the right codepoint */ 53 cp1_prop = get_break_prop(cp); 54 55 /* update retention-states */ 56 57 /* 58 * store the last observed non-CM-or-ZWJ-property for 59 * LB9 and following. 60 */ 61 if (cp0_prop != LINE_BREAK_PROP_CM && 62 cp0_prop != LINE_BREAK_PROP_ZWJ) { 63 /* 64 * check if the property we are overwriting now is an 65 * HL. If so, we set the LB21a-flag which depends on 66 * this knowledge. 67 */ 68 lb21a_flag = 69 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL); 70 71 /* check regional indicator state */ 72 if (cp0_prop == LINE_BREAK_PROP_RI) { 73 /* 74 * The property we just shifted in is 75 * a regional indicator, increasing the 76 * number of consecutive RIs on the left 77 * side of the breakpoint by one, changing 78 * the oddness. 79 * 80 */ 81 ri_even = !ri_even; 82 } else { 83 /* 84 * We saw no regional indicator, so the 85 * number of consecutive RIs on the left 86 * side of the breakpoint is zero, which 87 * is an even number. 88 * 89 */ 90 ri_even = true; 91 } 92 93 /* 94 * Here comes a bit of magic. The tailored rule 95 * LB25 (using example 7) has a very complicated 96 * left-hand-side-rule of the form 97 * 98 * NU (NU | SY | IS)* (CL | CP)? 99 * 100 * but instead of backtracking, we keep the state 101 * as some kind of "power level" in the variable 102 * 103 * lb25_level 104 * 105 * that goes from 0 to 3 106 * 107 * 0: we are not in the sequence 108 * 1: we have one NU to the left of the middle 109 * spot 110 * 2: we have one NU and one or more (NU | SY | IS) 111 * to the left of the middle spot 112 * 3: we have one NU, zero or more (NU | SY | IS) 113 * and one (CL | CP) to the left of the middle 114 * spot 115 */ 116 if ((lb25_level == 0 || lb25_level == 1) && 117 cp0_prop == LINE_BREAK_PROP_NU) { 118 /* sequence has begun */ 119 lb25_level = 1; 120 } else if ((lb25_level == 1 || lb25_level == 2) && 121 (cp0_prop == LINE_BREAK_PROP_NU || 122 cp0_prop == LINE_BREAK_PROP_SY || 123 cp0_prop == LINE_BREAK_PROP_IS)) { 124 /* (NU | SY | IS) sequence begins or continued 125 */ 126 lb25_level = 2; 127 } else if ( 128 (lb25_level == 1 || lb25_level == 2) && 129 (cp0_prop == LINE_BREAK_PROP_CL || 130 cp0_prop == 131 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || 132 cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) { 133 /* CL or CP at the end of the sequence */ 134 lb25_level = 3; 135 } else { 136 /* sequence broke */ 137 lb25_level = 0; 138 } 139 140 last_non_cm_or_zwj_prop = cp0_prop; 141 } 142 143 /* 144 * store the last observed non-SP-property for LB8, LB14, 145 * LB15, LB16 and LB17. LB8 gets its own unskipped property, 146 * whereas the others build on top of the CM-ZWJ-skipped 147 * properties as they come after LB9 148 */ 149 if (cp0_prop != LINE_BREAK_PROP_SP) { 150 last_non_sp_prop = cp0_prop; 151 } 152 if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) { 153 last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop; 154 } 155 156 /* apply the algorithm */ 157 158 /* LB4 */ 159 if (cp0_prop == LINE_BREAK_PROP_BK) { 160 break; 161 } 162 163 /* LB5 */ 164 if (cp0_prop == LINE_BREAK_PROP_CR && 165 cp1_prop == LINE_BREAK_PROP_LF) { 166 continue; 167 } 168 if (cp0_prop == LINE_BREAK_PROP_CR || 169 cp0_prop == LINE_BREAK_PROP_LF || 170 cp0_prop == LINE_BREAK_PROP_NL) { 171 break; 172 } 173 174 /* LB6 */ 175 if (cp1_prop == LINE_BREAK_PROP_BK || 176 cp1_prop == LINE_BREAK_PROP_CR || 177 cp1_prop == LINE_BREAK_PROP_LF || 178 cp1_prop == LINE_BREAK_PROP_NL) { 179 continue; 180 } 181 182 /* LB7 */ 183 if (cp1_prop == LINE_BREAK_PROP_SP || 184 cp1_prop == LINE_BREAK_PROP_ZW) { 185 continue; 186 } 187 188 /* LB8 */ 189 if (last_non_sp_prop == LINE_BREAK_PROP_ZW) { 190 break; 191 } 192 193 /* LB8a */ 194 if (cp0_prop == LINE_BREAK_PROP_ZWJ) { 195 continue; 196 } 197 198 /* LB9 */ 199 if ((cp0_prop != LINE_BREAK_PROP_BK && 200 cp0_prop != LINE_BREAK_PROP_CR && 201 cp0_prop != LINE_BREAK_PROP_LF && 202 cp0_prop != LINE_BREAK_PROP_NL && 203 cp0_prop != LINE_BREAK_PROP_SP && 204 cp0_prop != LINE_BREAK_PROP_ZW) && 205 (cp1_prop == LINE_BREAK_PROP_CM || 206 cp1_prop == LINE_BREAK_PROP_ZWJ)) { 207 /* 208 * given we skip them, we don't break in such 209 * a sequence 210 */ 211 continue; 212 } 213 214 /* LB10 is baked into the following rules */ 215 216 /* LB11 */ 217 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ || 218 cp1_prop == LINE_BREAK_PROP_WJ) { 219 continue; 220 } 221 222 /* LB12 */ 223 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) { 224 continue; 225 } 226 227 /* LB12a */ 228 if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP && 229 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA && 230 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) && 231 cp1_prop == LINE_BREAK_PROP_GL) { 232 continue; 233 } 234 235 /* LB13 (affected by tailoring for LB25, see example 7) */ 236 if (cp1_prop == LINE_BREAK_PROP_EX || 237 (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU && 238 (cp1_prop == LINE_BREAK_PROP_CL || 239 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || 240 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF || 241 cp1_prop == LINE_BREAK_PROP_IS || 242 cp1_prop == LINE_BREAK_PROP_SY))) { 243 continue; 244 } 245 246 /* LB14 */ 247 if (last_non_sp_cm_or_zwj_prop == 248 LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || 249 last_non_sp_cm_or_zwj_prop == 250 LINE_BREAK_PROP_OP_WITH_EAW_HWF) { 251 continue; 252 } 253 254 /* LB15 */ 255 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU && 256 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || 257 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) { 258 continue; 259 } 260 261 /* LB16 */ 262 if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL || 263 last_non_sp_cm_or_zwj_prop == 264 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || 265 last_non_sp_cm_or_zwj_prop == 266 LINE_BREAK_PROP_CP_WITH_EAW_HWF) && 267 cp1_prop == LINE_BREAK_PROP_NS) { 268 continue; 269 } 270 271 /* LB17 */ 272 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 && 273 cp1_prop == LINE_BREAK_PROP_B2) { 274 continue; 275 } 276 277 /* LB18 */ 278 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) { 279 break; 280 } 281 282 /* LB19 */ 283 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU || 284 cp1_prop == LINE_BREAK_PROP_QU) { 285 continue; 286 } 287 288 /* LB20 */ 289 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB || 290 cp1_prop == LINE_BREAK_PROP_CB) { 291 break; 292 } 293 294 /* LB21 */ 295 if (cp1_prop == LINE_BREAK_PROP_BA || 296 cp1_prop == LINE_BREAK_PROP_HY || 297 cp1_prop == LINE_BREAK_PROP_NS || 298 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) { 299 continue; 300 } 301 302 /* LB21a */ 303 if (lb21a_flag && 304 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY || 305 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) { 306 continue; 307 } 308 309 /* LB21b */ 310 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY && 311 cp1_prop == LINE_BREAK_PROP_HL) { 312 continue; 313 } 314 315 /* LB22 */ 316 if (cp1_prop == LINE_BREAK_PROP_IN) { 317 continue; 318 } 319 320 /* LB23 */ 321 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || 322 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && 323 cp1_prop == LINE_BREAK_PROP_NU) { 324 continue; 325 } 326 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU && 327 (cp1_prop == LINE_BREAK_PROP_AL || 328 cp1_prop == LINE_BREAK_PROP_HL)) { 329 continue; 330 } 331 332 /* LB23a */ 333 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR && 334 (cp1_prop == LINE_BREAK_PROP_ID || 335 cp1_prop == LINE_BREAK_PROP_EB || 336 cp1_prop == LINE_BREAK_PROP_EM)) { 337 continue; 338 } 339 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID || 340 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB || 341 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) && 342 cp1_prop == LINE_BREAK_PROP_PO) { 343 continue; 344 } 345 346 /* LB24 */ 347 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR || 348 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) && 349 (cp1_prop == LINE_BREAK_PROP_AL || 350 cp1_prop == LINE_BREAK_PROP_HL)) { 351 continue; 352 } 353 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || 354 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && 355 (cp1_prop == LINE_BREAK_PROP_PR || 356 cp1_prop == LINE_BREAK_PROP_PO)) { 357 continue; 358 } 359 360 /* LB25 (tailored with example 7) */ 361 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR || 362 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) { 363 if (cp1_prop == LINE_BREAK_PROP_NU) { 364 continue; 365 } 366 367 /* this stupid rule is the reason why we cannot 368 * simply have a stateful break-detection between 369 * two adjacent codepoints as we have it with 370 * characters. 371 */ 372 herodotus_reader_copy(r, &tmp); 373 herodotus_read_codepoint(&tmp, true, &cp); 374 if (herodotus_read_codepoint(&tmp, true, &cp) == 375 HERODOTUS_STATUS_SUCCESS && 376 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || 377 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF || 378 cp1_prop == LINE_BREAK_PROP_HY)) { 379 if (get_break_prop(cp) == LINE_BREAK_PROP_NU) { 380 continue; 381 } 382 } 383 } 384 if ((last_non_cm_or_zwj_prop == 385 LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || 386 last_non_cm_or_zwj_prop == 387 LINE_BREAK_PROP_OP_WITH_EAW_HWF || 388 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) && 389 cp1_prop == LINE_BREAK_PROP_NU) { 390 continue; 391 } 392 if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU || 393 cp1_prop == LINE_BREAK_PROP_SY || 394 cp1_prop == LINE_BREAK_PROP_IS)) { 395 continue; 396 } 397 if ((lb25_level == 1 || lb25_level == 2) && 398 (cp1_prop == LINE_BREAK_PROP_NU || 399 cp1_prop == LINE_BREAK_PROP_SY || 400 cp1_prop == LINE_BREAK_PROP_IS || 401 cp1_prop == LINE_BREAK_PROP_CL || 402 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || 403 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) { 404 continue; 405 } 406 if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) && 407 (cp1_prop == LINE_BREAK_PROP_PO || 408 cp1_prop == LINE_BREAK_PROP_PR)) { 409 continue; 410 } 411 412 /* LB26 */ 413 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL && 414 (cp1_prop == LINE_BREAK_PROP_JL || 415 cp1_prop == LINE_BREAK_PROP_JV || 416 cp1_prop == LINE_BREAK_PROP_H2 || 417 cp1_prop == LINE_BREAK_PROP_H3)) { 418 continue; 419 } 420 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV || 421 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) && 422 (cp1_prop == LINE_BREAK_PROP_JV || 423 cp1_prop == LINE_BREAK_PROP_JT)) { 424 continue; 425 } 426 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT || 427 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) && 428 cp1_prop == LINE_BREAK_PROP_JT) { 429 continue; 430 } 431 432 /* LB27 */ 433 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL || 434 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV || 435 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT || 436 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 || 437 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) && 438 cp1_prop == LINE_BREAK_PROP_PO) { 439 continue; 440 } 441 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR && 442 (cp1_prop == LINE_BREAK_PROP_JL || 443 cp1_prop == LINE_BREAK_PROP_JV || 444 cp1_prop == LINE_BREAK_PROP_JT || 445 cp1_prop == LINE_BREAK_PROP_H2 || 446 cp1_prop == LINE_BREAK_PROP_H3)) { 447 continue; 448 } 449 450 /* LB28 */ 451 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || 452 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && 453 (cp1_prop == LINE_BREAK_PROP_AL || 454 cp1_prop == LINE_BREAK_PROP_HL)) { 455 continue; 456 } 457 458 /* LB29 */ 459 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS && 460 (cp1_prop == LINE_BREAK_PROP_AL || 461 cp1_prop == LINE_BREAK_PROP_HL)) { 462 continue; 463 } 464 465 /* LB30 */ 466 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || 467 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL || 468 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) && 469 cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) { 470 continue; 471 } 472 if (last_non_cm_or_zwj_prop == 473 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF && 474 (cp1_prop == LINE_BREAK_PROP_AL || 475 cp1_prop == LINE_BREAK_PROP_HL || 476 cp1_prop == LINE_BREAK_PROP_NU)) { 477 continue; 478 } 479 480 /* LB30a */ 481 if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI && 482 cp1_prop == LINE_BREAK_PROP_RI) { 483 continue; 484 } 485 486 /* LB30b */ 487 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB && 488 cp1_prop == LINE_BREAK_PROP_EM) { 489 continue; 490 } 491 if (last_non_cm_or_zwj_prop == 492 LINE_BREAK_PROP_BOTH_CN_EXTPICT && 493 cp1_prop == LINE_BREAK_PROP_EM) { 494 continue; 495 } 496 497 /* LB31 */ 498 break; 499 } 500 501 return herodotus_reader_number_read(r); 502} 503 504size_t 505grapheme_next_line_break(const uint_least32_t *str, size_t len) 506{ 507 HERODOTUS_READER r; 508 509 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); 510 511 return next_line_break(&r); 512} 513 514size_t 515grapheme_next_line_break_utf8(const char *str, size_t len) 516{ 517 HERODOTUS_READER r; 518 519 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); 520 521 return next_line_break(&r); 522}