/* line.c (11273B) */
1/* See LICENSE file for copyright and license details. */ 2#include <stdio.h> 3#include <stdlib.h> 4#include <string.h> 5 6#include "util.h" 7 8#define FILE_EAW "data/EastAsianWidth.txt" 9#define FILE_EMOJI "data/emoji-data.txt" 10#define FILE_LINE "data/LineBreak.txt" 11 12static const struct property_spec line_break_property[] = { 13 { 14 .enumname = "AL", 15 .file = FILE_LINE, 16 .ucdname = "AL", 17 }, 18 /* 19 * Both extended pictographic and cn are large classes, 20 * but we are only interested in their intersection for LB30b, 21 * so we have the following two temporary classes. At first 22 * the extpict-class is filled, then the cn-class, which leads 23 * to conflicts (that we handle by putting them in the "proper" 24 * class BOTH_CN_EXTPICT). We make use of the fact that there 25 * is no intersection between AL and Cn. 26 * 27 * Any consecutive conflicts are permitted to overwrite 28 * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need 29 * them, and in the final postprocessing we "reset" all 30 * remaining matches (that then didn't fit any of the other 31 * classes) to the generic class AL. 
32 */ 33 { 34 .enumname = "TMP_CN", 35 .file = FILE_LINE, 36 .ucdname = "Cn", 37 }, 38 { 39 .enumname = "TMP_EXTENDED_PICTOGRAPHIC", 40 .file = FILE_EMOJI, 41 .ucdname = "Extended_Pictographic", 42 }, 43 /* end of special block */ 44 { 45 .enumname = "B2", 46 .file = FILE_LINE, 47 .ucdname = "B2", 48 }, 49 { 50 .enumname = "BA", 51 .file = FILE_LINE, 52 .ucdname = "BA", 53 }, 54 { 55 .enumname = "BB", 56 .file = FILE_LINE, 57 .ucdname = "BB", 58 }, 59 { 60 .enumname = "BK", 61 .file = FILE_LINE, 62 .ucdname = "BK", 63 }, 64 { 65 .enumname = "BOTH_CN_EXTPICT", 66 .file = NULL, 67 .ucdname = NULL, 68 }, 69 { 70 .enumname = "CB", 71 .file = FILE_LINE, 72 .ucdname = "CB", 73 }, 74 { 75 .enumname = "CL", 76 .file = FILE_LINE, 77 .ucdname = "CL", 78 }, 79 { 80 .enumname = "CM", 81 .file = FILE_LINE, 82 .ucdname = "CM", 83 }, 84 { 85 .enumname = "CP_WITHOUT_EAW_HWF", 86 .file = FILE_LINE, 87 .ucdname = "CP", 88 }, 89 { 90 .enumname = "CP_WITH_EAW_HWF", 91 .file = NULL, 92 .ucdname = NULL, 93 }, 94 { 95 .enumname = "CR", 96 .file = FILE_LINE, 97 .ucdname = "CR", 98 }, 99 { 100 .enumname = "EB", 101 .file = FILE_LINE, 102 .ucdname = "EB", 103 }, 104 { 105 .enumname = "EM", 106 .file = FILE_LINE, 107 .ucdname = "EM", 108 }, 109 { 110 .enumname = "EX", 111 .file = FILE_LINE, 112 .ucdname = "EX", 113 }, 114 { 115 .enumname = "GL", 116 .file = FILE_LINE, 117 .ucdname = "GL", 118 }, 119 { 120 .enumname = "H2", 121 .file = FILE_LINE, 122 .ucdname = "H2", 123 }, 124 { 125 .enumname = "H3", 126 .file = FILE_LINE, 127 .ucdname = "H3", 128 }, 129 { 130 .enumname = "HL", 131 .file = FILE_LINE, 132 .ucdname = "HL", 133 }, 134 { 135 .enumname = "HY", 136 .file = FILE_LINE, 137 .ucdname = "HY", 138 }, 139 { 140 .enumname = "ID", 141 .file = FILE_LINE, 142 .ucdname = "ID", 143 }, 144 { 145 .enumname = "IN", 146 .file = FILE_LINE, 147 .ucdname = "IN", 148 }, 149 { 150 .enumname = "IS", 151 .file = FILE_LINE, 152 .ucdname = "IS", 153 }, 154 { 155 .enumname = "JL", 156 .file = FILE_LINE, 157 
.ucdname = "JL", 158 }, 159 { 160 .enumname = "JT", 161 .file = FILE_LINE, 162 .ucdname = "JT", 163 }, 164 { 165 .enumname = "JV", 166 .file = FILE_LINE, 167 .ucdname = "JV", 168 }, 169 { 170 .enumname = "LF", 171 .file = FILE_LINE, 172 .ucdname = "LF", 173 }, 174 { 175 .enumname = "NL", 176 .file = FILE_LINE, 177 .ucdname = "NL", 178 }, 179 { 180 .enumname = "NS", 181 .file = FILE_LINE, 182 .ucdname = "NS", 183 }, 184 { 185 .enumname = "NU", 186 .file = FILE_LINE, 187 .ucdname = "NU", 188 }, 189 { 190 .enumname = "OP_WITHOUT_EAW_HWF", 191 .file = FILE_LINE, 192 .ucdname = "OP", 193 }, 194 { 195 .enumname = "OP_WITH_EAW_HWF", 196 .file = NULL, 197 .ucdname = NULL, 198 }, 199 { 200 .enumname = "PO", 201 .file = FILE_LINE, 202 .ucdname = "PO", 203 }, 204 { 205 .enumname = "PR", 206 .file = FILE_LINE, 207 .ucdname = "PR", 208 }, 209 { 210 .enumname = "QU", 211 .file = FILE_LINE, 212 .ucdname = "QU", 213 }, 214 { 215 .enumname = "RI", 216 .file = FILE_LINE, 217 .ucdname = "RI", 218 }, 219 { 220 .enumname = "SP", 221 .file = FILE_LINE, 222 .ucdname = "SP", 223 }, 224 { 225 .enumname = "SY", 226 .file = FILE_LINE, 227 .ucdname = "SY", 228 }, 229 { 230 .enumname = "WJ", 231 .file = FILE_LINE, 232 .ucdname = "WJ", 233 }, 234 { 235 .enumname = "ZW", 236 .file = FILE_LINE, 237 .ucdname = "ZW", 238 }, 239 { 240 .enumname = "ZWJ", 241 .file = FILE_LINE, 242 .ucdname = "ZWJ", 243 }, 244 { 245 .enumname = "TMP_AI", 246 .file = FILE_LINE, 247 .ucdname = "AI", 248 }, 249 { 250 .enumname = "TMP_CJ", 251 .file = FILE_LINE, 252 .ucdname = "CJ", 253 }, 254 { 255 .enumname = "TMP_XX", 256 .file = NULL, 257 .ucdname = NULL, 258 }, 259 { 260 .enumname = "TMP_MN", 261 .file = FILE_LINE, 262 .ucdname = "Mn", 263 }, 264 { 265 .enumname = "TMP_MC", 266 .file = FILE_LINE, 267 .ucdname = "Mc", 268 }, 269 { 270 .enumname = "TMP_SA_WITHOUT_MN_OR_MC", 271 .file = FILE_LINE, 272 .ucdname = "SA", 273 }, 274 { 275 .enumname = "TMP_SA_WITH_MN_OR_MC", 276 .file = FILE_LINE, 277 .ucdname = "SA", 278 }, 
279 { 280 .enumname = "TMP_SG", 281 .file = FILE_LINE, 282 .ucdname = "SG", 283 }, 284 { 285 .enumname = "TMP_EAW_H", 286 .file = FILE_EAW, 287 .ucdname = "H", 288 }, 289 { 290 .enumname = "TMP_EAW_W", 291 .file = FILE_EAW, 292 .ucdname = "W", 293 }, 294 { 295 .enumname = "TMP_EAW_F", 296 .file = FILE_EAW, 297 .ucdname = "F", 298 }, 299}; 300 301static uint_least8_t 302handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2) 303{ 304 uint_least8_t result = prop2; 305 char *target = NULL; 306 307 (void)cp; 308 309 if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") || 310 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") || 311 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) || 312 (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") || 313 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") || 314 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) { 315 if (!strcmp(line_break_property[prop1].enumname, 316 "CP_WITHOUT_EAW_HWF") || 317 !strcmp(line_break_property[prop2].enumname, 318 "CP_WITHOUT_EAW_HWF")) { 319 target = "CP_WITH_EAW_HWF"; 320 } else if (!strcmp(line_break_property[prop1].enumname, 321 "OP_WITHOUT_EAW_HWF") || 322 !strcmp(line_break_property[prop2].enumname, 323 "OP_WITHOUT_EAW_HWF")) { 324 target = "OP_WITH_EAW_HWF"; 325 } else { 326 /* ignore EAW for the rest */ 327 if ((!strcmp(line_break_property[prop1].enumname, 328 "TMP_EAW_H") || 329 !strcmp(line_break_property[prop1].enumname, 330 "TMP_EAW_W") || 331 !strcmp(line_break_property[prop1].enumname, 332 "TMP_EAW_F"))) { 333 result = prop2; 334 } else { 335 result = prop1; 336 } 337 } 338 } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") || 339 !strcmp(line_break_property[prop1].enumname, "TMP_MC")) || 340 (!strcmp(line_break_property[prop2].enumname, "TMP_MN") || 341 !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) { 342 if (!strcmp(line_break_property[prop1].enumname, 343 "SA_WITHOUT_MN_OR_MC") || 344 
!strcmp(line_break_property[prop2].enumname, 345 "SA_WITHOUT_MN_OR_MC")) { 346 target = "SA_WITH_MN_OR_MC"; 347 } else { 348 /* ignore Mn and Mc for the rest */ 349 if ((!strcmp(line_break_property[prop1].enumname, 350 "TMP_MN") || 351 !strcmp(line_break_property[prop1].enumname, 352 "TMP_MC"))) { 353 result = prop2; 354 } else { 355 result = prop1; 356 } 357 } 358 } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") || 359 !strcmp(line_break_property[prop2].enumname, "TMP_CN")) { 360 if (!strcmp(line_break_property[prop1].enumname, 361 "TMP_EXTENDED_PICTOGRAPHIC") || 362 !strcmp(line_break_property[prop2].enumname, 363 "TMP_EXTENDED_PICTOGRAPHIC")) { 364 target = "BOTH_CN_EXTPICT"; 365 } else { 366 /* ignore Cn for all the other properties */ 367 if (!strcmp(line_break_property[prop1].enumname, 368 "TMP_CN")) { 369 result = prop2; 370 } else { 371 result = prop1; 372 } 373 } 374 } else if (!strcmp(line_break_property[prop1].enumname, 375 "TMP_EXTENDED_PICTOGRAPHIC") || 376 !strcmp(line_break_property[prop2].enumname, 377 "TMP_EXTENDED_PICTOGRAPHIC")) { 378 if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") || 379 !strcmp(line_break_property[prop2].enumname, "TMP_CN")) { 380 target = "BOTH_CN_EXTPICT"; 381 } else { 382 /* ignore Extended_Pictographic for all the other 383 * properties */ 384 if (!strcmp(line_break_property[prop1].enumname, 385 "TMP_EXTENDED_PICTOGRAPHIC")) { 386 result = prop2; 387 } else { 388 result = prop1; 389 } 390 } 391 } else { 392 fprintf(stderr, 393 "handle_conflict: Cannot handle conflict %s <- %s.\n", 394 line_break_property[prop1].enumname, 395 line_break_property[prop2].enumname); 396 exit(1); 397 } 398 399 if (target) { 400 for (result = 0; result < LEN(line_break_property); result++) { 401 if (!strcmp(line_break_property[result].enumname, 402 target)) { 403 break; 404 } 405 } 406 if (result == LEN(line_break_property)) { 407 fprintf(stderr, "handle_conflict: Internal error.\n"); 408 exit(1); 409 } 410 } 411 412 
return result; 413} 414 415static void 416post_process(struct properties *prop) 417{ 418 const char *target; 419 uint_least8_t result; 420 size_t i; 421 422 /* post-mapping according to the line breaking algorithm */ 423 for (i = 0; i < UINT32_C(0x110000); i++) { 424 /* LB1 */ 425 if (!strcmp(line_break_property[prop[i].property].enumname, 426 "TMP_AI") || 427 !strcmp(line_break_property[prop[i].property].enumname, 428 "TMP_SG") || 429 !strcmp(line_break_property[prop[i].property].enumname, 430 "TMP_XX")) { 431 /* map AI, SG and XX to AL */ 432 target = "AL"; 433 } else if (!strcmp(line_break_property[prop[i].property] 434 .enumname, 435 "TMP_SA_WITH_MN_OR_MC")) { 436 /* map SA (with General_Category Mn or Mc) to CM */ 437 target = "CM"; 438 } else if (!strcmp(line_break_property[prop[i].property] 439 .enumname, 440 "TMP_SA_WITHOUT_MN_OR_MC")) { 441 /* map SA (without General_Category Mn or Mc) to AL */ 442 target = "AL"; 443 } else if (!strcmp(line_break_property[prop[i].property] 444 .enumname, 445 "TMP_CJ")) { 446 /* map CJ to NS */ 447 target = "NS"; 448 } else if ( 449 !strcmp(line_break_property[prop[i].property].enumname, 450 "TMP_CN") || 451 !strcmp(line_break_property[prop[i].property].enumname, 452 "TMP_EXTENDED_PICTOGRAPHIC") || 453 !strcmp(line_break_property[prop[i].property].enumname, 454 "TMP_MN") || 455 !strcmp(line_break_property[prop[i].property].enumname, 456 "TMP_MC") || 457 !strcmp(line_break_property[prop[i].property].enumname, 458 "TMP_EAW_H") || 459 !strcmp(line_break_property[prop[i].property].enumname, 460 "TMP_EAW_W") || 461 !strcmp(line_break_property[prop[i].property].enumname, 462 "TMP_EAW_F")) { 463 /* map all the temporary classes "residue" to AL */ 464 target = "AL"; 465 } else { 466 target = NULL; 467 } 468 469 if (target) { 470 for (result = 0; result < LEN(line_break_property); 471 result++) { 472 if (!strcmp(line_break_property[result] 473 .enumname, 474 target)) { 475 break; 476 } 477 } 478 if (result == 
LEN(line_break_property)) { 479 fprintf(stderr, 480 "handle_conflict: Internal error.\n"); 481 exit(1); 482 } 483 484 prop[i].property = result; 485 } 486 } 487} 488 489int 490main(int argc, char *argv[]) 491{ 492 (void)argc; 493 494 properties_generate_break_property( 495 line_break_property, LEN(line_break_property), NULL, 496 handle_conflict, post_process, "line_break", argv[0]); 497 498 return 0; 499}