libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

line.c (11273B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stdio.h>
      3#include <stdlib.h>
      4#include <string.h>
      5
      6#include "util.h"
      7
      8#define FILE_EAW   "data/EastAsianWidth.txt"
      9#define FILE_EMOJI "data/emoji-data.txt"
     10#define FILE_LINE  "data/LineBreak.txt"
     11
     12static const struct property_spec line_break_property[] = {
     13	{
     14		.enumname = "AL",
     15		.file = FILE_LINE,
     16		.ucdname = "AL",
     17	},
     18	/*
     19	 * Both extended pictographic and cn are large classes,
     20	 * but we are only interested in their intersection for LB30b,
     21	 * so we have the following two temporary classes. At first
     22	 * the extpict-class is filled, then the cn-class, which leads
     23	 * to conflicts (that we handle by putting them in the "proper"
     24	 * class BOTH_CN_EXTPICT). We make use of the fact that there
     25	 * is no intersection between AL and Cn.
     26	 *
     27	 * Any consecutive conflicts are permitted to overwrite
     28	 * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need
     29	 * them, and in the final postprocessing we "reset" all
     30	 * remaining matches (that then didn't fit any of the other
     31	 * classes) to the generic class AL.
     32	 */
     33	{
     34		.enumname = "TMP_CN",
     35		.file = FILE_LINE,
     36		.ucdname = "Cn",
     37	},
     38	{
     39		.enumname = "TMP_EXTENDED_PICTOGRAPHIC",
     40		.file = FILE_EMOJI,
     41		.ucdname = "Extended_Pictographic",
     42	},
     43	/* end of special block */
     44	{
     45		.enumname = "B2",
     46		.file = FILE_LINE,
     47		.ucdname = "B2",
     48	},
     49	{
     50		.enumname = "BA",
     51		.file = FILE_LINE,
     52		.ucdname = "BA",
     53	},
     54	{
     55		.enumname = "BB",
     56		.file = FILE_LINE,
     57		.ucdname = "BB",
     58	},
     59	{
     60		.enumname = "BK",
     61		.file = FILE_LINE,
     62		.ucdname = "BK",
     63	},
     64	{
     65		.enumname = "BOTH_CN_EXTPICT",
     66		.file = NULL,
     67		.ucdname = NULL,
     68	},
     69	{
     70		.enumname = "CB",
     71		.file = FILE_LINE,
     72		.ucdname = "CB",
     73	},
     74	{
     75		.enumname = "CL",
     76		.file = FILE_LINE,
     77		.ucdname = "CL",
     78	},
     79	{
     80		.enumname = "CM",
     81		.file = FILE_LINE,
     82		.ucdname = "CM",
     83	},
     84	{
     85		.enumname = "CP_WITHOUT_EAW_HWF",
     86		.file = FILE_LINE,
     87		.ucdname = "CP",
     88	},
     89	{
     90		.enumname = "CP_WITH_EAW_HWF",
     91		.file = NULL,
     92		.ucdname = NULL,
     93	},
     94	{
     95		.enumname = "CR",
     96		.file = FILE_LINE,
     97		.ucdname = "CR",
     98	},
     99	{
    100		.enumname = "EB",
    101		.file = FILE_LINE,
    102		.ucdname = "EB",
    103	},
    104	{
    105		.enumname = "EM",
    106		.file = FILE_LINE,
    107		.ucdname = "EM",
    108	},
    109	{
    110		.enumname = "EX",
    111		.file = FILE_LINE,
    112		.ucdname = "EX",
    113	},
    114	{
    115		.enumname = "GL",
    116		.file = FILE_LINE,
    117		.ucdname = "GL",
    118	},
    119	{
    120		.enumname = "H2",
    121		.file = FILE_LINE,
    122		.ucdname = "H2",
    123	},
    124	{
    125		.enumname = "H3",
    126		.file = FILE_LINE,
    127		.ucdname = "H3",
    128	},
    129	{
    130		.enumname = "HL",
    131		.file = FILE_LINE,
    132		.ucdname = "HL",
    133	},
    134	{
    135		.enumname = "HY",
    136		.file = FILE_LINE,
    137		.ucdname = "HY",
    138	},
    139	{
    140		.enumname = "ID",
    141		.file = FILE_LINE,
    142		.ucdname = "ID",
    143	},
    144	{
    145		.enumname = "IN",
    146		.file = FILE_LINE,
    147		.ucdname = "IN",
    148	},
    149	{
    150		.enumname = "IS",
    151		.file = FILE_LINE,
    152		.ucdname = "IS",
    153	},
    154	{
    155		.enumname = "JL",
    156		.file = FILE_LINE,
    157		.ucdname = "JL",
    158	},
    159	{
    160		.enumname = "JT",
    161		.file = FILE_LINE,
    162		.ucdname = "JT",
    163	},
    164	{
    165		.enumname = "JV",
    166		.file = FILE_LINE,
    167		.ucdname = "JV",
    168	},
    169	{
    170		.enumname = "LF",
    171		.file = FILE_LINE,
    172		.ucdname = "LF",
    173	},
    174	{
    175		.enumname = "NL",
    176		.file = FILE_LINE,
    177		.ucdname = "NL",
    178	},
    179	{
    180		.enumname = "NS",
    181		.file = FILE_LINE,
    182		.ucdname = "NS",
    183	},
    184	{
    185		.enumname = "NU",
    186		.file = FILE_LINE,
    187		.ucdname = "NU",
    188	},
    189	{
    190		.enumname = "OP_WITHOUT_EAW_HWF",
    191		.file = FILE_LINE,
    192		.ucdname = "OP",
    193	},
    194	{
    195		.enumname = "OP_WITH_EAW_HWF",
    196		.file = NULL,
    197		.ucdname = NULL,
    198	},
    199	{
    200		.enumname = "PO",
    201		.file = FILE_LINE,
    202		.ucdname = "PO",
    203	},
    204	{
    205		.enumname = "PR",
    206		.file = FILE_LINE,
    207		.ucdname = "PR",
    208	},
    209	{
    210		.enumname = "QU",
    211		.file = FILE_LINE,
    212		.ucdname = "QU",
    213	},
    214	{
    215		.enumname = "RI",
    216		.file = FILE_LINE,
    217		.ucdname = "RI",
    218	},
    219	{
    220		.enumname = "SP",
    221		.file = FILE_LINE,
    222		.ucdname = "SP",
    223	},
    224	{
    225		.enumname = "SY",
    226		.file = FILE_LINE,
    227		.ucdname = "SY",
    228	},
    229	{
    230		.enumname = "WJ",
    231		.file = FILE_LINE,
    232		.ucdname = "WJ",
    233	},
    234	{
    235		.enumname = "ZW",
    236		.file = FILE_LINE,
    237		.ucdname = "ZW",
    238	},
    239	{
    240		.enumname = "ZWJ",
    241		.file = FILE_LINE,
    242		.ucdname = "ZWJ",
    243	},
    244	{
    245		.enumname = "TMP_AI",
    246		.file = FILE_LINE,
    247		.ucdname = "AI",
    248	},
    249	{
    250		.enumname = "TMP_CJ",
    251		.file = FILE_LINE,
    252		.ucdname = "CJ",
    253	},
    254	{
    255		.enumname = "TMP_XX",
    256		.file = NULL,
    257		.ucdname = NULL,
    258	},
    259	{
    260		.enumname = "TMP_MN",
    261		.file = FILE_LINE,
    262		.ucdname = "Mn",
    263	},
    264	{
    265		.enumname = "TMP_MC",
    266		.file = FILE_LINE,
    267		.ucdname = "Mc",
    268	},
    269	{
    270		.enumname = "TMP_SA_WITHOUT_MN_OR_MC",
    271		.file = FILE_LINE,
    272		.ucdname = "SA",
    273	},
    274	{
    275		.enumname = "TMP_SA_WITH_MN_OR_MC",
    276		.file = FILE_LINE,
    277		.ucdname = "SA",
    278	},
    279	{
    280		.enumname = "TMP_SG",
    281		.file = FILE_LINE,
    282		.ucdname = "SG",
    283	},
    284	{
    285		.enumname = "TMP_EAW_H",
    286		.file = FILE_EAW,
    287		.ucdname = "H",
    288	},
    289	{
    290		.enumname = "TMP_EAW_W",
    291		.file = FILE_EAW,
    292		.ucdname = "W",
    293	},
    294	{
    295		.enumname = "TMP_EAW_F",
    296		.file = FILE_EAW,
    297		.ucdname = "F",
    298	},
    299};
    300
    301static uint_least8_t
    302handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
    303{
    304	uint_least8_t result = prop2;
    305	char *target = NULL;
    306
    307	(void)cp;
    308
    309	if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
    310	     !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
    311	     !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) ||
    312	    (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") ||
    313	     !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") ||
    314	     !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) {
    315		if (!strcmp(line_break_property[prop1].enumname,
    316		            "CP_WITHOUT_EAW_HWF") ||
    317		    !strcmp(line_break_property[prop2].enumname,
    318		            "CP_WITHOUT_EAW_HWF")) {
    319			target = "CP_WITH_EAW_HWF";
    320		} else if (!strcmp(line_break_property[prop1].enumname,
    321		                   "OP_WITHOUT_EAW_HWF") ||
    322		           !strcmp(line_break_property[prop2].enumname,
    323		                   "OP_WITHOUT_EAW_HWF")) {
    324			target = "OP_WITH_EAW_HWF";
    325		} else {
    326			/* ignore EAW for the rest */
    327			if ((!strcmp(line_break_property[prop1].enumname,
    328			             "TMP_EAW_H") ||
    329			     !strcmp(line_break_property[prop1].enumname,
    330			             "TMP_EAW_W") ||
    331			     !strcmp(line_break_property[prop1].enumname,
    332			             "TMP_EAW_F"))) {
    333				result = prop2;
    334			} else {
    335				result = prop1;
    336			}
    337		}
    338	} else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
    339	            !strcmp(line_break_property[prop1].enumname, "TMP_MC")) ||
    340	           (!strcmp(line_break_property[prop2].enumname, "TMP_MN") ||
    341	            !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) {
    342		if (!strcmp(line_break_property[prop1].enumname,
    343		            "SA_WITHOUT_MN_OR_MC") ||
    344		    !strcmp(line_break_property[prop2].enumname,
    345		            "SA_WITHOUT_MN_OR_MC")) {
    346			target = "SA_WITH_MN_OR_MC";
    347		} else {
    348			/* ignore Mn and Mc for the rest */
    349			if ((!strcmp(line_break_property[prop1].enumname,
    350			             "TMP_MN") ||
    351			     !strcmp(line_break_property[prop1].enumname,
    352			             "TMP_MC"))) {
    353				result = prop2;
    354			} else {
    355				result = prop1;
    356			}
    357		}
    358	} else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
    359	           !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
    360		if (!strcmp(line_break_property[prop1].enumname,
    361		            "TMP_EXTENDED_PICTOGRAPHIC") ||
    362		    !strcmp(line_break_property[prop2].enumname,
    363		            "TMP_EXTENDED_PICTOGRAPHIC")) {
    364			target = "BOTH_CN_EXTPICT";
    365		} else {
    366			/* ignore Cn for all the other properties */
    367			if (!strcmp(line_break_property[prop1].enumname,
    368			            "TMP_CN")) {
    369				result = prop2;
    370			} else {
    371				result = prop1;
    372			}
    373		}
    374	} else if (!strcmp(line_break_property[prop1].enumname,
    375	                   "TMP_EXTENDED_PICTOGRAPHIC") ||
    376	           !strcmp(line_break_property[prop2].enumname,
    377	                   "TMP_EXTENDED_PICTOGRAPHIC")) {
    378		if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
    379		    !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
    380			target = "BOTH_CN_EXTPICT";
    381		} else {
    382			/* ignore Extended_Pictographic for all the other
    383			 * properties */
    384			if (!strcmp(line_break_property[prop1].enumname,
    385			            "TMP_EXTENDED_PICTOGRAPHIC")) {
    386				result = prop2;
    387			} else {
    388				result = prop1;
    389			}
    390		}
    391	} else {
    392		fprintf(stderr,
    393		        "handle_conflict: Cannot handle conflict %s <- %s.\n",
    394		        line_break_property[prop1].enumname,
    395		        line_break_property[prop2].enumname);
    396		exit(1);
    397	}
    398
    399	if (target) {
    400		for (result = 0; result < LEN(line_break_property); result++) {
    401			if (!strcmp(line_break_property[result].enumname,
    402			            target)) {
    403				break;
    404			}
    405		}
    406		if (result == LEN(line_break_property)) {
    407			fprintf(stderr, "handle_conflict: Internal error.\n");
    408			exit(1);
    409		}
    410	}
    411
    412	return result;
    413}
    414
    415static void
    416post_process(struct properties *prop)
    417{
    418	const char *target;
    419	uint_least8_t result;
    420	size_t i;
    421
    422	/* post-mapping according to the line breaking algorithm */
    423	for (i = 0; i < UINT32_C(0x110000); i++) {
    424		/* LB1 */
    425		if (!strcmp(line_break_property[prop[i].property].enumname,
    426		            "TMP_AI") ||
    427		    !strcmp(line_break_property[prop[i].property].enumname,
    428		            "TMP_SG") ||
    429		    !strcmp(line_break_property[prop[i].property].enumname,
    430		            "TMP_XX")) {
    431			/* map AI, SG and XX to AL */
    432			target = "AL";
    433		} else if (!strcmp(line_break_property[prop[i].property]
    434		                           .enumname,
    435		                   "TMP_SA_WITH_MN_OR_MC")) {
    436			/* map SA (with General_Category Mn or Mc) to CM */
    437			target = "CM";
    438		} else if (!strcmp(line_break_property[prop[i].property]
    439		                           .enumname,
    440		                   "TMP_SA_WITHOUT_MN_OR_MC")) {
    441			/* map SA (without General_Category Mn or Mc) to AL */
    442			target = "AL";
    443		} else if (!strcmp(line_break_property[prop[i].property]
    444		                           .enumname,
    445		                   "TMP_CJ")) {
    446			/* map CJ to NS */
    447			target = "NS";
    448		} else if (
    449			!strcmp(line_break_property[prop[i].property].enumname,
    450		                "TMP_CN") ||
    451			!strcmp(line_break_property[prop[i].property].enumname,
    452		                "TMP_EXTENDED_PICTOGRAPHIC") ||
    453			!strcmp(line_break_property[prop[i].property].enumname,
    454		                "TMP_MN") ||
    455			!strcmp(line_break_property[prop[i].property].enumname,
    456		                "TMP_MC") ||
    457			!strcmp(line_break_property[prop[i].property].enumname,
    458		                "TMP_EAW_H") ||
    459			!strcmp(line_break_property[prop[i].property].enumname,
    460		                "TMP_EAW_W") ||
    461			!strcmp(line_break_property[prop[i].property].enumname,
    462		                "TMP_EAW_F")) {
    463			/* map all the temporary classes "residue" to AL */
    464			target = "AL";
    465		} else {
    466			target = NULL;
    467		}
    468
    469		if (target) {
    470			for (result = 0; result < LEN(line_break_property);
    471			     result++) {
    472				if (!strcmp(line_break_property[result]
    473				                    .enumname,
    474				            target)) {
    475					break;
    476				}
    477			}
    478			if (result == LEN(line_break_property)) {
    479				fprintf(stderr,
    480				        "handle_conflict: Internal error.\n");
    481				exit(1);
    482			}
    483
    484			prop[i].property = result;
    485		}
    486	}
    487}
    488
    489int
    490main(int argc, char *argv[])
    491{
    492	(void)argc;
    493
    494	properties_generate_break_property(
    495		line_break_property, LEN(line_break_property), NULL,
    496		handle_conflict, post_process, "line_break", argv[0]);
    497
    498	return 0;
    499}