libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

bidirectional-test.c (15968B)


      1/* See LICENSE file for copyright and license details. */
      2#include <errno.h>
      3#include <inttypes.h>
      4#include <stddef.h>
      5#include <stdio.h>
      6#include <stdlib.h>
      7#include <string.h>
      8
      9#include "../grapheme.h"
     10#include "util.h"
     11
     12struct bidirectional_test {
     13	uint_least32_t *cp;
     14	size_t cplen;
     15	enum grapheme_bidirectional_direction mode[3];
     16	size_t modelen;
     17	enum grapheme_bidirectional_direction resolved;
     18	int_least8_t *level;
     19	int_least16_t *reorder;
     20	size_t reorderlen;
     21};
     22
     23static const struct {
     24	const char *class;
     25	const uint_least32_t cp;
     26} classcpmap[] = {
     27	{ .class = "L", .cp = UINT32_C(0x0041) },
     28	{ .class = "AL", .cp = UINT32_C(0x0608) },
     29	{ .class = "AN", .cp = UINT32_C(0x0600) },
     30	{ .class = "B", .cp = UINT32_C(0x000A) },
     31	{ .class = "BN", .cp = UINT32_C(0x0000) },
     32	{ .class = "CS", .cp = UINT32_C(0x002C) },
     33	{ .class = "EN", .cp = UINT32_C(0x0030) },
     34	{ .class = "ES", .cp = UINT32_C(0x002B) },
     35	{ .class = "ET", .cp = UINT32_C(0x0023) },
     36	{ .class = "FSI", .cp = UINT32_C(0x2068) },
     37	{ .class = "LRE", .cp = UINT32_C(0x202A) },
     38	{ .class = "LRI", .cp = UINT32_C(0x2066) },
     39	{ .class = "LRO", .cp = UINT32_C(0x202D) },
     40	{ .class = "NSM", .cp = UINT32_C(0x0300) },
     41	{ .class = "ON", .cp = UINT32_C(0x0021) },
     42	{ .class = "PDF", .cp = UINT32_C(0x202C) },
     43	{ .class = "PDI", .cp = UINT32_C(0x2069) },
     44	{ .class = "R", .cp = UINT32_C(0x05BE) },
     45	{ .class = "RLE", .cp = UINT32_C(0x202B) },
     46	{ .class = "RLI", .cp = UINT32_C(0x2067) },
     47	{ .class = "RLO", .cp = UINT32_C(0x202E) },
     48	{ .class = "S", .cp = UINT32_C(0x0009) },
     49	{ .class = "WS", .cp = UINT32_C(0x000C) },
     50};
     51
     52static int
     53classtocp(const char *str, size_t len, uint_least32_t *cp)
     54{
     55	size_t i;
     56
     57	for (i = 0; i < LEN(classcpmap); i++) {
     58		if (!strncmp(str, classcpmap[i].class, len)) {
     59			*cp = classcpmap[i].cp;
     60			return 0;
     61		}
     62	}
     63	fprintf(stderr, "classtocp: unknown class string '%.*s'.\n", (int)len,
     64	        str);
     65
     66	return 1;
     67}
     68
     69static int
     70parse_class_list(const char *str, uint_least32_t **cp, size_t *cplen)
     71{
     72	size_t count, i;
     73	const char *tmp1 = NULL, *tmp2 = NULL;
     74
     75	if (strlen(str) == 0) {
     76		*cp = NULL;
     77		*cplen = 0;
     78		return 0;
     79	}
     80
     81	/* count the number of spaces in the string and infer list length */
     82	for (count = 1, tmp1 = str; (tmp2 = strchr(tmp1, ' ')) != NULL;
     83	     count++, tmp1 = tmp2 + 1) {
     84		;
     85	}
     86
     87	/* allocate resources */
     88	if (!(*cp = calloc((*cplen = count), sizeof(**cp)))) {
     89		fprintf(stderr, "calloc: %s\n", strerror(errno));
     90		exit(1);
     91	}
     92
     93	/* go through the string again, parsing the classes */
     94	for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++) {
     95		tmp2 = strchr(tmp1, ' ');
     96		if (classtocp(tmp1, tmp2 ? (size_t)(tmp2 - tmp1) : strlen(tmp1),
     97		              &((*cp)[i]))) {
     98			return 1;
     99		}
    100		if (tmp2 != NULL) {
    101			tmp1 = tmp2 + 1;
    102		}
    103	}
    104
    105	return 0;
    106}
    107
    108static int
    109strtolevel(const char *str, size_t len, int_least8_t *level)
    110{
    111	size_t i;
    112
    113	if (len == 1 && str[0] == 'x') {
    114		/*
    115		 * 'x' indicates those characters that are ignored.
    116		 * We indicate this with a level of -1
    117		 */
    118		*level = -1;
    119		return 0;
    120	}
    121
    122	if (len > 3) {
    123		/*
    124		 * given we can only express (positive) numbers from
    125		 * 0..127, more than 3 digits means an excess
    126		 */
    127		goto toolarge;
    128	}
    129
    130	/* check if the string is completely numerical */
    131	for (i = 0; i < len; i++) {
    132		if (str[i] < '0' && str[i] > '9') {
    133			fprintf(stderr,
    134			        "strtolevel: '%.*s' is not an integer.\n",
    135			        (int)len, str);
    136			return 1;
    137		}
    138	}
    139
    140	if (len == 3) {
    141		if (str[0] != '1' || str[1] > '2' ||
    142		    (str[1] == '2' && str[2] > '7')) {
    143			goto toolarge;
    144		}
    145		*level = (str[0] - '0') * 100 + (str[1] - '0') * 10 +
    146		         (str[2] - '0');
    147	} else if (len == 2) {
    148		*level = (str[0] - '0') * 10 + (str[1] - '0');
    149	} else if (len == 1) {
    150		*level = (str[0] - '0');
    151	} else { /* len == 0 */
    152		*level = 0;
    153	}
    154
    155	return 0;
    156toolarge:
    157	fprintf(stderr, "strtolevel: '%.*s' is too large.\n", (int)len, str);
    158	return 1;
    159}
    160
    161static int
    162strtoreorder(const char *str, size_t len, int_least16_t *reorder)
    163{
    164	size_t i;
    165
    166	if (len == 1 && str[0] == 'x') {
    167		/*
    168		 * 'x' indicates those characters that are ignored.
    169		 * We indicate this with a reorder of -1
    170		 */
    171		*reorder = -1;
    172		return 0;
    173	}
    174
    175	if (len > 3) {
    176		/*
    177		 * given we want to only express (positive) numbers from
    178		 * 0..999 (at most!), more than 3 digits means an excess
    179		 */
    180		goto toolarge;
    181	}
    182
    183	/* check if the string is completely numerical */
    184	for (i = 0; i < len; i++) {
    185		if (str[i] < '0' && str[i] > '9') {
    186			fprintf(stderr,
    187			        "strtoreorder: '%.*s' is not an integer.\n",
    188			        (int)len, str);
    189			return 1;
    190		}
    191	}
    192
    193	if (len == 3) {
    194		*reorder = (str[0] - '0') * 100 + (str[1] - '0') * 10 +
    195		           (str[2] - '0');
    196	} else if (len == 2) {
    197		*reorder = (str[0] - '0') * 10 + (str[1] - '0');
    198	} else if (len == 1) {
    199		*reorder = (str[0] - '0');
    200	} else { /* len == 0 */
    201		*reorder = 0;
    202	}
    203
    204	return 0;
    205toolarge:
    206	fprintf(stderr, "strtoreorder: '%.*s' is too large.\n", (int)len, str);
    207	return 1;
    208}
    209
    210static int
    211parse_level_list(const char *str, int_least8_t **level, size_t *levellen)
    212{
    213	size_t count, i;
    214	const char *tmp1 = NULL, *tmp2 = NULL;
    215
    216	if (strlen(str) == 0) {
    217		*level = NULL;
    218		*levellen = 0;
    219		return 0;
    220	}
    221
    222	/* count the number of spaces in the string and infer list length */
    223	for (count = 1, tmp1 = str; (tmp2 = strchr(tmp1, ' ')) != NULL;
    224	     count++, tmp1 = tmp2 + 1) {
    225		;
    226	}
    227
    228	/* allocate resources */
    229	if (!(*level = calloc((*levellen = count), sizeof(**level)))) {
    230		fprintf(stderr, "calloc: %s\n", strerror(errno));
    231		exit(1);
    232	}
    233
    234	/* go through the string again, parsing the levels */
    235	for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++) {
    236		tmp2 = strchr(tmp1, ' ');
    237		if (strtolevel(tmp1,
    238		               tmp2 ? (size_t)(tmp2 - tmp1) : strlen(tmp1),
    239		               &((*level)[i]))) {
    240			return 1;
    241		}
    242		if (tmp2 != NULL) {
    243			tmp1 = tmp2 + 1;
    244		}
    245	}
    246
    247	return 0;
    248}
    249
    250static int
    251parse_reorder_list(const char *str, int_least16_t **reorder, size_t *reorderlen)
    252{
    253	size_t count, i;
    254	const char *tmp1 = NULL, *tmp2 = NULL;
    255
    256	if (strlen(str) == 0) {
    257		*reorder = NULL;
    258		*reorderlen = 0;
    259		return 0;
    260	}
    261
    262	/* count the number of spaces in the string and infer list length */
    263	for (count = 1, tmp1 = str; (tmp2 = strchr(tmp1, ' ')) != NULL;
    264	     count++, tmp1 = tmp2 + 1) {
    265		;
    266	}
    267
    268	/* allocate resources */
    269	if (!(*reorder = calloc((*reorderlen = count), sizeof(**reorder)))) {
    270		fprintf(stderr, "calloc: %s\n", strerror(errno));
    271		exit(1);
    272	}
    273
    274	/* go through the string again, parsing the reorders */
    275	for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++) {
    276		tmp2 = strchr(tmp1, ' ');
    277		if (strtoreorder(tmp1,
    278		                 tmp2 ? (size_t)(tmp2 - tmp1) : strlen(tmp1),
    279		                 &((*reorder)[i]))) {
    280			return 1;
    281		}
    282		if (tmp2 != NULL) {
    283			tmp1 = tmp2 + 1;
    284		}
    285	}
    286
    287	return 0;
    288}
    289
    290static void
    291bidirectional_test_list_print(const struct bidirectional_test *test,
    292                              size_t testlen, const char *identifier,
    293                              const char *progname)
    294{
    295	size_t i, j;
    296
    297	printf("/* Automatically generated by %s */\n"
    298	       "#include <stdint.h>\n#include <stddef.h>\n\n"
    299	       "#include \"../grapheme.h\"\n\n",
    300	       progname);
    301
    302	printf("static const struct {\n"
    303	       "\tuint_least32_t *cp;\n"
    304	       "\tsize_t cplen;\n"
    305	       "\tenum grapheme_bidirectional_direction *mode;\n"
    306	       "\tsize_t modelen;\n"
    307	       "\tenum grapheme_bidirectional_direction resolved;\n"
    308	       "\tint_least8_t *level;\n"
    309	       "\tint_least16_t *reorder;\n"
    310	       "\tsize_t reorderlen;\n} %s[] = {\n",
    311	       identifier);
    312	for (i = 0; i < testlen; i++) {
    313		printf("\t{\n");
    314
    315		printf("\t\t.cp         = (uint_least32_t[]){");
    316		for (j = 0; j < test[i].cplen; j++) {
    317			printf(" UINT32_C(0x%06X)", test[i].cp[j]);
    318			if (j + 1 < test[i].cplen) {
    319				putchar(',');
    320			}
    321		}
    322		printf(" },\n");
    323		printf("\t\t.cplen      = %zu,\n", test[i].cplen);
    324
    325		printf("\t\t.mode       = (enum "
    326		       "grapheme_bidirectional_direction[]){");
    327		for (j = 0; j < test[i].modelen; j++) {
    328			if (test[i].mode[j] ==
    329			    GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL) {
    330				printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_"
    331				       "NEUTRAL");
    332			} else if (test[i].mode[j] ==
    333			           GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
    334				printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR");
    335			} else if (test[i].mode[j] ==
    336			           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
    337				printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL");
    338			}
    339			if (j + 1 < test[i].modelen) {
    340				putchar(',');
    341			}
    342		}
    343		printf(" },\n");
    344		printf("\t\t.modelen    = %zu,\n", test[i].modelen);
    345
    346		printf("\t\t.resolved   = ");
    347		if (test[i].resolved ==
    348		    GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL) {
    349			printf("GRAPHEME_BIDIRECTIONAL_DIRECTION_"
    350			       "NEUTRAL");
    351		} else if (test[i].resolved ==
    352		           GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
    353			printf("GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR");
    354		} else if (test[i].resolved ==
    355		           GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
    356			printf("GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL");
    357		}
    358		printf(",\n");
    359
    360		printf("\t\t.level      = (int_least8_t[]){");
    361		for (j = 0; j < test[i].cplen; j++) {
    362			printf(" %" PRIdLEAST8, test[i].level[j]);
    363			if (j + 1 < test[i].cplen) {
    364				putchar(',');
    365			}
    366		}
    367		printf(" },\n");
    368
    369		printf("\t\t.reorder    = ");
    370		if (test[i].reorderlen > 0) {
    371			printf("(int_least16_t[]){");
    372			for (j = 0; j < test[i].reorderlen; j++) {
    373				printf(" %" PRIdLEAST16, test[i].reorder[j]);
    374				if (j + 1 < test[i].reorderlen) {
    375					putchar(',');
    376				}
    377			}
    378			printf(" },\n");
    379		} else {
    380			printf("NULL,\n");
    381		}
    382		printf("\t\t.reorderlen = %zu,\n", test[i].reorderlen);
    383
    384		printf("\t},\n");
    385	}
    386	printf("};\n");
    387}
    388
    389static struct bidirectional_test *test;
    390static size_t testlen;
    391
    392static int_least8_t *current_level;
    393static size_t current_level_len;
    394static int_least16_t *current_reorder;
    395static size_t current_reorder_len;
    396
    397static int
    398test_callback(const char *file, char **field, size_t nfields, char *comment,
    399              void *payload)
    400{
    401	char *tmp;
    402
    403	(void)file;
    404	(void)comment;
    405	(void)payload;
    406
    407	/* we either get a line beginning with an '@', or an input line */
    408	if (nfields > 0 && field[0][0] == '@') {
    409		if (!strncmp(field[0], "@Levels:", sizeof("@Levels:") - 1)) {
    410			tmp = field[0] + sizeof("@Levels:") - 1;
    411			for (; *tmp != '\0' && (*tmp == ' ' || *tmp == '\t');
    412			     tmp++) {
    413				;
    414			}
    415			free(current_level);
    416			parse_level_list(tmp, &current_level,
    417			                 &current_level_len);
    418		} else if (!strncmp(field[0],
    419		                    "@Reorder:", sizeof("@Reorder:") - 1)) {
    420			tmp = field[0] + sizeof("@Reorder:") - 1;
    421			for (; *tmp != '\0' && (*tmp == ' ' || *tmp == '\t');
    422			     tmp++) {
    423				;
    424			}
    425			free(current_reorder);
    426			parse_reorder_list(tmp, &current_reorder,
    427			                   &current_reorder_len);
    428		} else {
    429			fprintf(stderr, "Unknown @-input-line.\n");
    430			exit(1);
    431		}
    432	} else {
    433		if (nfields < 2) {
    434			/* discard any line that does not have at least 2 fields
    435			 */
    436			return 0;
    437		}
    438
    439		/* extend test array */
    440		if (!(test = realloc(test, (++testlen) * sizeof(*test)))) {
    441			fprintf(stderr, "realloc: %s\n", strerror(errno));
    442			exit(1);
    443		}
    444
    445		/* parse field data */
    446		parse_class_list(field[0], &(test[testlen - 1].cp),
    447		                 &(test[testlen - 1].cplen));
    448
    449		/* copy current level- and reorder-arrays */
    450		if (!(test[testlen - 1].level =
    451		              calloc(current_level_len,
    452		                     sizeof(*(test[testlen - 1].level))))) {
    453			fprintf(stderr, "calloc: %s\n", strerror(errno));
    454			exit(1);
    455		}
    456		memcpy(test[testlen - 1].level, current_level,
    457		       current_level_len * sizeof(*(test[testlen - 1].level)));
    458
    459		if (!(test[testlen - 1].reorder =
    460		              calloc(current_reorder_len,
    461		                     sizeof(*(test[testlen - 1].reorder))))) {
    462			fprintf(stderr, "calloc: %s\n", strerror(errno));
    463			exit(1);
    464		}
    465		if (current_reorder != NULL) {
    466			memcpy(test[testlen - 1].reorder, current_reorder,
    467			       current_reorder_len *
    468			               sizeof(*(test[testlen - 1].reorder)));
    469		}
    470		test[testlen - 1].reorderlen = current_reorder_len;
    471
    472		if (current_level_len != test[testlen - 1].cplen) {
    473			fprintf(stderr,
    474			        "mismatch between string and level lengths.\n");
    475			exit(1);
    476		}
    477
    478		/* parse paragraph-level-bitset */
    479		if (strlen(field[1]) != 1) {
    480			fprintf(stderr, "malformed paragraph-level-bitset.\n");
    481			exit(1);
    482		} else if (field[1][0] == '2') {
    483			test[testlen - 1].mode[0] =
    484				GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
    485			test[testlen - 1].modelen = 1;
    486		} else if (field[1][0] == '3') {
    487			/* auto=0 and LTR=1 */
    488			test[testlen - 1].mode[0] =
    489				GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
    490			test[testlen - 1].mode[1] =
    491				GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
    492			test[testlen - 1].modelen = 2;
    493		} else if (field[1][0] == '4') {
    494			test[testlen - 1].mode[0] =
    495				GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
    496			test[testlen - 1].modelen = 1;
    497		} else if (field[1][0] == '5') {
    498			test[testlen - 1].mode[0] =
    499				GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
    500			test[testlen - 1].mode[1] =
    501				GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
    502			test[testlen - 1].modelen = 2;
    503		} else if (field[1][0] == '7') {
    504			test[testlen - 1].mode[0] =
    505				GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
    506			test[testlen - 1].mode[1] =
    507				GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
    508			test[testlen - 1].mode[2] =
    509				GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
    510			test[testlen - 1].modelen = 3;
    511		} else {
    512			fprintf(stderr,
    513			        "unhandled paragraph-level-bitset %s.\n",
    514			        field[1]);
    515			exit(1);
    516		}
    517
    518		/* the resolved paragraph level is always neutral as the test
    519		 * file does not specify it */
    520		test[testlen - 1].resolved =
    521			GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
    522	}
    523
    524	return 0;
    525}
    526
    527static int
    528character_test_callback(const char *file, char **field, size_t nfields,
    529                        char *comment, void *payload)
    530{
    531	size_t tmp;
    532
    533	(void)file;
    534	(void)comment;
    535	(void)payload;
    536
    537	if (nfields < 5) {
    538		/* discard any line that does not have at least 5 fields */
    539		return 0;
    540	}
    541
    542	/* extend test array */
    543	if (!(test = realloc(test, (++testlen) * sizeof(*test)))) {
    544		fprintf(stderr, "realloc: %s\n", strerror(errno));
    545		exit(1);
    546	}
    547
    548	/* parse field data */
    549	parse_cp_list(field[0], &(test[testlen - 1].cp),
    550	              &(test[testlen - 1].cplen));
    551	parse_level_list(field[3], &(test[testlen - 1].level), &tmp);
    552	parse_reorder_list(field[4], &(test[testlen - 1].reorder),
    553	                   &(test[testlen - 1].reorderlen));
    554
    555	/* parse paragraph-level-mode */
    556	if (strlen(field[1]) != 1) {
    557		fprintf(stderr, "malformed paragraph-level-setting.\n");
    558		exit(1);
    559	} else if (field[1][0] == '0') {
    560		test[testlen - 1].mode[0] =
    561			GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
    562	} else if (field[1][0] == '1') {
    563		test[testlen - 1].mode[0] =
    564			GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
    565	} else if (field[1][0] == '2') {
    566		test[testlen - 1].mode[0] =
    567			GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
    568	} else {
    569		fprintf(stderr, "unhandled paragraph-level-setting.\n");
    570		exit(1);
    571	}
    572	test[testlen - 1].modelen = 1;
    573
    574	/* parse resolved paragraph level */
    575	if (strlen(field[2]) != 1) {
    576		fprintf(stderr, "malformed resolved paragraph level.\n");
    577		exit(1);
    578	} else if (field[2][0] == '0') {
    579		test[testlen - 1].resolved =
    580			GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
    581	} else if (field[2][0] == '1') {
    582		test[testlen - 1].resolved =
    583			GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
    584	} else {
    585		fprintf(stderr, "unhandled resolved paragraph level.\n");
    586		exit(1);
    587	}
    588
    589	if (tmp != test[testlen - 1].cplen) {
    590		fprintf(stderr, "mismatch between string and level lengths.\n");
    591		exit(1);
    592	}
    593
    594	return 0;
    595}
    596
    597int
    598main(int argc, char *argv[])
    599{
    600	(void)argc;
    601
    602	parse_file_with_callback("data/BidiTest.txt", test_callback, NULL);
    603	parse_file_with_callback("data/BidiCharacterTest.txt",
    604	                         character_test_callback, NULL);
    605	bidirectional_test_list_print(test, testlen, "bidirectional_test",
    606	                              argv[0]);
    607
    608	return 0;
    609}