libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

util.c (21349B)


      1/* See LICENSE file for copyright and license details. */
      2#include <ctype.h>
      3#include <errno.h>
      4#include <inttypes.h>
      5#include <stdbool.h>
      6#include <stddef.h>
      7#include <stdint.h>
      8#include <stdio.h>
      9#include <stdlib.h>
     10#include <string.h>
     11
     12#include "util.h"
     13
     14struct range {
     15	uint_least32_t lower;
     16	uint_least32_t upper;
     17};
     18
     19struct properties_payload {
     20	struct properties *prop;
     21	const struct property_spec *spec;
     22	uint_least8_t speclen;
     23	int (*set_value)(struct properties_payload *, uint_least32_t,
     24	                 int_least64_t);
     25	uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t,
     26	                                 uint_least8_t);
     27};
     28
     29struct break_test_payload {
     30	struct break_test **test;
     31	size_t *testlen;
     32};
     33
     34static void *
     35reallocate_array(void *p, size_t len, size_t size)
     36{
     37	if (len > 0 && size > SIZE_MAX / len) {
     38		errno = ENOMEM;
     39		return NULL;
     40	}
     41
     42	return realloc(p, len * size);
     43}
     44
     45int
     46hextocp(const char *str, size_t len, uint_least32_t *cp)
     47{
     48	size_t i;
     49	int off;
     50	char relative;
     51
     52	/* the maximum valid codepoint is 0x10FFFF */
     53	if (len > 6) {
     54		fprintf(stderr, "hextocp: '%.*s' is too long.\n", (int)len,
     55		        str);
     56		return 1;
     57	}
     58
     59	for (i = 0, *cp = 0; i < len; i++) {
     60		if (str[i] >= '0' && str[i] <= '9') {
     61			relative = '0';
     62			off = 0;
     63		} else if (str[i] >= 'a' && str[i] <= 'f') {
     64			relative = 'a';
     65			off = 10;
     66		} else if (str[i] >= 'A' && str[i] <= 'F') {
     67			relative = 'A';
     68			off = 10;
     69		} else {
     70			fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n",
     71			        (int)len, str);
     72			return 1;
     73		}
     74
     75		*cp += ((uint_least32_t)1 << (4 * (len - i - 1))) *
     76		       (uint_least32_t)(str[i] - relative + off);
     77	}
     78
     79	if (*cp > UINT32_C(0x10FFFF)) {
     80		fprintf(stderr, "hextocp: '%.*s' is too large.\n", (int)len,
     81		        str);
     82		return 1;
     83	}
     84
     85	return 0;
     86}
     87
     88int
     89parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
     90{
     91	size_t count, i;
     92	const char *tmp1 = NULL, *tmp2 = NULL;
     93
     94	if (strlen(str) == 0) {
     95		*cp = NULL;
     96		*cplen = 0;
     97		return 0;
     98	}
     99
    100	/* count the number of spaces in the string and infer list length */
    101	for (count = 1, tmp1 = str; (tmp2 = strchr(tmp1, ' ')) != NULL;
    102	     count++, tmp1 = tmp2 + 1) {
    103		;
    104	}
    105
    106	/* allocate resources */
    107	if (!(*cp = calloc((*cplen = count), sizeof(**cp)))) {
    108		fprintf(stderr, "calloc: %s\n", strerror(errno));
    109		exit(1);
    110	}
    111
    112	/* go through the string again, parsing the numbers */
    113	for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++) {
    114		tmp2 = strchr(tmp1, ' ');
    115		if (hextocp(tmp1, tmp2 ? (size_t)(tmp2 - tmp1) : strlen(tmp1),
    116		            &((*cp)[i]))) {
    117			return 1;
    118		}
    119		if (tmp2 != NULL) {
    120			tmp1 = tmp2 + 1;
    121		}
    122	}
    123
    124	return 0;
    125}
    126
    127static int
    128range_parse(const char *str, struct range *range)
    129{
    130	char *p;
    131
    132	if ((p = strstr(str, "..")) == NULL) {
    133		/* input has the form "XXXXXX" */
    134		if (hextocp(str, strlen(str), &range->lower)) {
    135			return 1;
    136		}
    137		range->upper = range->lower;
    138	} else {
    139		/* input has the form "XXXXXX..XXXXXX" */
    140		if (hextocp(str, (size_t)(p - str), &range->lower) ||
    141		    hextocp(p + 2, strlen(p + 2), &range->upper)) {
    142			return 1;
    143		}
    144	}
    145
    146	return 0;
    147}
    148
    149static bool
    150get_line(char **buf, size_t *bufsize, FILE *fp, size_t *len)
    151{
    152	int ret = EOF;
    153
    154	for (*len = 0;; (*len)++) {
    155		if (*len > 0 && *buf != NULL && (*buf)[*len - 1] == '\n') {
    156			/*
    157			 * if the previously read character was a newline,
    158			 * we fake an end-of-file so we NUL-terminate and
    159			 * are done.
    160			 */
    161			ret = EOF;
    162		} else {
    163			ret = fgetc(fp);
    164		}
    165
    166		if (*len >= *bufsize) {
    167			/* the buffer needs to be expanded */
    168			*bufsize += 512;
    169			if ((*buf = realloc(*buf, *bufsize)) == NULL) {
    170				fprintf(stderr, "get_line: Out of memory.\n");
    171				exit(1);
    172			}
    173		}
    174
    175		if (ret != EOF) {
    176			(*buf)[*len] = (char)ret;
    177		} else {
    178			(*buf)[*len] = '\0';
    179			break;
    180		}
    181	}
    182
    183	return *len == 0 && (feof(fp) || ferror(fp));
    184}
    185
    186void
    187parse_file_with_callback(const char *fname,
    188                         int (*callback)(const char *, char **, size_t, char *,
    189                                         void *),
    190                         void *payload)
    191{
    192	FILE *fp;
    193	char *line = NULL, **field = NULL, *comment;
    194	size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields, len;
    195
    196	/* open file */
    197	if (!(fp = fopen(fname, "r"))) {
    198		fprintf(stderr, "parse_file_with_callback: fopen '%s': %s.\n",
    199		        fname, strerror(errno));
    200		exit(1);
    201	}
    202
    203	while (!get_line(&line, &linebufsize, fp, &len)) {
    204		/* remove trailing newline */
    205		if (len > 0 && line[len - 1] == '\n') {
    206			line[len - 1] = '\0';
    207			len--;
    208		}
    209
    210		/* skip empty lines and comment lines */
    211		if (len == 0 || line[0] == '#') {
    212			continue;
    213		}
    214
    215		/* tokenize line into fields */
    216		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
    217			/* skip leading whitespace */
    218			while (line[i] == ' ') {
    219				i++;
    220			}
    221
    222			/* check if we crashed into the comment */
    223			if (line[i] != '#') {
    224				/* extend field buffer, if necessary */
    225				if (++nfields > fieldbufsize) {
    226					if ((field = realloc(
    227						     field,
    228						     nfields *
    229							     sizeof(*field))) ==
    230					    NULL) {
    231						fprintf(stderr,
    232						        "parse_file_with_"
    233						        "callback: realloc: "
    234						        "%s.\n",
    235						        strerror(errno));
    236						exit(1);
    237					}
    238					fieldbufsize = nfields;
    239				}
    240
    241				/* set current position as field start */
    242				field[nfields - 1] = &line[i];
    243
    244				/* continue until we reach ';' or '#' or end */
    245				while (line[i] != ';' && line[i] != '#' &&
    246				       line[i] != '\0') {
    247					i++;
    248				}
    249			}
    250
    251			if (line[i] == '#') {
    252				/* set comment-variable for later */
    253				comment = &line[i + 1];
    254			}
    255
    256			/* go back whitespace and terminate field there */
    257			if (i > 0) {
    258				for (j = i - 1; line[j] == ' '; j--) {
    259					;
    260				}
    261				line[j + 1] = '\0';
    262			} else {
    263				line[i] = '\0';
    264			}
    265
    266			/* if comment is set, we are done */
    267			if (comment != NULL) {
    268				break;
    269			}
    270		}
    271
    272		/* skip leading whitespace in comment */
    273		while (comment != NULL && comment[0] == ' ') {
    274			comment++;
    275		}
    276
    277		/* call callback function */
    278		if (callback(fname, field, nfields, comment, payload)) {
    279			fprintf(stderr, "parse_file_with_callback: "
    280			                "Malformed input.\n");
    281			exit(1);
    282		}
    283	}
    284
    285	/* close file */
    286	if (fclose(fp)) {
    287		fprintf(stderr, "parse_file_with_callback: fclose '%s': %s.\n",
    288		        fname, strerror(errno));
    289		exit(1);
    290	}
    291
    292	/* cleanup */
    293	free(line);
    294	free(field);
    295}
    296
    297static int
    298properties_callback(const char *file, char **field, size_t nfields,
    299                    char *comment, void *payload)
    300{
    301	/* prop always has the length 0x110000 */
    302	struct properties_payload *p = (struct properties_payload *)payload;
    303	struct range r;
    304	uint_least8_t i;
    305	uint_least32_t cp;
    306
    307	(void)comment;
    308
    309	if (nfields < 2) {
    310		return 1;
    311	}
    312
    313	for (i = 0; i < p->speclen; i++) {
    314		/* identify fitting file and identifier */
    315		if (p->spec[i].file && !strcmp(p->spec[i].file, file) &&
    316		    (!strcmp(p->spec[i].ucdname, field[1]) ||
    317		     (comment != NULL &&
    318		      !strncmp(p->spec[i].ucdname, comment,
    319		               strlen(p->spec[i].ucdname)) &&
    320		      comment[strlen(p->spec[i].ucdname)] == ' '))) {
    321			/* parse range in first field */
    322			if (range_parse(field[0], &r)) {
    323				return 1;
    324			}
    325
    326			/* apply to all codepoints in the range */
    327			for (cp = r.lower; cp <= r.upper; cp++) {
    328				if (p->set_value(payload, cp, i)) {
    329					exit(1);
    330				}
    331			}
    332			break;
    333		}
    334	}
    335
    336	return 0;
    337}
    338
    339void
    340properties_compress(const struct properties *prop,
    341                    struct properties_compressed *comp)
    342{
    343	uint_least32_t cp, i;
    344
    345	/* initialization */
    346	if (!(comp->offset = malloc((size_t)UINT32_C(0x110000) *
    347	                            sizeof(*(comp->offset))))) {
    348		fprintf(stderr, "malloc: %s\n", strerror(errno));
    349		exit(1);
    350	}
    351	comp->data = NULL;
    352	comp->datalen = 0;
    353
    354	for (cp = 0; cp < UINT32_C(0x110000); cp++) {
    355		for (i = 0; i < comp->datalen; i++) {
    356			if (!memcmp(&(prop[cp]), &(comp->data[i]),
    357			            sizeof(*prop))) {
    358				/* found a match! */
    359				comp->offset[cp] = i;
    360				break;
    361			}
    362		}
    363		if (i == comp->datalen) {
    364			/*
    365			 * found no matching properties-struct, so
    366			 * add current properties to data and add the
    367			 * offset in the offset-table
    368			 */
    369			if (!(comp->data = reallocate_array(
    370				      comp->data, ++(comp->datalen),
    371				      sizeof(*(comp->data))))) {
    372				fprintf(stderr, "reallocate_array: %s\n",
    373				        strerror(errno));
    374				exit(1);
    375			}
    376			memcpy(&(comp->data[comp->datalen - 1]), &(prop[cp]),
    377			       sizeof(*prop));
    378			comp->offset[cp] = comp->datalen - 1;
    379		}
    380	}
    381}
    382
    383double
    384properties_get_major_minor(const struct properties_compressed *comp,
    385                           struct properties_major_minor *mm)
    386{
    387	size_t i, j, compression_count = 0;
    388
    389	/*
    390	 * we currently have an array comp->offset which maps the
    391	 * codepoints 0..0x110000 to offsets into comp->data.
    392	 * To improve cache-locality instead and allow a bit of
    393	 * compressing, instead of directly mapping a codepoint
    394	 * 0xAAAABB with comp->offset, we generate two arrays major
    395	 * and minor such that
    396	 *    comp->offset(0xAAAABB) == minor[major[0xAAAA] + 0xBB]
    397	 * This yields a major-array of length 2^16 and a minor array
    398	 * of variable length depending on how many common subsequences
    399	 * can be filtered out.
    400	 */
    401
    402	/* initialize */
    403	if (!(mm->major = malloc((size_t)0x1100 * sizeof(*(mm->major))))) {
    404		fprintf(stderr, "malloc: %s\n", strerror(errno));
    405		exit(1);
    406	}
    407	mm->minor = NULL;
    408	mm->minorlen = 0;
    409
    410	for (i = 0; i < (size_t)0x1100; i++) {
    411		/*
    412		 * we now look at the cp-range (i << 8)..(i << 8 + 0xFF)
    413		 * and check if its corresponding offset-data already
    414		 * exists in minor (because then we just point there
    415		 * and need less storage)
    416		 */
    417		for (j = 0; j + 0xFF < mm->minorlen; j++) {
    418			if (!memcmp(&(comp->offset[i << 8]), &(mm->minor[j]),
    419			            sizeof(*(comp->offset)) * 0x100)) {
    420				break;
    421			}
    422		}
    423		if (j + 0xFF < mm->minorlen) {
    424			/* found an index */
    425			compression_count++;
    426			mm->major[i] = j;
    427		} else {
    428			/*
    429			 * add "new" sequence to minor and point to it
    430			 * in major
    431			 */
    432			mm->minorlen += 0x100;
    433			if (!(mm->minor =
    434			              reallocate_array(mm->minor, mm->minorlen,
    435			                               sizeof(*(mm->minor))))) {
    436				fprintf(stderr, "reallocate_array: %s\n",
    437				        strerror(errno));
    438				exit(1);
    439			}
    440			memcpy(&(mm->minor[mm->minorlen - 0x100]),
    441			       &(comp->offset[i << 8]),
    442			       sizeof(*(mm->minor)) * 0x100);
    443			mm->major[i] = mm->minorlen - 0x100;
    444		}
    445	}
    446
    447	/* return compression ratio */
    448	return (double)compression_count / 0x1100 * 100;
    449}
    450
    451void
    452properties_print_lookup_table(const char *name, const size_t *data,
    453                              size_t datalen)
    454{
    455	const char *type;
    456	size_t i, maxval;
    457
    458	for (i = 0, maxval = 0; i < datalen; i++) {
    459		if (data[i] > maxval) {
    460			maxval = data[i];
    461		}
    462	}
    463
    464	type = (maxval <= UINT_LEAST8_MAX)  ? "uint_least8_t" :
    465	       (maxval <= UINT_LEAST16_MAX) ? "uint_least16_t" :
    466	       (maxval <= UINT_LEAST32_MAX) ? "uint_least32_t" :
    467	                                      "uint_least64_t";
    468
    469	printf("static const %s %s[] = {\n\t", type, name);
    470	for (i = 0; i < datalen; i++) {
    471		printf("%zu", data[i]);
    472		if (i + 1 == datalen) {
    473			printf("\n");
    474		} else if ((i + 1) % 8 != 0) {
    475			printf(", ");
    476		} else {
    477			printf(",\n\t");
    478		}
    479	}
    480	printf("};\n");
    481}
    482
    483void
    484properties_print_derived_lookup_table(
    485	char *name, size_t *offset, size_t offsetlen,
    486	int_least64_t (*get_value)(const struct properties *, size_t),
    487	const void *payload)
    488{
    489	const char *type;
    490	size_t i;
    491	int_least64_t minval, maxval;
    492
    493	for (i = 0, minval = INT_LEAST64_MAX, maxval = INT_LEAST64_MIN;
    494	     i < offsetlen; i++) {
    495		if (get_value(payload, offset[i]) > maxval) {
    496			maxval = get_value(payload, offset[i]);
    497		} else if (get_value(payload, offset[i]) < minval) {
    498			minval = get_value(payload, offset[i]);
    499		}
    500	}
    501
    502	if (minval < 0) {
    503		/* we need a signed type */
    504		type = (minval >= INT_LEAST8_MIN && maxval <= INT_LEAST8_MAX) ?
    505		               "int_least8_t" :
    506		       (minval >= INT_LEAST16_MIN &&
    507		        maxval <= INT_LEAST16_MAX) ?
    508		               "int_least16_t" :
    509		       (minval >= INT_LEAST32_MIN &&
    510		        maxval <= INT_LEAST32_MAX) ?
    511		               "int_least32_t" :
    512		               "int_least64_t";
    513	} else {
    514		/* we are fine with an unsigned type */
    515		type = (maxval <= UINT_LEAST8_MAX)  ? "uint_least8_t" :
    516		       (maxval <= UINT_LEAST16_MAX) ? "uint_least16_t" :
    517		       (maxval <= UINT_LEAST32_MAX) ? "uint_least32_t" :
    518		                                      "uint_least64_t";
    519	}
    520
    521	printf("static const %s %s[] = {\n\t", type, name);
    522	for (i = 0; i < offsetlen; i++) {
    523		printf("%" PRIiLEAST64, get_value(payload, offset[i]));
    524		if (i + 1 == offsetlen) {
    525			printf("\n");
    526		} else if ((i + 1) % 8 != 0) {
    527			printf(", ");
    528		} else {
    529			printf(",\n\t");
    530		}
    531	}
    532	printf("};\n");
    533}
    534
    535static void
    536properties_print_enum(const struct property_spec *spec, size_t speclen,
    537                      const char *enumname, const char *enumprefix)
    538{
    539	size_t i;
    540
    541	printf("enum %s {\n", enumname);
    542	for (i = 0; i < speclen; i++) {
    543		printf("\t%s_%s,\n", enumprefix, spec[i].enumname);
    544	}
    545	printf("\tNUM_%sS,\n};\n\n", enumprefix);
    546}
    547
    548static int
    549set_value_bp(struct properties_payload *payload, uint_least32_t cp,
    550             int_least64_t value)
    551{
    552	if (payload->prop[cp].property != payload->speclen) {
    553		if (payload->handle_conflict == NULL) {
    554			fprintf(stderr,
    555			        "set_value_bp: "
    556			        "Unhandled character break property "
    557			        "overwrite for 0x%06X (%s <- %s).\n",
    558			        cp,
    559			        payload->spec[payload->prop[cp].property]
    560			                .enumname,
    561			        payload->spec[value].enumname);
    562			return 1;
    563		} else {
    564			value = payload->handle_conflict(
    565				cp, (uint_least8_t)payload->prop[cp].property,
    566				(uint_least8_t)value);
    567		}
    568	}
    569	payload->prop[cp].property = value;
    570
    571	return 0;
    572}
    573
    574static int_least64_t
    575get_value_bp(const struct properties *prop, size_t offset)
    576{
    577	return prop[offset].property;
    578}
    579
    580void
    581properties_generate_break_property(
    582	const struct property_spec *spec, uint_least8_t speclen,
    583	uint_least8_t (*fill_missing)(uint_least32_t),
    584	uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t,
    585                                         uint_least8_t),
    586	void (*post_process)(struct properties *), const char *prefix,
    587	const char *argv0)
    588{
    589	struct properties_compressed comp;
    590	struct properties_major_minor mm;
    591	struct properties_payload payload;
    592	struct properties *prop;
    593	size_t i, j, prefixlen = strlen(prefix);
    594	char buf1[64], prefix_uc[64], buf2[64], buf3[64], buf4[64];
    595
    596	/*
    597	 * allocate property buffer for all 0x110000 codepoints and
    598	 * initialize its entries to the known invalid value "speclen"
    599	 */
    600	if (!(prop = calloc(UINT32_C(0x110000), sizeof(*prop)))) {
    601		fprintf(stderr, "calloc: %s\n", strerror(errno));
    602		exit(1);
    603	}
    604	for (i = 0; i < UINT32_C(0x110000); i++) {
    605		prop[i].property = speclen;
    606	}
    607
    608	/* generate data */
    609	payload.prop = prop;
    610	payload.spec = spec;
    611	payload.speclen = speclen;
    612	payload.set_value = set_value_bp;
    613	payload.handle_conflict = handle_conflict;
    614
    615	/* parse each file exactly once and ignore NULL-fields */
    616	for (i = 0; i < speclen; i++) {
    617		for (j = 0; j < i; j++) {
    618			if (spec[i].file && spec[j].file &&
    619			    !strcmp(spec[i].file, spec[j].file)) {
    620				/* file has already been parsed */
    621				break;
    622			}
    623		}
    624		if (i == j && spec[i].file) {
    625			/* file has not been processed yet */
    626			parse_file_with_callback(spec[i].file,
    627			                         properties_callback, &payload);
    628		}
    629	}
    630
    631	/* fill in the missing properties that weren't explicitly given */
    632	for (i = 0; i < UINT32_C(0x110000); i++) {
    633		if (payload.prop[i].property == speclen) {
    634			if (fill_missing != NULL) {
    635				payload.prop[i].property =
    636					fill_missing((uint_least32_t)i);
    637			} else {
    638				payload.prop[i].property = 0;
    639			}
    640		}
    641	}
    642
    643	/* post-processing */
    644	if (post_process != NULL) {
    645		post_process(payload.prop);
    646	}
    647
    648	/* compress data */
    649	printf("/* Automatically generated by %s */\n#include <stdint.h>\n\n",
    650	       argv0);
    651	properties_compress(prop, &comp);
    652
    653	fprintf(stderr, "%s: %s-LUT compression-ratio: %.2f%%\n", argv0, prefix,
    654	        properties_get_major_minor(&comp, &mm));
    655
    656	/* prepare names */
    657	if ((size_t)snprintf(buf1, LEN(buf1), "%s_property", prefix) >=
    658	    LEN(buf1)) {
    659		fprintf(stderr, "snprintf: String truncated.\n");
    660		exit(1);
    661	}
    662	if (LEN(prefix_uc) + 1 < prefixlen) {
    663		fprintf(stderr, "snprintf: Buffer too small.\n");
    664		exit(1);
    665	}
    666	for (i = 0; i < prefixlen; i++) {
    667		prefix_uc[i] = (char)toupper(prefix[i]);
    668	}
    669	prefix_uc[prefixlen] = '\0';
    670	if ((size_t)snprintf(buf2, LEN(buf2), "%s_PROP", prefix_uc) >=
    671	            LEN(buf2) ||
    672	    (size_t)snprintf(buf3, LEN(buf3), "%s_major", prefix) >=
    673	            LEN(buf3) ||
    674	    (size_t)snprintf(buf4, LEN(buf4), "%s_minor", prefix) >=
    675	            LEN(buf4)) {
    676		fprintf(stderr, "snprintf: String truncated.\n");
    677		exit(1);
    678	}
    679
    680	/* print data */
    681	properties_print_enum(spec, speclen, buf1, buf2);
    682	properties_print_lookup_table(buf3, mm.major, 0x1100);
    683	printf("\n");
    684	properties_print_derived_lookup_table(buf4, mm.minor, mm.minorlen,
    685	                                      get_value_bp, comp.data);
    686
    687	/* free data */
    688	free(prop);
    689	free(comp.data);
    690	free(comp.offset);
    691	free(mm.major);
    692	free(mm.minor);
    693}
    694
    695static int
    696break_test_callback(const char *fname, char **field, size_t nfields,
    697                    char *comment, void *payload)
    698{
    699	struct break_test *t,
    700		**test = ((struct break_test_payload *)payload)->test;
    701	size_t i, *testlen = ((struct break_test_payload *)payload)->testlen,
    702		  commentlen;
    703	char *token;
    704
    705	(void)fname;
    706
    707	if (nfields < 1) {
    708		return 1;
    709	}
    710
    711	/* append new testcase and initialize with zeroes */
    712	if ((*test = realloc(*test, ++(*testlen) * sizeof(**test))) == NULL) {
    713		fprintf(stderr, "break_test_callback: realloc: %s.\n",
    714		        strerror(errno));
    715		return 1;
    716	}
    717	t = &(*test)[*testlen - 1];
    718	memset(t, 0, sizeof(*t));
    719
    720	/* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
    721	for (token = strtok(field[0], " "), i = 0; token != NULL;
    722	     i++, token = strtok(NULL, " ")) {
    723		if (i % 2 == 0) {
    724			/* delimiter or start of sequence */
    725			if (i == 0 ||
    726			    !strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
    727				/*
    728				 * '÷' indicates a breakpoint,
    729				 * the current length is done; allocate
    730				 * a new length field and set it to 0
    731				 */
    732				if ((t->len = realloc(
    733					     t->len,
    734					     ++t->lenlen * sizeof(*t->len))) ==
    735				    NULL) {
    736					fprintf(stderr,
    737					        "break_test_"
    738					        "callback: realloc: %s.\n",
    739					        strerror(errno));
    740					return 1;
    741				}
    742				t->len[t->lenlen - 1] = 0;
    743			} else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
    744				/* '×' indicates a non-breakpoint, do nothing */
    745			} else {
    746				fprintf(stderr,
    747				        "break_test_callback: "
    748				        "Malformed delimiter '%s'.\n",
    749				        token);
    750				return 1;
    751			}
    752		} else {
    753			/* add codepoint to cp-array */
    754			if ((t->cp = realloc(t->cp,
    755			                     ++t->cplen * sizeof(*t->cp))) ==
    756			    NULL) {
    757				fprintf(stderr,
    758				        "break_test_callback: "
    759				        "realloc: %s.\n",
    760				        strerror(errno));
    761				return 1;
    762			}
    763			if (hextocp(token, strlen(token),
    764			            &t->cp[t->cplen - 1])) {
    765				return 1;
    766			}
    767			if (t->lenlen > 0) {
    768				t->len[t->lenlen - 1]++;
    769			}
    770		}
    771	}
    772	if (t->lenlen > 0 && t->len[t->lenlen - 1] == 0) {
    773		/*
    774		 * we allocated one more length than we needed because
    775		 * the breakpoint was at the end
    776		 */
    777		t->lenlen--;
    778	}
    779
    780	/* store comment */
    781	if (comment != NULL) {
    782		commentlen = strlen(comment) + 1;
    783		if (((*test)[*testlen - 1].descr = malloc(commentlen)) ==
    784		    NULL) {
    785			fprintf(stderr, "break_test_callback: malloc: %s.\n",
    786			        strerror(errno));
    787			return 1;
    788		}
    789		memcpy((*test)[*testlen - 1].descr, comment, commentlen);
    790	}
    791
    792	return 0;
    793}
    794
    795void
    796break_test_list_parse(char *fname, struct break_test **test, size_t *testlen)
    797{
    798	struct break_test_payload pl = {
    799		.test = test,
    800		.testlen = testlen,
    801	};
    802	*test = NULL;
    803	*testlen = 0;
    804
    805	parse_file_with_callback(fname, break_test_callback, &pl);
    806}
    807
    808void
    809break_test_list_print(const struct break_test *test, size_t testlen,
    810                      const char *identifier, const char *progname)
    811{
    812	size_t i, j;
    813
    814	printf("/* Automatically generated by %s */\n"
    815	       "#include <stdint.h>\n#include <stddef.h>\n\n"
    816	       "#include \"../gen/types.h\"\n\n",
    817	       progname);
    818
    819	printf("static const struct break_test %s[] = {\n", identifier);
    820	for (i = 0; i < testlen; i++) {
    821		printf("\t{\n");
    822
    823		printf("\t\t.cp     = (uint_least32_t[]){");
    824		for (j = 0; j < test[i].cplen; j++) {
    825			printf(" UINT32_C(0x%06X)", test[i].cp[j]);
    826			if (j + 1 < test[i].cplen) {
    827				putchar(',');
    828			}
    829		}
    830		printf(" },\n");
    831		printf("\t\t.cplen  = %zu,\n", test[i].cplen);
    832
    833		printf("\t\t.len    = (size_t[]){");
    834		for (j = 0; j < test[i].lenlen; j++) {
    835			printf(" %zu", test[i].len[j]);
    836			if (j + 1 < test[i].lenlen) {
    837				putchar(',');
    838			}
    839		}
    840		printf(" },\n");
    841		printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
    842
    843		printf("\t\t.descr  = \"%s\",\n", test[i].descr);
    844
    845		printf("\t},\n");
    846	}
    847	printf("};\n");
    848}
    849
    850void
    851break_test_list_free(struct break_test *test, size_t testlen)
    852{
    853	size_t i;
    854
    855	for (i = 0; i < testlen; i++) {
    856		free(test[i].cp);
    857		free(test[i].len);
    858		free(test[i].descr);
    859	}
    860
    861	free(test);
    862}