libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

bidirectional.c (12515B)


      1/* See LICENSE file for copyright and license details. */
      2#include <errno.h>
      3#include <inttypes.h>
      4#include <stddef.h>
      5#include <stdio.h>
      6#include <stdlib.h>
      7#include <string.h>
      8
      9#include "util.h"
     10
     11#define FILE_BIDI_BRACKETS  "data/BidiBrackets.txt"
     12#define FILE_BIDI_CLASS     "data/DerivedBidiClass.txt"
     13#define FILE_BIDI_MIRRORING "data/BidiMirroring.txt"
     14#define FILE_UNICODE_DATA   "data/UnicodeData.txt"
     15
     16#define NUM_BRACKET_ALIASES 20
     17
     18static const struct property_spec bidi_property[] = {
     19	{
     20		/* default */
     21		.enumname = "L",
     22		.file = FILE_BIDI_CLASS,
     23		.ucdname = "L",
     24	},
     25	{
     26		.enumname = "AL",
     27		.file = FILE_BIDI_CLASS,
     28		.ucdname = "AL",
     29	},
     30	{
     31		.enumname = "AN",
     32		.file = FILE_BIDI_CLASS,
     33		.ucdname = "AN",
     34	},
     35	{
     36		.enumname = "B",
     37		.file = FILE_BIDI_CLASS,
     38		.ucdname = "B",
     39	},
     40	{
     41		.enumname = "BN",
     42		.file = FILE_BIDI_CLASS,
     43		.ucdname = "BN",
     44	},
     45	{
     46		.enumname = "CS",
     47		.file = FILE_BIDI_CLASS,
     48		.ucdname = "CS",
     49	},
     50	{
     51		.enumname = "EN",
     52		.file = FILE_BIDI_CLASS,
     53		.ucdname = "EN",
     54	},
     55	{
     56		.enumname = "ES",
     57		.file = FILE_BIDI_CLASS,
     58		.ucdname = "ES",
     59	},
     60	{
     61		.enumname = "ET",
     62		.file = FILE_BIDI_CLASS,
     63		.ucdname = "ET",
     64	},
     65	{
     66		.enumname = "FSI",
     67		.file = FILE_BIDI_CLASS,
     68		.ucdname = "FSI",
     69	},
     70	{
     71		.enumname = "LRE",
     72		.file = FILE_BIDI_CLASS,
     73		.ucdname = "LRE",
     74	},
     75	{
     76		.enumname = "LRI",
     77		.file = FILE_BIDI_CLASS,
     78		.ucdname = "LRI",
     79	},
     80	{
     81		.enumname = "LRO",
     82		.file = FILE_BIDI_CLASS,
     83		.ucdname = "LRO",
     84	},
     85	{
     86		.enumname = "NSM",
     87		.file = FILE_BIDI_CLASS,
     88		.ucdname = "NSM",
     89	},
     90	{
     91		.enumname = "ON",
     92		.file = FILE_BIDI_CLASS,
     93		.ucdname = "ON",
     94	},
     95	{
     96		.enumname = "PDF",
     97		.file = FILE_BIDI_CLASS,
     98		.ucdname = "PDF",
     99	},
    100	{
    101		.enumname = "PDI",
    102		.file = FILE_BIDI_CLASS,
    103		.ucdname = "PDI",
    104	},
    105	{
    106		.enumname = "R",
    107		.file = FILE_BIDI_CLASS,
    108		.ucdname = "R",
    109	},
    110	{
    111		.enumname = "RLE",
    112		.file = FILE_BIDI_CLASS,
    113		.ucdname = "RLE",
    114	},
    115	{
    116		.enumname = "RLI",
    117		.file = FILE_BIDI_CLASS,
    118		.ucdname = "RLI",
    119	},
    120	{
    121		.enumname = "RLO",
    122		.file = FILE_BIDI_CLASS,
    123		.ucdname = "RLO",
    124	},
    125	{
    126		.enumname = "S",
    127		.file = FILE_BIDI_CLASS,
    128		.ucdname = "S",
    129	},
    130	{
    131		.enumname = "WS",
    132		.file = FILE_BIDI_CLASS,
    133		.ucdname = "WS",
    134	},
    135};
    136
    137struct decomposition_payload {
    138	uint_least32_t cp;
    139	uint_least32_t decomposition;
    140};
    141
    142static int
    143decomposition_callback(const char *file, char **field, size_t nfields,
    144                       char *comment, void *payload)
    145{
    146	char *p;
    147	struct decomposition_payload *decomp =
    148		(struct decomposition_payload *)payload;
    149	uint_least32_t cp;
    150
    151	(void)file;
    152	(void)comment;
    153
    154	if (nfields < 6) {
    155		/* we have fewer than 6 fields, discard the line */
    156		return 0;
    157	}
    158
    159	hextocp(field[0], strlen(field[0]), &cp);
    160
    161	if (decomp->cp == cp) {
    162		/* we hit the line that contains our decomposition target */
    163		if (strlen(field[5]) > 0) {
    164			p = field[5];
    165			if (*p == '<') {
    166				/*
    167				 * the decomposition contains some metadata
    168				 * <...> we skip
    169				 */
    170				for (; *p != '\0'; p++) {
    171					if (*p == '>') {
    172						p++;
    173						while (*p == ' ') {
    174							p++;
    175						}
    176						break;
    177					}
    178				}
    179			}
    180			hextocp(p, strlen(p), &(decomp->decomposition));
    181		} else {
    182			decomp->decomposition = decomp->cp;
    183		}
    184	}
    185
    186	return 0;
    187}
    188
    189static struct {
    190	uint_least32_t base[NUM_BRACKET_ALIASES];
    191	size_t baselen;
    192	uint_least32_t pair[NUM_BRACKET_ALIASES];
    193	size_t pairlen;
    194	uint_least8_t class;
    195	char type;
    196} *b = NULL;
    197
    198static size_t blen;
    199static uint_least8_t bracket_class_count = 1;
    200
    201static int
    202bracket_callback(const char *file, char **field, size_t nfields, char *comment,
    203                 void *payload)
    204{
    205	size_t i, j;
    206	struct decomposition_payload decomp_base, decomp_pair;
    207	uint_least32_t cp_base, cp_pair;
    208
    209	(void)file;
    210	(void)comment;
    211	(void)payload;
    212
    213	if (nfields < 3) {
    214		/* we have fewer than 3 fields, discard the line */
    215		return 0;
    216	}
    217
    218	/* parse field data */
    219	hextocp(field[0], strlen(field[0]), &cp_base);
    220	hextocp(field[1], strlen(field[1]), &cp_pair);
    221
    222	/* determine decomposition of the base and pair codepoints */
    223	decomp_base.cp = cp_base;
    224	decomp_pair.cp = cp_pair;
    225	parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
    226	                         &decomp_base);
    227	parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
    228	                         &decomp_pair);
    229
    230	/*
    231	 * check if we already have the canonical form in the bracket array,
    232	 * per convention the canonical form is the first element of the alias
    233	 * array
    234	 */
    235	for (i = 0; i < blen; i++) {
    236		if (decomp_base.decomposition == b[i].base[0]) {
    237			/* we have a match, check type */
    238			if (strlen(field[2]) != 1 ||
    239			    (field[2][0] != 'o' && field[2][0] != 'c')) {
    240				/* malformed line */
    241				return 1;
    242			} else if (b[i].type != field[2][0]) {
    243				/* mismatching types */
    244				return 1;
    245			}
    246
    247			/*
    248			 * add our base alias to the base array unless it isn't
    249			 * already in it
    250			 */
    251			for (j = 0; j < b[i].baselen; j++) {
    252				if (cp_base == b[i].base[j]) {
    253					/* already in array, do nothing */
    254					break;
    255				}
    256			}
    257			if (j == b[i].baselen) {
    258				/*
    259				 * the base alias is not already in the array,
    260				 * add it
    261				 */
    262				if (b[i].baselen == NUM_BRACKET_ALIASES) {
    263					fprintf(stderr, "too many aliases\n");
    264					return 1;
    265				}
    266				b[i].baselen++;
    267				b[i].base[b[i].baselen - 1] = cp_base;
    268			}
    269
    270			/*
    271			 * also add our pair alias to the pair array unless
    272			 * it isn't already in it
    273			 */
    274			for (j = 0; j < b[i].pairlen; j++) {
    275				if (cp_pair == b[i].pair[j]) {
    276					/* already in array, do nothing */
    277					break;
    278				}
    279			}
    280			if (j == b[i].pairlen) {
    281				/*
    282				 * the pair alias is not already in the array,
    283				 * add it
    284				 */
    285				if (b[i].pairlen == NUM_BRACKET_ALIASES) {
    286					fprintf(stderr, "too many aliases\n");
    287					return 1;
    288				}
    289				b[i].pairlen++;
    290				b[i].pair[b[i].pairlen - 1] = cp_pair;
    291			}
    292
    293			return 0;
    294		}
    295	}
    296
    297	/* extend bracket pair array, as this is a new bracket type */
    298	if (!(b = realloc(b, (++blen) * sizeof(*b)))) {
    299		fprintf(stderr, "realloc: %s\n", strerror(errno));
    300		exit(1);
    301	}
    302
    303	/* fill field data by adding the canonical form first */
    304	b[blen - 1].base[0] = decomp_base.decomposition;
    305	b[blen - 1].baselen = 1;
    306	b[blen - 1].pair[0] = decomp_pair.decomposition;
    307	b[blen - 1].pairlen = 1;
    308
    309	/* add alias if it differs from the canonical form */
    310	if (cp_base != decomp_base.decomposition) {
    311		b[blen - 1].base[1] = cp_base;
    312		b[blen - 1].baselen = 2;
    313	}
    314	if (cp_pair != decomp_pair.decomposition) {
    315		b[blen - 1].pair[1] = cp_pair;
    316		b[blen - 1].pairlen = 2;
    317	}
    318
    319	/* add bracket type */
    320	if (strlen(field[2]) != 1 ||
    321	    (field[2][0] != 'o' && field[2][0] != 'c')) {
    322		/* malformed line */
    323		return 1;
    324	} else {
    325		b[blen - 1].type = field[2][0];
    326	}
    327
    328	/*
    329	 * determine bracket class by iterating over the bracket-array
    330	 * and seeing if our current canonical cp already has a matching pair.
    331	 * We only need to check the first entry in each bracket alias
    332	 * list, as this is, per convention, the canonical form.
    333	 * If not, add a new class.
    334	 */
    335	for (i = 0; i + 1 < blen; i++) {
    336		if (b[i].pair[0] == b[blen - 1].base[0]) {
    337			/* matched class */
    338			b[blen - 1].class = b[i].class;
    339			break;
    340		}
    341	}
    342	if (i + 1 == blen) {
    343		/* no match, assign a new class */
    344		b[blen - 1].class = bracket_class_count++;
    345	}
    346
    347	return 0;
    348}
    349
    350static void
    351post_process(struct properties *prop)
    352{
    353	size_t i, j;
    354
    355	for (i = 0; i < blen; i++) {
    356		/*
    357		 * given the base property fits in 5 bits, we simply
    358		 * store the bracket-offset in the bits above that.
    359		 *
    360		 * All those properties that are not set here implicitly
    361		 * have offset 0, which we prepared to contain a stub
    362		 * for a character that is not a bracket.
    363		 */
    364		for (j = 0; j < b[i].baselen; j++) {
    365			prop[b[i].base[j]].property |= (i << 5);
    366		}
    367	}
    368}
    369
    370static uint_least8_t
    371fill_missing(uint_least32_t cp)
    372{
    373	/* based on the @missing-properties in data/DerivedBidiClass.txt */
    374	if ((cp >= UINT32_C(0x0590) && cp <= UINT32_C(0x05FF)) ||
    375	    (cp >= UINT32_C(0x07C0) && cp <= UINT32_C(0x085F)) ||
    376	    (cp >= UINT32_C(0xFB1D) && cp <= UINT32_C(0xFB4F)) ||
    377	    (cp >= UINT32_C(0x10800) && cp <= UINT32_C(0x10CFF)) ||
    378	    (cp >= UINT32_C(0x10D40) && cp <= UINT32_C(0x10EBF)) ||
    379	    (cp >= UINT32_C(0x10F00) && cp <= UINT32_C(0x10F2F)) ||
    380	    (cp >= UINT32_C(0x10F70) && cp <= UINT32_C(0x10FFF)) ||
    381	    (cp >= UINT32_C(0x1E800) && cp <= UINT32_C(0x1EC6F)) ||
    382	    (cp >= UINT32_C(0x1ECC0) && cp <= UINT32_C(0x1ECFF)) ||
    383	    (cp >= UINT32_C(0x1ED50) && cp <= UINT32_C(0x1EDFF)) ||
    384	    (cp >= UINT32_C(0x1EF00) && cp <= UINT32_C(0x1EFFF))) {
    385		return 17; /* class R */
    386	} else if ((cp >= UINT32_C(0x0600) && cp <= UINT32_C(0x07BF)) ||
    387	           (cp >= UINT32_C(0x0860) && cp <= UINT32_C(0x08FF)) ||
    388	           (cp >= UINT32_C(0xFB50) && cp <= UINT32_C(0xFDCF)) ||
    389	           (cp >= UINT32_C(0xFDF0) && cp <= UINT32_C(0xFDFF)) ||
    390	           (cp >= UINT32_C(0xFE70) && cp <= UINT32_C(0xFEFF)) ||
    391	           (cp >= UINT32_C(0x10D00) && cp <= UINT32_C(0x10D3F)) ||
    392	           (cp >= UINT32_C(0x10EC0) && cp <= UINT32_C(0x10EFF)) ||
    393	           (cp >= UINT32_C(0x10F30) && cp <= UINT32_C(0x10F6F)) ||
    394	           (cp >= UINT32_C(0x1EC70) && cp <= UINT32_C(0x1ECBF)) ||
    395	           (cp >= UINT32_C(0x1ED00) && cp <= UINT32_C(0x1ED4F)) ||
    396	           (cp >= UINT32_C(0x1EE00) && cp <= UINT32_C(0x1EEFF))) {
    397		return 1; /* class AL */
    398	} else if (cp >= UINT32_C(0x20A0) && cp <= UINT32_C(0x20CF)) {
    399		return 8; /* class ET */
    400	} else {
    401		return 0; /* class L */
    402	}
    403}
    404
    405static struct properties *prop_mirror = NULL;
    406
    407static int
    408mirror_callback(const char *file, char **field, size_t nfields, char *comment,
    409                void *payload)
    410{
    411	uint_least32_t cp, cp_mirror;
    412
    413	(void)file;
    414	(void)comment;
    415	(void)payload;
    416
    417	hextocp(field[0], strlen(field[0]), &cp);
    418
    419	cp_mirror = cp;
    420
    421	if (nfields >= 2 && strlen(field[1]) > 0 &&
    422	    hextocp(field[1], strlen(field[1]), &cp_mirror)) {
    423		return 1;
    424	}
    425
    426	prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least32_t)cp;
    427
    428	return 0;
    429}
    430
    431static int_least64_t
    432get_value(const struct properties *prop, size_t offset)
    433{
    434	return prop[offset].property;
    435}
    436
    437int
    438main(int argc, char *argv[])
    439{
    440	struct properties_compressed comp_mirror;
    441	struct properties_major_minor mm_mirror;
    442	size_t i;
    443
    444	(void)argc;
    445
    446	/*
    447	 * the first element in the bracket array is initialized to
    448	 * all-zeros, as we use the implicit 0-offset for all those
    449	 * codepoints that are not a bracket
    450	 */
    451	if (!(b = calloc((blen = 1), sizeof(*b)))) {
    452		fprintf(stderr, "calloc: %s\n", strerror(errno));
    453		exit(1);
    454	}
    455	parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, NULL);
    456
    457	properties_generate_break_property(bidi_property, LEN(bidi_property),
    458	                                   fill_missing, NULL, post_process,
    459	                                   "bidi", argv[0]);
    460
    461	printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t"
    462	       "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n"
    463	       "static const struct bracket {\n\tenum bracket_type type;\n"
    464	       "\tuint_least8_t class;\n} bidi_bracket[] = {\n");
    465	for (i = 0; i < blen; i++) {
    466		printf("\t{\n\t\t.type = %s,\n\t\t.class = "
    467		       "%" PRIuLEAST8 ",\n\t},\n",
    468		       (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" :
    469		       (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" :
    470		                            "BIDI_BRACKET_NONE",
    471		       b[i].class);
    472	}
    473	printf("};\n");
    474
    475	/*
    476	 * allocate property buffer for all 0x110000 codepoints
    477	 *
    478	 * the buffers contain the offset from the "base" character
    479	 * to the respective mirrored character. By callocing we set all
    480	 * fields to zero, which is also the Unicode "default" in the sense
    481	 * that the coe point is its mirror (unless we fill it in)
    482	 */
    483	if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirror)))) {
    484		fprintf(stderr, "calloc: %s\n", strerror(errno));
    485		exit(1);
    486	}
    487	parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, NULL);
    488
    489	/* compress properties */
    490	properties_compress(prop_mirror, &comp_mirror);
    491
    492	fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", argv[0],
    493	        properties_get_major_minor(&comp_mirror, &mm_mirror));
    494
    495	/* print tables */
    496	properties_print_lookup_table("mirror_major", mm_mirror.major, 0x1100);
    497	printf("\n");
    498	properties_print_derived_lookup_table("mirror_minor", mm_mirror.minor,
    499	                                      mm_mirror.minorlen, get_value,
    500	                                      comp_mirror.data);
    501
    502	free(comp_mirror.data);
    503	free(comp_mirror.offset);
    504	free(mm_mirror.major);
    505	free(mm_mirror.minor);
    506
    507	return 0;
    508}