libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

case.c (8442B)


      1/* See LICENSE file for copyright and license details. */
      2#include <errno.h>
      3#include <stdint.h>
      4#include <stdio.h>
      5#include <stdlib.h>
      6#include <string.h>
      7
      8#include "util.h"
      9
     10#define FILE_DCP "data/DerivedCoreProperties.txt"
     11
     12static const struct property_spec case_property[] = {
     13	{
     14		.enumname = "OTHER",
     15		.file = NULL,
     16		.ucdname = NULL,
     17	},
     18	{
     19		.enumname = "BOTH_CASED_CASE_IGNORABLE",
     20		.file = NULL,
     21		.ucdname = NULL,
     22	},
     23	{
     24		.enumname = "CASED",
     25		.file = FILE_DCP,
     26		.ucdname = "Cased",
     27	},
     28	{
     29		.enumname = "CASE_IGNORABLE",
     30		.file = FILE_DCP,
     31		.ucdname = "Case_Ignorable",
     32	},
     33	{
     34		.enumname = "UNCASED",
     35		.file = FILE_DCP,
     36		.ucdname = "Uncased",
     37	},
     38};
     39
     40static uint_least8_t
     41handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
     42{
     43	uint_least8_t result;
     44
     45	(void)cp;
     46
     47	if ((!strcmp(case_property[prop1].enumname, "CASED") &&
     48	     !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
     49	    (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
     50	     !strcmp(case_property[prop2].enumname, "CASED"))) {
     51		for (result = 0; result < LEN(case_property); result++) {
     52			if (!strcmp(case_property[result].enumname,
     53			            "BOTH_CASED_CASE_IGNORABLE")) {
     54				break;
     55			}
     56		}
     57		if (result == LEN(case_property)) {
     58			fprintf(stderr, "handle_conflict: Internal error.\n");
     59			exit(1);
     60		}
     61	} else {
     62		fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
     63		exit(1);
     64	}
     65
     66	return result;
     67}
     68
     69static struct properties *prop_upper = NULL, *prop_lower, *prop_title;
     70
     71static struct special_case {
     72	struct {
     73		uint_least32_t *cp;
     74		size_t cplen;
     75	} upper, lower, title;
     76} *sc = NULL;
     77
     78static size_t sclen = 0;
     79
     80static int
     81unicodedata_callback(const char *file, char **field, size_t nfields,
     82                     char *comment, void *payload)
     83{
     84	uint_least32_t cp, upper, lower, title;
     85
     86	(void)file;
     87	(void)comment;
     88	(void)payload;
     89
     90	hextocp(field[0], strlen(field[0]), &cp);
     91
     92	upper = lower = title = cp;
     93
     94	if ((strlen(field[12]) > 0 &&
     95	     hextocp(field[12], strlen(field[12]), &upper)) ||
     96	    (strlen(field[13]) > 0 &&
     97	     hextocp(field[13], strlen(field[13]), &lower)) ||
     98	    (nfields >= 15 && strlen(field[14]) > 0 &&
     99	     hextocp(field[14], strlen(field[14]), &title))) {
    100		return 1;
    101	}
    102
    103	prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
    104	prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
    105	prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;
    106
    107	return 0;
    108}
    109
    110static int
    111specialcasing_callback(const char *file, char **field, size_t nfields,
    112                       char *comment, void *payload)
    113{
    114	uint_least32_t cp;
    115
    116	(void)file;
    117	(void)comment;
    118	(void)payload;
    119
    120	if (nfields > 4 && strlen(field[4]) > 0) {
    121		/*
    122		 * we have more than 4 fields, i.e. the rule has a
    123		 * condition (language-sensitive, etc.) and is discarded
    124		 */
    125		return 0;
    126	}
    127
    128	/* parse affected codepoint */
    129	hextocp(field[0], strlen(field[0]), &cp);
    130
    131	/* extend special case array */
    132	if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) {
    133		fprintf(stderr, "realloc: %s\n", strerror(errno));
    134		exit(1);
    135	}
    136
    137	/* parse field data */
    138	parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
    139	              &(sc[sclen - 1].upper.cplen));
    140	parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
    141	              &(sc[sclen - 1].lower.cplen));
    142	parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
    143	              &(sc[sclen - 1].title.cplen));
    144
    145	/*
    146	 * overwrite value in "single mapping" property table by the
    147	 * special value 0x110000 + (offset in special case array),
    148	 * even if the special case has length 1
    149	 */
    150	prop_upper[cp].property =
    151		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
    152	prop_lower[cp].property =
    153		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
    154	prop_title[cp].property =
    155		(int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
    156
    157	return 0;
    158}
    159
    160static int_least64_t
    161get_value(const struct properties *prop, size_t offset)
    162{
    163	return prop[offset].property;
    164}
    165
    166int
    167main(int argc, char *argv[])
    168{
    169	struct properties_compressed comp_upper, comp_lower, comp_title;
    170	struct properties_major_minor mm_upper, mm_lower, mm_title;
    171	size_t i, j;
    172
    173	(void)argc;
    174
    175	/* generate case property table from the specification */
    176	properties_generate_break_property(case_property, LEN(case_property),
    177	                                   NULL, handle_conflict, NULL, "case",
    178	                                   argv[0]);
    179
    180	/*
    181	 * allocate property buffers for all 0x110000 codepoints
    182	 *
    183	 * the buffers contain the offset from the "base" character
    184	 * to the respective case mapping. By callocing we set all fields
    185	 * to zero, which is also the Unicode "default" in the sense that
    186	 * there is no case mapping by default (unless we fill it in)
    187	 */
    188	if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
    189	    !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
    190	    !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
    191		fprintf(stderr, "calloc: %s\n", strerror(errno));
    192		exit(1);
    193	}
    194	parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
    195	                         NULL);
    196	parse_file_with_callback("data/SpecialCasing.txt",
    197	                         specialcasing_callback, NULL);
    198
    199	/* compress properties */
    200	properties_compress(prop_upper, &comp_upper);
    201	properties_compress(prop_lower, &comp_lower);
    202	properties_compress(prop_title, &comp_title);
    203
    204	fprintf(stderr,
    205	        "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, "
    206	        "title=%.2f%%\n",
    207	        argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
    208	        properties_get_major_minor(&comp_lower, &mm_lower),
    209	        properties_get_major_minor(&comp_title, &mm_title));
    210
    211	/* print tables */
    212	printf("/* Automatically generated by %s */\n#include "
    213	       "<stdint.h>\n#include <stddef.h>\n\n",
    214	       argv[0]);
    215
    216	printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t "
    217	       "cplen;\n};\n\n");
    218
    219	properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
    220	printf("\n");
    221	properties_print_derived_lookup_table("upper_minor", mm_upper.minor,
    222	                                      mm_upper.minorlen, get_value,
    223	                                      comp_upper.data);
    224	printf("\n");
    225	properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
    226	printf("\n");
    227	properties_print_derived_lookup_table("lower_minor", mm_lower.minor,
    228	                                      mm_lower.minorlen, get_value,
    229	                                      comp_lower.data);
    230	printf("\n");
    231	properties_print_lookup_table("title_major", mm_title.major, 0x1100);
    232	printf("\n");
    233	properties_print_derived_lookup_table("title_minor", mm_title.minor,
    234	                                      mm_title.minorlen, get_value,
    235	                                      comp_title.data);
    236	printf("\n");
    237
    238	printf("static const struct special_case upper_special[] = {\n");
    239	for (i = 0; i < sclen; i++) {
    240		printf("\t{\n");
    241
    242		printf("\t\t.cp     = (uint_least32_t[]){");
    243		for (j = 0; j < sc[i].upper.cplen; j++) {
    244			printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]);
    245			if (j + 1 < sc[i].upper.cplen) {
    246				putchar(',');
    247			}
    248		}
    249		printf(" },\n");
    250		printf("\t\t.cplen  = %zu,\n", sc[i].upper.cplen);
    251		printf("\t},\n");
    252	}
    253	printf("};\n\n");
    254
    255	printf("static const struct special_case lower_special[] = {\n");
    256	for (i = 0; i < sclen; i++) {
    257		printf("\t{\n");
    258
    259		printf("\t\t.cp     = (uint_least32_t[]){");
    260		for (j = 0; j < sc[i].lower.cplen; j++) {
    261			printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]);
    262			if (j + 1 < sc[i].lower.cplen) {
    263				putchar(',');
    264			}
    265		}
    266		printf(" },\n");
    267		printf("\t\t.cplen  = %zu,\n", sc[i].lower.cplen);
    268		printf("\t},\n");
    269	}
    270	printf("};\n\n");
    271
    272	printf("static const struct special_case title_special[] = {\n");
    273	for (i = 0; i < sclen; i++) {
    274		printf("\t{\n");
    275
    276		printf("\t\t.cp     = (uint_least32_t[]){");
    277		for (j = 0; j < sc[i].title.cplen; j++) {
    278			printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]);
    279			if (j + 1 < sc[i].title.cplen) {
    280				putchar(',');
    281			}
    282		}
    283		printf(" },\n");
    284		printf("\t\t.cplen  = %zu,\n", sc[i].title.cplen);
    285		printf("\t},\n");
    286	}
    287	printf("};\n\n");
    288
    289	free(comp_lower.data);
    290	free(comp_lower.offset);
    291	free(comp_title.data);
    292	free(comp_title.offset);
    293	free(comp_upper.data);
    294	free(comp_upper.offset);
    295	free(mm_lower.major);
    296	free(mm_lower.minor);
    297	free(mm_title.major);
    298	free(mm_title.minor);
    299	free(mm_upper.major);
    300	free(mm_upper.minor);
    301
    302	return 0;
    303}