libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

word.c (3038B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stdio.h>
      3#include <stdlib.h>
      4#include <string.h>
      5
      6#include "util.h"
      7
      8#define FILE_EMOJI "data/emoji-data.txt"
      9#define FILE_WORD  "data/WordBreakProperty.txt"
     10
     11static const struct property_spec word_break_property[] = {
     12	{
     13		.enumname = "OTHER",
     14		.file = NULL,
     15		.ucdname = NULL,
     16	},
     17	{
     18		.enumname = "ALETTER",
     19		.file = FILE_WORD,
     20		.ucdname = "ALetter",
     21	},
     22	{
     23		.enumname = "BOTH_ALETTER_EXTPICT",
     24		.file = NULL,
     25		.ucdname = NULL,
     26	},
     27	{
     28		.enumname = "CR",
     29		.file = FILE_WORD,
     30		.ucdname = "CR",
     31	},
     32	{
     33		.enumname = "DOUBLE_QUOTE",
     34		.file = FILE_WORD,
     35		.ucdname = "Double_Quote",
     36	},
     37	{
     38		.enumname = "EXTEND",
     39		.file = FILE_WORD,
     40		.ucdname = "Extend",
     41	},
     42	{
     43		.enumname = "EXTENDED_PICTOGRAPHIC",
     44		.file = FILE_EMOJI,
     45		.ucdname = "Extended_Pictographic",
     46	},
     47	{
     48		.enumname = "EXTENDNUMLET",
     49		.file = FILE_WORD,
     50		.ucdname = "ExtendNumLet",
     51	},
     52	{
     53		.enumname = "FORMAT",
     54		.file = FILE_WORD,
     55		.ucdname = "Format",
     56	},
     57	{
     58		.enumname = "HEBREW_LETTER",
     59		.file = FILE_WORD,
     60		.ucdname = "Hebrew_Letter",
     61	},
     62	{
     63		.enumname = "KATAKANA",
     64		.file = FILE_WORD,
     65		.ucdname = "Katakana",
     66	},
     67	{
     68		.enumname = "LF",
     69		.file = FILE_WORD,
     70		.ucdname = "LF",
     71	},
     72	{
     73		.enumname = "MIDLETTER",
     74		.file = FILE_WORD,
     75		.ucdname = "MidLetter",
     76	},
     77	{
     78		.enumname = "MIDNUM",
     79		.file = FILE_WORD,
     80		.ucdname = "MidNum",
     81	},
     82	{
     83		.enumname = "MIDNUMLET",
     84		.file = FILE_WORD,
     85		.ucdname = "MidNumLet",
     86	},
     87	{
     88		.enumname = "NEWLINE",
     89		.file = FILE_WORD,
     90		.ucdname = "Newline",
     91	},
     92	{
     93		.enumname = "NUMERIC",
     94		.file = FILE_WORD,
     95		.ucdname = "Numeric",
     96	},
     97	{
     98		.enumname = "REGIONAL_INDICATOR",
     99		.file = FILE_WORD,
    100		.ucdname = "Regional_Indicator",
    101	},
    102	{
    103		.enumname = "SINGLE_QUOTE",
    104		.file = FILE_WORD,
    105		.ucdname = "Single_Quote",
    106	},
    107	{
    108		.enumname = "WSEGSPACE",
    109		.file = FILE_WORD,
    110		.ucdname = "WSegSpace",
    111	},
    112	{
    113		.enumname = "ZWJ",
    114		.file = FILE_WORD,
    115		.ucdname = "ZWJ",
    116	},
    117};
    118
    119static uint_least8_t
    120handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
    121{
    122	uint_least8_t result;
    123
    124	(void)cp;
    125
    126	if ((!strcmp(word_break_property[prop1].enumname, "ALETTER") &&
    127	     !strcmp(word_break_property[prop2].enumname,
    128	             "EXTENDED_PICTOGRAPHIC")) ||
    129	    (!strcmp(word_break_property[prop1].enumname,
    130	             "EXTENDED_PICTOGRAPHIC") &&
    131	     !strcmp(word_break_property[prop2].enumname, "ALETTER"))) {
    132		for (result = 0; result < LEN(word_break_property); result++) {
    133			if (!strcmp(word_break_property[result].enumname,
    134			            "BOTH_ALETTER_EXTPICT")) {
    135				break;
    136			}
    137		}
    138		if (result == LEN(word_break_property)) {
    139			fprintf(stderr, "handle_conflict: Internal error.\n");
    140			exit(1);
    141		}
    142	} else {
    143		fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
    144		exit(1);
    145	}
    146
    147	return result;
    148}
    149
    150int
    151main(int argc, char *argv[])
    152{
    153	(void)argc;
    154
    155	properties_generate_break_property(
    156		word_break_property, LEN(word_break_property), NULL,
    157		handle_conflict, NULL, "word_break", argv[0]);
    158
    159	return 0;
    160}