libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

character.c (8932B)


      1/* See LICENSE file for copyright and license details. */
      2#include <limits.h>
      3#include <stdbool.h>
      4#include <stddef.h>
      5
      6#include "../gen/character.h"
      7#include "../grapheme.h"
      8#include "util.h"
      9
     10struct character_break_state {
     11	uint_least8_t prop;
     12	bool prop_set;
     13	bool gb11_flag;
     14	bool gb12_13_flag;
     15};
     16
     17static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
     18	[CHAR_BREAK_PROP_OTHER] =
     19		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |           /* GB9  */
     20		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |              /* GB9  */
     21		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,       /* GB9a */
     22	[CHAR_BREAK_PROP_CR] = UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3  */
     23	[CHAR_BREAK_PROP_EXTEND] =
     24		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     25		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     26		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     27	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     28		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     29		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     30		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     31	[CHAR_BREAK_PROP_HANGUL_L] =
     32		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L |   /* GB6  */
     33		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB6  */
     34		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV |  /* GB6  */
     35		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6  */
     36		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     37		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     38		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     39	[CHAR_BREAK_PROP_HANGUL_V] =
     40		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB7  */
     41		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB7  */
     42		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     43		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     44		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     45	[CHAR_BREAK_PROP_HANGUL_T] =
     46		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB8  */
     47		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     48		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     49		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     50	[CHAR_BREAK_PROP_HANGUL_LV] =
     51		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB7  */
     52		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB7  */
     53		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     54		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     55		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     56	[CHAR_BREAK_PROP_HANGUL_LVT] =
     57		UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB8  */
     58		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     59		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     60		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     61	[CHAR_BREAK_PROP_PREPEND] =
     62		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |      /* GB9  */
     63		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |         /* GB9  */
     64		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
     65		(UINT16_C(0xFFFF) &
     66	         ~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
     67	           UINT16_C(1) << CHAR_BREAK_PROP_LF |
     68	           UINT16_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
     69	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
     70		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     71		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     72		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     73	[CHAR_BREAK_PROP_SPACINGMARK] =
     74		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     75		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     76		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     77	[CHAR_BREAK_PROP_ZWJ] =
     78		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
     79		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
     80		UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
     81};
     82static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
     83	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
     84		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
     85		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
     86	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
     87		UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
     88	[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
     89		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
     90		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
     91	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
     92		UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
     93		UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
     94};
     95static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
     96	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
     97		UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
     98};
     99static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
    100	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
    101		UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
    102};
    103static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
    104	[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
    105		UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
    106};
    107
    108static inline enum char_break_property
    109get_break_prop(uint_least32_t cp)
    110{
    111	if (likely(cp <= UINT32_C(0x10FFFF))) {
    112		return (enum char_break_property)
    113			char_break_minor[char_break_major[cp >> 8] +
    114		                         (cp & 0xFF)];
    115	} else {
    116		return CHAR_BREAK_PROP_OTHER;
    117	}
    118}
    119
    120static inline void
    121state_serialize(const struct character_break_state *in, uint_least16_t *out)
    122{
    123	*out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
    124	       (uint_least16_t)(((uint_least16_t)(in->prop_set))
    125	                        << 8) | /* 9th bit */
    126	       (uint_least16_t)(((uint_least16_t)(in->gb11_flag))
    127	                        << 9) | /* 10th bit */
    128	       (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag))
    129	                        << 10); /* 11th bit */
    130}
    131
    132static inline void
    133state_deserialize(uint_least16_t in, struct character_break_state *out)
    134{
    135	out->prop = in & UINT8_C(0xFF);
    136	out->prop_set = in & (UINT16_C(1) << 8);
    137	out->gb11_flag = in & (UINT16_C(1) << 9);
    138	out->gb12_13_flag = in & (UINT16_C(1) << 10);
    139}
    140
    141bool
    142grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
    143                            uint_least16_t *s)
    144{
    145	struct character_break_state state;
    146	enum char_break_property cp0_prop, cp1_prop;
    147	bool notbreak = false;
    148
    149	if (likely(s)) {
    150		state_deserialize(*s, &state);
    151
    152		if (likely(state.prop_set)) {
    153			cp0_prop = state.prop;
    154		} else {
    155			cp0_prop = get_break_prop(cp0);
    156		}
    157		cp1_prop = get_break_prop(cp1);
    158
    159		/* preserve prop of right codepoint for next iteration */
    160		state.prop = (uint_least8_t)cp1_prop;
    161		state.prop_set = true;
    162
    163		/* update flags */
    164		state.gb11_flag =
    165			flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
    166		                                            state.gb11_flag] &
    167			UINT16_C(1) << cp1_prop;
    168		state.gb12_13_flag =
    169			flag_update_gb12_13[cp0_prop +
    170		                            NUM_CHAR_BREAK_PROPS *
    171		                                    state.gb12_13_flag] &
    172			UINT16_C(1) << cp1_prop;
    173
    174		/*
    175		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    176		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    177		 */
    178		notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
    179		           (dont_break_gb11[cp0_prop +
    180		                            state.gb11_flag *
    181		                                    NUM_CHAR_BREAK_PROPS] &
    182		            (UINT16_C(1) << cp1_prop)) ||
    183		           (dont_break_gb12_13[cp0_prop +
    184		                               state.gb12_13_flag *
    185		                                       NUM_CHAR_BREAK_PROPS] &
    186		            (UINT16_C(1) << cp1_prop));
    187
    188		/* update or reset flags (when we have a break) */
    189		if (likely(!notbreak)) {
    190			state.gb11_flag = state.gb12_13_flag = false;
    191		}
    192
    193		state_serialize(&state, s);
    194	} else {
    195		cp0_prop = get_break_prop(cp0);
    196		cp1_prop = get_break_prop(cp1);
    197
    198		/*
    199		 * Apply grapheme cluster breaking algorithm (UAX #29), see
    200		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
    201		 *
    202		 * Given we have no state, this behaves as if the state-booleans
    203		 * were all set to false
    204		 */
    205		notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
    206		           (dont_break_gb11[cp0_prop] &
    207		            (UINT16_C(1) << cp1_prop)) ||
    208		           (dont_break_gb12_13[cp0_prop] &
    209		            (UINT16_C(1) << cp1_prop));
    210	}
    211
    212	return !notbreak;
    213}
    214
    215static size_t
    216next_character_break(HERODOTUS_READER *r)
    217{
    218	uint_least16_t state = 0;
    219	uint_least32_t cp0 = 0, cp1 = 0;
    220
    221	for (herodotus_read_codepoint(r, true, &cp0);
    222	     herodotus_read_codepoint(r, false, &cp1) ==
    223	     HERODOTUS_STATUS_SUCCESS;
    224	     herodotus_read_codepoint(r, true, &cp0)) {
    225		if (grapheme_is_character_break(cp0, cp1, &state)) {
    226			break;
    227		}
    228	}
    229
    230	return herodotus_reader_number_read(r);
    231}
    232
    233size_t
    234grapheme_next_character_break(const uint_least32_t *str, size_t len)
    235{
    236	HERODOTUS_READER r;
    237
    238	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    239
    240	return next_character_break(&r);
    241}
    242
    243size_t
    244grapheme_next_character_break_utf8(const char *str, size_t len)
    245{
    246	HERODOTUS_READER r;
    247
    248	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    249
    250	return next_character_break(&r);
    251}