libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

word.c (8052B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stdbool.h>
      3#include <stddef.h>
      4
      5#include "../gen/word.h"
      6#include "../grapheme.h"
      7#include "util.h"
      8
      9struct word_break_state {
     10	bool ri_even;
     11};
     12
     13static inline uint_least8_t
     14get_word_break_prop(uint_least32_t cp)
     15{
     16	if (likely(cp <= UINT32_C(0x10FFFF))) {
     17		return (uint_least8_t)
     18			word_break_minor[word_break_major[cp >> 8] +
     19		                         (cp & 0xff)];
     20	} else {
     21		return WORD_BREAK_PROP_OTHER;
     22	}
     23}
     24
     25static bool
     26is_skippable_word_prop(uint_least8_t prop)
     27{
     28	return prop == WORD_BREAK_PROP_EXTEND ||
     29	       prop == WORD_BREAK_PROP_FORMAT || prop == WORD_BREAK_PROP_ZWJ;
     30}
     31
     32static void
     33word_skip_shift_callback(uint_least8_t prop, void *s)
     34{
     35	struct word_break_state *state = (struct word_break_state *)s;
     36
     37	if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
     38		/*
     39		 * The property we just shifted in is
     40		 * a regional indicator, increasing the
     41		 * number of consecutive RIs on the left
     42		 * side of the breakpoint by one, changing
     43		 * the oddness.
     44		 *
     45		 */
     46		state->ri_even = !(state->ri_even);
     47	} else {
     48		/*
     49		 * We saw no regional indicator, so the
     50		 * number of consecutive RIs on the left
     51		 * side of the breakpoint is zero, which
     52		 * is an even number.
     53		 *
     54		 */
     55		state->ri_even = true;
     56	}
     57}
     58
     59static size_t
     60next_word_break(HERODOTUS_READER *r)
     61{
     62	struct proper p;
     63	struct word_break_state state = { .ri_even = true };
     64
     65	/*
     66	 * Apply word breaking algorithm (UAX #29), see
     67	 * https://unicode.org/reports/tr29/#Word_Boundary_Rules
     68	 */
     69	proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
     70	            is_skippable_word_prop, word_skip_shift_callback, &p);
     71
     72	while (!proper_advance(&p)) {
     73		/* WB3 */
     74		if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
     75		    p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
     76			continue;
     77		}
     78
     79		/* WB3a */
     80		if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
     81		    p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
     82		    p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
     83			break;
     84		}
     85
     86		/* WB3b */
     87		if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
     88		    p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
     89		    p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
     90			break;
     91		}
     92
     93		/* WB3c */
     94		if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
     95		    (p.raw.next_prop[0] ==
     96		             WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
     97		     p.raw.next_prop[0] ==
     98		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
     99			continue;
    100		}
    101
    102		/* WB3d */
    103		if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
    104		    p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
    105			continue;
    106		}
    107
    108		/* WB4 */
    109		if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
    110		    p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
    111		    p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
    112			continue;
    113		}
    114
    115		/* WB5 */
    116		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
    117		     p.skip.prev_prop[0] ==
    118		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    119		     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
    120		    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
    121		     p.skip.next_prop[0] ==
    122		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    123		     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
    124			continue;
    125		}
    126
    127		/* WB6 */
    128		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
    129		     p.skip.prev_prop[0] ==
    130		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    131		     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
    132		    (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
    133		     p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
    134		     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    135		    (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
    136		     p.skip.next_prop[1] ==
    137		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    138		     p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
    139			continue;
    140		}
    141
    142		/* WB7 */
    143		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
    144		     p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
    145		     p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    146		    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
    147		     p.skip.next_prop[0] ==
    148		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    149		     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
    150		    (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
    151		     p.skip.prev_prop[1] ==
    152		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    153		     p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
    154			continue;
    155		}
    156
    157		/* WB7a */
    158		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
    159		    p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
    160			continue;
    161		}
    162
    163		/* WB7b */
    164		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
    165		    p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
    166		    p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
    167			continue;
    168		}
    169
    170		/* WB7c */
    171		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
    172		    p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
    173		    p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
    174			continue;
    175		}
    176
    177		/* WB8 */
    178		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
    179		    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
    180			continue;
    181		}
    182
    183		/* WB9 */
    184		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
    185		     p.skip.prev_prop[0] ==
    186		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    187		     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
    188		    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
    189			continue;
    190		}
    191
    192		/* WB10 */
    193		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
    194		    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
    195		     p.skip.next_prop[0] ==
    196		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    197		     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
    198			continue;
    199		}
    200
    201		/* WB11 */
    202		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
    203		     p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
    204		     p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    205		    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
    206		    p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
    207			continue;
    208		}
    209
    210		/* WB12 */
    211		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
    212		    (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
    213		     p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
    214		     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
    215		    p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
    216			continue;
    217		}
    218
    219		/* WB13 */
    220		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
    221		    p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
    222			continue;
    223		}
    224
    225		/* WB13a */
    226		if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
    227		     p.skip.prev_prop[0] ==
    228		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    229		     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
    230		     p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
    231		     p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
    232		     p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
    233		    p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
    234			continue;
    235		}
    236
    237		/* WB13b */
    238		if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
    239		    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
    240		     p.skip.next_prop[0] ==
    241		             WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
    242		     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
    243		     p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
    244		     p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
    245			continue;
    246		}
    247
    248		/* WB15 and WB16 */
    249		if (!state.ri_even &&
    250		    p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
    251			continue;
    252		}
    253
    254		/* WB999 */
    255		break;
    256	}
    257
    258	return herodotus_reader_number_read(&(p.mid_reader));
    259}
    260
    261size_t
    262grapheme_next_word_break(const uint_least32_t *str, size_t len)
    263{
    264	HERODOTUS_READER r;
    265
    266	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    267
    268	return next_word_break(&r);
    269}
    270
    271size_t
    272grapheme_next_word_break_utf8(const char *str, size_t len)
    273{
    274	HERODOTUS_READER r;
    275
    276	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    277
    278	return next_word_break(&r);
    279}