libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

sentence.c (8420B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stdbool.h>
      3#include <stddef.h>
      4
      5#include "../gen/sentence.h"
      6#include "../grapheme.h"
      7#include "util.h"
      8
      9struct sentence_break_state {
     10	uint_least8_t aterm_close_sp_level;
     11	uint_least8_t saterm_close_sp_parasep_level;
     12};
     13
     14static inline uint_least8_t
     15get_sentence_break_prop(uint_least32_t cp)
     16{
     17	if (likely(cp <= UINT32_C(0x10FFFF))) {
     18		return (uint_least8_t)
     19			sentence_break_minor[sentence_break_major[cp >> 8] +
     20		                             (cp & 0xff)];
     21	} else {
     22		return SENTENCE_BREAK_PROP_OTHER;
     23	}
     24}
     25
     26static bool
     27is_skippable_sentence_prop(uint_least8_t prop)
     28{
     29	return prop == SENTENCE_BREAK_PROP_EXTEND ||
     30	       prop == SENTENCE_BREAK_PROP_FORMAT;
     31}
     32
     33static void
     34sentence_skip_shift_callback(uint_least8_t prop, void *s)
     35{
     36	struct sentence_break_state *state = (struct sentence_break_state *)s;
     37
     38	/*
     39	 * Here comes a bit of magic. The rules
     40	 * SB8, SB8a, SB9 and SB10 have very complicated
     41	 * left-hand-side-rules of the form
     42	 *
     43	 *  ATerm Close* Sp*
     44	 *  SATerm Close*
     45	 *  SATerm Close* Sp*
     46	 *  SATerm Close* Sp* ParaSep?
     47	 *
     48	 * but instead of backtracking, we keep the
     49	 * state as some kind of "power level" in
     50	 * two state-variables
     51	 *
     52	 *  aterm_close_sp_level
     53	 *  saterm_close_sp_parasep_level
     54	 *
     55	 * that go from 0 to 3/4:
     56	 *
     57	 *  0: we are not in the sequence
     58	 *  1: we have one ATerm/SATerm to the left of
     59	 *     the middle spot
     60	 *  2: we have one ATerm/SATerm and one or more
     61	 *     Close to the left of the middle spot
     62	 *  3: we have one ATerm/SATerm, zero or more
     63	 *     Close and one or more Sp to the left of
     64	 *     the middle spot.
     65	 *  4: we have one SATerm, zero or more Close,
     66	 *     zero or more Sp and one ParaSep to the
     67	 *     left of the middle spot.
     68	 *
     69	 */
     70	if ((state->aterm_close_sp_level == 0 ||
     71	     state->aterm_close_sp_level == 1) &&
     72	    prop == SENTENCE_BREAK_PROP_ATERM) {
     73		/* sequence has begun */
     74		state->aterm_close_sp_level = 1;
     75	} else if ((state->aterm_close_sp_level == 1 ||
     76	            state->aterm_close_sp_level == 2) &&
     77	           prop == SENTENCE_BREAK_PROP_CLOSE) {
     78		/* close-sequence begins or continued */
     79		state->aterm_close_sp_level = 2;
     80	} else if ((state->aterm_close_sp_level == 1 ||
     81	            state->aterm_close_sp_level == 2 ||
     82	            state->aterm_close_sp_level == 3) &&
     83	           prop == SENTENCE_BREAK_PROP_SP) {
     84		/* sp-sequence begins or continued */
     85		state->aterm_close_sp_level = 3;
     86	} else {
     87		/* sequence broke */
     88		state->aterm_close_sp_level = 0;
     89	}
     90
     91	if ((state->saterm_close_sp_parasep_level == 0 ||
     92	     state->saterm_close_sp_parasep_level == 1) &&
     93	    (prop == SENTENCE_BREAK_PROP_STERM ||
     94	     prop == SENTENCE_BREAK_PROP_ATERM)) {
     95		/* sequence has begun */
     96		state->saterm_close_sp_parasep_level = 1;
     97	} else if ((state->saterm_close_sp_parasep_level == 1 ||
     98	            state->saterm_close_sp_parasep_level == 2) &&
     99	           prop == SENTENCE_BREAK_PROP_CLOSE) {
    100		/* close-sequence begins or continued */
    101		state->saterm_close_sp_parasep_level = 2;
    102	} else if ((state->saterm_close_sp_parasep_level == 1 ||
    103	            state->saterm_close_sp_parasep_level == 2 ||
    104	            state->saterm_close_sp_parasep_level == 3) &&
    105	           prop == SENTENCE_BREAK_PROP_SP) {
    106		/* sp-sequence begins or continued */
    107		state->saterm_close_sp_parasep_level = 3;
    108	} else if ((state->saterm_close_sp_parasep_level == 1 ||
    109	            state->saterm_close_sp_parasep_level == 2 ||
    110	            state->saterm_close_sp_parasep_level == 3) &&
    111	           (prop == SENTENCE_BREAK_PROP_SEP ||
    112	            prop == SENTENCE_BREAK_PROP_CR ||
    113	            prop == SENTENCE_BREAK_PROP_LF)) {
    114		/* ParaSep at the end of the sequence */
    115		state->saterm_close_sp_parasep_level = 4;
    116	} else {
    117		/* sequence broke */
    118		state->saterm_close_sp_parasep_level = 0;
    119	}
    120}
    121
    122static size_t
    123next_sentence_break(HERODOTUS_READER *r)
    124{
    125	HERODOTUS_READER tmp;
    126	enum sentence_break_property prop;
    127	struct proper p;
    128	struct sentence_break_state state = { 0 };
    129	uint_least32_t cp;
    130
    131	/*
    132	 * Apply sentence breaking algorithm (UAX #29), see
    133	 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
    134	 */
    135	proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
    136	            get_sentence_break_prop, is_skippable_sentence_prop,
    137	            sentence_skip_shift_callback, &p);
    138
    139	while (!proper_advance(&p)) {
    140		/* SB3 */
    141		if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
    142		    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
    143			continue;
    144		}
    145
    146		/* SB4 */
    147		if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
    148		    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
    149		    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
    150			break;
    151		}
    152
    153		/* SB5 */
    154		if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
    155		    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
    156			continue;
    157		}
    158
    159		/* SB6 */
    160		if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
    161		    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
    162			continue;
    163		}
    164
    165		/* SB7 */
    166		if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
    167		     p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
    168		    p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
    169		    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
    170			continue;
    171		}
    172
    173		/* SB8 */
    174		if (state.aterm_close_sp_level == 1 ||
    175		    state.aterm_close_sp_level == 2 ||
    176		    state.aterm_close_sp_level == 3) {
    177			/*
    178			 * This is the most complicated rule, requiring
    179			 * the right-hand-side to satisfy the regular expression
    180			 *
    181			 *  ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )*
    182			 * Lower
    183			 *
    184			 * which we simply check "manually" given LUT-lookups
    185			 * are very cheap by starting at the mid_reader.
    186			 *
    187			 */
    188			herodotus_reader_copy(&(p.mid_reader), &tmp);
    189
    190			prop = NUM_SENTENCE_BREAK_PROPS;
    191			while (herodotus_read_codepoint(&tmp, true, &cp) ==
    192			       HERODOTUS_STATUS_SUCCESS) {
    193				prop = get_sentence_break_prop(cp);
    194
    195				/*
    196				 * the skippable properties are ignored
    197				 * automatically here given they do not
    198				 * match the following condition
    199				 */
    200				if (prop == SENTENCE_BREAK_PROP_OLETTER ||
    201				    prop == SENTENCE_BREAK_PROP_UPPER ||
    202				    prop == SENTENCE_BREAK_PROP_LOWER ||
    203				    prop == SENTENCE_BREAK_PROP_SEP ||
    204				    prop == SENTENCE_BREAK_PROP_CR ||
    205				    prop == SENTENCE_BREAK_PROP_LF ||
    206				    prop == SENTENCE_BREAK_PROP_STERM ||
    207				    prop == SENTENCE_BREAK_PROP_ATERM) {
    208					break;
    209				}
    210			}
    211
    212			if (prop == SENTENCE_BREAK_PROP_LOWER) {
    213				continue;
    214			}
    215		}
    216
    217		/* SB8a */
    218		if ((state.saterm_close_sp_parasep_level == 1 ||
    219		     state.saterm_close_sp_parasep_level == 2 ||
    220		     state.saterm_close_sp_parasep_level == 3) &&
    221		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
    222		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
    223		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
    224			continue;
    225		}
    226
    227		/* SB9 */
    228		if ((state.saterm_close_sp_parasep_level == 1 ||
    229		     state.saterm_close_sp_parasep_level == 2) &&
    230		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
    231		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
    232		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
    233		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
    234		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
    235			continue;
    236		}
    237
    238		/* SB10 */
    239		if ((state.saterm_close_sp_parasep_level == 1 ||
    240		     state.saterm_close_sp_parasep_level == 2 ||
    241		     state.saterm_close_sp_parasep_level == 3) &&
    242		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
    243		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
    244		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
    245		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
    246			continue;
    247		}
    248
    249		/* SB11 */
    250		if (state.saterm_close_sp_parasep_level == 1 ||
    251		    state.saterm_close_sp_parasep_level == 2 ||
    252		    state.saterm_close_sp_parasep_level == 3 ||
    253		    state.saterm_close_sp_parasep_level == 4) {
    254			break;
    255		}
    256
    257		/* SB998 */
    258		continue;
    259	}
    260
    261	return herodotus_reader_number_read(&(p.mid_reader));
    262}
    263
    264size_t
    265grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
    266{
    267	HERODOTUS_READER r;
    268
    269	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
    270
    271	return next_sentence_break(&r);
    272}
    273
    274size_t
    275grapheme_next_sentence_break_utf8(const char *str, size_t len)
    276{
    277	HERODOTUS_READER r;
    278
    279	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
    280
    281	return next_sentence_break(&r);
    282}