util.c - libgrapheme - Freestanding C library for unicode string handling

	libgrapheme Freestanding C library for unicode string handling
	git clone https://git.sinitax.com/suckless/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
util.c (11480B)
      1/* See LICENSE file for copyright and license details. */
      2#include <limits.h>
      3#include <stdbool.h>
      4#include <stddef.h>
      5#include <stdint.h>
      6
      7#include "../gen/types.h"
      8#include "../grapheme.h"
      9#include "util.h"
     10
     11void
     12herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
     13                      const void *src, size_t srclen)
     14{
     15	size_t i;
     16
     17	r->type = type;
     18	r->src = src;
     19	r->srclen = srclen;
     20	r->off = 0;
     21	r->terminated_by_null = false;
     22
     23	for (i = 0; i < LEN(r->soft_limit); i++) {
     24		r->soft_limit[i] = SIZE_MAX;
     25	}
     26}
     27
     28void
     29herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
     30{
     31	size_t i;
     32
     33	/*
     34	 * we copy such that we have a "fresh" start and build on the
     35	 * fact that src->soft_limit[i] for any i and src->srclen are
     36	 * always larger or equal to src->off
     37	 */
     38	dest->type = src->type;
     39	if (src->type == HERODOTUS_TYPE_CODEPOINT) {
     40		dest->src =
     41			(src->src == NULL) ?
     42				NULL :
     43				((const uint_least32_t *)(src->src)) + src->off;
     44	} else { /* src->type == HERODOTUS_TYPE_UTF8 */
     45		dest->src = (src->src == NULL) ?
     46		                    NULL :
     47		                    ((const char *)(src->src)) + src->off;
     48	}
     49	if (src->srclen == SIZE_MAX) {
     50		dest->srclen = SIZE_MAX;
     51	} else {
     52		dest->srclen =
     53			(src->off < src->srclen) ? src->srclen - src->off : 0;
     54	}
     55	dest->off = 0;
     56	dest->terminated_by_null = src->terminated_by_null;
     57
     58	for (i = 0; i < LEN(src->soft_limit); i++) {
     59		if (src->soft_limit[i] == SIZE_MAX) {
     60			dest->soft_limit[i] = SIZE_MAX;
     61		} else {
     62			/*
     63			 * if we have a degenerate case where the offset is
     64			 * higher than the soft-limit, we simply clamp the
     65			 * soft-limit to zero given we can't decide here
     66			 * to release the limit and, instead, we just
     67			 * prevent any more reads
     68			 */
     69			dest->soft_limit[i] =
     70				(src->off < src->soft_limit[i]) ?
     71					src->soft_limit[i] - src->off :
     72					0;
     73		}
     74	}
     75}
     76
     77void
     78herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
     79{
     80	size_t i;
     81
     82	for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
     83		r->soft_limit[i] = r->soft_limit[i - 1];
     84	}
     85	r->soft_limit[0] = r->off + count;
     86}
     87
     88void
     89herodotus_reader_pop_limit(HERODOTUS_READER *r)
     90{
     91	size_t i;
     92
     93	for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
     94		r->soft_limit[i] = r->soft_limit[i + 1];
     95	}
     96	r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
     97}
     98
     99size_t
    100herodotus_reader_next_word_break(const HERODOTUS_READER *r)
    101{
    102	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
    103		return grapheme_next_word_break(
    104			(const uint_least32_t *)(r->src) + r->off,
    105			MIN(r->srclen, r->soft_limit[0]) - r->off);
    106	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
    107		return grapheme_next_word_break_utf8(
    108			(const char *)(r->src) + r->off,
    109			MIN(r->srclen, r->soft_limit[0]) - r->off);
    110	}
    111}
    112
    113size_t
    114herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
    115{
    116	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
    117		return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
    118	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
    119		return grapheme_decode_utf8(
    120			(const char *)(r->src) + r->off,
    121			MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
    122	}
    123}
    124
    125size_t
    126herodotus_reader_number_read(const HERODOTUS_READER *r)
    127{
    128	return r->off;
    129}
    130
    131enum herodotus_status
    132herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
    133{
    134	size_t ret;
    135
    136	if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
    137		*cp = GRAPHEME_INVALID_CODEPOINT;
    138		return HERODOTUS_STATUS_END_OF_BUFFER;
    139	}
    140
    141	if (r->off >= r->soft_limit[0]) {
    142		*cp = GRAPHEME_INVALID_CODEPOINT;
    143		return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
    144	}
    145
    146	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
    147		*cp = ((const uint_least32_t *)(r->src))[r->off];
    148		ret = 1;
    149	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
    150		ret = grapheme_decode_utf8(
    151			(const char *)r->src + r->off,
    152			MIN(r->srclen, r->soft_limit[0]) - r->off, cp);
    153	}
    154
    155	if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
    156		/*
    157		 * We encountered a null-codepoint. Don't increment
    158		 * offset and return as if the buffer had ended here all
    159		 * along
    160		 */
    161		r->terminated_by_null = true;
    162		return HERODOTUS_STATUS_END_OF_BUFFER;
    163	}
    164
    165	if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
    166		/*
    167		 * we want more than we have; instead of returning
    168		 * garbage we terminate here.
    169		 */
    170		return HERODOTUS_STATUS_END_OF_BUFFER;
    171	}
    172
    173	/*
    174	 * Increase offset which we now know won't surpass the limits,
    175	 * unless we got told otherwise
    176	 */
    177	if (advance) {
    178		r->off += ret;
    179	}
    180
    181	return HERODOTUS_STATUS_SUCCESS;
    182}
    183
    184void
    185herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, void *dest,
    186                      size_t destlen)
    187{
    188	w->type = type;
    189	w->dest = dest;
    190	w->destlen = destlen;
    191	w->off = 0;
    192	w->first_unwritable_offset = SIZE_MAX;
    193}
    194
    195void
    196herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
    197{
    198	if (w->dest == NULL) {
    199		return;
    200	}
    201
    202	if (w->off < w->destlen) {
    203		/* We still have space in the buffer. Simply use it */
    204		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
    205			((uint_least32_t *)(w->dest))[w->off] = 0;
    206		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
    207			((char *)(w->dest))[w->off] = '\0';
    208		}
    209	} else if (w->first_unwritable_offset < w->destlen) {
    210		/*
    211		 * There is no more space in the buffer. However,
    212		 * we have noted down the first offset we couldn't
    213		 * use to write into the buffer and it's smaller than
    214		 * destlen. Thus we bailed writing into the
    215		 * destination when a multibyte-codepoint couldn't be
    216		 * written. So the last "real" byte might be at
    217		 * destlen-4, destlen-3, destlen-2 or destlen-1
    218		 * (the last case meaning truncation).
    219		 */
    220		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
    221			((uint_least32_t
    222			          *)(w->dest))[w->first_unwritable_offset] = 0;
    223		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
    224			((char *)(w->dest))[w->first_unwritable_offset] = '\0';
    225		}
    226	} else if (w->destlen > 0) {
    227		/*
    228		 * In this case, there is no more space in the buffer and
    229		 * the last unwritable offset is larger than
    230		 * or equal to the destination buffer length. This means
    231		 * that we are forced to simply write into the last
    232		 * byte.
    233		 */
    234		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
    235			((uint_least32_t *)(w->dest))[w->destlen - 1] = 0;
    236		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
    237			((char *)(w->dest))[w->destlen - 1] = '\0';
    238		}
    239	}
    240
    241	/* w->off is not incremented in any case */
    242}
    243
    244size_t
    245herodotus_writer_number_written(const HERODOTUS_WRITER *w)
    246{
    247	return w->off;
    248}
    249
    250void
    251herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
    252{
    253	size_t ret;
    254
    255	/*
    256	 * This function will always faithfully say how many codepoints
    257	 * were written, even if the buffer ends. This is used to enable
    258	 * truncation detection.
    259	 */
    260	if (w->type == HERODOTUS_TYPE_CODEPOINT) {
    261		if (w->dest != NULL && w->off < w->destlen) {
    262			((uint_least32_t *)(w->dest))[w->off] = cp;
    263		}
    264
    265		w->off += 1;
    266	} else { /* w->type == HERODOTUS_TYPE_UTF8 */
    267		/*
    268		 * First determine how many bytes we need to encode the
    269		 * codepoint
    270		 */
    271		ret = grapheme_encode_utf8(cp, NULL, 0);
    272
    273		if (w->dest != NULL && w->off + ret < w->destlen) {
    274			/* we still have enough room in the buffer */
    275			grapheme_encode_utf8(cp, (char *)(w->dest) + w->off,
    276			                     w->destlen - w->off);
    277		} else if (w->first_unwritable_offset == SIZE_MAX) {
    278			/*
    279			 * the first unwritable offset has not been
    280			 * noted down, so this is the first time we can't
    281			 * write (completely) to an offset
    282			 */
    283			w->first_unwritable_offset = w->off;
    284		}
    285
    286		w->off += ret;
    287	}
    288}
    289
    290void
    291proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
    292            uint_least8_t (*get_break_prop)(uint_least32_t),
    293            bool (*is_skippable_prop)(uint_least8_t),
    294            void (*skip_shift_callback)(uint_least8_t, void *),
    295            struct proper *p)
    296{
    297	uint_least8_t prop;
    298	uint_least32_t cp;
    299	size_t i;
    300
    301	/* set internal variables */
    302	p->state = state;
    303	p->no_prop = no_prop;
    304	p->get_break_prop = get_break_prop;
    305	p->is_skippable_prop = is_skippable_prop;
    306	p->skip_shift_callback = skip_shift_callback;
    307
    308	/*
    309	 * Initialize mid-reader, which is basically just there
    310	 * to reflect the current position of the viewing-line
    311	 */
    312	herodotus_reader_copy(r, &(p->mid_reader));
    313
    314	/*
    315	 * In the initialization, we simply (try to) fill in next_prop.
    316	 * If we cannot read in more (due to the buffer ending), we
    317	 * fill in the prop as invalid
    318	 */
    319
    320	/*
    321	 * initialize the previous properties to have no property
    322	 * (given we are at the start of the buffer)
    323	 */
    324	p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
    325	p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
    326
    327	/*
    328	 * initialize the next properties
    329	 */
    330
    331	/* initialize the raw reader */
    332	herodotus_reader_copy(r, &(p->raw_reader));
    333
    334	/* fill in the two next raw properties (after no-initialization) */
    335	p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
    336	for (i = 0;
    337	     i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
    338	                      HERODOTUS_STATUS_SUCCESS;) {
    339		p->raw.next_prop[i++] = p->get_break_prop(cp);
    340	}
    341
    342	/* initialize the skip reader */
    343	herodotus_reader_copy(r, &(p->skip_reader));
    344
    345	/* fill in the two next skip properties (after no-initialization) */
    346	p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
    347	for (i = 0;
    348	     i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
    349	                      HERODOTUS_STATUS_SUCCESS;) {
    350		prop = p->get_break_prop(cp);
    351		if (!p->is_skippable_prop(prop)) {
    352			p->skip.next_prop[i++] = prop;
    353		}
    354	}
    355}
    356
    357int
    358proper_advance(struct proper *p)
    359{
    360	uint_least8_t prop;
    361	uint_least32_t cp;
    362
    363	/* read in next "raw" property */
    364	if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
    365	    HERODOTUS_STATUS_SUCCESS) {
    366		prop = p->get_break_prop(cp);
    367	} else {
    368		prop = p->no_prop;
    369	}
    370
    371	/*
    372	 * do a shift-in, unless we find that the property that is to
    373	 * be moved past the "raw-viewing-line" (this property is stored
    374	 * in p->raw.next_prop[0]) is a no_prop, indicating that
    375	 * we are at the end of the buffer.
    376	 */
    377	if (p->raw.next_prop[0] == p->no_prop) {
    378		return 1;
    379	}
    380
    381	/* shift in the properties */
    382	p->raw.prev_prop[1] = p->raw.prev_prop[0];
    383	p->raw.prev_prop[0] = p->raw.next_prop[0];
    384	p->raw.next_prop[0] = p->raw.next_prop[1];
    385	p->raw.next_prop[1] = prop;
    386
    387	/* advance the middle reader viewing-line */
    388	(void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
    389
    390	/* check skippability-property */
    391	if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
    392		/*
    393		 * the property that has moved past the "raw-viewing-line"
    394		 * (this property is now (after the raw-shift) stored in
    395		 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
    396		 * guaranteeing that we won't shift a no-prop past the
    397		 * "viewing-line" in the skip-properties) is not a skippable
    398		 * property, thus we need to shift the skip property as well.
    399		 */
    400		p->skip.prev_prop[1] = p->skip.prev_prop[0];
    401		p->skip.prev_prop[0] = p->skip.next_prop[0];
    402		p->skip.next_prop[0] = p->skip.next_prop[1];
    403
    404		/*
    405		 * call the skip-shift-callback on the property that
    406		 * passed the skip-viewing-line (this property is now
    407		 * stored in p->skip.prev_prop[0]).
    408		 */
    409		p->skip_shift_callback(p->skip.prev_prop[0], p->state);
    410
    411		/* determine the next shift property */
    412		p->skip.next_prop[1] = p->no_prop;
    413		while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
    414		       HERODOTUS_STATUS_SUCCESS) {
    415			prop = p->get_break_prop(cp);
    416			if (!p->is_skippable_prop(prop)) {
    417				p->skip.next_prop[1] = prop;
    418				break;
    419			}
    420		}
    421	}
    422
    423	return 0;
    424}