libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

utf8.c (6131B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stddef.h>
      3#include <stdint.h>
      4
      5#include "../grapheme.h"
      6#include "util.h"
      7
      8#define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u))
      9
     10/* lookup-table for the types of sequence first bytes */
     11static const struct {
     12	uint_least8_t lower;  /* lower bound of sequence first byte */
     13	uint_least8_t upper;  /* upper bound of sequence first byte */
     14	uint_least32_t mincp; /* smallest non-overlong encoded codepoint */
     15	uint_least32_t maxcp; /* largest encodable codepoint */
     16			      /*
     17	                       * implicit: table-offset represents the number of following
     18	                       * bytes of the form 10xxxxxx (6 bits capacity each)
     19	                       */
     20} lut[] = {
     21	[0] = {
     22		/* 0xxxxxxx */
     23		.lower = 0x00, /* 00000000 */
     24		.upper = 0x7F, /* 01111111 */
     25		.mincp = (uint_least32_t)0,
     26		.maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
     27	},
     28	[1] = {
     29		/* 110xxxxx */
     30		.lower = 0xC0, /* 11000000 */
     31		.upper = 0xDF, /* 11011111 */
     32		.mincp = (uint_least32_t)1 << 7,
     33		.maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
     34	},
     35	[2] = {
     36		/* 1110xxxx */
     37		.lower = 0xE0, /* 11100000 */
     38		.upper = 0xEF, /* 11101111 */
     39		.mincp = (uint_least32_t)1 << 11,
     40		.maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
     41	},
     42	[3] = {
     43		/* 11110xxx */
     44		.lower = 0xF0, /* 11110000 */
     45		.upper = 0xF7, /* 11110111 */
     46		.mincp = (uint_least32_t)1 << 16,
     47		.maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
     48	},
     49};
     50
     51size_t
     52grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp)
     53{
     54	size_t off, i;
     55	uint_least32_t tmp;
     56
     57	if (cp == NULL) {
     58		/*
     59		 * instead of checking every time if cp is NULL within
     60		 * the decoder, simply point it at a dummy variable here.
     61		 */
     62		cp = &tmp;
     63	}
     64
     65	if (str == NULL || len == 0) {
     66		/* a sequence must be at least 1 byte long */
     67		*cp = GRAPHEME_INVALID_CODEPOINT;
     68		return 0;
     69	}
     70
     71	/* identify sequence type with the first byte */
     72	for (off = 0; off < LEN(lut); off++) {
     73		if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower,
     74		            lut[off].upper)) {
     75			/*
     76			 * first byte is within the bounds; fill
     77			 * p with the the first bits contained in
     78			 * the first byte (by subtracting the high bits)
     79			 */
     80			*cp = ((const unsigned char *)str)[0] - lut[off].lower;
     81			break;
     82		}
     83	}
     84	if (off == LEN(lut)) {
     85		/*
     86		 * first byte does not match a sequence type;
     87		 * set cp as invalid and return 1 byte processed
     88		 *
     89		 * this also includes the cases where bits higher than
     90		 * the 8th are set on systems with CHAR_BIT > 8
     91		 */
     92		*cp = GRAPHEME_INVALID_CODEPOINT;
     93		return 1;
     94	}
     95	if (1 + off > len) {
     96		/*
     97		 * input is not long enough, set cp as invalid
     98		 */
     99		*cp = GRAPHEME_INVALID_CODEPOINT;
    100
    101		/*
    102		 * count the following continuation bytes, but nothing
    103		 * else in case we have a "rogue" case where e.g. such a
    104		 * sequence starter occurs right before a NUL-byte.
    105		 */
    106		for (i = 0; 1 + i < len; i++) {
    107			if (!BETWEEN(((const unsigned char *)str)[1 + i], 0x80,
    108			             0xBF)) {
    109				break;
    110			}
    111		}
    112
    113		/*
    114		 * if the continuation bytes do not continue until
    115		 * the end, return the incomplete sequence length.
    116		 * Otherwise return the number of bytes we actually
    117		 * expected, which is larger than n.
    118		 */
    119		return ((1 + i) < len) ? (1 + i) : (1 + off);
    120	}
    121
    122	/*
    123	 * process 'off' following bytes, each of the form 10xxxxxx
    124	 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
    125	 */
    126	for (i = 1; i <= off; i++) {
    127		if (!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) {
    128			/*
    129			 * byte does not match format; return
    130			 * number of bytes processed excluding the
    131			 * unexpected character as recommended since
    132			 * Unicode 6 (chapter 3)
    133			 *
    134			 * this also includes the cases where bits
    135			 * higher than the 8th are set on systems
    136			 * with CHAR_BIT > 8
    137			 */
    138			*cp = GRAPHEME_INVALID_CODEPOINT;
    139			return 1 + (i - 1);
    140		}
    141		/*
    142		 * shift codepoint by 6 bits and add the 6 stored bits
    143		 * in s[i] to it using the bitmask 0x3F (00111111)
    144		 */
    145		*cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F);
    146	}
    147
    148	if (*cp < lut[off].mincp ||
    149	    BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
    150	    *cp > UINT32_C(0x10FFFF)) {
    151		/*
    152		 * codepoint is overlong encoded in the sequence, is a
    153		 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
    154		 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
    155		 * specifies the latter two conditions)
    156		 */
    157		*cp = GRAPHEME_INVALID_CODEPOINT;
    158	}
    159
    160	return 1 + off;
    161}
    162
    163size_t
    164grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
    165{
    166	size_t off, i;
    167
    168	if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
    169	    cp > UINT32_C(0x10FFFF)) {
    170		/*
    171		 * codepoint is a high or low UTF-16 surrogate half
    172		 * (0xD800..0xDFFF) or not representable in UTF-16
    173		 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
    174		 */
    175		cp = GRAPHEME_INVALID_CODEPOINT;
    176	}
    177
    178	/* determine necessary sequence type */
    179	for (off = 0; off < LEN(lut); off++) {
    180		if (cp <= lut[off].maxcp) {
    181			break;
    182		}
    183	}
    184	if (1 + off > len || str == NULL || len == 0) {
    185		/*
    186		 * specified buffer is too small to store sequence or
    187		 * the caller just wanted to know how many bytes the
    188		 * codepoint needs by passing a NULL-buffer.
    189		 */
    190		return 1 + off;
    191	}
    192
    193	/* build sequence by filling cp-bits into each byte */
    194
    195	/*
    196	 * lut[off].lower is the bit-format for the first byte and
    197	 * the bits to fill into it are determined by shifting the
    198	 * cp 6 times the number of following bytes, as each
    199	 * following byte stores 6 bits, yielding the wanted bits.
    200	 *
    201	 * We do not overwrite the mask because we guaranteed earlier
    202	 * that there are no bits higher than the mask allows.
    203	 */
    204	((unsigned char *)str)[0] =
    205		lut[off].lower | (uint_least8_t)(cp >> (6 * off));
    206
    207	for (i = 1; i <= off; i++) {
    208		/*
    209		 * the bit-format for following bytes is 10000000 (0x80)
    210		 * and it each stores 6 bits in the 6 low bits that we
    211		 * extract from the properly-shifted value using the
    212		 * mask 00111111 (0x3F)
    213		 */
    214		((unsigned char *)str)[i] =
    215			0x80 | ((cp >> (6 * (off - i))) & 0x3F);
    216	}
    217
    218	return 1 + off;
    219}