Refine types (uint8_t -> char, uint32_t -> uint_least32_t) - libgrapheme - Freestanding C library for unicode string handling

	libgrapheme Freestanding C library for unicode string handling
	git clone https://git.sinitax.com/suckless/libgrapheme
	Log | Files | Refs | README | LICENSE | sfeed.txt

commit c0e14c9b89c1ac78b72b7d8840261fbb7285d07a
parent 7981a5db713073992d00ee2231b88558977671aa
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 11 Dec 2021 14:17:39 +0100

Refine types (uint8_t -> char, uint32_t -> uint_least32_t)

The type uint32_t is not guaranteed by the standard to be present,
but uint_least32_t is. If a libgrapheme user passes a pointer
to a uint32_t (instead of uint_least32_t) there will be no problem,
as the presence of uint32_t immediately implies uint32_t ==
uint_least32_t. However, we won't depend on it internally and are
strict with using uint_least32_t. The type name is a mouthful, but still
clearer and not much longer than "long int" (which is guaranteed to be
at least 32 bits).

Regarding uint8_t, it was a bit clumsy to require it in the API. C does
not guarantee that a byte is actually an octet (i.e. char can have
more than 8 bits), and even though the relevance of non-8-bit-char
seems to be waning, I don't want to rely on that. But more importantly,
accepting "char *" saves some casts on the user-side.
Adapting the lg_utf8_* functions is trivial, as it requires just
being careful with casts. The cast "signed char" <-> "unsigned char"
is unproblematic, so every time we need the bit representation, we
explicitly cast to unsigned char and are done with it. Likewise, every
time we write to a char, we make sure that what we pass is explicitly
an unsigned char.
This became a bit awkward in the test cases where we have char-arrays
with hex literals. As C does not really have a concept of a sub-int
literal, all hexadecimal literals had to first be explicitly cast to
unsigned char, but that's it.

One more aspect where we've become more portable. :)

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M gen/util.c  | 6 +++---
M gen/util.h  | 6 +++---
M grapheme.h  | 6 +++---
M src/grapheme.c  | 8 ++++----
M src/utf8.c  | 49 +++++++++++++++++++++++++++++--------------------
M src/util.c  | 6 +++---
M src/util.h  | 6 +++---
M test/utf8-decode.c  | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M test/utf8-encode.c  | 42 ++++++++++++++++++++++++++++++++----------

9 files changed, 183 insertions(+), 73 deletions(-)
diff --git a/gen/util.c b/gen/util.c
@@ -39,7 +39,7 @@ valid_hexstring(const char *str)
 }
 
 static int
-cp_parse(const char *str, uint32_t *cp)
+cp_parse(const char *str, uint_least32_t *cp)
 {
 	if (!valid_hexstring(str)) {
 		return 1;
@@ -348,13 +348,13 @@ segment_test_list_print(struct segment_test *st, size_t numsegtests,
 	printf("/* Automatically generated by %s */\n"
 	       "#include <stdint.h>\n#include <stddef.h>\n\n", progname);
 
-	printf("static const struct {\n\tuint32_t *cp;\n"
+	printf("static const struct {\n\tuint_least32_t *cp;\n"
 	       "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
 	       "\tchar *descr;\n} %s[] = {\n", identifier);
 	for (i = 0; i < numsegtests; i++) {
 		printf("\t{\n");
 
-		printf("\t\t.cp     = (uint32_t[]){");
+		printf("\t\t.cp     = (uint_least32_t[]){");
 		for (j = 0; j < st[i].cplen; j++) {
 			printf(" UINT32_C(0x%06X)", st[i].cp[j]);
 			if (j + 1 < st[i].cplen) {
diff --git a/gen/util.h b/gen/util.h
@@ -8,8 +8,8 @@
 #define LEN(x) (sizeof (x) / sizeof *(x))
 
 struct range {
-	uint32_t lower;
-	uint32_t upper;
+	uint_least32_t lower;
+	uint_least32_t upper;
 };
 
 struct property {
@@ -21,7 +21,7 @@ struct property {
 };
 
 struct segment_test {
-	uint32_t *cp;
+	uint_least32_t *cp;
 	size_t cplen;
 	size_t *len;
 	size_t lenlen;
diff --git a/grapheme.h b/grapheme.h
@@ -20,9 +20,9 @@ typedef struct lg_internal_segmentation_state {
 
 size_t lg_grapheme_nextbreak(const char *);
 
-int lg_grapheme_isbreak(uint32_t, uint32_t, LG_SEGMENTATION_STATE *);
+int lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);
 
-size_t lg_utf8_decode(const uint8_t *, size_t, uint32_t *);
-size_t lg_utf8_encode(uint32_t, uint8_t *, size_t);
+size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
+size_t lg_utf8_encode(uint_least32_t, char *, size_t);
 
 #endif /* GRAPHEME_H */
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -13,7 +13,7 @@ enum {
 };
 
 int
-lg_grapheme_isbreak(uint32_t a, uint32_t b, LG_SEGMENTATION_STATE *state)
+lg_grapheme_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state)
 {
 	struct lg_internal_heisenstate *p[2] = { 0 };
 	int ret = 1, flags = 0;
@@ -179,7 +179,7 @@ hasbreak:
 size_t
 lg_grapheme_nextbreak(const char *str)
 {
-	uint32_t cp0, cp1;
+	uint_least32_t cp0, cp1;
 	size_t ret, len = 0;
 	LG_SEGMENTATION_STATE state = { 0 };
 
@@ -200,14 +200,14 @@ lg_grapheme_nextbreak(const char *str)
 	 */
 
 	/* get first code point */
-	len += lg_utf8_decode((uint8_t *)str, 5, &cp0);
+	len += lg_utf8_decode(str, 5, &cp0);
 	if (cp0 == LG_CODEPOINT_INVALID) {
 		return len;
 	}
 
 	while (cp0 != 0) {
 		/* get next code point */
-		ret = lg_utf8_decode((uint8_t *)(str + len), 5, &cp1);
+		ret = lg_utf8_decode(str + len, 5, &cp1);
 
 		if (cp1 == LG_CODEPOINT_INVALID ||
 		    lg_grapheme_isbreak(cp0, cp1, &state)) {
diff --git a/src/utf8.c b/src/utf8.c
@@ -8,10 +8,10 @@
 
 /* lookup-table for the types of sequence first bytes */
 static const struct {
-	uint8_t  lower; /* lower bound of sequence first byte */
-	uint8_t  upper; /* upper bound of sequence first byte */
-	uint32_t mincp; /* smallest non-overlong encoded code point */
-	uint32_t maxcp; /* largest encodable code point */
+	uint8_t        lower; /* lower bound of sequence first byte */
+	uint8_t        upper; /* upper bound of sequence first byte */
+	uint_least32_t mincp; /* smallest non-overlong encoded code point */
+	uint_least32_t maxcp; /* largest encodable code point */
 	/*
 	 * implicit: table-offset represents the number of following
 	 * bytes of the form 10xxxxxx (6 bits capacity each)
@@ -21,37 +21,44 @@ static const struct {
 		/* 0xxxxxxx */
 		.lower = 0x00, /* 00000000 */
 		.upper = 0x7F, /* 01111111 */
-		.mincp = (uint32_t)0,
-		.maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
+		.mincp = (uint_least32_t)0,
+		.maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
 	},
 	[1] = {
 		/* 110xxxxx */
 		.lower = 0xC0, /* 11000000 */
 		.upper = 0xDF, /* 11011111 */
-		.mincp = (uint32_t)1 << 7,
-		.maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
+		.mincp = (uint_least32_t)1 << 7,
+		.maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
 	},
 	[2] = {
 		/* 1110xxxx */
 		.lower = 0xE0, /* 11100000 */
 		.upper = 0xEF, /* 11101111 */
-		.mincp = (uint32_t)1 << 11,
-		.maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
+		.mincp = (uint_least32_t)1 << 11,
+		.maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
 	},
 	[3] = {
 		/* 11110xxx */
 		.lower = 0xF0, /* 11110000 */
 		.upper = 0xF7, /* 11110111 */
-		.mincp = (uint32_t)1 << 16,
-		.maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
+		.mincp = (uint_least32_t)1 << 16,
+		.maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
 	},
 };
 
 size_t
-lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
+lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 {
 	size_t off, i;
 
+	/*
+	 * char is guaranteed to be at least 8 bits, but it could
+	 * be more. We assume that the encoding is faithful such
+	 * that any higher bits are zero. If we encounter anything
+	 * else, we treat it as an encoding error.
+	 */
+
 	if (n == 0) {
 		/* a sequence must be at least 1 byte long */
 		*cp = LG_CODEPOINT_INVALID;
@@ -60,13 +67,15 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
 
 	/* identify sequence type with the first byte */
 	for (off = 0; off < LEN(lut); off++) {
-		if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+		if (BETWEEN((unsigned char)s[0], lut[off].lower,
+		            lut[off].upper)) {
 			/*
 			 * first byte is within the bounds; fill
 			 * p with the the first bits contained in
 			 * the first byte (by subtracting the high bits)
+			 * and discarding any higher bits than 8
 			 */
-			*cp = s[0] - lut[off].lower;
+			*cp = ((unsigned char)s[0] - lut[off].lower) & 0xff;
 			break;
 		}
 	}
@@ -92,7 +101,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
 	 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
 	 */
 	for (i = 1; i <= off; i++) {
-		if(!BETWEEN(s[i], 0x80, 0xBF)) {
+		if(!BETWEEN((unsigned char)s[i], 0x80, 0xBF)) {
 			/*
 			 * byte does not match format; return
 			 * number of bytes processed excluding the
@@ -106,7 +115,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
 		 * shift code point by 6 bits and add the 6 stored bits
 		 * in s[i] to it using the bitmask 0x3F (00111111)
 		 */
-		*cp = (*cp << 6) | (s[i] & 0x3F);
+		*cp = (*cp << 6) | ((unsigned char)s[i] & 0x3F);
 	}
 
 	if (*cp < lut[off].mincp ||
@@ -125,7 +134,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
 }
 
 size_t
-lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
+lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
 {
 	size_t off, i;
 
@@ -161,7 +170,7 @@ lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
 	 * We do not overwrite the mask because we guaranteed earlier
 	 * that there are no bits higher than the mask allows.
 	 */
-	s[0] = lut[off].lower | (cp >> (6 * off));
+	s[0] = (unsigned char)(lut[off].lower | (cp >> (6 * off)));
 
 	for (i = 1; i <= off; i++) {
 		/*
@@ -170,7 +179,7 @@ lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
 		 * extract from the properly-shifted value using the
 		 * mask 00111111 (0x3F)
 		 */
-		s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+		s[i] = (unsigned char)(0x80 | ((cp >> (6 * (off - i))) & 0x3F));
 	}
 
 	return 1 + off;
diff --git a/src/util.c b/src/util.c
@@ -41,14 +41,14 @@ heisenstate_set(struct lg_internal_heisenstate *h, int slot, int state)
 static int
 cp_cmp(const void *a, const void *b)
 {
-	uint32_t cp = *(uint32_t *)a;
-	uint32_t *range = (uint32_t *)b;
+	uint_least32_t cp = *(uint_least32_t *)a;
+	uint_least32_t *range = (uint_least32_t *)b;
 
 	return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
 }
 
 int
-has_property(uint32_t cp, struct lg_internal_heisenstate *cpstate,
+has_property(uint_least32_t cp, struct lg_internal_heisenstate *cpstate,
              const struct range_list *proptable, int property)
 {
 	int res;
diff --git a/src/util.h b/src/util.h
@@ -10,8 +10,8 @@
 #define LEN(x) (sizeof(x) / sizeof(*(x)))
 
 struct range {
-	uint32_t lower;
-	uint32_t upper;
+	uint_least32_t lower;
+	uint_least32_t upper;
 };
 
 struct range_list {
@@ -22,7 +22,7 @@ struct range_list {
 int heisenstate_get(struct lg_internal_heisenstate *, int);
 int heisenstate_set(struct lg_internal_heisenstate *, int, int);
 
-int has_property(uint32_t, struct lg_internal_heisenstate *,
+int has_property(uint_least32_t, struct lg_internal_heisenstate *,
                  const struct range_list *, int);
 
 #endif /* UTIL_H */
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -9,7 +9,7 @@
 #define LEN(x) (sizeof(x) / sizeof(*(x)))
 
 static const struct {
-	uint8_t *arr;     /* UTF-8 byte sequence */
+	char    *arr;     /* UTF-8 byte sequence */
 	size_t   len;     /* length of UTF-8 byte sequence */
 	size_t   exp_len; /* expected length returned */
 	uint32_t exp_cp;  /* expected code point returned */
@@ -29,7 +29,9 @@ static const struct {
 		 * [ 11111101 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xFD },
+		.arr     = (char[]){
+			(unsigned char)0xFD,
+		},
 		.len     = 1,
 		.exp_len = 1,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -39,7 +41,9 @@ static const struct {
 		 * [ 00000001 ] ->
 		 * 0000001
 		 */
-		.arr     = (uint8_t[]){ 0x01 },
+		.arr     = (char[]){
+			(unsigned char)0x01,
+		},
 		.len     = 1,
 		.exp_len = 1,
 		.exp_cp  = 0x1,
@@ -49,7 +53,10 @@ static const struct {
 		 * [ 11000011 10111111 ] ->
 		 * 00011111111
 		 */
-		.arr     = (uint8_t[]){ 0xC3, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xC3,
+			(unsigned char)0xBF,
+		},
 		.len     = 2,
 		.exp_len = 2,
 		.exp_cp  = 0xFF,
@@ -59,7 +66,9 @@ static const struct {
 		 * [ 11000011 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xC3 },
+		.arr     = (char[]){
+			(unsigned char)0xC3
+		},
 		.len     = 1,
 		.exp_len = 2,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -69,7 +78,10 @@ static const struct {
 		 * [ 11000011 11111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xC3, 0xFF },
+		.arr     = (char[]){
+			(unsigned char)0xC3,
+			(unsigned char)0xFF,
+		},
 		.len     = 2,
 		.exp_len = 1,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -79,7 +91,10 @@ static const struct {
 		 * [ 11000001 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xC1, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xC1,
+			(unsigned char)0xBF,
+		},
 		.len     = 2,
 		.exp_len = 2,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -89,7 +104,11 @@ static const struct {
 		 * [ 11100000 10111111 10111111 ] ->
 		 * 0000111111111111
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xE0,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+		},
 		.len     = 3,
 		.exp_len = 3,
 		.exp_cp  = 0xFFF,
@@ -99,7 +118,9 @@ static const struct {
 		 * [ 11100000 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0 },
+		.arr     = (char[]){
+			(unsigned char)0xE0,
+		},
 		.len     = 1,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -109,7 +130,11 @@ static const struct {
 		 * [ 11100000 01111111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xE0,
+			(unsigned char)0x7F,
+			(unsigned char)0xBF,
+		},
 		.len     = 3,
 		.exp_len = 1,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -119,7 +144,10 @@ static const struct {
 		 * [ 11100000 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xE0,
+			(unsigned char)0xBF,
+		},
 		.len     = 2,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -129,7 +157,11 @@ static const struct {
 		 * [ 11100000 10111111 01111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+		.arr     = (char[]){
+			(unsigned char)0xE0,
+			(unsigned char)0xBF,
+			(unsigned char)0x7F,
+		},
 		.len     = 3,
 		.exp_len = 2,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -139,7 +171,11 @@ static const struct {
 		 * [ 11100000 10011111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xE0,
+			(unsigned char)0x9F,
+			(unsigned char)0xBF,
+		},
 		.len     = 3,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -149,7 +185,11 @@ static const struct {
 		 * [ 11101101 10100000 10000000 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+		.arr     = (char[]){
+			(unsigned char)0xED,
+			(unsigned char)0xA0,
+			(unsigned char)0x80,
+		},
 		.len     = 3,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -159,7 +199,12 @@ static const struct {
 		 * [ 11110011 10111111 10111111 10111111 ] ->
 		 * 011111111111111111111
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xF3,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+		},
 		.len     = 4,
 		.exp_len = 4,
 		.exp_cp  = UINT32_C(0xFFFFF),
@@ -169,7 +214,9 @@ static const struct {
 		 * [ 11110011 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3 },
+		.arr     = (char[]){
+			(unsigned char)0xF3,
+		},
 		.len     = 1,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -179,7 +226,12 @@ static const struct {
 		 * [ 11110011 01111111 10111111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xF3,
+			(unsigned char)0x7F,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+		},
 		.len     = 4,
 		.exp_len = 1,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -189,7 +241,10 @@ static const struct {
 		 * [ 11110011 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xF3,
+			(unsigned char)0xBF,
+		},
 		.len     = 2,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -199,7 +254,12 @@ static const struct {
 		 * [ 11110011 10111111 01111111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xF3,
+			(unsigned char)0xBF,
+			(unsigned char)0x7F,
+			(unsigned char)0xBF,
+		},
 		.len     = 4,
 		.exp_len = 2,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -209,7 +269,11 @@ static const struct {
 		 * [ 11110011 10111111 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xF3,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+		},
 		.len     = 3,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -219,7 +283,12 @@ static const struct {
 		 * [ 11110011 10111111 10111111 01111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+		.arr     = (char[]){
+			(unsigned char)0xF3,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+			(unsigned char)0x7F,
+		},
 		.len     = 4,
 		.exp_len = 3,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -229,7 +298,12 @@ static const struct {
 		 * [ 11110000 10000000 10000001 10111111 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+		.arr     = (char[]){
+			(unsigned char)0xF0,
+			(unsigned char)0x80,
+			(unsigned char)0x81,
+			(unsigned char)0xBF,
+		},
 		.len     = 4,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -239,7 +313,12 @@ static const struct {
 		 * [ 11110100 10010000 10000000 10000000 ] ->
 		 * INVALID
 		 */
-		.arr     = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+		.arr     = (char[]){
+			(unsigned char)0xF4,
+			(unsigned char)0x90,
+			(unsigned char)0x80,
+			(unsigned char)0x80,
+		},
 		.len     = 4,
 		.exp_len = 4,
 		.exp_cp  = LG_CODEPOINT_INVALID,
@@ -254,7 +333,7 @@ main(void)
 	/* UTF-8 decoder test */
 	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
 		size_t len;
-		uint32_t cp;
+		uint_least32_t cp;
 
 		len = lg_utf8_decode(dec_test[i].arr,
 		                     dec_test[i].len, &cp);
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
@@ -9,44 +9,66 @@
 #define LEN(x) (sizeof(x) / sizeof(*(x)))
 
 static const struct {
-	uint32_t cp;      /* input code point */
-	uint8_t *exp_arr; /* expected UTF-8 byte sequence */
-	size_t   exp_len; /* expected length of UTF-8 sequence */
+	uint_least32_t cp;      /* input code point */
+	char          *exp_arr; /* expected UTF-8 byte sequence */
+	size_t         exp_len; /* expected length of UTF-8 sequence */
 } enc_test[] = {
 	{
 		/* invalid code point (UTF-16 surrogate half) */
 		.cp      = UINT32_C(0xD800),
-		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+		.exp_arr = (char[]){
+			(unsigned char)0xEF,
+			(unsigned char)0xBF,
+			(unsigned char)0xBD,
+		},
 		.exp_len = 3,
 	},
 	{
 		/* invalid code point (UTF-16-unrepresentable) */
 		.cp      = UINT32_C(0x110000),
-		.exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+		.exp_arr = (char[]){
+			(unsigned char)0xEF,
+			(unsigned char)0xBF,
+			(unsigned char)0xBD,
+		},
 		.exp_len = 3,
 	},
 	{
 		/* code point encoded to a 1-byte sequence */
 		.cp      = 0x01,
-		.exp_arr = (uint8_t[]){ 0x01 },
+		.exp_arr = (char[]){
+			(unsigned char)0x01
+		},
 		.exp_len = 1,
 	},
 	{
 		/* code point encoded to a 2-byte sequence */
 		.cp      = 0xFF,
-		.exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+		.exp_arr = (char[]){
+			(unsigned char)0xC3,
+			(unsigned char)0xBF,
+		},
 		.exp_len = 2,
 	},
 	{
 		/* code point encoded to a 3-byte sequence */
 		.cp      = 0xFFF,
-		.exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+		.exp_arr = (char[]){
+			(unsigned char)0xE0,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+		},
 		.exp_len = 3,
 	},
 	{
 		/* code point encoded to a 4-byte sequence */
 		.cp      = UINT32_C(0xFFFFF),
-		.exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+		.exp_arr = (char[]){
+			(unsigned char)0xF3,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+			(unsigned char)0xBF,
+		},
 		.exp_len = 4,
 	},
 };
@@ -58,7 +80,7 @@ main(void)
 
 	/* UTF-8 encoder test */
 	for (i = 0, failed = 0; i < LEN(enc_test); i++) {
-		uint8_t arr[4];
+		char arr[4];
 		size_t len;
 
 		len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));

M	gen/util.c	|	6	+++---
M	gen/util.h	|	6	+++---
M	grapheme.h	|	6	+++---
M	src/grapheme.c	|	8	++++----
M	src/utf8.c	|	49	+++++++++++++++++++++++++++++--------------------
M	src/util.c	|	6	+++---
M	src/util.h	|	6	+++---
M	test/utf8-decode.c	|	127	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M	test/utf8-encode.c	|	42	++++++++++++++++++++++++++++++++----------