Refactor API ("lg_" prefix, better naming scheme) - libgrapheme - Freestanding C library for unicode string handling

	libgrapheme Freestanding C library for unicode string handling
	git clone https://git.sinitax.com/suckless/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt

commit 1c126d7ee10854b29e606e4eeb491621d021beeb
parent 0e3d5f60213ba55935364c73422b373ac380f574
Author: Laslo Hunhold <dev@frign.de>
Date:   Wed,  8 Dec 2021 18:16:48 +0100

Refactor API ("lg_" prefix, better naming scheme)

The "grapheme_" prefix was sadly a bit confusing, so the API now switches
to the "lg_" prefix, which also will not get in the way too much.

"_nextbreak" and "_isbreak" as a general form make it clearer what
we actually do.

Using "utf8_decode" and "utf8_encode" instead of "cp_decode" and
"cp_encode" greatly improves readability and removes any doubt about
what these functions do. libgrapheme is usable with any other encoding
via the "_isbreak"-functions, though you'll have to decode yourself; in
any case, it should be clear by now that UTF-8 should be used everywhere. :)

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile  | 4 ++--
M grapheme.h  | 10 +++++-----
D src/codepoint.c  | 176 -------------------------------------------------------------------------------
M src/grapheme.c  | 16 ++++++++--------
A src/utf8.c  | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/grapheme.c  | 6 +++---
M test/utf8-decode.c  | 40 ++++++++++++++++++++--------------------
M test/utf8-encode.c  | 2 +-

8 files changed, 215 insertions(+), 215 deletions(-)
diff --git a/Makefile b/Makefile
@@ -9,7 +9,7 @@ DATA =\
 	data/GraphemeBreakProperty.txt\
 	data/GraphemeBreakTest.txt
 GEN = gen/grapheme gen/grapheme-test
-LIB = src/codepoint src/grapheme src/util
+LIB = src/grapheme src/utf8 src/util
 TEST = test/grapheme test/utf8-decode test/utf8-encode
 
 MAN3 = man/grapheme_bytelen.3
@@ -20,7 +20,7 @@ all: libgrapheme.a libgrapheme.so
 gen/grapheme.o: gen/grapheme.c config.mk gen/util.h
 gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h
 gen/util.o: gen/util.c config.mk gen/util.h
-src/codepoint.o: src/codepoint.c config.mk grapheme.h
+src/utf8.o: src/utf8.c config.mk grapheme.h
 src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h
 src/util.o: src/util.c config.mk src/util.h
 test/grapheme.o: test/grapheme.c config.mk gen/grapheme-test.h grapheme.h
diff --git a/grapheme.h b/grapheme.h
@@ -5,12 +5,12 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#define GRAPHEME_CP_INVALID UINT32_C(0xFFFD)
+#define LG_CODEPOINT_INVALID UINT32_C(0xFFFD)
 
-int grapheme_boundary(uint32_t, uint32_t, int *);
-size_t grapheme_bytelen(const char *);
+size_t lg_utf8_decode(uint32_t *, const uint8_t *, size_t);
+size_t lg_utf8_encode(uint32_t, uint8_t *, size_t);
 
-size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t);
-size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t);
+size_t lg_grapheme_nextbreak(const char *);
+int lg_grapheme_isbreak(uint32_t, uint32_t, int *);
 
 #endif /* GRAPHEME_H */
diff --git a/src/codepoint.c b/src/codepoint.c
@@ -1,176 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include "../grapheme.h"
-#include <stdio.h>
-
-#define BETWEEN(c, l, u) (c >= l && c <= u)
-#define LEN(x) (sizeof(x) / sizeof(*x))
-
-/* lookup-table for the types of sequence first bytes */
-static const struct {
-	uint8_t  lower; /* lower bound of sequence first byte */
-	uint8_t  upper; /* upper bound of sequence first byte */
-	uint32_t mincp; /* smallest non-overlong encoded code point */
-	uint32_t maxcp; /* largest encodable code point */
-	/*
-	 * implicit: table-offset represents the number of following
-	 * bytes of the form 10xxxxxx (6 bits capacity each)
-	 */
-} lut[] = {
-	[0] = {
-		/* 0xxxxxxx */
-		.lower = 0x00, /* 00000000 */
-		.upper = 0x7F, /* 01111111 */
-		.mincp = (uint32_t)0,
-		.maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
-	},
-	[1] = {
-		/* 110xxxxx */
-		.lower = 0xC0, /* 11000000 */
-		.upper = 0xDF, /* 11011111 */
-		.mincp = (uint32_t)1 << 7,
-		.maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
-	},
-	[2] = {
-		/* 1110xxxx */
-		.lower = 0xE0, /* 11100000 */
-		.upper = 0xEF, /* 11101111 */
-		.mincp = (uint32_t)1 << 11,
-		.maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
-	},
-	[3] = {
-		/* 11110xxx */
-		.lower = 0xF0, /* 11110000 */
-		.upper = 0xF7, /* 11110111 */
-		.mincp = (uint32_t)1 << 16,
-		.maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
-	},
-};
-
-size_t
-grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
-{
-	size_t off, i;
-
-	if (n == 0) {
-		/* a sequence must be at least 1 byte long */
-		*cp = GRAPHEME_CP_INVALID;
-		return 1;
-	}
-
-	/* identify sequence type with the first byte */
-	for (off = 0; off < LEN(lut); off++) {
-		if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
-			/*
-			 * first byte is within the bounds; fill
-			 * p with the the first bits contained in
-			 * the first byte (by subtracting the high bits)
-			 */
-			*cp = s[0] - lut[off].lower;
-			break;
-		}
-	}
-	if (off == LEN(lut)) {
-		/*
-		 * first byte does not match a sequence type;
-		 * set cp as invalid and return 1 byte processed
-		 */
-		*cp = GRAPHEME_CP_INVALID;
-		return 1;
-	}
-	if (1 + off > n) {
-		/*
-		 * input is not long enough, set cp as invalid and
-		 * return number of bytes needed
-		 */
-		*cp = GRAPHEME_CP_INVALID;
-		return 1 + off;
-	}
-
-	/*
-	 * process 'off' following bytes, each of the form 10xxxxxx
-	 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
-	 */
-	for (i = 1; i <= off; i++) {
-		if(!BETWEEN(s[i], 0x80, 0xBF)) {
-			/*
-			 * byte does not match format; return
-			 * number of bytes processed excluding the
-			 * unexpected character as recommended since
-			 * Unicode 6 (chapter 3)
-			 */
-			*cp = GRAPHEME_CP_INVALID;
-			return 1 + (i - 1);
-		}
-		/*
-		 * shift code point by 6 bits and add the 6 stored bits
-		 * in s[i] to it using the bitmask 0x3F (00111111)
-		 */
-		*cp = (*cp << 6) | (s[i] & 0x3F);
-	}
-
-	if (*cp < lut[off].mincp ||
-	    BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
-	    *cp > UINT32_C(0x10FFFF)) {
-		/*
-		 * code point is overlong encoded in the sequence, is a
-		 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
-		 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
-		 * specifies the latter two conditions)
-		 */
-		*cp = GRAPHEME_CP_INVALID;
-	}
-
-	return 1 + off;
-}
-
-size_t
-grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n)
-{
-	size_t off, i;
-
-	if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
-	    cp > UINT32_C(0x10FFFF)) {
-		/*
-		 * code point is a high or low UTF-16 surrogate half
-		 * (0xD800..0xDFFF) or not representable in UTF-16
-		 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
-		 */
-		cp = GRAPHEME_CP_INVALID;
-	}
-
-	/* determine necessary sequence type */
-	for (off = 0; off < LEN(lut); off++) {
-		if (cp <= lut[off].maxcp) {
-			break;
-		}
-	}
-	if (1 + off > n) {
-		/* specified buffer is too small to store sequence */
-		return 1 + off;
-	}
-
-	/* build sequence by filling cp-bits into each byte */
-
-	/*
-	 * lut[off].lower is the bit-format for the first byte and
-	 * the bits to fill into it are determined by shifting the
-	 * cp 6 times the number of following bytes, as each
-	 * following byte stores 6 bits, yielding the wanted bits.
-	 *
-	 * We do not overwrite the mask because we guaranteed earlier
-	 * that there are no bits higher than the mask allows.
-	 */
-	s[0] = lut[off].lower | (cp >> (6 * off));
-
-	for (i = 1; i <= off; i++) {
-		/*
-		 * the bit-format for following bytes is 10000000 (0x80)
-		 * and it each stores 6 bits in the 6 low bits that we
-		 * extract from the properly-shifted value using the
-		 * mask 00111111 (0x3F)
-		 */
-		s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
-	}
-
-	return 1 + off;
-}
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -11,7 +11,7 @@ enum {
 };
 
 int
-grapheme_boundary(uint32_t a, uint32_t b, int *state)
+lg_grapheme_isbreak(uint32_t a, uint32_t b, int *state)
 {
 	struct heisenstate prop[2] = { 0 };
 	int s;
@@ -155,7 +155,7 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state)
 }
 
 size_t
-grapheme_bytelen(const char *str)
+lg_grapheme_nextbreak(const char *str)
 {
 	uint32_t cp0, cp1;
 	size_t ret, len = 0;
@@ -166,7 +166,7 @@ grapheme_bytelen(const char *str)
 	}
 
 	/*
-	 * grapheme_cp_decode, when it encounters an unexpected byte,
+	 * lg_utf8_decode, when it encounters an unexpected byte,
 	 * does not count it to the error and instead assumes that the
 	 * unexpected byte is the beginning of a new sequence.
 	 * This way, when the string ends with a null byte, we never
@@ -178,17 +178,17 @@ grapheme_bytelen(const char *str)
 	 */
 
 	/* get first code point */
-	len += grapheme_cp_decode(&cp0, (uint8_t *)str, 5);
-	if (cp0 == GRAPHEME_CP_INVALID) {
+	len += lg_utf8_decode(&cp0, (uint8_t *)str, 5);
+	if (cp0 == LG_CODEPOINT_INVALID) {
 		return len;
 	}
 
 	while (cp0 != 0) {
 		/* get next code point */
-		ret = grapheme_cp_decode(&cp1, (uint8_t *)(str + len), 5);
+		ret = lg_utf8_decode(&cp1, (uint8_t *)(str + len), 5);
 
-		if (cp1 == GRAPHEME_CP_INVALID ||
-		    grapheme_boundary(cp0, cp1, &state)) {
+		if (cp1 == LG_CODEPOINT_INVALID ||
+		    lg_grapheme_isbreak(cp0, cp1, &state)) {
 			/* we read an invalid cp or have a breakpoint */
 			break;
 		} else {
diff --git a/src/utf8.c b/src/utf8.c
@@ -0,0 +1,176 @@
+/* See LICENSE file for copyright and license details. */
+#include "../grapheme.h"
+#include <stdio.h>
+
+#define BETWEEN(c, l, u) (c >= l && c <= u)
+#define LEN(x) (sizeof(x) / sizeof(*x))
+
+/* lookup table of UTF-8 sequence types, keyed by first-byte range */
+static const struct {
+	uint8_t  lower; /* lower bound of sequence first byte */
+	uint8_t  upper; /* upper bound of sequence first byte */
+	uint32_t mincp; /* smallest non-overlong encoded code point */
+	uint32_t maxcp; /* largest encodable code point */
+	/*
+	 * implicit: the table index equals the number of following
+	 * bytes of the form 10xxxxxx (6 bits capacity each)
+	 */
+} lut[] = {
+	[0] = {
+		/* 0xxxxxxx */
+		.lower = 0x00, /* 00000000 */
+		.upper = 0x7F, /* 01111111 */
+		.mincp = (uint32_t)0,
+		.maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
+	},
+	[1] = {
+		/* 110xxxxx */
+		.lower = 0xC0, /* 11000000 */
+		.upper = 0xDF, /* 11011111 */
+		.mincp = (uint32_t)1 << 7,
+		.maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
+	},
+	[2] = {
+		/* 1110xxxx */
+		.lower = 0xE0, /* 11100000 */
+		.upper = 0xEF, /* 11101111 */
+		.mincp = (uint32_t)1 << 11,
+		.maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
+	},
+	[3] = {
+		/* 11110xxx */
+		.lower = 0xF0, /* 11110000 */
+		.upper = 0xF7, /* 11110111 */
+		.mincp = (uint32_t)1 << 16,
+		.maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
+	},
+};
+
+size_t
+lg_utf8_decode(uint32_t *cp, const uint8_t *s, size_t n)
+{
+	size_t off, i;
+
+	if (n == 0) {
+		/* a sequence must be at least 1 byte long */
+		*cp = LG_CODEPOINT_INVALID;
+		return 1;
+	}
+
+	/* identify sequence type with the first byte */
+	for (off = 0; off < LEN(lut); off++) {
+		if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+			/*
+			 * first byte is within the bounds; fill
+			 * cp with the first bits contained in
+			 * the first byte (by subtracting the high bits)
+			 */
+			*cp = s[0] - lut[off].lower;
+			break;
+		}
+	}
+	if (off == LEN(lut)) {
+		/*
+		 * first byte does not match a sequence type;
+		 * set cp as invalid and return 1 byte processed
+		 */
+		*cp = LG_CODEPOINT_INVALID;
+		return 1;
+	}
+	if (1 + off > n) {
+		/*
+		 * input is not long enough; set cp as invalid and
+		 * return the number of bytes needed
+		 */
+		*cp = LG_CODEPOINT_INVALID;
+		return 1 + off;
+	}
+
+	/*
+	 * process 'off' following bytes, each of the form 10xxxxxx
+	 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
+	 */
+	for (i = 1; i <= off; i++) {
+		if(!BETWEEN(s[i], 0x80, 0xBF)) {
+			/*
+			 * byte does not match format; return
+			 * number of bytes processed excluding the
+			 * unexpected character as recommended since
+			 * Unicode 6 (chapter 3)
+			 */
+			*cp = LG_CODEPOINT_INVALID;
+			return 1 + (i - 1);
+		}
+		/*
+		 * shift code point by 6 bits and add the 6 stored bits
+		 * in s[i] to it using the bitmask 0x3F (00111111)
+		 */
+		*cp = (*cp << 6) | (s[i] & 0x3F);
+	}
+
+	if (*cp < lut[off].mincp ||
+	    BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
+	    *cp > UINT32_C(0x10FFFF)) {
+		/*
+		 * code point is overlong encoded in the sequence, is a
+		 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
+		 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
+		 * specifies the latter two conditions)
+		 */
+		*cp = LG_CODEPOINT_INVALID;
+	}
+
+	return 1 + off;
+}
+
+size_t
+lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
+{
+	size_t off, i;
+
+	if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
+	    cp > UINT32_C(0x10FFFF)) {
+		/*
+		 * code point is a high or low UTF-16 surrogate half
+		 * (0xD800..0xDFFF) or not representable in UTF-16
+		 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
+		 */
+		cp = LG_CODEPOINT_INVALID;
+	}
+
+	/* determine necessary sequence type */
+	for (off = 0; off < LEN(lut); off++) {
+		if (cp <= lut[off].maxcp) {
+			break;
+		}
+	}
+	if (1 + off > n) {
+		/* specified buffer is too small to store sequence */
+		return 1 + off;
+	}
+
+	/* build sequence by filling cp-bits into each byte */
+
+	/*
+	 * lut[off].lower is the bit-format for the first byte and
+	 * the bits to fill into it are determined by shifting the
+	 * cp right by 6 times the number of following bytes, as each
+	 * following byte stores 6 bits, yielding the wanted bits.
+	 *
+	 * We do not overwrite the mask because we guaranteed earlier
+	 * that there are no bits higher than the mask allows.
+	 */
+	s[0] = lut[off].lower | (cp >> (6 * off));
+
+	for (i = 1; i <= off; i++) {
+		/*
+		 * the bit-format for following bytes is 10000000 (0x80)
+		 * and each of them stores 6 bits in its 6 low bits, which
+		 * we extract from the properly-shifted value using the
+		 * mask 00111111 (0x3F)
+		 */
+		s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+	}
+
+	return 1 + off;
+}
diff --git a/test/grapheme.c b/test/grapheme.c
@@ -19,9 +19,9 @@ main(void)
 	for (i = 0, failed = 0; i < LEN(grapheme_test); i++) {
 		for (j = 0, k = 0, state = 0, len = 1; j < grapheme_test[i].cplen; j++) {
 			if ((j + 1) == grapheme_test[i].cplen ||
-			    grapheme_boundary(grapheme_test[i].cp[j],
-			                      grapheme_test[i].cp[j + 1],
-			                      &state)) {
+			    lg_grapheme_isbreak(grapheme_test[i].cp[j],
+			                        grapheme_test[i].cp[j + 1],
+			                        &state)) {
 				/* check if our resulting length matches */
 				if (k == grapheme_test[i].lenlen ||
 				    len != grapheme_test[i].len[k++]) {
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
@@ -22,7 +22,7 @@ static const struct {
 		.arr     = NULL,
 		.len     = 0,
 		.exp_len = 1,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid lead byte
@@ -32,7 +32,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xFD },
 		.len     = 1,
 		.exp_len = 1,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* valid 1-byte sequence
@@ -62,7 +62,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xC3 },
 		.len     = 1,
 		.exp_len = 2,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 2-byte sequence (second byte malformed)
@@ -72,7 +72,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xC3, 0xFF },
 		.len     = 2,
 		.exp_len = 1,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 2-byte sequence (overlong encoded)
@@ -82,7 +82,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xC1, 0xBF },
 		.len     = 2,
 		.exp_len = 2,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* valid 3-byte sequence
@@ -102,7 +102,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xE0 },
 		.len     = 1,
 		.exp_len = 3,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 3-byte sequence (second byte malformed)
@@ -112,7 +112,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
 		.len     = 3,
 		.exp_len = 1,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 3-byte sequence (third byte missing)
@@ -122,7 +122,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xE0, 0xBF },
 		.len     = 2,
 		.exp_len = 3,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 3-byte sequence (third byte malformed)
@@ -132,7 +132,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
 		.len     = 3,
 		.exp_len = 2,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 3-byte sequence (overlong encoded)
@@ -142,7 +142,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
 		.len     = 3,
 		.exp_len = 3,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 3-byte sequence (UTF-16 surrogate half)
@@ -152,7 +152,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xED, 0xA0, 0x80 },
 		.len     = 3,
 		.exp_len = 3,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* valid 4-byte sequence
@@ -172,7 +172,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xF3 },
 		.len     = 1,
 		.exp_len = 4,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 4-byte sequence (second byte malformed)
@@ -182,7 +182,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
 		.len     = 4,
 		.exp_len = 1,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 4-byte sequence (third byte missing)
@@ -192,7 +192,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xF3, 0xBF },
 		.len     = 2,
 		.exp_len = 4,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 4-byte sequence (third byte malformed)
@@ -202,7 +202,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
 		.len     = 4,
 		.exp_len = 2,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 4-byte sequence (fourth byte missing)
@@ -212,7 +212,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
 		.len     = 3,
 		.exp_len = 4,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 4-byte sequence (fourth byte malformed)
@@ -222,7 +222,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
 		.len     = 4,
 		.exp_len = 3,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 4-byte sequence (overlong encoded)
@@ -232,7 +232,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
 		.len     = 4,
 		.exp_len = 4,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 	{
 		/* invalid 4-byte sequence (UTF-16-unrepresentable)
@@ -242,7 +242,7 @@ static const struct {
 		.arr     = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
 		.len     = 4,
 		.exp_len = 4,
-		.exp_cp  = GRAPHEME_CP_INVALID,
+		.exp_cp  = LG_CODEPOINT_INVALID,
 	},
 };
 
@@ -256,7 +256,7 @@ main(void)
 		size_t len;
 		uint32_t cp;
 
-		len = grapheme_cp_decode(&cp, dec_test[i].arr,
+		len = lg_utf8_decode(&cp, dec_test[i].arr,
 		                         dec_test[i].len);
 
 		if (len != dec_test[i].exp_len ||
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
@@ -61,7 +61,7 @@ main(void)
 		uint8_t arr[4];
 		size_t len;
 
-		len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
+		len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));
 
 		if (len != enc_test[i].exp_len ||
 		    memcmp(arr, enc_test[i].exp_arr, len)) {

M	Makefile	\|	4	++--
M	grapheme.h	\|	10	+++++-----
D	src/codepoint.c	\|	176	-------------------------------------------------------------------------------
M	src/grapheme.c	\|	16	++++++++--------
A	src/utf8.c	\|	176	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	test/grapheme.c	\|	6	+++---
M	test/utf8-decode.c	\|	40	++++++++++++++++++++--------------------
M	test/utf8-encode.c	\|	2	+-