libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 1c126d7ee10854b29e606e4eeb491621d021beeb
parent 0e3d5f60213ba55935364c73422b373ac380f574
Author: Laslo Hunhold <dev@frign.de>
Date:   Wed,  8 Dec 2021 18:16:48 +0100

Refactor API ("lg_" prefix, better naming scheme)

The "grapheme_" prefix was sadly a bit confusing, so the API now switches
to the "lg_" prefix, which also will not get in the way too much.

"_nextbreak" and "_isbreak" as a general form make it clearer what
we actually do.

Using "utf8_decode" and "utf8_encode" instead of "cp_decode" and
"cp_encode" greatly improves readability and removes any doubt about
what these functions do. libgrapheme is usable with any other encoding
via the "_isbreak"-functions, though you'll have to decode yourself;
still, it should be clear by now that UTF-8 should be used everywhere. :)

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 4 ++--
M grapheme.h | 10 +++++-----
D src/codepoint.c | 176 -------------------------------------------------------------------------------
M src/grapheme.c | 16 ++++++++--------
A src/utf8.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/grapheme.c | 6 +++---
M test/utf8-decode.c | 40 ++++++++++++++++++++--------------------
M test/utf8-encode.c | 2 +-
8 files changed, 215 insertions(+), 215 deletions(-)

diff --git a/Makefile b/Makefile @@ -9,7 +9,7 @@ DATA =\ data/GraphemeBreakProperty.txt\ data/GraphemeBreakTest.txt GEN = gen/grapheme gen/grapheme-test -LIB = src/codepoint src/grapheme src/util +LIB = src/grapheme src/utf8 src/util TEST = test/grapheme test/utf8-decode test/utf8-encode MAN3 = man/grapheme_bytelen.3 @@ -20,7 +20,7 @@ all: libgrapheme.a libgrapheme.so gen/grapheme.o: gen/grapheme.c config.mk gen/util.h gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h gen/util.o: gen/util.c config.mk gen/util.h -src/codepoint.o: src/codepoint.c config.mk grapheme.h +src/utf8.o: src/utf8.c config.mk grapheme.h src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h src/util.o: src/util.c config.mk src/util.h test/grapheme.o: test/grapheme.c config.mk gen/grapheme-test.h grapheme.h diff --git a/grapheme.h b/grapheme.h @@ -5,12 +5,12 @@ #include <stddef.h> #include <stdint.h> -#define GRAPHEME_CP_INVALID UINT32_C(0xFFFD) +#define LG_CODEPOINT_INVALID UINT32_C(0xFFFD) -int grapheme_boundary(uint32_t, uint32_t, int *); -size_t grapheme_bytelen(const char *); +size_t lg_utf8_decode(uint32_t *, const uint8_t *, size_t); +size_t lg_utf8_encode(uint32_t, uint8_t *, size_t); -size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t); -size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t); +size_t lg_grapheme_nextbreak(const char *); +int lg_grapheme_isbreak(uint32_t, uint32_t, int *); #endif /* GRAPHEME_H */ diff --git a/src/codepoint.c b/src/codepoint.c @@ -1,176 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include "../grapheme.h" -#include <stdio.h> - -#define BETWEEN(c, l, u) (c >= l && c <= u) -#define LEN(x) (sizeof(x) / sizeof(*x)) - -/* lookup-table for the types of sequence first bytes */ -static const struct { - uint8_t lower; /* lower bound of sequence first byte */ - uint8_t upper; /* upper bound of sequence first byte */ - uint32_t mincp; /* smallest non-overlong encoded code point */ - uint32_t maxcp; /* largest encodable code point */ - /* - * implicit: table-offset represents the number of following - * bytes of the form 10xxxxxx (6 bits capacity each) - */ -} lut[] = { - [0] = { - /* 0xxxxxxx */ - .lower = 0x00, /* 00000000 */ - .upper = 0x7F, /* 01111111 */ - .mincp = (uint32_t)0, - .maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */ - }, - [1] = { - /* 110xxxxx */ - .lower = 0xC0, /* 11000000 */ - .upper = 0xDF, /* 11011111 */ - .mincp = (uint32_t)1 << 7, - .maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */ - }, - [2] = { - /* 1110xxxx */ - .lower = 0xE0, /* 11100000 */ - .upper = 0xEF, /* 11101111 */ - .mincp = (uint32_t)1 << 11, - .maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */ - }, - [3] = { - /* 11110xxx */ - .lower = 0xF0, /* 11110000 */ - .upper = 0xF7, /* 11110111 */ - .mincp = (uint32_t)1 << 16, - .maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */ - }, -}; - -size_t -grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n) -{ - size_t off, i; - - if (n == 0) { - /* a sequence must be at least 1 byte long */ - *cp = GRAPHEME_CP_INVALID; - return 1; - } - - /* identify sequence type with the first byte */ - for (off = 0; off < LEN(lut); off++) { - if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) { - /* - * first byte is within the bounds; fill - * p with the the first bits contained in - * the first byte (by subtracting the high bits) - */ - *cp = s[0] - lut[off].lower; - break; - } - } - if (off == LEN(lut)) { - /* - * first byte does not match a sequence type; - * set cp as invalid and 
return 1 byte processed - */ - *cp = GRAPHEME_CP_INVALID; - return 1; - } - if (1 + off > n) { - /* - * input is not long enough, set cp as invalid and - * return number of bytes needed - */ - *cp = GRAPHEME_CP_INVALID; - return 1 + off; - } - - /* - * process 'off' following bytes, each of the form 10xxxxxx - * (i.e. between 0x80 (10000000) and 0xBF (10111111)) - */ - for (i = 1; i <= off; i++) { - if(!BETWEEN(s[i], 0x80, 0xBF)) { - /* - * byte does not match format; return - * number of bytes processed excluding the - * unexpected character as recommended since - * Unicode 6 (chapter 3) - */ - *cp = GRAPHEME_CP_INVALID; - return 1 + (i - 1); - } - /* - * shift code point by 6 bits and add the 6 stored bits - * in s[i] to it using the bitmask 0x3F (00111111) - */ - *cp = (*cp << 6) | (s[i] & 0x3F); - } - - if (*cp < lut[off].mincp || - BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || - *cp > UINT32_C(0x10FFFF)) { - /* - * code point is overlong encoded in the sequence, is a - * high or low UTF-16 surrogate half (0xD800..0xDFFF) or - * not representable in UTF-16 (>0x10FFFF) (RFC-3629 - * specifies the latter two conditions) - */ - *cp = GRAPHEME_CP_INVALID; - } - - return 1 + off; -} - -size_t -grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n) -{ - size_t off, i; - - if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || - cp > UINT32_C(0x10FFFF)) { - /* - * code point is a high or low UTF-16 surrogate half - * (0xD800..0xDFFF) or not representable in UTF-16 - * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8. 
- */ - cp = GRAPHEME_CP_INVALID; - } - - /* determine necessary sequence type */ - for (off = 0; off < LEN(lut); off++) { - if (cp <= lut[off].maxcp) { - break; - } - } - if (1 + off > n) { - /* specified buffer is too small to store sequence */ - return 1 + off; - } - - /* build sequence by filling cp-bits into each byte */ - - /* - * lut[off].lower is the bit-format for the first byte and - * the bits to fill into it are determined by shifting the - * cp 6 times the number of following bytes, as each - * following byte stores 6 bits, yielding the wanted bits. - * - * We do not overwrite the mask because we guaranteed earlier - * that there are no bits higher than the mask allows. - */ - s[0] = lut[off].lower | (cp >> (6 * off)); - - for (i = 1; i <= off; i++) { - /* - * the bit-format for following bytes is 10000000 (0x80) - * and it each stores 6 bits in the 6 low bits that we - * extract from the properly-shifted value using the - * mask 00111111 (0x3F) - */ - s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F); - } - - return 1 + off; -} diff --git a/src/grapheme.c b/src/grapheme.c @@ -11,7 +11,7 @@ enum { }; int -grapheme_boundary(uint32_t a, uint32_t b, int *state) +lg_grapheme_isbreak(uint32_t a, uint32_t b, int *state) { struct heisenstate prop[2] = { 0 }; int s; @@ -155,7 +155,7 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state) } size_t -grapheme_bytelen(const char *str) +lg_grapheme_nextbreak(const char *str) { uint32_t cp0, cp1; size_t ret, len = 0; @@ -166,7 +166,7 @@ grapheme_bytelen(const char *str) } /* - * grapheme_cp_decode, when it encounters an unexpected byte, + * lg_utf8_decode, when it encounters an unexpected byte, * does not count it to the error and instead assumes that the * unexpected byte is the beginning of a new sequence. 
* This way, when the string ends with a null byte, we never @@ -178,17 +178,17 @@ grapheme_bytelen(const char *str) */ /* get first code point */ - len += grapheme_cp_decode(&cp0, (uint8_t *)str, 5); - if (cp0 == GRAPHEME_CP_INVALID) { + len += lg_utf8_decode(&cp0, (uint8_t *)str, 5); + if (cp0 == LG_CODEPOINT_INVALID) { return len; } while (cp0 != 0) { /* get next code point */ - ret = grapheme_cp_decode(&cp1, (uint8_t *)(str + len), 5); + ret = lg_utf8_decode(&cp1, (uint8_t *)(str + len), 5); - if (cp1 == GRAPHEME_CP_INVALID || - grapheme_boundary(cp0, cp1, &state)) { + if (cp1 == LG_CODEPOINT_INVALID || + lg_grapheme_isbreak(cp0, cp1, &state)) { /* we read an invalid cp or have a breakpoint */ break; } else { diff --git a/src/utf8.c b/src/utf8.c @@ -0,0 +1,176 @@ +/* See LICENSE file for copyright and license details. */ +#include "../grapheme.h" +#include <stdio.h> + +#define BETWEEN(c, l, u) (c >= l && c <= u) +#define LEN(x) (sizeof(x) / sizeof(*x)) + +/* lookup-table for the types of sequence first bytes */ +static const struct { + uint8_t lower; /* lower bound of sequence first byte */ + uint8_t upper; /* upper bound of sequence first byte */ + uint32_t mincp; /* smallest non-overlong encoded code point */ + uint32_t maxcp; /* largest encodable code point */ + /* + * implicit: table-offset represents the number of following + * bytes of the form 10xxxxxx (6 bits capacity each) + */ +} lut[] = { + [0] = { + /* 0xxxxxxx */ + .lower = 0x00, /* 00000000 */ + .upper = 0x7F, /* 01111111 */ + .mincp = (uint32_t)0, + .maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */ + }, + [1] = { + /* 110xxxxx */ + .lower = 0xC0, /* 11000000 */ + .upper = 0xDF, /* 11011111 */ + .mincp = (uint32_t)1 << 7, + .maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */ + }, + [2] = { + /* 1110xxxx */ + .lower = 0xE0, /* 11100000 */ + .upper = 0xEF, /* 11101111 */ + .mincp = (uint32_t)1 << 11, + .maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */ + }, + [3] = { + /* 
11110xxx */ + .lower = 0xF0, /* 11110000 */ + .upper = 0xF7, /* 11110111 */ + .mincp = (uint32_t)1 << 16, + .maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */ + }, +}; + +size_t +lg_utf8_decode(uint32_t *cp, const uint8_t *s, size_t n) +{ + size_t off, i; + + if (n == 0) { + /* a sequence must be at least 1 byte long */ + *cp = LG_CODEPOINT_INVALID; + return 1; + } + + /* identify sequence type with the first byte */ + for (off = 0; off < LEN(lut); off++) { + if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) { + /* + * first byte is within the bounds; fill + * p with the the first bits contained in + * the first byte (by subtracting the high bits) + */ + *cp = s[0] - lut[off].lower; + break; + } + } + if (off == LEN(lut)) { + /* + * first byte does not match a sequence type; + * set cp as invalid and return 1 byte processed + */ + *cp = LG_CODEPOINT_INVALID; + return 1; + } + if (1 + off > n) { + /* + * input is not long enough, set cp as invalid and + * return number of bytes needed + */ + *cp = LG_CODEPOINT_INVALID; + return 1 + off; + } + + /* + * process 'off' following bytes, each of the form 10xxxxxx + * (i.e. 
between 0x80 (10000000) and 0xBF (10111111)) + */ + for (i = 1; i <= off; i++) { + if(!BETWEEN(s[i], 0x80, 0xBF)) { + /* + * byte does not match format; return + * number of bytes processed excluding the + * unexpected character as recommended since + * Unicode 6 (chapter 3) + */ + *cp = LG_CODEPOINT_INVALID; + return 1 + (i - 1); + } + /* + * shift code point by 6 bits and add the 6 stored bits + * in s[i] to it using the bitmask 0x3F (00111111) + */ + *cp = (*cp << 6) | (s[i] & 0x3F); + } + + if (*cp < lut[off].mincp || + BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || + *cp > UINT32_C(0x10FFFF)) { + /* + * code point is overlong encoded in the sequence, is a + * high or low UTF-16 surrogate half (0xD800..0xDFFF) or + * not representable in UTF-16 (>0x10FFFF) (RFC-3629 + * specifies the latter two conditions) + */ + *cp = LG_CODEPOINT_INVALID; + } + + return 1 + off; +} + +size_t +lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n) +{ + size_t off, i; + + if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || + cp > UINT32_C(0x10FFFF)) { + /* + * code point is a high or low UTF-16 surrogate half + * (0xD800..0xDFFF) or not representable in UTF-16 + * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8. + */ + cp = LG_CODEPOINT_INVALID; + } + + /* determine necessary sequence type */ + for (off = 0; off < LEN(lut); off++) { + if (cp <= lut[off].maxcp) { + break; + } + } + if (1 + off > n) { + /* specified buffer is too small to store sequence */ + return 1 + off; + } + + /* build sequence by filling cp-bits into each byte */ + + /* + * lut[off].lower is the bit-format for the first byte and + * the bits to fill into it are determined by shifting the + * cp 6 times the number of following bytes, as each + * following byte stores 6 bits, yielding the wanted bits. + * + * We do not overwrite the mask because we guaranteed earlier + * that there are no bits higher than the mask allows. 
+ */ + s[0] = lut[off].lower | (cp >> (6 * off)); + + for (i = 1; i <= off; i++) { + /* + * the bit-format for following bytes is 10000000 (0x80) + * and it each stores 6 bits in the 6 low bits that we + * extract from the properly-shifted value using the + * mask 00111111 (0x3F) + */ + s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F); + } + + return 1 + off; +} diff --git a/test/grapheme.c b/test/grapheme.c @@ -19,9 +19,9 @@ main(void) for (i = 0, failed = 0; i < LEN(grapheme_test); i++) { for (j = 0, k = 0, state = 0, len = 1; j < grapheme_test[i].cplen; j++) { if ((j + 1) == grapheme_test[i].cplen || - grapheme_boundary(grapheme_test[i].cp[j], - grapheme_test[i].cp[j + 1], - &state)) { + lg_grapheme_isbreak(grapheme_test[i].cp[j], + grapheme_test[i].cp[j + 1], + &state)) { /* check if our resulting length matches */ if (k == grapheme_test[i].lenlen || len != grapheme_test[i].len[k++]) { diff --git a/test/utf8-decode.c b/test/utf8-decode.c @@ -22,7 +22,7 @@ static const struct { .arr = NULL, .len = 0, .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid lead byte @@ -32,7 +32,7 @@ static const struct { .arr = (uint8_t[]){ 0xFD }, .len = 1, .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* valid 1-byte sequence @@ -62,7 +62,7 @@ static const struct { .arr = (uint8_t[]){ 0xC3 }, .len = 1, .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 2-byte sequence (second byte malformed) @@ -72,7 +72,7 @@ static const struct { .arr = (uint8_t[]){ 0xC3, 0xFF }, .len = 2, .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 2-byte sequence (overlong encoded) @@ -82,7 +82,7 @@ static const struct { .arr = (uint8_t[]){ 0xC1, 0xBF }, .len = 2, .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* valid 3-byte sequence @@ -102,7 +102,7 @@ static const struct { .arr = 
(uint8_t[]){ 0xE0 }, .len = 1, .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 3-byte sequence (second byte malformed) @@ -112,7 +112,7 @@ static const struct { .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, .len = 3, .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 3-byte sequence (third byte missing) @@ -122,7 +122,7 @@ static const struct { .arr = (uint8_t[]){ 0xE0, 0xBF }, .len = 2, .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 3-byte sequence (third byte malformed) @@ -132,7 +132,7 @@ static const struct { .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, .len = 3, .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 3-byte sequence (overlong encoded) @@ -142,7 +142,7 @@ static const struct { .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, .len = 3, .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 3-byte sequence (UTF-16 surrogate half) @@ -152,7 +152,7 @@ static const struct { .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, .len = 3, .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* valid 4-byte sequence @@ -172,7 +172,7 @@ static const struct { .arr = (uint8_t[]){ 0xF3 }, .len = 1, .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 4-byte sequence (second byte malformed) @@ -182,7 +182,7 @@ static const struct { .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, .len = 4, .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 4-byte sequence (third byte missing) @@ -192,7 +192,7 @@ static const struct { .arr = (uint8_t[]){ 0xF3, 0xBF }, .len = 2, .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 4-byte sequence (third byte malformed) @@ -202,7 +202,7 @@ static const struct { .arr = 
(uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, .len = 4, .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 4-byte sequence (fourth byte missing) @@ -212,7 +212,7 @@ static const struct { .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, .len = 3, .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 4-byte sequence (fourth byte malformed) @@ -222,7 +222,7 @@ static const struct { .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, .len = 4, .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 4-byte sequence (overlong encoded) @@ -232,7 +232,7 @@ static const struct { .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, .len = 4, .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, { /* invalid 4-byte sequence (UTF-16-unrepresentable) @@ -242,7 +242,7 @@ static const struct { .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, .len = 4, .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, + .exp_cp = LG_CODEPOINT_INVALID, }, }; @@ -256,7 +256,7 @@ main(void) size_t len; uint32_t cp; - len = grapheme_cp_decode(&cp, dec_test[i].arr, + len = lg_utf8_decode(&cp, dec_test[i].arr, dec_test[i].len); if (len != dec_test[i].exp_len || diff --git a/test/utf8-encode.c b/test/utf8-encode.c @@ -61,7 +61,7 @@ main(void) uint8_t arr[4]; size_t len; - len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr)); + len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr)); if (len != enc_test[i].exp_len || memcmp(arr, enc_test[i].exp_arr, len)) {