libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit b99a40eefc2ec1ad8714ed210a3aeedfb3283159
parent 20c105bcdd1c54401d4d23cdb9ded56ee7a2ffd4
Author: Laslo Hunhold <dev@frign.de>
Date:   Fri, 17 Dec 2021 00:34:27 +0100

Encourage strict aliasing for library users (uint8_t * -> char *)

In a long-winded discussion, Michael Forney, who has a really deep
understanding of the C specification, rightfully pointed out that
using uint8_t * might look good on paper, but leads to subtle
problems due to intricacies of the C99 specification.

While you can alias any pointer to character types (char, unsigned char,
signed char), uint8_t is not a character type and aliasing to it breaks
the strict aliasing rule. This is not a problem in practice as gcc
is the only big compiler enforcing strict aliasing and uint8_t is
usually defined as unsigned char, inheriting the aliasing property for
technical reasons, but strictly speaking uint8_t is not a character
type.

With uint8_t * in the API, library users would've been forced to cast
any input string to uint8_t *, breaking the strict aliasing rule. A
lot of code relies on such casts working or conveniently disables
strict aliasing through compiler flags, but using char arrays is the
only really portable and safe way to work with strings.
That char is usually 8 bits wide and strongly indicates that we're
dealing with a string is one strong point for using char *; another is
that C11 introduced UTF-8 string literals of the form u8"..." which
are of type char[]. In this sense, using char * ensures some form of
forward compatibility and fits nicely within the spec, which is slowly
converging towards UTF-8.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mgrapheme.h | 6+++---
Mman/lg_grapheme_nextbreak.3 | 4++--
Msrc/grapheme.c | 2+-
Msrc/utf8.c | 25+++++++++++++++++--------
Mtest/utf8-decode.c | 46+++++++++++++++++++++++-----------------------
Mtest/utf8-encode.c | 16++++++++--------
6 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/grapheme.h b/grapheme.h @@ -19,11 +19,11 @@ typedef struct lg_internal_segmentation_state { #define LG_CODEPOINT_INVALID UINT32_C(0xFFFD) -size_t lg_grapheme_nextbreak(const uint8_t *); +size_t lg_grapheme_nextbreak(const char *); bool lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *); -size_t lg_utf8_decode(const uint8_t *, size_t, uint_least32_t *); -size_t lg_utf8_encode(uint_least32_t, uint8_t *, size_t); +size_t lg_utf8_decode(const char *, size_t, uint_least32_t *); +size_t lg_utf8_encode(uint_least32_t, char *, size_t); #endif /* GRAPHEME_H */ diff --git a/man/lg_grapheme_nextbreak.3 b/man/lg_grapheme_nextbreak.3 @@ -7,7 +7,7 @@ .Sh SYNOPSIS .In grapheme.h .Ft size_t -.Fn lg_grapheme_nextbreak "const uint8_t *str" +.Fn lg_grapheme_nextbreak "const char *str" .Sh DESCRIPTION .Fn lg_grapheme_nextbreak computes the offset (in bytes) to the next grapheme @@ -52,7 +52,7 @@ main(void) /* print each grapheme cluster with byte-length */ for (; *s != '\\0';) { - len = lg_grapheme_nextbreak((uint8_t *)s); + len = lg_grapheme_nextbreak(s); printf("%2zu bytes | %.*s\\n", len, (int)len, s, len); s += len; } diff --git a/src/grapheme.c b/src/grapheme.c @@ -179,7 +179,7 @@ hasbreak: } size_t -lg_grapheme_nextbreak(const uint8_t *str) +lg_grapheme_nextbreak(const char *str) { uint_least32_t cp0, cp1; size_t ret, len = 0; diff --git a/src/utf8.c b/src/utf8.c @@ -48,7 +48,7 @@ static const struct { }; size_t -lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp) +lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) { size_t off, i; @@ -60,13 +60,14 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp) /* identify sequence type with the first byte */ for (off = 0; off < LEN(lut); off++) { - if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) { + if (BETWEEN(((unsigned char *)s)[0], lut[off].lower, + lut[off].upper)) { /* * first byte is within the bounds; fill * p with the the first bits contained in * the 
first byte (by subtracting the high bits) */ - *cp = s[0] - lut[off].lower; + *cp = ((unsigned char *)s)[0] - lut[off].lower; break; } } @@ -74,6 +75,9 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp) /* * first byte does not match a sequence type; * set cp as invalid and return 1 byte processed + * + * this also includes the cases where bits higher than + * the 8th are set on systems with CHAR_BIT > 8 */ *cp = LG_CODEPOINT_INVALID; return 1; @@ -92,12 +96,16 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp) * (i.e. between 0x80 (10000000) and 0xBF (10111111)) */ for (i = 1; i <= off; i++) { - if(!BETWEEN(s[i], 0x80, 0xBF)) { + if(!BETWEEN(((unsigned char *)s)[i], 0x80, 0xBF)) { /* * byte does not match format; return * number of bytes processed excluding the * unexpected character as recommended since * Unicode 6 (chapter 3) + * + * this also includes the cases where bits + * higher than the 8th are set on systems + * with CHAR_BIT > 8 */ *cp = LG_CODEPOINT_INVALID; return 1 + (i - 1); @@ -106,7 +114,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp) * shift code point by 6 bits and add the 6 stored bits * in s[i] to it using the bitmask 0x3F (00111111) */ - *cp = (*cp << 6) | (s[i] & 0x3F); + *cp = (*cp << 6) | (((unsigned char *)s)[i] & 0x3F); } if (*cp < lut[off].mincp || @@ -125,7 +133,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp) } size_t -lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n) +lg_utf8_encode(uint_least32_t cp, char *s, size_t n) { size_t off, i; @@ -165,7 +173,7 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n) * We do not overwrite the mask because we guaranteed earlier * that there are no bits higher than the mask allows. 
*/ - s[0] = lut[off].lower | (uint8_t)(cp >> (6 * off)); + ((unsigned char *)s)[0] = lut[off].lower | (uint8_t)(cp >> (6 * off)); for (i = 1; i <= off; i++) { /* @@ -174,7 +182,8 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n) * extract from the properly-shifted value using the * mask 00111111 (0x3F) */ - s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F); + ((unsigned char *)s)[i] = 0x80 | + ((cp >> (6 * (off - i))) & 0x3F); } return 1 + off; diff --git a/test/utf8-decode.c b/test/utf8-decode.c @@ -8,7 +8,7 @@ #include "util.h" static const struct { - uint8_t *arr; /* UTF-8 byte sequence */ + char *arr; /* UTF-8 byte sequence */ size_t len; /* length of UTF-8 byte sequence */ size_t exp_len; /* expected length returned */ uint_least32_t exp_cp; /* expected code point returned */ @@ -28,7 +28,7 @@ static const struct { * [ 11111101 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xFD }, + .arr = (char *)(unsigned char[]){ 0xFD }, .len = 1, .exp_len = 1, .exp_cp = LG_CODEPOINT_INVALID, @@ -38,7 +38,7 @@ static const struct { * [ 00000001 ] -> * 0000001 */ - .arr = (uint8_t[]){ 0x01 }, + .arr = (char *)(unsigned char[]){ 0x01 }, .len = 1, .exp_len = 1, .exp_cp = 0x1, @@ -48,7 +48,7 @@ static const struct { * [ 11000011 10111111 ] -> * 00011111111 */ - .arr = (uint8_t[]){ 0xC3, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xC3, 0xBF }, .len = 2, .exp_len = 2, .exp_cp = 0xFF, @@ -58,7 +58,7 @@ static const struct { * [ 11000011 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xC3 }, + .arr = (char *)(unsigned char[]){ 0xC3 }, .len = 1, .exp_len = 2, .exp_cp = LG_CODEPOINT_INVALID, @@ -68,7 +68,7 @@ static const struct { * [ 11000011 11111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xC3, 0xFF }, + .arr = (char *)(unsigned char[]){ 0xC3, 0xFF }, .len = 2, .exp_len = 1, .exp_cp = LG_CODEPOINT_INVALID, @@ -78,7 +78,7 @@ static const struct { * [ 11000001 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xC1, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xC1, 0xBF }, .len = 2, 
.exp_len = 2, .exp_cp = LG_CODEPOINT_INVALID, @@ -88,7 +88,7 @@ static const struct { * [ 11100000 10111111 10111111 ] -> * 0000111111111111 */ - .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF }, .len = 3, .exp_len = 3, .exp_cp = 0xFFF, @@ -98,7 +98,7 @@ static const struct { * [ 11100000 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xE0 }, + .arr = (char *)(unsigned char[]){ 0xE0 }, .len = 1, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -108,7 +108,7 @@ static const struct { * [ 11100000 01111111 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF }, .len = 3, .exp_len = 1, .exp_cp = LG_CODEPOINT_INVALID, @@ -118,7 +118,7 @@ static const struct { * [ 11100000 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xE0, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xE0, 0xBF }, .len = 2, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -128,7 +128,7 @@ static const struct { * [ 11100000 10111111 01111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, + .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F }, .len = 3, .exp_len = 2, .exp_cp = LG_CODEPOINT_INVALID, @@ -138,7 +138,7 @@ static const struct { * [ 11100000 10011111 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF }, .len = 3, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -148,7 +148,7 @@ static const struct { * [ 11101101 10100000 10000000 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, + .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 }, .len = 3, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -158,7 +158,7 @@ static const struct { * [ 11110011 10111111 10111111 10111111 ] -> * 011111111111111111111 */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF }, .len = 4, .exp_len = 4, .exp_cp = UINT32_C(0xFFFFF), @@ -168,7 +168,7 @@ static const 
struct { * [ 11110011 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xF3 }, + .arr = (char *)(unsigned char[]){ 0xF3 }, .len = 1, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, @@ -178,7 +178,7 @@ static const struct { * [ 11110011 01111111 10111111 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF }, .len = 4, .exp_len = 1, .exp_cp = LG_CODEPOINT_INVALID, @@ -188,7 +188,7 @@ static const struct { * [ 11110011 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xF3, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xF3, 0xBF }, .len = 2, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, @@ -198,7 +198,7 @@ static const struct { * [ 11110011 10111111 01111111 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF }, .len = 4, .exp_len = 2, .exp_cp = LG_CODEPOINT_INVALID, @@ -208,7 +208,7 @@ static const struct { * [ 11110011 10111111 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF }, .len = 3, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, @@ -218,7 +218,7 @@ static const struct { * [ 11110011 10111111 10111111 01111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, + .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F }, .len = 4, .exp_len = 3, .exp_cp = LG_CODEPOINT_INVALID, @@ -228,7 +228,7 @@ static const struct { * [ 11110000 10000000 10000001 10111111 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, + .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF }, .len = 4, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, @@ -238,7 +238,7 @@ static const struct { * [ 11110100 10010000 10000000 10000000 ] -> * INVALID */ - .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, + .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 }, .len = 4, .exp_len = 4, .exp_cp = LG_CODEPOINT_INVALID, diff --git 
a/test/utf8-encode.c b/test/utf8-encode.c @@ -9,43 +9,43 @@ static const struct { uint_least32_t cp; /* input code point */ - uint8_t *exp_arr; /* expected UTF-8 byte sequence */ + char *exp_arr; /* expected UTF-8 byte sequence */ size_t exp_len; /* expected length of UTF-8 sequence */ } enc_test[] = { { /* invalid code point (UTF-16 surrogate half) */ .cp = UINT32_C(0xD800), - .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, + .exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD }, .exp_len = 3, }, { /* invalid code point (UTF-16-unrepresentable) */ .cp = UINT32_C(0x110000), - .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, + .exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD }, .exp_len = 3, }, { /* code point encoded to a 1-byte sequence */ .cp = 0x01, - .exp_arr = (uint8_t[]){ 0x01 }, + .exp_arr = (char *)(unsigned char[]){ 0x01 }, .exp_len = 1, }, { /* code point encoded to a 2-byte sequence */ .cp = 0xFF, - .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, + .exp_arr = (char *)(unsigned char[]){ 0xC3, 0xBF }, .exp_len = 2, }, { /* code point encoded to a 3-byte sequence */ .cp = 0xFFF, - .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, + .exp_arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF }, .exp_len = 3, }, { /* code point encoded to a 4-byte sequence */ .cp = UINT32_C(0xFFFFF), - .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, + .exp_arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF }, .exp_len = 4, }, }; @@ -59,7 +59,7 @@ main(int argc, char *argv[]) /* UTF-8 encoder test */ for (i = 0, failed = 0; i < LEN(enc_test); i++) { - uint8_t arr[4]; + char arr[4]; size_t len; len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));