libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 82b85a60b3a334c928aa22de2555a55367bf739d
parent dfda0db8503b0051addc96368840b06c22fa8eeb
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 18 Dec 2021 12:48:32 +0100

Reintroduce the "grapheme_" prefix

With the character ambiguity out of the way we can now go back to
prefixing everything with "grapheme_" instead of "lg_". It's always
better to have a prefix matching the library name, as it's otherwise
not immediately obvious where a given symbol or function comes from.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M grapheme.h | 21 +++++++++++----------
A man/grapheme_character_isbreak.3 | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_character_nextbreak.3 | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_utf8_decode.3 | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_utf8_encode.3 | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D man/lg_grapheme_isbreak.3 | 80 -------------------------------------------------------------------------------
D man/lg_grapheme_nextbreak.3 | 72 ------------------------------------------------------------------------
D man/lg_utf8_decode.3 | 101 -------------------------------------------------------------------------------
D man/lg_utf8_encode.3 | 98 -------------------------------------------------------------------------------
M man/libgrapheme.7 | 8 ++++----
M src/character.c | 21 +++++++++++----------
M src/utf8.c | 16 ++++++++--------
M src/util.c | 6 +++---
M src/util.h | 6 +++---
M test/character-performance.c | 4 ++--
M test/character.c | 8 ++++----
M test/utf8-decode.c | 50 +++++++++++++++++++++++++-------------------------
M test/utf8-encode.c | 2 +-
18 files changed, 423 insertions(+), 421 deletions(-)

diff --git a/grapheme.h b/grapheme.h @@ -6,24 +6,25 @@ #include <stddef.h> #include <stdint.h> -struct lg_internal_heisenstate { +struct grapheme_internal_heisenstate { uint_least64_t determined; uint_least64_t state; }; -typedef struct lg_internal_segmentation_state { - struct lg_internal_heisenstate a; - struct lg_internal_heisenstate b; +typedef struct grapheme_internal_segmentation_state { + struct grapheme_internal_heisenstate a; + struct grapheme_internal_heisenstate b; uint_least16_t flags; -} LG_SEGMENTATION_STATE; +} GRAPHEME_SEGMENTATION_STATE; -#define LG_INVALID_CODE_POINT UINT32_C(0xFFFD) +#define GRAPHEME_INVALID_CODE_POINT UINT32_C(0xFFFD) -size_t lg_character_nextbreak(const char *); +size_t grapheme_character_nextbreak(const char *); -bool lg_character_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *); +bool grapheme_character_isbreak(uint_least32_t, uint_least32_t, + GRAPHEME_SEGMENTATION_STATE *); -size_t lg_utf8_decode(const char *, size_t, uint_least32_t *); -size_t lg_utf8_encode(uint_least32_t, char *, size_t); +size_t grapheme_utf8_decode(const char *, size_t, uint_least32_t *); +size_t grapheme_utf8_encode(uint_least32_t, char *, size_t); #endif /* GRAPHEME_H */ diff --git a/man/grapheme_character_isbreak.3 b/man/grapheme_character_isbreak.3 @@ -0,0 +1,80 @@ +.Dd 2021-12-18 +.Dt GRAPHEME_CHARACTER_ISBREAK 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_character_isbreak +.Nd test for a grapheme cluster break between two code points +.Sh SYNOPSIS +.In grapheme.h +.Ft bool +.Fn grapheme_character_isbreak "uint_least32_t cp1" "uint_least32_t cp2" "GRAPHEME_SEGMENTATION_STATE *state" +.Sh DESCRIPTION +The +.Fn grapheme_character_isbreak +function determines if there is a grapheme cluster break (see +.Xr libgrapheme 7 ) +between the two code points +.Va cp1 +and +.Va cp2 . 
+By specification this decision depends on a +.Va state +that can at most be completely reset after detecting a break and must +be reset every time one deviates from sequential processing. +.Pp +If +.Va state +is +.Dv NULL +.Fn grapheme_character_isbreak +behaves as if it was called with a fully reset state. +.Sh RETURN VALUES +The +.Fn grapheme_character_isbreak +function returns +.Va true +if there is a grapheme cluster break between the code points +.Va cp1 +and +.Va cp2 +and +.Va false +if there is not. +.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +int +main(void) +{ + GRAPHEME_SEGMENTATION_STATE state = { 0 }; + uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */ + size_t i; + + for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) { + if (grapheme_character_isbreak(s1[i], s1[i + 1], &state)) { + printf("break in s1 at offset %zu\\n", i); + } + } + memset(&state, 0, sizeof(state)); /* reset state */ + for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) { + if (grapheme_character_isbreak(s2[i], s2[i + 1], &state)) { + printf("break in s2 at offset %zu\\n", i); + } + } + + return 0; +} +.Ed +.Sh SEE ALSO +.Xr grapheme_character_nextbreak 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_character_isbreak +is compliant with the Unicode 14.0.0 specification. 
+.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_character_nextbreak.3 b/man/grapheme_character_nextbreak.3 @@ -0,0 +1,72 @@ +.Dd 2021-12-18 +.Dt GRAPHEME_CHARACTER_NEXTBREAK 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_character_nextbreak +.Nd determine byte-offset to next grapheme cluster break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_character_nextbreak "const char *str" +.Sh DESCRIPTION +The +.Fn grapheme_character_nextbreak +function computes the offset (in bytes) to the next grapheme +cluster break (see +.Xr libgrapheme 7 ) +in the UTF-8-encoded NUL-terminated string +.Va str . +If a grapheme cluster begins at +.Va str +this offset is equal to the length of said grapheme cluster. +.Pp +For non-UTF-8 input data +.Xr grapheme_character_isbreak 3 +can be used instead. +.Sh RETURN VALUES +The +.Fn grapheme_character_nextbreak +function returns the offset (in bytes) to the next grapheme cluster +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . +.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> + +int +main(void) +{ + /* UTF-8 encoded input */ + char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" + "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" + "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" + "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; + size_t len; + + printf("Input: \\"%s\\"\\n", s); + + /* print each grapheme cluster with byte-length */ + for (; *s != '\\0';) { + len = grapheme_character_nextbreak(s); + printf("%2zu bytes | %.*s\\n", len, (int)len, s, len); + s += len; + } + + return 0; +} +.Ed +.Sh SEE ALSO +.Xr grapheme_character_isbreak 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_character_nextbreak +is compliant with the Unicode 14.0.0 specification. 
+.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_utf8_decode.3 b/man/grapheme_utf8_decode.3 @@ -0,0 +1,101 @@ +.Dd 2021-12-17 +.Dt GRAPHEME_UTF8_DECODE 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_utf8_decode +.Nd decode first code point in UTF-8-encoded string +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_utf8_decode "const char *str" "size_t len" "uint_least32_t *cp" +.Sh DESCRIPTION +The +.Fn grapheme_utf8_decode +function decodes the next code point in the UTF-8-encoded string +.Va str +of length +.Va len . +If the UTF-8-sequence is invalid (overlong encoding, unexpected byte, +string ends unexpectedly, empty string, etc.) the decoding is stopped +at the last processed byte and the decoded code point set to +.Dv GRAPHEME_INVALID_CODE_POINT. +.Pp +If +.Va cp +is not +.Dv NULL +the decoded code point is stored in the memory pointed to by +.Va cp . +.Pp +Given NUL has a unique 1 byte representation, it is safe to operate on +NUL-terminated strings by setting +.Va len +to +.Dv (size_t)-1 +and terminating when +.Va cp +is 0 (see +.Sx EXAMPLES +for an example). +.Sh RETURN VALUES +The +.Fn grapheme_utf8_decode +function returns the number of processed bytes and 0 if +.Va str +is +.Dv NULL +or +.Va len +is 0. +If the string ends unexpectedly in a multibyte sequence, the desired +length (that is larger than +.Va len ) +is returned. +.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <inttypes.h> +#include <stdio.h> + +void +print_cps(const char *str, size_t len) +{ + size_t ret, off; + uint_least32_t cp; + + for (off = 0; off < len; off += ret) { + if ((ret = grapheme_utf8_decode(str + off, + len - off, &cp)) > (len - off)) { + /* + * string ended unexpectedly in the middle of a + * multibyte sequence and we have the choice + * here to possibly expand str by ret - len + off + * bytes to get a full sequence, but we just + * bail out in this case. 
+ */ + break; + } + printf("%"PRIxLEAST32"\\n", cp); + } +} + +void +print_cps_nul_terminated(const char *str) +{ + size_t ret, off; + uint_least32_t cp; + + for (off = 0; (ret = grapheme_utf8_decode(str + off, + (size_t)-1, &cp)) > 0 && + cp != 0; off += ret) { + printf("%"PRIxLEAST32"\\n", cp); + } +} +.Ed +.Sh SEE ALSO +.Xr grapheme_utf8_encode 3 , +.Xr grapheme_character_isbreak 3 , +.Xr libgrapheme 7 +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_utf8_encode.3 b/man/grapheme_utf8_encode.3 @@ -0,0 +1,98 @@ +.Dd 2021-12-17 +.Dt GRAPHEME_UTF8_ENCODE 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_utf8_encode +.Nd encode code point into UTF-8 string +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_utf8_encode "uint_least32_t cp" "char *" "size_t" +.Sh DESCRIPTION +The +.Fn grapheme_utf8_encode +function encodes the code point +.Va cp +into a UTF-8-string. +If +.Va str +is not +.Dv NULL +and +.Va len +is large enough it writes the UTF-8-string to the memory pointed to by +.Va str . +.Sh RETURN VALUES +The +.Fn grapheme_utf8_encode +function returns the length (in bytes) of the UTF-8-string resulting +from encoding +.Va cp . +When the returned value is larger than +.Va len +it is indicated that the output string is too small and no data has been +written. 
+.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stddef.h> +#include <stdlib.h> + +size_t +cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len) +{ + size_t i, off, ret; + + for (i = 0, off = 0; i < cplen; i++, off += ret) { + if ((ret = grapheme_utf8_encode(cp[i], str + off, + len - off)) > (len - off)) { + /* buffer too small */ + break; + } + } + + return off; +} + +size_t +cps_bytelen(const uint_least32_t *cp, size_t cplen) +{ + size_t i, len; + + for (i = 0, len = 0; i < cplen; i++) { + len += grapheme_utf8_encode(cp[i], NULL, 0); + } + + return len; +} + +char * +cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen) +{ + char *str; + size_t len, i, ret, off; + + len = cps_bytelen(cp, cplen); + + if (!(str = malloc(len))) { + return NULL; + } + + for (i = 0, off = 0; i < cplen; i++, off += ret) { + if ((ret = grapheme_utf8_encode(cp[i], str + off, + len - off)) > (len - off)) { + /* buffer too small */ + break; + } + } + str[off] = '\\0'; + + return str; +} +.Ed +.Sh SEE ALSO +.Xr grapheme_utf8_decode 3 , +.Xr libgrapheme 7 +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/lg_grapheme_isbreak.3 b/man/lg_grapheme_isbreak.3 @@ -1,80 +0,0 @@ -.Dd 2021-12-18 -.Dt LG_GRAPHEME_ISBREAK 3 -.Os suckless.org -.Sh NAME -.Nm lg_grapheme_isbreak -.Nd test for a grapheme cluster break between two code points -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn lg_grapheme_isbreak "uint_least32_t cp1" "uint_least32_t cp2" "LG_SEGMENTATION_STATE *state" -.Sh DESCRIPTION -The -.Fn lg_grapheme_isbreak -function determines if there is a grapheme cluster break (see -.Xr libgrapheme 7 ) -between the two code points -.Va cp1 -and -.Va cp2 . -By specification this decision depends on a -.Va state -that can at most be completely reset after detecting a break and must -be reset every time one deviates from sequential processing. 
-.Pp -If -.Va state -is -.Dv NULL -.Fn lg_grapheme_isbreak -behaves as if it was called with a fully reset state. -.Sh RETURN VALUES -The -.Fn lg_grapheme_isbreak -function returns -.Va true -if there is a grapheme cluster break between the code points -.Va cp1 -and -.Va cp2 -and -.Va false -if there is not. -.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> - -int -main(void) -{ - LG_SEGMENTATION_STATE state = { 0 }; - uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */ - size_t i; - - for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) { - if (lg_grapheme_isbreak(s[i], s[i + 1], &state)) { - printf("break in s1 at offset %zu\n", i); - } - } - memset(&state, 0, sizeof(state)); /* reset state */ - for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) { - if (lg_grapheme_isbreak(s[i], s[i + 1], &state)) { - printf("break in s2 at offset %zu\n", i); - } - } - - return 0; -} -.Ed -.Sh SEE ALSO -.Xr lg_grapheme_nextbreak 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn lg_grapheme_isbreak -is compliant with the Unicode 14.0.0 specification. -.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/lg_grapheme_nextbreak.3 b/man/lg_grapheme_nextbreak.3 @@ -1,72 +0,0 @@ -.Dd 2021-12-18 -.Dt LG_GRAPHEME_NEXTBREAK 3 -.Os suckless.org -.Sh NAME -.Nm lg_grapheme_nextbreak -.Nd determine byte-offset to next grapheme cluster break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn lg_grapheme_nextbreak "const char *str" -.Sh DESCRIPTION -The -.Fn lg_grapheme_nextbreak -function computes the offset (in bytes) to the next grapheme -cluster break (see -.Xr libgrapheme 7 ) -in the UTF-8-encoded NUL-terminated string -.Va str . -If a grapheme cluster begins at -.Va str -this offset is equal to the length of said grapheme cluster. -.Pp -For non-UTF-8 input data -.Xr lg_grapheme_isbreak 3 -can be used instead. 
-.Sh RETURN VALUES -The -.Fn lg_grapheme_nextbreak -function returns the offset (in bytes) to the next grapheme cluster -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . -.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdint.h> -#include <stdio.h> - -int -main(void) -{ - /* UTF-8 encoded input */ - char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" - "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" - "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" - "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; - size_t len; - - printf("Input: \\"%s\\"\\n", s); - - /* print each grapheme cluster with byte-length */ - for (; *s != '\\0';) { - len = lg_grapheme_nextbreak(s); - printf("%2zu bytes | %.*s\\n", len, (int)len, s, len); - s += len; - } - - return 0; -} -.Ed -.Sh SEE ALSO -.Xr lg_grapheme_isbreak 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn lg_grapheme_nextbreak -is compliant with the Unicode 14.0.0 specification. -.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/lg_utf8_decode.3 b/man/lg_utf8_decode.3 @@ -1,101 +0,0 @@ -.Dd 2021-12-17 -.Dt LG_UTF8_DECODE 3 -.Os suckless.org -.Sh NAME -.Nm lg_utf8_decode -.Nd decode first code point in UTF-8-encoded string -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn lg_utf8_decode "const char *str" "size_t len" "uint_least32_t *cp" -.Sh DESCRIPTION -The -.Fn lg_utf8_decode -function decodes the next code point in the UTF-8-encoded string -.Va str -of length -.Va len . -If the UTF-8-sequence is invalid (overlong encoding, unexpected byte, -string ends unexpectedly, empty string, etc.) the decoding is stopped -at the last processed byte and the decoded code point set to -.Dv LG_INVALID_CODE_POINT. -.Pp -If -.Va cp -is not -.Dv NULL -the decoded code point is stored in the memory pointed to by -.Va cp . 
-.Pp -Given NUL has a unique 1 byte representation, it is safe to operate on -NUL-terminated strings by setting -.Va len -to -.Dv (size_t)-1 -and terminating when -.Va cp -is 0 (see -.Sx EXAMPLES -for an example). -.Sh RETURN VALUES -The -.Fn lg_utf8_decode -function returns the number of processed bytes and 0 if -.Va str -is -.Dv NULL -or -.Va len -is 0. -If the string ends unexpectedly in a multibyte sequence, the desired -length (that is larger than -.Va len ) -is returned. -.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <inttypes.h> -#include <stdio.h> - -void -print_cps(const char *str, size_t len) -{ - size_t ret, off; - uint_least32_t cp; - - for (off = 0; off < len; off += ret) { - if ((ret = lg_utf8_decode(str + off, - len - off, &cp)) > (len - off)) { - /* - * string ended unexpectedly in the middle of a - * multibyte sequence and we have the choice - * here to possibly expand str by ret - len + off - * bytes to get a full sequence, but we just - * bail out in this case. - */ - break; - } - printf("%"PRIxLEAST32"\\n", cp); - } -} - -void -print_cps_nul_terminated(const char *str) -{ - size_t ret, off; - uint_least32_t cp; - - for (off = 0; (ret = lg_utf8_decode(str + off, - (size_t)-1, &cp)) > 0 && - cp != 0; off += ret) { - printf("%"PRIxLEAST32"\\n", cp); - } -} -.Ed -.Sh SEE ALSO -.Xr lg_grapheme_encode 3 , -.Xr lg_grapheme_isbreak 3 , -.Xr libgrapheme 7 -.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/lg_utf8_encode.3 b/man/lg_utf8_encode.3 @@ -1,98 +0,0 @@ -.Dd 2021-12-17 -.Dt LG_UTF8_ENCODE 3 -.Os suckless.org -.Sh NAME -.Nm lg_utf8_encode -.Nd encode code point into UTF-8 string -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn lg_utf8_encode "uint_least32_t cp" "char *" "size_t" -.Sh DESCRIPTION -The -.Fn lg_utf8_encode -function encodes the code point -.Va cp -into a UTF-8-string. 
-If -.Va str -is not -.Dv NULL -and -.Va len -is large enough it writes the UTF-8-string to the memory pointed to by -.Va str . -.Sh RETURN VALUES -The -.Fn lg_utf8_encode -function returns the length (in bytes) of the UTF-8-string resulting -from encoding -.Va cp . -When the returned value is larger than -.Va len -it is indicated that the output string is too small and no data has been -written. -.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stddef.h> -#include <stdlib.h> - -size_t -cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len) -{ - size_t i, off, ret; - - for (i = 0, off = 0; i < cplen; i++, off += ret) { - if ((ret = lg_utf8_encode(cp[i], str + off, - len - off)) > (len - off)) { - /* buffer too small */ - break; - } - } - - return off; -} - -size_t -cps_bytelen(const uint_least32_t *cp, size_t cplen) -{ - size_t i, len; - - for (i = 0, len = 0; i < cplen; i++) { - len += lg_utf8_encode(cp[i], NULL, 0); - } - - return len; -} - -char * -cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen) -{ - char *str; - size_t len, i, ret, off; - - len = cps_bytelen(cp, cplen); - - if (!(str = malloc(len))) { - return NULL; - } - - for (i = 0, off = 0; i < cplen; i++, off += ret) { - if ((ret = lg_utf8_encode(cp[i], str + off, - len - off)) > (len - off)) { - /* buffer too small */ - break; - } - } - str[off] = '\\0'; - - return str; -} -.Ed -.Sh SEE ALSO -.Xr lg_grapheme_decode 3 , -.Xr libgrapheme 7 -.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/libgrapheme.7 b/man/libgrapheme.7 @@ -15,10 +15,10 @@ see .Sx MOTIVATION ) according to the Unicode specification. 
.Sh SEE ALSO -.Xr lg_grapheme_isbreak 3 , -.Xr lg_grapheme_nextbreak 3 , -.Xr lg_utf8_decode 3 , -.Xr lg_utf8_encode 3 +.Xr grapheme_character_isbreak 3 , +.Xr grapheme_character_nextbreak 3 , +.Xr grapheme_utf8_decode 3 , +.Xr grapheme_utf8_encode 3 .Sh STANDARDS .Nm is compliant with the Unicode 14.0.0 specification. diff --git a/src/character.c b/src/character.c @@ -14,9 +14,10 @@ enum { }; bool -lg_character_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state) +grapheme_character_isbreak(uint_least32_t a, uint_least32_t b, + GRAPHEME_SEGMENTATION_STATE *state) { - struct lg_internal_heisenstate *p[2] = { 0 }; + struct grapheme_internal_heisenstate *p[2] = { 0 }; uint_least16_t flags = 0; bool isbreak = true; @@ -179,18 +180,18 @@ hasbreak: } size_t -lg_character_nextbreak(const char *str) +grapheme_character_nextbreak(const char *str) { uint_least32_t cp0, cp1; size_t ret, len = 0; - LG_SEGMENTATION_STATE state = { 0 }; + GRAPHEME_SEGMENTATION_STATE state = { 0 }; if (str == NULL) { return 0; } /* - * lg_utf8_decode, when it encounters an unexpected byte, + * grapheme_utf8_decode, when it encounters an unexpected byte, * does not count it to the error and instead assumes that the * unexpected byte is the beginning of a new sequence. 
* This way, when the string ends with a null byte, we never @@ -202,17 +203,17 @@ lg_character_nextbreak(const char *str) */ /* get first code point */ - len += lg_utf8_decode(str, (size_t)-1, &cp0); - if (cp0 == LG_INVALID_CODE_POINT) { + len += grapheme_utf8_decode(str, (size_t)-1, &cp0); + if (cp0 == GRAPHEME_INVALID_CODE_POINT) { return len; } while (cp0 != 0) { /* get next code point */ - ret = lg_utf8_decode(str + len, (size_t)-1, &cp1); + ret = grapheme_utf8_decode(str + len, (size_t)-1, &cp1); - if (cp1 == LG_INVALID_CODE_POINT || - lg_character_isbreak(cp0, cp1, &state)) { + if (cp1 == GRAPHEME_INVALID_CODE_POINT || + grapheme_character_isbreak(cp0, cp1, &state)) { /* we read an invalid cp or have a breakpoint */ break; } else { diff --git a/src/utf8.c b/src/utf8.c @@ -48,13 +48,13 @@ static const struct { }; size_t -lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) +grapheme_utf8_decode(const char *s, size_t n, uint_least32_t *cp) { size_t off, i; if (s == NULL || n == 0) { /* a sequence must be at least 1 byte long */ - *cp = LG_INVALID_CODE_POINT; + *cp = GRAPHEME_INVALID_CODE_POINT; return 0; } @@ -79,14 +79,14 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) * this also includes the cases where bits higher than * the 8th are set on systems with CHAR_BIT > 8 */ - *cp = LG_INVALID_CODE_POINT; + *cp = GRAPHEME_INVALID_CODE_POINT; return 1; } if (1 + off > n) { /* * input is not long enough, set cp as invalid */ - *cp = LG_INVALID_CODE_POINT; + *cp = GRAPHEME_INVALID_CODE_POINT; /* * count the following continuation bytes, but nothing @@ -125,7 +125,7 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) * higher than the 8th are set on systems * with CHAR_BIT > 8 */ - *cp = LG_INVALID_CODE_POINT; + *cp = GRAPHEME_INVALID_CODE_POINT; return 1 + (i - 1); } /* @@ -144,14 +144,14 @@ lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp) * not representable in UTF-16 (>0x10FFFF) (RFC-3629 * specifies the latter two 
conditions) */ - *cp = LG_INVALID_CODE_POINT; + *cp = GRAPHEME_INVALID_CODE_POINT; } return 1 + off; } size_t -lg_utf8_encode(uint_least32_t cp, char *s, size_t n) +grapheme_utf8_encode(uint_least32_t cp, char *s, size_t n) { size_t off, i; @@ -162,7 +162,7 @@ lg_utf8_encode(uint_least32_t cp, char *s, size_t n) * (0xD800..0xDFFF) or not representable in UTF-16 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8. */ - cp = LG_INVALID_CODE_POINT; + cp = GRAPHEME_INVALID_CODE_POINT; } /* determine necessary sequence type */ diff --git a/src/util.c b/src/util.c @@ -8,7 +8,7 @@ /* 64-slot (0,...,63) optionally undetermined binary state */ int -heisenstate_get(struct lg_internal_heisenstate *h, int slot) +heisenstate_get(struct grapheme_internal_heisenstate *h, int slot) { if (h == NULL || slot >= 64 || slot < 0 || !(h->determined & (1 << slot))) { @@ -21,7 +21,7 @@ heisenstate_get(struct lg_internal_heisenstate *h, int slot) } int -heisenstate_set(struct lg_internal_heisenstate *h, int slot, int state) +heisenstate_set(struct grapheme_internal_heisenstate *h, int slot, int state) { if (h == NULL || slot >= 64 || slot < 0) { /* no state given or slot out of range */ @@ -54,7 +54,7 @@ cp_cmp(const void *a, const void *b) } int -has_property(uint_least32_t cp, struct lg_internal_heisenstate *cpstate, +has_property(uint_least32_t cp, struct grapheme_internal_heisenstate *cpstate, const struct range_list *proptable, int property) { int res; diff --git a/src/util.h b/src/util.h @@ -19,10 +19,10 @@ struct range_list { size_t len; }; -int heisenstate_get(struct lg_internal_heisenstate *, int); -int heisenstate_set(struct lg_internal_heisenstate *, int, int); +int heisenstate_get(struct grapheme_internal_heisenstate *, int); +int heisenstate_set(struct grapheme_internal_heisenstate *, int, int); -int has_property(uint_least32_t, struct lg_internal_heisenstate *, +int has_property(uint_least32_t, struct grapheme_internal_heisenstate *, const struct range_list *, int); #endif 
/* UTIL_H */ diff --git a/test/character-performance.c b/test/character-performance.c @@ -17,7 +17,7 @@ main(int argc, char *argv[]) struct timespec start, end; size_t i, j, bufsiz, off; uint32_t *buf; - LG_SEGMENTATION_STATE state; + GRAPHEME_SEGMENTATION_STATE state; double cp_per_sec; (void)argc; @@ -45,7 +45,7 @@ main(int argc, char *argv[]) for (i = 0; i < NUM_ITERATIONS; i++) { memset(&state, 0, sizeof(state)); for (j = 0; j < bufsiz - 1; j++) { - (void)lg_character_isbreak(buf[j], buf[j+1], &state); + (void)grapheme_character_isbreak(buf[j], buf[j+1], &state); } if (i % (NUM_ITERATIONS / 10) == 0) { printf("."); diff --git a/test/character.c b/test/character.c @@ -11,7 +11,7 @@ int main(int argc, char *argv[]) { - LG_SEGMENTATION_STATE state; + GRAPHEME_SEGMENTATION_STATE state; size_t i, j, k, len, failed; (void)argc; @@ -21,9 +21,9 @@ main(int argc, char *argv[]) memset(&state, 0, sizeof(state)); for (j = 0, k = 0, len = 1; j < character_test[i].cplen; j++) { if ((j + 1) == character_test[i].cplen || - lg_character_isbreak(character_test[i].cp[j], - character_test[i].cp[j + 1], - &state)) { + grapheme_character_isbreak(character_test[i].cp[j], + character_test[i].cp[j + 1], + &state)) { /* check if our resulting length matches */ if (k == character_test[i].lenlen || len != character_test[i].len[k++]) { diff --git a/test/utf8-decode.c b/test/utf8-decode.c @@ -21,7 +21,7 @@ static const struct { .arr = NULL, .len = 0, .exp_len = 0, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid lead byte @@ -31,7 +31,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xFD }, .len = 1, .exp_len = 1, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* valid 1-byte sequence @@ -61,7 +61,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xC3 }, .len = 1, .exp_len = 2, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 2-byte sequence (second 
byte malformed) @@ -71,7 +71,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xC3, 0xFF }, .len = 2, .exp_len = 1, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 2-byte sequence (overlong encoded) @@ -81,7 +81,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xC1, 0xBF }, .len = 2, .exp_len = 2, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* valid 3-byte sequence @@ -101,7 +101,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xE0 }, .len = 1, .exp_len = 3, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 3-byte sequence (second byte malformed) @@ -111,7 +111,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF }, .len = 3, .exp_len = 1, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 3-byte sequence (short string, second byte malformed) @@ -121,7 +121,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xE0, 0x7F }, .len = 2, .exp_len = 1, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 3-byte sequence (third byte missing) @@ -131,7 +131,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xE0, 0xBF }, .len = 2, .exp_len = 3, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 3-byte sequence (third byte malformed) @@ -141,7 +141,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F }, .len = 3, .exp_len = 2, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 3-byte sequence (overlong encoded) @@ -151,7 +151,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF }, .len = 3, .exp_len = 3, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 3-byte sequence (UTF-16 surrogate half) @@ -161,7 +161,7 @@ static const 
struct { .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 }, .len = 3, .exp_len = 3, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* valid 4-byte sequence @@ -181,7 +181,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF3 }, .len = 1, .exp_len = 4, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (second byte malformed) @@ -191,7 +191,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF }, .len = 4, .exp_len = 1, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (short string 1, second byte malformed) @@ -201,7 +201,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF3, 0x7F }, .len = 2, .exp_len = 1, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (short string 2, second byte malformed) @@ -211,7 +211,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF }, .len = 3, .exp_len = 1, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { @@ -222,7 +222,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF3, 0xBF }, .len = 2, .exp_len = 4, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (third byte malformed) @@ -232,7 +232,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF }, .len = 4, .exp_len = 2, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (short string, third byte malformed) @@ -242,7 +242,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F }, .len = 3, .exp_len = 2, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (fourth byte missing) @@ -252,7 +252,7 @@ static const struct { .arr = (char 
*)(unsigned char[]){ 0xF3, 0xBF, 0xBF }, .len = 3, .exp_len = 4, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (fourth byte malformed) @@ -262,7 +262,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F }, .len = 4, .exp_len = 3, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (overlong encoded) @@ -272,7 +272,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF }, .len = 4, .exp_len = 4, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, { /* invalid 4-byte sequence (UTF-16-unrepresentable) @@ -282,7 +282,7 @@ static const struct { .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 }, .len = 4, .exp_len = 4, - .exp_cp = LG_INVALID_CODE_POINT, + .exp_cp = GRAPHEME_INVALID_CODE_POINT, }, }; @@ -298,8 +298,8 @@ main(int argc, char *argv[]) size_t len; uint_least32_t cp; - len = lg_utf8_decode(dec_test[i].arr, - dec_test[i].len, &cp); + len = grapheme_utf8_decode(dec_test[i].arr, + dec_test[i].len, &cp); if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) { diff --git a/test/utf8-encode.c b/test/utf8-encode.c @@ -62,7 +62,7 @@ main(int argc, char *argv[]) char arr[4]; size_t len; - len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr)); + len = grapheme_utf8_encode(enc_test[i].cp, arr, LEN(arr)); if (len != enc_test[i].exp_len || memcmp(arr, enc_test[i].exp_arr, len)) {