libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 6c5d42d5dfe3ed5b24a26c70bd6a86d98bb2b65e
parent e229e779acb4aec6a97551872e1f12a2ce46a9aa
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 28 Aug 2022 16:29:54 +0200

Merge next_break- and next_break_utf8-templates

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 9 ++++-----
M man/grapheme_next_character_break.sh | 1 +
M man/grapheme_next_character_break_utf8.sh | 3 ++-
M man/grapheme_next_line_break.sh | 1 +
M man/grapheme_next_line_break_utf8.sh | 3 ++-
M man/grapheme_next_sentence_break.sh | 1 +
M man/grapheme_next_sentence_break_utf8.sh | 3 ++-
M man/grapheme_next_word_break.sh | 1 +
M man/grapheme_next_word_break_utf8.sh | 3 ++-
M man/template/next_break.sh | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
D man/template/next_break_utf8.sh | 96 -------------------------------------------------------------------------------
11 files changed, 85 insertions(+), 120 deletions(-)

diff --git a/Makefile b/Makefile @@ -62,7 +62,6 @@ MAN_DATE = 2022-08-28 MAN_TEMPLATE =\ man/template/next_break.sh\ - man/template/next_break_utf8.sh\ man/template/to_case.sh\ MAN3 =\ @@ -199,10 +198,10 @@ man/grapheme_next_character_break.3: man/grapheme_next_character_break.sh man/te man/grapheme_next_line_break.3: man/grapheme_next_line_break.sh man/template/next_break.sh config.mk man/grapheme_next_sentence_break.3: man/grapheme_next_sentence_break.sh man/template/next_break.sh config.mk man/grapheme_next_word_break.3: man/grapheme_next_word_break.sh man/template/next_break.sh config.mk -man/grapheme_next_character_break_utf8.3: man/grapheme_next_character_break_utf8.sh man/template/next_break_utf8.sh config.mk -man/grapheme_next_line_break_utf8.3: man/grapheme_next_line_break_utf8.sh man/template/next_break_utf8.sh config.mk -man/grapheme_next_sentence_break_utf8.3: man/grapheme_next_sentence_break_utf8.sh man/template/next_break_utf8.sh config.mk -man/grapheme_next_word_break_utf8.3: man/grapheme_next_word_break_utf8.sh man/template/next_break_utf8.sh config.mk +man/grapheme_next_character_break_utf8.3: man/grapheme_next_character_break_utf8.sh man/template/next_break.sh config.mk +man/grapheme_next_line_break_utf8.3: man/grapheme_next_line_break_utf8.sh man/template/next_break.sh config.mk +man/grapheme_next_sentence_break_utf8.3: man/grapheme_next_sentence_break_utf8.sh man/template/next_break.sh config.mk +man/grapheme_next_word_break_utf8.3: man/grapheme_next_word_break_utf8.sh man/template/next_break.sh config.mk man/grapheme_to_uppercase.3: man/grapheme_to_uppercase.sh man/template/to_case.sh config.mk man/grapheme_to_lowercase.3: man/grapheme_to_lowercase.sh man/template/to_case.sh config.mk man/grapheme_to_titlecase.3: man/grapheme_to_titlecase.sh man/template/to_case.sh config.mk diff --git a/man/grapheme_next_character_break.sh b/man/grapheme_next_character_break.sh @@ -1,3 +1,4 @@ +ENCODING="codepoint" \ TYPE="character" \ REALTYPE="grapheme 
cluster" \ $SH man/template/next_break.sh diff --git a/man/grapheme_next_character_break_utf8.sh b/man/grapheme_next_character_break_utf8.sh @@ -1,3 +1,4 @@ +ENCODING="utf8" \ TYPE="character" \ REALTYPE="grapheme cluster" \ - $SH man/template/next_break_utf8.sh + $SH man/template/next_break.sh diff --git a/man/grapheme_next_line_break.sh b/man/grapheme_next_line_break.sh @@ -1,3 +1,4 @@ +ENCODING="codepoint" \ TYPE="line" \ REALTYPE="possible line" \ $SH man/template/next_break.sh diff --git a/man/grapheme_next_line_break_utf8.sh b/man/grapheme_next_line_break_utf8.sh @@ -1,3 +1,4 @@ +ENCODING="utf8" \ TYPE="line" \ REALTYPE="possible line" \ - $SH man/template/next_break_utf8.sh + $SH man/template/next_break.sh diff --git a/man/grapheme_next_sentence_break.sh b/man/grapheme_next_sentence_break.sh @@ -1,3 +1,4 @@ +ENCODING="codepoint" \ TYPE="sentence" \ REALTYPE="sentence" \ $SH man/template/next_break.sh diff --git a/man/grapheme_next_sentence_break_utf8.sh b/man/grapheme_next_sentence_break_utf8.sh @@ -1,3 +1,4 @@ +ENCODING="utf8" \ TYPE="sentence" \ REALTYPE="sentence" \ - $SH man/template/next_break_utf8.sh + $SH man/template/next_break.sh diff --git a/man/grapheme_next_word_break.sh b/man/grapheme_next_word_break.sh @@ -1,3 +1,4 @@ +ENCODING="codepoint" \ TYPE="word" \ REALTYPE="word" \ $SH man/template/next_break.sh diff --git a/man/grapheme_next_word_break_utf8.sh b/man/grapheme_next_word_break_utf8.sh @@ -1,3 +1,4 @@ +ENCODING="utf8" \ TYPE="word" \ REALTYPE="word" \ - $SH man/template/next_break_utf8.sh + $SH man/template/next_break.sh diff --git a/man/template/next_break.sh b/man/template/next_break.sh @@ -1,21 +1,29 @@ +if [ "$ENCODING" = "utf8" ]; then + UNIT="byte" + SUFFIX="_utf8" +else + UNIT="codepoint" + SUFFIX="" +fi + cat << EOF .Dd $MAN_DATE -.Dt GRAPHEME_NEXT_$(printf $TYPE | tr [:lower:] [:upper:])_BREAK 3 +.Dt GRAPHEME_NEXT_$(printf "%s_break%s" "$TYPE" "$SUFFIX" | tr [:lower:] [:upper:]) 3 .Os suckless.org .Sh NAME -.Nm 
grapheme_next_$(printf $TYPE)_break -.Nd determine codepoint-offset to next $REALTYPE break +.Nm grapheme_next_$(printf $TYPE)_break$SUFFIX +.Nd determine $UNIT-offset to next $REALTYPE break .Sh SYNOPSIS .In grapheme.h .Ft size_t -.Fn grapheme_next_$(printf $TYPE)_break "const uint_least32_t *str" "size_t len" +.Fn grapheme_next_$(printf $TYPE)_break$SUFFIX "const $(if [ "$ENCODING" = "utf8" ]; then printf "char"; else printf "uint_least32_t"; fi) *str" "size_t len" .Sh DESCRIPTION The -.Fn grapheme_next_$(printf $TYPE)_break -function computes the offset (in codepoints) to the next $REALTYPE +.Fn grapheme_next_$(printf $TYPE)_break$SUFFIX +function computes the offset (in $(printf $UNIT)s) to the next $REALTYPE break (see .Xr libgrapheme 7 ) -in the codepoint array +in the $(if [ "$ENCODING" = "utf8" ]; then printf "UTF-8-encoded string"; else printf "codepoint array"; fi) .Va str of length .Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a $REALTYPE begins at @@ -31,25 +39,71 @@ is set to is interpreted to be NUL-terminated and processing stops when a NUL-byte is encountered. .Pp -For UTF-8-encoded input data -.Xr grapheme_next_$(printf $TYPE)_break_utf8 3 +For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input +data$(if [ "$TYPE" = "character" ] && [ "$ENCODING" = "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 and"; fi) +.Xr grapheme_next_$(printf $TYPE)_break$(if [ "$ENCODING" != "utf8" ]; then printf "_utf8"; fi) 3 can be used instead. .Sh RETURN VALUES The -.Fn grapheme_next_$(printf $TYPE)_break -function returns the offset (in codepoints) to the next $REALTYPE +.Fn grapheme_next_$(printf $TYPE)_break$SUFFIX +function returns the offset (in $(printf $UNIT)s) to the next $REALTYPE break in .Va str or 0 if .Va str is .Dv NULL . 
-.Sh SEE ALSO -.Xr grapheme_is_character_break 3 , -.Xr grapheme_next_$(printf $TYPE)_break_utf8 3 , +EOF + +if [ "$ENCODING" = "utf8" ]; then +cat << EOF +.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> + +int +main(void) +{ + /* UTF-8 encoded input */ + char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0" + "\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0" + "\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0" + "\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!"; + size_t ret, len, off; + + printf("Input: \\\\"%s\\\\"\\\\n", s); + + /* print each $REALTYPE with byte-length */ + printf("$(printf "$REALTYPE")s in NUL-delimited input:\\\\n"); + for (off = 0; s[off] != '\\\\0'; off += ret) { + ret = grapheme_next_$(printf $TYPE)_break_utf8(s + off, SIZE_MAX); + printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off, ret); + } + printf("\\\\n"); + + /* do the same, but this time string is length-delimited */ + len = 17; + printf("$(printf "$REALTYPE")s in input delimited to %zu bytes:\\\\n", len); + for (off = 0; off < len; off += ret) { + ret = grapheme_next_$(printf $TYPE)_break_utf8(s + off, len - off); + printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off, ret); + } + + return 0; +} +.Ed +EOF +fi + +cat << EOF +.Sh SEE ALSO$(if [ "$TYPE" = "character" ] && [ "$ENCODING" != "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 ,"; fi) +.Xr grapheme_next_$(printf $TYPE)_break$(if [ "$ENCODING" != "utf8" ]; then printf "_utf8"; fi) 3 , .Xr libgrapheme 7 .Sh STANDARDS -.Fn grapheme_next_$(printf $TYPE)_break +.Fn grapheme_next_$(printf $TYPE)_break$SUFFIX is compliant with the Unicode $UNICODE_VERSION specification. 
.Sh AUTHORS .An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/template/next_break_utf8.sh b/man/template/next_break_utf8.sh @@ -1,96 +0,0 @@ -cat << EOF -.Dd $MAN_DATE -.Dt GRAPHEME_NEXT_$(printf $TYPE | tr [:lower:] [:upper:])_BREAK_UTF8 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_$(printf $TYPE)_break_utf8 -.Nd determine byte-offset to next $REALTYPE break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_$(printf $TYPE)_break_utf8 "const char *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_$(printf $TYPE)_break_utf8 -function computes the offset (in bytes) to the next $REALTYPE -break (see -.Xr libgrapheme 7 ) -in the UTF-8-encoded string -.Va str -of length -.Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a $REALTYPE begins at -.Va str -this offset is equal to the length of said $REALTYPE."; fi) -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For non-UTF-8 input data$(if [ "$TYPE" = "character" ]; -then printf "\n.Xr grapheme_is_character_break 3 -and"; fi) -.Xr grapheme_next_$(printf $TYPE)_break 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_$(printf $TYPE)_break_utf8 -function returns the offset (in bytes) to the next $REALTYPE -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . 
-.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdint.h> -#include <stdio.h> - -int -main(void) -{ - /* UTF-8 encoded input */ - char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0" - "\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0" - "\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0" - "\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!"; - size_t ret, len, off; - - printf("Input: \\\\"%s\\\\"\\\\n", s); - - /* print each $REALTYPE with byte-length */ - printf("$(printf "$REALTYPE")s in NUL-delimited input:\\\\n"); - for (off = 0; s[off] != '\\\\0'; off += ret) { - ret = grapheme_next_$(printf $TYPE)_break_utf8(s + off, SIZE_MAX); - printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off, ret); - } - printf("\\\\n"); - - /* do the same, but this time string is length-delimited */ - len = 17; - printf("$(printf "$REALTYPE")s in input delimited to %zu bytes:\\\\n", len); - for (off = 0; off < len; off += ret) { - ret = grapheme_next_$(printf $TYPE)_break_utf8(s + off, len - off); - printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off, ret); - } - - return 0; -} -.Ed -.Sh SEE ALSO$(if [ "$TYPE" = "character" ]; -then printf "\n.Xr grapheme_is_character_break 3 ,"; fi) -.Xr grapheme_next_$(printf $TYPE)_break 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_$(printf $TYPE)_break_utf8 -is compliant with the Unicode 14.0.0 specification. -.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF