libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit c31ca4f0d107e505602fc746dd09001b33dd1811
parent f5ec499f2bdbf9a488e602168241d7c9ef0d9d4f
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 28 Aug 2022 14:59:24 +0200

Convert grapheme_next*() manuals to being template-based

This removes a lot of redundancy and makes them much easier to
maintain.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 16 ++++++++--------
M man/grapheme_next_character_break.sh | 60 +++---------------------------------------------------------
M man/grapheme_next_character_break_utf8.sh | 100 +++----------------------------------------------------------------------------
M man/grapheme_next_line_break.sh | 56 +++-----------------------------------------------------
M man/grapheme_next_line_break_utf8.sh | 94 +++----------------------------------------------------------------------------
M man/grapheme_next_sentence_break.sh | 59 +++--------------------------------------------------------
M man/grapheme_next_sentence_break_utf8.sh | 97 +++----------------------------------------------------------------------------
M man/grapheme_next_word_break.sh | 59 +++--------------------------------------------------------
M man/grapheme_next_word_break_utf8.sh | 97 +++----------------------------------------------------------------------------
A man/template/next_break.sh | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/template/next_break_utf8.sh | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
11 files changed, 184 insertions(+), 606 deletions(-)

diff --git a/Makefile b/Makefile @@ -191,14 +191,14 @@ gen/word-test.h: data/WordBreakTest.txt gen/word-test man/grapheme_decode_utf8.3: man/grapheme_decode_utf8.sh config.mk man/grapheme_encode_utf8.3: man/grapheme_encode_utf8.sh config.mk man/grapheme_is_character_break.3: man/grapheme_is_character_break.sh config.mk -man/grapheme_next_character_break.3: man/grapheme_next_character_break.sh config.mk -man/grapheme_next_line_break.3: man/grapheme_next_line_break.sh config.mk -man/grapheme_next_sentence_break.3: man/grapheme_next_sentence_break.sh config.mk -man/grapheme_next_word_break.3: man/grapheme_next_word_break.sh config.mk -man/grapheme_next_character_break_utf8.3: man/grapheme_next_character_break_utf8.sh config.mk -man/grapheme_next_line_break_utf8.3: man/grapheme_next_line_break_utf8.sh config.mk -man/grapheme_next_sentence_break_utf8.3: man/grapheme_next_sentence_break_utf8.sh config.mk -man/grapheme_next_word_break_utf8.3: man/grapheme_next_word_break_utf8.sh config.mk +man/grapheme_next_character_break.3: man/grapheme_next_character_break.sh man/template/next_break.sh config.mk +man/grapheme_next_line_break.3: man/grapheme_next_line_break.sh man/template/next_break.sh config.mk +man/grapheme_next_sentence_break.3: man/grapheme_next_sentence_break.sh man/template/next_break.sh config.mk +man/grapheme_next_word_break.3: man/grapheme_next_word_break.sh man/template/next_break.sh config.mk +man/grapheme_next_character_break_utf8.3: man/grapheme_next_character_break_utf8.sh man/template/next_break_utf8.sh config.mk +man/grapheme_next_line_break_utf8.3: man/grapheme_next_line_break_utf8.sh man/template/next_break_utf8.sh config.mk +man/grapheme_next_sentence_break_utf8.3: man/grapheme_next_sentence_break_utf8.sh man/template/next_break_utf8.sh config.mk +man/grapheme_next_word_break_utf8.3: man/grapheme_next_word_break_utf8.sh man/template/next_break_utf8.sh config.mk man/grapheme_to_uppercase.3: man/grapheme_to_uppercase.sh man/template/to_case.sh 
config.mk man/grapheme_to_lowercase.3: man/grapheme_to_lowercase.sh man/template/to_case.sh config.mk man/grapheme_to_titlecase.3: man/grapheme_to_titlecase.sh man/template/to_case.sh config.mk diff --git a/man/grapheme_next_character_break.sh b/man/grapheme_next_character_break.sh @@ -1,57 +1,3 @@ -cat << EOF -.Dd 2022-08-26 -.Dt GRAPHEME_NEXT_CHARACTER_BREAK 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_character_break -.Nd determine codepoint-offset to next grapheme cluster break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_character_break "const uint_least32_t *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_character_break -function computes the offset (in codepoints) to the next grapheme -cluster break (see -.Xr libgrapheme 7 ) -in the codepoint array -.Va str -of length -.Va len . -If a grapheme cluster begins at -.Va str -this offset is equal to the length of said grapheme cluster. -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For UTF-8-encoded input data -.Xr grapheme_next_character_break_utf8 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_character_break -function returns the offset (in codepoints) to the next grapheme cluster -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . -.Sh SEE ALSO -.Xr grapheme_is_character_break 3 , -.Xr grapheme_next_character_break_utf8 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_character_break -is compliant with the Unicode 14.0.0 specification. 
-.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF +TYPE="character" \ +REALTYPE="grapheme cluster" \ + $SH man/template/next_break.sh diff --git a/man/grapheme_next_character_break_utf8.sh b/man/grapheme_next_character_break_utf8.sh @@ -1,97 +1,3 @@ -cat << EOF -.Dd 2022-08-26 -.Dt GRAPHEME_NEXT_CHARACTER_BREAK_UTF8 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_character_break_utf8 -.Nd determine byte-offset to next grapheme cluster break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_character_break_utf8 "const char *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_character_break_utf8 -function computes the offset (in bytes) to the next grapheme -cluster break (see -.Xr libgrapheme 7 ) -in the UTF-8-encoded string -.Va str -of length -.Va len . -If a grapheme cluster begins at -.Va str -this offset is equal to the length of said grapheme cluster. -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For non-UTF-8 input data -.Xr grapheme_is_character_break 3 -and -.Xr grapheme_next_character_break 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_character_break_utf8 -function returns the offset (in bytes) to the next grapheme cluster -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . 
-.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdint.h> -#include <stdio.h> - -int -main(void) -{ - /* UTF-8 encoded input */ - char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" - "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" - "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" - "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; - size_t ret, len, off; - - printf("Input: \\"%s\\"\\n", s); - - /* print each grapheme cluster with byte-length */ - printf("Grapheme clusters in NUL-delimited input:\\n"); - for (off = 0; s[off] != '\\0'; off += ret) { - ret = grapheme_next_character_break_utf8(s + off, SIZE_MAX); - printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); - } - printf("\\n"); - - /* do the same, but this time string is length-delimited */ - len = 17; - printf("Grapheme clusters in input delimited to %zu bytes:\\n", len); - for (off = 0; off < len; off += ret) { - ret = grapheme_next_character_break_utf8(s + off, len - off); - printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); - } - - return 0; -} -.Ed -.Sh SEE ALSO -.Xr grapheme_is_character_break 3 , -.Xr grapheme_next_character_break 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_character_break_utf8 -is compliant with the Unicode 14.0.0 specification. 
-.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF +TYPE="character" \ +REALTYPE="grapheme cluster" \ + $SH man/template/next_break_utf8.sh diff --git a/man/grapheme_next_line_break.sh b/man/grapheme_next_line_break.sh @@ -1,53 +1,3 @@ -cat << EOF -.Dd 2022-08-26 -.Dt GRAPHEME_NEXT_LINE_BREAK 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_line_break -.Nd determine codepoint-offset to next grapheme cluster break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_line_break "const uint_least32_t *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_line_break -function computes the offset (in codepoints) to the next possible line -break (see -.Xr libgrapheme 7 ) -in the codepoint array -.Va str -of length -.Va len . -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For UTF-8-encoded input data -.Xr grapheme_next_line_break_utf8 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_line_break -function returns the offset (in codepoints) to the next possible line -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . -.Sh SEE ALSO -.Xr grapheme_next_line_break_utf8 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_line_break -is compliant with the Unicode 14.0.0 specification. 
-.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF +TYPE="line" \ +REALTYPE="possible line" \ + $SH man/template/next_break.sh diff --git a/man/grapheme_next_line_break_utf8.sh b/man/grapheme_next_line_break_utf8.sh @@ -1,91 +1,3 @@ -cat << EOF -.Dd 2022-08-26 -.Dt GRAPHEME_NEXT_LINE_BREAK_UTF8 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_line_break_utf8 -.Nd determine byte-offset to next possible line break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_line_break_utf8 "const char *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_line_break_utf8 -function computes the offset (in bytes) to the next possible line -break (see -.Xr libgrapheme 7 ) -in the UTF-8-encoded string -.Va str -of length -.Va len . -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For non-UTF-8 input data -.Xr grapheme_next_line_break 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_line_break_utf8 -function returns the offset (in bytes) to the next possible line -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . 
-.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdint.h> -#include <stdio.h> - -int -main(void) -{ - /* UTF-8 encoded input */ - char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" - "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" - "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" - "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; - size_t ret, len, off; - - printf("Input: \\"%s\\"\\n", s); - - /* print each grapheme cluster with byte-length */ - printf("Grapheme clusters in NUL-delimited input:\\n"); - for (off = 0; s[off] != '\\0'; off += ret) { - ret = grapheme_next_line_break_utf8(s + off, SIZE_MAX); - printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); - } - printf("\\n"); - - /* do the same, but this time string is length-delimited */ - len = 17; - printf("Grapheme clusters in input delimited to %zu bytes:\\n", len); - for (off = 0; off < len; off += ret) { - ret = grapheme_next_line_break_utf8(s + off, len - off); - printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); - } - - return 0; -} -.Ed -.Sh SEE ALSO -.Xr grapheme_next_line_break 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_line_break_utf8 -is compliant with the Unicode 14.0.0 specification. 
-.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF +TYPE="line" \ +REALTYPE="possible line" \ + $SH man/template/next_break_utf8.sh diff --git a/man/grapheme_next_sentence_break.sh b/man/grapheme_next_sentence_break.sh @@ -1,56 +1,3 @@ -cat << EOF -.Dd 2022-08-26 -.Dt GRAPHEME_NEXT_SENTENCE_BREAK 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_sentence_break -.Nd determine codepoint-offset to next sentence break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_sentence_break "const uint_least32_t *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_sentence_break -function computes the offset (in codepoints) to the next sentence -break (see -.Xr libgrapheme 7 ) -in the codepoint array -.Va str -of length -.Va len . -If a sentence begins at -.Va str -this offset is equal to the length of said sentence. -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For UTF-8-encoded input data -.Xr grapheme_next_sentence_break_utf8 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_sentence_break -function returns the offset (in codepoints) to the next sentence -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . -.Sh SEE ALSO -.Xr grapheme_next_sentence_break_utf8 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_sentence_break -is compliant with the Unicode 14.0.0 specification. 
-.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF +TYPE="sentence" \ +REALTYPE="sentence" \ + $SH man/template/next_break.sh diff --git a/man/grapheme_next_sentence_break_utf8.sh b/man/grapheme_next_sentence_break_utf8.sh @@ -1,94 +1,3 @@ -cat << EOF -.Dd 2022-08-26 -.Dt GRAPHEME_NEXT_SENTENCE_BREAK_UTF8 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_sentence_break_utf8 -.Nd determine byte-offset to next sentence break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_sentence_break_utf8 "const char *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_sentence_break_utf8 -function computes the offset (in bytes) to the next sentence -break (see -.Xr libgrapheme 7 ) -in the UTF-8-encoded string -.Va str -of length -.Va len . -If a sentence begins at -.Va str -this offset is equal to the length of said sentence. -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For non-UTF-8 input data -.Xr grapheme_next_sentence_break 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_sentence_break_utf8 -function returns the offset (in bytes) to the next sentence -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . 
-.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdint.h> -#include <stdio.h> - -int -main(void) -{ - /* UTF-8 encoded input */ - char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" - "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" - "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" - "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; - size_t ret, len, off; - - printf("Input: \\"%s\\"\\n", s); - - /* print each grapheme cluster with byte-length */ - printf("Grapheme clusters in NUL-delimited input:\\n"); - for (off = 0; s[off] != '\\0'; off += ret) { - ret = grapheme_next_sentence_break_utf8(s + off, SIZE_MAX); - printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); - } - printf("\\n"); - - /* do the same, but this time string is length-delimited */ - len = 17; - printf("Grapheme clusters in input delimited to %zu bytes:\\n", len); - for (off = 0; off < len; off += ret) { - ret = grapheme_next_sentence_break_utf8(s + off, len - off); - printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); - } - - return 0; -} -.Ed -.Sh SEE ALSO -.Xr grapheme_next_sentence_break 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_sentence_break_utf8 -is compliant with the Unicode 14.0.0 specification. 
-.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF +TYPE="sentence" \ +REALTYPE="sentence" \ + $SH man/template/next_break_utf8.sh diff --git a/man/grapheme_next_word_break.sh b/man/grapheme_next_word_break.sh @@ -1,56 +1,3 @@ -cat << EOF -.Dd 2022-08-26 -.Dt GRAPHEME_NEXT_WORD_BREAK 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_word_break -.Nd determine codepoint-offset to next word break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_word_break "const uint_least32_t *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_word_break -function computes the offset (in codepoints) to the next word -break (see -.Xr libgrapheme 7 ) -in the codepoint array -.Va str -of length -.Va len . -If a word begins at -.Va str -this offset is equal to the length of said word. -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For UTF-8-encoded input data -.Xr grapheme_next_word_break_utf8 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_word_break -function returns the offset (in codepoints) to the next word -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . -.Sh SEE ALSO -.Xr grapheme_next_word_break_utf8 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_word_break -is compliant with the Unicode 14.0.0 specification. 
-.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF +TYPE="word" \ +REALTYPE="word" \ + $SH man/template/next_break.sh diff --git a/man/grapheme_next_word_break_utf8.sh b/man/grapheme_next_word_break_utf8.sh @@ -1,94 +1,3 @@ -cat << EOF -.Dd 2022-08-26 -.Dt GRAPHEME_NEXT_WORD_BREAK_UTF8 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_next_word_break_utf8 -.Nd determine byte-offset to next word break -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_next_word_break_utf8 "const char *str" "size_t len" -.Sh DESCRIPTION -The -.Fn grapheme_next_word_break_utf8 -function computes the offset (in bytes) to the next word -break (see -.Xr libgrapheme 7 ) -in the UTF-8-encoded string -.Va str -of length -.Va len . -If a word begins at -.Va str -this offset is equal to the length of said word. -.Pp -If -.Va len -is set to -.Dv SIZE_MAX -(stdint.h is already included by grapheme.h) the string -.Va str -is interpreted to be NUL-terminated and processing stops when a -NUL-byte is encountered. -.Pp -For non-UTF-8 input data -.Xr grapheme_next_word_break 3 -can be used instead. -.Sh RETURN VALUES -The -.Fn grapheme_next_word_break_utf8 -function returns the offset (in bytes) to the next word -break in -.Va str -or 0 if -.Va str -is -.Dv NULL . 
-.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdint.h> -#include <stdio.h> - -int -main(void) -{ - /* UTF-8 encoded input */ - char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" - "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" - "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" - "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; - size_t ret, len, off; - - printf("Input: \\"%s\\"\\n", s); - - /* print each grapheme cluster with byte-length */ - printf("Grapheme clusters in NUL-delimited input:\\n"); - for (off = 0; s[off] != '\\0'; off += ret) { - ret = grapheme_next_word_break_utf8(s + off, SIZE_MAX); - printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); - } - printf("\\n"); - - /* do the same, but this time string is length-delimited */ - len = 17; - printf("Grapheme clusters in input delimited to %zu bytes:\\n", len); - for (off = 0; off < len; off += ret) { - ret = grapheme_next_word_break_utf8(s + off, len - off); - printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret); - } - - return 0; -} -.Ed -.Sh SEE ALSO -.Xr grapheme_next_word_break 3 , -.Xr libgrapheme 7 -.Sh STANDARDS -.Fn grapheme_next_word_break_utf8 -is compliant with the Unicode 14.0.0 specification. 
-.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de -EOF +TYPE="word" \ +REALTYPE="word" \ + $SH man/template/next_break_utf8.sh diff --git a/man/template/next_break.sh b/man/template/next_break.sh @@ -0,0 +1,56 @@ +cat << EOF +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_$(printf $TYPE | tr [:lower:] [:upper:])_BREAK 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_$(printf $TYPE)_break +.Nd determine codepoint-offset to next $REALTYPE break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_$(printf $TYPE)_break "const uint_least32_t *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_$(printf $TYPE)_break +function computes the offset (in codepoints) to the next $REALTYPE +break (see +.Xr libgrapheme 7 ) +in the codepoint array +.Va str +of length +.Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a $REALTYPE begins at +.Va str +this offset is equal to the length of said $REALTYPE."; fi) +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For UTF-8-encoded input data +.Xr grapheme_next_$(printf $TYPE)_break_utf8 3 +can be used instead. +.Sh RETURN VALUES +The +.Fn grapheme_next_$(printf $TYPE)_break +function returns the offset (in codepoints) to the next $REALTYPE +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . +.Sh SEE ALSO +.Xr grapheme_is_character_break 3 , +.Xr grapheme_next_$(printf $TYPE)_break_utf8 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_$(printf $TYPE)_break +is compliant with the Unicode $UNICODE_VERSION specification. 
+.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de +EOF diff --git a/man/template/next_break_utf8.sh b/man/template/next_break_utf8.sh @@ -0,0 +1,96 @@ +cat << EOF +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_$(printf $TYPE | tr [:lower:] [:upper:])_BREAK_UTF8 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_$(printf $TYPE)_break_utf8 +.Nd determine byte-offset to next $REALTYPE break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_$(printf $TYPE)_break_utf8 "const char *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_$(printf $TYPE)_break_utf8 +function computes the offset (in bytes) to the next $REALTYPE +break (see +.Xr libgrapheme 7 ) +in the UTF-8-encoded string +.Va str +of length +.Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a $REALTYPE begins at +.Va str +this offset is equal to the length of said $REALTYPE."; fi) +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For non-UTF-8 input data$(if [ "$TYPE" = "character" ]; +then printf "\n.Xr grapheme_is_character_break 3 +and"; fi) +.Xr grapheme_next_$(printf $TYPE)_break 3 +can be used instead. +.Sh RETURN VALUES +The +.Fn grapheme_next_$(printf $TYPE)_break_utf8 +function returns the offset (in bytes) to the next $REALTYPE +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . 
+.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> + +int +main(void) +{ + /* UTF-8 encoded input */ + char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0" + "\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0" + "\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0" + "\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!"; + size_t ret, len, off; + + printf("Input: \\\\"%s\\\\"\\\\n", s); + + /* print each $REALTYPE with byte-length */ + printf("$(printf "$REALTYPE")s in NUL-delimited input:\\\\n"); + for (off = 0; s[off] != '\\\\0'; off += ret) { + ret = grapheme_next_$(printf $TYPE)_break_utf8(s + off, SIZE_MAX); + printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off, ret); + } + printf("\\\\n"); + + /* do the same, but this time string is length-delimited */ + len = 17; + printf("$(printf "$REALTYPE")s in input delimited to %zu bytes:\\\\n", len); + for (off = 0; off < len; off += ret) { + ret = grapheme_next_$(printf $TYPE)_break_utf8(s + off, len - off); + printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off, ret); + } + + return 0; +} +.Ed +.Sh SEE ALSO$(if [ "$TYPE" = "character" ]; +then printf "\n.Xr grapheme_is_character_break 3 ,"; fi) +.Xr grapheme_next_$(printf $TYPE)_break 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_$(printf $TYPE)_break_utf8 +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de +EOF