libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 25808f024f907a8cac1227989fca883c7b0bc271
parent b035c4af5a10ea6ff3d1d948855b40094b559b78
Author: Laslo Hunhold <dev@frign.de>
Date:   Fri, 26 Aug 2022 20:53:08 +0200

Add manuals for all implemented segmentation functions

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 8 ++++++++
M man/grapheme_is_character_break.3 | 3 ++-
A man/grapheme_next_character_break.3 | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
M man/grapheme_next_character_break_utf8.3 | 5 ++++-
A man/grapheme_next_line_break.3 | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_next_line_break_utf8.3 | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_next_sentence_break.3 | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_next_sentence_break_utf8.3 | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_next_word_break.3 | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
A man/grapheme_next_word_break_utf8.3 | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M man/libgrapheme.7 | 11 +++++++++--
11 files changed, 510 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile @@ -60,6 +60,14 @@ MAN3 =\ man/grapheme_decode_utf8.3\ man/grapheme_encode_utf8.3\ man/grapheme_is_character_break.3\ + man/grapheme_next_character_break.3\ + man/grapheme_next_line_break.3\ + man/grapheme_next_sentence_break.3\ + man/grapheme_next_word_break.3\ + man/grapheme_next_character_break_utf8.3\ + man/grapheme_next_line_break_utf8.3\ + man/grapheme_next_sentence_break_utf8.3\ + man/grapheme_next_word_break_utf8.3\ MAN7 = man/libgrapheme.7 diff --git a/man/grapheme_is_character_break.3 b/man/grapheme_is_character_break.3 @@ -1,4 +1,4 @@ -.Dd 2021-12-22 +.Dd 2022-08-26 .Dt GRAPHEME_IS_CHARACTER_BREAK 3 .Os suckless.org .Sh NAME @@ -72,6 +72,7 @@ main(void) .Ed .Sh SEE ALSO .Xr grapheme_next_character_break 3 , +.Xr grapheme_next_character_break_utf8 3 , .Xr libgrapheme 7 .Sh STANDARDS .Fn grapheme_is_character_break diff --git a/man/grapheme_next_character_break.3 b/man/grapheme_next_character_break.3 @@ -0,0 +1,55 @@ +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_CHARACTER_BREAK 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_character_break +.Nd determine codepoint-offset to next grapheme cluster break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_character_break "const uint_least32_t *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_character_break +function computes the offset (in codepoints) to the next grapheme +cluster break (see +.Xr libgrapheme 7 ) +in the codepoint array +.Va str +of length +.Va len . +If a grapheme cluster begins at +.Va str +this offset is equal to the length of said grapheme cluster. +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For UTF-8-encoded input data +.Xr grapheme_next_character_break_utf8 3 +can be used instead. 
+.Sh RETURN VALUES +The +.Fn grapheme_next_character_break +function returns the offset (in codepoints) to the next grapheme cluster +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . +.Sh SEE ALSO +.Xr grapheme_is_character_break 3 , +.Xr grapheme_next_character_break_utf8 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_character_break +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_next_character_break_utf8.3 b/man/grapheme_next_character_break_utf8.3 @@ -1,4 +1,4 @@ -.Dd 2021-12-22 +.Dd 2022-08-26 .Dt GRAPHEME_NEXT_CHARACTER_BREAK_UTF8 3 .Os suckless.org .Sh NAME @@ -33,6 +33,8 @@ NUL-byte is encountered. .Pp For non-UTF-8 input data .Xr grapheme_is_character_break 3 +and +.Xr grapheme_next_character_break 3 can be used instead. .Sh RETURN VALUES The @@ -84,6 +86,7 @@ main(void) .Ed .Sh SEE ALSO .Xr grapheme_is_character_break 3 , +.Xr grapheme_next_character_break 3 , .Xr libgrapheme 7 .Sh STANDARDS .Fn grapheme_next_character_break_utf8 diff --git a/man/grapheme_next_line_break.3 b/man/grapheme_next_line_break.3 @@ -0,0 +1,51 @@ +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_LINE_BREAK 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_line_break +.Nd determine codepoint-offset to next possible line break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_line_break "const uint_least32_t *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_line_break +function computes the offset (in codepoints) to the next possible line +break (see +.Xr libgrapheme 7 ) +in the codepoint array +.Va str +of length +.Va len . +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For UTF-8-encoded input data +.Xr grapheme_next_line_break_utf8 3 +can be used instead. 
+.Sh RETURN VALUES +The +.Fn grapheme_next_line_break +function returns the offset (in codepoints) to the next possible line +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . +.Sh SEE ALSO +.Xr grapheme_next_line_break_utf8 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_line_break +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_next_line_break_utf8.3 b/man/grapheme_next_line_break_utf8.3 @@ -0,0 +1,89 @@ +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_LINE_BREAK_UTF8 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_line_break_utf8 +.Nd determine byte-offset to next possible line break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_line_break_utf8 "const char *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_line_break_utf8 +function computes the offset (in bytes) to the next possible line +break (see +.Xr libgrapheme 7 ) +in the UTF-8-encoded string +.Va str +of length +.Va len . +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For non-UTF-8 input data +.Xr grapheme_next_line_break 3 +can be used instead. +.Sh RETURN VALUES +The +.Fn grapheme_next_line_break_utf8 +function returns the offset (in bytes) to the next possible line +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . 
+.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> + +int +main(void) +{ + /* UTF-8 encoded input */ + char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" + "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" + "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" + "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; + size_t ret, len, off; + + printf("Input: \\"%s\\"\\n", s); + + /* print each line segment with byte-length */ + printf("Line segments in NUL-delimited input:\\n"); + for (off = 0; s[off] != '\\0'; off += ret) { + ret = grapheme_next_line_break_utf8(s + off, SIZE_MAX); + printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off); + } + printf("\\n"); + + /* do the same, but this time string is length-delimited */ + len = 17; + printf("Line segments in input delimited to %zu bytes:\\n", len); + for (off = 0; off < len; off += ret) { + ret = grapheme_next_line_break_utf8(s + off, len - off); + printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off); + } + + return 0; +} +.Ed +.Sh SEE ALSO +.Xr grapheme_next_line_break 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_line_break_utf8 +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_next_sentence_break.3 b/man/grapheme_next_sentence_break.3 @@ -0,0 +1,54 @@ +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_SENTENCE_BREAK 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_sentence_break +.Nd determine codepoint-offset to next sentence break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_sentence_break "const uint_least32_t *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_sentence_break +function computes the offset (in codepoints) to the next sentence +break (see +.Xr libgrapheme 7 ) +in the codepoint array +.Va str +of length +.Va len . 
+If a sentence begins at +.Va str +this offset is equal to the length of said sentence. +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For UTF-8-encoded input data +.Xr grapheme_next_sentence_break_utf8 3 +can be used instead. +.Sh RETURN VALUES +The +.Fn grapheme_next_sentence_break +function returns the offset (in codepoints) to the next sentence +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . +.Sh SEE ALSO +.Xr grapheme_next_sentence_break_utf8 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_sentence_break +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_next_sentence_break_utf8.3 b/man/grapheme_next_sentence_break_utf8.3 @@ -0,0 +1,92 @@ +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_SENTENCE_BREAK_UTF8 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_sentence_break_utf8 +.Nd determine byte-offset to next sentence break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_sentence_break_utf8 "const char *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_sentence_break_utf8 +function computes the offset (in bytes) to the next sentence +break (see +.Xr libgrapheme 7 ) +in the UTF-8-encoded string +.Va str +of length +.Va len . +If a sentence begins at +.Va str +this offset is equal to the length of said sentence. +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For non-UTF-8 input data +.Xr grapheme_next_sentence_break 3 +can be used instead. +.Sh RETURN VALUES +The +.Fn grapheme_next_sentence_break_utf8 +function returns the offset (in bytes) to the next sentence +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . 
+.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> + +int +main(void) +{ + /* UTF-8 encoded input */ + char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" + "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" + "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" + "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; + size_t ret, len, off; + + printf("Input: \\"%s\\"\\n", s); + + /* print each sentence with byte-length */ + printf("Sentences in NUL-delimited input:\\n"); + for (off = 0; s[off] != '\\0'; off += ret) { + ret = grapheme_next_sentence_break_utf8(s + off, SIZE_MAX); + printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off); + } + printf("\\n"); + + /* do the same, but this time string is length-delimited */ + len = 17; + printf("Sentences in input delimited to %zu bytes:\\n", len); + for (off = 0; off < len; off += ret) { + ret = grapheme_next_sentence_break_utf8(s + off, len - off); + printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off); + } + + return 0; +} +.Ed +.Sh SEE ALSO +.Xr grapheme_next_sentence_break 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_sentence_break_utf8 +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_next_word_break.3 b/man/grapheme_next_word_break.3 @@ -0,0 +1,54 @@ +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_WORD_BREAK 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_word_break +.Nd determine codepoint-offset to next word break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_word_break "const uint_least32_t *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_word_break +function computes the offset (in codepoints) to the next word +break (see +.Xr libgrapheme 7 ) +in the codepoint array +.Va str +of length +.Va len . 
+If a word begins at +.Va str +this offset is equal to the length of said word. +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For UTF-8-encoded input data +.Xr grapheme_next_word_break_utf8 3 +can be used instead. +.Sh RETURN VALUES +The +.Fn grapheme_next_word_break +function returns the offset (in codepoints) to the next word +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . +.Sh SEE ALSO +.Xr grapheme_next_word_break_utf8 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_word_break +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_next_word_break_utf8.3 b/man/grapheme_next_word_break_utf8.3 @@ -0,0 +1,92 @@ +.Dd 2022-08-26 +.Dt GRAPHEME_NEXT_WORD_BREAK_UTF8 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_next_word_break_utf8 +.Nd determine byte-offset to next word break +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_next_word_break_utf8 "const char *str" "size_t len" +.Sh DESCRIPTION +The +.Fn grapheme_next_word_break_utf8 +function computes the offset (in bytes) to the next word +break (see +.Xr libgrapheme 7 ) +in the UTF-8-encoded string +.Va str +of length +.Va len . +If a word begins at +.Va str +this offset is equal to the length of said word. +.Pp +If +.Va len +is set to +.Dv SIZE_MAX +(stdint.h is already included by grapheme.h) the string +.Va str +is interpreted to be NUL-terminated and processing stops when a +NUL-byte is encountered. +.Pp +For non-UTF-8 input data +.Xr grapheme_next_word_break 3 +can be used instead. +.Sh RETURN VALUES +The +.Fn grapheme_next_word_break_utf8 +function returns the offset (in bytes) to the next word +break in +.Va str +or 0 if +.Va str +is +.Dv NULL . 
+.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdint.h> +#include <stdio.h> + +int +main(void) +{ + /* UTF-8 encoded input */ + char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0" + "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0" + "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0" + "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!"; + size_t ret, len, off; + + printf("Input: \\"%s\\"\\n", s); + + /* print each word with byte-length */ + printf("Words in NUL-delimited input:\\n"); + for (off = 0; s[off] != '\\0'; off += ret) { + ret = grapheme_next_word_break_utf8(s + off, SIZE_MAX); + printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off); + } + printf("\\n"); + + /* do the same, but this time string is length-delimited */ + len = 17; + printf("Words in input delimited to %zu bytes:\\n", len); + for (off = 0; off < len; off += ret) { + ret = grapheme_next_word_break_utf8(s + off, len - off); + printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off); + } + + return 0; +} +.Ed +.Sh SEE ALSO +.Xr grapheme_next_word_break 3 , +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_next_word_break_utf8 +is compliant with the Unicode 14.0.0 specification. +.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/libgrapheme.7 b/man/libgrapheme.7 @@ -1,4 +1,4 @@ -.Dd 2021-12-22 +.Dd 2022-08-26 .Dt LIBGRAPHEME 7 .Os suckless.org .Sh NAME @@ -38,7 +38,14 @@ example illustrating the possible usage. 
.Xr grapheme_decode_utf8 3 , .Xr grapheme_encode_utf8 3 , .Xr grapheme_is_character_break 3 , -.Xr grapheme_next_character_break 3 +.Xr grapheme_next_character_break 3 , +.Xr grapheme_next_line_break 3 , +.Xr grapheme_next_sentence_break 3 , +.Xr grapheme_next_word_break 3 , +.Xr grapheme_next_character_break_utf8 3 , +.Xr grapheme_next_line_break_utf8 3 , +.Xr grapheme_next_sentence_break_utf8 3 , +.Xr grapheme_next_word_break_utf8 3 .Sh STANDARDS .Nm is compliant with the Unicode 14.0.0 specification.