libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 91d7e5af278c6e4231a8330a2ec6e54440b01465
parent 706b4d4ce7d76eb627aea5c9f8d4da8088c0903b
Author: Laslo Hunhold <dev@frign.de>
Date:   Mon, 12 Oct 2020 12:32:27 +0200

Rename grapheme_len() to grapheme_bytelen() and refactor manual

I wasn't happy with the previous name, because it can be a bit confusing,
given there are functions like strlen() which count the number of bytes
in a string and one might assume that grapheme_len() counts the number
of grapheme clusters in a string.

Calling it grapheme_bytelen() clears this confusion up, as one can then
tell that it's about the number of bytes in a grapheme cluster and not
the number of grapheme clusters.

The manual was refactored, inspired by the high standards set by the
OpenBSD manuals.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 2 +-
M grapheme.h | 3 +--
A man/grapheme_bytelen.3 | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D man/grapheme_len.3 | 40 ----------------------------------------
M man/libgrapheme.7 | 8 ++++----
M src/grapheme.c | 2 +-
6 files changed, 64 insertions(+), 48 deletions(-)

diff --git a/Makefile b/Makefile @@ -12,7 +12,7 @@ GBT_URL = https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest. GBP = data/gbp.txt EMO = data/emo.txt GBT = data/gbt.txt -MAN3 = man/grapheme_len.3 +MAN3 = man/grapheme_bytelen.3 MAN7 = man/libgrapheme.7 all: libgrapheme.a libgrapheme.so $(BIN) diff --git a/grapheme.h b/grapheme.h @@ -8,10 +8,9 @@ #define GRAPHEME_CP_INVALID UINT32_C(0xFFFD) int grapheme_boundary(uint32_t, uint32_t, int *); +size_t grapheme_bytelen(const char *); size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t); size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t); -size_t grapheme_len(const char *); - #endif /* GRAPHEME_H */ diff --git a/man/grapheme_bytelen.3 b/man/grapheme_bytelen.3 @@ -0,0 +1,57 @@ +.Dd 2020-03-26 +.Dt GRAPHEME_BYTELEN 3 +.Os suckless.org +.Sh NAME +.Nm grapheme_bytelen +.Nd compute grapheme cluster byte-length +.Sh SYNOPSIS +.In grapheme.h +.Ft size_t +.Fn grapheme_bytelen "const char *str" +.Sh DESCRIPTION +The +.Fn grapheme_bytelen +function computes the length (in bytes) of the grapheme cluster +(see +.Xr libgrapheme 7 ) +beginning at the NUL-terminated string +.Va str . +.Sh RETURN VALUES +The +.Fn grapheme_bytelen +function returns the length (in bytes) of the grapheme cluster beginning +at +.Va str +or 0 if +.Va str +is +.Dv NULL . +.Sh EXAMPLES +.Bd -literal +/* cc (-static) -o example example.c -lgrapheme */ +#include <grapheme.h> +#include <stdio.h> + +int +main(void) +{ + char *s = "Tëst 👨\\u200d👩\\u200d👦 🇺🇸 नी நி!"; + size_t len; + + /* print each grapheme cluster with accompanying byte-length */ + while (*s != '\\0') { + len = grapheme_bytelen(s); + printf("%2zu bytes | %.*s\\n", len, (int)len, s, len); + s += len; + } + + return 0; +} +.Ed +.Sh SEE ALSO +.Xr libgrapheme 7 +.Sh STANDARDS +.Fn grapheme_bytelen +is compliant with the Unicode 13.0.0 specification. 
+.Sh AUTHORS +.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/grapheme_len.3 b/man/grapheme_len.3 @@ -1,40 +0,0 @@ -.Dd 2020-03-26 -.Dt GRAPHEME_LEN 3 -.Os suckless.org -.Sh NAME -.Nm grapheme_len -.Nd determine grapheme cluster length -.Sh SYNOPSIS -.In grapheme.h -.Ft size_t -.Fn grapheme_len "const char *" -.Sh DESCRIPTION -.Nm -returns the length (in bytes) of the grapheme cluster beginning at -the provided char-address. -.Sh EXAMPLES -.Bd -literal -/* cc (-static) -o example example.c -lgrapheme */ -#include <grapheme.h> -#include <stdio.h> - -int -main(void) -{ - char *s = "Tëst 👨\\u200d👩\\u200d👦 🇺🇸 नी நி!"; - size_t len; - - /* print each grapheme cluster with accompanying byte-length */ - while (*s != '\\0') { - len = grapheme_len(s); - printf("%2zu bytes | %.*s\\n", len, (int)len, s, len); - s += len; - } - - return 0; -} -.Ed -.Sh SEE ALSO -.Xr libgrapheme 7 -.Sh AUTHORS -.An Laslo Hunhold Aq Mt dev@frign.de diff --git a/man/libgrapheme.7 b/man/libgrapheme.7 @@ -16,9 +16,9 @@ see .Sx MOTIVATION ) using the Unicode grapheme cluster breaking algorithm (UAX #29). .Pp -You can either count the byte-length of the grapheme cluster at the -beginning of an UTF-8-encoded string (see -.Xr grapheme_len 3 ) +You can either count the length (in bytes) of the grapheme cluster at +the beginning of an UTF-8-encoded string (see +.Xr grapheme_bytelen 3 ) or determine if a grapheme cluster breaks between two Unicode code points (see .Xr grapheme_boundary 3 ) , @@ -28,9 +28,9 @@ and .Xr grapheme_cp_encode 3 ) . .Sh SEE ALSO .Xr grapheme_boundary 3 , +.Xr grapheme_bytelen 3 .Xr grapheme_cp_decode 3 , .Xr grapheme_cp_encode 3 , -.Xr grapheme_len 3 .Sh STANDARDS .Nm is compliant with the Unicode 13.0.0 specification. diff --git a/src/grapheme.c b/src/grapheme.c @@ -5,7 +5,7 @@ #include "../grapheme.h" size_t -grapheme_len(const char *str) +grapheme_bytelen(const char *str) { uint32_t cp0, cp1; size_t ret, len = 0;