libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 950adad158c79da041c85cbb3773208988ea7477
parent f8e8649a4fd88e61f9473400f44b9b1c5fce9e7c
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 19 Dec 2021 01:22:58 +0100

Match function parameters in code and documentation

Using the same parameter names in both places improves readability
when checking the documentation against the implementation.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mgrapheme.h | 4++--
Mman/grapheme_encode_utf8.3 | 2+-
Msrc/character.c | 102++++++++++++++++++++++++++++++++++++++++----------------------------------------
Msrc/utf8.c | 30+++++++++++++++---------------
4 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/grapheme.h b/grapheme.h @@ -12,8 +12,8 @@ struct grapheme_internal_heisenstate { }; typedef struct grapheme_internal_segmentation_state { - struct grapheme_internal_heisenstate a; - struct grapheme_internal_heisenstate b; + struct grapheme_internal_heisenstate cp0; + struct grapheme_internal_heisenstate cp1; uint_least16_t flags; } GRAPHEME_STATE; diff --git a/man/grapheme_encode_utf8.3 b/man/grapheme_encode_utf8.3 @@ -7,7 +7,7 @@ .Sh SYNOPSIS .In grapheme.h .Ft size_t -.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *" "size_t" +.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len" .Sh DESCRIPTION The .Fn grapheme_encode_utf8 diff --git a/src/character.c b/src/character.c @@ -14,7 +14,7 @@ enum { }; bool -grapheme_is_character_break(uint_least32_t a, uint_least32_t b, GRAPHEME_STATE *state) +grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state) { struct grapheme_internal_heisenstate *p[2] = { 0 }; uint_least16_t flags = 0; @@ -22,14 +22,14 @@ grapheme_is_character_break(uint_least32_t a, uint_least32_t b, GRAPHEME_STATE * /* set state depending on state pointer */ if (state != NULL) { - p[0] = &(state->a); - p[1] = &(state->b); + p[0] = &(state->cp0); + p[1] = &(state->cp1); flags = state->flags; } /* skip printable ASCII */ - if ((a >= 0x20 && a <= 0x7E) && - (b >= 0x20 && b <= 0x7E)) { + if ((cp0 >= 0x20 && cp0 <= 0x7E) && + (cp1 >= 0x20 && cp1 <= 0x7E)) { goto hasbreak; } @@ -41,8 +41,8 @@ grapheme_is_character_break(uint_least32_t a, uint_least32_t b, GRAPHEME_STATE * /* * update flags, if state-pointer given */ - if (has_property(b, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) { - if (has_property(a, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) { + if (has_property(cp1, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) { + if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) { /* one more RI is on the left side of the seam, flip 
state */ flags ^= CHARACTER_FLAG_RI_ODD; } else { @@ -52,22 +52,22 @@ grapheme_is_character_break(uint_least32_t a, uint_least32_t b, GRAPHEME_STATE * } } if (!(flags & CHARACTER_FLAG_EMOJI) && - ((has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) || - (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)))) { + ((has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_ZWJ)) || + (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTEND)))) { flags |= CHARACTER_FLAG_EMOJI; } else if ((flags & CHARACTER_FLAG_EMOJI) && - ((has_property(a, p[0], character_prop, CHARACTER_PROP_ZWJ) && - has_property(b, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) || - (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTEND) && - has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)) || - (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTEND) && - has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) || - (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) || - (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)))) { + ((has_property(cp0, p[0], character_prop, CHARACTER_PROP_ZWJ) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) || + (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTEND) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTEND)) || + (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTEND) && + has_property(cp1, p[1], character_prop, 
CHARACTER_PROP_ZWJ)) || + (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_ZWJ)) || + (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTEND)))) { /* CHARACTER_FLAG_EMOJI remains */ } else { flags &= ~CHARACTER_FLAG_EMOJI; @@ -85,76 +85,76 @@ grapheme_is_character_break(uint_least32_t a, uint_least32_t b, GRAPHEME_STATE * /* skip GB1 and GB2, as they are never satisfied here */ /* GB3 */ - if (has_property(a, p[0], character_prop, CHARACTER_PROP_CR) && - has_property(b, p[1], character_prop, CHARACTER_PROP_LF)) { + if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_CR) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_LF)) { goto nobreak; } /* GB4 */ - if (has_property(a, p[0], character_prop, CHARACTER_PROP_CONTROL) || - has_property(a, p[0], character_prop, CHARACTER_PROP_CR) || - has_property(a, p[0], character_prop, CHARACTER_PROP_LF)) { + if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_CONTROL) || + has_property(cp0, p[0], character_prop, CHARACTER_PROP_CR) || + has_property(cp0, p[0], character_prop, CHARACTER_PROP_LF)) { goto hasbreak; } /* GB5 */ - if (has_property(b, p[1], character_prop, CHARACTER_PROP_CONTROL) || - has_property(b, p[1], character_prop, CHARACTER_PROP_CR) || - has_property(b, p[1], character_prop, CHARACTER_PROP_LF)) { + if (has_property(cp1, p[1], character_prop, CHARACTER_PROP_CONTROL) || + has_property(cp1, p[1], character_prop, CHARACTER_PROP_CR) || + has_property(cp1, p[1], character_prop, CHARACTER_PROP_LF)) { goto hasbreak; } /* GB6 */ - if (has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_L) && - (has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_L) || - has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_V) || - has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_LV) || + if 
(has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_L) && + (has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_L) || + has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_V) || + has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_LV) || - has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_LVT))) { + has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_LVT))) { goto nobreak; } /* GB7 */ - if ((has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_LV) || - has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_V)) && - (has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_V) || - has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_T))) { + if ((has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_LV) || + has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_V)) && + (has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_V) || + has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_T))) { goto nobreak; } /* GB8 */ - if ((has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_LVT) || - has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_T)) && - has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_T)) { + if ((has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_LVT) || + has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_T)) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_T)) { goto nobreak; } /* GB9 */ - if (has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND) || - has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) { + if (has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTEND) || + has_property(cp1, p[1], character_prop, CHARACTER_PROP_ZWJ)) { goto nobreak; } /* GB9a */ - if (has_property(b, p[1], character_prop, CHARACTER_PROP_SPACINGMARK)) { + if (has_property(cp1, p[1], character_prop, CHARACTER_PROP_SPACINGMARK)) { goto nobreak; } /* GB9b */ - if 
(has_property(a, p[0], character_prop, CHARACTER_PROP_PREPEND)) { + if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_PREPEND)) { goto nobreak; } /* GB11 */ if ((flags & CHARACTER_FLAG_EMOJI) && - has_property(a, p[0], character_prop, CHARACTER_PROP_ZWJ) && - has_property(b, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) { + has_property(cp0, p[0], character_prop, CHARACTER_PROP_ZWJ) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) { goto nobreak; } /* GB12/GB13 */ - if (has_property(a, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) && - has_property(b, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) && + if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) && + has_property(cp1, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) && (flags & CHARACTER_FLAG_RI_ODD)) { goto nobreak; } @@ -166,8 +166,8 @@ nobreak: hasbreak: if (state != NULL) { /* move b-state to a-state, discard b-state */ - memcpy(&(state->a), &(state->b), sizeof(state->a)); - memset(&(state->b), 0, sizeof(state->b)); + memcpy(&(state->cp0), &(state->cp1), sizeof(state->cp0)); + memset(&(state->cp1), 0, sizeof(state->cp1)); /* reset flags */ if (isbreak) { diff --git a/src/utf8.c b/src/utf8.c @@ -48,11 +48,11 @@ static const struct { }; size_t -grapheme_decode_utf8(const char *s, size_t n, uint_least32_t *cp) +grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp) { size_t off, i; - if (s == NULL || n == 0) { + if (str == NULL || len == 0) { /* a sequence must be at least 1 byte long */ *cp = GRAPHEME_INVALID_CODEPOINT; return 0; @@ -60,14 +60,14 @@ grapheme_decode_utf8(const char *s, size_t n, uint_least32_t *cp) /* identify sequence type with the first byte */ for (off = 0; off < LEN(lut); off++) { - if (BETWEEN(((const unsigned char *)s)[0], lut[off].lower, + if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower, lut[off].upper)) { /* * first byte is within the bounds; 
fill * p with the the first bits contained in * the first byte (by subtracting the high bits) */ - *cp = ((const unsigned char *)s)[0] - lut[off].lower; + *cp = ((const unsigned char *)str)[0] - lut[off].lower; break; } } @@ -82,7 +82,7 @@ grapheme_decode_utf8(const char *s, size_t n, uint_least32_t *cp) *cp = GRAPHEME_INVALID_CODEPOINT; return 1; } - if (1 + off > n) { + if (1 + off > len) { /* * input is not long enough, set cp as invalid */ @@ -93,8 +93,8 @@ grapheme_decode_utf8(const char *s, size_t n, uint_least32_t *cp) * else in case we have a "rogue" case where e.g. such a * sequence starter occurs right before a NUL-byte. */ - for (i = 0; 1 + i < n; i++) { - if(!BETWEEN(((const unsigned char *)s)[1 + i], + for (i = 0; 1 + i < len; i++) { + if(!BETWEEN(((const unsigned char *)str)[1 + i], 0x80, 0xBF)) { break; } @@ -106,7 +106,7 @@ grapheme_decode_utf8(const char *s, size_t n, uint_least32_t *cp) * Otherwise return the number of bytes we actually * expected, which is larger than n. */ - return ((1 + i) < n) ? (1 + i) : (1 + off); + return ((1 + i) < len) ? (1 + i) : (1 + off); } /* @@ -114,7 +114,7 @@ grapheme_decode_utf8(const char *s, size_t n, uint_least32_t *cp) * (i.e. 
between 0x80 (10000000) and 0xBF (10111111)) */ for (i = 1; i <= off; i++) { - if(!BETWEEN(((const unsigned char *)s)[i], 0x80, 0xBF)) { + if(!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) { /* * byte does not match format; return * number of bytes processed excluding the @@ -132,7 +132,7 @@ grapheme_decode_utf8(const char *s, size_t n, uint_least32_t *cp) * shift codepoint by 6 bits and add the 6 stored bits * in s[i] to it using the bitmask 0x3F (00111111) */ - *cp = (*cp << 6) | (((const unsigned char *)s)[i] & 0x3F); + *cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F); } if (*cp < lut[off].mincp || @@ -151,7 +151,7 @@ grapheme_decode_utf8(const char *s, size_t n, uint_least32_t *cp) } size_t -grapheme_encode_utf8(uint_least32_t cp, char *s, size_t n) +grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len) { size_t off, i; @@ -171,7 +171,7 @@ grapheme_encode_utf8(uint_least32_t cp, char *s, size_t n) break; } } - if (1 + off > n || s == NULL || n == 0) { + if (1 + off > len || str == NULL || len == 0) { /* * specified buffer is too small to store sequence or * the caller just wanted to know how many bytes the @@ -191,7 +191,7 @@ grapheme_encode_utf8(uint_least32_t cp, char *s, size_t n) * We do not overwrite the mask because we guaranteed earlier * that there are no bits higher than the mask allows. */ - ((unsigned char *)s)[0] = lut[off].lower | (uint8_t)(cp >> (6 * off)); + ((unsigned char *)str)[0] = lut[off].lower | (uint8_t)(cp >> (6 * off)); for (i = 1; i <= off; i++) { /* @@ -200,8 +200,8 @@ grapheme_encode_utf8(uint_least32_t cp, char *s, size_t n) * extract from the properly-shifted value using the * mask 00111111 (0x3F) */ - ((unsigned char *)s)[i] = 0x80 | - ((cp >> (6 * (off - i))) & 0x3F); + ((unsigned char *)str)[i] = 0x80 | + ((cp >> (6 * (off - i))) & 0x3F); } return 1 + off;