libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit aa5dda2687c4907d6a47e57b1d7973b8f9d158ae
parent 25d89e6e460e68329e7a3f388fe3e150a8f5474a
Author: Laslo Hunhold <dev@frign.de>
Date:   Tue, 16 Aug 2022 16:25:31 +0200

Move get_codepoint_*()-util-functions to src/util.c

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/case.c | 62 --------------------------------------------------------------
M src/line.c | 28 +++-------------------------
M src/sentence.c | 30 ++++--------------------------
M src/util.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/util.h | 6 ++++++
M src/word.c | 24 ------------------------
6 files changed, 75 insertions(+), 137 deletions(-)

diff --git a/src/case.c b/src/case.c @@ -33,68 +33,6 @@ get_case_offset(uint_least32_t cp, const uint_least16_t *major, } static inline size_t -get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) -{ - if (offset < len) { - *cp = ((const uint_least32_t *)str)[offset]; - return 1; - } else { - *cp = GRAPHEME_INVALID_CODEPOINT; - return 0; - } -} - -static inline size_t -get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp) -{ - size_t ret; - - if (offset < len) { - ret = grapheme_decode_utf8((const char *)str + offset, - len - offset, cp); - - if (unlikely(len == SIZE_MAX && cp == 0)) { - return 0; - } else { - return ret; - } - } else { - *cp = GRAPHEME_INVALID_CODEPOINT; - return 0; - } -} - -static inline size_t -set_codepoint(uint_least32_t cp, void *str, size_t len, size_t offset) -{ - if (str == NULL || len == 0) { - return 1; - } - - if (offset < len) { - ((uint_least32_t *)str)[offset] = cp; - return 1; - } else { - return 0; - } -} - -static inline size_t -set_codepoint_utf8(uint_least32_t cp, void *str, size_t len, size_t offset) -{ - if (str == NULL || len == 0) { - return grapheme_encode_utf8(cp, NULL, 0); - } - - if (offset < len) { - return grapheme_encode_utf8(cp, (char *)str + offset, - len - offset); - } else { - return grapheme_encode_utf8(cp, NULL, 0); - } -} - -static inline size_t to_case(const void *src, size_t srclen, void *dest, size_t destlen, size_t srcnumprocess, uint_least8_t final_sigma_level, size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *), diff --git a/src/line.c b/src/line.c @@ -19,30 +19,6 @@ get_break_prop(uint_least32_t cp) } } -static inline size_t -get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) -{ - if (offset < len) { - *cp = ((const uint_least32_t *)str)[offset]; - return 1; - } else { - *cp = GRAPHEME_INVALID_CODEPOINT; - return 0; - } -} - -static inline size_t -get_codepoint_utf8(const void *str, size_t len, size_t 
offset, uint_least32_t *cp) -{ - if (offset < len) { - return grapheme_decode_utf8((const char *)str + offset, - len - offset, cp); - } else { - *cp = GRAPHEME_INVALID_CODEPOINT; - return 0; - } -} - static size_t next_line_break(const void *str, size_t len, size_t (*get_codepoint) (const void *, size_t, size_t, uint_least32_t *)) @@ -152,7 +128,9 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint) * and one (CL | CP) to the left of the middle * spot */ - if (lb25_level == 0 && cp0_prop == LINE_BREAK_PROP_NU) { + if ((lb25_level == 0 || + lb25_level == 1) && + cp0_prop == LINE_BREAK_PROP_NU) { /* sequence has begun */ lb25_level = 1; } else if ((lb25_level == 1 || lb25_level == 2) && diff --git a/src/sentence.c b/src/sentence.c @@ -20,30 +20,6 @@ get_break_prop(uint_least32_t cp) } } -static inline size_t -get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) -{ - if (offset < len) { - *cp = ((const uint_least32_t *)str)[offset]; - return 1; - } else { - *cp = GRAPHEME_INVALID_CODEPOINT; - return 0; - } -} - -static inline size_t -get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp) -{ - if (offset < len) { - return grapheme_decode_utf8((const char *)str + offset, - len - offset, cp); - } else { - *cp = GRAPHEME_INVALID_CODEPOINT; - return 0; - } -} - static size_t next_sentence_break(const void *str, size_t len, size_t (*get_codepoint) (const void *, size_t, size_t, uint_least32_t *)) @@ -142,7 +118,8 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint) * left of the middle spot. 
* */ - if (aterm_close_sp_level == 0 && + if ((aterm_close_sp_level == 0 || + aterm_close_sp_level == 1) && skip.b == SENTENCE_BREAK_PROP_ATERM) { /* sequence has begun */ aterm_close_sp_level = 1; @@ -162,7 +139,8 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint) aterm_close_sp_level = 0; } - if (saterm_close_sp_parasep_level == 0 && + if ((saterm_close_sp_parasep_level == 0 || + saterm_close_sp_parasep_level == 1) && (skip.b == SENTENCE_BREAK_PROP_STERM || skip.b == SENTENCE_BREAK_PROP_ATERM)) { /* sequence has begun */ diff --git a/src/util.c b/src/util.c @@ -6,3 +6,65 @@ #include "../gen/types.h" #include "../grapheme.h" #include "util.h" + +inline size_t +get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) +{ + if (offset < len) { + *cp = ((const uint_least32_t *)str)[offset]; + return 1; + } else { + *cp = GRAPHEME_INVALID_CODEPOINT; + return 0; + } +} + +inline size_t +get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp) +{ + size_t ret; + + if (offset < len) { + ret = grapheme_decode_utf8((const char *)str + offset, + len - offset, cp); + + if (unlikely(len == SIZE_MAX && cp == 0)) { + return 0; + } else { + return ret; + } + } else { + *cp = GRAPHEME_INVALID_CODEPOINT; + return 0; + } +} + +inline size_t +set_codepoint(uint_least32_t cp, void *str, size_t len, size_t offset) +{ + if (str == NULL || len == 0) { + return 1; + } + + if (offset < len) { + ((uint_least32_t *)str)[offset] = cp; + return 1; + } else { + return 0; + } +} + +inline size_t +set_codepoint_utf8(uint_least32_t cp, void *str, size_t len, size_t offset) +{ + if (str == NULL || len == 0) { + return grapheme_encode_utf8(cp, NULL, 0); + } + + if (offset < len) { + return grapheme_encode_utf8(cp, (char *)str + offset, + len - offset); + } else { + return grapheme_encode_utf8(cp, NULL, 0); + } +} diff --git a/src/util.h b/src/util.h @@ -25,4 +25,10 @@ #define unlikely(expr) (expr) #endif +size_t 
get_codepoint(const void *, size_t, size_t, uint_least32_t *); +size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *); + +size_t set_codepoint(uint_least32_t, void *, size_t, size_t); +size_t set_codepoint_utf8(uint_least32_t, void *, size_t, size_t); + #endif /* UTIL_H */ diff --git a/src/word.c b/src/word.c @@ -19,30 +19,6 @@ get_break_prop(uint_least32_t cp) } } -static inline size_t -get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) -{ - if (offset < len) { - *cp = ((const uint_least32_t *)str)[offset]; - return 1; - } else { - *cp = GRAPHEME_INVALID_CODEPOINT; - return 0; - } -} - -static inline size_t -get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp) -{ - if (offset < len) { - return grapheme_decode_utf8((const char *)str + offset, - len - offset, cp); - } else { - *cp = GRAPHEME_INVALID_CODEPOINT; - return 0; - } -} - static size_t next_word_break(const void *str, size_t len, size_t (*get_codepoint) (const void *, size_t, size_t, uint_least32_t *))