libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 5332f7ee034081618617c2b0785733ccc9ec8753
parent 563eb65bfbaa4f27c77d73ae81b51882c916993d
Author: Laslo Hunhold <dev@frign.de>
Date:   Wed, 21 Sep 2022 20:16:00 +0200

Refactor case-conversion-functions with Herodotus

The readability of the code is greatly improved, and the code is now
much more robust than before.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M src/case.c | 255 +++++++++++++++++++++++++++++++++++--------------------------------------------
1 file changed, 112 insertions(+), 143 deletions(-)

diff --git a/src/case.c b/src/case.c @@ -33,22 +33,18 @@ get_case_offset(uint_least32_t cp, const uint_least16_t *major, } static inline size_t -to_case(const void *src, size_t srclen, void *dest, size_t destlen, - size_t srcnumprocess, uint_least8_t final_sigma_level, - size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *), - size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t), - const uint_least16_t *major, const int_least32_t *minor, - const struct special_case *sc) +to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w, + uint_least8_t final_sigma_level, const uint_least16_t *major, + const int_least32_t *minor, const struct special_case *sc) { + HERODOTUS_READER tmp; enum case_property prop; - size_t srcoff, destoff, res, tmp, off, i; + enum herodotus_status s; + size_t off, i; uint_least32_t cp, tmp_cp; int_least32_t map; - for (srcoff = 0, destoff = 0; srcoff < srcnumprocess; srcoff += res) { - /* read in next source codepoint */ - res = get_codepoint((const char *)src, srclen, srcoff, &cp); - + for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) { if (sc == lower_special) { /* * For the special Final_Sigma-rule (see SpecialCasing.txt), @@ -72,8 +68,10 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen, * if the succeeding character is cased, invalidating * the after-condition */ - for (tmp = srcoff + res, prop = NUM_CASE_PROPS; tmp < srclen; ) { - tmp += get_codepoint(src, srclen, tmp, &tmp_cp); + herodotus_reader_copy(r, &tmp); + for (prop = NUM_CASE_PROPS; + (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) == + HERODOTUS_STATUS_SUCCESS; ) { prop = get_case_property(tmp_cp); if (prop != CASE_PROP_CASE_IGNORABLE && @@ -83,20 +81,19 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen, } /* - * Now prop is something other than case-ignorable. + * Now prop is something other than case-ignorable or + * the source-string ended. 
* If it is something other than cased, we know * that the after-condition holds */ - if (prop != CASE_PROP_CASED && - prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) { + if (s != HERODOTUS_STATUS_SUCCESS || + (prop != CASE_PROP_CASED && + prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) { /* * write GREEK SMALL LETTER FINAL SIGMA to * destination */ - destoff += set_codepoint(UINT32_C(0x03C2), - dest, - destlen, - destoff); + herodotus_write_codepoint(w, UINT32_C(0x03C2)); /* reset Final_Sigma-state and continue */ final_sigma_level = 0; @@ -132,191 +129,163 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen, off = (uint_least32_t)map - UINT32_C(0x110000); for (i = 0; i < sc[off].cplen; i++) { - if (likely(destoff < destlen)) { - /* - * write special mapping to destination - */ - destoff += set_codepoint(sc[off].cp[i], - dest, - destlen, - destoff); - } else { - /* - * further increase destoff to indicate - * how much buffer space we need - */ - destoff += set_codepoint(sc[off].cp[i], - NULL, 0, 0); - } + herodotus_write_codepoint(w, sc[off].cp[i]); } } else { /* we have a simple mapping */ - if (likely(destoff < destlen)) { - destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map), - dest, destlen, destoff); - } else { - destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map), - NULL, 0, 0); - } + herodotus_write_codepoint(w, (uint_least32_t) + ((int_least32_t)cp + map)); } } - if (set_codepoint == set_codepoint_utf8 && destlen > 0) { - /* - * NUL-terminate destination to always ensure NUL-termination, - * unless in check mode. - * Just like with snprintf() a return value >= destlen indicates - * truncation. - */ - ((char *)dest)[(destoff < destlen) ? 
destoff : (destlen - 1)] = '\0'; + herodotus_writer_nul_terminate(w); + + return herodotus_writer_number_written(w); +} + +static size_t +herodotus_next_word_break(const HERODOTUS_READER *r) +{ + if (r->src == NULL || r->off > r->srclen) { + return 0; } - return destoff; + if (r->type == HERODOTUS_TYPE_CODEPOINT) { + return grapheme_next_word_break( + ((const uint_least32_t *)(r->src)) + r->off, + r->srclen - r->off); + } else { /* r->type == HERODOTUS_TYPE_UTF8 */ + return grapheme_next_word_break_utf8( + ((const char *)(r->src)) + r->off, + r->srclen - r->off); + } } static inline size_t -to_titlecase(const void *src, size_t srclen, void *dest, size_t destlen, - size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *), - size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t)) +to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w) { enum case_property prop; - size_t next_wb, srcoff, destoff, res; + enum herodotus_status s; uint_least32_t cp; - for (srcoff = destoff = 0; ; ) { - if (get_codepoint == get_codepoint_utf8) { - if ((next_wb = grapheme_next_word_break_utf8((const char *)src + srcoff, - srclen - srcoff)) == 0) { - /* we consumed all of the string */ - break; - } - } else { - if ((next_wb = grapheme_next_word_break((const uint_least32_t *)src + srcoff, - srclen - srcoff)) == 0) { - /* we consumed all of the string */ - break; - } - } - - for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff += res) { + for (;;) { + herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r)); + for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) { /* check if we have a cased character */ - res = get_codepoint(src, srclen, srcoff, &cp); prop = get_case_property(cp); if (prop == CASE_PROP_CASED || prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) { break; } else { /* write the data to the output verbatim, it if permits */ - destoff += set_codepoint_utf8(cp, dest, destlen, destoff); - } - } + 
herodotus_write_codepoint(w, cp); - if (next_wb > 0) { - /* get character length */ - res = get_codepoint(src, srclen, srcoff, &cp); - - /* we have a cased character at srcoff, map it to titlecase */ - if (get_codepoint == get_codepoint_utf8) { - destoff += to_case((const char *)src + srcoff, - srclen - srcoff, - (char *)dest + destoff, - (destoff < destlen) ? (destlen - destoff) : 0, - res, 0, - get_codepoint_utf8, - set_codepoint_utf8, title_major, - title_minor, title_special); - } else { - destoff += to_case((const uint_least32_t *)src + srcoff, - srclen - srcoff, - (uint_least32_t *)dest + destoff, - (destoff < destlen) ? (destlen - destoff) : 0, - res, 0, - get_codepoint, - set_codepoint, title_major, - title_minor, title_special); + /* increment reader */ + herodotus_read_codepoint(r, true, &cp); } - - /* we consumed a character */ - srcoff += res; - next_wb -= res; } - /* cast the rest of the codepoints in the word to lowercase */ - if (get_codepoint == get_codepoint_utf8) { - destoff += to_case((const char *)src + srcoff, - srclen - srcoff, - (char *)dest + destoff, - (destoff < destlen) ? (destlen - destoff) : 0, - next_wb, 1, - get_codepoint_utf8, - set_codepoint_utf8, lower_major, - lower_minor, lower_special); + if (s == HERODOTUS_STATUS_END_OF_BUFFER) { + /* we are done */ + break; + } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) { + /* + * we did not encounter any cased character + * up to the word break + */ + continue; } else { - destoff += to_case((const uint_least32_t *)src + srcoff, - srclen - srcoff, - (uint_least32_t *)dest + destoff, - (destoff < destlen) ? 
(destlen - destoff) : 0, - next_wb, 1, - get_codepoint, - set_codepoint, lower_major, - lower_minor, lower_special); + /* + * we encountered a cased character before the word + * break, convert it to titlecase + */ + herodotus_reader_push_advance_limit(r, + herodotus_reader_next_codepoint_break(r)); + to_case(r, w, 0, title_major, title_minor, title_special); + herodotus_reader_pop_limit(r); } - srcoff += next_wb; - } - if (set_codepoint == set_codepoint_utf8) { - /* - * NUL-terminate destination to always ensure NUL-termination. - * Just like with snprintf() a return value >= destlen indicates - * truncation. - */ - ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0'; + /* cast the rest of the codepoints in the word to lowercase */ + to_case(r, w, 1, lower_major, lower_minor, lower_special); + + herodotus_reader_pop_limit(r); } - return destoff; + herodotus_writer_nul_terminate(w); + + return herodotus_writer_number_written(w); } size_t grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen) { - return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint, - upper_major, upper_minor, upper_special); + HERODOTUS_READER r; + HERODOTUS_WRITER w; + + herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); + herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); + + return to_case(&r, &w, 0, upper_major, upper_minor, upper_special); } size_t grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen) { - return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint, - lower_major, lower_minor, lower_special); + HERODOTUS_READER r; + HERODOTUS_WRITER w; + + herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); + herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); + + return to_case(&r, &w, 0, lower_major, lower_minor, lower_special); } size_t grapheme_to_titlecase(const 
uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen) { - return to_titlecase(src, srclen, dest, destlen, get_codepoint, - set_codepoint); + HERODOTUS_READER r; + HERODOTUS_WRITER w; + + herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); + herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); + + return to_titlecase(&r, &w); } size_t grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen) { - return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8, - upper_major, upper_minor, upper_special); + HERODOTUS_READER r; + HERODOTUS_WRITER w; + + herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); + herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); + + return to_case(&r, &w, 0, upper_major, upper_minor, upper_special); } size_t grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen) { - return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8, - lower_major, lower_minor, lower_special); + HERODOTUS_READER r; + HERODOTUS_WRITER w; + + herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); + herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); + return to_case(&r, &w, 0, lower_major, lower_minor, lower_special); } size_t grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen) { - return to_titlecase(src, srclen, dest, destlen, get_codepoint_utf8, - set_codepoint_utf8); + HERODOTUS_READER r; + HERODOTUS_WRITER w; + + herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); + herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); + + return to_titlecase(&r, &w); } static inline bool