utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 8239639e3fe1192c8b0c3f45ed7eb5be02853476
parent df2997a300792b8efd6a1ea9281c14dfe986d6f9
Author: Steven G. Johnson <stevenj@alum.mit.edu>
Date:   Tue, 15 Dec 2020 15:26:56 -0500

fix NULL args in grapheme_break_stateful

Diffstat:
M NEWS.md | 7 +++++++
M test/graphemetest.c | 3 +++
M utf8proc.c | 18 +++++++++++-------
3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,12 @@
 # utf8proc release history #
 
+## Version 2.6.1 ##
+
+2020-12-15
+
+ - Bugfix in `utf8proc_grapheme_break_stateful` for `NULL` state argument, which
+   also broke `utf8proc_grapheme_break`.
+
 ## Version 2.6 ##
 
 2020-11-23

diff --git a/test/graphemetest.c b/test/graphemetest.c
@@ -118,6 +118,9 @@ int main(int argc, char **argv)
     checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
     checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
 
+    check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
+    check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");
+
     printf("Passed regression tests!\n");
 
     return 0;

diff --git a/utf8proc.c b/utf8proc.c
@@ -290,13 +290,14 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
 
 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
 {
-  int lbc_override;
-  if (*state == UTF8PROC_BOUNDCLASS_START)
-    *state = lbc_override = lbc;
-  else
-    lbc_override = *state;
-  utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
   if (state) {
+    int lbc_override;
+    if (*state == UTF8PROC_BOUNDCLASS_START)
+      *state = lbc_override = lbc;
+    else
+      lbc_override = *state;
+    utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
+
     // Special support for GB 12/13 made possible by GB999. After two RI
     // class codepoints we want to force a break. Do this by resetting the
     // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
@@ -315,8 +316,11 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
     }
     else
       *state = tbc;
+
+    return break_permitted;
   }
-  return break_permitted;
+  else
+    return grapheme_break_simple(lbc, tbc);
 }
 
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(