libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit dfda0db8503b0051addc96368840b06c22fa8eeb
parent 29c6958306523d0370488403717047ceb960bc69
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 18 Dec 2021 12:02:20 +0100

Rename functions/defines/files from "grapheme" to "character"

It was always confusing to have "grapheme" used in two different
contexts: one is the library name, which is also the prefix for all
constants; the other indicates that we have functions for analyzing
grapheme clusters.

We rename all functions related to graphemes to operate on "characters"
instead, where these are user-perceived characters. This naming choice
is not out of the ordinary and is also what libunistring, for instance,
uses.

Additionally, rename gen/grapheme.c to gen/character-prop.c to indicate
that we extract properties, improving readability. This also removes a
bit of ambiguity regarding internal constants prefixed with GRAPHEME_,
which might suggest that these were "officially" from grapheme.h, even
though they are only used internally, specifically for characters.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 32 ++++++++++++++++----------------
A gen/character-prop.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A gen/character-test.c | 19 +++++++++++++++++++
D gen/grapheme-test.c | 19 -------------------
D gen/grapheme.c | 93 -------------------------------------------------------------------------------
M grapheme.h | 4 ++--
A src/character.c | 228 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/grapheme.c | 228 -------------------------------------------------------------------------------
A test/character-performance.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/character.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
D test/grapheme-performance.c | 63 ---------------------------------------------------------------
D test/grapheme.c | 45 ---------------------------------------------
12 files changed, 466 insertions(+), 466 deletions(-)

diff --git a/Makefile b/Makefile @@ -10,17 +10,17 @@ DATA =\ data/GraphemeBreakTest.txt\ GEN =\ - gen/grapheme\ - gen/grapheme-test\ + gen/character-prop\ + gen/character-test\ SRC =\ - src/grapheme\ + src/character\ src/utf8\ src/util\ TEST =\ - test/grapheme\ - test/grapheme-performance\ + test/character\ + test/character-performance\ test/utf8-decode\ test/utf8-encode\ @@ -34,27 +34,27 @@ MAN7 = man/libgrapheme.7 all: libgrapheme.a libgrapheme.so -gen/grapheme.o: gen/grapheme.c config.mk gen/util.h -gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h +gen/character-prop.o: gen/character-prop.c config.mk gen/util.h +gen/character-test.o: gen/character-test.c config.mk gen/util.h gen/util.o: gen/util.c config.mk gen/util.h -src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h +src/character.o: src/character.c config.mk gen/character-prop.h grapheme.h src/util.h src/utf8.o: src/utf8.c config.mk grapheme.h src/util.o: src/util.c config.mk grapheme.h src/util.h -test/grapheme.o: test/grapheme.c config.mk gen/grapheme-test.h grapheme.h test/util.h -test/grapheme-performance.o: test/grapheme-performance.c config.mk gen/grapheme-test.h grapheme.h test/util.h +test/character.o: test/character.c config.mk gen/character-test.h grapheme.h test/util.h +test/character-performance.o: test/character-performance.c config.mk gen/character-test.h grapheme.h test/util.h test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/util.h test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h test/util.o: test/util.c config.mk test/util.h -gen/grapheme: gen/grapheme.o gen/util.o -gen/grapheme-test: gen/grapheme-test.o gen/util.o -test/grapheme: test/grapheme.o test/util.o libgrapheme.a -test/grapheme-performance: test/grapheme-performance.o test/util.o libgrapheme.a +gen/character-prop: gen/character-prop.o gen/util.o +gen/character-test: gen/character-test.o gen/util.o +test/character: test/character.o test/util.o libgrapheme.a 
+test/character-performance: test/character-performance.o test/util.o libgrapheme.a test/utf8-encode: test/utf8-encode.o test/util.o libgrapheme.a test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a -gen/grapheme.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/grapheme -gen/grapheme-test.h: data/GraphemeBreakTest.txt gen/grapheme-test +gen/character-prop.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character-prop +gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test data/emoji-data.txt: wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt diff --git a/gen/character-prop.c b/gen/character-prop.c @@ -0,0 +1,93 @@ +/* See LICENSE file for copyright and license details. */ +#include <stddef.h> + +#include "util.h" + +#define FILE_EMOJI "data/emoji-data.txt" +#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt" + +static struct property segment_property[] = { + { + .enumname = "CHARACTER_PROP_CONTROL", + .identifier = "Control", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_CR", + .identifier = "CR", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_EXTEND", + .identifier = "Extend", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_EXTENDED_PICTOGRAPHIC", + .identifier = "Extended_Pictographic", + .fname = FILE_EMOJI, + }, + { + .enumname = "CHARACTER_PROP_HANGUL_L", + .identifier = "L", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_HANGUL_V", + .identifier = "V", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_HANGUL_T", + .identifier = "T", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_HANGUL_LV", + .identifier = "LV", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_HANGUL_LVT", + .identifier = "LVT", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_LF", + .identifier = "LF", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_PREPEND", + 
.identifier = "Prepend", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_REGIONAL_INDICATOR", + .identifier = "Regional_Indicator", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_SPACINGMARK", + .identifier = "SpacingMark", + .fname = FILE_GRAPHEME, + }, + { + .enumname = "CHARACTER_PROP_ZWJ", + .identifier = "ZWJ", + .fname = FILE_GRAPHEME, + }, +}; + +int +main(int argc, char *argv[]) +{ + (void)argc; + + property_list_parse(segment_property, LEN(segment_property)); + property_list_print(segment_property, LEN(segment_property), + "character_prop", argv[0]); + property_list_free(segment_property, LEN(segment_property)); + + return 0; +} diff --git a/gen/character-test.c b/gen/character-test.c @@ -0,0 +1,19 @@ +/* See LICENSE file for copyright and license details. */ +#include <stddef.h> + +#include "util.h" + +int +main(int argc, char *argv[]) +{ + struct segment_test *st = NULL; + size_t numsegtests = 0; + + (void)argc; + + segment_test_list_parse("data/GraphemeBreakTest.txt", &st, &numsegtests); + segment_test_list_print(st, numsegtests, "character_test", argv[0]); + segment_test_list_free(st, numsegtests); + + return 0; +} diff --git a/gen/grapheme-test.c b/gen/grapheme-test.c @@ -1,19 +0,0 @@ -/* See LICENSE file for copyright and license details. */ -#include <stddef.h> - -#include "util.h" - -int -main(int argc, char *argv[]) -{ - struct segment_test *st = NULL; - size_t numsegtests = 0; - - (void)argc; - - segment_test_list_parse("data/GraphemeBreakTest.txt", &st, &numsegtests); - segment_test_list_print(st, numsegtests, "grapheme_test", argv[0]); - segment_test_list_free(st, numsegtests); - - return 0; -} diff --git a/gen/grapheme.c b/gen/grapheme.c @@ -1,93 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include <stddef.h> - -#include "util.h" - -#define FILE_EMOJI "data/emoji-data.txt" -#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt" - -static struct property segment_property[] = { - { - .enumname = "GRAPHEME_PROP_CONTROL", - .identifier = "Control", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_CR", - .identifier = "CR", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_EXTEND", - .identifier = "Extend", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC", - .identifier = "Extended_Pictographic", - .fname = FILE_EMOJI, - }, - { - .enumname = "GRAPHEME_PROP_HANGUL_L", - .identifier = "L", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_HANGUL_V", - .identifier = "V", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_HANGUL_T", - .identifier = "T", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_HANGUL_LV", - .identifier = "LV", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_HANGUL_LVT", - .identifier = "LVT", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_LF", - .identifier = "LF", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_PREPEND", - .identifier = "Prepend", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_REGIONAL_INDICATOR", - .identifier = "Regional_Indicator", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_SPACINGMARK", - .identifier = "SpacingMark", - .fname = FILE_GRAPHEME, - }, - { - .enumname = "GRAPHEME_PROP_ZWJ", - .identifier = "ZWJ", - .fname = FILE_GRAPHEME, - }, -}; - -int -main(int argc, char *argv[]) -{ - (void)argc; - - property_list_parse(segment_property, LEN(segment_property)); - property_list_print(segment_property, LEN(segment_property), - "grapheme_prop", argv[0]); - property_list_free(segment_property, LEN(segment_property)); - - return 0; -} diff --git a/grapheme.h b/grapheme.h @@ -19,9 +19,9 @@ typedef struct 
lg_internal_segmentation_state { #define LG_INVALID_CODE_POINT UINT32_C(0xFFFD) -size_t lg_grapheme_nextbreak(const char *); +size_t lg_character_nextbreak(const char *); -bool lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *); +bool lg_character_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *); size_t lg_utf8_decode(const char *, size_t, uint_least32_t *); size_t lg_utf8_encode(uint_least32_t, char *, size_t); diff --git a/src/character.c b/src/character.c @@ -0,0 +1,228 @@ +/* See LICENSE file for copyright and license details. */ +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +#include "../gen/character-prop.h" +#include "../grapheme.h" +#include "util.h" + +enum { + CHARACTER_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ + CHARACTER_FLAG_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ +}; + +bool +lg_character_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state) +{ + struct lg_internal_heisenstate *p[2] = { 0 }; + uint_least16_t flags = 0; + bool isbreak = true; + + /* set state depending on state pointer */ + if (state != NULL) { + p[0] = &(state->a); + p[1] = &(state->b); + flags = state->flags; + } + + /* skip printable ASCII */ + if ((a >= 0x20 && a <= 0x7E) && + (b >= 0x20 && b <= 0x7E)) { + goto hasbreak; + } + + /* + * Apply grapheme cluster breaking algorithm (UAX #29), see + * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules + */ + + /* + * update flags, if state-pointer given + */ + if (has_property(b, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) { + if (has_property(a, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) { + /* one more RI is on the left side of the seam, flip state */ + flags ^= CHARACTER_FLAG_RI_ODD; + } else { + /* an RI appeared on the right side but the left + side is not an RI, reset state (number 0 is even) */ + flags &= ~CHARACTER_FLAG_RI_ODD; + } + } + if 
(!(flags & CHARACTER_FLAG_EMOJI) && + ((has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) || + (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)))) { + flags |= CHARACTER_FLAG_EMOJI; + } else if ((flags & CHARACTER_FLAG_EMOJI) && + ((has_property(a, p[0], character_prop, CHARACTER_PROP_ZWJ) && + has_property(b, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) || + (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTEND) && + has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)) || + (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTEND) && + has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) || + (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) || + (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) && + has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)))) { + /* CHARACTER_FLAG_EMOJI remains */ + } else { + flags &= ~CHARACTER_FLAG_EMOJI; + } + + /* write updated flags to state, if given */ + if (state != NULL) { + state->flags = flags; + } + + /* + * apply rules + */ + + /* skip GB1 and GB2, as they are never satisfied here */ + + /* GB3 */ + if (has_property(a, p[0], character_prop, CHARACTER_PROP_CR) && + has_property(b, p[1], character_prop, CHARACTER_PROP_LF)) { + goto nobreak; + } + + /* GB4 */ + if (has_property(a, p[0], character_prop, CHARACTER_PROP_CONTROL) || + has_property(a, p[0], character_prop, CHARACTER_PROP_CR) || + has_property(a, p[0], character_prop, CHARACTER_PROP_LF)) { + goto hasbreak; + } + + /* GB5 */ + if (has_property(b, p[1], character_prop, CHARACTER_PROP_CONTROL) || + has_property(b, p[1], character_prop, CHARACTER_PROP_CR) || + has_property(b, p[1], character_prop, CHARACTER_PROP_LF)) { + 
goto hasbreak; + } + + /* GB6 */ + if (has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_L) && + (has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_L) || + has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_V) || + has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_LV) || + + has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_LVT))) { + goto nobreak; + } + + /* GB7 */ + if ((has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_LV) || + has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_V)) && + (has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_V) || + has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_T))) { + goto nobreak; + } + + /* GB8 */ + if ((has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_LVT) || + has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_T)) && + has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_T)) { + goto nobreak; + } + + /* GB9 */ + if (has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND) || + has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) { + goto nobreak; + } + + /* GB9a */ + if (has_property(b, p[1], character_prop, CHARACTER_PROP_SPACINGMARK)) { + goto nobreak; + } + + /* GB9b */ + if (has_property(a, p[0], character_prop, CHARACTER_PROP_PREPEND)) { + goto nobreak; + } + + /* GB11 */ + if ((flags & CHARACTER_FLAG_EMOJI) && + has_property(a, p[0], character_prop, CHARACTER_PROP_ZWJ) && + has_property(b, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) { + goto nobreak; + } + + /* GB12/GB13 */ + if (has_property(a, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) && + has_property(b, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) && + (flags & CHARACTER_FLAG_RI_ODD)) { + goto nobreak; + } + + /* GB999 */ + goto hasbreak; +nobreak: + isbreak = false; +hasbreak: + if (state != NULL) { + /* move b-state to a-state, discard b-state */ + memcpy(&(state->a), &(state->b), 
sizeof(state->a)); + memset(&(state->b), 0, sizeof(state->b)); + + /* reset flags */ + if (isbreak) { + state->flags = 0; + } + } + + return isbreak; +} + +size_t +lg_character_nextbreak(const char *str) +{ + uint_least32_t cp0, cp1; + size_t ret, len = 0; + LG_SEGMENTATION_STATE state = { 0 }; + + if (str == NULL) { + return 0; + } + + /* + * lg_utf8_decode, when it encounters an unexpected byte, + * does not count it to the error and instead assumes that the + * unexpected byte is the beginning of a new sequence. + * This way, when the string ends with a null byte, we never + * miss it, even if the previous UTF-8 sequence terminates + * unexpectedly, as it would either act as an unexpected byte, + * saved for later, or as a null byte itself, that we can catch. + * We pass (size_t)-1 to the length, as we will never read beyond + * the null byte for the reasons given above. + */ + + /* get first code point */ + len += lg_utf8_decode(str, (size_t)-1, &cp0); + if (cp0 == LG_INVALID_CODE_POINT) { + return len; + } + + while (cp0 != 0) { + /* get next code point */ + ret = lg_utf8_decode(str + len, (size_t)-1, &cp1); + + if (cp1 == LG_INVALID_CODE_POINT || + lg_character_isbreak(cp0, cp1, &state)) { + /* we read an invalid cp or have a breakpoint */ + break; + } else { + /* we don't have a breakpoint, continue */ + len += ret; + } + + /* prepare next round */ + cp0 = cp1; + } + + return len; +} diff --git a/src/grapheme.c b/src/grapheme.c @@ -1,228 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include <stdbool.h> -#include <stddef.h> -#include <stdlib.h> -#include <string.h> - -#include "../gen/grapheme.h" -#include "../grapheme.h" -#include "util.h" - -enum { - GRAPHEME_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ - GRAPHEME_FLAG_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ -}; - -bool -lg_grapheme_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state) -{ - struct lg_internal_heisenstate *p[2] = { 0 }; - uint_least16_t flags = 0; - bool isbreak = true; - - /* set state depending on state pointer */ - if (state != NULL) { - p[0] = &(state->a); - p[1] = &(state->b); - flags = state->flags; - } - - /* skip printable ASCII */ - if ((a >= 0x20 && a <= 0x7E) && - (b >= 0x20 && b <= 0x7E)) { - goto hasbreak; - } - - /* - * Apply grapheme cluster breaking algorithm (UAX #29), see - * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules - */ - - /* - * update flags, if state-pointer given - */ - if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { - if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) { - /* one more RI is on the left side of the seam, flip state */ - flags ^= GRAPHEME_FLAG_RI_ODD; - } else { - /* an RI appeared on the right side but the left - side is not an RI, reset state (number 0 is even) */ - flags &= ~GRAPHEME_FLAG_RI_ODD; - } - } - if (!(flags & GRAPHEME_FLAG_EMOJI) && - ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || - (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { - flags |= GRAPHEME_FLAG_EMOJI; - } else if ((flags & GRAPHEME_FLAG_EMOJI) && - ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) || - (has_property(a, p[0], grapheme_prop, 
GRAPHEME_PROP_EXTEND) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) || - (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || - (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) || - (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) { - /* GRAPHEME_FLAG_EMOJI remains */ - } else { - flags &= ~GRAPHEME_FLAG_EMOJI; - } - - /* write updated flags to state, if given */ - if (state != NULL) { - state->flags = flags; - } - - /* - * apply rules - */ - - /* skip GB1 and GB2, as they are never satisfied here */ - - /* GB3 */ - if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) { - goto nobreak; - } - - /* GB4 */ - if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CONTROL) || - has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) || - has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_LF)) { - goto hasbreak; - } - - /* GB5 */ - if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CONTROL) || - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CR) || - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) { - goto hasbreak; - } - - /* GB6 */ - if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) && - (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) || - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || - - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) { - goto nobreak; - } - - /* GB7 */ - if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) || - has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) && - (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) || - has_property(b, 
p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) { - goto nobreak; - } - - /* GB8 */ - if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) || - has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) { - goto nobreak; - } - - /* GB9 */ - if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND) || - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) { - goto nobreak; - } - - /* GB9a */ - if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) { - goto nobreak; - } - - /* GB9b */ - if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) { - goto nobreak; - } - - /* GB11 */ - if ((flags & GRAPHEME_FLAG_EMOJI) && - has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) { - goto nobreak; - } - - /* GB12/GB13 */ - if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && - has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) && - (flags & GRAPHEME_FLAG_RI_ODD)) { - goto nobreak; - } - - /* GB999 */ - goto hasbreak; -nobreak: - isbreak = false; -hasbreak: - if (state != NULL) { - /* move b-state to a-state, discard b-state */ - memcpy(&(state->a), &(state->b), sizeof(state->a)); - memset(&(state->b), 0, sizeof(state->b)); - - /* reset flags */ - if (isbreak) { - state->flags = 0; - } - } - - return isbreak; -} - -size_t -lg_grapheme_nextbreak(const char *str) -{ - uint_least32_t cp0, cp1; - size_t ret, len = 0; - LG_SEGMENTATION_STATE state = { 0 }; - - if (str == NULL) { - return 0; - } - - /* - * lg_utf8_decode, when it encounters an unexpected byte, - * does not count it to the error and instead assumes that the - * unexpected byte is the beginning of a new sequence. 
- * This way, when the string ends with a null byte, we never - * miss it, even if the previous UTF-8 sequence terminates - * unexpectedly, as it would either act as an unexpected byte, - * saved for later, or as a null byte itself, that we can catch. - * We pass (size_t)-1 to the length, as we will never read beyond - * the null byte for the reasons given above. - */ - - /* get first code point */ - len += lg_utf8_decode(str, (size_t)-1, &cp0); - if (cp0 == LG_INVALID_CODE_POINT) { - return len; - } - - while (cp0 != 0) { - /* get next code point */ - ret = lg_utf8_decode(str + len, (size_t)-1, &cp1); - - if (cp1 == LG_INVALID_CODE_POINT || - lg_grapheme_isbreak(cp0, cp1, &state)) { - /* we read an invalid cp or have a breakpoint */ - break; - } else { - /* we don't have a breakpoint, continue */ - len += ret; - } - - /* prepare next round */ - cp0 = cp1; - } - - return len; -} diff --git a/test/character-performance.c b/test/character-performance.c @@ -0,0 +1,63 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include "../grapheme.h" +#include "../gen/character-test.h" +#include "util.h" + +#define NUM_ITERATIONS 1000 + +int +main(int argc, char *argv[]) +{ + struct timespec start, end; + size_t i, j, bufsiz, off; + uint32_t *buf; + LG_SEGMENTATION_STATE state; + double cp_per_sec; + + (void)argc; + + /* allocate and generate buffer */ + for (i = 0, bufsiz = 0; i < LEN(character_test); i++) { + bufsiz += character_test[i].cplen; + } + if (!(buf = calloc(bufsiz, sizeof(*buf)))) { + fprintf(stderr, "%s: calloc: Out of memory.\n", argv[0]); + return 1; + } + for (i = 0, off = 0; i < LEN(character_test); i++) { + for (j = 0; j < character_test[i].cplen; j++) { + buf[off + j] = character_test[i].cp[j]; + } + off += character_test[i].cplen; + } + + /* run test */ + printf("%s: Running benchmark ", argv[0]); + fflush(stdout); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (i = 0; i < NUM_ITERATIONS; i++) { + memset(&state, 0, sizeof(state)); + for (j = 0; j < bufsiz - 1; j++) { + (void)lg_character_isbreak(buf[j], buf[j+1], &state); + } + if (i % (NUM_ITERATIONS / 10) == 0) { + printf("."); + fflush(stdout); + } + } + clock_gettime(CLOCK_MONOTONIC, &end); + + cp_per_sec = (double)(NUM_ITERATIONS * bufsiz) / + time_diff(&start, &end); + + printf(" %.2e CP/s\n", cp_per_sec); + + return 0; +} diff --git a/test/character.c b/test/character.c @@ -0,0 +1,45 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "../grapheme.h" +#include "../gen/character-test.h" +#include "util.h" + +int +main(int argc, char *argv[]) +{ + LG_SEGMENTATION_STATE state; + size_t i, j, k, len, failed; + + (void)argc; + + /* character break test */ + for (i = 0, failed = 0; i < LEN(character_test); i++) { + memset(&state, 0, sizeof(state)); + for (j = 0, k = 0, len = 1; j < character_test[i].cplen; j++) { + if ((j + 1) == character_test[i].cplen || + lg_character_isbreak(character_test[i].cp[j], + character_test[i].cp[j + 1], + &state)) { + /* check if our resulting length matches */ + if (k == character_test[i].lenlen || + len != character_test[i].len[k++]) { + fprintf(stderr, "%s: Failed test \"%s\".\n", + argv[0], character_test[i].descr); + failed++; + break; + } + len = 1; + } else { + len++; + } + } + } + printf("%s: %zu/%zu tests passed.\n", argv[0], + LEN(character_test) - failed, LEN(character_test)); + + return (failed > 0) ? 1 : 0; +} diff --git a/test/grapheme-performance.c b/test/grapheme-performance.c @@ -1,63 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> - -#include "../grapheme.h" -#include "../gen/grapheme-test.h" -#include "util.h" - -#define NUM_ITERATIONS 1000 - -int -main(int argc, char *argv[]) -{ - struct timespec start, end; - size_t i, j, bufsiz, off; - uint32_t *buf; - LG_SEGMENTATION_STATE state; - double cp_per_sec; - - (void)argc; - - /* allocate and generate buffer */ - for (i = 0, bufsiz = 0; i < LEN(grapheme_test); i++) { - bufsiz += grapheme_test[i].cplen; - } - if (!(buf = calloc(bufsiz, sizeof(*buf)))) { - fprintf(stderr, "%s: calloc: Out of memory.\n", argv[0]); - return 1; - } - for (i = 0, off = 0; i < LEN(grapheme_test); i++) { - for (j = 0; j < grapheme_test[i].cplen; j++) { - buf[off + j] = grapheme_test[i].cp[j]; - } - off += grapheme_test[i].cplen; - } - - /* run test */ - printf("%s: Running benchmark ", argv[0]); - fflush(stdout); - - clock_gettime(CLOCK_MONOTONIC, &start); - for (i = 0; i < NUM_ITERATIONS; i++) { - memset(&state, 0, sizeof(state)); - for (j = 0; j < bufsiz - 1; j++) { - (void)lg_grapheme_isbreak(buf[j], buf[j+1], &state); - } - if (i % (NUM_ITERATIONS / 10) == 0) { - printf("."); - fflush(stdout); - } - } - clock_gettime(CLOCK_MONOTONIC, &end); - - cp_per_sec = (double)(NUM_ITERATIONS * bufsiz) / - time_diff(&start, &end); - - printf(" %.2e CP/s\n", cp_per_sec); - - return 0; -} diff --git a/test/grapheme.c b/test/grapheme.c @@ -1,45 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include <stddef.h> -#include <stdint.h> -#include <stdio.h> -#include <string.h> - -#include "../grapheme.h" -#include "../gen/grapheme-test.h" -#include "util.h" - -int -main(int argc, char *argv[]) -{ - LG_SEGMENTATION_STATE state; - size_t i, j, k, len, failed; - - (void)argc; - - /* grapheme break test */ - for (i = 0, failed = 0; i < LEN(grapheme_test); i++) { - memset(&state, 0, sizeof(state)); - for (j = 0, k = 0, len = 1; j < grapheme_test[i].cplen; j++) { - if ((j + 1) == grapheme_test[i].cplen || - lg_grapheme_isbreak(grapheme_test[i].cp[j], - grapheme_test[i].cp[j + 1], - &state)) { - /* check if our resulting length matches */ - if (k == grapheme_test[i].lenlen || - len != grapheme_test[i].len[k++]) { - fprintf(stderr, "%s: Failed test \"%s\".\n", - argv[0], grapheme_test[i].descr); - failed++; - break; - } - len = 1; - } else { - len++; - } - } - } - printf("%s: %zu/%zu tests passed.\n", argv[0], - LEN(grapheme_test) - failed, LEN(grapheme_test)); - - return (failed > 0) ? 1 : 0; -}