libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit fc071310eecb27fe2a469a64a3154c8db514a779
parent 79ff57ed9cab260e7051d1a9a5e4135921776acd
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 17 Oct 2020 20:57:52 +0200

Refactor directory structure and Makefile

I didn't like that the test lived in the src/ directory, and that we
basically did by hand what the C preprocessor does with an include.
That is why, instead of concatenating generated data with the *_body.c
source files, the sources now simply include the generated data
headers, which are sensibly located in data/.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile | 71 +++++++++++++++++++++++++++++++++++------------------------------------
A src/boundary.c | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/boundary_body.c | 277 -------------------------------------------------------------------------------
D src/test_body.c | 373 -------------------------------------------------------------------------------
A test/test.c | 374 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 689 insertions(+), 686 deletions(-)

diff --git a/Makefile b/Makefile @@ -4,67 +4,66 @@ include config.mk -BIN = src/test -REQ = src/boundary src/codepoint src/grapheme -GBP_URL = https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt -EMO_URL = https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt -GBT_URL = https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest.txt -GBP = data/gbp.txt -EMO = data/emo.txt -GBT = data/gbt.txt +LIB = src/boundary src/codepoint src/grapheme +TEST = test/test +DATA = data/gbp data/emo data/gbt + MAN3 = man/grapheme_bytelen.3 MAN7 = man/libgrapheme.7 -all: libgrapheme.a libgrapheme.so $(BIN) - -test: src/test - ./$< +all: libgrapheme.a libgrapheme.so $(TEST) -src/test: src/test.o $(REQ:=.o) - -src/boundary.o: src/boundary.c config.mk grapheme.h +src/boundary.o: src/boundary.c config.mk data/emo.h data/gbp.h grapheme.h src/codepoint.o: src/codepoint.c config.mk grapheme.h src/grapheme.o: src/grapheme.c config.mk grapheme.h -src/test.o: src/test.c config.mk grapheme.h +test/test.o: test/test.c config.mk data/gbt.h grapheme.h + +test/test: test/test.o $(LIB:=.o) -.o: - $(CC) -o $@ $(LDFLAGS) $< $(REQ:=.o) +test: $(TEST) + for m in $(TEST); do ./$$m; done + +$(TEST): + $(CC) -o $@ $(LDFLAGS) $< $(LIB:=.o) .c.o: $(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $< -libgrapheme.a: $(REQ:=.o) +libgrapheme.a: $(LIB:=.o) $(AR) rc $@ $? $(RANLIB) $@ -libgrapheme.so: $(REQ:=.o) +libgrapheme.so: $(LIB:=.o) $(CC) -o $@ -shared $? 
-src/boundary.c: data/gbp.awk $(GBP) data/emo.awk $(EMO) src/boundary_body.c - printf "/* Automatically generated by gbp.awk and emo.awk */\n" > $@ +data/gbp.h: data/gbp.awk data/gbp.txt + printf "/* Automatically generated by gbp.awk */\n" > $@ + printf "#include <stdint.h>\n\n" >> $@ + awk -f data/gbp.awk data/gbp.txt >> $@ + printf "\n" >> $@ + +data/emo.h: data/emo.awk data/emo.txt + printf "/* Automatically generated by emo.awk */\n" > $@ printf "#include <stdint.h>\n\n" >> $@ - awk -f data/gbp.awk $(GBP) >> $@ - awk -f data/emo.awk $(EMO) >> $@ + awk -f data/emo.awk data/emo.txt >> $@ printf "\n" >> $@ - cat src/boundary_body.c >> $@ -src/test.c: data/gbt.awk $(GBT) src/test_body.c +data/gbt.h: data/gbt.awk data/gbt.txt printf "/* Automatically generated by gbt.awk */\n" > $@ printf "#include <stddef.h>\n" >> $@ printf "#include <stdint.h>\n\n" >> $@ printf "#include \"../grapheme.h\"\n\n" >> $@ - awk -f data/gbt.awk $(GBT) >> $@ + awk -f data/gbt.awk data/gbt.txt >> $@ printf "\n" >> $@ - cat src/test_body.c >> $@ -$(GBP): - wget -O $@ $(GBP_URL) +data/gbp.txt: + wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt -$(EMO): - wget -O $@ $(EMO_URL) +data/emo.txt: + wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt -$(GBT): - wget -O $@ $(GBT_URL) +data/gbt.txt: + wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest.txt install: all mkdir -p "$(DESTDIR)$(LIBPREFIX)" @@ -85,7 +84,7 @@ uninstall: rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h" clean: - rm -f src/boundary.c src/test.c $(REQ:=.o) $(BIN:=.o) $(BIN) libgrapheme.a libgrapheme.so + rm -f $(DATA:=.h) $(LIB:=.o) $(TEST:=.o) $(TEST) libgrapheme.a libgrapheme.so clean-data: - rm -f $(GBP) $(EMO) $(GBT) + rm -f $(DATA:=.txt) diff --git a/src/boundary.c b/src/boundary.c @@ -0,0 +1,280 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> + +#include "../data/emo.h" +#include "../data/gbp.h" + +#define LEN(x) (sizeof(x) / sizeof(*x)) + +enum { + GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ + GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ +}; + +enum cp_property { + PROP_CR, /* carriage return */ + PROP_LF, /* line feed */ + PROP_CONTROL, /* control character */ + PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */ + PROP_ZWJ, /* zero width joiner */ + PROP_RI, /* regional indicator */ + PROP_PREPEND, /* prepend character */ + PROP_SPACINGMARK, /* spacing mark */ + PROP_L, /* hangul syllable type L */ + PROP_V, /* hangul syllable type V */ + PROP_T, /* hangul syllable type T */ + PROP_LV, /* hangul syllable type LV */ + PROP_LVT, /* hangul syllable type LVT */ + PROP_EXTPICT, /* extended pictographic */ +}; + +struct { + const uint32_t (*table)[2]; + size_t tablelen; +} cp_property_tables[] = { + [PROP_CR] = { + .table = cr_table, + .tablelen = LEN(cr_table), + }, + [PROP_LF] = { + .table = lf_table, + .tablelen = LEN(lf_table), + }, + [PROP_CONTROL] = { + .table = control_table, + .tablelen = LEN(control_table), + }, + [PROP_EXTEND] = { + .table = extend_table, + .tablelen = LEN(extend_table), + }, + [PROP_ZWJ] = { + .table = zwj_table, + .tablelen = LEN(zwj_table), + }, + [PROP_RI] = { + .table = ri_table, + .tablelen = LEN(ri_table), + }, + [PROP_PREPEND] = { + .table = prepend_table, + .tablelen = LEN(prepend_table), + }, + [PROP_SPACINGMARK] = { + .table = spacingmark_table, + .tablelen = LEN(spacingmark_table), + }, + [PROP_L] = { + .table = l_table, + .tablelen = LEN(l_table), + }, + [PROP_V] = { + .table = v_table, + .tablelen = LEN(v_table), + }, + [PROP_T] = { + .table = t_table, + .tablelen = LEN(t_table), + }, + [PROP_LV] = { + .table = lv_table, + .tablelen = LEN(lv_table), + }, + [PROP_LVT] = { + .table = lvt_table, + .tablelen = LEN(lvt_table), + }, + 
[PROP_EXTPICT] = { + .table = extpict_table, + .tablelen = LEN(extpict_table), + }, +}; + +struct cp_properties { + uint32_t cp; + int_least16_t determined; + int_least16_t state; +}; + +static int +cp_cmp(const void *a, const void *b) +{ + uint32_t cp = *(uint32_t *)a; + uint32_t *range = (uint32_t *)b; + + return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]); +} + +static int +has_property(struct cp_properties *props, enum cp_property p) +{ + if (!(props->determined & (1 << p))) { + /* not determined yet, do a lookup and set the state */ + if (bsearch(&props->cp, cp_property_tables[p].table, + cp_property_tables[p].tablelen, + sizeof(*cp_property_tables[p].table), + cp_cmp)) { + props->state |= (1 << p); + } else { + props->state &= ~(1 << p); + } + + /* now it's determined */ + props->determined |= (1 << p); + } + + return (props->state & (1 << p)); +} + +int +grapheme_boundary(uint32_t a, uint32_t b, int *state) +{ + struct cp_properties props[] = { + { + .cp = a, + }, + { + .cp = b, + }, + }; + int s; + + /* skip printable ASCII */ + if ((a >= 0x20 && a <= 0x7E) && + (b >= 0x20 && b <= 0x7E)) { + return 1; + } + + /* set internal state based on given state-pointer */ + s = (state != NULL) ? 
*state : 0; + + /* + * Apply grapheme cluster breaking algorithm (UAX #29), see + * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules + */ + + /* + * update state + */ + if (has_property(&props[1], PROP_RI)) { + if (has_property(&props[0], PROP_RI)) { + /* one more RI is on the left side of the seam */ + s ^= GRAPHEME_STATE_RI_ODD; + } else { + /* an RI appeared on the right side but the left + side is not an RI, reset state (0 is even) */ + s &= ~GRAPHEME_STATE_RI_ODD; + } + } + if (!(*state & GRAPHEME_STATE_EMOJI) && + ((has_property(&props[0], PROP_EXTPICT) && + has_property(&props[1], PROP_ZWJ)) || + (has_property(&props[0], PROP_EXTPICT) && + has_property(&props[1], PROP_EXTEND)))) { + s |= GRAPHEME_STATE_EMOJI; + } else if ((*state & GRAPHEME_STATE_EMOJI) && + ((has_property(&props[0], PROP_ZWJ) && + has_property(&props[1], PROP_EXTPICT)) || + (has_property(&props[0], PROP_EXTEND) && + has_property(&props[1], PROP_EXTEND)) || + (has_property(&props[0], PROP_EXTEND) && + has_property(&props[1], PROP_ZWJ)) || + (has_property(&props[0], PROP_EXTPICT) && + has_property(&props[1], PROP_ZWJ)) || + (has_property(&props[0], PROP_EXTPICT) && + has_property(&props[1], PROP_EXTEND)))) { + /* GRAPHEME_STATE_EMOJI remains */ + } else { + s &= ~GRAPHEME_STATE_EMOJI; + } + + /* write updated state to state-pointer, if given */ + if (state != NULL) { + *state = s; + } + + /* + * apply rules + */ + + /* skip GB1 and GB2, as they are never satisfied here */ + + /* GB3 */ + if (has_property(&props[0], PROP_CR) && + has_property(&props[1], PROP_LF)) { + return 0; + } + + /* GB4 */ + if (has_property(&props[0], PROP_CONTROL) || + has_property(&props[0], PROP_CR) || + has_property(&props[0], PROP_LF)) { + return 1; + } + + /* GB5 */ + if (has_property(&props[1], PROP_CONTROL) || + has_property(&props[1], PROP_CR) || + has_property(&props[1], PROP_LF)) { + return 1; + } + + /* GB6 */ + if (has_property(&props[0], PROP_L) && + (has_property(&props[1], PROP_L) || + 
has_property(&props[1], PROP_V) || + has_property(&props[1], PROP_LV) || + has_property(&props[1], PROP_LVT))) { + return 0; + } + + /* GB7 */ + if ((has_property(&props[0], PROP_LV) || + has_property(&props[0], PROP_V)) && + (has_property(&props[1], PROP_V) || + has_property(&props[1], PROP_T))) { + return 0; + } + + /* GB8 */ + if ((has_property(&props[0], PROP_LVT) || + has_property(&props[0], PROP_T)) && + has_property(&props[1], PROP_T)) { + return 0; + } + + /* GB9 */ + if (has_property(&props[1], PROP_EXTEND) || + has_property(&props[1], PROP_ZWJ)) { + return 0; + } + + /* GB9a */ + if (has_property(&props[1], PROP_SPACINGMARK)) { + return 0; + } + + /* GB9b */ + if (has_property(&props[0], PROP_PREPEND)) { + return 0; + } + + /* GB11 */ + if ((s & GRAPHEME_STATE_EMOJI) && + has_property(&props[0], PROP_ZWJ) && + has_property(&props[1], PROP_EXTPICT)) { + return 0; + } + + /* GB12/GB13 */ + if (has_property(&props[0], PROP_RI) && + has_property(&props[1], PROP_RI) && + (s & GRAPHEME_STATE_RI_ODD)) { + return 0; + } + + /* GB999 */ + return 1; +} diff --git a/src/boundary_body.c b/src/boundary_body.c @@ -1,277 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include <stddef.h> -#include <stdint.h> -#include <stdlib.h> - -#define LEN(x) (sizeof(x) / sizeof(*x)) - -enum { - GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */ - GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */ -}; - -enum cp_property { - PROP_CR, /* carriage return */ - PROP_LF, /* line feed */ - PROP_CONTROL, /* control character */ - PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */ - PROP_ZWJ, /* zero width joiner */ - PROP_RI, /* regional indicator */ - PROP_PREPEND, /* prepend character */ - PROP_SPACINGMARK, /* spacing mark */ - PROP_L, /* hangul syllable type L */ - PROP_V, /* hangul syllable type V */ - PROP_T, /* hangul syllable type T */ - PROP_LV, /* hangul syllable type LV */ - PROP_LVT, /* hangul syllable type LVT */ - PROP_EXTPICT, /* extended pictographic */ -}; - -struct { - const uint32_t (*table)[2]; - size_t tablelen; -} cp_property_tables[] = { - [PROP_CR] = { - .table = cr_table, - .tablelen = LEN(cr_table), - }, - [PROP_LF] = { - .table = lf_table, - .tablelen = LEN(lf_table), - }, - [PROP_CONTROL] = { - .table = control_table, - .tablelen = LEN(control_table), - }, - [PROP_EXTEND] = { - .table = extend_table, - .tablelen = LEN(extend_table), - }, - [PROP_ZWJ] = { - .table = zwj_table, - .tablelen = LEN(zwj_table), - }, - [PROP_RI] = { - .table = ri_table, - .tablelen = LEN(ri_table), - }, - [PROP_PREPEND] = { - .table = prepend_table, - .tablelen = LEN(prepend_table), - }, - [PROP_SPACINGMARK] = { - .table = spacingmark_table, - .tablelen = LEN(spacingmark_table), - }, - [PROP_L] = { - .table = l_table, - .tablelen = LEN(l_table), - }, - [PROP_V] = { - .table = v_table, - .tablelen = LEN(v_table), - }, - [PROP_T] = { - .table = t_table, - .tablelen = LEN(t_table), - }, - [PROP_LV] = { - .table = lv_table, - .tablelen = LEN(lv_table), - }, - [PROP_LVT] = { - .table = lvt_table, - .tablelen = LEN(lvt_table), - }, - [PROP_EXTPICT] = { - .table = extpict_table, - .tablelen 
= LEN(extpict_table), - }, -}; - -struct cp_properties { - uint32_t cp; - int_least16_t determined; - int_least16_t state; -}; - -static int -cp_cmp(const void *a, const void *b) -{ - uint32_t cp = *(uint32_t *)a; - uint32_t *range = (uint32_t *)b; - - return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]); -} - -static int -has_property(struct cp_properties *props, enum cp_property p) -{ - if (!(props->determined & (1 << p))) { - /* not determined yet, do a lookup and set the state */ - if (bsearch(&props->cp, cp_property_tables[p].table, - cp_property_tables[p].tablelen, - sizeof(*cp_property_tables[p].table), - cp_cmp)) { - props->state |= (1 << p); - } else { - props->state &= ~(1 << p); - } - - /* now it's determined */ - props->determined |= (1 << p); - } - - return (props->state & (1 << p)); -} - -int -grapheme_boundary(uint32_t a, uint32_t b, int *state) -{ - struct cp_properties props[] = { - { - .cp = a, - }, - { - .cp = b, - }, - }; - int s; - - /* skip printable ASCII */ - if ((a >= 0x20 && a <= 0x7E) && - (b >= 0x20 && b <= 0x7E)) { - return 1; - } - - /* set internal state based on given state-pointer */ - s = (state != NULL) ? 
*state : 0; - - /* - * Apply grapheme cluster breaking algorithm (UAX #29), see - * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules - */ - - /* - * update state - */ - if (has_property(&props[1], PROP_RI)) { - if (has_property(&props[0], PROP_RI)) { - /* one more RI is on the left side of the seam */ - s ^= GRAPHEME_STATE_RI_ODD; - } else { - /* an RI appeared on the right side but the left - side is not an RI, reset state (0 is even) */ - s &= ~GRAPHEME_STATE_RI_ODD; - } - } - if (!(*state & GRAPHEME_STATE_EMOJI) && - ((has_property(&props[0], PROP_EXTPICT) && - has_property(&props[1], PROP_ZWJ)) || - (has_property(&props[0], PROP_EXTPICT) && - has_property(&props[1], PROP_EXTEND)))) { - s |= GRAPHEME_STATE_EMOJI; - } else if ((*state & GRAPHEME_STATE_EMOJI) && - ((has_property(&props[0], PROP_ZWJ) && - has_property(&props[1], PROP_EXTPICT)) || - (has_property(&props[0], PROP_EXTEND) && - has_property(&props[1], PROP_EXTEND)) || - (has_property(&props[0], PROP_EXTEND) && - has_property(&props[1], PROP_ZWJ)) || - (has_property(&props[0], PROP_EXTPICT) && - has_property(&props[1], PROP_ZWJ)) || - (has_property(&props[0], PROP_EXTPICT) && - has_property(&props[1], PROP_EXTEND)))) { - /* GRAPHEME_STATE_EMOJI remains */ - } else { - s &= ~GRAPHEME_STATE_EMOJI; - } - - /* write updated state to state-pointer, if given */ - if (state != NULL) { - *state = s; - } - - /* - * apply rules - */ - - /* skip GB1 and GB2, as they are never satisfied here */ - - /* GB3 */ - if (has_property(&props[0], PROP_CR) && - has_property(&props[1], PROP_LF)) { - return 0; - } - - /* GB4 */ - if (has_property(&props[0], PROP_CONTROL) || - has_property(&props[0], PROP_CR) || - has_property(&props[0], PROP_LF)) { - return 1; - } - - /* GB5 */ - if (has_property(&props[1], PROP_CONTROL) || - has_property(&props[1], PROP_CR) || - has_property(&props[1], PROP_LF)) { - return 1; - } - - /* GB6 */ - if (has_property(&props[0], PROP_L) && - (has_property(&props[1], PROP_L) || - 
has_property(&props[1], PROP_V) || - has_property(&props[1], PROP_LV) || - has_property(&props[1], PROP_LVT))) { - return 0; - } - - /* GB7 */ - if ((has_property(&props[0], PROP_LV) || - has_property(&props[0], PROP_V)) && - (has_property(&props[1], PROP_V) || - has_property(&props[1], PROP_T))) { - return 0; - } - - /* GB8 */ - if ((has_property(&props[0], PROP_LVT) || - has_property(&props[0], PROP_T)) && - has_property(&props[1], PROP_T)) { - return 0; - } - - /* GB9 */ - if (has_property(&props[1], PROP_EXTEND) || - has_property(&props[1], PROP_ZWJ)) { - return 0; - } - - /* GB9a */ - if (has_property(&props[1], PROP_SPACINGMARK)) { - return 0; - } - - /* GB9b */ - if (has_property(&props[0], PROP_PREPEND)) { - return 0; - } - - /* GB11 */ - if ((s & GRAPHEME_STATE_EMOJI) && - has_property(&props[0], PROP_ZWJ) && - has_property(&props[1], PROP_EXTPICT)) { - return 0; - } - - /* GB12/GB13 */ - if (has_property(&props[0], PROP_RI) && - has_property(&props[1], PROP_RI) && - (s & GRAPHEME_STATE_RI_ODD)) { - return 0; - } - - /* GB999 */ - return 1; -} diff --git a/src/test_body.c b/src/test_body.c @@ -1,373 +0,0 @@ -/* See LICENSE file for copyright and license details. 
*/ -#include <stddef.h> -#include <stdint.h> -#include <stdio.h> -#include <string.h> - -#include "../grapheme.h" - -#define LEN(x) (sizeof(x) / sizeof(*x)) - -static const struct { - uint32_t cp; /* input code point */ - uint8_t *exp_arr; /* expected UTF-8 byte sequence */ - size_t exp_len; /* expected length of UTF-8 sequence */ -} enc_test[] = { - { - /* invalid code point (UTF-16 surrogate half) */ - .cp = UINT32_C(0xD800), - .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, - .exp_len = 3, - }, - { - /* invalid code point (UTF-16-unrepresentable) */ - .cp = UINT32_C(0x110000), - .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, - .exp_len = 3, - }, - { - /* code point encoded to a 1-byte sequence */ - .cp = 0x01, - .exp_arr = (uint8_t[]){ 0x01 }, - .exp_len = 1, - }, - { - /* code point encoded to a 2-byte sequence */ - .cp = 0xFF, - .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, - .exp_len = 2, - }, - { - /* code point encoded to a 3-byte sequence */ - .cp = 0xFFF, - .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, - .exp_len = 3, - }, - { - /* code point encoded to a 4-byte sequence */ - .cp = UINT32_C(0xFFFFF), - .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, - .exp_len = 4, - }, -}; - -static const struct { - uint8_t *arr; /* UTF-8 byte sequence */ - size_t len; /* length of UTF-8 byte sequence */ - size_t exp_len; /* expected length returned */ - uint32_t exp_cp; /* expected code point returned */ -} dec_test[] = { - { - /* empty sequence - * [ ] -> - * INVALID - */ - .arr = NULL, - .len = 0, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid lead byte - * [ 11111101 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xFD }, - .len = 1, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* valid 1-byte sequence - * [ 00000001 ] -> - * 0000001 - */ - .arr = (uint8_t[]){ 0x01 }, - .len = 1, - .exp_len = 1, - .exp_cp = 0x1, - }, - { - /* valid 2-byte sequence - * [ 11000011 10111111 ] -> - * 00011111111 - */ - .arr = (uint8_t[]){ 0xC3, 0xBF }, - .len = 2, - 
.exp_len = 2, - .exp_cp = 0xFF, - }, - { - /* invalid 2-byte sequence (second byte missing) - * [ 11000011 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xC3 }, - .len = 1, - .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 2-byte sequence (second byte malformed) - * [ 11000011 11111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xC3, 0xFF }, - .len = 2, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 2-byte sequence (overlong encoded) - * [ 11000001 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xC1, 0xBF }, - .len = 2, - .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* valid 3-byte sequence - * [ 11100000 10111111 10111111 ] -> - * 0000111111111111 - */ - .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, - .len = 3, - .exp_len = 3, - .exp_cp = 0xFFF, - }, - { - /* invalid 3-byte sequence (second byte missing) - * [ 11100000 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0 }, - .len = 1, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (second byte malformed) - * [ 11100000 01111111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, - .len = 3, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (third byte missing) - * [ 11100000 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0, 0xBF }, - .len = 2, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (third byte malformed) - * [ 11100000 10111111 01111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, - .len = 3, - .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (overlong encoded) - * [ 11100000 10011111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, - .len = 3, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 3-byte sequence (UTF-16 surrogate half) - * [ 11101101 10100000 10000000 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 
0xED, 0xA0, 0x80 }, - .len = 3, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* valid 4-byte sequence - * [ 11110011 10111111 10111111 10111111 ] -> - * 011111111111111111111 - */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, - .len = 4, - .exp_len = 4, - .exp_cp = UINT32_C(0xFFFFF), - }, - { - /* invalid 4-byte sequence (second byte missing) - * [ 11110011 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3 }, - .len = 1, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (second byte malformed) - * [ 11110011 01111111 10111111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, - .len = 4, - .exp_len = 1, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (third byte missing) - * [ 11110011 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0xBF }, - .len = 2, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (third byte malformed) - * [ 11110011 10111111 01111111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, - .len = 4, - .exp_len = 2, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (fourth byte missing) - * [ 11110011 10111111 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, - .len = 3, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (fourth byte malformed) - * [ 11110011 10111111 10111111 01111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, - .len = 4, - .exp_len = 3, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (overlong encoded) - * [ 11110000 10000000 10000001 10111111 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, - .len = 4, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, - { - /* invalid 4-byte sequence (UTF-16-unrepresentable) - * [ 11110100 10010000 10000000 10000000 ] -> - * INVALID - */ - .arr = (uint8_t[]){ 0xF4, 
0x90, 0x80, 0x80 }, - .len = 4, - .exp_len = 4, - .exp_cp = GRAPHEME_CP_INVALID, - }, -}; - -int -main(void) -{ - int state; - size_t i, j, k, len, failed; - - /* UTF-8 encoder test */ - for (i = 0, failed = 0; i < LEN(enc_test); i++) { - uint8_t arr[4]; - size_t len; - - len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr)); - - if (len != enc_test[i].exp_len || - memcmp(arr, enc_test[i].exp_arr, len)) { - fprintf(stderr, "Failed UTF-8-encoder test %zu: " - "Expected (", i); - for (j = 0; j < enc_test[i].exp_len; j++) { - fprintf(stderr, "0x%x", - enc_test[i].exp_arr[j]); - if (j + 1 < enc_test[i].exp_len) { - fprintf(stderr, " "); - } - } - fprintf(stderr, "), but got ("); - for (j = 0; j < len; j++) { - fprintf(stderr, "0x%x", arr[j]); - if (j + 1 < len) { - fprintf(stderr, " "); - } - } - fprintf(stderr, ")\n"); - failed++; - } - } - printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n", - LEN(enc_test) - failed, LEN(enc_test)); - - /* UTF-8 decoder test */ - for (i = 0, failed = 0; i < LEN(dec_test); i++) { - size_t len; - uint32_t cp; - - len = grapheme_cp_decode(&cp, dec_test[i].arr, - dec_test[i].len); - - if (len != dec_test[i].exp_len || - cp != dec_test[i].exp_cp) { - fprintf(stderr, "Failed UTF-8-decoder test %zu: " - "Expected (%zx,%u), but got (%zx,%u)\n", - i, dec_test[i].exp_len, - dec_test[i].exp_cp, len, cp); - failed++; - } - } - printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n", - LEN(dec_test) - failed, LEN(dec_test)); - - /* grapheme break test */ - for (i = 0, failed = 0; i < LEN(t); i++) { - for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { - if ((j + 1) == t[i].cplen || - grapheme_boundary(t[i].cp[j], t[i].cp[j + 1], - &state)) { - /* check if our resulting length matches */ - if (k == t[i].lenlen || len != t[i].len[k++]) { - fprintf(stderr, "Failed \"%s\"\n", - t[i].descr); - failed++; - break; - } - len = 1; - } else { - len++; - } - } - } - printf("Grapheme break test: Passed %zu out of %zu tests.\n", - 
LEN(t) - failed, LEN(t)); - - return (failed > 0) ? 1 : 0; -} diff --git a/test/test.c b/test/test.c @@ -0,0 +1,374 @@ +/* See LICENSE file for copyright and license details. */ +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "../grapheme.h" +#include "../data/gbt.h" + +#define LEN(x) (sizeof(x) / sizeof(*x)) + +static const struct { + uint32_t cp; /* input code point */ + uint8_t *exp_arr; /* expected UTF-8 byte sequence */ + size_t exp_len; /* expected length of UTF-8 sequence */ +} enc_test[] = { + { + /* invalid code point (UTF-16 surrogate half) */ + .cp = UINT32_C(0xD800), + .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, + .exp_len = 3, + }, + { + /* invalid code point (UTF-16-unrepresentable) */ + .cp = UINT32_C(0x110000), + .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD }, + .exp_len = 3, + }, + { + /* code point encoded to a 1-byte sequence */ + .cp = 0x01, + .exp_arr = (uint8_t[]){ 0x01 }, + .exp_len = 1, + }, + { + /* code point encoded to a 2-byte sequence */ + .cp = 0xFF, + .exp_arr = (uint8_t[]){ 0xC3, 0xBF }, + .exp_len = 2, + }, + { + /* code point encoded to a 3-byte sequence */ + .cp = 0xFFF, + .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, + .exp_len = 3, + }, + { + /* code point encoded to a 4-byte sequence */ + .cp = UINT32_C(0xFFFFF), + .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, + .exp_len = 4, + }, +}; + +static const struct { + uint8_t *arr; /* UTF-8 byte sequence */ + size_t len; /* length of UTF-8 byte sequence */ + size_t exp_len; /* expected length returned */ + uint32_t exp_cp; /* expected code point returned */ +} dec_test[] = { + { + /* empty sequence + * [ ] -> + * INVALID + */ + .arr = NULL, + .len = 0, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid lead byte + * [ 11111101 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xFD }, + .len = 1, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* valid 1-byte sequence + * [ 00000001 ] -> + * 0000001 + */ + .arr = 
(uint8_t[]){ 0x01 }, + .len = 1, + .exp_len = 1, + .exp_cp = 0x1, + }, + { + /* valid 2-byte sequence + * [ 11000011 10111111 ] -> + * 00011111111 + */ + .arr = (uint8_t[]){ 0xC3, 0xBF }, + .len = 2, + .exp_len = 2, + .exp_cp = 0xFF, + }, + { + /* invalid 2-byte sequence (second byte missing) + * [ 11000011 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC3 }, + .len = 1, + .exp_len = 2, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 2-byte sequence (second byte malformed) + * [ 11000011 11111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC3, 0xFF }, + .len = 2, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 2-byte sequence (overlong encoded) + * [ 11000001 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xC1, 0xBF }, + .len = 2, + .exp_len = 2, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* valid 3-byte sequence + * [ 11100000 10111111 10111111 ] -> + * 0000111111111111 + */ + .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF }, + .len = 3, + .exp_len = 3, + .exp_cp = 0xFFF, + }, + { + /* invalid 3-byte sequence (second byte missing) + * [ 11100000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0 }, + .len = 1, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (second byte malformed) + * [ 11100000 01111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF }, + .len = 3, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (third byte missing) + * [ 11100000 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0xBF }, + .len = 2, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (third byte malformed) + * [ 11100000 10111111 01111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F }, + .len = 3, + .exp_len = 2, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (overlong encoded) + * [ 11100000 10011111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF }, 
+ .len = 3, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 3-byte sequence (UTF-16 surrogate half) + * [ 11101101 10100000 10000000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 }, + .len = 3, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* valid 4-byte sequence + * [ 11110011 10111111 10111111 10111111 ] -> + * 011111111111111111111 + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF }, + .len = 4, + .exp_len = 4, + .exp_cp = UINT32_C(0xFFFFF), + }, + { + /* invalid 4-byte sequence (second byte missing) + * [ 11110011 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3 }, + .len = 1, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (second byte malformed) + * [ 11110011 01111111 10111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF }, + .len = 4, + .exp_len = 1, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (third byte missing) + * [ 11110011 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF }, + .len = 2, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (third byte malformed) + * [ 11110011 10111111 01111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF }, + .len = 4, + .exp_len = 2, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (fourth byte missing) + * [ 11110011 10111111 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF }, + .len = 3, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (fourth byte malformed) + * [ 11110011 10111111 10111111 01111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F }, + .len = 4, + .exp_len = 3, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (overlong encoded) + * [ 11110000 10000000 10000001 10111111 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF }, + .len = 4, + 
.exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, + { + /* invalid 4-byte sequence (UTF-16-unrepresentable) + * [ 11110100 10010000 10000000 10000000 ] -> + * INVALID + */ + .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 }, + .len = 4, + .exp_len = 4, + .exp_cp = GRAPHEME_CP_INVALID, + }, +}; + +int +main(void) +{ + int state; + size_t i, j, k, len, failed; + + /* UTF-8 encoder test */ + for (i = 0, failed = 0; i < LEN(enc_test); i++) { + uint8_t arr[4]; + size_t len; + + len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr)); + + if (len != enc_test[i].exp_len || + memcmp(arr, enc_test[i].exp_arr, len)) { + fprintf(stderr, "Failed UTF-8-encoder test %zu: " + "Expected (", i); + for (j = 0; j < enc_test[i].exp_len; j++) { + fprintf(stderr, "0x%x", + enc_test[i].exp_arr[j]); + if (j + 1 < enc_test[i].exp_len) { + fprintf(stderr, " "); + } + } + fprintf(stderr, "), but got ("); + for (j = 0; j < len; j++) { + fprintf(stderr, "0x%x", arr[j]); + if (j + 1 < len) { + fprintf(stderr, " "); + } + } + fprintf(stderr, ")\n"); + failed++; + } + } + printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n", + LEN(enc_test) - failed, LEN(enc_test)); + + /* UTF-8 decoder test */ + for (i = 0, failed = 0; i < LEN(dec_test); i++) { + size_t len; + uint32_t cp; + + len = grapheme_cp_decode(&cp, dec_test[i].arr, + dec_test[i].len); + + if (len != dec_test[i].exp_len || + cp != dec_test[i].exp_cp) { + fprintf(stderr, "Failed UTF-8-decoder test %zu: " + "Expected (%zx,%u), but got (%zx,%u)\n", + i, dec_test[i].exp_len, + dec_test[i].exp_cp, len, cp); + failed++; + } + } + printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n", + LEN(dec_test) - failed, LEN(dec_test)); + + /* grapheme break test */ + for (i = 0, failed = 0; i < LEN(t); i++) { + for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) { + if ((j + 1) == t[i].cplen || + grapheme_boundary(t[i].cp[j], t[i].cp[j + 1], + &state)) { + /* check if our resulting length matches */ + if (k == t[i].lenlen || len != 
t[i].len[k++]) { + fprintf(stderr, "Failed \"%s\"\n", + t[i].descr); + failed++; + break; + } + len = 1; + } else { + len++; + } + } + } + printf("Grapheme break test: Passed %zu out of %zu tests.\n", + LEN(t) - failed, LEN(t)); + + return (failed > 0) ? 1 : 0; +}