commit fc071310eecb27fe2a469a64a3154c8db514a779
parent 79ff57ed9cab260e7051d1a9a5e4135921776acd
Author: Laslo Hunhold <dev@frign.de>
Date: Sat, 17 Oct 2020 20:57:52 +0200
Refactor directory structure and Makefile
I didn't like that the test lived in the src/ directory and that we
basically did by hand what the C preprocessor does with an include.
That is why, instead of concatenating the generated data with those
*_body.c source files, we now simply include the headers of the data
we generated, which are now sensibly located in data/.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
M | Makefile | | | 71 | +++++++++++++++++++++++++++++++++++------------------------------------ |
A | src/boundary.c | | | 280 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
D | src/boundary_body.c | | | 277 | ------------------------------------------------------------------------------- |
D | src/test_body.c | | | 373 | ------------------------------------------------------------------------------- |
A | test/test.c | | | 374 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
5 files changed, 689 insertions(+), 686 deletions(-)
diff --git a/Makefile b/Makefile
@@ -4,67 +4,66 @@
include config.mk
-BIN = src/test
-REQ = src/boundary src/codepoint src/grapheme
-GBP_URL = https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
-EMO_URL = https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt
-GBT_URL = https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest.txt
-GBP = data/gbp.txt
-EMO = data/emo.txt
-GBT = data/gbt.txt
+LIB = src/boundary src/codepoint src/grapheme
+TEST = test/test
+DATA = data/gbp data/emo data/gbt
+
MAN3 = man/grapheme_bytelen.3
MAN7 = man/libgrapheme.7
-all: libgrapheme.a libgrapheme.so $(BIN)
-
-test: src/test
- ./$<
+all: libgrapheme.a libgrapheme.so $(TEST)
-src/test: src/test.o $(REQ:=.o)
-
-src/boundary.o: src/boundary.c config.mk grapheme.h
+src/boundary.o: src/boundary.c config.mk data/emo.h data/gbp.h grapheme.h
src/codepoint.o: src/codepoint.c config.mk grapheme.h
src/grapheme.o: src/grapheme.c config.mk grapheme.h
-src/test.o: src/test.c config.mk grapheme.h
+test/test.o: test/test.c config.mk data/gbt.h grapheme.h
+
+test/test: test/test.o $(LIB:=.o)
-.o:
- $(CC) -o $@ $(LDFLAGS) $< $(REQ:=.o)
+# Run every test binary; abort on the first one that exits non-zero so
+# that a failure is not masked by a later, successful test (the exit
+# status of a shell for-loop is that of the last command it ran)
+test: $(TEST)
+	for m in $(TEST); do ./$$m || exit 1; done
+
+$(TEST):
+ $(CC) -o $@ $(LDFLAGS) $< $(LIB:=.o)
.c.o:
$(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $<
-libgrapheme.a: $(REQ:=.o)
+libgrapheme.a: $(LIB:=.o)
$(AR) rc $@ $?
$(RANLIB) $@
-libgrapheme.so: $(REQ:=.o)
-	$(CC) -o $@ -shared $?
+# Always link the complete object list: $? expands only to the
+# prerequisites newer than the target, so an incremental rebuild would
+# otherwise yield a shared library lacking every unchanged object
+libgrapheme.so: $(LIB:=.o)
+	$(CC) -o $@ -shared $(LIB:=.o)
-src/boundary.c: data/gbp.awk $(GBP) data/emo.awk $(EMO) src/boundary_body.c
- printf "/* Automatically generated by gbp.awk and emo.awk */\n" > $@
+data/gbp.h: data/gbp.awk data/gbp.txt
+ printf "/* Automatically generated by gbp.awk */\n" > $@
+ printf "#include <stdint.h>\n\n" >> $@
+ awk -f data/gbp.awk data/gbp.txt >> $@
+ printf "\n" >> $@
+
+data/emo.h: data/emo.awk data/emo.txt
+ printf "/* Automatically generated by emo.awk */\n" > $@
printf "#include <stdint.h>\n\n" >> $@
- awk -f data/gbp.awk $(GBP) >> $@
- awk -f data/emo.awk $(EMO) >> $@
+ awk -f data/emo.awk data/emo.txt >> $@
printf "\n" >> $@
- cat src/boundary_body.c >> $@
-src/test.c: data/gbt.awk $(GBT) src/test_body.c
+data/gbt.h: data/gbt.awk data/gbt.txt
printf "/* Automatically generated by gbt.awk */\n" > $@
printf "#include <stddef.h>\n" >> $@
printf "#include <stdint.h>\n\n" >> $@
printf "#include \"../grapheme.h\"\n\n" >> $@
- awk -f data/gbt.awk $(GBT) >> $@
+ awk -f data/gbt.awk data/gbt.txt >> $@
printf "\n" >> $@
- cat src/test_body.c >> $@
-$(GBP):
-	wget -O $@ $(GBP_URL)
+# wget creates its -O output file even if the download fails; remove
+# the leftover so that the next run retries instead of using a bad file
+data/gbp.txt:
+	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt || { rm -f $@; exit 1; }
-$(EMO):
-	wget -O $@ $(EMO_URL)
+# wget creates its -O output file even if the download fails; remove
+# the leftover so that the next run retries instead of using a bad file
+data/emo.txt:
+	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt || { rm -f $@; exit 1; }
-$(GBT):
-	wget -O $@ $(GBT_URL)
+# wget creates its -O output file even if the download fails; remove
+# the leftover so that the next run retries instead of using a bad file
+data/gbt.txt:
+	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest.txt || { rm -f $@; exit 1; }
install: all
mkdir -p "$(DESTDIR)$(LIBPREFIX)"
@@ -85,7 +84,7 @@ uninstall:
rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"
clean:
- rm -f src/boundary.c src/test.c $(REQ:=.o) $(BIN:=.o) $(BIN) libgrapheme.a libgrapheme.so
+ rm -f $(DATA:=.h) $(LIB:=.o) $(TEST:=.o) $(TEST) libgrapheme.a libgrapheme.so
clean-data:
- rm -f $(GBP) $(EMO) $(GBT)
+ rm -f $(DATA:=.txt)
diff --git a/src/boundary.c b/src/boundary.c
@@ -0,0 +1,280 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "../data/emo.h"
+#include "../data/gbp.h"
+
+#define LEN(x) (sizeof(x) / sizeof(*x))
+
+enum {
+ GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
+ GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
+};
+
+enum cp_property {
+ PROP_CR, /* carriage return */
+ PROP_LF, /* line feed */
+ PROP_CONTROL, /* control character */
+ PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */
+ PROP_ZWJ, /* zero width joiner */
+ PROP_RI, /* regional indicator */
+ PROP_PREPEND, /* prepend character */
+ PROP_SPACINGMARK, /* spacing mark */
+ PROP_L, /* hangul syllable type L */
+ PROP_V, /* hangul syllable type V */
+ PROP_T, /* hangul syllable type T */
+ PROP_LV, /* hangul syllable type LV */
+ PROP_LVT, /* hangul syllable type LVT */
+ PROP_EXTPICT, /* extended pictographic */
+};
+
+struct {
+ const uint32_t (*table)[2];
+ size_t tablelen;
+} cp_property_tables[] = {
+ [PROP_CR] = {
+ .table = cr_table,
+ .tablelen = LEN(cr_table),
+ },
+ [PROP_LF] = {
+ .table = lf_table,
+ .tablelen = LEN(lf_table),
+ },
+ [PROP_CONTROL] = {
+ .table = control_table,
+ .tablelen = LEN(control_table),
+ },
+ [PROP_EXTEND] = {
+ .table = extend_table,
+ .tablelen = LEN(extend_table),
+ },
+ [PROP_ZWJ] = {
+ .table = zwj_table,
+ .tablelen = LEN(zwj_table),
+ },
+ [PROP_RI] = {
+ .table = ri_table,
+ .tablelen = LEN(ri_table),
+ },
+ [PROP_PREPEND] = {
+ .table = prepend_table,
+ .tablelen = LEN(prepend_table),
+ },
+ [PROP_SPACINGMARK] = {
+ .table = spacingmark_table,
+ .tablelen = LEN(spacingmark_table),
+ },
+ [PROP_L] = {
+ .table = l_table,
+ .tablelen = LEN(l_table),
+ },
+ [PROP_V] = {
+ .table = v_table,
+ .tablelen = LEN(v_table),
+ },
+ [PROP_T] = {
+ .table = t_table,
+ .tablelen = LEN(t_table),
+ },
+ [PROP_LV] = {
+ .table = lv_table,
+ .tablelen = LEN(lv_table),
+ },
+ [PROP_LVT] = {
+ .table = lvt_table,
+ .tablelen = LEN(lvt_table),
+ },
+ [PROP_EXTPICT] = {
+ .table = extpict_table,
+ .tablelen = LEN(extpict_table),
+ },
+};
+
+struct cp_properties {
+ uint32_t cp;
+ int_least16_t determined;
+ int_least16_t state;
+};
+
+/*
+ * Comparison callback for bsearch(): a points to the code point being
+ * looked up and b to a table entry holding an inclusive range
+ * { first, last }. Returns 0 if the code point lies within the range,
+ * a negative value if it lies below it and a positive value if it
+ * lies above it.
+ */
+static int
+cp_cmp(const void *a, const void *b)
+{
+	const uint32_t cp = *(const uint32_t *)a;
+	const uint32_t *range = (const uint32_t *)b;
+
+	if (cp < range[0]) {
+		return -1;
+	}
+
+	/*
+	 * compare explicitly instead of returning cp - range[0], as the
+	 * unsigned difference does not necessarily fit into an int and
+	 * could end up with the wrong sign
+	 */
+	return (cp > range[1]) ? 1 : 0;
+}
+
+/*
+ * Report whether the code point stored in props has property p. The
+ * answer is looked up in the property's range table on first use and
+ * cached in props (one bit per property in determined and state), so
+ * repeated queries for the same property do not search again. Returns
+ * non-zero if the code point has the property and 0 otherwise.
+ */
+static int
+has_property(struct cp_properties *props, enum cp_property p)
+{
+	const int mask = 1 << p;
+	const void *hit;
+
+	/* answer already cached by an earlier call */
+	if (props->determined & mask) {
+		return (props->state & mask);
+	}
+
+	/* first query for this property: search its range table */
+	hit = bsearch(&props->cp, cp_property_tables[p].table,
+	              cp_property_tables[p].tablelen,
+	              sizeof(*cp_property_tables[p].table),
+	              cp_cmp);
+
+	if (hit != NULL) {
+		props->state |= mask;
+	} else {
+		props->state &= ~mask;
+	}
+
+	/* remember that the lookup has been done */
+	props->determined |= mask;
+
+	return (props->state & mask);
+}
+
+/*
+ * Determine whether there is a grapheme cluster break between the two
+ * adjacent code points a and b. state may be NULL; otherwise it points
+ * to an integer (0 for a fresh sequence) that carries the
+ * GRAPHEME_STATE_* flags from one call to the next while a sequence of
+ * code points is processed pair by pair. Returns 1 if there is a break
+ * between a and b and 0 if there is none.
+ */
+int
+grapheme_boundary(uint32_t a, uint32_t b, int *state)
+{
+	struct cp_properties props[] = {
+		{
+			.cp = a,
+		},
+		{
+			.cp = b,
+		},
+	};
+	int s;
+
+	/* skip printable ASCII */
+	if ((a >= 0x20 && a <= 0x7E) &&
+	    (b >= 0x20 && b <= 0x7E)) {
+		return 1;
+	}
+
+	/* set internal state based on given state-pointer */
+	s = (state != NULL) ? *state : 0;
+
+	/*
+	 * Apply grapheme cluster breaking algorithm (UAX #29), see
+	 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+	 */
+
+	/*
+	 * update state
+	 */
+	if (has_property(&props[1], PROP_RI)) {
+		if (has_property(&props[0], PROP_RI)) {
+			/* one more RI is on the left side of the seam */
+			s ^= GRAPHEME_STATE_RI_ODD;
+		} else {
+			/* an RI appeared on the right side but the left
+			   side is not an RI, reset state (0 is even) */
+			s &= ~GRAPHEME_STATE_RI_ODD;
+		}
+	}
+
+	/*
+	 * test the local copy s rather than *state, as state may be
+	 * NULL; only GRAPHEME_STATE_RI_ODD has been touched in s so
+	 * far, so its emoji flag still holds the incoming value
+	 */
+	if (!(s & GRAPHEME_STATE_EMOJI) &&
+	    ((has_property(&props[0], PROP_EXTPICT) &&
+	      has_property(&props[1], PROP_ZWJ)) ||
+	     (has_property(&props[0], PROP_EXTPICT) &&
+	      has_property(&props[1], PROP_EXTEND)))) {
+		s |= GRAPHEME_STATE_EMOJI;
+	} else if ((s & GRAPHEME_STATE_EMOJI) &&
+	           ((has_property(&props[0], PROP_ZWJ) &&
+	             has_property(&props[1], PROP_EXTPICT)) ||
+	            (has_property(&props[0], PROP_EXTEND) &&
+	             has_property(&props[1], PROP_EXTEND)) ||
+	            (has_property(&props[0], PROP_EXTEND) &&
+	             has_property(&props[1], PROP_ZWJ)) ||
+	            (has_property(&props[0], PROP_EXTPICT) &&
+	             has_property(&props[1], PROP_ZWJ)) ||
+	            (has_property(&props[0], PROP_EXTPICT) &&
+	             has_property(&props[1], PROP_EXTEND)))) {
+		/* GRAPHEME_STATE_EMOJI remains */
+	} else {
+		s &= ~GRAPHEME_STATE_EMOJI;
+	}
+
+	/* write updated state to state-pointer, if given */
+	if (state != NULL) {
+		*state = s;
+	}
+
+	/*
+	 * apply rules
+	 */
+
+	/* skip GB1 and GB2, as they are never satisfied here */
+
+	/* GB3 */
+	if (has_property(&props[0], PROP_CR) &&
+	    has_property(&props[1], PROP_LF)) {
+		return 0;
+	}
+
+	/* GB4 */
+	if (has_property(&props[0], PROP_CONTROL) ||
+	    has_property(&props[0], PROP_CR) ||
+	    has_property(&props[0], PROP_LF)) {
+		return 1;
+	}
+
+	/* GB5 */
+	if (has_property(&props[1], PROP_CONTROL) ||
+	    has_property(&props[1], PROP_CR) ||
+	    has_property(&props[1], PROP_LF)) {
+		return 1;
+	}
+
+	/* GB6 */
+	if (has_property(&props[0], PROP_L) &&
+	    (has_property(&props[1], PROP_L) ||
+	     has_property(&props[1], PROP_V) ||
+	     has_property(&props[1], PROP_LV) ||
+	     has_property(&props[1], PROP_LVT))) {
+		return 0;
+	}
+
+	/* GB7 */
+	if ((has_property(&props[0], PROP_LV) ||
+	     has_property(&props[0], PROP_V)) &&
+	    (has_property(&props[1], PROP_V) ||
+	     has_property(&props[1], PROP_T))) {
+		return 0;
+	}
+
+	/* GB8 */
+	if ((has_property(&props[0], PROP_LVT) ||
+	     has_property(&props[0], PROP_T)) &&
+	    has_property(&props[1], PROP_T)) {
+		return 0;
+	}
+
+	/* GB9 */
+	if (has_property(&props[1], PROP_EXTEND) ||
+	    has_property(&props[1], PROP_ZWJ)) {
+		return 0;
+	}
+
+	/* GB9a */
+	if (has_property(&props[1], PROP_SPACINGMARK)) {
+		return 0;
+	}
+
+	/* GB9b */
+	if (has_property(&props[0], PROP_PREPEND)) {
+		return 0;
+	}
+
+	/* GB11 */
+	if ((s & GRAPHEME_STATE_EMOJI) &&
+	    has_property(&props[0], PROP_ZWJ) &&
+	    has_property(&props[1], PROP_EXTPICT)) {
+		return 0;
+	}
+
+	/* GB12/GB13 */
+	if (has_property(&props[0], PROP_RI) &&
+	    has_property(&props[1], PROP_RI) &&
+	    (s & GRAPHEME_STATE_RI_ODD)) {
+		return 0;
+	}
+
+	/* GB999 */
+	return 1;
+}
diff --git a/src/boundary_body.c b/src/boundary_body.c
@@ -1,277 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#define LEN(x) (sizeof(x) / sizeof(*x))
-
-enum {
- GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
- GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
-};
-
-enum cp_property {
- PROP_CR, /* carriage return */
- PROP_LF, /* line feed */
- PROP_CONTROL, /* control character */
- PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */
- PROP_ZWJ, /* zero width joiner */
- PROP_RI, /* regional indicator */
- PROP_PREPEND, /* prepend character */
- PROP_SPACINGMARK, /* spacing mark */
- PROP_L, /* hangul syllable type L */
- PROP_V, /* hangul syllable type V */
- PROP_T, /* hangul syllable type T */
- PROP_LV, /* hangul syllable type LV */
- PROP_LVT, /* hangul syllable type LVT */
- PROP_EXTPICT, /* extended pictographic */
-};
-
-struct {
- const uint32_t (*table)[2];
- size_t tablelen;
-} cp_property_tables[] = {
- [PROP_CR] = {
- .table = cr_table,
- .tablelen = LEN(cr_table),
- },
- [PROP_LF] = {
- .table = lf_table,
- .tablelen = LEN(lf_table),
- },
- [PROP_CONTROL] = {
- .table = control_table,
- .tablelen = LEN(control_table),
- },
- [PROP_EXTEND] = {
- .table = extend_table,
- .tablelen = LEN(extend_table),
- },
- [PROP_ZWJ] = {
- .table = zwj_table,
- .tablelen = LEN(zwj_table),
- },
- [PROP_RI] = {
- .table = ri_table,
- .tablelen = LEN(ri_table),
- },
- [PROP_PREPEND] = {
- .table = prepend_table,
- .tablelen = LEN(prepend_table),
- },
- [PROP_SPACINGMARK] = {
- .table = spacingmark_table,
- .tablelen = LEN(spacingmark_table),
- },
- [PROP_L] = {
- .table = l_table,
- .tablelen = LEN(l_table),
- },
- [PROP_V] = {
- .table = v_table,
- .tablelen = LEN(v_table),
- },
- [PROP_T] = {
- .table = t_table,
- .tablelen = LEN(t_table),
- },
- [PROP_LV] = {
- .table = lv_table,
- .tablelen = LEN(lv_table),
- },
- [PROP_LVT] = {
- .table = lvt_table,
- .tablelen = LEN(lvt_table),
- },
- [PROP_EXTPICT] = {
- .table = extpict_table,
- .tablelen = LEN(extpict_table),
- },
-};
-
-struct cp_properties {
- uint32_t cp;
- int_least16_t determined;
- int_least16_t state;
-};
-
-static int
-cp_cmp(const void *a, const void *b)
-{
- uint32_t cp = *(uint32_t *)a;
- uint32_t *range = (uint32_t *)b;
-
- return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
-}
-
-static int
-has_property(struct cp_properties *props, enum cp_property p)
-{
- if (!(props->determined & (1 << p))) {
- /* not determined yet, do a lookup and set the state */
- if (bsearch(&props->cp, cp_property_tables[p].table,
- cp_property_tables[p].tablelen,
- sizeof(*cp_property_tables[p].table),
- cp_cmp)) {
- props->state |= (1 << p);
- } else {
- props->state &= ~(1 << p);
- }
-
- /* now it's determined */
- props->determined |= (1 << p);
- }
-
- return (props->state & (1 << p));
-}
-
-int
-grapheme_boundary(uint32_t a, uint32_t b, int *state)
-{
- struct cp_properties props[] = {
- {
- .cp = a,
- },
- {
- .cp = b,
- },
- };
- int s;
-
- /* skip printable ASCII */
- if ((a >= 0x20 && a <= 0x7E) &&
- (b >= 0x20 && b <= 0x7E)) {
- return 1;
- }
-
- /* set internal state based on given state-pointer */
- s = (state != NULL) ? *state : 0;
-
- /*
- * Apply grapheme cluster breaking algorithm (UAX #29), see
- * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
- */
-
- /*
- * update state
- */
- if (has_property(&props[1], PROP_RI)) {
- if (has_property(&props[0], PROP_RI)) {
- /* one more RI is on the left side of the seam */
- s ^= GRAPHEME_STATE_RI_ODD;
- } else {
- /* an RI appeared on the right side but the left
- side is not an RI, reset state (0 is even) */
- s &= ~GRAPHEME_STATE_RI_ODD;
- }
- }
- if (!(*state & GRAPHEME_STATE_EMOJI) &&
- ((has_property(&props[0], PROP_EXTPICT) &&
- has_property(&props[1], PROP_ZWJ)) ||
- (has_property(&props[0], PROP_EXTPICT) &&
- has_property(&props[1], PROP_EXTEND)))) {
- s |= GRAPHEME_STATE_EMOJI;
- } else if ((*state & GRAPHEME_STATE_EMOJI) &&
- ((has_property(&props[0], PROP_ZWJ) &&
- has_property(&props[1], PROP_EXTPICT)) ||
- (has_property(&props[0], PROP_EXTEND) &&
- has_property(&props[1], PROP_EXTEND)) ||
- (has_property(&props[0], PROP_EXTEND) &&
- has_property(&props[1], PROP_ZWJ)) ||
- (has_property(&props[0], PROP_EXTPICT) &&
- has_property(&props[1], PROP_ZWJ)) ||
- (has_property(&props[0], PROP_EXTPICT) &&
- has_property(&props[1], PROP_EXTEND)))) {
- /* GRAPHEME_STATE_EMOJI remains */
- } else {
- s &= ~GRAPHEME_STATE_EMOJI;
- }
-
- /* write updated state to state-pointer, if given */
- if (state != NULL) {
- *state = s;
- }
-
- /*
- * apply rules
- */
-
- /* skip GB1 and GB2, as they are never satisfied here */
-
- /* GB3 */
- if (has_property(&props[0], PROP_CR) &&
- has_property(&props[1], PROP_LF)) {
- return 0;
- }
-
- /* GB4 */
- if (has_property(&props[0], PROP_CONTROL) ||
- has_property(&props[0], PROP_CR) ||
- has_property(&props[0], PROP_LF)) {
- return 1;
- }
-
- /* GB5 */
- if (has_property(&props[1], PROP_CONTROL) ||
- has_property(&props[1], PROP_CR) ||
- has_property(&props[1], PROP_LF)) {
- return 1;
- }
-
- /* GB6 */
- if (has_property(&props[0], PROP_L) &&
- (has_property(&props[1], PROP_L) ||
- has_property(&props[1], PROP_V) ||
- has_property(&props[1], PROP_LV) ||
- has_property(&props[1], PROP_LVT))) {
- return 0;
- }
-
- /* GB7 */
- if ((has_property(&props[0], PROP_LV) ||
- has_property(&props[0], PROP_V)) &&
- (has_property(&props[1], PROP_V) ||
- has_property(&props[1], PROP_T))) {
- return 0;
- }
-
- /* GB8 */
- if ((has_property(&props[0], PROP_LVT) ||
- has_property(&props[0], PROP_T)) &&
- has_property(&props[1], PROP_T)) {
- return 0;
- }
-
- /* GB9 */
- if (has_property(&props[1], PROP_EXTEND) ||
- has_property(&props[1], PROP_ZWJ)) {
- return 0;
- }
-
- /* GB9a */
- if (has_property(&props[1], PROP_SPACINGMARK)) {
- return 0;
- }
-
- /* GB9b */
- if (has_property(&props[0], PROP_PREPEND)) {
- return 0;
- }
-
- /* GB11 */
- if ((s & GRAPHEME_STATE_EMOJI) &&
- has_property(&props[0], PROP_ZWJ) &&
- has_property(&props[1], PROP_EXTPICT)) {
- return 0;
- }
-
- /* GB12/GB13 */
- if (has_property(&props[0], PROP_RI) &&
- has_property(&props[1], PROP_RI) &&
- (s & GRAPHEME_STATE_RI_ODD)) {
- return 0;
- }
-
- /* GB999 */
- return 1;
-}
diff --git a/src/test_body.c b/src/test_body.c
@@ -1,373 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "../grapheme.h"
-
-#define LEN(x) (sizeof(x) / sizeof(*x))
-
-static const struct {
- uint32_t cp; /* input code point */
- uint8_t *exp_arr; /* expected UTF-8 byte sequence */
- size_t exp_len; /* expected length of UTF-8 sequence */
-} enc_test[] = {
- {
- /* invalid code point (UTF-16 surrogate half) */
- .cp = UINT32_C(0xD800),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
- .exp_len = 3,
- },
- {
- /* invalid code point (UTF-16-unrepresentable) */
- .cp = UINT32_C(0x110000),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
- .exp_len = 3,
- },
- {
- /* code point encoded to a 1-byte sequence */
- .cp = 0x01,
- .exp_arr = (uint8_t[]){ 0x01 },
- .exp_len = 1,
- },
- {
- /* code point encoded to a 2-byte sequence */
- .cp = 0xFF,
- .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
- .exp_len = 2,
- },
- {
- /* code point encoded to a 3-byte sequence */
- .cp = 0xFFF,
- .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
- .exp_len = 3,
- },
- {
- /* code point encoded to a 4-byte sequence */
- .cp = UINT32_C(0xFFFFF),
- .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
- .exp_len = 4,
- },
-};
-
-static const struct {
- uint8_t *arr; /* UTF-8 byte sequence */
- size_t len; /* length of UTF-8 byte sequence */
- size_t exp_len; /* expected length returned */
- uint32_t exp_cp; /* expected code point returned */
-} dec_test[] = {
- {
- /* empty sequence
- * [ ] ->
- * INVALID
- */
- .arr = NULL,
- .len = 0,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid lead byte
- * [ 11111101 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xFD },
- .len = 1,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* valid 1-byte sequence
- * [ 00000001 ] ->
- * 0000001
- */
- .arr = (uint8_t[]){ 0x01 },
- .len = 1,
- .exp_len = 1,
- .exp_cp = 0x1,
- },
- {
- /* valid 2-byte sequence
- * [ 11000011 10111111 ] ->
- * 00011111111
- */
- .arr = (uint8_t[]){ 0xC3, 0xBF },
- .len = 2,
- .exp_len = 2,
- .exp_cp = 0xFF,
- },
- {
- /* invalid 2-byte sequence (second byte missing)
- * [ 11000011 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xC3 },
- .len = 1,
- .exp_len = 2,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 2-byte sequence (second byte malformed)
- * [ 11000011 11111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xC3, 0xFF },
- .len = 2,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 2-byte sequence (overlong encoded)
- * [ 11000001 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xC1, 0xBF },
- .len = 2,
- .exp_len = 2,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* valid 3-byte sequence
- * [ 11100000 10111111 10111111 ] ->
- * 0000111111111111
- */
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
- .len = 3,
- .exp_len = 3,
- .exp_cp = 0xFFF,
- },
- {
- /* invalid 3-byte sequence (second byte missing)
- * [ 11100000 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0 },
- .len = 1,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (second byte malformed)
- * [ 11100000 01111111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
- .len = 3,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (third byte missing)
- * [ 11100000 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0, 0xBF },
- .len = 2,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (third byte malformed)
- * [ 11100000 10111111 01111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
- .len = 3,
- .exp_len = 2,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (overlong encoded)
- * [ 11100000 10011111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
- .len = 3,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 3-byte sequence (UTF-16 surrogate half)
- * [ 11101101 10100000 10000000 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
- .len = 3,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* valid 4-byte sequence
- * [ 11110011 10111111 10111111 10111111 ] ->
- * 011111111111111111111
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
- .len = 4,
- .exp_len = 4,
- .exp_cp = UINT32_C(0xFFFFF),
- },
- {
- /* invalid 4-byte sequence (second byte missing)
- * [ 11110011 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3 },
- .len = 1,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (second byte malformed)
- * [ 11110011 01111111 10111111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
- .len = 4,
- .exp_len = 1,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (third byte missing)
- * [ 11110011 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF },
- .len = 2,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (third byte malformed)
- * [ 11110011 10111111 01111111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
- .len = 4,
- .exp_len = 2,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (fourth byte missing)
- * [ 11110011 10111111 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
- .len = 3,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (fourth byte malformed)
- * [ 11110011 10111111 10111111 01111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
- .len = 4,
- .exp_len = 3,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (overlong encoded)
- * [ 11110000 10000000 10000001 10111111 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
- .len = 4,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
- {
- /* invalid 4-byte sequence (UTF-16-unrepresentable)
- * [ 11110100 10010000 10000000 10000000 ] ->
- * INVALID
- */
- .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
- .len = 4,
- .exp_len = 4,
- .exp_cp = GRAPHEME_CP_INVALID,
- },
-};
-
-int
-main(void)
-{
- int state;
- size_t i, j, k, len, failed;
-
- /* UTF-8 encoder test */
- for (i = 0, failed = 0; i < LEN(enc_test); i++) {
- uint8_t arr[4];
- size_t len;
-
- len = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
-
- if (len != enc_test[i].exp_len ||
- memcmp(arr, enc_test[i].exp_arr, len)) {
- fprintf(stderr, "Failed UTF-8-encoder test %zu: "
- "Expected (", i);
- for (j = 0; j < enc_test[i].exp_len; j++) {
- fprintf(stderr, "0x%x",
- enc_test[i].exp_arr[j]);
- if (j + 1 < enc_test[i].exp_len) {
- fprintf(stderr, " ");
- }
- }
- fprintf(stderr, "), but got (");
- for (j = 0; j < len; j++) {
- fprintf(stderr, "0x%x", arr[j]);
- if (j + 1 < len) {
- fprintf(stderr, " ");
- }
- }
- fprintf(stderr, ")\n");
- failed++;
- }
- }
- printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n",
- LEN(enc_test) - failed, LEN(enc_test));
-
- /* UTF-8 decoder test */
- for (i = 0, failed = 0; i < LEN(dec_test); i++) {
- size_t len;
- uint32_t cp;
-
- len = grapheme_cp_decode(&cp, dec_test[i].arr,
- dec_test[i].len);
-
- if (len != dec_test[i].exp_len ||
- cp != dec_test[i].exp_cp) {
- fprintf(stderr, "Failed UTF-8-decoder test %zu: "
- "Expected (%zx,%u), but got (%zx,%u)\n",
- i, dec_test[i].exp_len,
- dec_test[i].exp_cp, len, cp);
- failed++;
- }
- }
- printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n",
- LEN(dec_test) - failed, LEN(dec_test));
-
- /* grapheme break test */
- for (i = 0, failed = 0; i < LEN(t); i++) {
- for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
- if ((j + 1) == t[i].cplen ||
- grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
- &state)) {
- /* check if our resulting length matches */
- if (k == t[i].lenlen || len != t[i].len[k++]) {
- fprintf(stderr, "Failed \"%s\"\n",
- t[i].descr);
- failed++;
- break;
- }
- len = 1;
- } else {
- len++;
- }
- }
- }
- printf("Grapheme break test: Passed %zu out of %zu tests.\n",
- LEN(t) - failed, LEN(t));
-
- return (failed > 0) ? 1 : 0;
-}
diff --git a/test/test.c b/test/test.c
@@ -0,0 +1,374 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "../grapheme.h"
+#include "../data/gbt.h"
+
+#define LEN(x) (sizeof(x) / sizeof(*x))
+
+static const struct {
+ uint32_t cp; /* input code point */
+ uint8_t *exp_arr; /* expected UTF-8 byte sequence */
+ size_t exp_len; /* expected length of UTF-8 sequence */
+} enc_test[] = {
+ {
+ /* invalid code point (UTF-16 surrogate half) */
+ .cp = UINT32_C(0xD800),
+ .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_len = 3,
+ },
+ {
+ /* invalid code point (UTF-16-unrepresentable) */
+ .cp = UINT32_C(0x110000),
+ .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_len = 3,
+ },
+ {
+ /* code point encoded to a 1-byte sequence */
+ .cp = 0x01,
+ .exp_arr = (uint8_t[]){ 0x01 },
+ .exp_len = 1,
+ },
+ {
+ /* code point encoded to a 2-byte sequence */
+ .cp = 0xFF,
+ .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+ .exp_len = 2,
+ },
+ {
+ /* code point encoded to a 3-byte sequence */
+ .cp = 0xFFF,
+ .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .exp_len = 3,
+ },
+ {
+ /* code point encoded to a 4-byte sequence */
+ .cp = UINT32_C(0xFFFFF),
+ .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .exp_len = 4,
+ },
+};
+
+static const struct {
+ uint8_t *arr; /* UTF-8 byte sequence */
+ size_t len; /* length of UTF-8 byte sequence */
+ size_t exp_len; /* expected length returned */
+ uint32_t exp_cp; /* expected code point returned */
+} dec_test[] = {
+ {
+ /* empty sequence
+ * [ ] ->
+ * INVALID
+ */
+ .arr = NULL,
+ .len = 0,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid lead byte
+ * [ 11111101 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xFD },
+ .len = 1,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* valid 1-byte sequence
+ * [ 00000001 ] ->
+ * 0000001
+ */
+ .arr = (uint8_t[]){ 0x01 },
+ .len = 1,
+ .exp_len = 1,
+ .exp_cp = 0x1,
+ },
+ {
+ /* valid 2-byte sequence
+ * [ 11000011 10111111 ] ->
+ * 00011111111
+ */
+ .arr = (uint8_t[]){ 0xC3, 0xBF },
+ .len = 2,
+ .exp_len = 2,
+ .exp_cp = 0xFF,
+ },
+ {
+ /* invalid 2-byte sequence (second byte missing)
+ * [ 11000011 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC3 },
+ .len = 1,
+ .exp_len = 2,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 2-byte sequence (second byte malformed)
+ * [ 11000011 11111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC3, 0xFF },
+ .len = 2,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 2-byte sequence (overlong encoded)
+ * [ 11000001 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC1, 0xBF },
+ .len = 2,
+ .exp_len = 2,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* valid 3-byte sequence
+ * [ 11100000 10111111 10111111 ] ->
+ * 0000111111111111
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = 0xFFF,
+ },
+ {
+ /* invalid 3-byte sequence (second byte missing)
+ * [ 11100000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0 },
+ .len = 1,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (second byte malformed)
+ * [ 11100000 01111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+ .len = 3,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (third byte missing)
+ * [ 11100000 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF },
+ .len = 2,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (third byte malformed)
+ * [ 11100000 10111111 01111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+ .len = 3,
+ .exp_len = 2,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (overlong encoded)
+ * [ 11100000 10011111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (UTF-16 surrogate half)
+ * [ 11101101 10100000 10000000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* valid 4-byte sequence
+ * [ 11110011 10111111 10111111 10111111 ] ->
+ * 011111111111111111111
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = UINT32_C(0xFFFFF),
+ },
+ {
+ /* invalid 4-byte sequence (second byte missing)
+ * [ 11110011 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3 },
+ .len = 1,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (second byte malformed)
+ * [ 11110011 01111111 10111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+ .len = 4,
+ .exp_len = 1,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (third byte missing)
+ * [ 11110011 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF },
+ .len = 2,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (third byte malformed)
+ * [ 11110011 10111111 01111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+ .len = 4,
+ .exp_len = 2,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (fourth byte missing)
+ * [ 11110011 10111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+ .len = 3,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (fourth byte malformed)
+ * [ 11110011 10111111 10111111 01111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+ .len = 4,
+ .exp_len = 3,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (overlong encoded)
+ * [ 11110000 10000000 10000001 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (UTF-16-unrepresentable)
+ * [ 11110100 10010000 10000000 10000000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = GRAPHEME_CP_INVALID,
+ },
+};
+
+/*
+ * Run the UTF-8 encoder, the UTF-8 decoder and the grapheme break
+ * test suites in turn, report each failing case on stderr and print a
+ * pass-count summary per suite on stdout. Returns 1 if at least one
+ * case of any suite failed and 0 otherwise.
+ */
+int
+main(void)
+{
+	int state;
+	size_t i, j, k, len, failed, total_failed = 0;
+
+	/* UTF-8 encoder test */
+	for (i = 0, failed = 0; i < LEN(enc_test); i++) {
+		uint8_t arr[4];
+		size_t enclen;
+
+		enclen = grapheme_cp_encode(enc_test[i].cp, arr, LEN(arr));
+
+		if (enclen != enc_test[i].exp_len ||
+		    memcmp(arr, enc_test[i].exp_arr, enclen)) {
+			fprintf(stderr, "Failed UTF-8-encoder test %zu: "
+			        "Expected (", i);
+			for (j = 0; j < enc_test[i].exp_len; j++) {
+				fprintf(stderr, "0x%x",
+				        enc_test[i].exp_arr[j]);
+				if (j + 1 < enc_test[i].exp_len) {
+					fprintf(stderr, " ");
+				}
+			}
+			fprintf(stderr, "), but got (");
+			for (j = 0; j < enclen; j++) {
+				fprintf(stderr, "0x%x", arr[j]);
+				if (j + 1 < enclen) {
+					fprintf(stderr, " ");
+				}
+			}
+			fprintf(stderr, ")\n");
+			failed++;
+		}
+	}
+	printf("UTF-8 encoder test: Passed %zu out of %zu tests.\n",
+	       LEN(enc_test) - failed, LEN(enc_test));
+	/* failed is reset for every suite, so keep a running total */
+	total_failed += failed;
+
+	/* UTF-8 decoder test */
+	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
+		size_t declen;
+		uint32_t cp;
+
+		declen = grapheme_cp_decode(&cp, dec_test[i].arr,
+		                            dec_test[i].len);
+
+		if (declen != dec_test[i].exp_len ||
+		    cp != dec_test[i].exp_cp) {
+			fprintf(stderr, "Failed UTF-8-decoder test %zu: "
+			        "Expected (%zx,%u), but got (%zx,%u)\n",
+			        i, dec_test[i].exp_len,
+			        dec_test[i].exp_cp, declen, cp);
+			failed++;
+		}
+	}
+	printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n",
+	       LEN(dec_test) - failed, LEN(dec_test));
+	total_failed += failed;
+
+	/* grapheme break test */
+	for (i = 0, failed = 0; i < LEN(t); i++) {
+		/* len counts the code points of the current cluster and
+		   k indexes the expected cluster lengths */
+		for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
+			if ((j + 1) == t[i].cplen ||
+			    grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
+			                      &state)) {
+				/* check if our resulting length matches */
+				if (k == t[i].lenlen || len != t[i].len[k++]) {
+					fprintf(stderr, "Failed \"%s\"\n",
+					        t[i].descr);
+					failed++;
+					break;
+				}
+				len = 1;
+			} else {
+				len++;
+			}
+		}
+	}
+	printf("Grapheme break test: Passed %zu out of %zu tests.\n",
+	       LEN(t) - failed, LEN(t));
+	total_failed += failed;
+
+	/* fail if any of the three suites had a failing case */
+	return (total_failed > 0) ? 1 : 0;
+}