commit a88c9a176dfc9459c3a7b09f6c9aedda6d06732f
parent f8bb18d674283ffd63b87fd903fdc75bee4fc2fd
Author: Laslo Hunhold <dev@frign.de>
Date: Mon, 1 Jun 2020 12:00:50 +0200
Expose grapheme_cp_{en,de}code() and grapheme_boundary()
After the preparation, we can now expose these three functions in
grapheme.h, as suggested by Mattias.
In this context, we get rid of the Codepoint typedef, as there is no
need to hide uint32_t behind an opaque alias. A code point is just a
number, so let's stop with the "Rune", "Codepoint", etc. naming nonsense!
With everything moved into grapheme.h, boundary.h and codepoint.h are no
longer needed, which we reflect in the Makefile.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
11 files changed, 36 insertions(+), 55 deletions(-)
diff --git a/Makefile b/Makefile
@@ -5,7 +5,7 @@
include config.mk
BIN = src/test
-REQ = src/codepoint src/boundary src/grapheme
+REQ = src/boundary src/codepoint src/grapheme
GBP_URL = https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
EMO_URL = https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
GBT_URL = https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
@@ -19,10 +19,10 @@ all: libgrapheme.a libgrapheme.so $(BIN)
src/test: src/test.o $(REQ:=.o)
-src/boundary.o: src/boundary.c config.mk src/codepoint.h src/boundary.h
-src/codepoint.o: src/codepoint.c config.mk src/codepoint.h
-src/grapheme.o: src/grapheme.c config.mk src/codepoint.h src/boundary.h
-src/test.o: src/test.c config.mk src/codepoint.h src/boundary.h
+src/boundary.o: src/boundary.c config.mk grapheme.h
+src/codepoint.o: src/codepoint.c config.mk grapheme.h
+src/grapheme.o: src/grapheme.c config.mk grapheme.h
+src/test.o: src/test.c config.mk grapheme.h
.o:
$(CC) -o $@ $(LDFLAGS) $< $(REQ:=.o)
@@ -42,7 +42,7 @@ test:
src/boundary.c: data/gbt.awk $(GBP) data/emo.awk $(EMO) src/boundary_body.c
printf "/* Automatically generated by gbp.awk and emo.awk */\n" > $@
- printf "#include \"codepoint.h\"\n" >> $@
+ printf "#include <stdint.h>\n\n" >> $@
awk -f data/gbp.awk $(GBP) >> $@
awk -f data/emo.awk $(EMO) >> $@
printf "\n" >> $@
@@ -50,8 +50,9 @@ src/boundary.c: data/gbt.awk $(GBP) data/emo.awk $(EMO) src/boundary_body.c
src/test.c: data/gbt.awk $(GBT) src/test_body.c
printf "/* Automatically generated by gbt.awk */\n" > $@
- printf "#include <stddef.h>\n\n" >> $@
- printf "#include \"codepoint.h\"\n\n" >> $@
+ printf "#include <stddef.h>\n" >> $@
+ printf "#include <stdint.h>\n\n" >> $@
+ printf "#include \"../grapheme.h\"\n\n" >> $@
awk -f data/gbt.awk $(GBT) >> $@
printf "\n" >> $@
cat src/test_body.c >> $@
diff --git a/data/emo.awk b/data/emo.awk
@@ -34,7 +34,7 @@ function hextonum(str) {
}
function mktable(name, array, arrlen) {
- printf("\nstatic const Codepoint "name"_table[][2] = {\n");
+ printf("\nstatic const uint32_t "name"_table[][2] = {\n");
for (j = 0; j < arrlen; j++) {
if (ind = index(array[j], "..")) {
diff --git a/data/gbp.awk b/data/gbp.awk
@@ -58,7 +58,7 @@ function hextonum(str) {
}
function mktable(name, array, arrlen) {
- printf("\nstatic const Codepoint "name"_table[][2] = {\n");
+ printf("static const uint32_t "name"_table[][2] = {\n");
for (j = 0; j < arrlen; j++) {
if (ind = index(array[j], "..")) {
diff --git a/data/gbt.awk b/data/gbt.awk
@@ -4,7 +4,7 @@
BEGIN {
FS = " "
- printf("struct test {\n\tCodepoint *cp;\n\tsize_t cplen;\n");
+ printf("struct test {\n\tuint32_t *cp;\n\tsize_t cplen;\n");
printf("\tsize_t *len;\n\tsize_t lenlen;\n\tchar *descr;\n};\n\n");
printf("static const struct test t[] = {\n");
}
@@ -38,7 +38,7 @@ $0 ~ /^#/ || $0 ~ /^\s*$/ { next }
len[nlens++] = curlen;
# print code points
- printf("\t{\n\t\t.cp = (Codepoint[]){ ");
+ printf("\t{\n\t\t.cp = (uint32_t[]){ ");
for (i = 0; i < ncps; i++) {
printf("0x%s", cp[i]);
if (i + 1 < ncps) {
diff --git a/grapheme.h b/grapheme.h
@@ -3,6 +3,14 @@
#define GRAPHEME_H
#include <stddef.h>
+#include <stdint.h>
+
+#define CP_INVALID UINT32_C(0xFFFD)
+
+int grapheme_boundary(uint32_t, uint32_t, int *);
+
+size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t);
+size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t);
size_t grapheme_len(const char *);
diff --git a/src/boundary.h b/src/boundary.h
@@ -1,11 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#ifndef BOUNDARY_H
-#define BOUNDARY_H
-
-#include <stddef.h>
-
-#include "codepoint.h"
-
-int boundary(Codepoint, Codepoint, int *);
-
-#endif /* BOUNDARY_H */
diff --git a/src/boundary_body.c b/src/boundary_body.c
@@ -1,10 +1,8 @@
/* See LICENSE file for copyright and license details. */
#include <stddef.h>
+#include <stdint.h>
#include <stdlib.h>
-#include "codepoint.h"
-#include "boundary.h"
-
#define LEN(x) (sizeof(x) / sizeof(*x))
enum {
@@ -15,8 +13,8 @@ enum {
static int
cp_cmp(const void *a, const void *b)
{
- Codepoint cp = *(Codepoint *)a;
- Codepoint *range = (Codepoint *)b;
+ uint32_t cp = *(uint32_t *)a;
+ uint32_t *range = (uint32_t *)b;
return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
}
@@ -40,7 +38,7 @@ enum property {
};
struct {
- const Codepoint (*table)[2];
+ const uint32_t (*table)[2];
size_t tablelen;
} tables[] = {
[PROP_CR] = {
@@ -102,7 +100,7 @@ struct {
};
static int
-is(Codepoint cp[2], char (*props)[2], int index, enum property p)
+is(uint32_t cp[2], char (*props)[2], int index, enum property p)
{
if (props[p][index] == 2) {
/* need to determine property */
@@ -119,9 +117,9 @@ is(Codepoint cp[2], char (*props)[2], int index, enum property p)
#define IS(I, PROP) (is(cp, props, I, PROP))
int
-boundary(Codepoint cp0, Codepoint cp1, int *state)
+grapheme_boundary(uint32_t cp0, uint32_t cp1, int *state)
{
- Codepoint cp[2] = { cp0, cp1 };
+ uint32_t cp[2] = { cp0, cp1 };
char props[NUM_PROPS][2];
size_t i;
diff --git a/src/codepoint.c b/src/codepoint.c
@@ -1,5 +1,5 @@
/* See LICENSE file for copyright and license details. */
-#include "codepoint.h"
+#include "../grapheme.h"
#include <stdio.h>
#define BETWEEN(c, l, u) (c >= l && c <= u)
diff --git a/src/codepoint.h b/src/codepoint.h
@@ -1,15 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#ifndef CODEPOINT_H
-#define CODEPOINT_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-typedef uint32_t Codepoint;
-
-#define CP_INVALID 0xFFFD
-
-size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t);
-size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t);
-
-#endif /* CODEPOINT_H */
diff --git a/src/grapheme.c b/src/grapheme.c
@@ -2,13 +2,12 @@
#include <stddef.h>
#include <stdlib.h>
-#include "codepoint.h"
-#include "boundary.h"
+#include "../grapheme.h"
size_t
grapheme_len(const char *str)
{
- Codepoint cp0, cp1;
+ uint32_t cp0, cp1;
size_t ret, len = 0;
int state = 0;
@@ -38,7 +37,7 @@ grapheme_len(const char *str)
/* get next code point */
ret = grapheme_cp_decode(&cp1, (uint8_t *)(str + len), 5);
- if (cp1 == CP_INVALID || boundary(cp0, cp1, &state)) {
+ if (cp1 == CP_INVALID || grapheme_boundary(cp0, cp1, &state)) {
/* we read an invalid cp or have a breakpoint */
break;
} else {
diff --git a/src/test_body.c b/src/test_body.c
@@ -1,10 +1,10 @@
/* See LICENSE file for copyright and license details. */
#include <stddef.h>
+#include <stdint.h>
#include <stdio.h>
#include <string.h>
-#include "boundary.h"
-#include "codepoint.h"
+#include "../grapheme.h"
#define LEN(x) (sizeof(x) / sizeof(*x))
@@ -350,7 +350,8 @@ int main(void)
for (i = 0, failed = 0; i < LEN(t); i++) {
for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
if ((j + 1) == t[i].cplen ||
- boundary(t[i].cp[j], t[i].cp[j + 1], &state)) {
+ grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
+ &state)) {
/* check if our resulting length matches */
if (k == t[i].lenlen || len != t[i].len[k++]) {
fprintf(stderr, "Failed \"%s\"\n",