commit 539d2cc2024f494b1e3292d4730bdc96390e1361
parent 1b3992ebe5c587446aaa962a314ef9244d86fb0d
Author: Steven G. Johnson <stevenj@alum.mit.edu>
Date: Sun, 7 Dec 2014 22:25:31 -0500
grapheme test for UAX#29
Diffstat:
M | .gitignore | | | 1 | + |
M | Makefile | | | 14 | +++++++++++--- |
A | graphemetest.c | | | 62 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
M | normtest.c | | | 45 | +-------------------------------------------- |
A | tests.h | | | 53 | +++++++++++++++++++++++++++++++++++++++++++++++++++++ |
5 files changed, 128 insertions(+), 47 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,4 @@ bench/bench
bench/icu
bench/unistring
normtest
+graphemetest
diff --git a/Makefile b/Makefile
@@ -2,6 +2,7 @@
CURL=curl
RUBY=ruby
+PERL=perl
MAKE=make
# settings
@@ -24,7 +25,7 @@ all: c-library
c-library: libmojibake.a libmojibake.$(SHLIB_EXT)
clean:
- rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt
+ rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest graphemetest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt
$(MAKE) -C bench clean
update: utf8proc_data.c.new
@@ -67,8 +68,15 @@ libmojibake.dylib: utf8proc.o
NormalizationTest.txt:
$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
-normtest: normtest.c utf8proc.o mojibake.h
+# Fetch the UAX#29 grapheme break test data and rewrite the non-ASCII
+# ÷ (break) and × (no break) markers as '/' and '+', which is what
+# graphemetest.c parses.  The download and the rewrite are separate steps
+# that write to temporary files: with "curl | perl > $@" the recipe's exit
+# status is perl's, so a failed download would leave an empty or truncated
+# GraphemeBreakTest.txt that later looks up to date.
+GraphemeBreakTest.txt:
+	$(CURL) -f -o $@.raw http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
+	$(PERL) -pe 's,÷,/,g;s,×,+,g' $@.raw > $@.tmp && mv -f $@.tmp $@
+	rm -f $@.raw
+
+normtest: normtest.c utf8proc.o mojibake.h tests.h
$(cc) normtest.c utf8proc.o -o normtest
-check: normtest NormalizationTest.txt
+# Build the grapheme-break test program from its source and the library
+# object.  mojibake.h and tests.h are prerequisites only so that header
+# edits trigger a rebuild; they are not passed to the compiler.
+graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
+	$(cc) graphemetest.c utf8proc.o -o $@
+
+check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
./normtest
+ ./graphemetest
diff --git a/graphemetest.c b/graphemetest.c
@@ -0,0 +1,62 @@
+#include "tests.h"
+
+/* Checks utf8proc_map(..., UTF8PROC_CHARBOUND) against GraphemeBreakTest.txt.
+   Each data line lists hex codepoints separated by '/' (grapheme break) or
+   '+' (no break); '#' starts a comment.  For every line we build the
+   expected string, with a 0xff byte for each break marker except a trailing
+   one, feed the same bytes without the 0xff markers to utf8proc_map, and
+   require its output to equal the expected string.  Any failure exits
+   through check(); returns 0 when every line passes. */
+int main(void)
+{
+    char *buf = NULL;
+    size_t bufsize = 0;
+    FILE *f = fopen("GraphemeBreakTest.txt", "r");
+    uint8_t src[1024];
+
+    check(f != NULL, "error opening GraphemeBreakTest.txt");
+    while (getline(&buf, &bufsize, f) > 0) {
+        size_t bi = 0, si = 0;
+        lineno += 1;
+
+        if (lineno % 100 == 0)
+            printf("checking line %zu...\n", lineno);
+
+        if (buf[0] == '#') continue;
+
+        while (buf[bi]) {
+            bi = skipspaces(buf, bi);
+            /* Best-effort guard for the fixed-size src buffer: leave room
+               for one separator, or for one UTF-8 encoded codepoint (at
+               most 4 bytes) plus the NUL that encode() appends. */
+            check(si + 5 < sizeof(src), "test line too long");
+            if (buf[bi] == '/') { /* grapheme break */
+                src[si++] = 0xff;
+                bi++;
+            }
+            else if (buf[bi] == '+') { /* no break */
+                bi++;
+            }
+            else if (buf[bi] == '#') { /* start of comments */
+                break;
+            }
+            else { /* hex-encoded codepoint */
+                /* encode() returns one more than the index of the character
+                   that stopped it, hence the -1 */
+                size_t adv = encode((char*) (src + si), buf + bi) - 1;
+                /* nothing consumed and not at end of line: this character
+                   is not part of the format and we would loop forever */
+                check(adv > 0 || !buf[bi],
+                      "unexpected character in input: %s", buf + bi);
+                bi += adv;
+                while (src[si]) ++si; /* advance to NUL termination */
+            }
+        }
+        if (si && src[si-1] == 0xff)
+            --si; /* no 0xff after final grapheme */
+        src[si] = 0; /* NUL-terminate */
+
+        if (si) {
+            uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
+            size_t i = 0, j = 0;
+            ssize_t glen;
+            uint8_t *g; /* utf8proc_map grapheme results */
+            while (i < si) {
+                if (src[i] != 0xff)
+                    utf8[j++] = src[i++];
+                else
+                    i++;
+            }
+            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
+            check(glen >= 0, "utf8proc_map error = %s",
+                  utf8proc_errmsg(glen));
+            check(!strcmp((char*)g, (char*)src),
+                  "grapheme mismatch: %s vs. %s", (char*)g, (char*)src);
+            free(g); /* result buffer is allocated by utf8proc_map */
+        }
+    }
+    free(buf); /* line buffer allocated by getline */
+    fclose(f);
+    printf("Passed tests after %zu lines!\n", lineno);
+    return 0;
+}
diff --git a/normtest.c b/normtest.c
@@ -1,47 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <string.h>
-#include <stdarg.h>
-
-#include "mojibake.h"
-
-size_t lineno = 0;
-
-void check(int cond, const char *format, ...)
-{
- if (!cond) {
- va_list args;
- fprintf(stderr, "line %zd: ", lineno);
- va_start(args, format);
- vfprintf(stderr, format, args);
- va_end(args);
- fprintf(stderr, "\n");
- exit(1);
- }
-}
-
-/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
- separated by whitespace, and terminated by any character not in
- [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
- in dest, returning the number of bytes read from buf */
-size_t encode(char *dest, const char *buf)
-{
- size_t i = 0, j, d = 0;
- do {
- int c;
- while (isspace(buf[i])) ++i; /* skip whitespace */
- for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j)
- ; /* find end of hex input */
- if (j == i) { /* no codepoint found */
- dest[d] = 0; /* NUL-terminate destination string */
- return i + 1;
- }
- check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
- i = j; /* skip to char after hex input */
- d += utf8proc_encode_char(c, (uint8_t *) (dest + d));
- } while (1);
-}
+#include "tests.h"
#define CHECK_NORM(NRM, norm, src) { \
char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \
diff --git a/tests.h b/tests.h
@@ -0,0 +1,53 @@
+/* Common functions and includes for our test programs. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "mojibake.h"
+
+size_t lineno = 0;
+
+/* Test assertion helper.  When cond is zero, prints "line <lineno>: "
+   followed by the printf-style message built from format and the
+   remaining arguments, then a newline, to stderr and exits with status 1.
+   Does nothing when cond is nonzero. */
+void check(int cond, const char *format, ...)
+{
+    if (!cond) {
+        va_list args;
+        fprintf(stderr, "line %zu: ", lineno); /* lineno is a size_t */
+        va_start(args, format);
+        vfprintf(stderr, format, args);
+        va_end(args);
+        fprintf(stderr, "\n");
+        exit(1);
+    }
+}
+
+/* Returns the index of the first character at or after buf[i] for which
+   isspace() is false.  The terminating NUL is not whitespace, so the scan
+   never runs past the end of a NUL-terminated string. */
+size_t skipspaces(const char *buf, size_t i)
+{
+    /* The cast matters: handing isspace() a negative char (any byte >= 0x80
+       where char is signed) is undefined behavior. */
+    while (isspace((unsigned char) buf[i])) ++i;
+    return i;
+}
+
+/* Reads from buf a sequence of codepoints written as hexadecimal strings
+   and separated by whitespace, stopping at the first character that is
+   neither whitespace nor a hex digit [0-9a-fA-F] (which may be the
+   terminating NUL), and stores the corresponding NUL-terminated utf8
+   string in dest.  Returns the number of bytes consumed from buf
+   *including* the character that stopped the scan, i.e. one more than that
+   character's index.  No bounds checking is done on dest; the caller must
+   provide a large enough buffer. */
+size_t encode(char *dest, const char *buf)
+{
+    size_t i = 0, j, d = 0;
+    do {
+        unsigned int c; /* %x stores through an unsigned int pointer */
+        i = skipspaces(buf, i);
+        /* cast: a negative char passed to a <ctype.h> function is undefined
+           behavior; isxdigit is false for NUL, so this stops at the end */
+        for (j=i; isxdigit((unsigned char) buf[j]); ++j)
+            ; /* find end of hex input */
+        if (j == i) { /* no codepoint found */
+            dest[d] = 0; /* NUL-terminate destination string */
+            return i + 1;
+        }
+        check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
+        i = j; /* skip to char after hex input */
+        d += utf8proc_encode_char((int) c, (uint8_t *) (dest + d));
+    } while (1);
+}
+