utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 90721f2d39b0cdd5d22409f1bf4f6ce4b7382944
parent 10f7e2ed5a7f3d05cbbc45f457be12456e6969d3
Author: Steven G. Johnson <stevenj@mit.edu>
Date:   Fri,  6 Mar 2015 17:36:08 -0500

directory cleanup: move tests and data into subdirectories

Diffstat:
M Makefile | 50 +++++++++++++++++++++++++-------------------------
R data_generator.rb -> data/data_generator.rb | 0
D graphemetest.c | 72 ------------------------------------------------------------------------
D normtest.c | 64 ----------------------------------------------------------------
A test/graphemetest.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/normtest.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R printproperty.c -> test/printproperty.c | 0
A test/tests.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
D tests.h | 53 -----------------------------------------------------
9 files changed, 214 insertions(+), 214 deletions(-)

diff --git a/Makefile b/Makefile @@ -25,7 +25,7 @@ all: c-library c-library: libutf8proc.a libutf8proc.$(SHLIB_EXT) clean: - rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_EXT) normtest graphemetest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt + rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest data/UnicodeData.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt $(MAKE) -C bench clean update: utf8proc_data.c.new @@ -33,23 +33,23 @@ update: utf8proc_data.c.new # real targets -utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt - $(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new +utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt + (cd data; $(RUBY) data_generator.rb < UnicodeData.txt) > utf8proc_data.c.new -UnicodeData.txt: - $(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt +data/UnicodeData.txt: + $(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt -GraphemeBreakProperty.txt: - $(CURL) -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt +data/GraphemeBreakProperty.txt: + $(CURL) -o $@ -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt -DerivedCoreProperties.txt: - $(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt +data/DerivedCoreProperties.txt: + $(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt -CompositionExclusions.txt: - $(CURL) -O http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt +data/CompositionExclusions.txt: + $(CURL) -o $@ -O 
http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt -CaseFolding.txt: - $(CURL) -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt +data/CaseFolding.txt: + $(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c $(cc) -c -o utf8proc.o utf8proc.c @@ -68,21 +68,21 @@ libutf8proc.dylib: utf8proc.o # Test programs -NormalizationTest.txt: - $(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt +data/NormalizationTest.txt: + $(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt -GraphemeBreakTest.txt: +data/GraphemeBreakTest.txt: $(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@ -normtest: normtest.c utf8proc.o utf8proc.h tests.h - $(cc) normtest.c utf8proc.o -o $@ +test/normtest: test/normtest.c utf8proc.o utf8proc.h test/tests.h + $(cc) test/normtest.c utf8proc.o -o $@ -graphemetest: graphemetest.c utf8proc.o utf8proc.h tests.h - $(cc) graphemetest.c utf8proc.o -o $@ +test/graphemetest: test/graphemetest.c utf8proc.o utf8proc.h test/tests.h + $(cc) test/graphemetest.c utf8proc.o -o $@ -printproperty: printproperty.c utf8proc.o utf8proc.h tests.h - $(cc) printproperty.c utf8proc.o -o $@ +test/printproperty: test/printproperty.c utf8proc.o utf8proc.h test/tests.h + $(cc) test/printproperty.c utf8proc.o -o $@ -check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt - ./normtest - ./graphemetest +check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt + test/normtest data/NormalizationTest.txt + test/graphemetest data/GraphemeBreakTest.txt diff --git a/data_generator.rb b/data/data_generator.rb diff --git a/graphemetest.c b/graphemetest.c @@ -1,72 +0,0 @@ -#include "tests.h" - -int main(void) -{ - char *buf = NULL; - size_t bufsize = 0; - FILE *f = fopen("GraphemeBreakTest.txt", "r"); - uint8_t src[1024]; - - check(f != NULL, 
"error opening GraphemeBreakTest.txt"); - while (getline(&buf, &bufsize, f) > 0) { - size_t bi = 0, si = 0; - lineno += 1; - - if (lineno % 100 == 0) - printf("checking line %zd...\n", lineno); - - if (buf[0] == '#') continue; - - while (buf[bi]) { - bi = skipspaces(buf, bi); - if (buf[bi] == '/') { /* grapheme break */ - src[si++] = '/'; - bi++; - } - else if (buf[bi] == '+') { /* no break */ - bi++; - } - else if (buf[bi] == '#') { /* start of comments */ - break; - } - else { /* hex-encoded codepoint */ - bi += encode((char*) (src + si), buf + bi) - 1; - while (src[si]) ++si; /* advance to NUL termination */ - } - } - if (si && src[si-1] == '/') - --si; /* no break after final grapheme */ - src[si] = 0; /* NUL-terminate */ - - if (si) { - uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ - size_t i = 0, j = 0; - ssize_t glen; - uint8_t *g; /* utf8proc_map grapheme results */ - while (i < si) { - if (src[i] != '/') - utf8[j++] = src[i++]; - else - i++; - } - glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND); - if (glen == UTF8PROC_ERROR_INVALIDUTF8) { - /* the test file contains surrogate codepoints, which are only for UTF-16 */ - printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); - } - else { - check(glen >= 0, "utf8proc_map error = %s", - utf8proc_errmsg(glen)); - for (i = 0; i <= glen; ++i) - if (g[i] == 0xff) - g[i] = '/'; /* easier-to-read output (/ is not in test strings) */ - check(!strcmp((char*)g, (char*)src), - "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); - } - free(g); - } - } - fclose(f); - printf("Passed tests after %zd lines!\n", lineno); - return 0; -} diff --git a/normtest.c b/normtest.c @@ -1,64 +0,0 @@ -#include "tests.h" - -#define CHECK_NORM(NRM, norm, src) { \ - char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \ - check(!strcmp(norm, src_norm), \ - "normalization failed for %s -> %s", src, norm); \ - free(src_norm); \ -} - -int main(void) -{ - char *buf = NULL; - size_t 
bufsize = 0; - FILE *f = fopen("NormalizationTest.txt", "r"); - char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024]; - - check(f != NULL, "error opening NormalizationTest.txt"); - while (getline(&buf, &bufsize, f) > 0) { - size_t offset; - lineno += 1; - - if (buf[0] == '@') { - printf("line %zd: %s", lineno, buf + 1); - continue; - } - else if (lineno % 1000 == 0) - printf("checking line %zd...\n", lineno); - - if (buf[0] == '#') continue; - - offset = encode(source, buf); - offset += encode(NFC, buf + offset); - offset += encode(NFD, buf + offset); - offset += encode(NFKC, buf + offset); - offset += encode(NFKD, buf + offset); - - CHECK_NORM(NFC, NFC, source); - CHECK_NORM(NFC, NFC, NFC); - CHECK_NORM(NFC, NFC, NFD); - CHECK_NORM(NFC, NFKC, NFKC); - CHECK_NORM(NFC, NFKC, NFKD); - - CHECK_NORM(NFD, NFD, source); - CHECK_NORM(NFD, NFD, NFC); - CHECK_NORM(NFD, NFD, NFD); - CHECK_NORM(NFD, NFKD, NFKC); - CHECK_NORM(NFD, NFKD, NFKD); - - CHECK_NORM(NFKC, NFKC, source); - CHECK_NORM(NFKC, NFKC, NFC); - CHECK_NORM(NFKC, NFKC, NFD); - CHECK_NORM(NFKC, NFKC, NFKC); - CHECK_NORM(NFKC, NFKC, NFKD); - - CHECK_NORM(NFKD, NFKD, source); - CHECK_NORM(NFKD, NFKD, NFC); - CHECK_NORM(NFKD, NFKD, NFD); - CHECK_NORM(NFKD, NFKD, NFKC); - CHECK_NORM(NFKD, NFKD, NFKD); - } - fclose(f); - printf("Passed tests after %zd lines!\n", lineno); - return 0; -} diff --git a/test/graphemetest.c b/test/graphemetest.c @@ -0,0 +1,72 @@ +#include "tests.h" + +int main(int argc, char **argv) +{ + char *buf = NULL; + size_t bufsize = 0; + FILE *f = argc > 1 ? 
fopen(argv[1], "r") : NULL; + uint8_t src[1024]; + + check(f != NULL, "error opening GraphemeBreakTest.txt"); + while (getline(&buf, &bufsize, f) > 0) { + size_t bi = 0, si = 0; + lineno += 1; + + if (lineno % 100 == 0) + printf("checking line %zd...\n", lineno); + + if (buf[0] == '#') continue; + + while (buf[bi]) { + bi = skipspaces(buf, bi); + if (buf[bi] == '/') { /* grapheme break */ + src[si++] = '/'; + bi++; + } + else if (buf[bi] == '+') { /* no break */ + bi++; + } + else if (buf[bi] == '#') { /* start of comments */ + break; + } + else { /* hex-encoded codepoint */ + bi += encode((char*) (src + si), buf + bi) - 1; + while (src[si]) ++si; /* advance to NUL termination */ + } + } + if (si && src[si-1] == '/') + --si; /* no break after final grapheme */ + src[si] = 0; /* NUL-terminate */ + + if (si) { + uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ + size_t i = 0, j = 0; + ssize_t glen; + uint8_t *g; /* utf8proc_map grapheme results */ + while (i < si) { + if (src[i] != '/') + utf8[j++] = src[i++]; + else + i++; + } + glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND); + if (glen == UTF8PROC_ERROR_INVALIDUTF8) { + /* the test file contains surrogate codepoints, which are only for UTF-16 */ + printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); + } + else { + check(glen >= 0, "utf8proc_map error = %s", + utf8proc_errmsg(glen)); + for (i = 0; i <= glen; ++i) + if (g[i] == 0xff) + g[i] = '/'; /* easier-to-read output (/ is not in test strings) */ + check(!strcmp((char*)g, (char*)src), + "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); + } + free(g); + } + } + fclose(f); + printf("Passed tests after %zd lines!\n", lineno); + return 0; +} diff --git a/test/normtest.c b/test/normtest.c @@ -0,0 +1,64 @@ +#include "tests.h" + +#define CHECK_NORM(NRM, norm, src) { \ + char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \ + check(!strcmp(norm, src_norm), \ + "normalization failed for %s -> %s", src, 
norm); \ + free(src_norm); \ +} + +int main(int argc, char **argv) +{ + char *buf = NULL; + size_t bufsize = 0; + FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL; + char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024]; + + check(f != NULL, "error opening NormalizationTest.txt"); + while (getline(&buf, &bufsize, f) > 0) { + size_t offset; + lineno += 1; + + if (buf[0] == '@') { + printf("line %zd: %s", lineno, buf + 1); + continue; + } + else if (lineno % 1000 == 0) + printf("checking line %zd...\n", lineno); + + if (buf[0] == '#') continue; + + offset = encode(source, buf); + offset += encode(NFC, buf + offset); + offset += encode(NFD, buf + offset); + offset += encode(NFKC, buf + offset); + offset += encode(NFKD, buf + offset); + + CHECK_NORM(NFC, NFC, source); + CHECK_NORM(NFC, NFC, NFC); + CHECK_NORM(NFC, NFC, NFD); + CHECK_NORM(NFC, NFKC, NFKC); + CHECK_NORM(NFC, NFKC, NFKD); + + CHECK_NORM(NFD, NFD, source); + CHECK_NORM(NFD, NFD, NFC); + CHECK_NORM(NFD, NFD, NFD); + CHECK_NORM(NFD, NFKD, NFKC); + CHECK_NORM(NFD, NFKD, NFKD); + + CHECK_NORM(NFKC, NFKC, source); + CHECK_NORM(NFKC, NFKC, NFC); + CHECK_NORM(NFKC, NFKC, NFD); + CHECK_NORM(NFKC, NFKC, NFKC); + CHECK_NORM(NFKC, NFKC, NFKD); + + CHECK_NORM(NFKD, NFKD, source); + CHECK_NORM(NFKD, NFKD, NFC); + CHECK_NORM(NFKD, NFKD, NFD); + CHECK_NORM(NFKD, NFKD, NFKC); + CHECK_NORM(NFKD, NFKD, NFKD); + } + fclose(f); + printf("Passed tests after %zd lines!\n", lineno); + return 0; +} diff --git a/printproperty.c b/test/printproperty.c diff --git a/test/tests.h b/test/tests.h @@ -0,0 +1,53 @@ +/* Common functions and includes for our test programs. */ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <string.h> +#include <stdarg.h> + +#include "../utf8proc.h" + +size_t lineno = 0; + +void check(int cond, const char *format, ...) 
+{ + if (!cond) { + va_list args; + fprintf(stderr, "line %zd: ", lineno); + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + fprintf(stderr, "\n"); + exit(1); + } +} + +size_t skipspaces(const char *buf, size_t i) +{ + while (isspace(buf[i])) ++i; + return i; +} + +/* if buf points to a sequence of codepoints encoded as hexadecimal strings, + separated by whitespace, and terminated by any character not in + [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string + in dest, returning the number of bytes read from buf */ +size_t encode(char *dest, const char *buf) +{ + size_t i = 0, j, d = 0; + do { + int c; + i = skipspaces(buf, i); + for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j) + ; /* find end of hex input */ + if (j == i) { /* no codepoint found */ + dest[d] = 0; /* NUL-terminate destination string */ + return i + 1; + } + check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i); + i = j; /* skip to char after hex input */ + d += utf8proc_encode_char(c, (uint8_t *) (dest + d)); + } while (1); +} + diff --git a/tests.h b/tests.h @@ -1,53 +0,0 @@ -/* Common functions and includes for our test programs. */ - -#include <stdio.h> -#include <stdlib.h> -#include <ctype.h> -#include <string.h> -#include <stdarg.h> - -#include "utf8proc.h" - -size_t lineno = 0; - -void check(int cond, const char *format, ...) 
-{ - if (!cond) { - va_list args; - fprintf(stderr, "line %zd: ", lineno); - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - fprintf(stderr, "\n"); - exit(1); - } -} - -size_t skipspaces(const char *buf, size_t i) -{ - while (isspace(buf[i])) ++i; - return i; -} - -/* if buf points to a sequence of codepoints encoded as hexadecimal strings, - separated by whitespace, and terminated by any character not in - [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string - in dest, returning the number of bytes read from buf */ -size_t encode(char *dest, const char *buf) -{ - size_t i = 0, j, d = 0; - do { - int c; - i = skipspaces(buf, i); - for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j) - ; /* find end of hex input */ - if (j == i) { /* no codepoint found */ - dest[d] = 0; /* NUL-terminate destination string */ - return i + 1; - } - check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i); - i = j; /* skip to char after hex input */ - d += utf8proc_encode_char(c, (uint8_t *) (dest + d)); - } while (1); -} -