utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 91a77d75885527263bb2cd79173bbf8494994c09
parent af06f858e1f5c117d1f19a5cd1bde9cf2fbbac55
Author: Steven G. Johnson <stevenj@mit.edu>
Date:   Thu,  7 Aug 2014 16:52:16 -0400

added normalization and encoding test for #13

Diffstat:
M .gitignore | 3 +--
M Makefile | 13 ++++++++++++-
A normtest.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 120 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -13,5 +13,4 @@
 bench/bench
 bench/icu
 bench/unistring
-
-
+normtest
diff --git a/Makefile b/Makefile
@@ -45,7 +45,6 @@ CompositionExclusions.txt:
 CaseFolding.txt:
 	$(CURL) -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
 
-
 utf8proc.o: mojibake.h utf8proc.c utf8proc_data.c
 	$(cc) -c -o utf8proc.o utf8proc.c
 
@@ -59,3 +58,15 @@ libmojibake.so: utf8proc.o
 
 libmojibake.dylib: utf8proc.o
 	$(cc) -dynamiclib -o $@ $^ -install_name $(libdir)/$@
+
+
+# Test programs
+
+NormalizationTest.txt:
+	$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
+
+normtest: normtest.c utf8proc.o mojibake.h
+	$(cc) normtest.c utf8proc.o -o normtest
+
+check: normtest NormalizationTest.txt
+	./normtest
diff --git a/normtest.c b/normtest.c
@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "mojibake.h"
+
+size_t lineno = 0;
+
+void check(int cond, const char *format, ...)
+{
+     if (!cond) {
+          va_list args;
+          fprintf(stderr, "line %zd: ", lineno);
+          va_start(args, format);
+          vfprintf(stderr, format, args);
+          va_end(args);
+          fprintf(stderr, "\n");
+          exit(1);
+     }
+}
+
+/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
+   separated by whitespace, and terminated by any character not in
+   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
+   in dest, returning the number of bytes read from buf */
+size_t encode(char *dest, const char *buf)
+{
+     size_t i = 0, j, d = 0;
+     do {
+          int c;
+          while (isspace(buf[i])) ++i; /* skip whitespace */
+          for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j)
+               ; /* find end of hex input */
+          if (j == i) { /* no codepoint found */
+               dest[d] = 0; /* NUL-terminate destination string */
+               return i + 1;
+          }
+          check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
+          i = j; /* skip to char after hex input */
+          d += utf8proc_encode_char(c, (uint8_t *) (dest + d));
+     } while (1);
+}
+
+#define CHECK_NORM(NRM, norm, src) { \
+    char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \
+    check(!strcmp(norm, src_norm), \
+          "normalization failed for %s -> %s", src, norm); \
+    free(src_norm); \
+}
+
+int main(void)
+{
+     char *buf = NULL;
+     size_t bufsize = 0;
+     FILE *f = fopen("NormalizationTest.txt", "r");
+     char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024];
+
+     check(f != NULL, "error opening NormalizationTest.txt");
+     while (getline(&buf, &bufsize, f) > 0) {
+          size_t offset;
+          lineno += 1;
+
+          if (buf[0] == '@') {
+               printf("line %zd: %s", lineno, buf + 1);
+               continue;
+          }
+          else if (lineno % 1000 == 0)
+               printf("checking line %zd...\n", lineno);
+
+          if (buf[0] == '#') continue;
+
+          offset = encode(source, buf);
+          offset += encode(NFC, buf + offset);
+          offset += encode(NFD, buf + offset);
+          offset += encode(NFKC, buf + offset);
+          offset += encode(NFKD, buf + offset);
+
+          CHECK_NORM(NFC, NFC, source);
+          CHECK_NORM(NFC, NFC, NFC);
+          CHECK_NORM(NFC, NFC, NFD);
+          CHECK_NORM(NFC, NFKC, NFKC);
+          CHECK_NORM(NFC, NFKC, NFKD);
+
+          CHECK_NORM(NFD, NFD, source);
+          CHECK_NORM(NFD, NFD, NFC);
+          CHECK_NORM(NFD, NFD, NFD);
+          CHECK_NORM(NFD, NFKD, NFKC);
+          CHECK_NORM(NFD, NFKD, NFKD);
+
+          CHECK_NORM(NFKC, NFKC, source);
+          CHECK_NORM(NFKC, NFKC, NFC);
+          CHECK_NORM(NFKC, NFKC, NFD);
+          CHECK_NORM(NFKC, NFKC, NFKC);
+          CHECK_NORM(NFKC, NFKC, NFKD);
+
+          CHECK_NORM(NFKD, NFKD, source);
+          CHECK_NORM(NFKD, NFKD, NFC);
+          CHECK_NORM(NFKD, NFKD, NFD);
+          CHECK_NORM(NFKD, NFKD, NFKC);
+          CHECK_NORM(NFKD, NFKD, NFKD);
+     }
+     fclose(f);
+     printf("Passed tests after %zd lines!\n", lineno);
+     return 0;
+}