utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit a5c9de204717d619ddb2fa4eb994542bb3cfc157
parent f67f1a72f7bbaac8dbf08a183193ac82641cd34b
Author: Steven G. Johnson <stevenj@mit.edu>
Date:   Sat, 31 Oct 2015 18:13:25 -0400

Merge pull request #58 from petercolberg/master

Fix build warnings
Diffstat:
M .gitignore | 3 ++-
M .travis.yml | 3 ++-
A MANIFEST | 7 +++++++
M Makefile | 56 ++++++++++++++++++++++++++++++++++++--------------------
M data/Makefile | 3 ++-
M data/charwidths.jl | 18 ++++++++++++------
M test/charwidth.c | 2 +-
M test/iterate.c | 2 +-
M test/printproperty.c | 2 +-
A test/tests.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
M test/tests.h | 54 ++++++++++++------------------------------------------
11 files changed, 122 insertions(+), 74 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,7 @@
 *.dylib
 *.dSYM
 *.out
+*.new
 data/*.txt
 data/*.ttf
 data/*.sfd
@@ -18,9 +19,9 @@ bench/icu
 bench/unistring
 normtest
 graphemetest
-utf8proc_data.c.new
 printproperty
 charwidth
 valid
 iterate
 case
+/tmp/
diff --git a/.travis.yml b/.travis.yml
@@ -10,9 +10,10 @@ before_install:
     - sudo apt-get update -qq -y
     - sudo apt-get install libpcre3-dev julia fontforge -y
 script:
-    - make prefix=`pwd`/local install
+    - make manifest && diff MANIFEST.new MANIFEST
     - make check
     - make data && diff data/utf8proc_data.c.new utf8proc_data.c
+    - make clean && git status --ignored --porcelain && test -z "$(git status --ignored --porcelain)"
     - (mkdir build_static && cd build_static && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON && make)
     - (mkdir build_shared && cd build_shared && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_SHARED_LIBS=ON && make)
 env:
diff --git a/MANIFEST b/MANIFEST
@@ -0,0 +1,7 @@
+include/
+include/utf8proc.h
+lib/
+lib/libutf8proc.a
+lib/libutf8proc.so -> libutf8proc.so.1.3.0
+lib/libutf8proc.so.1 -> libutf8proc.so.1.3.0
+lib/libutf8proc.so.1.3.0
diff --git a/Makefile b/Makefile
@@ -5,6 +5,7 @@ MAKE=make
 AR?=ar
 CC?=gcc
 INSTALL=install
+FIND=find
 
 # compiler settings
 CFLAGS ?= -O2
@@ -17,7 +18,7 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
 # from the utf8proc version number because it indicates ABI compatibility,
 # not API compatibility: MAJOR should be incremented whenever *binary*
 # compatibility is broken, even if the API is backward-compatible
-# Be sure to also update these in CMakeLists.txt!
+# Be sure to also update these in MANIFEST and CMakeLists.txt!
 MAJOR=1
 MINOR=3
 PATCH=0
@@ -38,12 +39,17 @@ includedir=$(prefix)/include
 
 # meta targets
 
-.PHONY: all, clean, update, data
+.PHONY: all clean data update manifest install
 
 all: libutf8proc.a libutf8proc.$(SHLIB_EXT)
 
 clean:
-	rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate
+	rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT)
+ifneq ($(OS),Darwin)
+	rm -f libutf8proc.so.$(MAJOR)
+endif
+	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case
+	rm -rf MANIFEST.new tmp
 	$(MAKE) -C bench clean
 	$(MAKE) -C data clean
 
@@ -52,6 +58,8 @@ data: data/utf8proc_data.c.new
 update: data/utf8proc_data.c.new
 	cp -f data/utf8proc_data.c.new utf8proc_data.c
 
+manifest: MANIFEST.new
+
 # real targets
 
 data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl
@@ -84,12 +92,17 @@ install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
 	mkdir -m 755 -p $(DESTDIR)$(libdir)
 	$(INSTALL) -m 644 libutf8proc.a $(DESTDIR)$(libdir)
 	$(INSTALL) -m 755 libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)
-	ln -f -s $(libdir)/libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.$(SHLIB_EXT)
+	ln -f -s libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.$(SHLIB_EXT)
 ifneq ($(OS),Darwin)
-	ln -f -s $(libdir)/libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.so.$(MAJOR)
-	ln -f -s $(libdir)/libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.so.$(MAJOR).$(MINOR)
+	ln -f -s libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.so.$(MAJOR)
 endif
 
+MANIFEST.new:
+	rm -rf tmp
+	$(MAKE) install prefix=/usr DESTDIR=$(PWD)/tmp
+	$(FIND) tmp/usr -mindepth 1 -type l -printf "%P -> %l\n" -or -type f -printf "%P\n" -or -type d -printf "%P/\n" | LC_ALL=C sort > $@
+	rm -rf tmp
+
 # Test programs
 
 data/NormalizationTest.txt:
@@ -98,26 +111,29 @@ data/NormalizationTest.txt:
 data/GraphemeBreakTest.txt:
 	$(MAKE) -C data GraphemeBreakTest.txt
 
-test/normtest: test/normtest.c utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) test/normtest.c utf8proc.o -o $@
+test/tests.o: test/tests.c test/tests.h utf8proc.h
+	$(CC) $(UCFLAGS) -c -o test/tests.o test/tests.c
+
+test/normtest: test/normtest.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) test/normtest.c test/tests.o utf8proc.o -o $@
 
-test/graphemetest: test/graphemetest.c utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) test/graphemetest.c utf8proc.o -o $@
+test/graphemetest: test/graphemetest.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) test/graphemetest.c test/tests.o utf8proc.o -o $@
 
-test/printproperty: test/printproperty.c utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) test/printproperty.c utf8proc.o -o $@
+test/printproperty: test/printproperty.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) test/printproperty.c test/tests.o utf8proc.o -o $@
 
-test/charwidth: test/charwidth.c utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) test/charwidth.c utf8proc.o -o $@
+test/charwidth: test/charwidth.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) test/charwidth.c test/tests.o utf8proc.o -o $@
 
-test/valid: test/valid.c utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) test/valid.c utf8proc.o -o $@
+test/valid: test/valid.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) test/valid.c test/tests.o utf8proc.o -o $@
 
-test/iterate: test/iterate.c utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) test/iterate.c utf8proc.o -o $@
+test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) test/iterate.c test/tests.o utf8proc.o -o $@
 
-test/case: test/case.c utf8proc.o utf8proc.h test/tests.h
-	$(CC) $(UCFLAGS) test/case.c utf8proc.o -o $@
+test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@
 
 check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
diff --git a/data/Makefile b/data/Makefile
@@ -59,4 +59,5 @@ GraphemeBreakTest.txt:
 	$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
 
 clean:
-	rm -f UnicodeData.txt EastAsianWidth.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
+	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
+	rm -f utf8proc_data.c.new
diff --git a/data/charwidths.jl b/data/charwidths.jl
@@ -8,8 +8,14 @@
 
 #############################################################################
 # Julia 0.3/0.4 compatibility (taken from Compat package)
+if VERSION < v"0.4.0-dev+1387"
+    typealias AbstractString String
+end
 if VERSION < v"0.4.0-dev+1419"
-    const UInt16 = Uint16
+    const UInt32 = Uint32
+end
+if VERSION < v"0.4.0-dev+3874"
+    Base.parse{T<:Integer}(::Type{T}, s::AbstractString) = parseint(T, s)
 end
 
 CharWidths = Dict{Int,Int}()
@@ -52,7 +58,7 @@ end
 # Widths from GNU Unifont
 
 #Read sfdfile for character widths
-function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
+function parsesfd(filename::AbstractString, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
     state=:seekchar
     lineno = 0
     codepoint = width = nothing
@@ -65,8 +71,8 @@ function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
                 state = :readdata
             end
         elseif state==:readdata #Encoding: 65538 -1 2, Width: 1024
-            contains(line, "Encoding:") && (codepoint = int(split(line)[3]))
-            contains(line, "Width:") && (width = int(split(line)[2]))
+            contains(line, "Encoding:") && (codepoint = parse(Int, split(line)[3]))
+            contains(line, "Width:") && (width = parse(Int, split(line)[2]))
             if codepoint!=nothing && width!=nothing && codepoint >= 0
                 w=div(width, 512) # 512 units to the en
                 if w > 0
@@ -100,8 +106,8 @@ for line in readlines(open("EastAsianWidth.txt"))
     width = strip(tokens[2])
     #Parse code point range into Julia UnitRange
     rangetokens = split(charrange, "..")
-    charstart = uint32("0x"*rangetokens[1])
-    charend = uint32("0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])
+    charstart = parse(UInt32, "0x"*rangetokens[1])
+    charend = parse(UInt32, "0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])
 
     #Assign widths
     for c in charstart:charend
diff --git a/test/charwidth.c b/test/charwidth.c
@@ -2,7 +2,7 @@
 #include <ctype.h>
 #include <wchar.h>
 
-int my_isprint(int c) {
+static int my_isprint(int c) {
      int cat = utf8proc_get_property(c)->category;
      return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
           (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd);
diff --git a/test/iterate.c b/test/iterate.c
@@ -8,7 +8,7 @@ static int error;
 #define CHECKVALID(pos, val, len) buf[pos] = val; testbytes(buf,len,len,__LINE__)
 #define CHECKINVALID(pos, val, len) buf[pos] = val; testbytes(buf,len,UTF8PROC_ERROR_INVALIDUTF8,__LINE__)
 
-void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int line)
+static void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int line)
 {
     utf8proc_int32_t out[16];
     utf8proc_ssize_t ret;
diff --git a/test/printproperty.c b/test/printproperty.c
@@ -7,7 +7,7 @@ int main(int argc, char **argv)
      int i;
 
      for (i = 1; i < argc; ++i) {
-          int c;
+          unsigned int c;
           if (!strcmp(argv[i], "-V")) {
                printf("utf8proc version %s\n", utf8proc_version());
                continue;
diff --git a/test/tests.c b/test/tests.c
@@ -0,0 +1,46 @@
+/* Common functions for our test programs. */
+
+#include "tests.h"
+
+size_t lineno = 0;
+
+void check(int cond, const char *format, ...)
+{
+     if (!cond) {
+          va_list args;
+          fprintf(stderr, "line %zd: ", lineno);
+          va_start(args, format);
+          vfprintf(stderr, format, args);
+          va_end(args);
+          fprintf(stderr, "\n");
+          exit(1);
+     }
+}
+
+size_t skipspaces(const char *buf, size_t i)
+{
+     while (isspace(buf[i])) ++i;
+     return i;
+}
+
+/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
+   separated by whitespace, and terminated by any character not in
+   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
+   in dest, returning the number of bytes read from buf */
+size_t encode(char *dest, const char *buf)
+{
+     size_t i = 0, j, d = 0;
+     for (;;) {
+          int c;
+          i = skipspaces(buf, i);
+          for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j)
+               ; /* find end of hex input */
+          if (j == i) { /* no codepoint found */
+               dest[d] = 0; /* NUL-terminate destination string */
+               return i + 1;
+          }
+          check(sscanf(buf + i, "%x", (unsigned int *)&c) == 1, "invalid hex input %s", buf+i);
+          i = j; /* skip to char after hex input */
+          d += utf8proc_encode_char(c, (utf8proc_uint8_t *) (dest + d));
+     }
+}
diff --git a/test/tests.h b/test/tests.h
@@ -1,5 +1,13 @@
 /* Common functions and includes for our test programs. */
 
+/*
+ * Set feature macro to enable getline() and wcwidth().
+ *
+ * Please refer to section 2.2.1 of POSIX.1-2008:
+ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_02_01_02
+ */
+#define _XOPEN_SOURCE 700
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>
@@ -8,46 +16,8 @@
 
 #include "../utf8proc.h"
 
-size_t lineno = 0;
-
-void check(int cond, const char *format, ...)
-{
-     if (!cond) {
-          va_list args;
-          fprintf(stderr, "line %zd: ", lineno);
-          va_start(args, format);
-          vfprintf(stderr, format, args);
-          va_end(args);
-          fprintf(stderr, "\n");
-          exit(1);
-     }
-}
-
-size_t skipspaces(const char *buf, size_t i)
-{
-     while (isspace(buf[i])) ++i;
-     return i;
-}
-
-/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
-   separated by whitespace, and terminated by any character not in
-   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
-   in dest, returning the number of bytes read from buf */
-size_t encode(char *dest, const char *buf)
-{
-     size_t i = 0, j, d = 0;
-     for (;;) {
-          int c;
-          i = skipspaces(buf, i);
-          for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j)
-               ; /* find end of hex input */
-          if (j == i) { /* no codepoint found */
-               dest[d] = 0; /* NUL-terminate destination string */
-               return i + 1;
-          }
-          check(sscanf(buf + i, "%x", (unsigned int *)&c) == 1, "invalid hex input %s", buf+i);
-          i = j; /* skip to char after hex input */
-          d += utf8proc_encode_char(c, (utf8proc_uint8_t *) (dest + d));
-     }
-}
+extern size_t lineno;
+void check(int cond, const char *format, ...);
+size_t skipspaces(const char *buf, size_t i);
+size_t encode(char *dest, const char *buf);
 