utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 610730f2314f4cdb52c64e2ef78a9d5d69402b66
parent 0520d6f7243c7a2507fe11e9e67fce2697ab2f95
Author: Mike Glorioso <mike.glorioso@gmail.com>
Date:   Thu, 14 Jan 2021 12:59:49 -0500

Fix Sign-Conversion warnings in library and test code (#214)

* JuliaStrings#169 turn on sign-conversion warnings

Signed-off-by: Mike Glorioso <mike.glorioso@gmail.com>

* JuliaStrings#169 fix sign-conversion warnings for utf8proc.c

fix sign-conversion warnings for utf8proc_iterate
uc requires at most 21 bits to identify a unicode codepoint, so there is no need for it to be unsigned
multiple locations use, modify, or store uc with a signed value
the only exception is line 137 where uc is compared with an unsigned value

fix sign-conversion warnings for utf8proc_tolower, utf8proc_toupper, utf8proc_totitle
all three methods have sign conversion warnings when calling seqindex_decode_index
seqindex_decode_index uses the passed value as an index to an array utf8proc_sequences
as utf8proc_sequences is hard-coded and smaller than 2^31 - 1 we can safely cast to unsigned

fix sign-conversion warnings for utf8proc_decompose_char
lines with this warning use the defined function utf8proc_decompose_lump
in the function, a hardcoded unsigned value (1<<12) is complemented then cast as a signed value
as the intent is to remove the 12th bit flag from options (a signed value), an explicit cast is safe

fix sign-conversion warnings for utf8proc_map_custom
result is declared as signed, but is only expected to contain values between 0 and 4
sizeof returns an unsigned value. result must be cast to unsigned

Signed-off-by: Mike Glorioso <mike.glorioso@gmail.com>

* JuliaStrings#169 fix sign-conversion warnings for test/*

fix sign-conversion warnings for test/tests.c encode
change type for d to match return value of utf8proc_encode_char

fix sign-conversion warnings for test/graphemetest.c checkline
si, i, and j are unsigned size types, utf8proc_map and utf8proc_iterate accept and return signed size types
utf8proc_map treats negative strlen values as 0. the strlen used by the test must be similarly limited
utf8proc_iterate treats negative strlen values as 4 which will be less than the unsigned size
fix unused-but-set-variable warning by checking the glen value

fix sign-conversion warnings for test/case.c main
the if block ensures that tested codepoint fits in wint_t, but needs to include u and l as well
c, u, and l can be safely cast to wint_t

fix sign-conversion warnings for test/iterate.c
all values used for len are below 8, so an explicit cast is safe
updated types for more portable test code

fix sign-conversion warnings for test/printproperty.c main
change type of c to signed to resolve all sign-conversion warnings.
replace sscanf(... &c) with sscanf(... &x) followed by explicit sign conversion

Signed-off-by: Mike Glorioso <mike.glorioso@gmail.com>
Diffstat:
M Makefile | 2 +-
M test/case.c | 12 ++++++------
M test/graphemetest.c | 5 +++--
M test/iterate.c | 12 ++++++------
M test/printproperty.c | 6 ++++--
M test/tests.c | 3 ++-
M utf8proc.c | 14 +++++++-------
7 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/Makefile b/Makefile @@ -11,7 +11,7 @@ PERL=perl CFLAGS ?= -O2 PICFLAG = -fPIC C99FLAG = -std=c99 -WCFLAGS = -Wall -Wextra -pedantic +WCFLAGS = -Wsign-conversion -Wall -Wextra -pedantic UCFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES) LDFLAG_SHARED = -shared SOFLAG = -Wl,-soname diff --git a/test/case.c b/test/case.c @@ -26,27 +26,27 @@ int main(int argc, char **argv) ++error; } - if (sizeof(wint_t) > 2 || c < (1<<16)) { - wint_t l0 = towlower(c), u0 = towupper(c); + if (sizeof(wint_t) > 2 || (c < (1<<16) && u < (1<<16) && l < (1<<16))) { + wint_t l0 = towlower((wint_t)c), u0 = towupper((wint_t)c); /* OS unicode tables may be out of date. But if they do have a lower/uppercase mapping, hopefully it is correct? */ - if (l0 != c && l0 != l) { + if (l0 != (wint_t)c && l0 != (wint_t)l) { fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n", l, c, l0); ++error; } - else if (l0 != l) { /* often true for out-of-date OS unicode */ + else if (l0 != (wint_t)l) { /* often true for out-of-date OS unicode */ ++better; /* printf("%x != towlower(%x) == %x\n", l, c, l0); */ } - if (u0 != c && u0 != u) { + if (u0 != (wint_t)c && u0 != (wint_t)u) { fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n", u, c, u0); ++error; } - else if (u0 != u) { /* often true for out-of-date OS unicode */ + else if (u0 != (wint_t)u) { /* often true for out-of-date OS unicode */ ++better; /* printf("%x != towupper(%x) == %x\n", u, c, u0); */ } diff --git a/test/graphemetest.c b/test/graphemetest.c @@ -43,7 +43,7 @@ void checkline(const char *_buf, bool verbose) { else i++; } - glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND); + glen = utf8proc_map(utf8, (utf8proc_ssize_t)j, &g, UTF8PROC_CHARBOUND); if (glen == UTF8PROC_ERROR_INVALIDUTF8) { /* the test file contains surrogate codepoints, which are only for UTF-16 */ printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); @@ -66,7 +66,7 @@ void checkline(const char *_buf, 
bool verbose) { utf8proc_bool expectbreak = false; do { utf8proc_int32_t codepoint; - i += utf8proc_iterate(src + i, si - i, &codepoint); + i += (size_t)utf8proc_iterate(src + i, (utf8proc_ssize_t)(si - i), &codepoint); check(codepoint >= 0, "invalid UTF-8 data"); if (codepoint == 0x002F) expectbreak = true; @@ -110,6 +110,7 @@ int main(int argc, char **argv) utf8proc_uint8_t *g; glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND); check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks"); + check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks"); free(g); }; diff --git a/test/iterate.c b/test/iterate.c @@ -8,7 +8,7 @@ static int error; #define CHECKVALID(pos, val, len) buf[pos] = val; testbytes(buf,len,len,__LINE__) #define CHECKINVALID(pos, val, len) buf[pos] = val; testbytes(buf,len,UTF8PROC_ERROR_INVALIDUTF8,__LINE__) -static void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int line) +static void testbytes(utf8proc_uint8_t *buf, utf8proc_ssize_t len, utf8proc_ssize_t retval, int line) { utf8proc_int32_t out[16]; utf8proc_ssize_t ret; @@ -16,13 +16,13 @@ static void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int /* Make a copy to ensure that memory is left uninitialized after "len" * bytes. This way, Valgrind can detect overreads. 
*/ - unsigned char tmp[16]; - memcpy(tmp, buf, len); + utf8proc_uint8_t tmp[16]; + memcpy(tmp, buf, (unsigned long int)len); tests++; if ((ret = utf8proc_iterate(tmp, len, out)) != retval) { fprintf(stderr, "Failed (%d):", line); - for (int i = 0; i < len ; i++) { + for (utf8proc_ssize_t i = 0; i < len ; i++) { fprintf(stderr, " 0x%02x", tmp[i]); } fprintf(stderr, " -> %zd\n", ret); @@ -32,8 +32,8 @@ static void testbytes(unsigned char *buf, int len, utf8proc_ssize_t retval, int int main(int argc, char **argv) { - uint32_t byt; - unsigned char buf[16]; + utf8proc_int32_t byt; + utf8proc_uint8_t buf[16]; (void) argc; (void) argv; /* unused */ diff --git a/test/printproperty.c b/test/printproperty.c @@ -8,12 +8,14 @@ int main(int argc, char **argv) for (i = 1; i < argc; ++i) { utf8proc_uint8_t cstr[16], *map; - unsigned int c; + utf8proc_uint32_t x; + utf8proc_int32_t c; if (!strcmp(argv[i], "-V")) { printf("utf8proc version %s\n", utf8proc_version()); continue; } - check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]); + check(sscanf(argv[i],"%x", &x) == 1, "invalid hex input %s", argv[i]); + c = (utf8proc_int32_t)x; const utf8proc_property_t *p = utf8proc_get_property(c); if (utf8proc_codepoint_valid(c)) diff --git a/test/tests.c b/test/tests.c @@ -29,7 +29,8 @@ size_t skipspaces(const unsigned char *buf, size_t i) in dest, returning the number of bytes read from buf */ size_t encode(unsigned char *dest, const unsigned char *buf) { - size_t i = 0, j, d = 0; + size_t i = 0, j; + utf8proc_ssize_t d = 0; for (;;) { int c; i = skipspaces(buf, i); diff --git a/utf8proc.c b/utf8proc.c @@ -125,7 +125,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst ) { - utf8proc_uint32_t uc; + utf8proc_int32_t uc; const utf8proc_uint8_t *end; *dst = -1; @@ -137,7 +137,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t 
utf8proc_iterate( return 1; } // Must be between 0xc2 and 0xf4 inclusive to be valid - if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; + if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; if (uc < 0xe0) { // 2-byte sequence // Must have valid continuation character if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; @@ -376,19 +376,19 @@ static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqinde UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) { utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; - return cl != UINT16_MAX ? seqindex_decode_index(cl) : c; + return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c; } UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) { utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; - return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; + return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; } UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) { utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; - return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; + return cu != UINT16_MAX ? 
seqindex_decode_index((utf8proc_uint32_t)cu) : c; } UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c) @@ -420,7 +420,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { #define utf8proc_decompose_lump(replacement_uc) \ return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ - options & ~UTF8PROC_LUMP, last_boundclass) + options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { const utf8proc_property_t *property; @@ -735,7 +735,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( *dstptr = NULL; result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); if (result < 0) return result; - buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); + buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1); if (!buffer) return UTF8PROC_ERROR_NOMEM; result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); if (result < 0) {