utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 7c14ef5f8371e463a01e0f1de971caa600384390
parent 498ecbddd87f2555a730e90810db7744cf416b82
Author: Steven G. Johnson <stevenj@mit.edu>
Date:   Thu,  9 Apr 2015 11:36:40 -0400

Merge pull request #32 from JuliaLang/tk/ssize_t_typedef

Use a typedef instead of a #define for ssize_t with MSVC
Diffstat:
M data/data_generator.rb | 8 ++++----
M test/graphemetest.c | 8 ++++----
M test/normtest.c | 2 +-
M test/tests.h | 2 +-
M utf8proc.c | 130 ++++++++++++++++++++++++++++++++++++++++----------------------------------------
M utf8proc.h | 85 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M utf8proc_data.c | 8 ++++----
7 files changed, 126 insertions(+), 117 deletions(-)

diff --git a/data/data_generator.rb b/data/data_generator.rb @@ -268,7 +268,7 @@ for code in 0...0x110000 end end -$stdout << "const int32_t utf8proc_sequences[] = {\n " +$stdout << "const utf8proc_int32_t utf8proc_sequences[] = {\n " i = 0 $int_array.each do |entry| i += 1 @@ -280,7 +280,7 @@ $int_array.each do |entry| end $stdout << "};\n\n" -$stdout << "const uint16_t utf8proc_stage1table[] = {\n " +$stdout << "const utf8proc_uint16_t utf8proc_stage1table[] = {\n " i = 0 stage1.each do |entry| i += 1 @@ -292,7 +292,7 @@ stage1.each do |entry| end $stdout << "};\n\n" -$stdout << "const uint16_t utf8proc_stage2table[] = {\n " +$stdout << "const utf8proc_uint16_t utf8proc_stage2table[] = {\n " i = 0 stage2.flatten.each do |entry| i += 1 @@ -311,7 +311,7 @@ properties.each { |line| } $stdout << "};\n\n" -$stdout << "const int32_t utf8proc_combinations[] = {\n " +$stdout << "const utf8proc_int32_t utf8proc_combinations[] = {\n " i = 0 comb1st_indicies.keys.each_index do |a| comb2nd_indicies.keys.each_index do |b| diff --git a/test/graphemetest.c b/test/graphemetest.c @@ -5,7 +5,7 @@ int main(int argc, char **argv) char *buf = NULL; size_t bufsize = 0; FILE *f = argc > 1 ? 
fopen(argv[1], "r") : NULL; - uint8_t src[1024]; + utf8proc_uint8_t src[1024]; check(f != NULL, "error opening GraphemeBreakTest.txt"); while (getline(&buf, &bufsize, f) > 0) { @@ -39,10 +39,10 @@ int main(int argc, char **argv) src[si] = 0; /* NUL-terminate */ if (si) { - uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ + utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ size_t i = 0, j = 0; - ssize_t glen; - uint8_t *g; /* utf8proc_map grapheme results */ + utf8proc_ssize_t glen; + utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ while (i < si) { if (src[i] != '/') utf8[j++] = src[i++]; diff --git a/test/normtest.c b/test/normtest.c @@ -1,7 +1,7 @@ #include "tests.h" #define CHECK_NORM(NRM, norm, src) { \ - char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \ + char *src_norm = (char*) utf8proc_ ## NRM((utf8proc_uint8_t*) src); \ check(!strcmp(norm, src_norm), \ "normalization failed for %s -> %s", src, norm); \ free(src_norm); \ diff --git a/test/tests.h b/test/tests.h @@ -47,7 +47,7 @@ size_t encode(char *dest, const char *buf) } check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i); i = j; /* skip to char after hex input */ - d += utf8proc_encode_char(c, (uint8_t *) (dest + d)); + d += utf8proc_encode_char(c, (utf8proc_uint8_t *) (dest + d)); } while (1); } diff --git a/utf8proc.c b/utf8proc.c @@ -44,7 +44,7 @@ #include "utf8proc_data.c" -UTF8PROC_DLLEXPORT const int8_t utf8proc_utf8class[256] = { +UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -91,7 +91,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) { return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." 
STRINGIZE(UTF8PROC_VERSION_PATCH) ""; } -UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode) { +UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { switch (errcode) { case UTF8PROC_ERROR_NOMEM: return "Memory for processing UTF-8 data could not be allocated."; @@ -108,12 +108,12 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode) { } } -UTF8PROC_DLLEXPORT ssize_t utf8proc_iterate( - const uint8_t *str, ssize_t strlen, int32_t *dst +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst ) { int length; int i; - int32_t uc = -1; + utf8proc_int32_t uc = -1; *dst = -1; if (!strlen) return 0; length = utf8proc_utf8class[str[0]]; @@ -148,14 +148,14 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_iterate( return length; } -UTF8PROC_DLLEXPORT bool utf8proc_codepoint_valid(int32_t uc) { +UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { if (uc < 0 || uc >= 0x110000 || ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) || (uc >= 0xFDD0 && uc < 0xFDF0)) return false; else return true; } -UTF8PROC_DLLEXPORT ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { if (uc < 0x00) { return 0; } else if (uc < 0x80) { @@ -186,7 +186,7 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { } /* internal "unsafe" version that does not check whether uc is in range */ -static const utf8proc_property_t *get_property(int32_t uc) { +static const utf8proc_property_t *get_property(utf8proc_int32_t uc) { /* ASSERT: uc >= 0 && uc < 0x110000 */ return utf8proc_properties + ( utf8proc_stage2table[ @@ -195,12 +195,12 @@ static const utf8proc_property_t *get_property(int32_t uc) { ); } -UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) { +UTF8PROC_DLLEXPORT const utf8proc_property_t 
*utf8proc_get_property(utf8proc_int32_t uc) { return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc); } /* return whether there is a grapheme break between boundclasses lbc and tbc */ -static bool grapheme_break(int lbc, int tbc) { +static utf8proc_bool grapheme_break(int lbc, int tbc) { return (lbc == UTF8PROC_BOUNDCLASS_START) ? true : (lbc == UTF8PROC_BOUNDCLASS_CR && @@ -226,22 +226,22 @@ static bool grapheme_break(int lbc, int tbc) { } /* return whether there is a grapheme break between codepoints c1 and c2 */ -UTF8PROC_DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) { +UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) { return grapheme_break(utf8proc_get_property(c1)->boundclass, utf8proc_get_property(c2)->boundclass); } /* return a character width analogous to wcwidth (except portable and hopefully less buggy than most system wcwidth functions). */ -UTF8PROC_DLLEXPORT int utf8proc_charwidth(int32_t c) { +UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { return utf8proc_get_property(c)->charwidth; } -UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(int32_t c) { +UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { return utf8proc_get_property(c)->category; } -UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t c) { +UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; return s[utf8proc_category(c)]; } @@ -250,17 +250,17 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t c) { return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ options & ~UTF8PROC_LUMP, last_boundclass) -UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, utf8proc_option_t options, 
int *last_boundclass) { +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { const utf8proc_property_t *property; utf8proc_propval_t category; - int32_t hangul_sindex; + utf8proc_int32_t hangul_sindex; if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; property = get_property(uc); category = property->category; hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { - int32_t hangul_tindex; + utf8proc_int32_t hangul_tindex; if (bufsize >= 1) { dst[0] = UTF8PROC_HANGUL_LBASE + hangul_sindex / UTF8PROC_HANGUL_NCOUNT; @@ -312,8 +312,8 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssi } if (options & UTF8PROC_CASEFOLD) { if (property->casefold_mapping) { - const int32_t *casefold_entry; - ssize_t written = 0; + const utf8proc_int32_t *casefold_entry; + utf8proc_ssize_t written = 0; for (casefold_entry = property->casefold_mapping; *casefold_entry >= 0; casefold_entry++) { written += utf8proc_decompose_char(*casefold_entry, dst+written, @@ -327,8 +327,8 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssi if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { if (property->decomp_mapping && (!property->decomp_type || (options & UTF8PROC_COMPAT))) { - const int32_t *decomp_entry; - ssize_t written = 0; + const utf8proc_int32_t *decomp_entry; + utf8proc_ssize_t written = 0; for (decomp_entry = property->decomp_mapping; *decomp_entry >= 0; decomp_entry++) { written += utf8proc_decompose_char(*decomp_entry, dst+written, @@ -340,7 +340,7 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssi } } if (options & UTF8PROC_CHARBOUND) { - bool boundary; + utf8proc_bool boundary; int tbc = property->boundclass; boundary = 
grapheme_break(*last_boundclass, tbc); *last_boundclass = tbc; @@ -354,21 +354,21 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssi return 1; } -UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose( - const uint8_t *str, ssize_t strlen, - int32_t *buffer, ssize_t bufsize, utf8proc_option_t options +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, + utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options ) { /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ - ssize_t wpos = 0; + utf8proc_ssize_t wpos = 0; if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) return UTF8PROC_ERROR_INVALIDOPTS; if ((options & UTF8PROC_STRIPMARK) && !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) return UTF8PROC_ERROR_INVALIDOPTS; { - int32_t uc; - ssize_t rpos = 0; - ssize_t decomp_result; + utf8proc_int32_t uc; + utf8proc_ssize_t rpos = 0; + utf8proc_ssize_t decomp_result; int boundclass = UTF8PROC_BOUNDCLASS_START; while (1) { if (options & UTF8PROC_NULLTERM) { @@ -390,14 +390,14 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose( if (decomp_result < 0) return decomp_result; wpos += decomp_result; /* prohibiting integer overflows due to too long strings: */ - if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2) + if (wpos < 0 || wpos > SSIZE_MAX/sizeof(utf8proc_int32_t)/2) return UTF8PROC_ERROR_OVERFLOW; } } if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { - ssize_t pos = 0; + utf8proc_ssize_t pos = 0; while (pos < wpos-1) { - int32_t uc1, uc2; + utf8proc_int32_t uc1, uc2; const utf8proc_property_t *property1, *property2; uc1 = buffer[pos]; uc2 = buffer[pos+1]; @@ -416,13 +416,13 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose( return wpos; } -UTF8PROC_DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, utf8proc_option_t options) { +UTF8PROC_DLLEXPORT utf8proc_ssize_t 
utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored ASSERT: 'buffer' has one spare byte of free space at the end! */ if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { - ssize_t rpos; - ssize_t wpos = 0; - int32_t uc; + utf8proc_ssize_t rpos; + utf8proc_ssize_t wpos = 0; + utf8proc_int32_t uc; for (rpos = 0; rpos < length; rpos++) { uc = buffer[rpos]; if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; @@ -451,23 +451,23 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, ut length = wpos; } if (options & UTF8PROC_COMPOSE) { - int32_t *starter = NULL; - int32_t current_char; + utf8proc_int32_t *starter = NULL; + utf8proc_int32_t current_char; const utf8proc_property_t *starter_property = NULL, *current_property; utf8proc_propval_t max_combining_class = -1; - ssize_t rpos; - ssize_t wpos = 0; - int32_t composition; + utf8proc_ssize_t rpos; + utf8proc_ssize_t wpos = 0; + utf8proc_int32_t composition; for (rpos = 0; rpos < length; rpos++) { current_char = buffer[rpos]; current_property = get_property(current_char); if (starter && current_property->combining_class > max_combining_class) { /* combination perhaps possible */ - int32_t hangul_lindex; - int32_t hangul_sindex; + utf8proc_int32_t hangul_lindex; + utf8proc_int32_t hangul_sindex; hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { - int32_t hangul_vindex; + utf8proc_int32_t hangul_vindex; hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { *starter = UTF8PROC_HANGUL_SBASE + @@ -480,7 +480,7 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, ut hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && 
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { - int32_t hangul_tindex; + utf8proc_int32_t hangul_tindex; hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { *starter += hangul_tindex; @@ -520,26 +520,26 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, ut length = wpos; } { - ssize_t rpos, wpos = 0; - int32_t uc; + utf8proc_ssize_t rpos, wpos = 0; + utf8proc_int32_t uc; for (rpos = 0; rpos < length; rpos++) { uc = buffer[rpos]; - wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos); + wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); } - ((uint8_t *)buffer)[wpos] = 0; + ((utf8proc_uint8_t *)buffer)[wpos] = 0; return wpos; } } -UTF8PROC_DLLEXPORT ssize_t utf8proc_map( - const uint8_t *str, ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ) { - int32_t *buffer; - ssize_t result; + utf8proc_int32_t *buffer; + utf8proc_ssize_t result; *dstptr = NULL; result = utf8proc_decompose(str, strlen, NULL, 0, options); if (result < 0) return result; - buffer = (int32_t *) malloc(result * sizeof(int32_t) + 1); + buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); if (!buffer) return UTF8PROC_ERROR_NOMEM; result = utf8proc_decompose(str, strlen, buffer, result, options); if (result < 0) { @@ -552,37 +552,37 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_map( return result; } { - int32_t *newptr; - newptr = (int32_t *) realloc(buffer, (size_t)result+1); + utf8proc_int32_t *newptr; + newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1); if (newptr) buffer = newptr; } - *dstptr = (uint8_t *)buffer; + *dstptr = (utf8proc_uint8_t *)buffer; return result; } -UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str) { - uint8_t *retval; +UTF8PROC_DLLEXPORT 
utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { + utf8proc_uint8_t *retval; utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE); return retval; } -UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str) { - uint8_t *retval; +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { + utf8proc_uint8_t *retval; utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE); return retval; } -UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str) { - uint8_t *retval; +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { + utf8proc_uint8_t *retval; utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); return retval; } -UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str) { - uint8_t *retval; +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { + utf8proc_uint8_t *retval; utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT); return retval; diff --git a/utf8proc.h b/utf8proc.h @@ -77,23 +77,32 @@ #include <stdlib.h> #include <sys/types.h> #ifdef _MSC_VER -typedef signed char int8_t; -typedef unsigned char uint8_t; -typedef short int16_t; -typedef unsigned short uint16_t; -typedef int int32_t; +typedef signed char utf8proc_int8_t; +typedef unsigned char utf8proc_uint8_t; +typedef short utf8proc_int16_t; +typedef unsigned short utf8proc_uint16_t; +typedef int utf8proc_int32_t; # ifdef _WIN64 -# define ssize_t __int64 +typedef __int64 utf8proc_ssize_t; # else -# define ssize_t int +typedef int utf8proc_ssize_t; # endif # ifndef __cplusplus -typedef unsigned char bool; +typedef unsigned char utf8proc_bool; enum {false, true}; +# else +typedef bool utf8proc_bool; # endif #else # include <stdbool.h> # include <inttypes.h> +typedef int8_t utf8proc_int8_t; +typedef uint8_t utf8proc_uint8_t; +typedef 
int16_t utf8proc_int16_t; +typedef uint16_t utf8proc_uint16_t; +typedef int32_t utf8proc_int32_t; +typedef ssize_t utf8proc_ssize_t; +typedef bool utf8proc_bool; #endif #include <limits.h> @@ -203,7 +212,7 @@ typedef enum { /* @name Types */ /** Holds the value of a property. */ -typedef int16_t utf8proc_propval_t; +typedef utf8proc_int16_t utf8proc_propval_t; /** Struct containing information about a codepoint. */ typedef struct utf8proc_property_struct { @@ -223,13 +232,13 @@ typedef struct utf8proc_property_struct { * @see utf8proc_decomp_type_t. */ utf8proc_propval_t decomp_type; - const int32_t *decomp_mapping; - const int32_t *casefold_mapping; - int32_t uppercase_mapping; - int32_t lowercase_mapping; - int32_t titlecase_mapping; - int32_t comb1st_index; - int32_t comb2nd_index; + const utf8proc_int32_t *decomp_mapping; + const utf8proc_int32_t *casefold_mapping; + utf8proc_int32_t uppercase_mapping; + utf8proc_int32_t lowercase_mapping; + utf8proc_int32_t titlecase_mapping; + utf8proc_int32_t comb1st_index; + utf8proc_int32_t comb2nd_index; unsigned bidi_mirrored:1; unsigned comp_exclusion:1; /** @@ -351,7 +360,7 @@ typedef enum { * Array containing the byte lengths of a UTF-8 encoded codepoint based * on the first byte. */ -UTF8PROC_DLLEXPORT extern const int8_t utf8proc_utf8class[256]; +UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256]; /** * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH @@ -364,7 +373,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void); * Returns an informative error string for the given utf8proc error code * (e.g. the error codes returned by @ref utf8proc_map). */ -UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode); +UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode); /** * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`. 
@@ -376,7 +385,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode); * In case of success, the number of bytes read is returned; otherwise, a * negative error code is returned. */ -UTF8PROC_DLLEXPORT ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *codepoint_ref); +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref); /** * Check if a codepoint is valid (regardless of whether it has been @@ -384,7 +393,7 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, * * @return 1 if the given `codepoint` is valid and otherwise return 0. */ -UTF8PROC_DLLEXPORT bool utf8proc_codepoint_valid(int32_t codepoint); +UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint); /** * Encodes the codepoint as an UTF-8 string in the byte array pointed @@ -395,7 +404,7 @@ UTF8PROC_DLLEXPORT bool utf8proc_codepoint_valid(int32_t codepoint); * * This function does not check whether `codepoint` is valid Unicode. */ -UTF8PROC_DLLEXPORT ssize_t utf8proc_encode_char(int32_t codepoint, uint8_t *dst); +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst); /** * Look up the properties for a given codepoint. @@ -409,7 +418,7 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_encode_char(int32_t codepoint, uint8_t *dst) * If the codepoint is unassigned or invalid, a pointer to a special struct is * returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN). */ -UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t codepoint); +UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint); /** Decompose a codepoint into an array of codepoints. 
* @@ -438,8 +447,8 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t code * required buffer size is returned, while the buffer will be overwritten with * undefined data. */ -UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose_char( - int32_t codepoint, int32_t *dst, ssize_t bufsize, +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( + utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass ); @@ -459,9 +468,9 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose_char( * required buffer size is returned, while the buffer will be overwritten with * undefined data. */ -UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose( - const uint8_t *str, ssize_t strlen, - int32_t *buffer, ssize_t bufsize, utf8proc_option_t options +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, + utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options ); /** @@ -489,13 +498,13 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_decompose( * entries of the array pointed to by `str` have to be in the * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! */ -UTF8PROC_DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, utf8proc_option_t options); +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); /** * Given a pair of consecutive codepoints, return whether a grapheme break is * permitted between them (as defined by the extended grapheme clusters in UAX#29). 
*/ -UTF8PROC_DLLEXPORT bool utf8proc_grapheme_break(int32_t codepoint1, int32_t codepoint2); +UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); /** * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, @@ -505,19 +514,19 @@ UTF8PROC_DLLEXPORT bool utf8proc_grapheme_break(int32_t codepoint1, int32_t code * @note * If you want to check for particular types of non-printable characters, * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */ -UTF8PROC_DLLEXPORT int utf8proc_charwidth(int32_t codepoint); +UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint); /** * Return the Unicode category for the codepoint (one of the * @ref utf8proc_category_t constants.) */ -UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(int32_t codepoint); +UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint); /** * Return the two-letter (nul-terminated) Unicode category string for * the codepoint (e.g. `"Lu"` or `"Co"`). */ -UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t codepoint); +UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint); /** * Maps the given UTF-8 string pointed to by `str` to a new UTF-8 @@ -537,8 +546,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t codepoint); * @note The memory of the new UTF-8 string will have been allocated * with `malloc`, and should therefore be deallocated with `free`. */ -UTF8PROC_DLLEXPORT ssize_t utf8proc_map( - const uint8_t *str, ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ); /** @name Unicode normalization @@ -550,13 +559,13 @@ UTF8PROC_DLLEXPORT ssize_t utf8proc_map( */ /** @{ */ /** NFD normalization (@ref UTF8PROC_DECOMPOSE). 
*/ -UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str); +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str); /** NFC normalization (@ref UTF8PROC_COMPOSE). */ -UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str); +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); /** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ -UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str); +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); /** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ -UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str); +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); /** @} */ #ifdef __cplusplus diff --git a/utf8proc_data.c b/utf8proc_data.c @@ -1,4 +1,4 @@ -const int32_t utf8proc_sequences[] = { +const utf8proc_int32_t utf8proc_sequences[] = { 97, -1, 98, -1, 99, -1, 100, -1, 101, -1, 102, -1, 103, -1, 104, -1, 105, -1, 106, -1, 107, -1, 108, @@ -1523,7 +1523,7 @@ const int32_t utf8proc_sequences[] = { 172689, -1, 19798, -1, 40702, -1, 40709, -1, 40719, -1, 40726, -1, 173568, -1, }; -const uint16_t utf8proc_stage1table[] = { +const utf8proc_uint16_t utf8proc_stage1table[] = { 0, 256, 512, 768, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4352, 4608, 4864, 5120, 5376, 5632, @@ -2070,7 +2070,7 @@ const uint16_t utf8proc_stage1table[] = { 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 35584, }; -const uint16_t utf8proc_stage2table[] = { +const utf8proc_uint16_t utf8proc_stage2table[] = { 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 3, 5, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -13003,7 +13003,7 @@ const utf8proc_property_t utf8proc_properties[] = { {UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, 0, utf8proc_sequences + 12179, NULL, -1, -1, -1, -1, -1, false, false, false, false, UTF8PROC_BOUNDCLASS_OTHER, 2}, }; -const int32_t 
utf8proc_combinations[] = { +const utf8proc_int32_t utf8proc_combinations[] = { 192, 193, 194, 195, 196, 197, -1, 256, 258, 260, 550, 461, -1, -1, 512, 514, -1, -1, -1, -1, -1, -1, -1,