utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit b4621f43c3b8aaa5636cb129cd0f2e0f8cc81889
parent 8da37e28920ba72b81c1d2cd4995647aadcd6db5
Author: Steven G. Johnson <stevenj@mit.edu>
Date:   Wed, 30 Nov 2016 10:40:26 -0500

new utf8proc_map_custom for hooking in user-defined custom mappings (#89)

* new utf8proc_map_custom for hooking in user-defined custom mappings

* whoops, add test program

* NEWS, version bump for 2.1

* change test functions to static so that gcc doesn't complain about missing prototypes

Diffstat:
M CMakeLists.txt | 4 ++--
M MANIFEST | 6 +++---
M Makefile | 12 ++++++++----
M NEWS.md | 15 +++++++++++++++
A test/custom.c | 27 +++++++++++++++++++++++++++
M utf8proc.c | 24 ++++++++++++++++++++----
M utf8proc.h | 38 +++++++++++++++++++++++++++++++++++---
7 files changed, 110 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -10,8 +10,8 @@ project (utf8proc C) # API version number (defined in utf8proc.h). # Be sure to also update these in Makefile and MANIFEST! set(SO_MAJOR 2) -set(SO_MINOR 0) -set(SO_PATCH 2) +set(SO_MINOR 1) +set(SO_PATCH 0) add_definitions ( -DUTF8PROC_EXPORTS diff --git a/MANIFEST b/MANIFEST @@ -2,6 +2,6 @@ include/ include/utf8proc.h lib/ lib/libutf8proc.a -lib/libutf8proc.so -> libutf8proc.so.2.0.2 -lib/libutf8proc.so.2 -> libutf8proc.so.2.0.2 -lib/libutf8proc.so.2.0.2 +lib/libutf8proc.so -> libutf8proc.so.2.1.0 +lib/libutf8proc.so.2 -> libutf8proc.so.2.1.0 +lib/libutf8proc.so.2.1.0 diff --git a/Makefile b/Makefile @@ -21,8 +21,8 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS # The API version number is defined in utf8proc.h. # Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt! MAJOR=2 -MINOR=0 -PATCH=2 +MINOR=1 +PATCH=0 OS := $(shell uname) ifeq ($(OS),Darwin) # MacOS X @@ -49,7 +49,7 @@ clean: ifneq ($(OS),Darwin) rm -f libutf8proc.so.$(MAJOR) endif - rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case + rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom rm -rf MANIFEST.new tmp $(MAKE) -C bench clean $(MAKE) -C data clean @@ -136,7 +136,10 @@ test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h $(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@ -check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o +test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h + $(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@ + +check: test/normtest 
data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o $(MAKE) -C bench test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt @@ -144,3 +147,4 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB test/valid test/iterate test/case + test/custom diff --git a/NEWS.md b/NEWS.md @@ -1,5 +1,17 @@ # utf8proc release history # +## Version 2.1 (not yet released) ## + +- New functions `utf8proc_map_custom` and `utf8proc_decompose_custom` + to allow user-supplied transformations of codepoints, in conjunction + with other transformations ([#89]). + +- New function `utf8proc_normalize_utf32` to apply normalizations + directly to UTF-32 data (not just UTF-8) ([#88]). + +- Fixed stack overflow that could occur due to incorrect definition + of `UINT16_MAX` with some compilers ([#84]). + ## Version 2.0.2 ## 2016-07-27: @@ -279,3 +291,6 @@ Release of version 1.0.1 [#78]: https://github.com/JuliaLang/utf8proc/issues/78 [#79]: https://github.com/JuliaLang/utf8proc/issues/79 [#80]: https://github.com/JuliaLang/utf8proc/issues/80 +[#84]: https://github.com/JuliaLang/utf8proc/pull/84 +[#88]: https://github.com/JuliaLang/utf8proc/pull/88 +[#89]: https://github.com/JuliaLang/utf8proc/pull/89 diff --git a/test/custom.c b/test/custom.c @@ -0,0 +1,27 @@ +#include "tests.h" + +static int thunk_test = 1; + +static utf8proc_int32_t custom(utf8proc_int32_t codepoint, void *thunk) +{ + check(((int *) thunk) == &thunk_test, "unexpected thunk passed"); + if (codepoint == 'a') + return 'b'; + if (codepoint == 'S') + return 0x00df; /* ß */ + return codepoint; +} + +int main(void) +{ + utf8proc_uint8_t input[] = {0x41,0x61,0x53,0x62,0xef,0xbd,0x81,0x00}; /* "AaSb\uff41" */ + utf8proc_uint8_t correct[] = {0x61,0x62,0x73,0x73,0x62,0x61,0x00}; /* "abssba" */ + utf8proc_uint8_t *output; + 
utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM, + custom, &thunk_test); + printf("mapped \"%s\" -> \"%s\"\n", (char*)input, (char*)output); + check(strlen((char*) output) == 6, "incorrect output length"); + check(!memcmp(correct, output, 7), "incorrect output data"); + free(output); + return 0; +} diff --git a/utf8proc.c b/utf8proc.c @@ -391,8 +391,6 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { return s[utf8proc_category(c)]; } - - #define utf8proc_decompose_lump(replacement_uc) \ return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ options & ~UTF8PROC_LUMP, last_boundclass) @@ -486,6 +484,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options ) { + return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, + utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +) { /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ utf8proc_ssize_t wpos = 0; if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) @@ -511,6 +517,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; } + if (custom_func != NULL) { + uc = custom_func(uc, custom_data); /* user-specified custom mapping */ + } decomp_result = utf8proc_decompose_char( uc, buffer + wpos, (bufsize > wpos) ? 
(bufsize - wpos) : 0, options, &boundclass @@ -684,14 +693,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ) { + return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +) { utf8proc_int32_t *buffer; utf8proc_ssize_t result; *dstptr = NULL; - result = utf8proc_decompose(str, strlen, NULL, 0, options); + result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); if (result < 0) return result; buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); if (!buffer) return UTF8PROC_ERROR_NOMEM; - result = utf8proc_decompose(str, strlen, buffer, result, options); + result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); if (result < 0) { free(buffer); return result; diff --git a/utf8proc.h b/utf8proc.h @@ -71,9 +71,9 @@ /** The MAJOR version number (increased when backwards API compatibility is broken). */ #define UTF8PROC_VERSION_MAJOR 2 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ -#define UTF8PROC_VERSION_MINOR 0 +#define UTF8PROC_VERSION_MINOR 1 /** The PATCH version (increased for fixes that do not change the API). 
*/ -#define UTF8PROC_VERSION_PATCH 2 +#define UTF8PROC_VERSION_PATCH 0 /** @} */ #include <stdlib.h> @@ -374,6 +374,13 @@ typedef enum { } utf8proc_boundclass_t; /** + * Function pointer type passed to @ref utf8proc_map_custom and + * @ref utf8proc_decompose_custom, which is used to specify a user-defined + * mapping of codepoints to be applied in conjunction with other mappings. + */ +typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data); + +/** * Array containing the byte lengths of a UTF-8 encoded codepoint based * on the first byte. */ @@ -480,6 +487,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( * `buffer` (which must contain at least `bufsize` entries). In case of * success, the number of codepoints written is returned; in case of an * error, a negative error code is returned (@ref utf8proc_errmsg). + * See @ref utf8proc_decompose_custom to supply additional transformations. * * If the number of written codepoints would be bigger than `bufsize`, the * required buffer size is returned, while the buffer will be overwritten with @@ -491,6 +499,18 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( ); /** + * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function + * that is called on each codepoint in `str` before any other transformations + * (along with a `custom_data` pointer that is passed through to `custom_func`). + * The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom. + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, + utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +); + +/** * Normalizes the sequence of `length` codepoints pointed to by `buffer` * in-place (i.e., the result is also stored in `buffer`). 
* @@ -623,7 +643,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi * in any case the result will be NULL terminated (though it might * contain NULL characters with the string if `str` contained NULL * characters). Other flags in the `options` field are passed to the - * functions defined above, and regarded as described. + * functions defined above, and regarded as described. See also + * @ref utf8proc_map_custom to supply a custom codepoint transformation. * * In case of success the length of the new string is returned, * otherwise a negative error code is returned. @@ -635,6 +656,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ); +/** + * Like @ref utf8proc_map, but also takes a `custom_func` mapping function + * that is called on each codepoint in `str` before any other transformations + * (along with a `custom_data` pointer that is passed through to `custom_func`). + * The `custom_func` argument is ignored if it is `NULL`. + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +); + /** @name Unicode normalization * * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC