utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 70bbed8626e902d8c1e2b8277b0c61efb8a460bb
parent caef918abd0a9425b3942df3859c7bea7b8986e0
Author: Michael Drake <mike@smoothartist.com>
Date:   Mon, 21 Nov 2016 14:22:39 +0000

Tlsa/ucs4 normalize (#88)

* Split codepoint sequence normalisation out into separate function.

This creates utf8proc_normalize_utf32() which takes and returns
a UTF-32 string, applying the following options:

- UTF8PROC_NLF2LS
- UTF8PROC_NLF2PS
- UTF8PROC_NLF2LF
- UTF8PROC_STRIPCC
- UTF8PROC_COMPOSE
- UTF8PROC_STABLE

The utf8proc_reencode() function has been updated to call the
new utf8proc_normalize_utf32().

* Update code documentation: utf8proc_reencode handles UTF8PROC_CHARBOUND.

Diffstat:
Mutf8proc.c | 13++++++++++---
Mutf8proc.h | 32++++++++++++++++++++++++++++++--
2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/utf8proc.c b/utf8proc.c @@ -545,9 +545,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( return wpos; } -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { - /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored - ASSERT: 'buffer' has one spare byte of free space at the end! */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { + /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { utf8proc_ssize_t rpos; utf8proc_ssize_t wpos = 0; @@ -655,6 +654,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, } length = wpos; } + return length; +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { + /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored + ASSERT: 'buffer' has one spare byte of free space at the end! */ + length = utf8proc_normalize_utf32(buffer, length, options); + if (length < 0) return length; { utf8proc_ssize_t rpos, wpos = 0; utf8proc_int32_t uc; diff --git a/utf8proc.h b/utf8proc.h @@ -491,8 +491,34 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( ); /** + * Normalizes the sequence of `length` codepoints pointed to by `buffer` + * in-place (i.e., the result is also stored in `buffer`). + * + * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode. + * @param length the length (in codepoints) of the buffer. + * @param options a bitwise or (`|`) of one or more of the following flags: + * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS + * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS + * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF + * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters + * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite + * codepoints + * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate + * the unicode versioning stability + * + * @return + * In case of success, the length (in codepoints) of the normalized UTF-32 string is + * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg). + * + * @warning The entries of the array pointed to by `str` have to be in the + * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); + +/** * Reencodes the sequence of `length` codepoints pointed to by `buffer` * UTF-8 data in-place (i.e., the result is also stored in `buffer`). + * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion. * * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode. * @param length the length (in codepoints) of the buffer. @@ -505,10 +531,12 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( * codepoints * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate * the unicode versioning stability + * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster * * @return - * In case of success, the length (in bytes) of the resulting UTF-8 string is - * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg). + * In case of success, the length (in bytes) of the resulting nul-terminated + * UTF-8 string is returned; otherwise, a negative error code is returned + * (@ref utf8proc_errmsg). * * @warning The amount of free space pointed to by `buffer` must * exceed the amount of the input data by one byte, and the