utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 3822984606dd72ab129c0c9b26f496e75e7868e9
parent 128c04e3d02f8088eadfe924b647a2503cb0e945
Author: Steven G. Johnson <stevenj@mit.edu>
Date:   Thu, 12 Mar 2015 14:17:27 -0400

remove requirement that get_property and decompose_char argument be in range 0x0 to 0x10ffff

Diffstat:
M utf8proc.c | 24 ++++++++++++++----------
M utf8proc.h | 4 ----
2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/utf8proc.c b/utf8proc.c @@ -182,7 +182,8 @@ DLLEXPORT ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { } else return 0; } -DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) { +/* internal "unsafe" version that does not check whether uc is in range */ +static const utf8proc_property_t *get_property(int32_t uc) { /* ASSERT: uc >= 0 && uc < 0x110000 */ return utf8proc_properties + ( utf8proc_stage2table[ @@ -191,6 +192,10 @@ DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) { ); } +DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) { + return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc); +} + /* return whether there is a grapheme break between boundclasses lbc and tbc */ static bool grapheme_break(int lbc, int tbc) { return @@ -242,13 +247,12 @@ DLLEXPORT const char *utf8proc_category_string(int32_t c) { return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ options & ~UTF8PROC_LUMP, last_boundclass) -DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, - int options, int *last_boundclass) { - /* ASSERT: uc >= 0 && uc < 0x110000 */ +DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, int options, int *last_boundclass) { const utf8proc_property_t *property; utf8proc_propval_t category; int32_t hangul_sindex; - property = utf8proc_get_property(uc); + if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; + property = get_property(uc); category = property->category; hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { @@ -394,8 +398,8 @@ DLLEXPORT ssize_t utf8proc_decompose( const utf8proc_property_t *property1, *property2; uc1 = buffer[pos]; uc2 = buffer[pos+1]; - property1 = utf8proc_get_property(uc1); - property2 = utf8proc_get_property(uc2); + property1 = get_property(uc1); + property2 = get_property(uc2); if (property1->combining_class 
> property2->combining_class && property2->combining_class > 0) { buffer[pos] = uc2; @@ -453,7 +457,7 @@ DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options int32_t composition; for (rpos = 0; rpos < length; rpos++) { current_char = buffer[rpos]; - current_property = utf8proc_get_property(current_char); + current_property = get_property(current_char); if (starter && current_property->combining_class > max_combining_class) { /* combination perhaps possible */ int32_t hangul_lindex; @@ -482,7 +486,7 @@ DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options } } if (!starter_property) { - starter_property = utf8proc_get_property(*starter); + starter_property = get_property(*starter); } if (starter_property->comb1st_index >= 0 && current_property->comb2nd_index >= 0) { @@ -491,7 +495,7 @@ DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options current_property->comb2nd_index ]; if (composition >= 0 && (!(options & UTF8PROC_STABLE) || - !(utf8proc_get_property(composition)->comp_exclusion))) { + !(get_property(composition)->comp_exclusion))) { *starter = composition; starter_property = NULL; continue; diff --git a/utf8proc.h b/utf8proc.h @@ -310,8 +310,6 @@ DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc); * the unicode char with the given code point 'uc'. * If the character is not existent a pointer to a special struct is * returned, where 'category' is 0 (UTF8PROC_CATEGORY_CN). - * WARNING: The parameter 'uc' has to be in the range of 0x0000 to - * 0x10FFFF, otherwise the program might crash! */ DLLEXPORT ssize_t utf8proc_decompose_char( @@ -338,8 +336,6 @@ DLLEXPORT ssize_t utf8proc_decompose_char( * If the number of written chars would be bigger than 'bufsize', * the buffer (up to 'bufsize') has inpredictable data, and the needed * buffer size is returned. 
- * WARNING: The parameter 'uc' has to be in the range of 0x0000 to - * 0x10FFFF, otherwise the program might crash! */ DLLEXPORT ssize_t utf8proc_decompose(