/* utf8proc.c (32466B) */
1/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */ 2/* 3 * Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors. 4 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 * DEALINGS IN THE SOFTWARE. 23 */ 24 25/* 26 * This library contains derived data from a modified version of the 27 * Unicode data files. 28 * 29 * The original data files are available at 30 * https://www.unicode.org/Public/UNIDATA/ 31 * 32 * Please notice the copyright statement in the file "utf8proc_data.c". 33 */ 34 35 36/* 37 * File name: utf8proc.c 38 * 39 * Description: 40 * Implementation of libutf8proc. 
41 */ 42 43 44#include "utf8proc.h" 45 46#ifndef SSIZE_MAX 47#define SSIZE_MAX ((size_t)SIZE_MAX/2) 48#endif 49#ifndef UINT16_MAX 50# define UINT16_MAX 65535U 51#endif 52 53#include "utf8proc_data.c" 54 55 56UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { 57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 61 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 62 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 63 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 64 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 70 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 71 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 72 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; 73 74#define UTF8PROC_HANGUL_SBASE 0xAC00 75#define UTF8PROC_HANGUL_LBASE 0x1100 76#define UTF8PROC_HANGUL_VBASE 0x1161 77#define UTF8PROC_HANGUL_TBASE 0x11A7 78#define UTF8PROC_HANGUL_LCOUNT 19 79#define UTF8PROC_HANGUL_VCOUNT 21 80#define UTF8PROC_HANGUL_TCOUNT 28 81#define UTF8PROC_HANGUL_NCOUNT 588 82#define UTF8PROC_HANGUL_SCOUNT 11172 83/* END is exclusive */ 84#define UTF8PROC_HANGUL_L_START 0x1100 85#define UTF8PROC_HANGUL_L_END 0x115A 86#define UTF8PROC_HANGUL_L_FILLER 0x115F 87#define UTF8PROC_HANGUL_V_START 0x1160 88#define UTF8PROC_HANGUL_V_END 0x11A3 89#define UTF8PROC_HANGUL_T_START 0x11A8 90#define UTF8PROC_HANGUL_T_END 0x11FA 91#define UTF8PROC_HANGUL_S_START 0xAC00 92#define UTF8PROC_HANGUL_S_END 0xD7A4 93 94/* Should follow semantic-versioning rules (semver.org) based on API 95 compatibility. 
(Note that the shared-library version number will 96 be different, being based on ABI compatibility.): */ 97#define STRINGIZEx(x) #x 98#define STRINGIZE(x) STRINGIZEx(x) 99UTF8PROC_DLLEXPORT const char *utf8proc_version(void) { 100 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) ""; 101} 102 103UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) { 104 return "15.1.0"; 105} 106 107UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { 108 switch (errcode) { 109 case UTF8PROC_ERROR_NOMEM: 110 return "Memory for processing UTF-8 data could not be allocated."; 111 case UTF8PROC_ERROR_OVERFLOW: 112 return "UTF-8 string is too long to be processed."; 113 case UTF8PROC_ERROR_INVALIDUTF8: 114 return "Invalid UTF-8 string"; 115 case UTF8PROC_ERROR_NOTASSIGNED: 116 return "Unassigned Unicode code point found in UTF-8 string."; 117 case UTF8PROC_ERROR_INVALIDOPTS: 118 return "Invalid options for UTF-8 processing chosen."; 119 default: 120 return "An unknown error occurred while processing UTF-8 data."; 121 } 122} 123 124#define utf_cont(ch) (((ch) & 0xc0) == 0x80) 125UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( 126 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst 127) { 128 utf8proc_int32_t uc; 129 const utf8proc_uint8_t *end; 130 131 *dst = -1; 132 if (!strlen) return 0; 133 end = str + ((strlen < 0) ? 
4 : strlen); 134 uc = *str++; 135 if (uc < 0x80) { 136 *dst = uc; 137 return 1; 138 } 139 // Must be between 0xc2 and 0xf4 inclusive to be valid 140 if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; 141 if (uc < 0xe0) { // 2-byte sequence 142 // Must have valid continuation character 143 if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; 144 *dst = ((uc & 0x1f)<<6) | (*str & 0x3f); 145 return 2; 146 } 147 if (uc < 0xf0) { // 3-byte sequence 148 if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) 149 return UTF8PROC_ERROR_INVALIDUTF8; 150 // Check for surrogate chars 151 if (uc == 0xed && *str > 0x9f) 152 return UTF8PROC_ERROR_INVALIDUTF8; 153 uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); 154 if (uc < 0x800) 155 return UTF8PROC_ERROR_INVALIDUTF8; 156 *dst = uc; 157 return 3; 158 } 159 // 4-byte sequence 160 // Must have 3 valid continuation characters 161 if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) 162 return UTF8PROC_ERROR_INVALIDUTF8; 163 // Make sure in correct range (0x10000 - 0x10ffff) 164 if (uc == 0xf0) { 165 if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; 166 } else if (uc == 0xf4) { 167 if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; 168 } 169 *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); 170 return 4; 171} 172 173UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { 174 return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000); 175} 176 177UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { 178 if (uc < 0x00) { 179 return 0; 180 } else if (uc < 0x80) { 181 dst[0] = (utf8proc_uint8_t) uc; 182 return 1; 183 } else if (uc < 0x800) { 184 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); 185 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 186 return 2; 187 // Note: we allow encoding 
0xd800-0xdfff here, so as not to change 188 // the API, however, these are actually invalid in UTF-8 189 } else if (uc < 0x10000) { 190 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); 191 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 192 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 193 return 3; 194 } else if (uc < 0x110000) { 195 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); 196 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); 197 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 198 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 199 return 4; 200 } else return 0; 201} 202 203/* internal version used for inserting 0xff bytes between graphemes */ 204static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { 205 if (uc < 0x00) { 206 if (uc == -1) { /* internal value used for grapheme breaks */ 207 dst[0] = (utf8proc_uint8_t)0xFF; 208 return 1; 209 } 210 return 0; 211 } else if (uc < 0x80) { 212 dst[0] = (utf8proc_uint8_t)uc; 213 return 1; 214 } else if (uc < 0x800) { 215 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); 216 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 217 return 2; 218 } else if (uc < 0x10000) { 219 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); 220 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 221 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 222 return 3; 223 } else if (uc < 0x110000) { 224 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); 225 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); 226 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 227 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 228 return 4; 229 } else return 0; 230} 231 232/* internal "unsafe" version that does not check whether uc is in range */ 233static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) { 234 /* ASSERT: uc >= 0 && uc < 0x110000 */ 235 return utf8proc_properties + ( 236 utf8proc_stage2table[ 237 utf8proc_stage1table[uc >> 8] + (uc & 0xFF) 238 ] 239 ); 
240} 241 242UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) { 243 return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc); 244} 245 246/* return whether there is a grapheme break between boundclasses lbc and tbc 247 (according to the definition of extended grapheme clusters) 248 249 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0): 250 http://www.unicode.org/reports/tr29/tr29-29.html 251 252 CAVEATS: 253 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences) 254 and GB 12/13 (regional indicator code points) require knowledge of previous characters 255 and are thus not handled by this function. This may result in an incorrect break before 256 an E_Modifier class codepoint and an incorrectly missing break between two 257 REGIONAL_INDICATOR class code points if such support does not exist in the caller. 258 259 See the special support in grapheme_break_extended, for required bookkeeping by the caller. 260*/ 261static utf8proc_bool grapheme_break_simple(int lbc, int tbc) { 262 return 263 (lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1 264 (lbc == UTF8PROC_BOUNDCLASS_CR && // GB3 265 tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // --- 266 (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4 267 (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5 268 (lbc == UTF8PROC_BOUNDCLASS_L && // GB6 269 (tbc == UTF8PROC_BOUNDCLASS_L || // --- 270 tbc == UTF8PROC_BOUNDCLASS_V || // --- 271 tbc == UTF8PROC_BOUNDCLASS_LV || // --- 272 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // --- 273 ((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7 274 lbc == UTF8PROC_BOUNDCLASS_V) && // --- 275 (tbc == UTF8PROC_BOUNDCLASS_V || // --- 276 tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // --- 277 ((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8 278 lbc == UTF8PROC_BOUNDCLASS_T) && // --- 279 tbc == UTF8PROC_BOUNDCLASS_T) ? 
false : // --- 280 (tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9 281 tbc == UTF8PROC_BOUNDCLASS_ZWJ || // --- 282 tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a 283 lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b 284 (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below) 285 tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ---- 286 (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below) 287 tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ---- 288 true; // GB999 289} 290 291static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state) 292{ 293 if (state) { 294 int state_bc, state_icb; /* boundclass and indic_conjunct_break state */ 295 if (*state == 0) { /* state initialization */ 296 state_bc = lbc; 297 state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE; 298 } 299 else { /* lbc and licb are already encoded in *state */ 300 state_bc = *state & 0xff; // 1st byte of state is bound class 301 state_icb = *state >> 8; // 2nd byte of state is indic conjunct break 302 } 303 304 utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) && 305 !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER 306 && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c 307 308 // Special support for GB9c. Don't break between two consonants 309 // separated 1+ linker characters and 0+ extend characters in any order. 310 // After a consonant, we enter LINKER state after at least one linker. 311 if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT 312 || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT 313 || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) 314 state_icb = ticb; 315 else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER) 316 state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ? 
317 UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb; 318 319 // Special support for GB 12/13 made possible by GB999. After two RI 320 // class codepoints we want to force a break. Do this by resetting the 321 // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break 322 // after that character according to GB999 (unless of course such a break is 323 // forbidden by a different rule such as GB9). 324 if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) 325 state_bc = UTF8PROC_BOUNDCLASS_OTHER; 326 // Special support for GB11 (emoji extend* zwj / emoji) 327 else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) { 328 if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji 329 state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC; 330 else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ) 331 state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo 332 else 333 state_bc = tbc; 334 } 335 else 336 state_bc = tbc; 337 338 *state = state_bc + (state_icb << 8); 339 return break_permitted; 340 } 341 else 342 return grapheme_break_simple(lbc, tbc); 343} 344 345UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( 346 utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) { 347 348 const utf8proc_property_t *p1 = utf8proc_get_property(c1); 349 const utf8proc_property_t *p2 = utf8proc_get_property(c2); 350 return grapheme_break_extended(p1->boundclass, 351 p2->boundclass, 352 p1->indic_conjunct_break, 353 p2->indic_conjunct_break, 354 state); 355} 356 357 358UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( 359 utf8proc_int32_t c1, utf8proc_int32_t c2) { 360 return utf8proc_grapheme_break_stateful(c1, c2, NULL); 361} 362 363static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) 364{ 365 utf8proc_int32_t entry_cp = **entry; 366 if ((entry_cp & 0xF800) == 0xD800) { 367 *entry = *entry + 1; 368 entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF); 369 
entry_cp += 0x10000; 370 } 371 return entry_cp; 372} 373 374static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex) 375{ 376 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex]; 377 return seqindex_decode_entry(&entry); 378} 379 380static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { 381 utf8proc_ssize_t written = 0; 382 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF]; 383 int len = seqindex >> 14; 384 if (len >= 3) { 385 len = *entry; 386 entry++; 387 } 388 for (; len >= 0; entry++, len--) { 389 utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry); 390 391 written += utf8proc_decompose_char(entry_cp, dst+written, 392 (bufsize > written) ? (bufsize - written) : 0, options, 393 last_boundclass); 394 if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 395 } 396 return written; 397} 398 399UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) 400{ 401 utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; 402 return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c; 403} 404 405UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) 406{ 407 utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; 408 return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; 409} 410 411UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) 412{ 413 utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; 414 return cu != UINT16_MAX ? 
seqindex_decode_index((utf8proc_uint32_t)cu) : c; 415} 416 417UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c) 418{ 419 const utf8proc_property_t *p = utf8proc_get_property(c); 420 return p->lowercase_seqindex != p->uppercase_seqindex && p->lowercase_seqindex == UINT16_MAX; 421} 422 423UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c) 424{ 425 const utf8proc_property_t *p = utf8proc_get_property(c); 426 return p->lowercase_seqindex != p->uppercase_seqindex && p->uppercase_seqindex == UINT16_MAX && p->category != UTF8PROC_CATEGORY_LT; 427} 428 429/* return a character width analogous to wcwidth (except portable and 430 hopefully less buggy than most system wcwidth functions). */ 431UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { 432 return utf8proc_get_property(c)->charwidth; 433} 434 435UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { 436 return (utf8proc_category_t) utf8proc_get_property(c)->category; 437} 438 439UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { 440 static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; 441 return s[utf8proc_category(c)]; 442} 443 444#define utf8proc_decompose_lump(replacement_uc) \ 445 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ 446 options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) 447 448UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { 449 const utf8proc_property_t *property; 450 utf8proc_propval_t category; 451 utf8proc_int32_t hangul_sindex; 452 if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; 453 property = unsafe_get_property(uc); 454 category = property->category; 455 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 456 if (options & 
(UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 457 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 458 utf8proc_int32_t hangul_tindex; 459 if (bufsize >= 1) { 460 dst[0] = UTF8PROC_HANGUL_LBASE + 461 hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 462 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 463 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 464 } 465 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 466 if (!hangul_tindex) return 2; 467 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 468 return 3; 469 } 470 } 471 if (options & UTF8PROC_REJECTNA) { 472 if (!category) return UTF8PROC_ERROR_NOTASSIGNED; 473 } 474 if (options & UTF8PROC_IGNORE) { 475 if (property->ignorable) return 0; 476 } 477 if (options & UTF8PROC_STRIPNA) { 478 if (!category) return 0; 479 } 480 if (options & UTF8PROC_LUMP) { 481 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); 482 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) 483 utf8proc_decompose_lump(0x0027); 484 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) 485 utf8proc_decompose_lump(0x002D); 486 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); 487 if (uc == 0x2236) utf8proc_decompose_lump(0x003A); 488 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) 489 utf8proc_decompose_lump(0x003C); 490 if (uc == 0x203A || uc == 0x232A || uc == 0x3009) 491 utf8proc_decompose_lump(0x003E); 492 if (uc == 0x2216) utf8proc_decompose_lump(0x005C); 493 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) 494 utf8proc_decompose_lump(0x005E); 495 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) 496 utf8proc_decompose_lump(0x005F); 497 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); 498 if (uc == 0x2223) utf8proc_decompose_lump(0x007C); 499 if (uc == 0x223C) utf8proc_decompose_lump(0x007E); 500 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { 501 if (category == UTF8PROC_CATEGORY_ZL || 502 category 
== UTF8PROC_CATEGORY_ZP) 503 utf8proc_decompose_lump(0x000A); 504 } 505 } 506 if (options & UTF8PROC_STRIPMARK) { 507 if (category == UTF8PROC_CATEGORY_MN || 508 category == UTF8PROC_CATEGORY_MC || 509 category == UTF8PROC_CATEGORY_ME) return 0; 510 } 511 if (options & UTF8PROC_CASEFOLD) { 512 if (property->casefold_seqindex != UINT16_MAX) { 513 return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass); 514 } 515 } 516 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 517 if (property->decomp_seqindex != UINT16_MAX && 518 (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 519 return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass); 520 } 521 } 522 if (options & UTF8PROC_CHARBOUND) { 523 utf8proc_bool boundary; 524 boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break, 525 last_boundclass); 526 if (boundary) { 527 if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */ 528 if (bufsize >= 2) dst[1] = uc; 529 return 2; 530 } 531 } 532 if (bufsize >= 1) *dst = uc; 533 return 1; 534} 535 536UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( 537 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, 538 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options 539) { 540 return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); 541} 542 543UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( 544 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, 545 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, 546 utf8proc_custom_func custom_func, void *custom_data 547) { 548 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ 549 utf8proc_ssize_t wpos = 0; 550 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 551 return UTF8PROC_ERROR_INVALIDOPTS; 552 if ((options & UTF8PROC_STRIPMARK) && 553 
!(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) 554 return UTF8PROC_ERROR_INVALIDOPTS; 555 { 556 utf8proc_int32_t uc; 557 utf8proc_ssize_t rpos = 0; 558 utf8proc_ssize_t decomp_result; 559 int boundclass = UTF8PROC_BOUNDCLASS_START; 560 while (1) { 561 if (options & UTF8PROC_NULLTERM) { 562 rpos += utf8proc_iterate(str + rpos, -1, &uc); 563 /* checking of return value is not necessary, 564 as 'uc' is < 0 in case of error */ 565 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 566 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; 567 if (uc == 0) break; 568 } else { 569 if (rpos >= strlen) break; 570 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); 571 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 572 } 573 if (custom_func != NULL) { 574 uc = custom_func(uc, custom_data); /* user-specified custom mapping */ 575 } 576 decomp_result = utf8proc_decompose_char( 577 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, 578 &boundclass 579 ); 580 if (decomp_result < 0) return decomp_result; 581 wpos += decomp_result; 582 /* prohibiting integer overflows due to too long strings: */ 583 if (wpos < 0 || 584 wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2)) 585 return UTF8PROC_ERROR_OVERFLOW; 586 } 587 } 588 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { 589 utf8proc_ssize_t pos = 0; 590 while (pos < wpos-1) { 591 utf8proc_int32_t uc1, uc2; 592 const utf8proc_property_t *property1, *property2; 593 uc1 = buffer[pos]; 594 uc2 = buffer[pos+1]; 595 property1 = unsafe_get_property(uc1); 596 property2 = unsafe_get_property(uc2); 597 if (property1->combining_class > property2->combining_class && 598 property2->combining_class > 0) { 599 buffer[pos] = uc2; 600 buffer[pos+1] = uc1; 601 if (pos > 0) pos--; else pos++; 602 } else { 603 pos++; 604 } 605 } 606 } 607 return wpos; 608} 609 610UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, 
utf8proc_option_t options) { 611 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ 612 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { 613 utf8proc_ssize_t rpos; 614 utf8proc_ssize_t wpos = 0; 615 utf8proc_int32_t uc; 616 for (rpos = 0; rpos < length; rpos++) { 617 uc = buffer[rpos]; 618 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; 619 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || 620 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { 621 if (options & UTF8PROC_NLF2LS) { 622 if (options & UTF8PROC_NLF2PS) { 623 buffer[wpos++] = 0x000A; 624 } else { 625 buffer[wpos++] = 0x2028; 626 } 627 } else { 628 if (options & UTF8PROC_NLF2PS) { 629 buffer[wpos++] = 0x2029; 630 } else { 631 buffer[wpos++] = 0x0020; 632 } 633 } 634 } else if ((options & UTF8PROC_STRIPCC) && 635 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { 636 if (uc == 0x0009) buffer[wpos++] = 0x0020; 637 } else { 638 buffer[wpos++] = uc; 639 } 640 } 641 length = wpos; 642 } 643 if (options & UTF8PROC_COMPOSE) { 644 utf8proc_int32_t *starter = NULL; 645 utf8proc_int32_t current_char; 646 const utf8proc_property_t *starter_property = NULL, *current_property; 647 utf8proc_propval_t max_combining_class = -1; 648 utf8proc_ssize_t rpos; 649 utf8proc_ssize_t wpos = 0; 650 utf8proc_int32_t composition; 651 for (rpos = 0; rpos < length; rpos++) { 652 current_char = buffer[rpos]; 653 current_property = unsafe_get_property(current_char); 654 if (starter && current_property->combining_class > max_combining_class) { 655 /* combination perhaps possible */ 656 utf8proc_int32_t hangul_lindex; 657 utf8proc_int32_t hangul_sindex; 658 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; 659 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { 660 utf8proc_int32_t hangul_vindex; 661 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; 662 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { 663 
*starter = UTF8PROC_HANGUL_SBASE + 664 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * 665 UTF8PROC_HANGUL_TCOUNT; 666 starter_property = NULL; 667 continue; 668 } 669 } 670 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; 671 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && 672 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { 673 utf8proc_int32_t hangul_tindex; 674 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; 675 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { 676 *starter += hangul_tindex; 677 starter_property = NULL; 678 continue; 679 } 680 } 681 if (!starter_property) { 682 starter_property = unsafe_get_property(*starter); 683 } 684 if (starter_property->comb_index < 0x8000 && 685 current_property->comb_index != UINT16_MAX && 686 current_property->comb_index >= 0x8000) { 687 int sidx = starter_property->comb_index; 688 int idx = current_property->comb_index & 0x3FFF; 689 if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + 1] ) { 690 idx += sidx + 2 - utf8proc_combinations[sidx]; 691 if (current_property->comb_index & 0x4000) { 692 composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1]; 693 } else 694 composition = utf8proc_combinations[idx]; 695 696 if (composition > 0 && (!(options & UTF8PROC_STABLE) || 697 !(unsafe_get_property(composition)->comp_exclusion))) { 698 *starter = composition; 699 starter_property = NULL; 700 continue; 701 } 702 } 703 } 704 } 705 buffer[wpos] = current_char; 706 if (current_property->combining_class) { 707 if (current_property->combining_class > max_combining_class) { 708 max_combining_class = current_property->combining_class; 709 } 710 } else { 711 starter = buffer + wpos; 712 starter_property = NULL; 713 max_combining_class = -1; 714 } 715 wpos++; 716 } 717 length = wpos; 718 } 719 return length; 720} 721 722UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, 
utf8proc_option_t options) { 723 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored 724 ASSERT: 'buffer' has one spare byte of free space at the end! */ 725 length = utf8proc_normalize_utf32(buffer, length, options); 726 if (length < 0) return length; 727 { 728 utf8proc_ssize_t rpos, wpos = 0; 729 utf8proc_int32_t uc; 730 if (options & UTF8PROC_CHARBOUND) { 731 for (rpos = 0; rpos < length; rpos++) { 732 uc = buffer[rpos]; 733 wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); 734 } 735 } else { 736 for (rpos = 0; rpos < length; rpos++) { 737 uc = buffer[rpos]; 738 wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); 739 } 740 } 741 ((utf8proc_uint8_t *)buffer)[wpos] = 0; 742 return wpos; 743 } 744} 745 746UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( 747 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options 748) { 749 return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); 750} 751 752UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( 753 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, 754 utf8proc_custom_func custom_func, void *custom_data 755) { 756 utf8proc_int32_t *buffer; 757 utf8proc_ssize_t result; 758 *dstptr = NULL; 759 result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); 760 if (result < 0) return result; 761 buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1); 762 if (!buffer) return UTF8PROC_ERROR_NOMEM; 763 result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); 764 if (result < 0) { 765 free(buffer); 766 return result; 767 } 768 result = utf8proc_reencode(buffer, result, options); 769 if (result < 0) { 770 free(buffer); 771 return result; 772 } 773 { 774 utf8proc_int32_t *newptr; 775 newptr = (utf8proc_int32_t *) realloc(buffer, 
(size_t)result+1); 776 if (newptr) buffer = newptr; 777 } 778 *dstptr = (utf8proc_uint8_t *)buffer; 779 return result; 780} 781 782UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { 783 utf8proc_uint8_t *retval; 784 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 785 UTF8PROC_DECOMPOSE); 786 return retval; 787} 788 789UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { 790 utf8proc_uint8_t *retval; 791 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 792 UTF8PROC_COMPOSE); 793 return retval; 794} 795 796UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { 797 utf8proc_uint8_t *retval; 798 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 799 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); 800 return retval; 801} 802 803UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { 804 utf8proc_uint8_t *retval; 805 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 806 UTF8PROC_COMPOSE | UTF8PROC_COMPAT); 807 return retval; 808} 809 810UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) { 811 utf8proc_uint8_t *retval; 812 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 813 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); 814 return retval; 815}