utf8-norm.c (16607B)
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Copyright (c) 2014 SGI. 4 * All rights reserved. 5 */ 6 7#include "utf8n.h" 8 9int utf8version_is_supported(const struct unicode_map *um, unsigned int version) 10{ 11 int i = um->tables->utf8agetab_size - 1; 12 13 while (i >= 0 && um->tables->utf8agetab[i] != 0) { 14 if (version == um->tables->utf8agetab[i]) 15 return 1; 16 i--; 17 } 18 return 0; 19} 20 21/* 22 * UTF-8 valid ranges. 23 * 24 * The UTF-8 encoding spreads the bits of a 32bit word over several 25 * bytes. This table gives the ranges that can be held and how they'd 26 * be represented. 27 * 28 * 0x00000000 0x0000007F: 0xxxxxxx 29 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx 30 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 31 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 32 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 33 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 34 * 35 * There is an additional requirement on UTF-8, in that only the 36 * shortest representation of a 32bit value is to be used. A decoder 37 * must not decode sequences that do not satisfy this requirement. 38 * Thus the allowed ranges have a lower bound. 39 * 40 * 0x00000000 0x0000007F: 0xxxxxxx 41 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx 42 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 43 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 44 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 45 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 46 * 47 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, 48 * 17 planes of 65536 values. This limits the sequences actually seen 49 * even more, to just the following. 50 * 51 * 0 - 0x7F: 0 - 0x7F 52 * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF 53 * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF 54 * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF 55 * 56 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. 57 * 58 * Note that the longest sequence seen with valid usage is 4 bytes, 59 * the same a single UTF-32 character. This makes the UTF-8 60 * representation of Unicode strictly smaller than UTF-32. 61 * 62 * The shortest sequence requirement was introduced by: 63 * Corrigendum #1: UTF-8 Shortest Form 64 * It can be found here: 65 * http://www.unicode.org/versions/corrigendum1.html 66 * 67 */ 68 69/* 70 * Return the number of bytes used by the current UTF-8 sequence. 71 * Assumes the input points to the first byte of a valid UTF-8 72 * sequence. 73 */ 74static inline int utf8clen(const char *s) 75{ 76 unsigned char c = *s; 77 78 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); 79} 80 81/* 82 * Decode a 3-byte UTF-8 sequence. 83 */ 84static unsigned int 85utf8decode3(const char *str) 86{ 87 unsigned int uc; 88 89 uc = *str++ & 0x0F; 90 uc <<= 6; 91 uc |= *str++ & 0x3F; 92 uc <<= 6; 93 uc |= *str++ & 0x3F; 94 95 return uc; 96} 97 98/* 99 * Encode a 3-byte UTF-8 sequence. 100 */ 101static int 102utf8encode3(char *str, unsigned int val) 103{ 104 str[2] = (val & 0x3F) | 0x80; 105 val >>= 6; 106 str[1] = (val & 0x3F) | 0x80; 107 val >>= 6; 108 str[0] = val | 0xE0; 109 110 return 3; 111} 112 113/* 114 * utf8trie_t 115 * 116 * A compact binary tree, used to decode UTF-8 characters. 117 * 118 * Internal nodes are one byte for the node itself, and up to three 119 * bytes for an offset into the tree. The first byte contains the 120 * following information: 121 * NEXTBYTE - flag - advance to next byte if set 122 * BITNUM - 3 bit field - the bit number to tested 123 * OFFLEN - 2 bit field - number of bytes in the offset 124 * if offlen == 0 (non-branching node) 125 * RIGHTPATH - 1 bit field - set if the following node is for the 126 * right-hand path (tested bit is set) 127 * TRIENODE - 1 bit field - set if the following node is an internal 128 * node, otherwise it is a leaf node 129 * if offlen != 0 (branching node) 130 * LEFTNODE - 1 bit field - set if the left-hand node is internal 131 * RIGHTNODE - 1 bit field - set if the right-hand node is internal 132 * 133 * Due to the way utf8 works, there cannot be branching nodes with 134 * NEXTBYTE set, and moreover those nodes always have a righthand 135 * descendant. 136 */ 137typedef const unsigned char utf8trie_t; 138#define BITNUM 0x07 139#define NEXTBYTE 0x08 140#define OFFLEN 0x30 141#define OFFLEN_SHIFT 4 142#define RIGHTPATH 0x40 143#define TRIENODE 0x80 144#define RIGHTNODE 0x40 145#define LEFTNODE 0x80 146 147/* 148 * utf8leaf_t 149 * 150 * The leaves of the trie are embedded in the trie, and so the same 151 * underlying datatype: unsigned char. 152 * 153 * leaf[0]: The unicode version, stored as a generation number that is 154 * an index into ->utf8agetab[]. With this we can filter code 155 * points based on the unicode version in which they were 156 * defined. The CCC of a non-defined code point is 0. 157 * leaf[1]: Canonical Combining Class. During normalization, we need 158 * to do a stable sort into ascending order of all characters 159 * with a non-zero CCC that occur between two characters with 160 * a CCC of 0, or at the begin or end of a string. 161 * The unicode standard guarantees that all CCC values are 162 * between 0 and 254 inclusive, which leaves 255 available as 163 * a special value. 164 * Code points with CCC 0 are known as stoppers. 165 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the 166 * start of a NUL-terminated string that is the decomposition 167 * of the character. 168 * The CCC of a decomposable character is the same as the CCC 169 * of the first character of its decomposition. 170 * Some characters decompose as the empty string: these are 171 * characters with the Default_Ignorable_Code_Point property. 172 * These do affect normalization, as they all have CCC 0. 173 * 174 * The decompositions in the trie have been fully expanded, with the 175 * exception of Hangul syllables, which are decomposed algorithmically. 176 * 177 * Casefolding, if applicable, is also done using decompositions. 178 * 179 * The trie is constructed in such a way that leaves exist for all 180 * UTF-8 sequences that match the criteria from the "UTF-8 valid 181 * ranges" comment above, and only for those sequences. Therefore a 182 * lookup in the trie can be used to validate the UTF-8 input. 183 */ 184typedef const unsigned char utf8leaf_t; 185 186#define LEAF_GEN(LEAF) ((LEAF)[0]) 187#define LEAF_CCC(LEAF) ((LEAF)[1]) 188#define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) 189 190#define MINCCC (0) 191#define MAXCCC (254) 192#define STOPPER (0) 193#define DECOMPOSE (255) 194 195/* Marker for hangul syllable decomposition. */ 196#define HANGUL ((char)(255)) 197/* Size of the synthesized leaf used for Hangul syllable decomposition. */ 198#define UTF8HANGULLEAF (12) 199 200/* 201 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 202 * 203 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 204 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 205 * 206 * SBase = 0xAC00 207 * LBase = 0x1100 208 * VBase = 0x1161 209 * TBase = 0x11A7 210 * LCount = 19 211 * VCount = 21 212 * TCount = 28 213 * NCount = 588 (VCount * TCount) 214 * SCount = 11172 (LCount * NCount) 215 * 216 * Decomposition: 217 * SIndex = s - SBase 218 * 219 * LV (Canonical/Full) 220 * LIndex = SIndex / NCount 221 * VIndex = (Sindex % NCount) / TCount 222 * LPart = LBase + LIndex 223 * VPart = VBase + VIndex 224 * 225 * LVT (Canonical) 226 * LVIndex = (SIndex / TCount) * TCount 227 * TIndex = (Sindex % TCount) 228 * LVPart = SBase + LVIndex 229 * TPart = TBase + TIndex 230 * 231 * LVT (Full) 232 * LIndex = SIndex / NCount 233 * VIndex = (Sindex % NCount) / TCount 234 * TIndex = (Sindex % TCount) 235 * LPart = LBase + LIndex 236 * VPart = VBase + VIndex 237 * if (TIndex == 0) { 238 * d = <LPart, VPart> 239 * } else { 240 * TPart = TBase + TIndex 241 * d = <LPart, TPart, VPart> 242 * } 243 */ 244 245/* Constants */ 246#define SB (0xAC00) 247#define LB (0x1100) 248#define VB (0x1161) 249#define TB (0x11A7) 250#define LC (19) 251#define VC (21) 252#define TC (28) 253#define NC (VC * TC) 254#define SC (LC * NC) 255 256/* Algorithmic decomposition of hangul syllable. */ 257static utf8leaf_t * 258utf8hangul(const char *str, unsigned char *hangul) 259{ 260 unsigned int si; 261 unsigned int li; 262 unsigned int vi; 263 unsigned int ti; 264 unsigned char *h; 265 266 /* Calculate the SI, LI, VI, and TI values. */ 267 si = utf8decode3(str) - SB; 268 li = si / NC; 269 vi = (si % NC) / TC; 270 ti = si % TC; 271 272 /* Fill in base of leaf. */ 273 h = hangul; 274 LEAF_GEN(h) = 2; 275 LEAF_CCC(h) = DECOMPOSE; 276 h += 2; 277 278 /* Add LPart, a 3-byte UTF-8 sequence. */ 279 h += utf8encode3((char *)h, li + LB); 280 281 /* Add VPart, a 3-byte UTF-8 sequence. */ 282 h += utf8encode3((char *)h, vi + VB); 283 284 /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 285 if (ti) 286 h += utf8encode3((char *)h, ti + TB); 287 288 /* Terminate string. */ 289 h[0] = '\0'; 290 291 return hangul; 292} 293 294/* 295 * Use trie to scan s, touching at most len bytes. 296 * Returns the leaf if one exists, NULL otherwise. 297 * 298 * A non-NULL return guarantees that the UTF-8 sequence starting at s 299 * is well-formed and corresponds to a known unicode code point. The 300 * shorthand for this will be "is valid UTF-8 unicode". 301 */ 302static utf8leaf_t *utf8nlookup(const struct unicode_map *um, 303 enum utf8_normalization n, unsigned char *hangul, const char *s, 304 size_t len) 305{ 306 utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset; 307 int offlen; 308 int offset; 309 int mask; 310 int node; 311 312 if (len == 0) 313 return NULL; 314 315 node = 1; 316 while (node) { 317 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 318 if (*trie & NEXTBYTE) { 319 if (--len == 0) 320 return NULL; 321 s++; 322 } 323 mask = 1 << (*trie & BITNUM); 324 if (*s & mask) { 325 /* Right leg */ 326 if (offlen) { 327 /* Right node at offset of trie */ 328 node = (*trie & RIGHTNODE); 329 offset = trie[offlen]; 330 while (--offlen) { 331 offset <<= 8; 332 offset |= trie[offlen]; 333 } 334 trie += offset; 335 } else if (*trie & RIGHTPATH) { 336 /* Right node after this node */ 337 node = (*trie & TRIENODE); 338 trie++; 339 } else { 340 /* No right node. */ 341 return NULL; 342 } 343 } else { 344 /* Left leg */ 345 if (offlen) { 346 /* Left node after this node. */ 347 node = (*trie & LEFTNODE); 348 trie += offlen + 1; 349 } else if (*trie & RIGHTPATH) { 350 /* No left node. */ 351 return NULL; 352 } else { 353 /* Left node after this node */ 354 node = (*trie & TRIENODE); 355 trie++; 356 } 357 } 358 } 359 /* 360 * Hangul decomposition is done algorithmically. These are the 361 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 362 * always 3 bytes long, so s has been advanced twice, and the 363 * start of the sequence is at s-2. 364 */ 365 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 366 trie = utf8hangul(s - 2, hangul); 367 return trie; 368} 369 370/* 371 * Use trie to scan s. 372 * Returns the leaf if one exists, NULL otherwise. 373 * 374 * Forwards to utf8nlookup(). 375 */ 376static utf8leaf_t *utf8lookup(const struct unicode_map *um, 377 enum utf8_normalization n, unsigned char *hangul, const char *s) 378{ 379 return utf8nlookup(um, n, hangul, s, (size_t)-1); 380} 381 382/* 383 * Length of the normalization of s, touch at most len bytes. 384 * Return -1 if s is not valid UTF-8 unicode. 385 */ 386ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, 387 const char *s, size_t len) 388{ 389 utf8leaf_t *leaf; 390 size_t ret = 0; 391 unsigned char hangul[UTF8HANGULLEAF]; 392 393 while (len && *s) { 394 leaf = utf8nlookup(um, n, hangul, s, len); 395 if (!leaf) 396 return -1; 397 if (um->tables->utf8agetab[LEAF_GEN(leaf)] > 398 um->ntab[n]->maxage) 399 ret += utf8clen(s); 400 else if (LEAF_CCC(leaf) == DECOMPOSE) 401 ret += strlen(LEAF_STR(leaf)); 402 else 403 ret += utf8clen(s); 404 len -= utf8clen(s); 405 s += utf8clen(s); 406 } 407 return ret; 408} 409 410/* 411 * Set up an utf8cursor for use by utf8byte(). 412 * 413 * u8c : pointer to cursor. 414 * data : const struct utf8data to use for normalization. 415 * s : string. 416 * len : length of s. 417 * 418 * Returns -1 on error, 0 on success. 419 */ 420int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, 421 enum utf8_normalization n, const char *s, size_t len) 422{ 423 if (!s) 424 return -1; 425 u8c->um = um; 426 u8c->n = n; 427 u8c->s = s; 428 u8c->p = NULL; 429 u8c->ss = NULL; 430 u8c->sp = NULL; 431 u8c->len = len; 432 u8c->slen = 0; 433 u8c->ccc = STOPPER; 434 u8c->nccc = STOPPER; 435 /* Check we didn't clobber the maximum length. */ 436 if (u8c->len != len) 437 return -1; 438 /* The first byte of s may not be an utf8 continuation. */ 439 if (len > 0 && (*s & 0xC0) == 0x80) 440 return -1; 441 return 0; 442} 443 444/* 445 * Get one byte from the normalized form of the string described by u8c. 446 * 447 * Returns the byte cast to an unsigned char on succes, and -1 on failure. 448 * 449 * The cursor keeps track of the location in the string in u8c->s. 450 * When a character is decomposed, the current location is stored in 451 * u8c->p, and u8c->s is set to the start of the decomposition. Note 452 * that bytes from a decomposition do not count against u8c->len. 453 * 454 * Characters are emitted if they match the current CCC in u8c->ccc. 455 * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 456 * and the function returns 0 in that case. 457 * 458 * Sorting by CCC is done by repeatedly scanning the string. The 459 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 460 * the start of the scan. The first pass finds the lowest CCC to be 461 * emitted and stores it in u8c->nccc, the second pass emits the 462 * characters with this CCC and finds the next lowest CCC. This limits 463 * the number of passes to 1 + the number of different CCCs in the 464 * sequence being scanned. 465 * 466 * Therefore: 467 * u8c->p != NULL -> a decomposition is being scanned. 468 * u8c->ss != NULL -> this is a repeating scan. 469 * u8c->ccc == -1 -> this is the first scan of a repeating scan. 470 */ 471int utf8byte(struct utf8cursor *u8c) 472{ 473 utf8leaf_t *leaf; 474 int ccc; 475 476 for (;;) { 477 /* Check for the end of a decomposed character. */ 478 if (u8c->p && *u8c->s == '\0') { 479 u8c->s = u8c->p; 480 u8c->p = NULL; 481 } 482 483 /* Check for end-of-string. */ 484 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 485 /* There is no next byte. */ 486 if (u8c->ccc == STOPPER) 487 return 0; 488 /* End-of-string during a scan counts as a stopper. */ 489 ccc = STOPPER; 490 goto ccc_mismatch; 491 } else if ((*u8c->s & 0xC0) == 0x80) { 492 /* This is a continuation of the current character. */ 493 if (!u8c->p) 494 u8c->len--; 495 return (unsigned char)*u8c->s++; 496 } 497 498 /* Look up the data for the current character. */ 499 if (u8c->p) { 500 leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); 501 } else { 502 leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul, 503 u8c->s, u8c->len); 504 } 505 506 /* No leaf found implies that the input is a binary blob. */ 507 if (!leaf) 508 return -1; 509 510 ccc = LEAF_CCC(leaf); 511 /* Characters that are too new have CCC 0. */ 512 if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] > 513 u8c->um->ntab[u8c->n]->maxage) { 514 ccc = STOPPER; 515 } else if (ccc == DECOMPOSE) { 516 u8c->len -= utf8clen(u8c->s); 517 u8c->p = u8c->s + utf8clen(u8c->s); 518 u8c->s = LEAF_STR(leaf); 519 /* Empty decomposition implies CCC 0. */ 520 if (*u8c->s == '\0') { 521 if (u8c->ccc == STOPPER) 522 continue; 523 ccc = STOPPER; 524 goto ccc_mismatch; 525 } 526 527 leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); 528 if (!leaf) 529 return -1; 530 ccc = LEAF_CCC(leaf); 531 } 532 533 /* 534 * If this is not a stopper, then see if it updates 535 * the next canonical class to be emitted. 536 */ 537 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 538 u8c->nccc = ccc; 539 540 /* 541 * Return the current byte if this is the current 542 * combining class. 543 */ 544 if (ccc == u8c->ccc) { 545 if (!u8c->p) 546 u8c->len--; 547 return (unsigned char)*u8c->s++; 548 } 549 550 /* Current combining class mismatch. */ 551ccc_mismatch: 552 if (u8c->nccc == STOPPER) { 553 /* 554 * Scan forward for the first canonical class 555 * to be emitted. Save the position from 556 * which to restart. 557 */ 558 u8c->ccc = MINCCC - 1; 559 u8c->nccc = ccc; 560 u8c->sp = u8c->p; 561 u8c->ss = u8c->s; 562 u8c->slen = u8c->len; 563 if (!u8c->p) 564 u8c->len -= utf8clen(u8c->s); 565 u8c->s += utf8clen(u8c->s); 566 } else if (ccc != STOPPER) { 567 /* Not a stopper, and not the ccc we're emitting. */ 568 if (!u8c->p) 569 u8c->len -= utf8clen(u8c->s); 570 u8c->s += utf8clen(u8c->s); 571 } else if (u8c->nccc != MAXCCC + 1) { 572 /* At a stopper, restart for next ccc. */ 573 u8c->ccc = u8c->nccc; 574 u8c->nccc = MAXCCC + 1; 575 u8c->s = u8c->ss; 576 u8c->p = u8c->sp; 577 u8c->len = u8c->slen; 578 } else { 579 /* All done, proceed from here. */ 580 u8c->ccc = STOPPER; 581 u8c->nccc = STOPPER; 582 u8c->sp = NULL; 583 u8c->ss = NULL; 584 u8c->slen = 0; 585 } 586 } 587} 588 589#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE 590EXPORT_SYMBOL_GPL(utf8version_is_supported); 591EXPORT_SYMBOL_GPL(utf8nlen); 592EXPORT_SYMBOL_GPL(utf8ncursor); 593EXPORT_SYMBOL_GPL(utf8byte); 594#endif