cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

mkutf8data.c (81702B)


      1/*
      2 * Copyright (c) 2014 SGI.
      3 * All rights reserved.
      4 *
      5 * This program is free software; you can redistribute it and/or
      6 * modify it under the terms of the GNU General Public License as
      7 * published by the Free Software Foundation.
      8 *
      9 * This program is distributed in the hope that it would be useful,
     10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12 * GNU General Public License for more details.
     13 *
     14 * You should have received a copy of the GNU General Public License
     15 * along with this program; if not, write the Free Software Foundation,
     16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
     17 */
     18
     19/* Generator for a compact trie for unicode normalization */
     20
     21#include <sys/types.h>
     22#include <stddef.h>
     23#include <stdlib.h>
     24#include <stdio.h>
     25#include <assert.h>
     26#include <string.h>
     27#include <unistd.h>
     28#include <errno.h>
     29
     30/* Default names of the in- and output files. */
     31
     32#define AGE_NAME	"DerivedAge.txt"
     33#define CCC_NAME	"DerivedCombiningClass.txt"
     34#define PROP_NAME	"DerivedCoreProperties.txt"
     35#define DATA_NAME	"UnicodeData.txt"
     36#define FOLD_NAME	"CaseFolding.txt"
     37#define NORM_NAME	"NormalizationCorrections.txt"
     38#define TEST_NAME	"NormalizationTest.txt"
     39#define UTF8_NAME	"utf8data.h"
     40
     41const char	*age_name  = AGE_NAME;
     42const char	*ccc_name  = CCC_NAME;
     43const char	*prop_name = PROP_NAME;
     44const char	*data_name = DATA_NAME;
     45const char	*fold_name = FOLD_NAME;
     46const char	*norm_name = NORM_NAME;
     47const char	*test_name = TEST_NAME;
     48const char	*utf8_name = UTF8_NAME;
     49
     50int verbose = 0;
     51
     52/* An arbitrary line size limit on input lines. */
     53
     54#define LINESIZE	1024
     55char line[LINESIZE];
     56char buf0[LINESIZE];
     57char buf1[LINESIZE];
     58char buf2[LINESIZE];
     59char buf3[LINESIZE];
     60
     61const char *argv0;
     62
     63#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
     64
     65/* ------------------------------------------------------------------ */
     66
     67/*
     68 * Unicode version numbers consist of three parts: major, minor, and a
     69 * revision.  These numbers are packed into an unsigned int to obtain
     70 * a single version number.
     71 *
     72 * To save space in the generated trie, the unicode version is not
     73 * stored directly, instead we calculate a generation number from the
     74 * unicode versions seen in the DerivedAge file, and use that as an
     75 * index into a table of unicode versions.
     76 */
     77#define UNICODE_MAJ_SHIFT		(16)
     78#define UNICODE_MIN_SHIFT		(8)
     79
     80#define UNICODE_MAJ_MAX			((unsigned short)-1)
     81#define UNICODE_MIN_MAX			((unsigned char)-1)
     82#define UNICODE_REV_MAX			((unsigned char)-1)
     83
     84#define UNICODE_AGE(MAJ,MIN,REV)			\
     85	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
     86	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
     87	 ((unsigned int)(REV)))
     88
     89unsigned int *ages;
     90int ages_count;
     91
     92unsigned int unicode_maxage;
     93
     94static int age_valid(unsigned int major, unsigned int minor,
     95		     unsigned int revision)
     96{
     97	if (major > UNICODE_MAJ_MAX)
     98		return 0;
     99	if (minor > UNICODE_MIN_MAX)
    100		return 0;
    101	if (revision > UNICODE_REV_MAX)
    102		return 0;
    103	return 1;
    104}
    105
    106/* ------------------------------------------------------------------ */
    107
    108/*
    109 * utf8trie_t
    110 *
    111 * A compact binary tree, used to decode UTF-8 characters.
    112 *
    113 * Internal nodes are one byte for the node itself, and up to three
    114 * bytes for an offset into the tree.  The first byte contains the
    115 * following information:
    116 *  NEXTBYTE  - flag        - advance to next byte if set
    117 *  BITNUM    - 3 bit field - the bit number to tested
    118 *  OFFLEN    - 2 bit field - number of bytes in the offset
    119 * if offlen == 0 (non-branching node)
    120 *  RIGHTPATH - 1 bit field - set if the following node is for the
    121 *                            right-hand path (tested bit is set)
    122 *  TRIENODE  - 1 bit field - set if the following node is an internal
    123 *                            node, otherwise it is a leaf node
    124 * if offlen != 0 (branching node)
    125 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
    126 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
    127 *
    128 * Due to the way utf8 works, there cannot be branching nodes with
    129 * NEXTBYTE set, and moreover those nodes always have a righthand
    130 * descendant.
    131 */
    132typedef unsigned char utf8trie_t;
    133#define BITNUM		0x07
    134#define NEXTBYTE	0x08
    135#define OFFLEN		0x30
    136#define OFFLEN_SHIFT	4
    137#define RIGHTPATH	0x40
    138#define TRIENODE	0x80
    139#define RIGHTNODE	0x40
    140#define LEFTNODE	0x80
    141
    142/*
    143 * utf8leaf_t
    144 *
    145 * The leaves of the trie are embedded in the trie, and so the same
    146 * underlying datatype, unsigned char.
    147 *
    148 * leaf[0]: The unicode version, stored as a generation number that is
    149 *          an index into utf8agetab[].  With this we can filter code
    150 *          points based on the unicode version in which they were
    151 *          defined.  The CCC of a non-defined code point is 0.
    152 * leaf[1]: Canonical Combining Class. During normalization, we need
    153 *          to do a stable sort into ascending order of all characters
    154 *          with a non-zero CCC that occur between two characters with
    155 *          a CCC of 0, or at the begin or end of a string.
    156 *          The unicode standard guarantees that all CCC values are
    157 *          between 0 and 254 inclusive, which leaves 255 available as
    158 *          a special value.
    159 *          Code points with CCC 0 are known as stoppers.
    160 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
    161 *          start of a NUL-terminated string that is the decomposition
    162 *          of the character.
    163 *          The CCC of a decomposable character is the same as the CCC
    164 *          of the first character of its decomposition.
    165 *          Some characters decompose as the empty string: these are
    166 *          characters with the Default_Ignorable_Code_Point property.
    167 *          These do affect normalization, as they all have CCC 0.
    168 *
    169 * The decompositions in the trie have been fully expanded.
    170 *
    171 * Casefolding, if applicable, is also done using decompositions.
    172 */
    173typedef unsigned char utf8leaf_t;
    174
    175#define LEAF_GEN(LEAF)	((LEAF)[0])
    176#define LEAF_CCC(LEAF)	((LEAF)[1])
    177#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
    178
    179#define MAXGEN		(255)
    180
    181#define MINCCC		(0)
    182#define MAXCCC		(254)
    183#define STOPPER		(0)
    184#define DECOMPOSE	(255)
    185#define HANGUL		((char)(255))
    186
    187#define UTF8HANGULLEAF	(12)
    188
    189struct tree;
    190static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
    191			       const char *, size_t);
    192static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
    193
    194unsigned char *utf8data;
    195size_t utf8data_size;
    196
    197utf8trie_t *nfdi;
    198utf8trie_t *nfdicf;
    199
    200/* ------------------------------------------------------------------ */
    201
    202/*
    203 * UTF8 valid ranges.
    204 *
    205 * The UTF-8 encoding spreads the bits of a 32bit word over several
    206 * bytes. This table gives the ranges that can be held and how they'd
    207 * be represented.
    208 *
    209 * 0x00000000 0x0000007F: 0xxxxxxx
    210 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
    211 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
    212 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    213 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    214 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    215 *
    216 * There is an additional requirement on UTF-8, in that only the
    217 * shortest representation of a 32bit value is to be used.  A decoder
    218 * must not decode sequences that do not satisfy this requirement.
    219 * Thus the allowed ranges have a lower bound.
    220 *
    221 * 0x00000000 0x0000007F: 0xxxxxxx
    222 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
    223 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
    224 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    225 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    226 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    227 *
    228 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
    229 * 17 planes of 65536 values.  This limits the sequences actually seen
    230 * even more, to just the following.
    231 *
    232 *          0 -     0x7f: 0                     0x7f
    233 *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
    234 *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
    235 *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
    236 *
    237 * Even within those ranges not all values are allowed: the surrogates
    238 * 0xd800 - 0xdfff should never be seen.
    239 *
    240 * Note that the longest sequence seen with valid usage is 4 bytes,
    241 * the same a single UTF-32 character.  This makes the UTF-8
    242 * representation of Unicode strictly smaller than UTF-32.
    243 *
    244 * The shortest sequence requirement was introduced by:
    245 *    Corrigendum #1: UTF-8 Shortest Form
    246 * It can be found here:
    247 *    http://www.unicode.org/versions/corrigendum1.html
    248 *
    249 */
    250
    251#define UTF8_2_BITS     0xC0
    252#define UTF8_3_BITS     0xE0
    253#define UTF8_4_BITS     0xF0
    254#define UTF8_N_BITS     0x80
    255#define UTF8_2_MASK     0xE0
    256#define UTF8_3_MASK     0xF0
    257#define UTF8_4_MASK     0xF8
    258#define UTF8_N_MASK     0xC0
    259#define UTF8_V_MASK     0x3F
    260#define UTF8_V_SHIFT    6
    261
    262static int utf8encode(char *str, unsigned int val)
    263{
    264	int len;
    265
    266	if (val < 0x80) {
    267		str[0] = val;
    268		len = 1;
    269	} else if (val < 0x800) {
    270		str[1] = val & UTF8_V_MASK;
    271		str[1] |= UTF8_N_BITS;
    272		val >>= UTF8_V_SHIFT;
    273		str[0] = val;
    274		str[0] |= UTF8_2_BITS;
    275		len = 2;
    276	} else if (val < 0x10000) {
    277		str[2] = val & UTF8_V_MASK;
    278		str[2] |= UTF8_N_BITS;
    279		val >>= UTF8_V_SHIFT;
    280		str[1] = val & UTF8_V_MASK;
    281		str[1] |= UTF8_N_BITS;
    282		val >>= UTF8_V_SHIFT;
    283		str[0] = val;
    284		str[0] |= UTF8_3_BITS;
    285		len = 3;
    286	} else if (val < 0x110000) {
    287		str[3] = val & UTF8_V_MASK;
    288		str[3] |= UTF8_N_BITS;
    289		val >>= UTF8_V_SHIFT;
    290		str[2] = val & UTF8_V_MASK;
    291		str[2] |= UTF8_N_BITS;
    292		val >>= UTF8_V_SHIFT;
    293		str[1] = val & UTF8_V_MASK;
    294		str[1] |= UTF8_N_BITS;
    295		val >>= UTF8_V_SHIFT;
    296		str[0] = val;
    297		str[0] |= UTF8_4_BITS;
    298		len = 4;
    299	} else {
    300		printf("%#x: illegal val\n", val);
    301		len = 0;
    302	}
    303	return len;
    304}
    305
    306static unsigned int utf8decode(const char *str)
    307{
    308	const unsigned char *s = (const unsigned char*)str;
    309	unsigned int unichar = 0;
    310
    311	if (*s < 0x80) {
    312		unichar = *s;
    313	} else if (*s < UTF8_3_BITS) {
    314		unichar = *s++ & 0x1F;
    315		unichar <<= UTF8_V_SHIFT;
    316		unichar |= *s & 0x3F;
    317	} else if (*s < UTF8_4_BITS) {
    318		unichar = *s++ & 0x0F;
    319		unichar <<= UTF8_V_SHIFT;
    320		unichar |= *s++ & 0x3F;
    321		unichar <<= UTF8_V_SHIFT;
    322		unichar |= *s & 0x3F;
    323	} else {
    324		unichar = *s++ & 0x0F;
    325		unichar <<= UTF8_V_SHIFT;
    326		unichar |= *s++ & 0x3F;
    327		unichar <<= UTF8_V_SHIFT;
    328		unichar |= *s++ & 0x3F;
    329		unichar <<= UTF8_V_SHIFT;
    330		unichar |= *s & 0x3F;
    331	}
    332	return unichar;
    333}
    334
    335static int utf32valid(unsigned int unichar)
    336{
    337	return unichar < 0x110000;
    338}
    339
    340#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
    341
    342#define NODE 1
    343#define LEAF 0
    344
    345struct tree {
    346	void *root;
    347	int childnode;
    348	const char *type;
    349	unsigned int maxage;
    350	struct tree *next;
    351	int (*leaf_equal)(void *, void *);
    352	void (*leaf_print)(void *, int);
    353	int (*leaf_mark)(void *);
    354	int (*leaf_size)(void *);
    355	int *(*leaf_index)(struct tree *, void *);
    356	unsigned char *(*leaf_emit)(void *, unsigned char *);
    357	int leafindex[0x110000];
    358	int index;
    359};
    360
    361struct node {
    362	int index;
    363	int offset;
    364	int mark;
    365	int size;
    366	struct node *parent;
    367	void *left;
    368	void *right;
    369	unsigned char bitnum;
    370	unsigned char nextbyte;
    371	unsigned char leftnode;
    372	unsigned char rightnode;
    373	unsigned int keybits;
    374	unsigned int keymask;
    375};
    376
    377/*
    378 * Example lookup function for a tree.
    379 */
    380static void *lookup(struct tree *tree, const char *key)
    381{
    382	struct node *node;
    383	void *leaf = NULL;
    384
    385	node = tree->root;
    386	while (!leaf && node) {
    387		if (node->nextbyte)
    388			key++;
    389		if (*key & (1 << (node->bitnum & 7))) {
    390			/* Right leg */
    391			if (node->rightnode == NODE) {
    392				node = node->right;
    393			} else if (node->rightnode == LEAF) {
    394				leaf = node->right;
    395			} else {
    396				node = NULL;
    397			}
    398		} else {
    399			/* Left leg */
    400			if (node->leftnode == NODE) {
    401				node = node->left;
    402			} else if (node->leftnode == LEAF) {
    403				leaf = node->left;
    404			} else {
    405				node = NULL;
    406			}
    407		}
    408	}
    409
    410	return leaf;
    411}
    412
    413/*
    414 * A simple non-recursive tree walker: keep track of visits to the
    415 * left and right branches in the leftmask and rightmask.
    416 */
    417static void tree_walk(struct tree *tree)
    418{
    419	struct node *node;
    420	unsigned int leftmask;
    421	unsigned int rightmask;
    422	unsigned int bitmask;
    423	int indent = 1;
    424	int nodes, singletons, leaves;
    425
    426	nodes = singletons = leaves = 0;
    427
    428	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
    429	if (tree->childnode == LEAF) {
    430		assert(tree->root);
    431		tree->leaf_print(tree->root, indent);
    432		leaves = 1;
    433	} else {
    434		assert(tree->childnode == NODE);
    435		node = tree->root;
    436		leftmask = rightmask = 0;
    437		while (node) {
    438			printf("%*snode @ %p bitnum %d nextbyte %d"
    439			       " left %p right %p mask %x bits %x\n",
    440				indent, "", node,
    441				node->bitnum, node->nextbyte,
    442				node->left, node->right,
    443				node->keymask, node->keybits);
    444			nodes += 1;
    445			if (!(node->left && node->right))
    446				singletons += 1;
    447
    448			while (node) {
    449				bitmask = 1 << node->bitnum;
    450				if ((leftmask & bitmask) == 0) {
    451					leftmask |= bitmask;
    452					if (node->leftnode == LEAF) {
    453						assert(node->left);
    454						tree->leaf_print(node->left,
    455								 indent+1);
    456						leaves += 1;
    457					} else if (node->left) {
    458						assert(node->leftnode == NODE);
    459						indent += 1;
    460						node = node->left;
    461						break;
    462					}
    463				}
    464				if ((rightmask & bitmask) == 0) {
    465					rightmask |= bitmask;
    466					if (node->rightnode == LEAF) {
    467						assert(node->right);
    468						tree->leaf_print(node->right,
    469								 indent+1);
    470						leaves += 1;
    471					} else if (node->right) {
    472						assert(node->rightnode == NODE);
    473						indent += 1;
    474						node = node->right;
    475						break;
    476					}
    477				}
    478				leftmask &= ~bitmask;
    479				rightmask &= ~bitmask;
    480				node = node->parent;
    481				indent -= 1;
    482			}
    483		}
    484	}
    485	printf("nodes %d leaves %d singletons %d\n",
    486	       nodes, leaves, singletons);
    487}
    488
    489/*
    490 * Allocate an initialize a new internal node.
    491 */
    492static struct node *alloc_node(struct node *parent)
    493{
    494	struct node *node;
    495	int bitnum;
    496
    497	node = malloc(sizeof(*node));
    498	node->left = node->right = NULL;
    499	node->parent = parent;
    500	node->leftnode = NODE;
    501	node->rightnode = NODE;
    502	node->keybits = 0;
    503	node->keymask = 0;
    504	node->mark = 0;
    505	node->index = 0;
    506	node->offset = -1;
    507	node->size = 4;
    508
    509	if (node->parent) {
    510		bitnum = parent->bitnum;
    511		if ((bitnum & 7) == 0) {
    512			node->bitnum = bitnum + 7 + 8;
    513			node->nextbyte = 1;
    514		} else {
    515			node->bitnum = bitnum - 1;
    516			node->nextbyte = 0;
    517		}
    518	} else {
    519		node->bitnum = 7;
    520		node->nextbyte = 0;
    521	}
    522
    523	return node;
    524}
    525
    526/*
    527 * Insert a new leaf into the tree, and collapse any subtrees that are
    528 * fully populated and end in identical leaves. A nextbyte tagged
    529 * internal node will not be removed to preserve the tree's integrity.
    530 * Note that due to the structure of utf8, no nextbyte tagged node
    531 * will be a candidate for removal.
    532 */
    533static int insert(struct tree *tree, char *key, int keylen, void *leaf)
    534{
    535	struct node *node;
    536	struct node *parent;
    537	void **cursor;
    538	int keybits;
    539
    540	assert(keylen >= 1 && keylen <= 4);
    541
    542	node = NULL;
    543	cursor = &tree->root;
    544	keybits = 8 * keylen;
    545
    546	/* Insert, creating path along the way. */
    547	while (keybits) {
    548		if (!*cursor)
    549			*cursor = alloc_node(node);
    550		node = *cursor;
    551		if (node->nextbyte)
    552			key++;
    553		if (*key & (1 << (node->bitnum & 7)))
    554			cursor = &node->right;
    555		else
    556			cursor = &node->left;
    557		keybits--;
    558	}
    559	*cursor = leaf;
    560
    561	/* Merge subtrees if possible. */
    562	while (node) {
    563		if (*key & (1 << (node->bitnum & 7)))
    564			node->rightnode = LEAF;
    565		else
    566			node->leftnode = LEAF;
    567		if (node->nextbyte)
    568			break;
    569		if (node->leftnode == NODE || node->rightnode == NODE)
    570			break;
    571		assert(node->left);
    572		assert(node->right);
    573		/* Compare */
    574		if (! tree->leaf_equal(node->left, node->right))
    575			break;
    576		/* Keep left, drop right leaf. */
    577		leaf = node->left;
    578		/* Check in parent */
    579		parent = node->parent;
    580		if (!parent) {
    581			/* root of tree! */
    582			tree->root = leaf;
    583			tree->childnode = LEAF;
    584		} else if (parent->left == node) {
    585			parent->left = leaf;
    586			parent->leftnode = LEAF;
    587			if (parent->right) {
    588				parent->keymask = 0;
    589				parent->keybits = 0;
    590			} else {
    591				parent->keymask |= (1 << node->bitnum);
    592			}
    593		} else if (parent->right == node) {
    594			parent->right = leaf;
    595			parent->rightnode = LEAF;
    596			if (parent->left) {
    597				parent->keymask = 0;
    598				parent->keybits = 0;
    599			} else {
    600				parent->keymask |= (1 << node->bitnum);
    601				parent->keybits |= (1 << node->bitnum);
    602			}
    603		} else {
    604			/* internal tree error */
    605			assert(0);
    606		}
    607		free(node);
    608		node = parent;
    609	}
    610
    611	/* Propagate keymasks up along singleton chains. */
    612	while (node) {
    613		parent = node->parent;
    614		if (!parent)
    615			break;
    616		/* Nix the mask for parents with two children. */
    617		if (node->keymask == 0) {
    618			parent->keymask = 0;
    619			parent->keybits = 0;
    620		} else if (parent->left && parent->right) {
    621			parent->keymask = 0;
    622			parent->keybits = 0;
    623		} else {
    624			assert((parent->keymask & node->keymask) == 0);
    625			parent->keymask |= node->keymask;
    626			parent->keymask |= (1 << parent->bitnum);
    627			parent->keybits |= node->keybits;
    628			if (parent->right)
    629				parent->keybits |= (1 << parent->bitnum);
    630		}
    631		node = parent;
    632	}
    633
    634	return 0;
    635}
    636
    637/*
    638 * Prune internal nodes.
    639 *
    640 * Fully populated subtrees that end at the same leaf have already
    641 * been collapsed.  There are still internal nodes that have for both
    642 * their left and right branches a sequence of singletons that make
    643 * identical choices and end in identical leaves.  The keymask and
    644 * keybits collected in the nodes describe the choices made in these
    645 * singleton chains.  When they are identical for the left and right
    646 * branch of a node, and the two leaves comare identical, the node in
    647 * question can be removed.
    648 *
    649 * Note that nodes with the nextbyte tag set will not be removed by
    650 * this to ensure tree integrity.  Note as well that the structure of
    651 * utf8 ensures that these nodes would not have been candidates for
    652 * removal in any case.
    653 */
    654static void prune(struct tree *tree)
    655{
    656	struct node *node;
    657	struct node *left;
    658	struct node *right;
    659	struct node *parent;
    660	void *leftleaf;
    661	void *rightleaf;
    662	unsigned int leftmask;
    663	unsigned int rightmask;
    664	unsigned int bitmask;
    665	int count;
    666
    667	if (verbose > 0)
    668		printf("Pruning %s_%x\n", tree->type, tree->maxage);
    669
    670	count = 0;
    671	if (tree->childnode == LEAF)
    672		return;
    673	if (!tree->root)
    674		return;
    675
    676	leftmask = rightmask = 0;
    677	node = tree->root;
    678	while (node) {
    679		if (node->nextbyte)
    680			goto advance;
    681		if (node->leftnode == LEAF)
    682			goto advance;
    683		if (node->rightnode == LEAF)
    684			goto advance;
    685		if (!node->left)
    686			goto advance;
    687		if (!node->right)
    688			goto advance;
    689		left = node->left;
    690		right = node->right;
    691		if (left->keymask == 0)
    692			goto advance;
    693		if (right->keymask == 0)
    694			goto advance;
    695		if (left->keymask != right->keymask)
    696			goto advance;
    697		if (left->keybits != right->keybits)
    698			goto advance;
    699		leftleaf = NULL;
    700		while (!leftleaf) {
    701			assert(left->left || left->right);
    702			if (left->leftnode == LEAF)
    703				leftleaf = left->left;
    704			else if (left->rightnode == LEAF)
    705				leftleaf = left->right;
    706			else if (left->left)
    707				left = left->left;
    708			else if (left->right)
    709				left = left->right;
    710			else
    711				assert(0);
    712		}
    713		rightleaf = NULL;
    714		while (!rightleaf) {
    715			assert(right->left || right->right);
    716			if (right->leftnode == LEAF)
    717				rightleaf = right->left;
    718			else if (right->rightnode == LEAF)
    719				rightleaf = right->right;
    720			else if (right->left)
    721				right = right->left;
    722			else if (right->right)
    723				right = right->right;
    724			else
    725				assert(0);
    726		}
    727		if (! tree->leaf_equal(leftleaf, rightleaf))
    728			goto advance;
    729		/*
    730		 * This node has identical singleton-only subtrees.
    731		 * Remove it.
    732		 */
    733		parent = node->parent;
    734		left = node->left;
    735		right = node->right;
    736		if (parent->left == node)
    737			parent->left = left;
    738		else if (parent->right == node)
    739			parent->right = left;
    740		else
    741			assert(0);
    742		left->parent = parent;
    743		left->keymask |= (1 << node->bitnum);
    744		node->left = NULL;
    745		while (node) {
    746			bitmask = 1 << node->bitnum;
    747			leftmask &= ~bitmask;
    748			rightmask &= ~bitmask;
    749			if (node->leftnode == NODE && node->left) {
    750				left = node->left;
    751				free(node);
    752				count++;
    753				node = left;
    754			} else if (node->rightnode == NODE && node->right) {
    755				right = node->right;
    756				free(node);
    757				count++;
    758				node = right;
    759			} else {
    760				node = NULL;
    761			}
    762		}
    763		/* Propagate keymasks up along singleton chains. */
    764		node = parent;
    765		/* Force re-check */
    766		bitmask = 1 << node->bitnum;
    767		leftmask &= ~bitmask;
    768		rightmask &= ~bitmask;
    769		for (;;) {
    770			if (node->left && node->right)
    771				break;
    772			if (node->left) {
    773				left = node->left;
    774				node->keymask |= left->keymask;
    775				node->keybits |= left->keybits;
    776			}
    777			if (node->right) {
    778				right = node->right;
    779				node->keymask |= right->keymask;
    780				node->keybits |= right->keybits;
    781			}
    782			node->keymask |= (1 << node->bitnum);
    783			node = node->parent;
    784			/* Force re-check */
    785			bitmask = 1 << node->bitnum;
    786			leftmask &= ~bitmask;
    787			rightmask &= ~bitmask;
    788		}
    789	advance:
    790		bitmask = 1 << node->bitnum;
    791		if ((leftmask & bitmask) == 0 &&
    792		    node->leftnode == NODE &&
    793		    node->left) {
    794			leftmask |= bitmask;
    795			node = node->left;
    796		} else if ((rightmask & bitmask) == 0 &&
    797			   node->rightnode == NODE &&
    798			   node->right) {
    799			rightmask |= bitmask;
    800			node = node->right;
    801		} else {
    802			leftmask &= ~bitmask;
    803			rightmask &= ~bitmask;
    804			node = node->parent;
    805		}
    806	}
    807	if (verbose > 0)
    808		printf("Pruned %d nodes\n", count);
    809}
    810
    811/*
    812 * Mark the nodes in the tree that lead to leaves that must be
    813 * emitted.
    814 */
    815static void mark_nodes(struct tree *tree)
    816{
    817	struct node *node;
    818	struct node *n;
    819	unsigned int leftmask;
    820	unsigned int rightmask;
    821	unsigned int bitmask;
    822	int marked;
    823
    824	marked = 0;
    825	if (verbose > 0)
    826		printf("Marking %s_%x\n", tree->type, tree->maxage);
    827	if (tree->childnode == LEAF)
    828		goto done;
    829
    830	assert(tree->childnode == NODE);
    831	node = tree->root;
    832	leftmask = rightmask = 0;
    833	while (node) {
    834		bitmask = 1 << node->bitnum;
    835		if ((leftmask & bitmask) == 0) {
    836			leftmask |= bitmask;
    837			if (node->leftnode == LEAF) {
    838				assert(node->left);
    839				if (tree->leaf_mark(node->left)) {
    840					n = node;
    841					while (n && !n->mark) {
    842						marked++;
    843						n->mark = 1;
    844						n = n->parent;
    845					}
    846				}
    847			} else if (node->left) {
    848				assert(node->leftnode == NODE);
    849				node = node->left;
    850				continue;
    851			}
    852		}
    853		if ((rightmask & bitmask) == 0) {
    854			rightmask |= bitmask;
    855			if (node->rightnode == LEAF) {
    856				assert(node->right);
    857				if (tree->leaf_mark(node->right)) {
    858					n = node;
    859					while (n && !n->mark) {
    860						marked++;
    861						n->mark = 1;
    862						n = n->parent;
    863					}
    864				}
    865			} else if (node->right) {
    866				assert(node->rightnode == NODE);
    867				node = node->right;
    868				continue;
    869			}
    870		}
    871		leftmask &= ~bitmask;
    872		rightmask &= ~bitmask;
    873		node = node->parent;
    874	}
    875
    876	/* second pass: left siblings and singletons */
    877
    878	assert(tree->childnode == NODE);
    879	node = tree->root;
    880	leftmask = rightmask = 0;
    881	while (node) {
    882		bitmask = 1 << node->bitnum;
    883		if ((leftmask & bitmask) == 0) {
    884			leftmask |= bitmask;
    885			if (node->leftnode == LEAF) {
    886				assert(node->left);
    887				if (tree->leaf_mark(node->left)) {
    888					n = node;
    889					while (n && !n->mark) {
    890						marked++;
    891						n->mark = 1;
    892						n = n->parent;
    893					}
    894				}
    895			} else if (node->left) {
    896				assert(node->leftnode == NODE);
    897				node = node->left;
    898				if (!node->mark && node->parent->mark) {
    899					marked++;
    900					node->mark = 1;
    901				}
    902				continue;
    903			}
    904		}
    905		if ((rightmask & bitmask) == 0) {
    906			rightmask |= bitmask;
    907			if (node->rightnode == LEAF) {
    908				assert(node->right);
    909				if (tree->leaf_mark(node->right)) {
    910					n = node;
    911					while (n && !n->mark) {
    912						marked++;
    913						n->mark = 1;
    914						n = n->parent;
    915					}
    916				}
    917			} else if (node->right) {
    918				assert(node->rightnode == NODE);
    919				node = node->right;
    920				if (!node->mark && node->parent->mark &&
    921				    !node->parent->left) {
    922					marked++;
    923					node->mark = 1;
    924				}
    925				continue;
    926			}
    927		}
    928		leftmask &= ~bitmask;
    929		rightmask &= ~bitmask;
    930		node = node->parent;
    931	}
    932done:
    933	if (verbose > 0)
    934		printf("Marked %d nodes\n", marked);
    935}
    936
    937/*
    938 * Compute the index of each node and leaf, which is the offset in the
    939 * emitted trie.  These values must be pre-computed because relative
    940 * offsets between nodes are used to navigate the tree.
    941 */
    942static int index_nodes(struct tree *tree, int index)
    943{
    944	struct node *node;
    945	unsigned int leftmask;
    946	unsigned int rightmask;
    947	unsigned int bitmask;
    948	int count;
    949	int indent;
    950
    951	/* Align to a cache line (or half a cache line?). */
    952	while (index % 64)
    953		index++;
    954	tree->index = index;
    955	indent = 1;
    956	count = 0;
    957
    958	if (verbose > 0)
    959		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
    960	if (tree->childnode == LEAF) {
    961		index += tree->leaf_size(tree->root);
    962		goto done;
    963	}
    964
    965	assert(tree->childnode == NODE);
    966	node = tree->root;
    967	leftmask = rightmask = 0;
    968	while (node) {
    969		if (!node->mark)
    970			goto skip;
    971		count++;
    972		if (node->index != index)
    973			node->index = index;
    974		index += node->size;
    975skip:
    976		while (node) {
    977			bitmask = 1 << node->bitnum;
    978			if (node->mark && (leftmask & bitmask) == 0) {
    979				leftmask |= bitmask;
    980				if (node->leftnode == LEAF) {
    981					assert(node->left);
    982					*tree->leaf_index(tree, node->left) =
    983									index;
    984					index += tree->leaf_size(node->left);
    985					count++;
    986				} else if (node->left) {
    987					assert(node->leftnode == NODE);
    988					indent += 1;
    989					node = node->left;
    990					break;
    991				}
    992			}
    993			if (node->mark && (rightmask & bitmask) == 0) {
    994				rightmask |= bitmask;
    995				if (node->rightnode == LEAF) {
    996					assert(node->right);
    997					*tree->leaf_index(tree, node->right) = index;
    998					index += tree->leaf_size(node->right);
    999					count++;
   1000				} else if (node->right) {
   1001					assert(node->rightnode == NODE);
   1002					indent += 1;
   1003					node = node->right;
   1004					break;
   1005				}
   1006			}
   1007			leftmask &= ~bitmask;
   1008			rightmask &= ~bitmask;
   1009			node = node->parent;
   1010			indent -= 1;
   1011		}
   1012	}
   1013done:
   1014	/* Round up to a multiple of 16 */
   1015	while (index % 16)
   1016		index++;
   1017	if (verbose > 0)
   1018		printf("Final index %d\n", index);
   1019	return index;
   1020}
   1021
   1022/*
   1023 * Mark the nodes in a subtree, helper for size_nodes().
   1024 */
   1025static int mark_subtree(struct node *node)
   1026{
   1027	int changed;
   1028
   1029	if (!node || node->mark)
   1030		return 0;
   1031	node->mark = 1;
   1032	node->index = node->parent->index;
   1033	changed = 1;
   1034	if (node->leftnode == NODE)
   1035		changed += mark_subtree(node->left);
   1036	if (node->rightnode == NODE)
   1037		changed += mark_subtree(node->right);
   1038	return changed;
   1039}
   1040
   1041/*
   1042 * Compute the size of nodes and leaves. We start by assuming that
   1043 * each node needs to store a three-byte offset. The indexes of the
   1044 * nodes are calculated based on that, and then this function is
   1045 * called to see if the sizes of some nodes can be reduced.  This is
   1046 * repeated until no more changes are seen.
   1047 */
   1048static int size_nodes(struct tree *tree)
   1049{
   1050	struct tree *next;
   1051	struct node *node;
   1052	struct node *right;
   1053	struct node *n;
   1054	unsigned int leftmask;
   1055	unsigned int rightmask;
   1056	unsigned int bitmask;
   1057	unsigned int pathbits;
   1058	unsigned int pathmask;
   1059	unsigned int nbit;
   1060	int changed;
   1061	int offset;
   1062	int size;
   1063	int indent;
   1064
   1065	indent = 1;
   1066	changed = 0;
   1067	size = 0;
   1068
   1069	if (verbose > 0)
   1070		printf("Sizing %s_%x\n", tree->type, tree->maxage);
   1071	if (tree->childnode == LEAF)
   1072		goto done;
   1073
   1074	assert(tree->childnode == NODE);
   1075	pathbits = 0;
   1076	pathmask = 0;
   1077	node = tree->root;
   1078	leftmask = rightmask = 0;
   1079	while (node) {
   1080		if (!node->mark)
   1081			goto skip;
   1082		offset = 0;
   1083		if (!node->left || !node->right) {
   1084			size = 1;
   1085		} else {
   1086			if (node->rightnode == NODE) {
   1087				/*
   1088				 * If the right node is not marked,
   1089				 * look for a corresponding node in
   1090				 * the next tree.  Such a node need
   1091				 * not exist.
   1092				 */
   1093				right = node->right;
   1094				next = tree->next;
   1095				while (!right->mark) {
   1096					assert(next);
   1097					n = next->root;
   1098					while (n->bitnum != node->bitnum) {
   1099						nbit = 1 << n->bitnum;
   1100						if (!(pathmask & nbit))
   1101							break;
   1102						if (pathbits & nbit) {
   1103							if (n->rightnode == LEAF)
   1104								break;
   1105							n = n->right;
   1106						} else {
   1107							if (n->leftnode == LEAF)
   1108								break;
   1109							n = n->left;
   1110						}
   1111					}
   1112					if (n->bitnum != node->bitnum)
   1113						break;
   1114					n = n->right;
   1115					right = n;
   1116					next = next->next;
   1117				}
   1118				/* Make sure the right node is marked. */
   1119				if (!right->mark)
   1120					changed += mark_subtree(right);
   1121				offset = right->index - node->index;
   1122			} else {
   1123				offset = *tree->leaf_index(tree, node->right);
   1124				offset -= node->index;
   1125			}
   1126			assert(offset >= 0);
   1127			assert(offset <= 0xffffff);
   1128			if (offset <= 0xff) {
   1129				size = 2;
   1130			} else if (offset <= 0xffff) {
   1131				size = 3;
   1132			} else { /* offset <= 0xffffff */
   1133				size = 4;
   1134			}
   1135		}
   1136		if (node->size != size || node->offset != offset) {
   1137			node->size = size;
   1138			node->offset = offset;
   1139			changed++;
   1140		}
   1141skip:
   1142		while (node) {
   1143			bitmask = 1 << node->bitnum;
   1144			pathmask |= bitmask;
   1145			if (node->mark && (leftmask & bitmask) == 0) {
   1146				leftmask |= bitmask;
   1147				if (node->leftnode == LEAF) {
   1148					assert(node->left);
   1149				} else if (node->left) {
   1150					assert(node->leftnode == NODE);
   1151					indent += 1;
   1152					node = node->left;
   1153					break;
   1154				}
   1155			}
   1156			if (node->mark && (rightmask & bitmask) == 0) {
   1157				rightmask |= bitmask;
   1158				pathbits |= bitmask;
   1159				if (node->rightnode == LEAF) {
   1160					assert(node->right);
   1161				} else if (node->right) {
   1162					assert(node->rightnode == NODE);
   1163					indent += 1;
   1164					node = node->right;
   1165					break;
   1166				}
   1167			}
   1168			leftmask &= ~bitmask;
   1169			rightmask &= ~bitmask;
   1170			pathmask &= ~bitmask;
   1171			pathbits &= ~bitmask;
   1172			node = node->parent;
   1173			indent -= 1;
   1174		}
   1175	}
   1176done:
   1177	if (verbose > 0)
   1178		printf("Found %d changes\n", changed);
   1179	return changed;
   1180}
   1181
   1182/*
   1183 * Emit a trie for the given tree into the data array.
   1184 */
   1185static void emit(struct tree *tree, unsigned char *data)
   1186{
   1187	struct node *node;
   1188	unsigned int leftmask;
   1189	unsigned int rightmask;
   1190	unsigned int bitmask;
   1191	int offlen;
   1192	int offset;
   1193	int index;
   1194	int indent;
   1195	int size;
   1196	int bytes;
   1197	int leaves;
   1198	int nodes[4];
   1199	unsigned char byte;
   1200
   1201	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
   1202	leaves = 0;
   1203	bytes = 0;
   1204	index = tree->index;
   1205	data += index;
   1206	indent = 1;
   1207	if (verbose > 0)
   1208		printf("Emitting %s_%x\n", tree->type, tree->maxage);
   1209	if (tree->childnode == LEAF) {
   1210		assert(tree->root);
   1211		tree->leaf_emit(tree->root, data);
   1212		size = tree->leaf_size(tree->root);
   1213		index += size;
   1214		leaves++;
   1215		goto done;
   1216	}
   1217
   1218	assert(tree->childnode == NODE);
   1219	node = tree->root;
   1220	leftmask = rightmask = 0;
   1221	while (node) {
   1222		if (!node->mark)
   1223			goto skip;
   1224		assert(node->offset != -1);
   1225		assert(node->index == index);
   1226
   1227		byte = 0;
   1228		if (node->nextbyte)
   1229			byte |= NEXTBYTE;
   1230		byte |= (node->bitnum & BITNUM);
   1231		if (node->left && node->right) {
   1232			if (node->leftnode == NODE)
   1233				byte |= LEFTNODE;
   1234			if (node->rightnode == NODE)
   1235				byte |= RIGHTNODE;
   1236			if (node->offset <= 0xff)
   1237				offlen = 1;
   1238			else if (node->offset <= 0xffff)
   1239				offlen = 2;
   1240			else
   1241				offlen = 3;
   1242			nodes[offlen]++;
   1243			offset = node->offset;
   1244			byte |= offlen << OFFLEN_SHIFT;
   1245			*data++ = byte;
   1246			index++;
   1247			while (offlen--) {
   1248				*data++ = offset & 0xff;
   1249				index++;
   1250				offset >>= 8;
   1251			}
   1252		} else if (node->left) {
   1253			if (node->leftnode == NODE)
   1254				byte |= TRIENODE;
   1255			nodes[0]++;
   1256			*data++ = byte;
   1257			index++;
   1258		} else if (node->right) {
   1259			byte |= RIGHTNODE;
   1260			if (node->rightnode == NODE)
   1261				byte |= TRIENODE;
   1262			nodes[0]++;
   1263			*data++ = byte;
   1264			index++;
   1265		} else {
   1266			assert(0);
   1267		}
   1268skip:
   1269		while (node) {
   1270			bitmask = 1 << node->bitnum;
   1271			if (node->mark && (leftmask & bitmask) == 0) {
   1272				leftmask |= bitmask;
   1273				if (node->leftnode == LEAF) {
   1274					assert(node->left);
   1275					data = tree->leaf_emit(node->left,
   1276							       data);
   1277					size = tree->leaf_size(node->left);
   1278					index += size;
   1279					bytes += size;
   1280					leaves++;
   1281				} else if (node->left) {
   1282					assert(node->leftnode == NODE);
   1283					indent += 1;
   1284					node = node->left;
   1285					break;
   1286				}
   1287			}
   1288			if (node->mark && (rightmask & bitmask) == 0) {
   1289				rightmask |= bitmask;
   1290				if (node->rightnode == LEAF) {
   1291					assert(node->right);
   1292					data = tree->leaf_emit(node->right,
   1293							       data);
   1294					size = tree->leaf_size(node->right);
   1295					index += size;
   1296					bytes += size;
   1297					leaves++;
   1298				} else if (node->right) {
   1299					assert(node->rightnode == NODE);
   1300					indent += 1;
   1301					node = node->right;
   1302					break;
   1303				}
   1304			}
   1305			leftmask &= ~bitmask;
   1306			rightmask &= ~bitmask;
   1307			node = node->parent;
   1308			indent -= 1;
   1309		}
   1310	}
   1311done:
   1312	if (verbose > 0) {
   1313		printf("Emitted %d (%d) leaves",
   1314			leaves, bytes);
   1315		printf(" %d (%d+%d+%d+%d) nodes",
   1316			nodes[0] + nodes[1] + nodes[2] + nodes[3],
   1317			nodes[0], nodes[1], nodes[2], nodes[3]);
   1318		printf(" %d total\n", index - tree->index);
   1319	}
   1320}
   1321
   1322/* ------------------------------------------------------------------ */
   1323
   1324/*
   1325 * Unicode data.
   1326 *
   1327 * We need to keep track of the Canonical Combining Class, the Age,
   1328 * and decompositions for a code point.
   1329 *
   1330 * For the Age, we store the index into the ages table.  Effectively
   1331 * this is a generation number that the table maps to a unicode
   1332 * version.
   1333 *
   1334 * The correction field is used to indicate that this entry is in the
   1335 * corrections array, which contains decompositions that were
   1336 * corrected in later revisions.  The value of the correction field is
   1337 * the Unicode version in which the mapping was corrected.
   1338 */
   1339struct unicode_data {
   1340	unsigned int code;
   1341	int ccc;
   1342	int gen;
   1343	int correction;
   1344	unsigned int *utf32nfdi;
   1345	unsigned int *utf32nfdicf;
   1346	char *utf8nfdi;
   1347	char *utf8nfdicf;
   1348};
   1349
   1350struct unicode_data unicode_data[0x110000];
   1351struct unicode_data *corrections;
   1352int    corrections_count;
   1353
   1354struct tree *nfdi_tree;
   1355struct tree *nfdicf_tree;
   1356
   1357struct tree *trees;
   1358int          trees_count;
   1359
   1360/*
   1361 * Check the corrections array to see if this entry was corrected at
   1362 * some point.
   1363 */
   1364static struct unicode_data *corrections_lookup(struct unicode_data *u)
   1365{
   1366	int i;
   1367
   1368	for (i = 0; i != corrections_count; i++)
   1369		if (u->code == corrections[i].code)
   1370			return &corrections[i];
   1371	return u;
   1372}
   1373
   1374static int nfdi_equal(void *l, void *r)
   1375{
   1376	struct unicode_data *left = l;
   1377	struct unicode_data *right = r;
   1378
   1379	if (left->gen != right->gen)
   1380		return 0;
   1381	if (left->ccc != right->ccc)
   1382		return 0;
   1383	if (left->utf8nfdi && right->utf8nfdi &&
   1384	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
   1385		return 1;
   1386	if (left->utf8nfdi || right->utf8nfdi)
   1387		return 0;
   1388	return 1;
   1389}
   1390
   1391static int nfdicf_equal(void *l, void *r)
   1392{
   1393	struct unicode_data *left = l;
   1394	struct unicode_data *right = r;
   1395
   1396	if (left->gen != right->gen)
   1397		return 0;
   1398	if (left->ccc != right->ccc)
   1399		return 0;
   1400	if (left->utf8nfdicf && right->utf8nfdicf &&
   1401	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
   1402		return 1;
   1403	if (left->utf8nfdicf && right->utf8nfdicf)
   1404		return 0;
   1405	if (left->utf8nfdicf || right->utf8nfdicf)
   1406		return 0;
   1407	if (left->utf8nfdi && right->utf8nfdi &&
   1408	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
   1409		return 1;
   1410	if (left->utf8nfdi || right->utf8nfdi)
   1411		return 0;
   1412	return 1;
   1413}
   1414
   1415static void nfdi_print(void *l, int indent)
   1416{
   1417	struct unicode_data *leaf = l;
   1418
   1419	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
   1420		leaf->code, leaf->ccc, leaf->gen);
   1421
   1422	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
   1423		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
   1424	else if (leaf->utf8nfdi)
   1425		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
   1426
   1427	printf("\n");
   1428}
   1429
   1430static void nfdicf_print(void *l, int indent)
   1431{
   1432	struct unicode_data *leaf = l;
   1433
   1434	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
   1435		leaf->code, leaf->ccc, leaf->gen);
   1436
   1437	if (leaf->utf8nfdicf)
   1438		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
   1439	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
   1440		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
   1441	else if (leaf->utf8nfdi)
   1442		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
   1443	printf("\n");
   1444}
   1445
   1446static int nfdi_mark(void *l)
   1447{
   1448	return 1;
   1449}
   1450
   1451static int nfdicf_mark(void *l)
   1452{
   1453	struct unicode_data *leaf = l;
   1454
   1455	if (leaf->utf8nfdicf)
   1456		return 1;
   1457	return 0;
   1458}
   1459
   1460static int correction_mark(void *l)
   1461{
   1462	struct unicode_data *leaf = l;
   1463
   1464	return leaf->correction;
   1465}
   1466
   1467static int nfdi_size(void *l)
   1468{
   1469	struct unicode_data *leaf = l;
   1470	int size = 2;
   1471
   1472	if (HANGUL_SYLLABLE(leaf->code))
   1473		size += 1;
   1474	else if (leaf->utf8nfdi)
   1475		size += strlen(leaf->utf8nfdi) + 1;
   1476	return size;
   1477}
   1478
   1479static int nfdicf_size(void *l)
   1480{
   1481	struct unicode_data *leaf = l;
   1482	int size = 2;
   1483
   1484	if (HANGUL_SYLLABLE(leaf->code))
   1485		size += 1;
   1486	else if (leaf->utf8nfdicf)
   1487		size += strlen(leaf->utf8nfdicf) + 1;
   1488	else if (leaf->utf8nfdi)
   1489		size += strlen(leaf->utf8nfdi) + 1;
   1490	return size;
   1491}
   1492
   1493static int *nfdi_index(struct tree *tree, void *l)
   1494{
   1495	struct unicode_data *leaf = l;
   1496
   1497	return &tree->leafindex[leaf->code];
   1498}
   1499
   1500static int *nfdicf_index(struct tree *tree, void *l)
   1501{
   1502	struct unicode_data *leaf = l;
   1503
   1504	return &tree->leafindex[leaf->code];
   1505}
   1506
   1507static unsigned char *nfdi_emit(void *l, unsigned char *data)
   1508{
   1509	struct unicode_data *leaf = l;
   1510	unsigned char *s;
   1511
   1512	*data++ = leaf->gen;
   1513
   1514	if (HANGUL_SYLLABLE(leaf->code)) {
   1515		*data++ = DECOMPOSE;
   1516		*data++ = HANGUL;
   1517	} else if (leaf->utf8nfdi) {
   1518		*data++ = DECOMPOSE;
   1519		s = (unsigned char*)leaf->utf8nfdi;
   1520		while ((*data++ = *s++) != 0)
   1521			;
   1522	} else {
   1523		*data++ = leaf->ccc;
   1524	}
   1525	return data;
   1526}
   1527
   1528static unsigned char *nfdicf_emit(void *l, unsigned char *data)
   1529{
   1530	struct unicode_data *leaf = l;
   1531	unsigned char *s;
   1532
   1533	*data++ = leaf->gen;
   1534
   1535	if (HANGUL_SYLLABLE(leaf->code)) {
   1536		*data++ = DECOMPOSE;
   1537		*data++ = HANGUL;
   1538	} else if (leaf->utf8nfdicf) {
   1539		*data++ = DECOMPOSE;
   1540		s = (unsigned char*)leaf->utf8nfdicf;
   1541		while ((*data++ = *s++) != 0)
   1542			;
   1543	} else if (leaf->utf8nfdi) {
   1544		*data++ = DECOMPOSE;
   1545		s = (unsigned char*)leaf->utf8nfdi;
   1546		while ((*data++ = *s++) != 0)
   1547			;
   1548	} else {
   1549		*data++ = leaf->ccc;
   1550	}
   1551	return data;
   1552}
   1553
   1554static void utf8_create(struct unicode_data *data)
   1555{
   1556	char utf[18*4+1];
   1557	char *u;
   1558	unsigned int *um;
   1559	int i;
   1560
   1561	if (data->utf8nfdi) {
   1562		assert(data->utf8nfdi[0] == HANGUL);
   1563		return;
   1564	}
   1565
   1566	u = utf;
   1567	um = data->utf32nfdi;
   1568	if (um) {
   1569		for (i = 0; um[i]; i++)
   1570			u += utf8encode(u, um[i]);
   1571		*u = '\0';
   1572		data->utf8nfdi = strdup(utf);
   1573	}
   1574	u = utf;
   1575	um = data->utf32nfdicf;
   1576	if (um) {
   1577		for (i = 0; um[i]; i++)
   1578			u += utf8encode(u, um[i]);
   1579		*u = '\0';
   1580		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
   1581			data->utf8nfdicf = strdup(utf);
   1582	}
   1583}
   1584
   1585static void utf8_init(void)
   1586{
   1587	unsigned int unichar;
   1588	int i;
   1589
   1590	for (unichar = 0; unichar != 0x110000; unichar++)
   1591		utf8_create(&unicode_data[unichar]);
   1592
   1593	for (i = 0; i != corrections_count; i++)
   1594		utf8_create(&corrections[i]);
   1595}
   1596
   1597static void trees_init(void)
   1598{
   1599	struct unicode_data *data;
   1600	unsigned int maxage;
   1601	unsigned int nextage;
   1602	int count;
   1603	int i;
   1604	int j;
   1605
   1606	/* Count the number of different ages. */
   1607	count = 0;
   1608	nextage = (unsigned int)-1;
   1609	do {
   1610		maxage = nextage;
   1611		nextage = 0;
   1612		for (i = 0; i <= corrections_count; i++) {
   1613			data = &corrections[i];
   1614			if (nextage < data->correction &&
   1615			    data->correction < maxage)
   1616				nextage = data->correction;
   1617		}
   1618		count++;
   1619	} while (nextage);
   1620
   1621	/* Two trees per age: nfdi and nfdicf */
   1622	trees_count = count * 2;
   1623	trees = calloc(trees_count, sizeof(struct tree));
   1624
   1625	/* Assign ages to the trees. */
   1626	count = trees_count;
   1627	nextage = (unsigned int)-1;
   1628	do {
   1629		maxage = nextage;
   1630		trees[--count].maxage = maxage;
   1631		trees[--count].maxage = maxage;
   1632		nextage = 0;
   1633		for (i = 0; i <= corrections_count; i++) {
   1634			data = &corrections[i];
   1635			if (nextage < data->correction &&
   1636			    data->correction < maxage)
   1637				nextage = data->correction;
   1638		}
   1639	} while (nextage);
   1640
   1641	/* The ages assigned above are off by one. */
   1642	for (i = 0; i != trees_count; i++) {
   1643		j = 0;
   1644		while (ages[j] < trees[i].maxage)
   1645			j++;
   1646		trees[i].maxage = ages[j-1];
   1647	}
   1648
   1649	/* Set up the forwarding between trees. */
   1650	trees[trees_count-2].next = &trees[trees_count-1];
   1651	trees[trees_count-1].leaf_mark = nfdi_mark;
   1652	trees[trees_count-2].leaf_mark = nfdicf_mark;
   1653	for (i = 0; i != trees_count-2; i += 2) {
   1654		trees[i].next = &trees[trees_count-2];
   1655		trees[i].leaf_mark = correction_mark;
   1656		trees[i+1].next = &trees[trees_count-1];
   1657		trees[i+1].leaf_mark = correction_mark;
   1658	}
   1659
   1660	/* Assign the callouts. */
   1661	for (i = 0; i != trees_count; i += 2) {
   1662		trees[i].type = "nfdicf";
   1663		trees[i].leaf_equal = nfdicf_equal;
   1664		trees[i].leaf_print = nfdicf_print;
   1665		trees[i].leaf_size = nfdicf_size;
   1666		trees[i].leaf_index = nfdicf_index;
   1667		trees[i].leaf_emit = nfdicf_emit;
   1668
   1669		trees[i+1].type = "nfdi";
   1670		trees[i+1].leaf_equal = nfdi_equal;
   1671		trees[i+1].leaf_print = nfdi_print;
   1672		trees[i+1].leaf_size = nfdi_size;
   1673		trees[i+1].leaf_index = nfdi_index;
   1674		trees[i+1].leaf_emit = nfdi_emit;
   1675	}
   1676
   1677	/* Finish init. */
   1678	for (i = 0; i != trees_count; i++)
   1679		trees[i].childnode = NODE;
   1680}
   1681
   1682static void trees_populate(void)
   1683{
   1684	struct unicode_data *data;
   1685	unsigned int unichar;
   1686	char keyval[4];
   1687	int keylen;
   1688	int i;
   1689
   1690	for (i = 0; i != trees_count; i++) {
   1691		if (verbose > 0) {
   1692			printf("Populating %s_%x\n",
   1693				trees[i].type, trees[i].maxage);
   1694		}
   1695		for (unichar = 0; unichar != 0x110000; unichar++) {
   1696			if (unicode_data[unichar].gen < 0)
   1697				continue;
   1698			keylen = utf8encode(keyval, unichar);
   1699			data = corrections_lookup(&unicode_data[unichar]);
   1700			if (data->correction <= trees[i].maxage)
   1701				data = &unicode_data[unichar];
   1702			insert(&trees[i], keyval, keylen, data);
   1703		}
   1704	}
   1705}
   1706
   1707static void trees_reduce(void)
   1708{
   1709	int i;
   1710	int size;
   1711	int changed;
   1712
   1713	for (i = 0; i != trees_count; i++)
   1714		prune(&trees[i]);
   1715	for (i = 0; i != trees_count; i++)
   1716		mark_nodes(&trees[i]);
   1717	do {
   1718		size = 0;
   1719		for (i = 0; i != trees_count; i++)
   1720			size = index_nodes(&trees[i], size);
   1721		changed = 0;
   1722		for (i = 0; i != trees_count; i++)
   1723			changed += size_nodes(&trees[i]);
   1724	} while (changed);
   1725
   1726	utf8data = calloc(size, 1);
   1727	utf8data_size = size;
   1728	for (i = 0; i != trees_count; i++)
   1729		emit(&trees[i], utf8data);
   1730
   1731	if (verbose > 0) {
   1732		for (i = 0; i != trees_count; i++) {
   1733			printf("%s_%x idx %d\n",
   1734				trees[i].type, trees[i].maxage, trees[i].index);
   1735		}
   1736	}
   1737
   1738	nfdi = utf8data + trees[trees_count-1].index;
   1739	nfdicf = utf8data + trees[trees_count-2].index;
   1740
   1741	nfdi_tree = &trees[trees_count-1];
   1742	nfdicf_tree = &trees[trees_count-2];
   1743}
   1744
   1745static void verify(struct tree *tree)
   1746{
   1747	struct unicode_data *data;
   1748	utf8leaf_t	*leaf;
   1749	unsigned int	unichar;
   1750	char		key[4];
   1751	unsigned char	hangul[UTF8HANGULLEAF];
   1752	int		report;
   1753	int		nocf;
   1754
   1755	if (verbose > 0)
   1756		printf("Verifying %s_%x\n", tree->type, tree->maxage);
   1757	nocf = strcmp(tree->type, "nfdicf");
   1758
   1759	for (unichar = 0; unichar != 0x110000; unichar++) {
   1760		report = 0;
   1761		data = corrections_lookup(&unicode_data[unichar]);
   1762		if (data->correction <= tree->maxage)
   1763			data = &unicode_data[unichar];
   1764		utf8encode(key,unichar);
   1765		leaf = utf8lookup(tree, hangul, key);
   1766
   1767		if (!leaf) {
   1768			if (data->gen != -1)
   1769				report++;
   1770			if (unichar < 0xd800 || unichar > 0xdfff)
   1771				report++;
   1772		} else {
   1773			if (unichar >= 0xd800 && unichar <= 0xdfff)
   1774				report++;
   1775			if (data->gen == -1)
   1776				report++;
   1777			if (data->gen != LEAF_GEN(leaf))
   1778				report++;
   1779			if (LEAF_CCC(leaf) == DECOMPOSE) {
   1780				if (HANGUL_SYLLABLE(data->code)) {
   1781					if (data->utf8nfdi[0] != HANGUL)
   1782						report++;
   1783				} else if (nocf) {
   1784					if (!data->utf8nfdi) {
   1785						report++;
   1786					} else if (strcmp(data->utf8nfdi,
   1787							  LEAF_STR(leaf))) {
   1788						report++;
   1789					}
   1790				} else {
   1791					if (!data->utf8nfdicf &&
   1792					    !data->utf8nfdi) {
   1793						report++;
   1794					} else if (data->utf8nfdicf) {
   1795						if (strcmp(data->utf8nfdicf,
   1796							   LEAF_STR(leaf)))
   1797							report++;
   1798					} else if (strcmp(data->utf8nfdi,
   1799							  LEAF_STR(leaf))) {
   1800						report++;
   1801					}
   1802				}
   1803			} else if (data->ccc != LEAF_CCC(leaf)) {
   1804				report++;
   1805			}
   1806		}
   1807		if (report) {
   1808			printf("%X code %X gen %d ccc %d"
   1809				" nfdi -> \"%s\"",
   1810				unichar, data->code, data->gen,
   1811				data->ccc,
   1812				data->utf8nfdi);
   1813			if (leaf) {
   1814				printf(" gen %d ccc %d"
   1815					" nfdi -> \"%s\"",
   1816					LEAF_GEN(leaf),
   1817					LEAF_CCC(leaf),
   1818					LEAF_CCC(leaf) == DECOMPOSE ?
   1819						LEAF_STR(leaf) : "");
   1820			}
   1821			printf("\n");
   1822		}
   1823	}
   1824}
   1825
   1826static void trees_verify(void)
   1827{
   1828	int i;
   1829
   1830	for (i = 0; i != trees_count; i++)
   1831		verify(&trees[i]);
   1832}
   1833
   1834/* ------------------------------------------------------------------ */
   1835
   1836static void help(void)
   1837{
   1838	printf("Usage: %s [options]\n", argv0);
   1839	printf("\n");
   1840	printf("This program creates an a data trie used for parsing and\n");
   1841	printf("normalization of UTF-8 strings. The trie is derived from\n");
   1842	printf("a set of input files from the Unicode character database\n");
   1843	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
   1844	printf("\n");
   1845	printf("The generated tree supports two normalization forms:\n");
   1846	printf("\n");
   1847	printf("\tnfdi:\n");
   1848	printf("\t- Apply unicode normalization form NFD.\n");
   1849	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
   1850	printf("\n");
   1851	printf("\tnfdicf:\n");
   1852	printf("\t- Apply unicode normalization form NFD.\n");
   1853	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
   1854	printf("\t- Apply a full casefold (C + F).\n");
   1855	printf("\n");
   1856	printf("These forms were chosen as being most useful when dealing\n");
   1857	printf("with file names: NFD catches most cases where characters\n");
   1858	printf("should be considered equivalent. The ignorables are mostly\n");
   1859	printf("invisible, making names hard to type.\n");
   1860	printf("\n");
   1861	printf("The options to specify the files to be used are listed\n");
   1862	printf("below with their default values, which are the names used\n");
   1863	printf("by version 11.0.0 of the Unicode Character Database.\n");
   1864	printf("\n");
   1865	printf("The input files:\n");
   1866	printf("\t-a %s\n", AGE_NAME);
   1867	printf("\t-c %s\n", CCC_NAME);
   1868	printf("\t-p %s\n", PROP_NAME);
   1869	printf("\t-d %s\n", DATA_NAME);
   1870	printf("\t-f %s\n", FOLD_NAME);
   1871	printf("\t-n %s\n", NORM_NAME);
   1872	printf("\n");
   1873	printf("Additionally, the generated tables are tested using:\n");
   1874	printf("\t-t %s\n", TEST_NAME);
   1875	printf("\n");
   1876	printf("Finally, the output file:\n");
   1877	printf("\t-o %s\n", UTF8_NAME);
   1878	printf("\n");
   1879}
   1880
   1881static void usage(void)
   1882{
   1883	help();
   1884	exit(1);
   1885}
   1886
   1887static void open_fail(const char *name, int error)
   1888{
   1889	printf("Error %d opening %s: %s\n", error, name, strerror(error));
   1890	exit(1);
   1891}
   1892
   1893static void file_fail(const char *filename)
   1894{
   1895	printf("Error parsing %s\n", filename);
   1896	exit(1);
   1897}
   1898
   1899static void line_fail(const char *filename, const char *line)
   1900{
   1901	printf("Error parsing %s:%s\n", filename, line);
   1902	exit(1);
   1903}
   1904
   1905/* ------------------------------------------------------------------ */
   1906
   1907static void print_utf32(unsigned int *utf32str)
   1908{
   1909	int	i;
   1910
   1911	for (i = 0; utf32str[i]; i++)
   1912		printf(" %X", utf32str[i]);
   1913}
   1914
   1915static void print_utf32nfdi(unsigned int unichar)
   1916{
   1917	printf(" %X ->", unichar);
   1918	print_utf32(unicode_data[unichar].utf32nfdi);
   1919	printf("\n");
   1920}
   1921
   1922static void print_utf32nfdicf(unsigned int unichar)
   1923{
   1924	printf(" %X ->", unichar);
   1925	print_utf32(unicode_data[unichar].utf32nfdicf);
   1926	printf("\n");
   1927}
   1928
   1929/* ------------------------------------------------------------------ */
   1930
   1931static void age_init(void)
   1932{
   1933	FILE *file;
   1934	unsigned int first;
   1935	unsigned int last;
   1936	unsigned int unichar;
   1937	unsigned int major;
   1938	unsigned int minor;
   1939	unsigned int revision;
   1940	int gen;
   1941	int count;
   1942	int ret;
   1943
   1944	if (verbose > 0)
   1945		printf("Parsing %s\n", age_name);
   1946
   1947	file = fopen(age_name, "r");
   1948	if (!file)
   1949		open_fail(age_name, errno);
   1950	count = 0;
   1951
   1952	gen = 0;
   1953	while (fgets(line, LINESIZE, file)) {
   1954		ret = sscanf(line, "# Age=V%d_%d_%d",
   1955				&major, &minor, &revision);
   1956		if (ret == 3) {
   1957			ages_count++;
   1958			if (verbose > 1)
   1959				printf(" Age V%d_%d_%d\n",
   1960					major, minor, revision);
   1961			if (!age_valid(major, minor, revision))
   1962				line_fail(age_name, line);
   1963			continue;
   1964		}
   1965		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
   1966		if (ret == 2) {
   1967			ages_count++;
   1968			if (verbose > 1)
   1969				printf(" Age V%d_%d\n", major, minor);
   1970			if (!age_valid(major, minor, 0))
   1971				line_fail(age_name, line);
   1972			continue;
   1973		}
   1974	}
   1975
   1976	/* We must have found something above. */
   1977	if (verbose > 1)
   1978		printf("%d age entries\n", ages_count);
   1979	if (ages_count == 0 || ages_count > MAXGEN)
   1980		file_fail(age_name);
   1981
   1982	/* There is a 0 entry. */
   1983	ages_count++;
   1984	ages = calloc(ages_count + 1, sizeof(*ages));
   1985	/* And a guard entry. */
   1986	ages[ages_count] = (unsigned int)-1;
   1987
   1988	rewind(file);
   1989	count = 0;
   1990	gen = 0;
   1991	while (fgets(line, LINESIZE, file)) {
   1992		ret = sscanf(line, "# Age=V%d_%d_%d",
   1993				&major, &minor, &revision);
   1994		if (ret == 3) {
   1995			ages[++gen] =
   1996				UNICODE_AGE(major, minor, revision);
   1997			if (verbose > 1)
   1998				printf(" Age V%d_%d_%d = gen %d\n",
   1999					major, minor, revision, gen);
   2000			if (!age_valid(major, minor, revision))
   2001				line_fail(age_name, line);
   2002			continue;
   2003		}
   2004		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
   2005		if (ret == 2) {
   2006			ages[++gen] = UNICODE_AGE(major, minor, 0);
   2007			if (verbose > 1)
   2008				printf(" Age V%d_%d = %d\n",
   2009					major, minor, gen);
   2010			if (!age_valid(major, minor, 0))
   2011				line_fail(age_name, line);
   2012			continue;
   2013		}
   2014		ret = sscanf(line, "%X..%X ; %d.%d #",
   2015			     &first, &last, &major, &minor);
   2016		if (ret == 4) {
   2017			for (unichar = first; unichar <= last; unichar++)
   2018				unicode_data[unichar].gen = gen;
   2019			count += 1 + last - first;
   2020			if (verbose > 1)
   2021				printf("  %X..%X gen %d\n", first, last, gen);
   2022			if (!utf32valid(first) || !utf32valid(last))
   2023				line_fail(age_name, line);
   2024			continue;
   2025		}
   2026		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
   2027		if (ret == 3) {
   2028			unicode_data[unichar].gen = gen;
   2029			count++;
   2030			if (verbose > 1)
   2031				printf("  %X gen %d\n", unichar, gen);
   2032			if (!utf32valid(unichar))
   2033				line_fail(age_name, line);
   2034			continue;
   2035		}
   2036	}
   2037	unicode_maxage = ages[gen];
   2038	fclose(file);
   2039
   2040	/* Nix surrogate block */
   2041	if (verbose > 1)
   2042		printf(" Removing surrogate block D800..DFFF\n");
   2043	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
   2044		unicode_data[unichar].gen = -1;
   2045
   2046	if (verbose > 0)
   2047	        printf("Found %d entries\n", count);
   2048	if (count == 0)
   2049		file_fail(age_name);
   2050}
   2051
   2052static void ccc_init(void)
   2053{
   2054	FILE *file;
   2055	unsigned int first;
   2056	unsigned int last;
   2057	unsigned int unichar;
   2058	unsigned int value;
   2059	int count;
   2060	int ret;
   2061
   2062	if (verbose > 0)
   2063		printf("Parsing %s\n", ccc_name);
   2064
   2065	file = fopen(ccc_name, "r");
   2066	if (!file)
   2067		open_fail(ccc_name, errno);
   2068
   2069	count = 0;
   2070	while (fgets(line, LINESIZE, file)) {
   2071		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
   2072		if (ret == 3) {
   2073			for (unichar = first; unichar <= last; unichar++) {
   2074				unicode_data[unichar].ccc = value;
   2075                                count++;
   2076			}
   2077			if (verbose > 1)
   2078				printf(" %X..%X ccc %d\n", first, last, value);
   2079			if (!utf32valid(first) || !utf32valid(last))
   2080				line_fail(ccc_name, line);
   2081			continue;
   2082		}
   2083		ret = sscanf(line, "%X ; %d #", &unichar, &value);
   2084		if (ret == 2) {
   2085			unicode_data[unichar].ccc = value;
   2086                        count++;
   2087			if (verbose > 1)
   2088				printf(" %X ccc %d\n", unichar, value);
   2089			if (!utf32valid(unichar))
   2090				line_fail(ccc_name, line);
   2091			continue;
   2092		}
   2093	}
   2094	fclose(file);
   2095
   2096	if (verbose > 0)
   2097		printf("Found %d entries\n", count);
   2098	if (count == 0)
   2099		file_fail(ccc_name);
   2100}
   2101
   2102static int ignore_compatibility_form(char *type)
   2103{
   2104	int i;
   2105	char *ignored_types[] = {"font", "noBreak", "initial", "medial",
   2106				 "final", "isolated", "circle", "super",
   2107				 "sub", "vertical", "wide", "narrow",
   2108				 "small", "square", "fraction", "compat"};
   2109
   2110	for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
   2111		if (strcmp(type, ignored_types[i]) == 0)
   2112			return 1;
   2113	return 0;
   2114}
   2115
   2116static void nfdi_init(void)
   2117{
   2118	FILE *file;
   2119	unsigned int unichar;
   2120	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
   2121	char *s;
   2122	char *type;
   2123	unsigned int *um;
   2124	int count;
   2125	int i;
   2126	int ret;
   2127
   2128	if (verbose > 0)
   2129		printf("Parsing %s\n", data_name);
   2130	file = fopen(data_name, "r");
   2131	if (!file)
   2132		open_fail(data_name, errno);
   2133
   2134	count = 0;
   2135	while (fgets(line, LINESIZE, file)) {
   2136		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
   2137			     &unichar, buf0);
   2138		if (ret != 2)
   2139			continue;
   2140		if (!utf32valid(unichar))
   2141			line_fail(data_name, line);
   2142
   2143		s = buf0;
   2144		/* skip over <tag> */
   2145		if (*s == '<') {
   2146			type = ++s;
   2147			while (*++s != '>');
   2148			*s++ = '\0';
   2149			if(ignore_compatibility_form(type))
   2150				continue;
   2151		}
   2152		/* decode the decomposition into UTF-32 */
   2153		i = 0;
   2154		while (*s) {
   2155			mapping[i] = strtoul(s, &s, 16);
   2156			if (!utf32valid(mapping[i]))
   2157				line_fail(data_name, line);
   2158			i++;
   2159		}
   2160		mapping[i++] = 0;
   2161
   2162		um = malloc(i * sizeof(unsigned int));
   2163		memcpy(um, mapping, i * sizeof(unsigned int));
   2164		unicode_data[unichar].utf32nfdi = um;
   2165
   2166		if (verbose > 1)
   2167			print_utf32nfdi(unichar);
   2168		count++;
   2169	}
   2170	fclose(file);
   2171	if (verbose > 0)
   2172		printf("Found %d entries\n", count);
   2173	if (count == 0)
   2174		file_fail(data_name);
   2175}
   2176
   2177static void nfdicf_init(void)
   2178{
   2179	FILE *file;
   2180	unsigned int unichar;
   2181	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
   2182	char status;
   2183	char *s;
   2184	unsigned int *um;
   2185	int i;
   2186	int count;
   2187	int ret;
   2188
   2189	if (verbose > 0)
   2190		printf("Parsing %s\n", fold_name);
   2191	file = fopen(fold_name, "r");
   2192	if (!file)
   2193		open_fail(fold_name, errno);
   2194
   2195	count = 0;
   2196	while (fgets(line, LINESIZE, file)) {
   2197		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
   2198		if (ret != 3)
   2199			continue;
   2200		if (!utf32valid(unichar))
   2201			line_fail(fold_name, line);
   2202		/* Use the C+F casefold. */
   2203		if (status != 'C' && status != 'F')
   2204			continue;
   2205		s = buf0;
   2206		if (*s == '<')
   2207			while (*s++ != ' ')
   2208				;
   2209		i = 0;
   2210		while (*s) {
   2211			mapping[i] = strtoul(s, &s, 16);
   2212			if (!utf32valid(mapping[i]))
   2213				line_fail(fold_name, line);
   2214			i++;
   2215		}
   2216		mapping[i++] = 0;
   2217
   2218		um = malloc(i * sizeof(unsigned int));
   2219		memcpy(um, mapping, i * sizeof(unsigned int));
   2220		unicode_data[unichar].utf32nfdicf = um;
   2221
   2222		if (verbose > 1)
   2223			print_utf32nfdicf(unichar);
   2224		count++;
   2225	}
   2226	fclose(file);
   2227	if (verbose > 0)
   2228		printf("Found %d entries\n", count);
   2229	if (count == 0)
   2230		file_fail(fold_name);
   2231}
   2232
   2233static void ignore_init(void)
   2234{
   2235	FILE *file;
   2236	unsigned int unichar;
   2237	unsigned int first;
   2238	unsigned int last;
   2239	unsigned int *um;
   2240	int count;
   2241	int ret;
   2242
   2243	if (verbose > 0)
   2244		printf("Parsing %s\n", prop_name);
   2245	file = fopen(prop_name, "r");
   2246	if (!file)
   2247		open_fail(prop_name, errno);
   2248	assert(file);
   2249	count = 0;
   2250	while (fgets(line, LINESIZE, file)) {
   2251		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
   2252		if (ret == 3) {
   2253			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
   2254				continue;
   2255			if (!utf32valid(first) || !utf32valid(last))
   2256				line_fail(prop_name, line);
   2257			for (unichar = first; unichar <= last; unichar++) {
   2258				free(unicode_data[unichar].utf32nfdi);
   2259				um = malloc(sizeof(unsigned int));
   2260				*um = 0;
   2261				unicode_data[unichar].utf32nfdi = um;
   2262				free(unicode_data[unichar].utf32nfdicf);
   2263				um = malloc(sizeof(unsigned int));
   2264				*um = 0;
   2265				unicode_data[unichar].utf32nfdicf = um;
   2266				count++;
   2267			}
   2268			if (verbose > 1)
   2269				printf(" %X..%X Default_Ignorable_Code_Point\n",
   2270					first, last);
   2271			continue;
   2272		}
   2273		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
   2274		if (ret == 2) {
   2275			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
   2276				continue;
   2277			if (!utf32valid(unichar))
   2278				line_fail(prop_name, line);
   2279			free(unicode_data[unichar].utf32nfdi);
   2280			um = malloc(sizeof(unsigned int));
   2281			*um = 0;
   2282			unicode_data[unichar].utf32nfdi = um;
   2283			free(unicode_data[unichar].utf32nfdicf);
   2284			um = malloc(sizeof(unsigned int));
   2285			*um = 0;
   2286			unicode_data[unichar].utf32nfdicf = um;
   2287			if (verbose > 1)
   2288				printf(" %X Default_Ignorable_Code_Point\n",
   2289					unichar);
   2290			count++;
   2291			continue;
   2292		}
   2293	}
   2294	fclose(file);
   2295
   2296	if (verbose > 0)
   2297		printf("Found %d entries\n", count);
   2298	if (count == 0)
   2299		file_fail(prop_name);
   2300}
   2301
   2302static void corrections_init(void)
   2303{
   2304	FILE *file;
   2305	unsigned int unichar;
   2306	unsigned int major;
   2307	unsigned int minor;
   2308	unsigned int revision;
   2309	unsigned int age;
   2310	unsigned int *um;
   2311	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
   2312	char *s;
   2313	int i;
   2314	int count;
   2315	int ret;
   2316
   2317	if (verbose > 0)
   2318		printf("Parsing %s\n", norm_name);
   2319	file = fopen(norm_name, "r");
   2320	if (!file)
   2321		open_fail(norm_name, errno);
   2322
   2323	count = 0;
   2324	while (fgets(line, LINESIZE, file)) {
   2325		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
   2326				&unichar, buf0, buf1,
   2327				&major, &minor, &revision);
   2328		if (ret != 6)
   2329			continue;
   2330		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
   2331			line_fail(norm_name, line);
   2332		count++;
   2333	}
   2334	corrections = calloc(count, sizeof(struct unicode_data));
   2335	corrections_count = count;
   2336	rewind(file);
   2337
   2338	count = 0;
   2339	while (fgets(line, LINESIZE, file)) {
   2340		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
   2341				&unichar, buf0, buf1,
   2342				&major, &minor, &revision);
   2343		if (ret != 6)
   2344			continue;
   2345		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
   2346			line_fail(norm_name, line);
   2347		corrections[count] = unicode_data[unichar];
   2348		assert(corrections[count].code == unichar);
   2349		age = UNICODE_AGE(major, minor, revision);
   2350		corrections[count].correction = age;
   2351
   2352		i = 0;
   2353		s = buf0;
   2354		while (*s) {
   2355			mapping[i] = strtoul(s, &s, 16);
   2356			if (!utf32valid(mapping[i]))
   2357				line_fail(norm_name, line);
   2358			i++;
   2359		}
   2360		mapping[i++] = 0;
   2361
   2362		um = malloc(i * sizeof(unsigned int));
   2363		memcpy(um, mapping, i * sizeof(unsigned int));
   2364		corrections[count].utf32nfdi = um;
   2365
   2366		if (verbose > 1)
   2367			printf(" %X -> %s -> %s V%d_%d_%d\n",
   2368				unichar, buf0, buf1, major, minor, revision);
   2369		count++;
   2370	}
   2371	fclose(file);
   2372
   2373	if (verbose > 0)
   2374	        printf("Found %d entries\n", count);
   2375	if (count == 0)
   2376		file_fail(norm_name);
   2377}
   2378
   2379/* ------------------------------------------------------------------ */
   2380
   2381/*
   2382 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
   2383 *
   2384 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
   2385 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
   2386 *
   2387 * SBase = 0xAC00
   2388 * LBase = 0x1100
   2389 * VBase = 0x1161
   2390 * TBase = 0x11A7
   2391 * LCount = 19
   2392 * VCount = 21
   2393 * TCount = 28
   2394 * NCount = 588 (VCount * TCount)
   2395 * SCount = 11172 (LCount * NCount)
   2396 *
   2397 * Decomposition:
   2398 *   SIndex = s - SBase
   2399 *
   2400 * LV (Canonical/Full)
   2401 *   LIndex = SIndex / NCount
   2402 *   VIndex = (Sindex % NCount) / TCount
   2403 *   LPart = LBase + LIndex
   2404 *   VPart = VBase + VIndex
   2405 *
   2406 * LVT (Canonical)
   2407 *   LVIndex = (SIndex / TCount) * TCount
   2408 *   TIndex = (Sindex % TCount)
   2409 *   LVPart = SBase + LVIndex
   2410 *   TPart = TBase + TIndex
   2411 *
   2412 * LVT (Full)
   2413 *   LIndex = SIndex / NCount
   2414 *   VIndex = (Sindex % NCount) / TCount
   2415 *   TIndex = (Sindex % TCount)
   2416 *   LPart = LBase + LIndex
   2417 *   VPart = VBase + VIndex
   2418 *   if (TIndex == 0) {
   2419 *          d = <LPart, VPart>
   2420 *   } else {
   2421 *          TPart = TBase + TIndex
   2422 *          d = <LPart, VPart, TPart>
   2423 *   }
   2424 *
   2425 */
   2426
   2427static void hangul_decompose(void)
   2428{
   2429	unsigned int sb = 0xAC00;
   2430	unsigned int lb = 0x1100;
   2431	unsigned int vb = 0x1161;
   2432	unsigned int tb = 0x11a7;
   2433	/* unsigned int lc = 19; */
   2434	unsigned int vc = 21;
   2435	unsigned int tc = 28;
   2436	unsigned int nc = (vc * tc);
   2437	/* unsigned int sc = (lc * nc); */
   2438	unsigned int unichar;
   2439	unsigned int mapping[4];
   2440	unsigned int *um;
   2441        int count;
   2442	int i;
   2443
   2444	if (verbose > 0)
   2445		printf("Decomposing hangul\n");
   2446	/* Hangul */
   2447	count = 0;
   2448	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
   2449		unsigned int si = unichar - sb;
   2450		unsigned int li = si / nc;
   2451		unsigned int vi = (si % nc) / tc;
   2452		unsigned int ti = si % tc;
   2453
   2454		i = 0;
   2455		mapping[i++] = lb + li;
   2456		mapping[i++] = vb + vi;
   2457		if (ti)
   2458			mapping[i++] = tb + ti;
   2459		mapping[i++] = 0;
   2460
   2461		assert(!unicode_data[unichar].utf32nfdi);
   2462		um = malloc(i * sizeof(unsigned int));
   2463		memcpy(um, mapping, i * sizeof(unsigned int));
   2464		unicode_data[unichar].utf32nfdi = um;
   2465
   2466		assert(!unicode_data[unichar].utf32nfdicf);
   2467		um = malloc(i * sizeof(unsigned int));
   2468		memcpy(um, mapping, i * sizeof(unsigned int));
   2469		unicode_data[unichar].utf32nfdicf = um;
   2470
   2471		/*
   2472		 * Add a cookie as a reminder that the hangul syllable
   2473		 * decompositions must not be stored in the generated
   2474		 * trie.
   2475		 */
   2476		unicode_data[unichar].utf8nfdi = malloc(2);
   2477		unicode_data[unichar].utf8nfdi[0] = HANGUL;
   2478		unicode_data[unichar].utf8nfdi[1] = '\0';
   2479
   2480		if (verbose > 1)
   2481			print_utf32nfdi(unichar);
   2482
   2483		count++;
   2484	}
   2485	if (verbose > 0)
   2486		printf("Created %d entries\n", count);
   2487}
   2488
   2489static void nfdi_decompose(void)
   2490{
   2491	unsigned int unichar;
   2492	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
   2493	unsigned int *um;
   2494	unsigned int *dc;
   2495	int count;
   2496	int i;
   2497	int j;
   2498	int ret;
   2499
   2500	if (verbose > 0)
   2501		printf("Decomposing nfdi\n");
   2502
   2503	count = 0;
   2504	for (unichar = 0; unichar != 0x110000; unichar++) {
   2505		if (!unicode_data[unichar].utf32nfdi)
   2506			continue;
   2507		for (;;) {
   2508			ret = 1;
   2509			i = 0;
   2510			um = unicode_data[unichar].utf32nfdi;
   2511			while (*um) {
   2512				dc = unicode_data[*um].utf32nfdi;
   2513				if (dc) {
   2514					for (j = 0; dc[j]; j++)
   2515						mapping[i++] = dc[j];
   2516					ret = 0;
   2517				} else {
   2518					mapping[i++] = *um;
   2519				}
   2520				um++;
   2521			}
   2522			mapping[i++] = 0;
   2523			if (ret)
   2524				break;
   2525			free(unicode_data[unichar].utf32nfdi);
   2526			um = malloc(i * sizeof(unsigned int));
   2527			memcpy(um, mapping, i * sizeof(unsigned int));
   2528			unicode_data[unichar].utf32nfdi = um;
   2529		}
   2530		/* Add this decomposition to nfdicf if there is no entry. */
   2531		if (!unicode_data[unichar].utf32nfdicf) {
   2532			um = malloc(i * sizeof(unsigned int));
   2533			memcpy(um, mapping, i * sizeof(unsigned int));
   2534			unicode_data[unichar].utf32nfdicf = um;
   2535		}
   2536		if (verbose > 1)
   2537			print_utf32nfdi(unichar);
   2538		count++;
   2539	}
   2540	if (verbose > 0)
   2541		printf("Processed %d entries\n", count);
   2542}
   2543
   2544static void nfdicf_decompose(void)
   2545{
   2546	unsigned int unichar;
   2547	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
   2548	unsigned int *um;
   2549	unsigned int *dc;
   2550	int count;
   2551	int i;
   2552	int j;
   2553	int ret;
   2554
   2555	if (verbose > 0)
   2556		printf("Decomposing nfdicf\n");
   2557	count = 0;
   2558	for (unichar = 0; unichar != 0x110000; unichar++) {
   2559		if (!unicode_data[unichar].utf32nfdicf)
   2560			continue;
   2561		for (;;) {
   2562			ret = 1;
   2563			i = 0;
   2564			um = unicode_data[unichar].utf32nfdicf;
   2565			while (*um) {
   2566				dc = unicode_data[*um].utf32nfdicf;
   2567				if (dc) {
   2568					for (j = 0; dc[j]; j++)
   2569						mapping[i++] = dc[j];
   2570					ret = 0;
   2571				} else {
   2572					mapping[i++] = *um;
   2573				}
   2574				um++;
   2575			}
   2576			mapping[i++] = 0;
   2577			if (ret)
   2578				break;
   2579			free(unicode_data[unichar].utf32nfdicf);
   2580			um = malloc(i * sizeof(unsigned int));
   2581			memcpy(um, mapping, i * sizeof(unsigned int));
   2582			unicode_data[unichar].utf32nfdicf = um;
   2583		}
   2584		if (verbose > 1)
   2585			print_utf32nfdicf(unichar);
   2586		count++;
   2587	}
   2588	if (verbose > 0)
   2589		printf("Processed %d entries\n", count);
   2590}
   2591
   2592/* ------------------------------------------------------------------ */
   2593
   2594int utf8agemax(struct tree *, const char *);
   2595int utf8nagemax(struct tree *, const char *, size_t);
   2596int utf8agemin(struct tree *, const char *);
   2597int utf8nagemin(struct tree *, const char *, size_t);
   2598ssize_t utf8len(struct tree *, const char *);
   2599ssize_t utf8nlen(struct tree *, const char *, size_t);
   2600struct utf8cursor;
   2601int utf8cursor(struct utf8cursor *, struct tree *, const char *);
   2602int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
   2603int utf8byte(struct utf8cursor *);
   2604
   2605/*
   2606 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
   2607 *
   2608 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
   2609 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
   2610 *
   2611 * SBase = 0xAC00
   2612 * LBase = 0x1100
   2613 * VBase = 0x1161
   2614 * TBase = 0x11A7
   2615 * LCount = 19
   2616 * VCount = 21
   2617 * TCount = 28
   2618 * NCount = 588 (VCount * TCount)
   2619 * SCount = 11172 (LCount * NCount)
   2620 *
   2621 * Decomposition:
   2622 *   SIndex = s - SBase
   2623 *
   2624 * LV (Canonical/Full)
   2625 *   LIndex = SIndex / NCount
   2626 *   VIndex = (Sindex % NCount) / TCount
   2627 *   LPart = LBase + LIndex
   2628 *   VPart = VBase + VIndex
   2629 *
   2630 * LVT (Canonical)
   2631 *   LVIndex = (SIndex / TCount) * TCount
   2632 *   TIndex = (Sindex % TCount)
   2633 *   LVPart = SBase + LVIndex
   2634 *   TPart = TBase + TIndex
   2635 *
   2636 * LVT (Full)
   2637 *   LIndex = SIndex / NCount
   2638 *   VIndex = (Sindex % NCount) / TCount
   2639 *   TIndex = (Sindex % TCount)
   2640 *   LPart = LBase + LIndex
   2641 *   VPart = VBase + VIndex
   2642 *   if (TIndex == 0) {
   2643 *          d = <LPart, VPart>
   2644 *   } else {
   2645 *          TPart = TBase + TIndex
   2646 *          d = <LPart, VPart, TPart>
   2647 *   }
   2648 */
   2649
   2650/* Constants */
   2651#define SB	(0xAC00)
   2652#define LB	(0x1100)
   2653#define VB	(0x1161)
   2654#define TB	(0x11A7)
   2655#define LC	(19)
   2656#define VC	(21)
   2657#define TC	(28)
   2658#define NC	(VC * TC)
   2659#define SC	(LC * NC)
   2660
   2661/* Algorithmic decomposition of hangul syllable. */
   2662static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
   2663{
   2664	unsigned int	si;
   2665	unsigned int	li;
   2666	unsigned int	vi;
   2667	unsigned int	ti;
   2668	unsigned char	*h;
   2669
   2670	/* Calculate the SI, LI, VI, and TI values. */
   2671	si = utf8decode(str) - SB;
   2672	li = si / NC;
   2673	vi = (si % NC) / TC;
   2674	ti = si % TC;
   2675
   2676	/* Fill in base of leaf. */
   2677	h = hangul;
   2678	LEAF_GEN(h) = 2;
   2679	LEAF_CCC(h) = DECOMPOSE;
   2680	h += 2;
   2681
   2682	/* Add LPart, a 3-byte UTF-8 sequence. */
   2683	h += utf8encode((char *)h, li + LB);
   2684
   2685	/* Add VPart, a 3-byte UTF-8 sequence. */
   2686	h += utf8encode((char *)h, vi + VB);
   2687
   2688	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
   2689	if (ti)
   2690		h += utf8encode((char *)h, ti + TB);
   2691
   2692	/* Terminate string. */
   2693	h[0] = '\0';
   2694
   2695	return hangul;
   2696}
   2697
   2698/*
   2699 * Use trie to scan s, touching at most len bytes.
   2700 * Returns the leaf if one exists, NULL otherwise.
   2701 *
   2702 * A non-NULL return guarantees that the UTF-8 sequence starting at s
   2703 * is well-formed and corresponds to a known unicode code point.  The
   2704 * shorthand for this will be "is valid UTF-8 unicode".
   2705 */
   2706static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
   2707			       const char *s, size_t len)
   2708{
   2709	utf8trie_t	*trie;
   2710	int		offlen;
   2711	int		offset;
   2712	int		mask;
   2713	int		node;
   2714
   2715	if (!tree)
   2716		return NULL;
   2717	if (len == 0)
   2718		return NULL;
   2719	node = 1;
   2720	trie = utf8data + tree->index;
   2721	while (node) {
   2722		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
   2723		if (*trie & NEXTBYTE) {
   2724			if (--len == 0)
   2725				return NULL;
   2726			s++;
   2727		}
   2728		mask = 1 << (*trie & BITNUM);
   2729		if (*s & mask) {
   2730			/* Right leg */
   2731			if (offlen) {
   2732				/* Right node at offset of trie */
   2733				node = (*trie & RIGHTNODE);
   2734				offset = trie[offlen];
   2735				while (--offlen) {
   2736					offset <<= 8;
   2737					offset |= trie[offlen];
   2738				}
   2739				trie += offset;
   2740			} else if (*trie & RIGHTPATH) {
   2741				/* Right node after this node */
   2742				node = (*trie & TRIENODE);
   2743				trie++;
   2744			} else {
   2745				/* No right node. */
   2746				return NULL;
   2747			}
   2748		} else {
   2749			/* Left leg */
   2750			if (offlen) {
   2751				/* Left node after this node. */
   2752				node = (*trie & LEFTNODE);
   2753				trie += offlen + 1;
   2754			} else if (*trie & RIGHTPATH) {
   2755				/* No left node. */
   2756				return NULL;
   2757			} else {
   2758				/* Left node after this node */
   2759				node = (*trie & TRIENODE);
   2760				trie++;
   2761			}
   2762		}
   2763	}
   2764	/*
   2765	 * Hangul decomposition is done algorithmically. These are the
   2766	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
   2767	 * always 3 bytes long, so s has been advanced twice, and the
   2768	 * start of the sequence is at s-2.
   2769	 */
   2770	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
   2771		trie = utf8hangul(s - 2, hangul);
   2772	return trie;
   2773}
   2774
   2775/*
   2776 * Use trie to scan s.
   2777 * Returns the leaf if one exists, NULL otherwise.
   2778 *
   2779 * Forwards to trie_nlookup().
   2780 */
   2781static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
   2782			      const char *s)
   2783{
   2784	return utf8nlookup(tree, hangul, s, (size_t)-1);
   2785}
   2786
   2787/*
   2788 * Return the number of bytes used by the current UTF-8 sequence.
   2789 * Assumes the input points to the first byte of a valid UTF-8
   2790 * sequence.
   2791 */
   2792static inline int utf8clen(const char *s)
   2793{
   2794	unsigned char c = *s;
   2795	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
   2796}
   2797
   2798/*
   2799 * Maximum age of any character in s.
   2800 * Return -1 if s is not valid UTF-8 unicode.
   2801 * Return 0 if only non-assigned code points are used.
   2802 */
   2803int utf8agemax(struct tree *tree, const char *s)
   2804{
   2805	utf8leaf_t	*leaf;
   2806	int		age = 0;
   2807	int		leaf_age;
   2808	unsigned char	hangul[UTF8HANGULLEAF];
   2809
   2810	if (!tree)
   2811		return -1;
   2812
   2813	while (*s) {
   2814		leaf = utf8lookup(tree, hangul, s);
   2815		if (!leaf)
   2816			return -1;
   2817		leaf_age = ages[LEAF_GEN(leaf)];
   2818		if (leaf_age <= tree->maxage && leaf_age > age)
   2819			age = leaf_age;
   2820		s += utf8clen(s);
   2821	}
   2822	return age;
   2823}
   2824
   2825/*
   2826 * Minimum age of any character in s.
   2827 * Return -1 if s is not valid UTF-8 unicode.
   2828 * Return 0 if non-assigned code points are used.
   2829 */
   2830int utf8agemin(struct tree *tree, const char *s)
   2831{
   2832	utf8leaf_t	*leaf;
   2833	int		age;
   2834	int		leaf_age;
   2835	unsigned char	hangul[UTF8HANGULLEAF];
   2836
   2837	if (!tree)
   2838		return -1;
   2839	age = tree->maxage;
   2840	while (*s) {
   2841		leaf = utf8lookup(tree, hangul, s);
   2842		if (!leaf)
   2843			return -1;
   2844		leaf_age = ages[LEAF_GEN(leaf)];
   2845		if (leaf_age <= tree->maxage && leaf_age < age)
   2846			age = leaf_age;
   2847		s += utf8clen(s);
   2848	}
   2849	return age;
   2850}
   2851
   2852/*
   2853 * Maximum age of any character in s, touch at most len bytes.
   2854 * Return -1 if s is not valid UTF-8 unicode.
   2855 */
   2856int utf8nagemax(struct tree *tree, const char *s, size_t len)
   2857{
   2858	utf8leaf_t	*leaf;
   2859	int		age = 0;
   2860	int		leaf_age;
   2861	unsigned char	hangul[UTF8HANGULLEAF];
   2862
   2863	if (!tree)
   2864		return -1;
   2865
   2866        while (len && *s) {
   2867		leaf = utf8nlookup(tree, hangul, s, len);
   2868		if (!leaf)
   2869			return -1;
   2870		leaf_age = ages[LEAF_GEN(leaf)];
   2871		if (leaf_age <= tree->maxage && leaf_age > age)
   2872			age = leaf_age;
   2873		len -= utf8clen(s);
   2874		s += utf8clen(s);
   2875	}
   2876	return age;
   2877}
   2878
   2879/*
   2880 * Maximum age of any character in s, touch at most len bytes.
   2881 * Return -1 if s is not valid UTF-8 unicode.
   2882 */
   2883int utf8nagemin(struct tree *tree, const char *s, size_t len)
   2884{
   2885	utf8leaf_t	*leaf;
   2886	int		leaf_age;
   2887	int		age;
   2888	unsigned char	hangul[UTF8HANGULLEAF];
   2889
   2890	if (!tree)
   2891		return -1;
   2892	age = tree->maxage;
   2893        while (len && *s) {
   2894		leaf = utf8nlookup(tree, hangul, s, len);
   2895		if (!leaf)
   2896			return -1;
   2897		leaf_age = ages[LEAF_GEN(leaf)];
   2898		if (leaf_age <= tree->maxage && leaf_age < age)
   2899			age = leaf_age;
   2900		len -= utf8clen(s);
   2901		s += utf8clen(s);
   2902	}
   2903	return age;
   2904}
   2905
   2906/*
   2907 * Length of the normalization of s.
   2908 * Return -1 if s is not valid UTF-8 unicode.
   2909 *
   2910 * A string of Default_Ignorable_Code_Point has length 0.
   2911 */
   2912ssize_t utf8len(struct tree *tree, const char *s)
   2913{
   2914	utf8leaf_t	*leaf;
   2915	size_t		ret = 0;
   2916	unsigned char	hangul[UTF8HANGULLEAF];
   2917
   2918	if (!tree)
   2919		return -1;
   2920	while (*s) {
   2921		leaf = utf8lookup(tree, hangul, s);
   2922		if (!leaf)
   2923			return -1;
   2924		if (ages[LEAF_GEN(leaf)] > tree->maxage)
   2925			ret += utf8clen(s);
   2926		else if (LEAF_CCC(leaf) == DECOMPOSE)
   2927			ret += strlen(LEAF_STR(leaf));
   2928		else
   2929			ret += utf8clen(s);
   2930		s += utf8clen(s);
   2931	}
   2932	return ret;
   2933}
   2934
   2935/*
   2936 * Length of the normalization of s, touch at most len bytes.
   2937 * Return -1 if s is not valid UTF-8 unicode.
   2938 */
   2939ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
   2940{
   2941	utf8leaf_t	*leaf;
   2942	size_t		ret = 0;
   2943	unsigned char	hangul[UTF8HANGULLEAF];
   2944
   2945	if (!tree)
   2946		return -1;
   2947	while (len && *s) {
   2948		leaf = utf8nlookup(tree, hangul, s, len);
   2949		if (!leaf)
   2950			return -1;
   2951		if (ages[LEAF_GEN(leaf)] > tree->maxage)
   2952			ret += utf8clen(s);
   2953		else if (LEAF_CCC(leaf) == DECOMPOSE)
   2954			ret += strlen(LEAF_STR(leaf));
   2955		else
   2956			ret += utf8clen(s);
   2957		len -= utf8clen(s);
   2958		s += utf8clen(s);
   2959	}
   2960	return ret;
   2961}
   2962
   2963/*
   2964 * Cursor structure used by the normalizer.
   2965 */
   2966struct utf8cursor {
   2967	struct tree	*tree;
   2968	const char	*s;
   2969	const char	*p;
   2970	const char	*ss;
   2971	const char	*sp;
   2972	unsigned int	len;
   2973	unsigned int	slen;
   2974	short int	ccc;
   2975	short int	nccc;
   2976	unsigned int	unichar;
   2977	unsigned char	hangul[UTF8HANGULLEAF];
   2978};
   2979
   2980/*
   2981 * Set up an utf8cursor for use by utf8byte().
   2982 *
   2983 *   s      : string.
   2984 *   len    : length of s.
   2985 *   u8c    : pointer to cursor.
   2986 *   trie   : utf8trie_t to use for normalization.
   2987 *
   2988 * Returns -1 on error, 0 on success.
   2989 */
   2990int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
   2991		size_t len)
   2992{
   2993	if (!tree)
   2994		return -1;
   2995	if (!s)
   2996		return -1;
   2997	u8c->tree = tree;
   2998	u8c->s = s;
   2999	u8c->p = NULL;
   3000	u8c->ss = NULL;
   3001	u8c->sp = NULL;
   3002	u8c->len = len;
   3003	u8c->slen = 0;
   3004	u8c->ccc = STOPPER;
   3005	u8c->nccc = STOPPER;
   3006	u8c->unichar = 0;
   3007	/* Check we didn't clobber the maximum length. */
   3008	if (u8c->len != len)
   3009		return -1;
   3010	/* The first byte of s may not be an utf8 continuation. */
   3011	if (len > 0 && (*s & 0xC0) == 0x80)
   3012		return -1;
   3013	return 0;
   3014}
   3015
   3016/*
   3017 * Set up an utf8cursor for use by utf8byte().
   3018 *
   3019 *   s      : NUL-terminated string.
   3020 *   u8c    : pointer to cursor.
   3021 *   trie   : utf8trie_t to use for normalization.
   3022 *
   3023 * Returns -1 on error, 0 on success.
   3024 */
   3025int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
   3026{
   3027	return utf8ncursor(u8c, tree, s, (unsigned int)-1);
   3028}
   3029
   3030/*
   3031 * Get one byte from the normalized form of the string described by u8c.
   3032 *
   3033 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
   3034 *
   3035 * The cursor keeps track of the location in the string in u8c->s.
   3036 * When a character is decomposed, the current location is stored in
   3037 * u8c->p, and u8c->s is set to the start of the decomposition. Note
   3038 * that bytes from a decomposition do not count against u8c->len.
   3039 *
   3040 * Characters are emitted if they match the current CCC in u8c->ccc.
   3041 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
   3042 * and the function returns 0 in that case.
   3043 *
   3044 * Sorting by CCC is done by repeatedly scanning the string.  The
   3045 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
   3046 * the start of the scan.  The first pass finds the lowest CCC to be
   3047 * emitted and stores it in u8c->nccc, the second pass emits the
   3048 * characters with this CCC and finds the next lowest CCC. This limits
   3049 * the number of passes to 1 + the number of different CCCs in the
   3050 * sequence being scanned.
   3051 *
   3052 * Therefore:
   3053 *  u8c->p  != NULL -> a decomposition is being scanned.
   3054 *  u8c->ss != NULL -> this is a repeating scan.
   3055 *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
   3056 */
   3057int utf8byte(struct utf8cursor *u8c)
   3058{
   3059	utf8leaf_t *leaf;
   3060	int ccc;
   3061
   3062	for (;;) {
   3063		/* Check for the end of a decomposed character. */
   3064		if (u8c->p && *u8c->s == '\0') {
   3065			u8c->s = u8c->p;
   3066			u8c->p = NULL;
   3067		}
   3068
   3069		/* Check for end-of-string. */
   3070		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
   3071			/* There is no next byte. */
   3072			if (u8c->ccc == STOPPER)
   3073				return 0;
   3074			/* End-of-string during a scan counts as a stopper. */
   3075			ccc = STOPPER;
   3076			goto ccc_mismatch;
   3077		} else if ((*u8c->s & 0xC0) == 0x80) {
   3078			/* This is a continuation of the current character. */
   3079			if (!u8c->p)
   3080				u8c->len--;
   3081			return (unsigned char)*u8c->s++;
   3082		}
   3083
   3084		/* Look up the data for the current character. */
   3085		if (u8c->p) {
   3086			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
   3087		} else {
   3088			leaf = utf8nlookup(u8c->tree, u8c->hangul,
   3089					   u8c->s, u8c->len);
   3090		}
   3091
   3092		/* No leaf found implies that the input is a binary blob. */
   3093		if (!leaf)
   3094			return -1;
   3095
   3096		/* Characters that are too new have CCC 0. */
   3097		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
   3098			ccc = STOPPER;
   3099		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
   3100			u8c->len -= utf8clen(u8c->s);
   3101			u8c->p = u8c->s + utf8clen(u8c->s);
   3102			u8c->s = LEAF_STR(leaf);
   3103			/* Empty decomposition implies CCC 0. */
   3104			if (*u8c->s == '\0') {
   3105				if (u8c->ccc == STOPPER)
   3106					continue;
   3107				ccc = STOPPER;
   3108				goto ccc_mismatch;
   3109			}
   3110			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
   3111			ccc = LEAF_CCC(leaf);
   3112		}
   3113		u8c->unichar = utf8decode(u8c->s);
   3114
   3115		/*
   3116		 * If this is not a stopper, then see if it updates
   3117		 * the next canonical class to be emitted.
   3118		 */
   3119		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
   3120			u8c->nccc = ccc;
   3121
   3122		/*
   3123		 * Return the current byte if this is the current
   3124		 * combining class.
   3125		 */
   3126		if (ccc == u8c->ccc) {
   3127			if (!u8c->p)
   3128				u8c->len--;
   3129			return (unsigned char)*u8c->s++;
   3130		}
   3131
   3132		/* Current combining class mismatch. */
   3133	ccc_mismatch:
   3134		if (u8c->nccc == STOPPER) {
   3135			/*
   3136			 * Scan forward for the first canonical class
   3137			 * to be emitted.  Save the position from
   3138			 * which to restart.
   3139			 */
   3140			assert(u8c->ccc == STOPPER);
   3141			u8c->ccc = MINCCC - 1;
   3142			u8c->nccc = ccc;
   3143			u8c->sp = u8c->p;
   3144			u8c->ss = u8c->s;
   3145			u8c->slen = u8c->len;
   3146			if (!u8c->p)
   3147				u8c->len -= utf8clen(u8c->s);
   3148			u8c->s += utf8clen(u8c->s);
   3149		} else if (ccc != STOPPER) {
   3150			/* Not a stopper, and not the ccc we're emitting. */
   3151			if (!u8c->p)
   3152				u8c->len -= utf8clen(u8c->s);
   3153			u8c->s += utf8clen(u8c->s);
   3154		} else if (u8c->nccc != MAXCCC + 1) {
   3155			/* At a stopper, restart for next ccc. */
   3156			u8c->ccc = u8c->nccc;
   3157			u8c->nccc = MAXCCC + 1;
   3158			u8c->s = u8c->ss;
   3159			u8c->p = u8c->sp;
   3160			u8c->len = u8c->slen;
   3161		} else {
   3162			/* All done, proceed from here. */
   3163			u8c->ccc = STOPPER;
   3164			u8c->nccc = STOPPER;
   3165			u8c->sp = NULL;
   3166			u8c->ss = NULL;
   3167			u8c->slen = 0;
   3168		}
   3169	}
   3170}
   3171
   3172/* ------------------------------------------------------------------ */
   3173
   3174static int normalize_line(struct tree *tree)
   3175{
   3176	char *s;
   3177	char *t;
   3178	int c;
   3179	struct utf8cursor u8c;
   3180
   3181	/* First test: null-terminated string. */
   3182	s = buf2;
   3183	t = buf3;
   3184	if (utf8cursor(&u8c, tree, s))
   3185		return -1;
   3186	while ((c = utf8byte(&u8c)) > 0)
   3187		if (c != (unsigned char)*t++)
   3188			return -1;
   3189	if (c < 0)
   3190		return -1;
   3191	if (*t != 0)
   3192		return -1;
   3193
   3194	/* Second test: length-limited string. */
   3195	s = buf2;
   3196	/* Replace NUL with a value that will cause an error if seen. */
   3197	s[strlen(s) + 1] = -1;
   3198	t = buf3;
   3199	if (utf8cursor(&u8c, tree, s))
   3200		return -1;
   3201	while ((c = utf8byte(&u8c)) > 0)
   3202		if (c != (unsigned char)*t++)
   3203			return -1;
   3204	if (c < 0)
   3205		return -1;
   3206	if (*t != 0)
   3207		return -1;
   3208
   3209	return 0;
   3210}
   3211
   3212static void normalization_test(void)
   3213{
   3214	FILE *file;
   3215	unsigned int unichar;
   3216	struct unicode_data *data;
   3217	char *s;
   3218	char *t;
   3219	int ret;
   3220	int ignorables;
   3221	int tests = 0;
   3222	int failures = 0;
   3223
   3224	if (verbose > 0)
   3225		printf("Parsing %s\n", test_name);
   3226	/* Step one, read data from file. */
   3227	file = fopen(test_name, "r");
   3228	if (!file)
   3229		open_fail(test_name, errno);
   3230
   3231	while (fgets(line, LINESIZE, file)) {
   3232		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
   3233			     buf0, buf1);
   3234		if (ret != 2 || *line == '#')
   3235			continue;
   3236		s = buf0;
   3237		t = buf2;
   3238		while (*s) {
   3239			unichar = strtoul(s, &s, 16);
   3240			t += utf8encode(t, unichar);
   3241		}
   3242		*t = '\0';
   3243
   3244		ignorables = 0;
   3245		s = buf1;
   3246		t = buf3;
   3247		while (*s) {
   3248			unichar = strtoul(s, &s, 16);
   3249			data = &unicode_data[unichar];
   3250			if (data->utf8nfdi && !*data->utf8nfdi)
   3251				ignorables = 1;
   3252			else
   3253				t += utf8encode(t, unichar);
   3254		}
   3255		*t = '\0';
   3256
   3257		tests++;
   3258		if (normalize_line(nfdi_tree) < 0) {
   3259			printf("Line %s -> %s", buf0, buf1);
   3260			if (ignorables)
   3261				printf(" (ignorables removed)");
   3262			printf(" failure\n");
   3263			failures++;
   3264		}
   3265	}
   3266	fclose(file);
   3267	if (verbose > 0)
   3268		printf("Ran %d tests with %d failures\n", tests, failures);
   3269	if (failures)
   3270		file_fail(test_name);
   3271}
   3272
   3273/* ------------------------------------------------------------------ */
   3274
   3275static void write_file(void)
   3276{
   3277	FILE *file;
   3278	int i;
   3279	int j;
   3280	int t;
   3281	int gen;
   3282
   3283	if (verbose > 0)
   3284		printf("Writing %s\n", utf8_name);
   3285	file = fopen(utf8_name, "w");
   3286	if (!file)
   3287		open_fail(utf8_name, errno);
   3288
   3289	fprintf(file, "/* This file is generated code, do not edit. */\n");
   3290	fprintf(file, "\n");
   3291	fprintf(file, "#include <linux/module.h>\n");
   3292	fprintf(file, "#include <linux/kernel.h>\n");
   3293	fprintf(file, "#include \"utf8n.h\"\n");
   3294	fprintf(file, "\n");
   3295	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
   3296	for (i = 0; i != ages_count; i++)
   3297		fprintf(file, "\t%#x%s\n", ages[i],
   3298			ages[i] == unicode_maxage ? "" : ",");
   3299	fprintf(file, "};\n");
   3300	fprintf(file, "\n");
   3301	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
   3302	t = 0;
   3303	for (gen = 0; gen < ages_count; gen++) {
   3304		fprintf(file, "\t{ %#x, %d }%s\n",
   3305			ages[gen], trees[t].index,
   3306			ages[gen] == unicode_maxage ? "" : ",");
   3307		if (trees[t].maxage == ages[gen])
   3308			t += 2;
   3309	}
   3310	fprintf(file, "};\n");
   3311	fprintf(file, "\n");
   3312	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
   3313	t = 1;
   3314	for (gen = 0; gen < ages_count; gen++) {
   3315		fprintf(file, "\t{ %#x, %d }%s\n",
   3316			ages[gen], trees[t].index,
   3317			ages[gen] == unicode_maxage ? "" : ",");
   3318		if (trees[t].maxage == ages[gen])
   3319			t += 2;
   3320	}
   3321	fprintf(file, "};\n");
   3322	fprintf(file, "\n");
   3323	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
   3324		utf8data_size);
   3325	t = 0;
   3326	for (i = 0; i != utf8data_size; i += 16) {
   3327		if (i == trees[t].index) {
   3328			fprintf(file, "\t/* %s_%x */\n",
   3329				trees[t].type, trees[t].maxage);
   3330			if (t < trees_count-1)
   3331				t++;
   3332		}
   3333		fprintf(file, "\t");
   3334		for (j = i; j != i + 16; j++)
   3335			fprintf(file, "0x%.2x%s", utf8data[j],
   3336				(j < utf8data_size -1 ? "," : ""));
   3337		fprintf(file, "\n");
   3338	}
   3339	fprintf(file, "};\n");
   3340	fprintf(file, "\n");
   3341	fprintf(file, "struct utf8data_table utf8_data_table = {\n");
   3342	fprintf(file, "\t.utf8agetab = utf8agetab,\n");
   3343	fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
   3344	fprintf(file, "\n");
   3345	fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
   3346	fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
   3347	fprintf(file, "\n");
   3348	fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
   3349	fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
   3350	fprintf(file, "\n");
   3351	fprintf(file, "\t.utf8data = utf8data,\n");
   3352	fprintf(file, "};\n");
   3353	fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
   3354	fprintf(file, "\n");
   3355	fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
   3356	fclose(file);
   3357}
   3358
   3359/* ------------------------------------------------------------------ */
   3360
   3361int main(int argc, char *argv[])
   3362{
   3363	unsigned int unichar;
   3364	int opt;
   3365
   3366	argv0 = argv[0];
   3367
   3368	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
   3369		switch (opt) {
   3370		case 'a':
   3371			age_name = optarg;
   3372			break;
   3373		case 'c':
   3374			ccc_name = optarg;
   3375			break;
   3376		case 'd':
   3377			data_name = optarg;
   3378			break;
   3379		case 'f':
   3380			fold_name = optarg;
   3381			break;
   3382		case 'n':
   3383			norm_name = optarg;
   3384			break;
   3385		case 'o':
   3386			utf8_name = optarg;
   3387			break;
   3388		case 'p':
   3389			prop_name = optarg;
   3390			break;
   3391		case 't':
   3392			test_name = optarg;
   3393			break;
   3394		case 'v':
   3395			verbose++;
   3396			break;
   3397		case 'h':
   3398			help();
   3399			exit(0);
   3400		default:
   3401			usage();
   3402		}
   3403	}
   3404
   3405	if (verbose > 1)
   3406		help();
   3407	for (unichar = 0; unichar != 0x110000; unichar++)
   3408		unicode_data[unichar].code = unichar;
   3409	age_init();
   3410	ccc_init();
   3411	nfdi_init();
   3412	nfdicf_init();
   3413	ignore_init();
   3414	corrections_init();
   3415	hangul_decompose();
   3416	nfdi_decompose();
   3417	nfdicf_decompose();
   3418	utf8_init();
   3419	trees_init();
   3420	trees_populate();
   3421	trees_reduce();
   3422	trees_verify();
   3423	/* Prevent "unused function" warning. */
   3424	(void)lookup(nfdi_tree, " ");
   3425	if (verbose > 2)
   3426		tree_walk(nfdi_tree);
   3427	if (verbose > 2)
   3428		tree_walk(nfdicf_tree);
   3429	normalization_test();
   3430	write_file();
   3431
   3432	return 0;
   3433}