cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

xz_dec_bcj.c (13905B)


      1/*
      2 * Branch/Call/Jump (BCJ) filter decoders
      3 *
      4 * Authors: Lasse Collin <lasse.collin@tukaani.org>
      5 *          Igor Pavlov <https://7-zip.org/>
      6 *
      7 * This file has been put into the public domain.
      8 * You can do whatever you want with this file.
      9 */
     10
     11#include "xz_private.h"
     12
     13/*
     14 * The rest of the file is inside this ifdef. It makes things a little more
     15 * convenient when building without support for any BCJ filters.
     16 */
     17#ifdef XZ_DEC_BCJ
     18
     19struct xz_dec_bcj {
     20	/* Type of the BCJ filter being used */
     21	enum {
     22		BCJ_X86 = 4,        /* x86 or x86-64 */
     23		BCJ_POWERPC = 5,    /* Big endian only */
     24		BCJ_IA64 = 6,       /* Big or little endian */
     25		BCJ_ARM = 7,        /* Little endian only */
     26		BCJ_ARMTHUMB = 8,   /* Little endian only */
     27		BCJ_SPARC = 9       /* Big or little endian */
     28	} type;
     29
     30	/*
     31	 * Return value of the next filter in the chain. We need to preserve
     32	 * this information across calls, because we must not call the next
     33	 * filter anymore once it has returned XZ_STREAM_END.
     34	 */
     35	enum xz_ret ret;
     36
     37	/* True if we are operating in single-call mode. */
     38	bool single_call;
     39
     40	/*
     41	 * Absolute position relative to the beginning of the uncompressed
     42	 * data (in a single .xz Block). We care only about the lowest 32
     43	 * bits so this doesn't need to be uint64_t even with big files.
     44	 */
     45	uint32_t pos;
     46
     47	/* x86 filter state */
     48	uint32_t x86_prev_mask;
     49
     50	/* Temporary space to hold the variables from struct xz_buf */
     51	uint8_t *out;
     52	size_t out_pos;
     53	size_t out_size;
     54
     55	struct {
     56		/* Amount of already filtered data in the beginning of buf */
     57		size_t filtered;
     58
     59		/* Total amount of data currently stored in buf  */
     60		size_t size;
     61
     62		/*
     63		 * Buffer to hold a mix of filtered and unfiltered data. This
     64		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
     65		 *
     66		 * Type         Alignment   Look-ahead
     67		 * x86              1           4
     68		 * PowerPC          4           0
     69		 * IA-64           16           0
     70		 * ARM              4           0
     71		 * ARM-Thumb        2           2
     72		 * SPARC            4           0
     73		 */
     74		uint8_t buf[16];
     75	} temp;
     76};
     77
     78#ifdef XZ_DEC_X86
     79/*
     80 * This is used to test the most significant byte of a memory address
     81 * in an x86 instruction.
     82 */
     83static inline int bcj_x86_test_msbyte(uint8_t b)
     84{
     85	return b == 0x00 || b == 0xFF;
     86}
     87
     88static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
     89{
     90	static const bool mask_to_allowed_status[8]
     91		= { true, true, true, false, true, false, false, false };
     92
     93	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };
     94
     95	size_t i;
     96	size_t prev_pos = (size_t)-1;
     97	uint32_t prev_mask = s->x86_prev_mask;
     98	uint32_t src;
     99	uint32_t dest;
    100	uint32_t j;
    101	uint8_t b;
    102
    103	if (size <= 4)
    104		return 0;
    105
    106	size -= 4;
    107	for (i = 0; i < size; ++i) {
    108		if ((buf[i] & 0xFE) != 0xE8)
    109			continue;
    110
    111		prev_pos = i - prev_pos;
    112		if (prev_pos > 3) {
    113			prev_mask = 0;
    114		} else {
    115			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
    116			if (prev_mask != 0) {
    117				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
    118				if (!mask_to_allowed_status[prev_mask]
    119						|| bcj_x86_test_msbyte(b)) {
    120					prev_pos = i;
    121					prev_mask = (prev_mask << 1) | 1;
    122					continue;
    123				}
    124			}
    125		}
    126
    127		prev_pos = i;
    128
    129		if (bcj_x86_test_msbyte(buf[i + 4])) {
    130			src = get_unaligned_le32(buf + i + 1);
    131			while (true) {
    132				dest = src - (s->pos + (uint32_t)i + 5);
    133				if (prev_mask == 0)
    134					break;
    135
    136				j = mask_to_bit_num[prev_mask] * 8;
    137				b = (uint8_t)(dest >> (24 - j));
    138				if (!bcj_x86_test_msbyte(b))
    139					break;
    140
    141				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
    142			}
    143
    144			dest &= 0x01FFFFFF;
    145			dest |= (uint32_t)0 - (dest & 0x01000000);
    146			put_unaligned_le32(dest, buf + i + 1);
    147			i += 4;
    148		} else {
    149			prev_mask = (prev_mask << 1) | 1;
    150		}
    151	}
    152
    153	prev_pos = i - prev_pos;
    154	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
    155	return i;
    156}
    157#endif
    158
    159#ifdef XZ_DEC_POWERPC
    160static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
    161{
    162	size_t i;
    163	uint32_t instr;
    164
    165	for (i = 0; i + 4 <= size; i += 4) {
    166		instr = get_unaligned_be32(buf + i);
    167		if ((instr & 0xFC000003) == 0x48000001) {
    168			instr &= 0x03FFFFFC;
    169			instr -= s->pos + (uint32_t)i;
    170			instr &= 0x03FFFFFC;
    171			instr |= 0x48000001;
    172			put_unaligned_be32(instr, buf + i);
    173		}
    174	}
    175
    176	return i;
    177}
    178#endif
    179
    180#ifdef XZ_DEC_IA64
    181static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
    182{
    183	static const uint8_t branch_table[32] = {
    184		0, 0, 0, 0, 0, 0, 0, 0,
    185		0, 0, 0, 0, 0, 0, 0, 0,
    186		4, 4, 6, 6, 0, 0, 7, 7,
    187		4, 4, 0, 0, 4, 4, 0, 0
    188	};
    189
    190	/*
    191	 * The local variables take a little bit stack space, but it's less
    192	 * than what LZMA2 decoder takes, so it doesn't make sense to reduce
    193	 * stack usage here without doing that for the LZMA2 decoder too.
    194	 */
    195
    196	/* Loop counters */
    197	size_t i;
    198	size_t j;
    199
    200	/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
    201	uint32_t slot;
    202
    203	/* Bitwise offset of the instruction indicated by slot */
    204	uint32_t bit_pos;
    205
    206	/* bit_pos split into byte and bit parts */
    207	uint32_t byte_pos;
    208	uint32_t bit_res;
    209
    210	/* Address part of an instruction */
    211	uint32_t addr;
    212
    213	/* Mask used to detect which instructions to convert */
    214	uint32_t mask;
    215
    216	/* 41-bit instruction stored somewhere in the lowest 48 bits */
    217	uint64_t instr;
    218
    219	/* Instruction normalized with bit_res for easier manipulation */
    220	uint64_t norm;
    221
    222	for (i = 0; i + 16 <= size; i += 16) {
    223		mask = branch_table[buf[i] & 0x1F];
    224		for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
    225			if (((mask >> slot) & 1) == 0)
    226				continue;
    227
    228			byte_pos = bit_pos >> 3;
    229			bit_res = bit_pos & 7;
    230			instr = 0;
    231			for (j = 0; j < 6; ++j)
    232				instr |= (uint64_t)(buf[i + j + byte_pos])
    233						<< (8 * j);
    234
    235			norm = instr >> bit_res;
    236
    237			if (((norm >> 37) & 0x0F) == 0x05
    238					&& ((norm >> 9) & 0x07) == 0) {
    239				addr = (norm >> 13) & 0x0FFFFF;
    240				addr |= ((uint32_t)(norm >> 36) & 1) << 20;
    241				addr <<= 4;
    242				addr -= s->pos + (uint32_t)i;
    243				addr >>= 4;
    244
    245				norm &= ~((uint64_t)0x8FFFFF << 13);
    246				norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
    247				norm |= (uint64_t)(addr & 0x100000)
    248						<< (36 - 20);
    249
    250				instr &= (1 << bit_res) - 1;
    251				instr |= norm << bit_res;
    252
    253				for (j = 0; j < 6; j++)
    254					buf[i + j + byte_pos]
    255						= (uint8_t)(instr >> (8 * j));
    256			}
    257		}
    258	}
    259
    260	return i;
    261}
    262#endif
    263
    264#ifdef XZ_DEC_ARM
    265static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
    266{
    267	size_t i;
    268	uint32_t addr;
    269
    270	for (i = 0; i + 4 <= size; i += 4) {
    271		if (buf[i + 3] == 0xEB) {
    272			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
    273					| ((uint32_t)buf[i + 2] << 16);
    274			addr <<= 2;
    275			addr -= s->pos + (uint32_t)i + 8;
    276			addr >>= 2;
    277			buf[i] = (uint8_t)addr;
    278			buf[i + 1] = (uint8_t)(addr >> 8);
    279			buf[i + 2] = (uint8_t)(addr >> 16);
    280		}
    281	}
    282
    283	return i;
    284}
    285#endif
    286
    287#ifdef XZ_DEC_ARMTHUMB
    288static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
    289{
    290	size_t i;
    291	uint32_t addr;
    292
    293	for (i = 0; i + 4 <= size; i += 2) {
    294		if ((buf[i + 1] & 0xF8) == 0xF0
    295				&& (buf[i + 3] & 0xF8) == 0xF8) {
    296			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
    297					| ((uint32_t)buf[i] << 11)
    298					| (((uint32_t)buf[i + 3] & 0x07) << 8)
    299					| (uint32_t)buf[i + 2];
    300			addr <<= 1;
    301			addr -= s->pos + (uint32_t)i + 4;
    302			addr >>= 1;
    303			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
    304			buf[i] = (uint8_t)(addr >> 11);
    305			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
    306			buf[i + 2] = (uint8_t)addr;
    307			i += 2;
    308		}
    309	}
    310
    311	return i;
    312}
    313#endif
    314
    315#ifdef XZ_DEC_SPARC
    316static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
    317{
    318	size_t i;
    319	uint32_t instr;
    320
    321	for (i = 0; i + 4 <= size; i += 4) {
    322		instr = get_unaligned_be32(buf + i);
    323		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
    324			instr <<= 2;
    325			instr -= s->pos + (uint32_t)i;
    326			instr >>= 2;
    327			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
    328					| 0x40000000 | (instr & 0x3FFFFF);
    329			put_unaligned_be32(instr, buf + i);
    330		}
    331	}
    332
    333	return i;
    334}
    335#endif
    336
    337/*
    338 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
    339 * of data that got filtered.
    340 *
    341 * NOTE: This is implemented as a switch statement to avoid using function
    342 * pointers, which could be problematic in the kernel boot code, which must
    343 * avoid pointers to static data (at least on x86).
    344 */
    345static void bcj_apply(struct xz_dec_bcj *s,
    346		      uint8_t *buf, size_t *pos, size_t size)
    347{
    348	size_t filtered;
    349
    350	buf += *pos;
    351	size -= *pos;
    352
    353	switch (s->type) {
    354#ifdef XZ_DEC_X86
    355	case BCJ_X86:
    356		filtered = bcj_x86(s, buf, size);
    357		break;
    358#endif
    359#ifdef XZ_DEC_POWERPC
    360	case BCJ_POWERPC:
    361		filtered = bcj_powerpc(s, buf, size);
    362		break;
    363#endif
    364#ifdef XZ_DEC_IA64
    365	case BCJ_IA64:
    366		filtered = bcj_ia64(s, buf, size);
    367		break;
    368#endif
    369#ifdef XZ_DEC_ARM
    370	case BCJ_ARM:
    371		filtered = bcj_arm(s, buf, size);
    372		break;
    373#endif
    374#ifdef XZ_DEC_ARMTHUMB
    375	case BCJ_ARMTHUMB:
    376		filtered = bcj_armthumb(s, buf, size);
    377		break;
    378#endif
    379#ifdef XZ_DEC_SPARC
    380	case BCJ_SPARC:
    381		filtered = bcj_sparc(s, buf, size);
    382		break;
    383#endif
    384	default:
    385		/* Never reached but silence compiler warnings. */
    386		filtered = 0;
    387		break;
    388	}
    389
    390	*pos += filtered;
    391	s->pos += filtered;
    392}
    393
    394/*
    395 * Flush pending filtered data from temp to the output buffer.
    396 * Move the remaining mixture of possibly filtered and unfiltered
    397 * data to the beginning of temp.
    398 */
    399static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
    400{
    401	size_t copy_size;
    402
    403	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
    404	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
    405	b->out_pos += copy_size;
    406
    407	s->temp.filtered -= copy_size;
    408	s->temp.size -= copy_size;
    409	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
    410}
    411
    412/*
    413 * The BCJ filter functions are primitive in sense that they process the
    414 * data in chunks of 1-16 bytes. To hide this issue, this function does
    415 * some buffering.
    416 */
    417XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
    418				     struct xz_dec_lzma2 *lzma2,
    419				     struct xz_buf *b)
    420{
    421	size_t out_start;
    422
    423	/*
    424	 * Flush pending already filtered data to the output buffer. Return
    425	 * immediately if we couldn't flush everything, or if the next
    426	 * filter in the chain had already returned XZ_STREAM_END.
    427	 */
    428	if (s->temp.filtered > 0) {
    429		bcj_flush(s, b);
    430		if (s->temp.filtered > 0)
    431			return XZ_OK;
    432
    433		if (s->ret == XZ_STREAM_END)
    434			return XZ_STREAM_END;
    435	}
    436
    437	/*
    438	 * If we have more output space than what is currently pending in
    439	 * temp, copy the unfiltered data from temp to the output buffer
    440	 * and try to fill the output buffer by decoding more data from the
    441	 * next filter in the chain. Apply the BCJ filter on the new data
    442	 * in the output buffer. If everything cannot be filtered, copy it
    443	 * to temp and rewind the output buffer position accordingly.
    444	 *
    445	 * This needs to be always run when temp.size == 0 to handle a special
    446	 * case where the output buffer is full and the next filter has no
    447	 * more output coming but hasn't returned XZ_STREAM_END yet.
    448	 */
    449	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
    450		out_start = b->out_pos;
    451		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
    452		b->out_pos += s->temp.size;
    453
    454		s->ret = xz_dec_lzma2_run(lzma2, b);
    455		if (s->ret != XZ_STREAM_END
    456				&& (s->ret != XZ_OK || s->single_call))
    457			return s->ret;
    458
    459		bcj_apply(s, b->out, &out_start, b->out_pos);
    460
    461		/*
    462		 * As an exception, if the next filter returned XZ_STREAM_END,
    463		 * we can do that too, since the last few bytes that remain
    464		 * unfiltered are meant to remain unfiltered.
    465		 */
    466		if (s->ret == XZ_STREAM_END)
    467			return XZ_STREAM_END;
    468
    469		s->temp.size = b->out_pos - out_start;
    470		b->out_pos -= s->temp.size;
    471		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);
    472
    473		/*
    474		 * If there wasn't enough input to the next filter to fill
    475		 * the output buffer with unfiltered data, there's no point
    476		 * to try decoding more data to temp.
    477		 */
    478		if (b->out_pos + s->temp.size < b->out_size)
    479			return XZ_OK;
    480	}
    481
    482	/*
    483	 * We have unfiltered data in temp. If the output buffer isn't full
    484	 * yet, try to fill the temp buffer by decoding more data from the
    485	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
    486	 * fill the actual output buffer by copying filtered data from temp.
    487	 * A mix of filtered and unfiltered data may be left in temp; it will
    488	 * be taken care on the next call to this function.
    489	 */
    490	if (b->out_pos < b->out_size) {
    491		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
    492		s->out = b->out;
    493		s->out_pos = b->out_pos;
    494		s->out_size = b->out_size;
    495		b->out = s->temp.buf;
    496		b->out_pos = s->temp.size;
    497		b->out_size = sizeof(s->temp.buf);
    498
    499		s->ret = xz_dec_lzma2_run(lzma2, b);
    500
    501		s->temp.size = b->out_pos;
    502		b->out = s->out;
    503		b->out_pos = s->out_pos;
    504		b->out_size = s->out_size;
    505
    506		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
    507			return s->ret;
    508
    509		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);
    510
    511		/*
    512		 * If the next filter returned XZ_STREAM_END, we mark that
    513		 * everything is filtered, since the last unfiltered bytes
    514		 * of the stream are meant to be left as is.
    515		 */
    516		if (s->ret == XZ_STREAM_END)
    517			s->temp.filtered = s->temp.size;
    518
    519		bcj_flush(s, b);
    520		if (s->temp.filtered > 0)
    521			return XZ_OK;
    522	}
    523
    524	return s->ret;
    525}
    526
    527XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
    528{
    529	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
    530	if (s != NULL)
    531		s->single_call = single_call;
    532
    533	return s;
    534}
    535
    536XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
    537{
    538	switch (id) {
    539#ifdef XZ_DEC_X86
    540	case BCJ_X86:
    541#endif
    542#ifdef XZ_DEC_POWERPC
    543	case BCJ_POWERPC:
    544#endif
    545#ifdef XZ_DEC_IA64
    546	case BCJ_IA64:
    547#endif
    548#ifdef XZ_DEC_ARM
    549	case BCJ_ARM:
    550#endif
    551#ifdef XZ_DEC_ARMTHUMB
    552	case BCJ_ARMTHUMB:
    553#endif
    554#ifdef XZ_DEC_SPARC
    555	case BCJ_SPARC:
    556#endif
    557		break;
    558
    559	default:
    560		/* Unsupported Filter ID */
    561		return XZ_OPTIONS_ERROR;
    562	}
    563
    564	s->type = id;
    565	s->ret = XZ_OK;
    566	s->pos = 0;
    567	s->x86_prev_mask = 0;
    568	s->temp.filtered = 0;
    569	s->temp.size = 0;
    570
    571	return XZ_OK;
    572}
    573
    574#endif