cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

aegis128-neon-inner.c (8609B)


// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 */

#ifdef CONFIG_ARM64
#include <asm/neon-intrinsics.h>

#define AES_ROUND	"aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
#else
#include <arm_neon.h>

#define AES_ROUND	"aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
#endif

#define AEGIS_BLOCK_SIZE	16

#include <stddef.h>

extern int aegis128_have_aes_insn;

void *memcpy(void *dest, const void *src, size_t n);

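/*
 * The AEGIS-128 state is five 128-bit AES blocks (80 bytes in total). It is
 * loaded into NEON registers while a chunk is processed and stored back to
 * memory between calls.
 */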
struct aegis128_state {
	uint8x16_t v[5];
};

extern const uint8_t crypto_aes_sbox[];

static struct aegis128_state aegis128_load_state_neon(const void *state)
{
	return (struct aegis128_state){ {
		vld1q_u8(state),
		vld1q_u8(state + 16),
		vld1q_u8(state + 32),
		vld1q_u8(state + 48),
		vld1q_u8(state + 64)
	} };
}

static void aegis128_save_state_neon(struct aegis128_state st, void *state)
{
	vst1q_u8(state, st.v[0]);
	vst1q_u8(state + 16, st.v[1]);
	vst1q_u8(state + 32, st.v[2]);
	vst1q_u8(state + 48, st.v[3]);
	vst1q_u8(state + 64, st.v[4]);
}

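/*
 * Perform a single keyless AES round (SubBytes, ShiftRows, MixColumns with an
 * all-zero round key) on one block. On arm64 CPUs without the AES
 * instructions, fall back to a tbl/tbx based implementation driven by
 * crypto_aes_sbox.
 */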
static inline __attribute__((always_inline))
uint8x16_t aegis_aes_round(uint8x16_t w)
{
	uint8x16_t z = {};

#ifdef CONFIG_ARM64
	if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
		static const uint8_t shift_rows[] = {
			0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
			0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
		};
		static const uint8_t ror32by8[] = {
			0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
			0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
		};
		uint8x16_t v;

		// shift rows
		w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

		// sub bytes
#ifndef CONFIG_CC_IS_GCC
		v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
#else
		asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
		w -= 0x40;
		asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
		w -= 0x40;
		asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
		w -= 0x40;
		asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
#endif

		// mix columns
		w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
		w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
		w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

		return w;
	}
#endif

	/*
	 * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
	 * to force the compiler to issue the aese/aesmc instructions in pairs.
	 * This is much faster on many cores, where the instruction pair can
	 * execute in a single cycle.
	 */
	asm(AES_ROUND : "+w"(w) : "w"(z));
	return w;
}

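/*
 * One step of the AEGIS-128 state update function: every state word is XORed
 * with a single AES round of its predecessor (wrapping around), and the
 * message block m is folded into v[0].
 */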
static inline __attribute__((always_inline))
struct aegis128_state aegis128_update_neon(struct aegis128_state st,
					   uint8x16_t m)
{
	m       ^= aegis_aes_round(st.v[4]);
	st.v[4] ^= aegis_aes_round(st.v[3]);
	st.v[3] ^= aegis_aes_round(st.v[2]);
	st.v[2] ^= aegis_aes_round(st.v[1]);
	st.v[1] ^= aegis_aes_round(st.v[0]);
	st.v[0] ^= m;

	return st;
}

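/*
 * When GCC builds the table-based arm64 fallback, the AES S-box is preloaded
 * into v16-v31 so that the hand-written tbl/tbx sequence in aegis_aes_round
 * can refer to fixed register ranges. With other compilers, or when the AES
 * instructions are available, this is a no-op.
 */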
static inline __attribute__((always_inline))
void preload_sbox(void)
{
	if (!IS_ENABLED(CONFIG_ARM64) ||
	    !IS_ENABLED(CONFIG_CC_IS_GCC) ||
	    __builtin_expect(aegis128_have_aes_insn, 1))
		return;

	asm("ld1	{v16.16b-v19.16b}, [%0], #64	\n\t"
	    "ld1	{v20.16b-v23.16b}, [%0], #64	\n\t"
	    "ld1	{v24.16b-v27.16b}, [%0], #64	\n\t"
	    "ld1	{v28.16b-v31.16b}, [%0]		\n\t"
	    :: "r"(crypto_aes_sbox));
}

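/*
 * Initialize the state from the key and IV. const0/const1 are the two fixed
 * AEGIS-128 constants (consecutive bytes of the Fibonacci sequence modulo
 * 256); the state is then mixed with ten update rounds before use.
 */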
void crypto_aegis128_init_neon(void *state, const void *key, const void *iv)
{
	static const uint8_t const0[] = {
		0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
		0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
	};
	static const uint8_t const1[] = {
		0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
		0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
	};
	uint8x16_t k = vld1q_u8(key);
	uint8x16_t kiv = k ^ vld1q_u8(iv);
	struct aegis128_state st = {{
		kiv,
		vld1q_u8(const1),
		vld1q_u8(const0),
		k ^ vld1q_u8(const0),
		k ^ vld1q_u8(const1),
	}};
	int i;

	preload_sbox();

	for (i = 0; i < 5; i++) {
		st = aegis128_update_neon(st, k);
		st = aegis128_update_neon(st, kiv);
	}
	aegis128_save_state_neon(st, state);
}

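/* Absorb a single 16-byte block (e.g. associated data) into the state. */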
void crypto_aegis128_update_neon(void *state, const void *msg)
{
	struct aegis128_state st = aegis128_load_state_neon(state);

	preload_sbox();

	st = aegis128_update_neon(st, vld1q_u8(msg));

	aegis128_save_state_neon(st, state);
}

#ifdef CONFIG_ARM
/*
 * AArch32 does not provide these intrinsics natively because it does not
 * implement the underlying instructions. AArch32 only provides the 64-bit
 * wide vtbl.8/vtbx.8 instructions, so use those instead.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
	union {
		uint8x16_t	val;
		uint8x8x2_t	pair;
	} __a = { a };

	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
			   vtbl2_u8(__a.pair, vget_high_u8(b)));
}

static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
{
	union {
		uint8x16_t	val;
		uint8x8x2_t	pair;
	} __a = { a };

	return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
			   vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
}

static int8_t vminvq_s8(int8x16_t v)
{
	int8x8_t s = vpmin_s8(vget_low_s8(v), vget_high_s8(v));

	s = vpmin_s8(s, s);
	s = vpmin_s8(s, s);
	s = vpmin_s8(s, s);

	return vget_lane_s8(s, 0);
}
#endif

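/*
 * Sliding a 16-byte window over this table and feeding it to tbl shifts a
 * block by a byte count: out-of-range indices (the -1 entries) produce zero
 * bytes, so partial blocks are moved into place and zero-padded.
 */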
static const uint8_t permute[] __aligned(64) = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};

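/*
 * Encrypt a chunk of plaintext: every full block is XORed with the keystream
 * word v[1] ^ (v[2] & v[3]) ^ v[4] and fed into the state update. A trailing
 * partial block is shifted into place via the permute table; inputs shorter
 * than one block are bounced through a stack buffer first so that no bytes
 * outside the message are accessed.
 */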
void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
					unsigned int size)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
	const int short_input = size < AEGIS_BLOCK_SIZE;
	uint8x16_t msg;

	preload_sbox();

	while (size >= AEGIS_BLOCK_SIZE) {
		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];

		msg = vld1q_u8(src);
		st = aegis128_update_neon(st, msg);
		msg ^= s;
		vst1q_u8(dst, msg);

		size -= AEGIS_BLOCK_SIZE;
		src += AEGIS_BLOCK_SIZE;
		dst += AEGIS_BLOCK_SIZE;
	}

	if (size > 0) {
		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
		uint8_t buf[AEGIS_BLOCK_SIZE];
		const void *in = src;
		void *out = dst;
		uint8x16_t m;

		if (__builtin_expect(short_input, 0))
			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

		m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
			       vld1q_u8(permute + 32 - size));

		st = aegis128_update_neon(st, m);

		vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
			 vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));

		if (__builtin_expect(short_input, 0))
			memcpy(dst, out, size);
		else
			vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
	}

	aegis128_save_state_neon(st, state);
}

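/*
 * Decrypt a chunk of ciphertext. Unlike encryption, the recovered plaintext
 * block is what gets fed into the state update. For a trailing partial block
 * the vqtbx trick below forces the bytes beyond the message to zero before
 * the update, so encryption and decryption update the state with the same
 * zero-padded block.
 */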
void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
					unsigned int size)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
	const int short_input = size < AEGIS_BLOCK_SIZE;
	uint8x16_t msg;

	preload_sbox();

	while (size >= AEGIS_BLOCK_SIZE) {
		msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
		st = aegis128_update_neon(st, msg);
		vst1q_u8(dst, msg);

		size -= AEGIS_BLOCK_SIZE;
		src += AEGIS_BLOCK_SIZE;
		dst += AEGIS_BLOCK_SIZE;
	}

	if (size > 0) {
		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
		uint8_t buf[AEGIS_BLOCK_SIZE];
		const void *in = src;
		void *out = dst;
		uint8x16_t m;

		if (__builtin_expect(short_input, 0))
			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);

		m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
				   vld1q_u8(permute + 32 - size));

		st = aegis128_update_neon(st, m);

		vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
			 vqtbl1q_u8(m, vld1q_u8(permute + size)));

		if (__builtin_expect(short_input, 0))
			memcpy(dst, out, size);
		else
			vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
	}

	aegis128_save_state_neon(st, state);
}

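/*
 * Finalize: fold the associated-data and message bit lengths into v[3], run
 * seven more update rounds and XOR all five state words into the tag. With a
 * non-zero authsize the computed tag is compared against tag_xor instead and
 * a negative value is returned on any mismatch; otherwise the tag is written
 * to tag_xor.
 */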
int crypto_aegis128_final_neon(void *state, void *tag_xor,
			       unsigned int assoclen,
			       unsigned int cryptlen,
			       unsigned int authsize)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
	uint8x16_t v;
	int i;

	preload_sbox();

	v = st.v[3] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(8ULL * assoclen),
					       vmov_n_u64(8ULL * cryptlen));

	for (i = 0; i < 7; i++)
		st = aegis128_update_neon(st, v);

	v = st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4];

	if (authsize > 0) {
		v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
			       vld1q_u8(permute + authsize));

		return vminvq_s8((int8x16_t)v);
	}

	vst1q_u8(tag_xor, v);
	return 0;
}