twofish-avx-x86_64-asm_64.S - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
twofish-avx-x86_64-asm_64.S (9014B)
      1/* SPDX-License-Identifier: GPL-2.0-or-later */
      2/*
      3 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
      4 *
      5 * Copyright (C) 2012 Johannes Goetzfried
      6 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
      7 *
      8 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
      9 */
     10
     11#include <linux/linkage.h>
     12#include <asm/frame.h>
     13#include "glue_helper-asm-avx.S"
     14
     15.file "twofish-avx-x86_64-asm_64.S"
     16
     /*
      * 16-byte constant holding the byte values 15 down to 0, emitted into its
      * own read-only section with 16-byte alignment.  No instruction visible in
      * this file references .Lbswap128_mask.
      * NOTE(review): presumably a byte-reversal shuffle mask kept for macros
      * from glue_helper-asm-avx.S or for code paths that no longer exist here;
      * confirm before dropping it.
      */
     17.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
     18.align 16
     19.Lbswap128_mask:
     20	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
     21
     22.text
     23
     24/*
      * Byte offsets into the crypto context addressed through CTX.
      *
      * s0..s3: four lookup tables spaced 1024 bytes apart.  lookup_32bit
      *         indexes each with a zero-extended byte scaled by 4 and reads a
      *         32-bit entry.
      * w:      key material XORed into the blocks outside the rounds; the
      *         8-block routines load 16 bytes at w and 16 bytes at w + 16.
      * k:      round keys; round n broadcasts the 32-bit words at
      *         k + 8*n and k + 8*n + 4.
      */
     25#define s0	0
     26#define s1	1024
     27#define s2	2048
     28#define s3	3072
     29#define w	4096
     30#define k	4128
     31
     32/**********************************************************************
     33  8-way AVX twofish
     34 **********************************************************************/
     /* Context pointer; every entry point in this file receives it in %rdi. */
     35#define CTX %rdi
     36
     /*
      * First group of four blocks.  After inpack_blocks has transposed the
      * group, each register holds one 32-bit word position of all four blocks.
      */
     37#define RA1 %xmm0
     38#define RB1 %xmm1
     39#define RC1 %xmm2
     40#define RD1 %xmm3
     41
     /* Second group of four blocks, handled the same way as the first. */
     42#define RA2 %xmm4
     43#define RB2 %xmm5
     44#define RC2 %xmm6
     45#define RD2 %xmm7
     46
     /*
      * Table-lookup results for group 1 as written by round_head_2; also used
      * as transpose temporaries by the in/out packing macros.
      */
     47#define RX0 %xmm8
     48#define RY0 %xmm9
     49
     /* Table-lookup results for group 2 as written by round_head_2. */
     50#define RX1 %xmm10
     51#define RY1 %xmm11
     52
     /*
      * Broadcast round keys for the current round.  RK1 additionally carries
      * the 16-byte w key during packing/unpacking and RK2 doubles as a
      * transpose temporary there.
      */
     53#define RK1 %xmm12
     54#define RK2 %xmm13
     55
     /* RT: scratch in the round tails.  RR: scratch in rotate_1l. */
     56#define RT %xmm14
     57#define RR %xmm15
     58
     /*
      * Table index registers used by lookup_32bit.  RID2 is %rsi, which is the
      * dst argument of the exported entry points, so those copy %rsi to %r11
      * before calling the 8-block routines.
      */
     59#define RID1  %r13
     60#define RID1d %r13d
     61#define RID2  %rsi
     62#define RID2d %esi
     63
     /*
      * 64-bit inputs to G for the first operand of a round, with aliases for
      * their two lowest bytes.  RGI1 is %rdx, the src argument of the exported
      * entry points, so src is consumed (or copied) before the rounds run.
      */
     64#define RGI1   %rdx
     65#define RGI1bl %dl
     66#define RGI1bh %dh
     67#define RGI2   %rcx
     68#define RGI2bl %cl
     69#define RGI2bh %ch
     70
     /* 64-bit inputs to G for the second operand of a round. */
     71#define RGI3   %rax
     72#define RGI3bl %al
     73#define RGI3bh %ah
     74#define RGI4   %rbx
     75#define RGI4bl %bl
     76#define RGI4bh %bh
     77
     /*
      * Lookup accumulators.  G leaves its two 64-bit results in RGS2 and RGS3;
      * RGS1 is a temporary.
      */
     78#define RGS1  %r8
     79#define RGS1d %r8d
     80#define RGS2  %r9
     81#define RGS2d %r9d
     82#define RGS3  %r10
     83#define RGS3d %r10d
     84
     85
     /*
      * lookup_32bit - table lookup for the low 32 bits of src.
      *
      * Byte 0 of src indexes table t0, byte 1 indexes t1, byte 2 indexes t2 and
      * byte 3 indexes t3 (all relative to CTX, 4-byte entries).  The four
      * 32-bit entries are XORed together into dst ## d; the 32-bit write
      * leaves the upper half of dst zero.
      *
      * Side effects: src is shifted right by 16; RID1 and RID2 are clobbered.
      * interleave_op(il_reg) is expanded between the last index extraction and
      * the final two table reads so the caller can slot extra work in there.
      */
     86#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
     87	movzbl		src ## bl,        RID1d;     \
     88	movzbl		src ## bh,        RID2d;     \
     89	shrq $16,	src;                         \
     90	movl		t0(CTX, RID1, 4), dst ## d;  \
     91	movl		t1(CTX, RID2, 4), RID2d;     \
     92	movzbl		src ## bl,        RID1d;     \
     93	xorl		RID2d,            dst ## d;  \
     94	movzbl		src ## bh,        RID2d;     \
     95	interleave_op(il_reg);			     \
     96	xorl		t2(CTX, RID1, 4), dst ## d;  \
     97	xorl		t3(CTX, RID2, 4), dst ## d;
     98
     /*
      * dummy - no-op placeholder passed wherever a macro argument is required
      * but no work should be emitted (interleave_op, preload, prerotate).
      */
     99#define dummy(d) /* do nothing */
    100
    /*
     * shr_next - shift reg right by another 16 bits.  Passed to lookup_32bit as
     * interleave_op so that, together with that macro's own 16-bit shift, the
     * next 32-bit word of reg ends up in the low half.
     */
    101#define shr_next(reg) \
    102	shrq $16,	reg;
    103
    /*
     * G - run lookup_32bit on all four 32-bit words held in gi1 and gi2.
     *
     * Results:
     *   RGS2 = lookup(high word of gi1) << 32 | lookup(low word of gi1)
     *   RGS3 = lookup(high word of gi2) << 32 | lookup(low word of gi2)
     *
     * The lookups of gi1 and gi2 are interleaved.  gi1 and gi2 are consumed
     * (shifted right), and RGS1, RID1 and RID2 are clobbered.  The x argument
     * is not used by the body.
     */
    104#define G(gi1, gi2, x, t0, t1, t2, t3) \
    105	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
    106	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
    107	\
    108	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);      \
    109	shlq $32,	RGS2;                                        \
    110	orq		RGS1, RGS2;                                  \
    111	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);      \
    112	shlq $32,	RGS1;                                        \
    113	orq		RGS1, RGS3;
    114
    /*
     * round_head_2 - table-lookup half of one round for both block groups.
     *
     *   x1 = G(a ## 1) with tables s0, s1, s2, s3
     *   y1 = G(b ## 1) with tables s1, s2, s3, s0
     *   x2 = G(a ## 2) with tables s0, s1, s2, s3
     *   y2 = G(b ## 2) with tables s1, s2, s3, s0
     *
     * Precondition: RGI1/RGI2 already hold the low/high 64 bits of a ## 1
     * (see preload_rgi); this macro does not load them itself.  The inputs for
     * the next G are moved out of the vector registers before the previous
     * result is packed, so the general-purpose and vector work overlap.
     *
     * NOTE(review): the rotated table order for b presumably implements the
     * cipher's byte rotation of the second input -- confirm against the
     * Twofish specification.
     */
    115#define round_head_2(a, b, x1, y1, x2, y2) \
    116	vmovq		b ## 1, RGI3;           \
    117	vpextrq $1,	b ## 1, RGI4;           \
    118	\
    119	G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
    120	vmovq		a ## 2, RGI1;           \
    121	vpextrq $1,	a ## 2, RGI2;           \
    122	vmovq		RGS2, x1;               \
    123	vpinsrq $1,	RGS3, x1, x1;           \
    124	\
    125	G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
    126	vmovq		b ## 2, RGI3;           \
    127	vpextrq $1,	b ## 2, RGI4;           \
    128	vmovq		RGS2, y1;               \
    129	vpinsrq $1,	RGS3, y1, y1;           \
    130	\
    131	G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
    132	vmovq		RGS2, x2;               \
    133	vpinsrq $1,	RGS3, x2, x2;           \
    134	\
    135	G(RGI3, RGI4, y2, s1, s2, s3, s0);      \
    136	vmovq		RGS2, y2;               \
    137	vpinsrq $1,	RGS3, y2, y2;
    138
    /*
     * encround_tail - arithmetic half of one encryption round for one group.
     *
     * With x and y as produced by round_head_2 (per 32-bit lane):
     *   c = ror32(c ^ (x + y + RK1), 1)
     *   d = d ^ (x + 2*y + RK2)
     * d is expected to arrive already rotated left by one bit; prerotate(b)
     * applies that rotation to b, which takes the d role in the next round.
     *
     * Clobbers x, y and RT (plus whatever prerotate clobbers).  The a argument
     * is not used by the body.
     */
    139#define encround_tail(a, b, c, d, x, y, prerotate) \
    140	vpaddd			x, y,   x; \
    141	vpaddd			x, RK1, RT;\
    142	prerotate(b);			   \
    143	vpxor			RT, c,  c; \
    144	vpaddd			y, x,   y; \
    145	vpaddd			y, RK2, y; \
    146	vpsrld $1,		c, RT;     \
    147	vpslld $(32 - 1),	c, c;      \
    148	vpor			c, RT,  c; \
    149	vpxor			d, y,   d; \
    150
    /*
     * decround_tail - arithmetic half of one decryption round for one group.
     *
     * With x and y as produced by round_head_2 (per 32-bit lane):
     *   c = c ^ (x + y + RK1)
     *   d = ror32(d ^ (x + 2*y + RK2), 1)
     * c is expected to arrive already rotated left by one bit; prerotate(a)
     * applies that rotation to a, which takes the c role in the next round.
     *
     * Clobbers x, y (y is reused as the rotate temporary) and RT.  The b
     * argument is not used by the body.
     */
    151#define decround_tail(a, b, c, d, x, y, prerotate) \
    152	vpaddd			x, y,   x; \
    153	vpaddd			x, RK1, RT;\
    154	prerotate(a);			   \
    155	vpxor			RT, c,  c; \
    156	vpaddd			y, x,   y; \
    157	vpaddd			y, RK2, y; \
    158	vpxor			d, y,   d; \
    159	vpsrld $1,		d, y;      \
    160	vpslld $(32 - 1),	d, d;      \
    161	vpor			d, y,   d; \
    162
    /*
     * rotate_1l - rotate every 32-bit lane of x left by one bit, in place.
     * Clobbers RR.
     */
    163#define rotate_1l(x) \
    164	vpslld $1,		x, RR;     \
    165	vpsrld $(32 - 1),	x, x;      \
    166	vpor			x, RR,  x;
    167
    /*
     * preload_rgi - copy the low 64 bits of vector register c to RGI1 and the
     * high 64 bits to RGI2, which is the state round_head_2 expects on entry.
     */
    168#define preload_rgi(c) \
    169	vmovq			c, RGI1; \
    170	vpextrq $1,		c, RGI2;
    171
    /*
     * encrypt_round - one encryption round (number n) over both block groups.
     *
     * Broadcasts the two 32-bit round keys of round n into RK1/RK2, runs the
     * table lookups for both groups, then applies encround_tail to group 1 and
     * group 2.  Between the two tails, preload(c ## 1) is expanded: c ## 1 is
     * final at that point and serves as the a ## 1 input of the following
     * round.  a, b, c, d are register-name prefixes (RA, RB, ...) to which the
     * group suffix 1 or 2 is pasted.
     */
    172#define encrypt_round(n, a, b, c, d, preload, prerotate) \
    173	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
    174	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
    175	round_head_2(a, b, RX0, RY0, RX1, RY1);                  \
    176	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
    177	preload(c ## 1);                                         \
    178	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
    179
    /*
     * decrypt_round - one decryption round (number n) over both block groups.
     *
     * Same structure as the encryption round: broadcast the two 32-bit round
     * keys of round n into RK1/RK2, run the table lookups for both groups,
     * then apply decround_tail to group 1 and group 2, expanding
     * preload(c ## 1) in between so the next round's first input is ready in
     * RGI1/RGI2.  a, b, c, d are register-name prefixes to which the group
     * suffix 1 or 2 is pasted.
     */
    180#define decrypt_round(n, a, b, c, d, preload, prerotate) \
    181	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
    182	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
    183	round_head_2(a, b, RX0, RY0, RX1, RY1);                  \
    184	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
    185	preload(c ## 1);                                         \
    186	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
    187
    /*
     * encrypt_cycle - rounds 2n and 2n+1 of encryption.  The second round
     * swaps the register roles (RC/RD become the lookup inputs, RA/RB the
     * values being updated), so no data has to be moved between rounds.
     */
    188#define encrypt_cycle(n) \
    189	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
    190	encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);
    191
    /*
     * encrypt_cycle_last - like encrypt_cycle, but the second round passes
     * dummy for preload and prerotate because no further round follows.
     */
    192#define encrypt_cycle_last(n) \
    193	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
    194	encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);
    195
    /*
     * decrypt_cycle - rounds 2n+1 and 2n of decryption, i.e. the two rounds of
     * a cycle in reverse order, with the register roles swapped between them.
     */
    196#define decrypt_cycle(n) \
    197	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
    198	decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);
    199
    /*
     * decrypt_cycle_last - like decrypt_cycle, but the final round passes
     * dummy for preload and prerotate because no further round follows.
     */
    200#define decrypt_cycle_last(n) \
    201	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
    202	decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);
    203
    /*
     * transpose_4x4 - transpose the 4x4 matrix of 32-bit words formed by
     * x0..x3 in place: afterwards register xi holds word i of each of the four
     * original registers.  Clobbers t0, t1 and t2.
     */
    204#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
    205	vpunpckldq		x1, x0, t0; \
    206	vpunpckhdq		x1, x0, t2; \
    207	vpunpckldq		x3, x2, t1; \
    208	vpunpckhdq		x3, x2, x3; \
    209	\
    210	vpunpcklqdq		t1, t0, x0; \
    211	vpunpckhqdq		t1, t0, x1; \
    212	vpunpcklqdq		x3, t2, x2; \
    213	vpunpckhqdq		x3, t2, x3;
    214
    /*
     * inpack_blocks - prepare four loaded blocks for the rounds: XOR each of
     * x0..x3 with the 16-byte key in wkey, then transpose the group so each
     * register holds one word position of all four blocks.
     * Clobbers t0, t1 and t2.
     */
    215#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
    216	vpxor		x0, wkey, x0; \
    217	vpxor		x1, wkey, x1; \
    218	vpxor		x2, wkey, x2; \
    219	vpxor		x3, wkey, x3; \
    220	\
    221	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
    222
    /*
     * outunpack_blocks - inverse of the input packing: transpose the group
     * back so each register holds one whole block, then XOR each of x0..x3
     * with the 16-byte key in wkey.  Clobbers t0, t1 and t2.
     */
    223#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
    224	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
    225	\
    226	vpxor		x0, wkey, x0; \
    227	vpxor		x1, wkey, x1; \
    228	vpxor		x2, wkey, x2; \
    229	vpxor		x3, wkey, x3;
    230
    231.align 8
    232SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
    233	/* input:
    234	 *	%rdi: ctx, CTX
    235	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
    236	 * output:
    237	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
       	 *
       	 * File-local helper: encrypts eight blocks that are already in
       	 * vector registers, as two groups of four.  Note that the output
       	 * register order differs from the input order.
       	 *
       	 * %r13, %rbx and %rcx are saved and restored.  Left modified on
       	 * return: %rax, %rdx, %rsi, %r8-%r10 and %xmm8-%xmm15.  %r11 and
       	 * %r12 are not touched, which the callers in this file rely on.
    238	 */
    239
       	/* RK1 = the 16 bytes of key material at w, XORed in before round 0. */
    240	vmovdqu w(CTX), RK1;
    241
       	/*
       	 * %r13 (RID1) and %rbx (RGI4) are callee-saved and get clobbered by
       	 * the lookups.  NOTE(review): %rcx (RGI2) is caller-saved and
       	 * __twofish_dec_blk8 does not save it, so this push/pop pair looks
       	 * redundant -- confirm before removing.
       	 */
    242	pushq %r13;
    243	pushq %rbx;
    244	pushq %rcx;
    245
       	/*
       	 * Pack both groups.  The first round expects RGI1/RGI2 preloaded
       	 * from RA1 and the D registers already rotated left by one bit.
       	 */
    246	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
    247	preload_rgi(RA1);
    248	rotate_1l(RD1);
    249	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
    250	rotate_1l(RD2);
    251
       	/* Eight cycles of two rounds each: rounds 0 through 15. */
    252	encrypt_cycle(0);
    253	encrypt_cycle(1);
    254	encrypt_cycle(2);
    255	encrypt_cycle(3);
    256	encrypt_cycle(4);
    257	encrypt_cycle(5);
    258	encrypt_cycle(6);
    259	encrypt_cycle_last(7);
    260
       	/* RK1 = the 16 bytes of key material at w + 16, XORed in on output. */
    261	vmovdqu (w+4*4)(CTX), RK1;
    262
       	/* Restore in reverse push order. */
    263	popq %rcx;
    264	popq %rbx;
    265	popq %r13;
    266
    267	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
    268	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
    269
    270	RET;
    271SYM_FUNC_END(__twofish_enc_blk8)
    272
    273.align 8
    274SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
    275	/* input:
    276	 *	%rdi: ctx, CTX
    277	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
    278	 * output:
    279	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
       	 *
       	 * File-local helper: decrypts eight blocks that are already in
       	 * vector registers, as two groups of four.  The input register
       	 * order matches the output order of __twofish_enc_blk8.
       	 *
       	 * %r13 and %rbx are saved and restored.  Left modified on return:
       	 * %rax, %rcx, %rdx, %rsi, %r8-%r10 and %xmm8-%xmm15.  %r11 and
       	 * %r12 are not touched, which the callers in this file rely on.
    280	 */
    281
       	/* RK1 = the 16 bytes of key material at w + 16, XORed in first. */
    282	vmovdqu (w+4*4)(CTX), RK1;
    283
       	/* %r13 (RID1) and %rbx (RGI4) are callee-saved and get clobbered. */
    284	pushq %r13;
    285	pushq %rbx;
    286
       	/*
       	 * Pack both groups.  The first round expects RGI1/RGI2 preloaded
       	 * from RC1 and the A registers already rotated left by one bit.
       	 */
    287	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
    288	preload_rgi(RC1);
    289	rotate_1l(RA1);
    290	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
    291	rotate_1l(RA2);
    292
       	/* Eight cycles in descending order: rounds 15 down to 0. */
    293	decrypt_cycle(7);
    294	decrypt_cycle(6);
    295	decrypt_cycle(5);
    296	decrypt_cycle(4);
    297	decrypt_cycle(3);
    298	decrypt_cycle(2);
    299	decrypt_cycle(1);
    300	decrypt_cycle_last(0);
    301
       	/* RK1 = the 16 bytes of key material at w, XORed in on output. */
    302	vmovdqu (w)(CTX), RK1;
    303
       	/* Restore in reverse push order. */
    304	popq %rbx;
    305	popq %r13;
    306
    307	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
    308	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
    309
    310	RET;
    311SYM_FUNC_END(__twofish_dec_blk8)
    312
    313SYM_FUNC_START(twofish_ecb_enc_8way)
    314	/* input:
    315	 *	%rdi: ctx, CTX
    316	 *	%rsi: dst
    317	 *	%rdx: src
       	 *
       	 * Exported entry point: encrypts eight blocks from src into dst by
       	 * calling __twofish_enc_blk8.  load_8way and store_8way are not
       	 * defined in this file.
       	 * NOTE(review): presumably they come from glue_helper-asm-avx.S and
       	 * move eight 16-byte blocks between memory and the listed
       	 * registers -- confirm there.
    318	 */
    319	FRAME_BEGIN
    320
       	/* %rsi is RID2 and gets clobbered by the rounds; keep dst in %r11. */
    321	movq %rsi, %r11;
    322
       	/* src (%rdx, which is RGI1) is consumed before the rounds clobber it. */
    323	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
    324
    325	call __twofish_enc_blk8;
    326
       	/* Register order follows the output order of __twofish_enc_blk8. */
    327	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
    328
    329	FRAME_END
    330	RET;
    331SYM_FUNC_END(twofish_ecb_enc_8way)
    332
    333SYM_FUNC_START(twofish_ecb_dec_8way)
    334	/* input:
    335	 *	%rdi: ctx, CTX
    336	 *	%rsi: dst
    337	 *	%rdx: src
       	 *
       	 * Exported entry point: decrypts eight blocks from src into dst by
       	 * calling __twofish_dec_blk8.  load_8way and store_8way are not
       	 * defined in this file.
       	 * NOTE(review): presumably they come from glue_helper-asm-avx.S and
       	 * move eight 16-byte blocks between memory and the listed
       	 * registers -- confirm there.
    338	 */
    339	FRAME_BEGIN
    340
       	/* %rsi is RID2 and gets clobbered by the rounds; keep dst in %r11. */
    341	movq %rsi, %r11;
    342
       	/* Register order follows the input order of __twofish_dec_blk8. */
    343	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
    344
    345	call __twofish_dec_blk8;
    346
       	/* Register order follows the output order of __twofish_dec_blk8. */
    347	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
    348
    349	FRAME_END
    350	RET;
    351SYM_FUNC_END(twofish_ecb_dec_8way)
    352
    353SYM_FUNC_START(twofish_cbc_dec_8way)
    354	/* input:
    355	 *	%rdi: ctx, CTX
    356	 *	%rsi: dst
    357	 *	%rdx: src
       	 *
       	 * Exported entry point: decrypts eight blocks with
       	 * __twofish_dec_blk8 and hands the result, together with both the
       	 * src and dst pointers, to store_cbc_8way (not defined in this
       	 * file).
       	 * NOTE(review): presumably store_cbc_8way XORs each decrypted block
       	 * with the preceding ciphertext block read from src before storing
       	 * to dst, and leaves the first block's IV XOR to the caller --
       	 * confirm in glue_helper-asm-avx.S.
    358	 */
    359	FRAME_BEGIN
    360
       	/* %r12 is callee-saved; it is borrowed below to keep src alive. */
    361	pushq %r12;
    362
       	/*
       	 * The rounds clobber %rsi (RID2) and %rdx (RGI1), and src is still
       	 * needed after decryption, so both pointers are copied to registers
       	 * __twofish_dec_blk8 does not touch.
       	 */
    363	movq %rsi, %r11;
    364	movq %rdx, %r12;
    365
    366	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
    367
    368	call __twofish_dec_blk8;
    369
    370	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
    371
    372	popq %r12;
    373
    374	FRAME_END
    375	RET;
    376SYM_FUNC_END(twofish_cbc_dec_8way)