cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

twofish-x86_64-asm_64.S (7550B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12
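
/* byte offsets of the four 32-bit block words a, b, c, d within a
   16-byte block; used below to select the whitening key words */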

/* structure of the crypto context struct */

#define s0	0	/* S0 array, 256 words each */
#define s1	1024	/* S1 array */
#define s2	2048	/* S2 array */
#define s3	3072	/* S3 array */
#define w	4096	/* 8 whitening keys (words) */
#define k	4128	/* round keys 1-32 (words) */
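
/*
 * For orientation, these offsets correspond to a context layout along
 * the lines of the following C sketch (cf. struct twofish_ctx in
 * include/crypto/twofish.h):
 *
 *	struct twofish_ctx {
 *		u32 s[4][256];	// key-dependent s-boxes, 1024 bytes each
 *		u32 w[8];	// whitening keys, at offset 4096
 *		u32 k[32];	// round keys, at offset 4128
 *	};
 */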

/* define a few register aliases to allow macro substitution */

#define R0     %rax
#define R0D    %eax
#define R0B    %al
#define R0H    %ah

#define R1     %rbx
#define R1D    %ebx
#define R1B    %bl
#define R1H    %bh

#define R2     %rcx
#define R2D    %ecx
#define R2B    %cl
#define R2H    %ch

#define R3     %rdx
#define R3D    %edx
#define R3B    %dl
#define R3H    %dh
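
/*
 * The macros below paste these aliases with ## to select an operand
 * width: e.g. with a = R1, "a ## D" expands to R1D (%ebx), "a ## B"
 * to R1B (%bl) and "a ## H" to R1H (%bh).
 */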


/* performs input whitening */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/* performs output whitening */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;
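
/*
 * Example expansion (a sketch for orientation):
 * input_whitening(R1,%r11,a_offset) becomes
 *
 *	xor	w+0(%r11),	%rbx
 *
 * i.e. a single 64-bit xor applies two adjacent 32-bit whitening words
 * to a register pair at once; output_whitening() does the same with the
 * second half of the whitening key material (offset w+16).
 */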


/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;
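
/*
 * Rough C sketch of one encryption round, for orientation only; the
 * asm folds the 1-bit rotates and the byte extraction into the
 * "rotated 16" register staging described above:
 *
 *	t0  = g(a);			// four key-dependent s-box lookups, xored
 *	t1  = g(rol32(b, 8));
 *	t0 += t1;			// pseudo-Hadamard transform
 *	t1 += t0;			// t1 = g(a) + 2*g(rol32(b, 8))
 *	c   = ror32(c ^ (t0 + K[2r+8]), 1);
 *	d   = rol32(d, 1) ^ (t1 + K[2r+9]);
 */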

/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;
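
/*
 * Rough C sketch of one decryption round; the F function is the same
 * as for encryption, but the 1-bit rotates move to the other half,
 * inverting the encryption round:
 *
 *	t0  = g(a);
 *	t1  = g(rol32(b, 8));
 *	t0 += t1;
 *	t1 += t0;
 *	c   = rol32(c, 1) ^ (t0 + K[2r+8]);
 *	d   = ror32(d ^ (t1 + K[2r+9]), 1);
 */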

/*
 * a input register containing a
 * b input register containing b
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

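/*
 * Callable from C roughly as follows (prototype assumed from the glue
 * code; per the SysV ABI ctx arrives in %rdi, dst in %rsi, src in %rdx):
 *
 *	asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx,
 *					u8 *dst, const u8 *src);
 *
 * %eax is additionally set to 1 before returning.
 */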
SYM_FUNC_START(twofish_enc_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* the ctx address is moved to %r11 to free one non-REX register
	   as a target for the 8-bit high-byte operations */
	mov	%rdi,		%r11

	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	mov	R1D,	R0D
	rol	$16,	R0D
	shr	$32,	R1
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R3D
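
	/* the Feistel swap between rounds is never performed explicitly:
	   the halves are exchanged by alternating the argument order of
	   the round macros instead */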

	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);


	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	shl	$32,	R1
	xor	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_enc_blk)
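
/*
 * Decryption entry point; same calling convention and return value as
 * twofish_enc_blk() above (prototype likewise assumed):
 *
 *	asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx,
 *					u8 *dst, const u8 *src);
 */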

SYM_FUNC_START(twofish_dec_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* the ctx address is moved to %r11 to free one non-REX register
	   as a target for the 8-bit high-byte operations */
	mov	%rdi,		%r11

	movq	(R3),	R1
	movq	8(R3),	R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	mov	R1D,	R0D
	shr	$32,	R1
	rol	$16,	R1D
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R2D

	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	shl	$32,	R1
	xor	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_dec_blk)