cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

blowfish-x86_64-asm_64.S (5954B)


      1/* SPDX-License-Identifier: GPL-2.0-or-later */
      2/*
      3 * Blowfish Cipher Algorithm (x86_64)
      4 *
      5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
      6 */
      7
      8#include <linux/linkage.h>
      9
     10.file "blowfish-x86_64-asm.S"
     11.text
     12
     13/* structure of crypto context */
     14#define p	0
     15#define s0	((16 + 2) * 4)
     16#define s1	((16 + 2 + (1 * 256)) * 4)
     17#define s2	((16 + 2 + (2 * 256)) * 4)
     18#define s3	((16 + 2 + (3 * 256)) * 4)
     19
     20/* register macros */
     21#define CTX %r12
     22#define RIO %rsi
     23
     24#define RX0 %rax
     25#define RX1 %rbx
     26#define RX2 %rcx
     27#define RX3 %rdx
     28
     29#define RX0d %eax
     30#define RX1d %ebx
     31#define RX2d %ecx
     32#define RX3d %edx
     33
     34#define RX0bl %al
     35#define RX1bl %bl
     36#define RX2bl %cl
     37#define RX3bl %dl
     38
     39#define RX0bh %ah
     40#define RX1bh %bh
     41#define RX2bh %ch
     42#define RX3bh %dh
     43
     44#define RT0 %rdi
     45#define RT1 %rsi
     46#define RT2 %r8
     47#define RT3 %r9
     48
     49#define RT0d %edi
     50#define RT1d %esi
     51#define RT2d %r8d
     52#define RT3d %r9d
     53
     54#define RKEY %r10
     55
/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on the block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 index the four tables:
 *
 *	RT0d = ((s0[bits 31..24] + s1[bits 23..16]) ^ s2[bits 15..8])
 *		+ s3[bits 7..0]
 *
 * The rorq/rolq by 16 expose bits 31..16 through RX0bh/RX0bl and then
 * put RX0 back. The rolq by 32 swaps the two halves of RX0, so the final
 * XOR lands on the half that was not used as the index. RT0d is produced
 * by 32-bit operations only, which leaves the upper half of RT0 zero, so
 * the 64-bit XOR does not disturb the other half of RX0.
 *
 * Clobbers RT0, RT1, RT2 and the flags. RT1 is %rsi, the same register
 * as RIO, so a pointer held in RIO does not survive F().
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;
     72
     73#define add_roundkey_enc(n) \
     74	xorq p+4*(n)(CTX), 	RX0;
     75
     76#define round_enc(n) \
     77	add_roundkey_enc(n); \
     78	\
     79	F(); \
     80	F();
     81
     82#define add_roundkey_dec(n) \
     83	movq p+4*(n-1)(CTX),	RT0; \
     84	rorq $32,		RT0; \
     85	xorq RT0,		RX0;
     86
     87#define round_dec(n) \
     88	add_roundkey_dec(n); \
     89	\
     90	F(); \
     91	F(); \
     92
/*
 * Load the 8-byte block at (RIO) into RX0. After the rotate and byte
 * swap, the first four bytes of the block, read as a big-endian word,
 * sit in the low half of RX0 and the last four in the high half.
 */
#define read_block() \
	movq (RIO), 		RX0; \
	rorq $32, 		RX0; \
	bswapq 			RX0;

/*
 * Store RX0 to the 8 bytes at (RIO): the high half goes out first as a
 * big-endian word, followed by the low half. RX0 is left byte-swapped.
 */
#define write_block() \
	bswapq 			RX0; \
	movq RX0, 		(RIO);

/*
 * Same byte order as write_block(), but XORs RX0 into the 8 bytes at
 * (RIO) instead of overwriting them.
 */
#define xor_block() \
	bswapq 			RX0; \
	xorq RX0, 		(RIO);
    105
    106SYM_FUNC_START(__blowfish_enc_blk)
    107	/* input:
    108	 *	%rdi: ctx
    109	 *	%rsi: dst
    110	 *	%rdx: src
    111	 *	%rcx: bool, if true: xor output
    112	 */
    113	movq %r12, %r11;
    114
    115	movq %rdi, CTX;
    116	movq %rsi, %r10;
    117	movq %rdx, RIO;
    118
    119	read_block();
    120
    121	round_enc(0);
    122	round_enc(2);
    123	round_enc(4);
    124	round_enc(6);
    125	round_enc(8);
    126	round_enc(10);
    127	round_enc(12);
    128	round_enc(14);
    129	add_roundkey_enc(16);
    130
    131	movq %r11, %r12;
    132
    133	movq %r10, RIO;
    134	test %cl, %cl;
    135	jnz .L__enc_xor;
    136
    137	write_block();
    138	RET;
    139.L__enc_xor:
    140	xor_block();
    141	RET;
    142SYM_FUNC_END(__blowfish_enc_blk)
    143
    144SYM_FUNC_START(blowfish_dec_blk)
    145	/* input:
    146	 *	%rdi: ctx
    147	 *	%rsi: dst
    148	 *	%rdx: src
    149	 */
    150	movq %r12, %r11;
    151
    152	movq %rdi, CTX;
    153	movq %rsi, %r10;
    154	movq %rdx, RIO;
    155
    156	read_block();
    157
    158	round_dec(17);
    159	round_dec(15);
    160	round_dec(13);
    161	round_dec(11);
    162	round_dec(9);
    163	round_dec(7);
    164	round_dec(5);
    165	round_dec(3);
    166	add_roundkey_dec(1);
    167
    168	movq %r10, RIO;
    169	write_block();
    170
    171	movq %r11, %r12;
    172
    173	RET;
    174SYM_FUNC_END(blowfish_dec_blk)
    175
/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x is one of RX0..RX3; its byte views are formed by pasting "bl"/"bh"
 * onto the macro name, so x must be passed as the RXn macro itself.
 *
 * All four index bytes are extracted before any table is read:
 *	RT1d = bits 15..8, RT3d = bits 7..0 of x, and after the first
 *	rorq by 16, RT0d = original bits 31..24, RT2d = original bits 23..16.
 * The second rorq by 16 completes a rotate by 32, i.e. swaps the halves
 * of x. The table formula is the same as in F():
 *	RT0d = ((s0[RT0] + s1[RT2]) ^ s2[RT1]) + s3[RT3]
 * and RT0 (upper half zero after the 32-bit operations) is XORed into x.
 *
 * Clobbers RT0, RT1, RT2, RT3 and the flags. RT1 is %rsi, the same
 * register as RIO.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;
    195
    196#define add_preloaded_roundkey4() \
    197	xorq RKEY,		RX0; \
    198	xorq RKEY,		RX1; \
    199	xorq RKEY,		RX2; \
    200	xorq RKEY,		RX3;
    201
    202#define preload_roundkey_enc(n) \
    203	movq p+4*(n)(CTX),	RKEY;
    204
    205#define add_roundkey_enc4(n) \
    206	add_preloaded_roundkey4(); \
    207	preload_roundkey_enc(n + 2);
    208
    209#define round_enc4(n) \
    210	add_roundkey_enc4(n); \
    211	\
    212	F4(RX0); \
    213	F4(RX1); \
    214	F4(RX2); \
    215	F4(RX3); \
    216	\
    217	F4(RX0); \
    218	F4(RX1); \
    219	F4(RX2); \
    220	F4(RX3);
    221
    222#define preload_roundkey_dec(n) \
    223	movq p+4*((n)-1)(CTX),	RKEY; \
    224	rorq $32,		RKEY;
    225
    226#define add_roundkey_dec4(n) \
    227	add_preloaded_roundkey4(); \
    228	preload_roundkey_dec(n - 2);
    229
    230#define round_dec4(n) \
    231	add_roundkey_dec4(n); \
    232	\
    233	F4(RX0); \
    234	F4(RX1); \
    235	F4(RX2); \
    236	F4(RX3); \
    237	\
    238	F4(RX0); \
    239	F4(RX1); \
    240	F4(RX2); \
    241	F4(RX3);
    242
    243#define read_block4() \
    244	movq (RIO),		RX0; \
    245	rorq $32,		RX0; \
    246	bswapq 			RX0; \
    247	\
    248	movq 8(RIO),		RX1; \
    249	rorq $32,		RX1; \
    250	bswapq 			RX1; \
    251	\
    252	movq 16(RIO),		RX2; \
    253	rorq $32,		RX2; \
    254	bswapq 			RX2; \
    255	\
    256	movq 24(RIO),		RX3; \
    257	rorq $32,		RX3; \
    258	bswapq 			RX3;
    259
    260#define write_block4() \
    261	bswapq 			RX0; \
    262	movq RX0,		(RIO); \
    263	\
    264	bswapq 			RX1; \
    265	movq RX1,		8(RIO); \
    266	\
    267	bswapq 			RX2; \
    268	movq RX2,		16(RIO); \
    269	\
    270	bswapq 			RX3; \
    271	movq RX3,		24(RIO);
    272
    273#define xor_block4() \
    274	bswapq 			RX0; \
    275	xorq RX0,		(RIO); \
    276	\
    277	bswapq 			RX1; \
    278	xorq RX1,		8(RIO); \
    279	\
    280	bswapq 			RX2; \
    281	xorq RX2,		16(RIO); \
    282	\
    283	bswapq 			RX3; \
    284	xorq RX3,		24(RIO);
    285
    286SYM_FUNC_START(__blowfish_enc_blk_4way)
    287	/* input:
    288	 *	%rdi: ctx
    289	 *	%rsi: dst
    290	 *	%rdx: src
    291	 *	%rcx: bool, if true: xor output
    292	 */
    293	pushq %r12;
    294	pushq %rbx;
    295	pushq %rcx;
    296
    297	movq %rdi, CTX
    298	movq %rsi, %r11;
    299	movq %rdx, RIO;
    300
    301	preload_roundkey_enc(0);
    302
    303	read_block4();
    304
    305	round_enc4(0);
    306	round_enc4(2);
    307	round_enc4(4);
    308	round_enc4(6);
    309	round_enc4(8);
    310	round_enc4(10);
    311	round_enc4(12);
    312	round_enc4(14);
    313	add_preloaded_roundkey4();
    314
    315	popq %r12;
    316	movq %r11, RIO;
    317
    318	test %r12b, %r12b;
    319	jnz .L__enc_xor4;
    320
    321	write_block4();
    322
    323	popq %rbx;
    324	popq %r12;
    325	RET;
    326
    327.L__enc_xor4:
    328	xor_block4();
    329
    330	popq %rbx;
    331	popq %r12;
    332	RET;
    333SYM_FUNC_END(__blowfish_enc_blk_4way)
    334
    335SYM_FUNC_START(blowfish_dec_blk_4way)
    336	/* input:
    337	 *	%rdi: ctx
    338	 *	%rsi: dst
    339	 *	%rdx: src
    340	 */
    341	pushq %r12;
    342	pushq %rbx;
    343
    344	movq %rdi, CTX;
    345	movq %rsi, %r11
    346	movq %rdx, RIO;
    347
    348	preload_roundkey_dec(17);
    349	read_block4();
    350
    351	round_dec4(17);
    352	round_dec4(15);
    353	round_dec4(13);
    354	round_dec4(11);
    355	round_dec4(9);
    356	round_dec4(7);
    357	round_dec4(5);
    358	round_dec4(3);
    359	add_preloaded_roundkey4();
    360
    361	movq %r11, RIO;
    362	write_block4();
    363
    364	popq %rbx;
    365	popq %r12;
    366
    367	RET;
    368SYM_FUNC_END(blowfish_dec_blk_4way)