cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sha1_ssse3_asm.S (11376B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a circular buffer */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
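/*
 * WK(t) addresses stack slot (t mod 16); each w[i]+K value is produced
 * W_PRECALC_AHEAD (16) rounds before the round that consumes it, so the
 * 16-slot ring is never overwritten while a value is still needed
 */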

/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

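/*
 * F1 (rounds 0-19): Ch(b, c, d) = (b & c) | (~b & d), computed here as
 * ((c ^ d) & b) ^ d; the result is left in T1
 */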
.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

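/* F2 (rounds 20-39): Parity(b, c, d) = b ^ c ^ d; the result is left in T1 */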
.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

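/*
 * F3 (rounds 40-59): Maj(b, c, d) = (b & c) | (b & d) | (c & d), computed
 * here as ((b | c) & d) | (b & c); the result is left in T1
 */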
.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

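/* F4 (rounds 60-79) is the same parity function as F2 */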
.macro F4  b, c, d
	F2 \b, \c, \d
.endm

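/*
 * add the block's working value into the saved state word; the register
 * keeps the updated chaining value for the next block
 */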
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7 => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i<32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

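/*
 * W, W_minus_04, ..., W_minus_32 are symbolic names for the eight XMM
 * registers W0-W28; W_PRECALC_ROTATE renames them after every group of four
 * rounds, so W always denotes the group of four w[] values currently being
 * computed and W_minus_NN the previously computed groups
 */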
.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
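/*
 * each 16-byte chunk of input is handled over four scalar rounds: load the
 * four message words, byte-swap them from big-endian, add the round constant
 * and store the result to the WK ring
 */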
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - the last 32 w[i] values are calculated in 8 XMM registers
 * - K+w[i] values are pre-calculated and stored to memory, to be loaded
 *   later by the scalar ALU add instruction
 *
 * rounds 16-31 need some "heavy-lifting" vectorization because of the
 * w[i] -> w[i-3] dependency, but this improves for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * the SHA-1 specification:  w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we use the equivalent:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * this allows more efficient vectorization, since the w[i] -> w[i-3] dependency
 * is broken
 */
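/* the equivalent form needs w[i-32], so it is only usable from round 32 on;
 * rounds 16-31 above use the original recurrence instead */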
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


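/* SHA-1 round constants for rounds 0-19, 20-39, 40-59 and 60-79 */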
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

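/* pshufb control mask: byte-swap each 32-bit word of the big-endian input */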
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
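/*
 * Rough usage sketch (illustrative only; in-tree callers go through the glue
 * code in arch/x86/crypto/sha1_ssse3_glue.c and the crypto API). Since the
 * routine clobbers XMM state, a kernel caller must wrap it in
 * kernel_fpu_begin()/kernel_fpu_end():
 *
 *	struct sha1_state st = { .state = { SHA1_H0, SHA1_H1, SHA1_H2,
 *					    SHA1_H3, SHA1_H4 } };
 *	kernel_fpu_begin();
 *	sha1_transform_ssse3(&st, data, nblocks);  // data: nblocks * 64 bytes
 *	kernel_fpu_end();
 */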
SHA1_VECTOR_ASM     sha1_transform_ssse3

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx