cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sha1_avx2_x86_64_asm.S (14288B)


/*
 *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record at the start of 'state', from 'input',
 * for an even number of consecutive 64-byte 'blocks'.
 *
 * extern "C" void sha1_transform_avx2(
 *	struct sha1_state *state, const u8* input, int blocks );
 */

#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define	BUF	%rsi	/* arg2 */
#define	CNT	%rdx	/* arg3 */

#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%r11d
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper
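/*
 * Note: vzeroupper is issued around the AVX2 body to clear the upper ymm
 * state and avoid AVX/SSE transition penalties in surrounding code.
 */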
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define HASH_PTR	%r9
#define BLOCKS_CTR	%r8
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
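/*
 * WK(t) addresses the precomputed W[t]+K value: each group of four rounds of
 * the first block occupies the low 16 bytes of a 32-byte slot, and the same
 * rounds of the second block occupy the high 16 bytes. For example:
 *   WK(5)  = (5/4)*32 + (5%4)*4      = 36  (first block, round 5)
 *   WK(85) = (5/4)*32 + (5%4)*4 + 16 = 52  (second block, round 5)
 */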


.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
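/*
 * i.e. \val += \hash; \hash = \val. The saved state word is folded into the
 * freshly computed value and the sum is written back, leaving it in the
 * register as the starting value for the next block.
 */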

.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm
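/*
 * The rotation is a pure assemble-time renaming: each 8-round group gets a
 * fresh WY alias while the WY_minus_* names slide back by one vector, so the
 * sliding window over the message schedule costs no register moves.
 */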

.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu  WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
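/*
 * For rounds 0-15 the schedule is just the byte-swapped message: the low xmm
 * lane is loaded from the first block, the high lane is inserted from the
 * second block, both lanes are byte-swapped, K is added, and the result is
 * spilled to the precalc buffer, one ymm store per 8 scheduling steps.
 */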

.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
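/*
 * Four w values per 128-bit lane are computed together. The highest element
 * initially lacks its w[i-3] term (zeroed by the vpsrldq above, because that
 * value is produced by this very vector); the vpslldq $12 / rotate-by-2
 * sequence injects it afterwards.
 */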

.macro PRECALC_32_79
	/*
	 * in the SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we use the equivalent form:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * which allows more efficient vectorization,
	 * since the w[i]=>w[i-3] dependency is broken
	 */

	.if   ((i & 7) == 0)
	/*
	 * blended AVX2 and ALU instruction scheduling
	 * 1 vector iteration per 8 rounds
	 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
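/*
 * With the rewritten recurrence every input (w[i-6] and older) already lives
 * in an earlier vector, so all four elements of a lane can be computed in
 * parallel and no per-element fix-up is needed, unlike PRECALC_16_31.
 */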

.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm
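/*
 * i counts scheduling steps for two interleaved blocks, so every threshold is
 * doubled: i < 32 covers rounds 0-15, i < 64 rounds 16-31, i < 160 rounds
 * 32-79, and the K constant switches at 40/80/120 instead of 20/40/60.
 */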

.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm

/* The RR macro below selects the round function via the saved ROUND_FUNC value */

.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)        /* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B    /* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if   (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm
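/*
 * ROUND_FUNC switches at round_id 18/38/58 rather than 20/40/60 because each
 * ROUND_Fx invocation computes the F value for the following round; the new
 * function therefore first takes effect at rounds 20/40/60, as the SHA-1
 * schedule requires.
 */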

.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1			/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round:
	 * F1 = (b & c) ^ (~b & d)
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */

.endm

/*
 * Add a constant to \a only if \b >= \c (uses RTA as a temporary):
 * \a = (\b >= \c) ? \a + \d : \a
 */
.macro ADD_IF_GE a, b, c, d
	mov     \a, RTA
	add     $\d, RTA
	cmp     $\c, \b
	cmovge  RTA, \a
.endm
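/*
 * e.g. "ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64" advances the second-block
 * pointer by 64 bytes only while at least two blocks remain, so it never
 * moves past the end of the input.
 */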

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for first 2 blocks
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept    160
		PRECALC i
		.set i, i + 1
	.endr
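	/*
	 * The 160 PRECALC steps above fill one half of the W+K buffer with the
	 * complete schedule (80 rounds x 2 blocks) for the first pair of blocks.
	 */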

	/* Go to next block if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF
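	/*
	 * The two stack halves are double-buffered: the rounds below read the
	 * schedule just written through WK_BUF, while PRECALC, interleaved with
	 * the rounds, fills the other half for the next pair of blocks.
	 */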

	.align 32
_loop:
	/*
	 * The code loops over the input two blocks at a time; BLOCKS_CTR holds
	 * the number of 64-byte blocks still to process and the loop exits
	 * once it reaches zero.
	 */
	test BLOCKS_CTR, BLOCKS_CTR
	jnz _begin
	.align 32
	jmp	_end
	.align 32
_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	jmp _loop0
_loop0:

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update Counter */
	sub $1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	test	BLOCKS_CTR, BLOCKS_CTR
	jz	_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop1
_loop1:
	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop2
_loop2:

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub     $1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	jmp	_loop3
_loop3:

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D
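	/*
	 * After the two blocks of rounds the working values sit in rotated
	 * registers; the moves above place them where the symbolic names will
	 * point once REGALLOC below resets the register allocation.
	 */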

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	_loop

	.align 32
	_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	push	%rbp
	mov	%rsp, %rbp
	and	$~(0x20-1), %rsp
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	/* Setup initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	mov	%rbp, %rsp
	pop	%rbp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	RET

	SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

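/*
 * vpshufb mask that reverses the bytes of each 32-bit word in both 128-bit
 * lanes, converting the big-endian message words to the CPU's byte order.
 */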
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM     sha1_transform_avx2