cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

blake2s-core.S (7139B)
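
The two routines below implement the BLAKE2s compression function for SSSE3 and AVX-512 capable x86-64 CPUs. For reference, a minimal sketch of how the C glue code declares them, assuming the prototypes match upstream's arch/x86/crypto/blake2s-glue.c (struct blake2s_state comes from <crypto/internal/blake2s.h>):

#include <crypto/internal/blake2s.h>	/* struct blake2s_state, u8, u32 */

/* Under the System V AMD64 calling convention the arguments arrive in
 * %rdi (state), %rsi (block), %rdx (nblocks) and %rcx (inc), which is
 * how the assembly below uses those registers. */
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
				       const u8 *block, const size_t nblocks,
				       const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
					const u8 *block, const size_t nblocks,
					const u32 inc);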


/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/* BLAKE2s initialization vector, stored as two 128-bit halves. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb masks rotating each 32-bit lane right by 16 and by 8 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/* BLAKE2s message schedule, one byte per message-word index,
 * pre-permuted to match the gather order of the round loop below. */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/* The message schedule again, as 32-bit dword indices permuted for the
 * AVX-512 path, where each row serves as a pair of vpermi2d selectors. */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
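/*
 * blake2s_compress_ssse3(state, block, nblocks, inc)
 *
 * %rdi: state -- struct blake2s_state (h[8], then the t/f counter words)
 * %rsi: block -- message input, 64 bytes per block
 * %rdx: nblocks -- number of 64-byte blocks to compress
 * %rcx: inc -- amount added to the 64-bit t counter per block
 *
 * Each of the ten rounds applies the BLAKE2s G function (RFC 7693) to
 * the four columns and then the four diagonals of the 4x4 word state,
 * with one row of the state held per xmm register:
 *
 *	a += b + m0;  d = ror32(d ^ a, 16);
 *	c += d;       b = ror32(b ^ c, 12);
 *	a += b + m1;  d = ror32(d ^ a, 8);
 *	c += d;       b = ror32(b ^ c, 7);
 */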
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	/* Load h[0..7], the t/f words and the rotate masks; keep inc in
	 * %xmm15 and the end of SIGMA in %r8. */
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15
	leaq		SIGMA+0xa0(%rip),%r8
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	/* Per block: save h, advance the t counter and build rows
	 * v[8..15] from the IV. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
.Lroundloop:
	/* Gather four message words via SIGMA, then the first half of the
	 * column G step: a += b + m; d = ror16(d ^ a); c += d;
	 * b = ror12(b ^ c). */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Second half of the column step, rotating by 8 and 7. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Diagonalize the state rows... */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* ...and repeat both G halves for the diagonal step. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undiagonalize; ten SIGMA rows make ten rounds. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: h ^= v[0..7] ^ v[8..15], then the next block. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	/* Write back the updated h and t/f words. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
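/*
 * blake2s_compress_avx512(state, block, nblocks, inc)
 *
 * Same contract and register assignment as the SSSE3 version, but the
 * message words are selected with vpermi2d through SIGMA2 and the
 * rotations use vprord instead of shift/shuffle pairs. Unlike the
 * SSSE3 version there is no nblocks == 0 early exit.
 */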
SYM_FUNC_START(blake2s_compress_avx512)
	/* Load h[0..7], the t/f words and inc; keep the IV in registers. */
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Per block: save h, advance t, build v[8..15] and pull the whole
	 * 64-byte block into %ymm6/%ymm7. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl
.Lblake2s_compress_avx512_roundloop:
	/* Select this round's 16 message words from the (cumulatively
	 * permuted) block via vpermi2d, then run the column and diagonal
	 * G steps with vprord rotations. */
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Diagonalize... */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* ...and undiagonalize; %cl counts down ten rounds. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: h ^= v[0..7] ^ v[8..15], then the next block. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	/* Write back the updated h and t/f words. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */