blake2s-core.S (7139B)
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * passed through the C preprocessor.
 *
 * Both routines below take their four arguments in %rdi, %rsi, %rdx, %rcx
 * and use the same working-state layout, one 4 x 32-bit row per register:
 *	row a = %xmm0, row b = %xmm1, row c = %xmm2, row d = %xmm3
 */

/* Two 16-byte rows: IV+0x00 seeds row c, IV+0x10 is XORed into row d. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

/* pshufb mask: rotates every 32-bit lane by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302

/* pshufb mask: rotates every 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201

/*
 * Message schedule for the SSSE3 routine: 10 rows (one per round) of 16
 * byte-sized indices.  Each index selects a 32-bit word of the current
 * 64-byte message block; each group of four indices forms one vector of
 * message words, in the order the round consumes them.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/*
 * Message schedule for the AVX-512 routine: 10 rows (one per round) of 16
 * dword indices used as vpermi2d selectors.  The permuted message is written
 * back over the message registers after every round, so each row is applied
 * to the previous round's ordering rather than to the original block.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
/*
 * blake2s_compress_ssse3 - compress whole 64-byte blocks into the state.
 *
 * In:	%rdi = state; bytes 0x00-0x1f and 0x20-0x2f are read and written back
 *	       (presumably struct blake2s_state - confirm against the C side)
 *	%rsi = message; 0x40 bytes are consumed per block
 *	%rdx = number of blocks; 0 returns immediately without touching state
 *	%rcx = increment added (64-bit add) to the low qword at state+0x20
 *	       before each block is compressed
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm8, %xmm10-%xmm15, flags
 *
 * NOTE(review): the increment is taken from the full 64-bit %rcx.  If the C
 * prototype declares it as a 32-bit type, the upper half of %rcx is not
 * guaranteed to be zero - confirm against the caller.
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0		/* row a = state[0x00..0x0f] */
	movdqu		0x10(%rdi),%xmm1	/* row b = state[0x10..0x1f] */
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14	/* counter row, kept live across blocks */
	movq		%rcx,%xmm15		/* increment; frees %rcx for the SIGMA cursor */
	leaq		SIGMA+0xa0(%rip),%r8	/* end of SIGMA: 10 rows * 16 bytes */
	jmp		.Lbeginofloop		/* skip the alignment padding */
	.align		32
.Lbeginofloop:
	/* Per-block setup: save a/b for the final feed-forward, build c/d. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx	/* %rcx = current SIGMA row */
.Lroundloop:
	/* Gather message words SIGMA[0..3] into %xmm4. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather message words SIGMA[4..7] into %xmm5. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Rotate the lanes of rows a, d and c for the second half-round. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Gather message words SIGMA[8..11] into %xmm6. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather message words SIGMA[12..15] into %xmm7. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undo the lane rotation applied before the second half-round. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	/* Next SIGMA row; stop after the tenth. */
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: a ^= c ^ saved a, b ^= d ^ saved b. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi		/* advance to the next message block */
	decq		%rdx
	jnz		.Lbeginofloop
	/* Write back rows a/b and the updated counter row. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - same contract as blake2s_compress_ssse3, using
 * vprord for the rotations and vpermi2d to permute the message in registers.
 *
 * In:	%rdi = state; bytes 0x00-0x1f and 0x20-0x2f are read and written back
 *	%rsi = message; 0x40 bytes are consumed per block
 *	%rdx = number of blocks; 0 returns immediately without touching state
 *	%rcx = increment added (64-bit add) to the low qword at state+0x20
 *	       before each block is compressed
 * Clobbers: %rax, %rcx, %rdx, %rsi, %xmm0-%xmm5, %ymm6-%ymm9, %xmm10,
 *	     %xmm11, %xmm14, %xmm15, flags
 *
 * NOTE(review): as in the SSSE3 routine, the increment is read from the full
 * 64-bit %rcx - confirm the C prototype guarantees the upper half is zero.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	/*
	 * Match the SSSE3 routine: with a zero block count the loop below
	 * would still process one block and then wrap the counter in %rdx.
	 * No ymm register has been touched yet, so vzeroupper can be skipped.
	 */
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_end
	vmovdqu		(%rdi),%xmm0		/* row a = state[0x00..0x0f] */
	vmovdqu		0x10(%rdi),%xmm1	/* row b = state[0x10..0x1f] */
	vmovdqu		0x20(%rdi),%xmm4	/* counter row, kept live across blocks */
	vmovq		%rcx,%xmm5		/* increment; %cl is reused as round counter */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop	/* skip alignment padding */
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Per-block setup: save a/b for the final feed-forward, build c/d. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	/* Load the whole 64-byte block into %ymm6:%ymm7 and advance. */
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* 10 rounds */
.Lblake2s_compress_avx512_roundloop:
	/*
	 * Permute the message with this round's SIGMA2 row.  vpermi2d
	 * overwrites the index registers %ymm8/%ymm9 with the selected words;
	 * the result also becomes the input ordering for the next round.
	 */
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Upper half of %ymm8 holds the next four message words. */
	vextracti128	$0x1,%ymm8,%xmm8
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Rotate the lanes of rows a, d and c for the second half-round. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Upper half of %ymm9 holds the last four message words. */
	vextracti128	$0x1,%ymm9,%xmm9
	/* a += m + b; d = ror(d ^ a, 8); c += d; b = ror(b ^ c, 7) */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undo the lane rotation applied before the second half-round. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: a ^= saved a ^ c, b ^= saved b ^ d. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	/* Write back rows a/b and the updated counter row. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper				/* ymm registers were used above */
.Lblake2s_compress_avx512_end:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */