/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
        .octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
        .octa 0x00000000000000000000000000000003

.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004

.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 2 data blocks output, o
        # %rdx: up to 2 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts two ChaCha blocks by loading the state
        # matrix twice across four AVX registers. It performs matrix operations
        # on four words in each matrix in parallel, but requires shuffling to
        # rearrange the words after each round.

        vzeroupper

        # x0..3[0-1] = s0..3
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        vpaddd CTR2BL(%rip),%ymm3,%ymm3

        vmovdqa %ymm0,%ymm8
        vmovdqa %ymm1,%ymm9
        vmovdqa %ymm2,%ymm10
        vmovdqa %ymm3,%ymm11

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $16,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $12,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $8,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $16,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $12,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $8,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3

        sub $2,%r8d
        jnz .Ldoubleround
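
        # For reference (editorial note): each vpaddd/vpxord/vprold triplet in
        # the loop above is one step of the ChaCha quarter-round; in rough C
        # (illustrative pseudocode only, rol32() as in the kernel's bitops
        # helpers) one quarter-round is:
        #
        #       x0 += x1;  x3 = rol32(x3 ^ x0, 16);
        #       x2 += x3;  x1 = rol32(x1 ^ x2, 12);
        #       x0 += x1;  x3 = rol32(x3 ^ x0,  8);
        #       x2 += x3;  x1 = rol32(x1 ^ x2,  7);
        #
        # applied first to the columns and, after the vpshufd rotations, to
        # the diagonals of the state matrix.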

        # o0 = i0 ^ (x0 + s0)
        vpaddd %ymm8,%ymm0,%ymm7
        cmp $0x10,%rcx
        jl .Lxorpart2
        vpxord 0x00(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x00(%rsi)
        vextracti128 $1,%ymm7,%xmm0
        # o1 = i1 ^ (x1 + s1)
        vpaddd %ymm9,%ymm1,%ymm7
        cmp $0x20,%rcx
        jl .Lxorpart2
        vpxord 0x10(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x10(%rsi)
        vextracti128 $1,%ymm7,%xmm1
        # o2 = i2 ^ (x2 + s2)
        vpaddd %ymm10,%ymm2,%ymm7
        cmp $0x30,%rcx
        jl .Lxorpart2
        vpxord 0x20(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x20(%rsi)
        vextracti128 $1,%ymm7,%xmm2
        # o3 = i3 ^ (x3 + s3)
        vpaddd %ymm11,%ymm3,%ymm7
        cmp $0x40,%rcx
        jl .Lxorpart2
        vpxord 0x30(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x30(%rsi)
        vextracti128 $1,%ymm7,%xmm3

        # xor and write second block
        vmovdqa %xmm0,%xmm7
        cmp $0x50,%rcx
        jl .Lxorpart2
        vpxord 0x40(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x40(%rsi)

        vmovdqa %xmm1,%xmm7
        cmp $0x60,%rcx
        jl .Lxorpart2
        vpxord 0x50(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x50(%rsi)

        vmovdqa %xmm2,%xmm7
        cmp $0x70,%rcx
        jl .Lxorpart2
        vpxord 0x60(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x60(%rsi)

        vmovdqa %xmm3,%xmm7
        cmp $0x80,%rcx
        jl .Lxorpart2
        vpxord 0x70(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
        vzeroupper
        RET

.Lxorpart2:
        # xor remaining bytes from partial register into output
        mov %rcx,%rax
        and $0xf,%rcx
        jz .Ldone2
        mov %rax,%r9
        and $~0xf,%r9

        mov $1,%rax
        shld %cl,%rax,%rax
        sub $1,%rax
        kmovq %rax,%k1

        vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
        vpxord %xmm7,%xmm1,%xmm1
        vmovdqu8 %xmm1,(%rsi,%r9){%k1}

        jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts four ChaCha blocks by loading the state
        # matrix four times across eight AVX registers. It performs the matrix
        # operations on four words of two matrices in parallel, and
        # sequentially with respect to the same operations on the other two
        # matrices. Since the required word shuffling has a rather high
        # latency, the arithmetic for the two matrix-pairs can be interleaved
        # without much slowdown.
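        #
        # Register layout, for reference: ymm0-ymm3 hold the state rows for
        # blocks 1-2 (one block per 128-bit lane), ymm4-ymm7 hold the rows
        # for blocks 3-4, and ymm11-ymm15 keep copies of the initial state
        # for the final addition.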

        vzeroupper

        # x0..3[0-3] = s0..3
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        vmovdqa %ymm0,%ymm4
        vmovdqa %ymm1,%ymm5
        vmovdqa %ymm2,%ymm6
        vmovdqa %ymm3,%ymm7

        vpaddd CTR2BL(%rip),%ymm3,%ymm3
        vpaddd CTR4BL(%rip),%ymm7,%ymm7

        vmovdqa %ymm0,%ymm11
        vmovdqa %ymm1,%ymm12
        vmovdqa %ymm2,%ymm13
        vmovdqa %ymm3,%ymm14
        vmovdqa %ymm7,%ymm15

.Ldoubleround4:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $16,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxord %ymm4,%ymm7,%ymm7
        vprold $16,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $12,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxord %ymm6,%ymm5,%ymm5
        vprold $12,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $8,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxord %ymm4,%ymm7,%ymm7
        vprold $8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $7,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxord %ymm6,%ymm5,%ymm5
        vprold $7,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        vpshufd $0x39,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3
        vpshufd $0x93,%ymm7,%ymm7

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $16,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxord %ymm4,%ymm7,%ymm7
        vprold $16,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $12,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxord %ymm6,%ymm5,%ymm5
        vprold $12,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $8,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxord %ymm4,%ymm7,%ymm7
        vprold $8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $7,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxord %ymm6,%ymm5,%ymm5
        vprold $7,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        vpshufd $0x93,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3
        vpshufd $0x39,%ymm7,%ymm7

        sub $2,%r8d
        jnz .Ldoubleround4
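
        # For reference (editorial note): the first two blocks come from the
        # low and high 128-bit lanes of ymm0-ymm3 (with their initial state
        # saved in ymm11-ymm14), the last two from ymm4-ymm7 (counter row
        # saved in ymm15). Every 16-byte store below is preceded by a length
        # check, so a short request falls through to the masked tail handling
        # at .Lxorpart4.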

        # o0 = i0 ^ (x0 + s0), first block
        vpaddd %ymm11,%ymm0,%ymm10
        cmp $0x10,%rcx
        jl .Lxorpart4
        vpxord 0x00(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x00(%rsi)
        vextracti128 $1,%ymm10,%xmm0
        # o1 = i1 ^ (x1 + s1), first block
        vpaddd %ymm12,%ymm1,%ymm10
        cmp $0x20,%rcx
        jl .Lxorpart4
        vpxord 0x10(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x10(%rsi)
        vextracti128 $1,%ymm10,%xmm1
        # o2 = i2 ^ (x2 + s2), first block
        vpaddd %ymm13,%ymm2,%ymm10
        cmp $0x30,%rcx
        jl .Lxorpart4
        vpxord 0x20(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x20(%rsi)
        vextracti128 $1,%ymm10,%xmm2
        # o3 = i3 ^ (x3 + s3), first block
        vpaddd %ymm14,%ymm3,%ymm10
        cmp $0x40,%rcx
        jl .Lxorpart4
        vpxord 0x30(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x30(%rsi)
        vextracti128 $1,%ymm10,%xmm3

        # xor and write second block
        vmovdqa %xmm0,%xmm10
        cmp $0x50,%rcx
        jl .Lxorpart4
        vpxord 0x40(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x40(%rsi)

        vmovdqa %xmm1,%xmm10
        cmp $0x60,%rcx
        jl .Lxorpart4
        vpxord 0x50(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x50(%rsi)

        vmovdqa %xmm2,%xmm10
        cmp $0x70,%rcx
        jl .Lxorpart4
        vpxord 0x60(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x60(%rsi)

        vmovdqa %xmm3,%xmm10
        cmp $0x80,%rcx
        jl .Lxorpart4
        vpxord 0x70(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x70(%rsi)

        # o0 = i0 ^ (x0 + s0), third block
        vpaddd %ymm11,%ymm4,%ymm10
        cmp $0x90,%rcx
        jl .Lxorpart4
        vpxord 0x80(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x80(%rsi)
        vextracti128 $1,%ymm10,%xmm4
        # o1 = i1 ^ (x1 + s1), third block
        vpaddd %ymm12,%ymm5,%ymm10
        cmp $0xa0,%rcx
        jl .Lxorpart4
        vpxord 0x90(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x90(%rsi)
        vextracti128 $1,%ymm10,%xmm5
        # o2 = i2 ^ (x2 + s2), third block
        vpaddd %ymm13,%ymm6,%ymm10
        cmp $0xb0,%rcx
        jl .Lxorpart4
        vpxord 0xa0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xa0(%rsi)
        vextracti128 $1,%ymm10,%xmm6
        # o3 = i3 ^ (x3 + s3), third block
        vpaddd %ymm15,%ymm7,%ymm10
        cmp $0xc0,%rcx
        jl .Lxorpart4
        vpxord 0xb0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xb0(%rsi)
        vextracti128 $1,%ymm10,%xmm7

        # xor and write fourth block
        vmovdqa %xmm4,%xmm10
        cmp $0xd0,%rcx
        jl .Lxorpart4
        vpxord 0xc0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xc0(%rsi)

        vmovdqa %xmm5,%xmm10
        cmp $0xe0,%rcx
        jl .Lxorpart4
        vpxord 0xd0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xd0(%rsi)

        vmovdqa %xmm6,%xmm10
        cmp $0xf0,%rcx
        jl .Lxorpart4
        vpxord 0xe0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xe0(%rsi)

        vmovdqa %xmm7,%xmm10
        cmp $0x100,%rcx
        jl .Lxorpart4
        vpxord 0xf0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
        vzeroupper
        RET

.Lxorpart4:
        # xor remaining bytes from partial register into output
        mov %rcx,%rax
        and $0xf,%rcx
        jz .Ldone4
        mov %rax,%r9
        and $~0xf,%r9

        mov $1,%rax
        shld %cl,%rax,%rax
        sub $1,%rax
        kmovq %rax,%k1

        vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
        vpxord %xmm10,%xmm1,%xmm1
        vmovdqu8 %xmm1,(%rsi,%r9){%k1}

        jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 8 data blocks output, o
        # %rdx: up to 8 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts eight consecutive ChaCha blocks by loading
        # the state matrix into AVX registers eight times. Compared to AVX2,
        # this mostly benefits from the new rotate instructions in VL and the
        # additional registers.
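        #
        # Layout note, for reference: after the broadcasts below, ymmN holds
        # state word N for all eight blocks, one block per 32-bit lane, so
        # the double round needs no word shuffling at all; the blocks are
        # only transposed back into byte order after the rounds. ymm16-ymm31
        # keep the initial state for the final addition.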

        vzeroupper

        # x0..15[0-7] = s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpbroadcastd 0x04(%rdi),%ymm1
        vpbroadcastd 0x08(%rdi),%ymm2
        vpbroadcastd 0x0c(%rdi),%ymm3
        vpbroadcastd 0x10(%rdi),%ymm4
        vpbroadcastd 0x14(%rdi),%ymm5
        vpbroadcastd 0x18(%rdi),%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm7
        vpbroadcastd 0x20(%rdi),%ymm8
        vpbroadcastd 0x24(%rdi),%ymm9
        vpbroadcastd 0x28(%rdi),%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm11
        vpbroadcastd 0x30(%rdi),%ymm12
        vpbroadcastd 0x34(%rdi),%ymm13
        vpbroadcastd 0x38(%rdi),%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm15

        # x12 += counter values 0-7
        vpaddd CTR8BL(%rip),%ymm12,%ymm12

        vmovdqa64 %ymm0,%ymm16
        vmovdqa64 %ymm1,%ymm17
        vmovdqa64 %ymm2,%ymm18
        vmovdqa64 %ymm3,%ymm19
        vmovdqa64 %ymm4,%ymm20
        vmovdqa64 %ymm5,%ymm21
        vmovdqa64 %ymm6,%ymm22
        vmovdqa64 %ymm7,%ymm23
        vmovdqa64 %ymm8,%ymm24
        vmovdqa64 %ymm9,%ymm25
        vmovdqa64 %ymm10,%ymm26
        vmovdqa64 %ymm11,%ymm27
        vmovdqa64 %ymm12,%ymm28
        vmovdqa64 %ymm13,%ymm29
        vmovdqa64 %ymm14,%ymm30
        vmovdqa64 %ymm15,%ymm31

.Ldoubleround8:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd %ymm0,%ymm4,%ymm0
        vpxord %ymm0,%ymm12,%ymm12
        vprold $16,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd %ymm1,%ymm5,%ymm1
        vpxord %ymm1,%ymm13,%ymm13
        vprold $16,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd %ymm2,%ymm6,%ymm2
        vpxord %ymm2,%ymm14,%ymm14
        vprold $16,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd %ymm3,%ymm7,%ymm3
        vpxord %ymm3,%ymm15,%ymm15
        vprold $16,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxord %ymm8,%ymm4,%ymm4
        vprold $12,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxord %ymm9,%ymm5,%ymm5
        vprold $12,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxord %ymm10,%ymm6,%ymm6
        vprold $12,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxord %ymm11,%ymm7,%ymm7
        vprold $12,%ymm7,%ymm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd %ymm0,%ymm4,%ymm0
        vpxord %ymm0,%ymm12,%ymm12
        vprold $8,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd %ymm1,%ymm5,%ymm1
        vpxord %ymm1,%ymm13,%ymm13
        vprold $8,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd %ymm2,%ymm6,%ymm2
        vpxord %ymm2,%ymm14,%ymm14
        vprold $8,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd %ymm3,%ymm7,%ymm3
        vpxord %ymm3,%ymm15,%ymm15
        vprold $8,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxord %ymm8,%ymm4,%ymm4
        vprold $7,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxord %ymm9,%ymm5,%ymm5
        vprold $7,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxord %ymm10,%ymm6,%ymm6
        vprold $7,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxord %ymm11,%ymm7,%ymm7
        vprold $7,%ymm7,%ymm7
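
        # The second half of the double round below applies the same
        # quarter-round pattern to the diagonals: (x0,x5,x10,x15),
        # (x1,x6,x11,x12), (x2,x7,x8,x13) and (x3,x4,x9,x14).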

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd %ymm0,%ymm5,%ymm0
        vpxord %ymm0,%ymm15,%ymm15
        vprold $16,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd %ymm1,%ymm6,%ymm1
        vpxord %ymm1,%ymm12,%ymm12
        vprold $16,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd %ymm2,%ymm7,%ymm2
        vpxord %ymm2,%ymm13,%ymm13
        vprold $16,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd %ymm3,%ymm4,%ymm3
        vpxord %ymm3,%ymm14,%ymm14
        vprold $16,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxord %ymm10,%ymm5,%ymm5
        vprold $12,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxord %ymm11,%ymm6,%ymm6
        vprold $12,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxord %ymm8,%ymm7,%ymm7
        vprold $12,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxord %ymm9,%ymm4,%ymm4
        vprold $12,%ymm4,%ymm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd %ymm0,%ymm5,%ymm0
        vpxord %ymm0,%ymm15,%ymm15
        vprold $8,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd %ymm1,%ymm6,%ymm1
        vpxord %ymm1,%ymm12,%ymm12
        vprold $8,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd %ymm2,%ymm7,%ymm2
        vpxord %ymm2,%ymm13,%ymm13
        vprold $8,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd %ymm3,%ymm4,%ymm3
        vpxord %ymm3,%ymm14,%ymm14
        vprold $8,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxord %ymm10,%ymm5,%ymm5
        vprold $7,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxord %ymm11,%ymm6,%ymm6
        vprold $7,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxord %ymm8,%ymm7,%ymm7
        vprold $7,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxord %ymm9,%ymm4,%ymm4
        vprold $7,%ymm4,%ymm4

        sub $2,%r8d
        jnz .Ldoubleround8

        # x0..15[0-7] += s[0..15]
        vpaddd %ymm16,%ymm0,%ymm0
        vpaddd %ymm17,%ymm1,%ymm1
        vpaddd %ymm18,%ymm2,%ymm2
        vpaddd %ymm19,%ymm3,%ymm3
        vpaddd %ymm20,%ymm4,%ymm4
        vpaddd %ymm21,%ymm5,%ymm5
        vpaddd %ymm22,%ymm6,%ymm6
        vpaddd %ymm23,%ymm7,%ymm7
        vpaddd %ymm24,%ymm8,%ymm8
        vpaddd %ymm25,%ymm9,%ymm9
        vpaddd %ymm26,%ymm10,%ymm10
        vpaddd %ymm27,%ymm11,%ymm11
        vpaddd %ymm28,%ymm12,%ymm12
        vpaddd %ymm29,%ymm13,%ymm13
        vpaddd %ymm30,%ymm14,%ymm14
        vpaddd %ymm31,%ymm15,%ymm15

        # interleave 32-bit words in state n, n+1
        vpunpckldq %ymm1,%ymm0,%ymm16
        vpunpckhdq %ymm1,%ymm0,%ymm17
        vpunpckldq %ymm3,%ymm2,%ymm18
        vpunpckhdq %ymm3,%ymm2,%ymm19
        vpunpckldq %ymm5,%ymm4,%ymm20
        vpunpckhdq %ymm5,%ymm4,%ymm21
        vpunpckldq %ymm7,%ymm6,%ymm22
        vpunpckhdq %ymm7,%ymm6,%ymm23
        vpunpckldq %ymm9,%ymm8,%ymm24
        vpunpckhdq %ymm9,%ymm8,%ymm25
        vpunpckldq %ymm11,%ymm10,%ymm26
        vpunpckhdq %ymm11,%ymm10,%ymm27
        vpunpckldq %ymm13,%ymm12,%ymm28
        vpunpckhdq %ymm13,%ymm12,%ymm29
        vpunpckldq %ymm15,%ymm14,%ymm30
        vpunpckhdq %ymm15,%ymm14,%ymm31

        # interleave 64-bit words in state n, n+2
        vpunpcklqdq %ymm18,%ymm16,%ymm0
        vpunpcklqdq %ymm19,%ymm17,%ymm1
        vpunpckhqdq %ymm18,%ymm16,%ymm2
        vpunpckhqdq %ymm19,%ymm17,%ymm3
        vpunpcklqdq %ymm22,%ymm20,%ymm4
        vpunpcklqdq %ymm23,%ymm21,%ymm5
        vpunpckhqdq %ymm22,%ymm20,%ymm6
        vpunpckhqdq %ymm23,%ymm21,%ymm7
        vpunpcklqdq %ymm26,%ymm24,%ymm8
        vpunpcklqdq %ymm27,%ymm25,%ymm9
        vpunpckhqdq %ymm26,%ymm24,%ymm10
        vpunpckhqdq %ymm27,%ymm25,%ymm11
        vpunpcklqdq %ymm30,%ymm28,%ymm12
        vpunpcklqdq %ymm31,%ymm29,%ymm13
        vpunpckhqdq %ymm30,%ymm28,%ymm14
        vpunpckhqdq %ymm31,%ymm29,%ymm15
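
        # For reference (editorial note): together, the 32-bit, 64-bit and
        # 128-bit interleave steps transpose the data from "one state word
        # per register, one block per 32-bit lane" back into eight contiguous
        # 64-byte keystream blocks that are xor'ed against the input below.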

        # interleave 128-bit words in state n, n+4
        # xor/write first four blocks
        vmovdqa64 %ymm0,%ymm16
        vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
        cmp $0x0020,%rcx
        jl .Lxorpart8
        vpxord 0x0000(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0000(%rsi)
        vmovdqa64 %ymm16,%ymm0
        vperm2i128 $0x31,%ymm4,%ymm0,%ymm4

        vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
        cmp $0x0040,%rcx
        jl .Lxorpart8
        vpxord 0x0020(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0020(%rsi)
        vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

        vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
        cmp $0x0060,%rcx
        jl .Lxorpart8
        vpxord 0x0040(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0040(%rsi)
        vperm2i128 $0x31,%ymm6,%ymm2,%ymm6

        vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
        cmp $0x0080,%rcx
        jl .Lxorpart8
        vpxord 0x0060(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0060(%rsi)
        vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

        vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
        cmp $0x00a0,%rcx
        jl .Lxorpart8
        vpxord 0x0080(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0080(%rsi)
        vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

        vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
        cmp $0x00c0,%rcx
        jl .Lxorpart8
        vpxord 0x00a0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x00a0(%rsi)
        vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

        vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
        cmp $0x00e0,%rcx
        jl .Lxorpart8
        vpxord 0x00c0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x00c0(%rsi)
        vperm2i128 $0x31,%ymm7,%ymm3,%ymm7

        vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
        cmp $0x0100,%rcx
        jl .Lxorpart8
        vpxord 0x00e0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x00e0(%rsi)
        vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

        # xor remaining blocks, write to output
        vmovdqa64 %ymm4,%ymm0
        cmp $0x0120,%rcx
        jl .Lxorpart8
        vpxord 0x0100(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0100(%rsi)

        vmovdqa64 %ymm12,%ymm0
        cmp $0x0140,%rcx
        jl .Lxorpart8
        vpxord 0x0120(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0120(%rsi)

        vmovdqa64 %ymm6,%ymm0
        cmp $0x0160,%rcx
        jl .Lxorpart8
        vpxord 0x0140(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0140(%rsi)

        vmovdqa64 %ymm14,%ymm0
        cmp $0x0180,%rcx
        jl .Lxorpart8
        vpxord 0x0160(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0160(%rsi)

        vmovdqa64 %ymm5,%ymm0
        cmp $0x01a0,%rcx
        jl .Lxorpart8
        vpxord 0x0180(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0180(%rsi)

        vmovdqa64 %ymm13,%ymm0
        cmp $0x01c0,%rcx
        jl .Lxorpart8
        vpxord 0x01a0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x01a0(%rsi)

        vmovdqa64 %ymm7,%ymm0
        cmp $0x01e0,%rcx
        jl .Lxorpart8
        vpxord 0x01c0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x01c0(%rsi)

        vmovdqa64 %ymm15,%ymm0
        cmp $0x0200,%rcx
        jl .Lxorpart8
        vpxord 0x01e0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x01e0(%rsi)

.Ldone8:
        vzeroupper
        RET

.Lxorpart8:
        # xor remaining bytes from partial register into output
        mov %rcx,%rax
        and $0x1f,%rcx
        jz .Ldone8
        mov %rax,%r9
        and $~0x1f,%r9

        mov $1,%rax
        shld %cl,%rax,%rax
        sub $1,%rax
        kmovq %rax,%k1

        vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
        vpxord %ymm0,%ymm1,%ymm1
        vmovdqu8 %ymm1,(%rsi,%r9){%k1}

        jmp .Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)
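
# Editorial note: the C glue code is expected to declare these entry points
# roughly as follows (illustrative prototype, not part of this file):
#
#       asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst,
#                                                  const u8 *src,
#                                                  unsigned int len,
#                                                  int nrounds);
#
# and likewise for the 2- and 4-block variants. The length may be anything up
# to 2/4/8 blocks of 64 bytes; a trailing partial 16-byte chunk (32-byte for
# the 8-block version) is handled by the masked loads/stores at
# .Lxorpart{2,4,8}, whose byte mask is built as (1 << (len % 16)) - 1
# (resp. (1 << (len % 32)) - 1).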