chacha-avx2-x86_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.

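	# For reference, the standard ChaCha quarter-round on 32-bit words
	# (a, b, c, d) is, in C-like pseudocode:
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);
	#
	# Here each %ymm register holds one row of the 4x4 state for both
	# blocks, so every vector instruction below advances the same step of
	# eight quarter-round lanes (four columns x two blocks) at once; the
	# vpshufd shuffles between the two half-rounds re-arrange the rows for
	# the diagonal round.
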
	vzeroupper

	# x0..3[0-2] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vpaddd CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa %ymm0,%ymm8
	vmovdqa %ymm1,%ymm9
	vmovdqa %ymm2,%ymm10
	vmovdqa %ymm3,%ymm11

	vmovdqa ROT8(%rip),%ymm4
	vmovdqa ROT16(%rip),%ymm5

	mov %rcx,%rax

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm6
	vpslld $12,%ymm6,%ymm6
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm7
	vpslld $7,%ymm7,%ymm7
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm6
	vpslld $12,%ymm6,%ymm6
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm7
	vpslld $7,%ymm7,%ymm7
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3

	sub $2,%r8d
	jnz .Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	vpaddd %ymm8,%ymm0,%ymm7
	cmp $0x10,%rax
	jl .Lxorpart2
	vpxor 0x00(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x00(%rsi)
	vextracti128 $1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd %ymm9,%ymm1,%ymm7
	cmp $0x20,%rax
	jl .Lxorpart2
	vpxor 0x10(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x10(%rsi)
	vextracti128 $1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd %ymm10,%ymm2,%ymm7
	cmp $0x30,%rax
	jl .Lxorpart2
	vpxor 0x20(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x20(%rsi)
	vextracti128 $1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd %ymm11,%ymm3,%ymm7
	cmp $0x40,%rax
	jl .Lxorpart2
	vpxor 0x30(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x30(%rsi)
	vextracti128 $1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm7
	cmp $0x50,%rax
	jl .Lxorpart2
	vpxor 0x40(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x40(%rsi)

	vmovdqa %xmm1,%xmm7
	cmp $0x60,%rax
	jl .Lxorpart2
	vpxor 0x50(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x50(%rsi)

	vmovdqa %xmm2,%xmm7
	cmp $0x70,%rax
	jl .Lxorpart2
	vpxor 0x60(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x60(%rsi)

	vmovdqa %xmm3,%xmm7
	cmp $0x80,%rax
	jl .Lxorpart2
	vpxor 0x70(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone2
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%xmm7,%xmm7
	vmovdqa %xmm7,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

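# The .Lxorpart2 tail above (and the matching .Lxorpart4/.Lxorpart8 tails
# below) handles a length that is not a multiple of the register width: the
# leftover input bytes are copied to an aligned scratch slot on the stack,
# XORed there with the current keystream register in one full-width
# operation, and copied back out. Roughly, in C-like pseudocode (a sketch of
# the logic above, with buf standing for the aligned stack slot):
#
#	partial = len & 15;			/* & 31 in the 8-block tail */
#	if (partial) {
#		off = len & ~15;
#		memcpy(buf, src + off, partial);
#		buf_vec = load(buf) ^ keystream;	/* whole-register XOR */
#		store(buf, buf_vec);
#		memcpy(dst + off, buf, partial);
#	}
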
SYM_FUNC_START(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices.
	# Since the required word shuffling has rather high latency, we can
	# do the arithmetic on two matrix-pairs without much slowdown.
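	#
	# Concretely, {x0..x3} in %ymm0-%ymm3 carry blocks 0 and 1 (CTR2BL
	# adds block counters 0 and 1), while the copy in %ymm4-%ymm7 carries
	# blocks 2 and 3 (CTR4BL adds counters 2 and 3). The two instruction
	# streams are interleaved in .Ldoubleround4 so that one matrix-pair
	# can keep executing while the other is still waiting on its shuffles.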

	vzeroupper

	# x0..3[0-4] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vmovdqa %ymm0,%ymm4
	vmovdqa %ymm1,%ymm5
	vmovdqa %ymm2,%ymm6
	vmovdqa %ymm3,%ymm7

	vpaddd CTR2BL(%rip),%ymm3,%ymm3
	vpaddd CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa %ymm0,%ymm11
	vmovdqa %ymm1,%ymm12
	vmovdqa %ymm2,%ymm13
	vmovdqa %ymm3,%ymm14
	vmovdqa %ymm7,%ymm15

	vmovdqa ROT8(%rip),%ymm8
	vmovdqa ROT16(%rip),%ymm9

	mov %rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm9,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	vpshufd $0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3
	vpshufd $0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm9,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	vpshufd $0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3
	vpshufd $0x39,%ymm7,%ymm7

	sub $2,%r8d
	jnz .Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd %ymm11,%ymm0,%ymm10
	cmp $0x10,%rax
	jl .Lxorpart4
	vpxor 0x00(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x00(%rsi)
	vextracti128 $1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd %ymm12,%ymm1,%ymm10
	cmp $0x20,%rax
	jl .Lxorpart4
	vpxor 0x10(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x10(%rsi)
	vextracti128 $1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd %ymm13,%ymm2,%ymm10
	cmp $0x30,%rax
	jl .Lxorpart4
	vpxor 0x20(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x20(%rsi)
	vextracti128 $1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd %ymm14,%ymm3,%ymm10
	cmp $0x40,%rax
	jl .Lxorpart4
	vpxor 0x30(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x30(%rsi)
	vextracti128 $1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm10
	cmp $0x50,%rax
	jl .Lxorpart4
	vpxor 0x40(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x40(%rsi)

	vmovdqa %xmm1,%xmm10
	cmp $0x60,%rax
	jl .Lxorpart4
	vpxor 0x50(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x50(%rsi)

	vmovdqa %xmm2,%xmm10
	cmp $0x70,%rax
	jl .Lxorpart4
	vpxor 0x60(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x60(%rsi)

	vmovdqa %xmm3,%xmm10
	cmp $0x80,%rax
	jl .Lxorpart4
	vpxor 0x70(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd %ymm11,%ymm4,%ymm10
	cmp $0x90,%rax
	jl .Lxorpart4
	vpxor 0x80(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x80(%rsi)
	vextracti128 $1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd %ymm12,%ymm5,%ymm10
	cmp $0xa0,%rax
	jl .Lxorpart4
	vpxor 0x90(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x90(%rsi)
	vextracti128 $1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd %ymm13,%ymm6,%ymm10
	cmp $0xb0,%rax
	jl .Lxorpart4
	vpxor 0xa0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xa0(%rsi)
	vextracti128 $1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd %ymm15,%ymm7,%ymm10
	cmp $0xc0,%rax
	jl .Lxorpart4
	vpxor 0xb0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xb0(%rsi)
	vextracti128 $1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa %xmm4,%xmm10
	cmp $0xd0,%rax
	jl .Lxorpart4
	vpxor 0xc0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xc0(%rsi)

	vmovdqa %xmm5,%xmm10
	cmp $0xe0,%rax
	jl .Lxorpart4
	vpxor 0xd0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xd0(%rsi)

	vmovdqa %xmm6,%xmm10
	cmp $0xf0,%rax
	jl .Lxorpart4
	vpxor 0xe0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xe0(%rsi)

	vmovdqa %xmm7,%xmm10
	cmp $0x100,%rax
	jl .Lxorpart4
	vpxor 0xf0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone4
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%xmm10,%xmm10
	vmovdqa %xmm10,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. As we need some
	# scratch registers, we keep the first four state words (x0..x3) on
	# the stack. The algorithm performs each operation on the
	# corresponding word of each state matrix, hence requires no word
	# shuffling. For the final XORing step we transpose the matrix by
	# interleaving 32-, 64- and then 128-bit words, which allows us to do
	# XOR in AVX registers. 8/16-bit word rotation is done with the
	# slightly better performing byte shuffling; 7/12-bit word rotation
	# uses traditional shift+OR.

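	# After the rounds, vector n holds word n of all eight blocks, one
	# 32-bit lane per block. The interleave steps below transpose this
	# word-major layout into block-major order: the 32-bit and 64-bit
	# unpacks gather four consecutive words of one block into each
	# 128-bit lane, and the final vperm2i128 step pairs those lanes so
	# that every %ymm ends up holding 32 contiguous bytes of a single
	# block, ready to be XORed against the input.
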
	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea 8(%rsp),%r10
	and $~31, %rsp
	sub $0x80, %rsp
	mov %rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpbroadcastd 0x04(%rdi),%ymm1
	vpbroadcastd 0x08(%rdi),%ymm2
	vpbroadcastd 0x0c(%rdi),%ymm3
	vpbroadcastd 0x10(%rdi),%ymm4
	vpbroadcastd 0x14(%rdi),%ymm5
	vpbroadcastd 0x18(%rdi),%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm7
	vpbroadcastd 0x20(%rdi),%ymm8
	vpbroadcastd 0x24(%rdi),%ymm9
	vpbroadcastd 0x28(%rdi),%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm11
	vpbroadcastd 0x30(%rdi),%ymm12
	vpbroadcastd 0x34(%rdi),%ymm13
	vpbroadcastd 0x38(%rdi),%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa %ymm0,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm3,0x60(%rsp)

	vmovdqa CTRINC(%rip),%ymm1
	vmovdqa ROT8(%rip),%ymm2
	vmovdqa ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

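	# Register use in .Ldoubleround8: x0..x3 live in the 32-byte slots at
	# 0x00/0x20/0x40/0x60(%rsp), x4..x15 stay in %ymm4..%ymm15, %ymm0 is
	# the scratch register, and %ymm1/%ymm2/%ymm3 hold CTRINC, ROT8 and
	# ROT16 for the duration of the loop.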
.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	sub $2,%r8d
	jnz .Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpaddd 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpbroadcastd 0x04(%rdi),%ymm0
	vpaddd 0x20(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpbroadcastd 0x08(%rdi),%ymm0
	vpaddd 0x40(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpbroadcastd 0x0c(%rdi),%ymm0
	vpaddd 0x60(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpbroadcastd 0x10(%rdi),%ymm0
	vpaddd %ymm0,%ymm4,%ymm4
	vpbroadcastd 0x14(%rdi),%ymm0
	vpaddd %ymm0,%ymm5,%ymm5
	vpbroadcastd 0x18(%rdi),%ymm0
	vpaddd %ymm0,%ymm6,%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm0
	vpaddd %ymm0,%ymm7,%ymm7
	vpbroadcastd 0x20(%rdi),%ymm0
	vpaddd %ymm0,%ymm8,%ymm8
	vpbroadcastd 0x24(%rdi),%ymm0
	vpaddd %ymm0,%ymm9,%ymm9
	vpbroadcastd 0x28(%rdi),%ymm0
	vpaddd %ymm0,%ymm10,%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm0
	vpaddd %ymm0,%ymm11,%ymm11
	vpbroadcastd 0x30(%rdi),%ymm0
	vpaddd %ymm0,%ymm12,%ymm12
	vpbroadcastd 0x34(%rdi),%ymm0
	vpaddd %ymm0,%ymm13,%ymm13
	vpbroadcastd 0x38(%rdi),%ymm0
	vpaddd %ymm0,%ymm14,%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm0
	vpaddd %ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

	# interleave 32-bit words in state n, n+1
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x20(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa 0x40(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm1,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpckldq %ymm5,%ymm0,%ymm4
	vpunpckhdq %ymm5,%ymm0,%ymm5
	vmovdqa %ymm6,%ymm0
	vpunpckldq %ymm7,%ymm0,%ymm6
	vpunpckhdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpckldq %ymm9,%ymm0,%ymm8
	vpunpckhdq %ymm9,%ymm0,%ymm9
	vmovdqa %ymm10,%ymm0
	vpunpckldq %ymm11,%ymm0,%ymm10
	vpunpckhdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpckldq %ymm13,%ymm0,%ymm12
	vpunpckhdq %ymm13,%ymm0,%ymm13
	vmovdqa %ymm14,%ymm0
	vpunpckldq %ymm15,%ymm0,%ymm14
	vpunpckhdq %ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x40(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x00(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa 0x20(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpcklqdq %ymm6,%ymm0,%ymm4
	vpunpckhqdq %ymm6,%ymm0,%ymm6
	vmovdqa %ymm5,%ymm0
	vpunpcklqdq %ymm7,%ymm0,%ymm5
	vpunpckhqdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpcklqdq %ymm10,%ymm0,%ymm8
	vpunpckhqdq %ymm10,%ymm0,%ymm10
	vmovdqa %ymm9,%ymm0
	vpunpcklqdq %ymm11,%ymm0,%ymm9
	vpunpckhqdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpcklqdq %ymm14,%ymm0,%ymm12
	vpunpckhqdq %ymm14,%ymm0,%ymm14
	vmovdqa %ymm13,%ymm0
	vpunpcklqdq %ymm15,%ymm0,%ymm13
	vpunpckhqdq %ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa 0x00(%rsp),%ymm1
	vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
	cmp $0x0020,%rax
	jl .Lxorpart8
	vpxor 0x0000(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0000(%rsi)
	vperm2i128 $0x31,%ymm4,%ymm1,%ymm4

	vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
	cmp $0x0040,%rax
	jl .Lxorpart8
	vpxor 0x0020(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0020(%rsi)
	vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

	vmovdqa 0x40(%rsp),%ymm1
	vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
	cmp $0x0060,%rax
	jl .Lxorpart8
	vpxor 0x0040(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0040(%rsi)
	vperm2i128 $0x31,%ymm6,%ymm1,%ymm6

	vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
	cmp $0x0080,%rax
	jl .Lxorpart8
	vpxor 0x0060(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0060(%rsi)
	vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

	vmovdqa 0x20(%rsp),%ymm1
	vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
	cmp $0x00a0,%rax
	jl .Lxorpart8
	vpxor 0x0080(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0080(%rsi)
	vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

	vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
	cmp $0x00c0,%rax
	jl .Lxorpart8
	vpxor 0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00a0(%rsi)
	vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

	vmovdqa 0x60(%rsp),%ymm1
	vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
	cmp $0x00e0,%rax
	jl .Lxorpart8
	vpxor 0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00c0(%rsi)
	vperm2i128 $0x31,%ymm7,%ymm1,%ymm7

	vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
	cmp $0x0100,%rax
	jl .Lxorpart8
	vpxor 0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00e0(%rsi)
	vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa %ymm4,%ymm0
	cmp $0x0120,%rax
	jl .Lxorpart8
	vpxor 0x0100(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0100(%rsi)

	vmovdqa %ymm12,%ymm0
	cmp $0x0140,%rax
	jl .Lxorpart8
	vpxor 0x0120(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0120(%rsi)

	vmovdqa %ymm6,%ymm0
	cmp $0x0160,%rax
	jl .Lxorpart8
	vpxor 0x0140(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0140(%rsi)

	vmovdqa %ymm14,%ymm0
	cmp $0x0180,%rax
	jl .Lxorpart8
	vpxor 0x0160(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0160(%rsi)

	vmovdqa %ymm5,%ymm0
	cmp $0x01a0,%rax
	jl .Lxorpart8
	vpxor 0x0180(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0180(%rsi)

	vmovdqa %ymm13,%ymm0
	cmp $0x01c0,%rax
	jl .Lxorpart8
	vpxor 0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01a0(%rsi)

	vmovdqa %ymm7,%ymm0
	cmp $0x01e0,%rax
	jl .Lxorpart8
	vpxor 0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01c0(%rsi)

	vmovdqa %ymm15,%ymm0
	cmp $0x0200,%rax
	jl .Lxorpart8
	vpxor 0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea -8(%r10),%rsp
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x1f,%r9
	jz .Ldone8
	and $~0x1f,%rax

	mov %rsi,%r11

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	jmp .Ldone8

SYM_FUNC_END(chacha_8block_xor_avx2)
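
# For reference, callers are expected to declare these routines with C
# prototypes along the following lines (the exact declarations live in the
# kernel's x86 ChaCha glue code and may differ in detail):
#
#	asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst,
#					       const u8 *src,
#					       unsigned int len, int nrounds);
#	asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst,
#					       const u8 *src,
#					       unsigned int len, int nrounds);
#	asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst,
#					       const u8 *src,
#					       unsigned int len, int nrounds);
#
# i.e. %rdi = state, %rsi = dst, %rdx = src, %rcx = len and %r8d = nrounds
# under the System V AMD64 calling convention, matching the register
# comments at the top of each function.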