cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

chacha-avx512vl-x86_64.S (20441B)


/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
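
# CTR2BL and CTR4BL add block-counter increments 0,1 and 2,3 to the two
# 128-bit lanes of the 2- and 4-block variants; CTR8BL holds the dwords
# 0..7 that are added lane-wise to the counter row in the 8-block variant.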

.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
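	#
	# Block 0 lives in the low 128-bit lane of each ymm register and
	# block 1 in the high lane: vbroadcasti128 copies every state row
	# into both lanes, and adding CTR2BL bumps the block counter of the
	# high lane by one. ymm8-11 keep a copy of this initial state (with
	# the per-block counters) for the final keystream addition.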

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

.Ldoubleround:
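	# Each pass through this loop is one ChaCha double round: the first
	# four add/xor/rotate groups form the column round, the vpshufd
	# shuffles rotate the words of rows 1-3 into diagonal position for
	# the second set of quarter-rounds, and the final shuffles undo the
	# rotation.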

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rcx
	jl		.Lxorpart2
	vpxord		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rcx
	jl		.Lxorpart2
	vpxord		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rcx
	jl		.Lxorpart2
	vpxord		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rcx
	jl		.Lxorpart2
	vpxord		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rcx
	jl		.Lxorpart2
	vpxord		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rcx
	jl		.Lxorpart2
	vpxord		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rcx
	jl		.Lxorpart2
	vpxord		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rcx
	jl		.Lxorpart2
	vpxord		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
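	# Handle a trailing partial 16-byte chunk: %r9 becomes the offset of
	# that chunk, and %k1 is loaded with the byte mask (1 << (len & 15)) - 1
	# so that the masked AVX-512 load/xor/store below only touches the
	# remaining bytes. %xmm7 already holds the keystream for this chunk.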
	mov		%rcx,%rax
	and		$0xf,%rcx
	jz		.Ldone2
	mov		%rax,%r9
	and		$~0xf,%r9

	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp		.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, interleaved
	# with the operations on the four words of the other two matrices.
	# Since the required word shuffling has a rather high latency, the
	# arithmetic on the second matrix pair can be slotted in without
	# much slowdown.
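	#
	# ymm0-3 hold rows 0-3 for blocks 0/1 and ymm4-7 the same rows for
	# blocks 2/3, one block per 128-bit lane as in the 2-block variant.
	# Rows 0-2 of both pairs start out identical, so a single saved copy
	# of each (ymm11-13) plus the two counter rows (ymm14 for blocks 0/1,
	# ymm15 for blocks 2/3) is enough for the final keystream addition.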

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rcx
	jl		.Lxorpart4
	vpxord		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rcx
	jl		.Lxorpart4
	vpxord		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rcx
	jl		.Lxorpart4
	vpxord		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rcx
	jl		.Lxorpart4
	vpxord		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rcx
	jl		.Lxorpart4
	vpxord		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rcx
	jl		.Lxorpart4
	vpxord		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rcx
	jl		.Lxorpart4
	vpxord		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rcx
	jl		.Lxorpart4
	vpxord		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rcx
	jl		.Lxorpart4
	vpxord		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rcx
	jl		.Lxorpart4
	vpxord		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rcx
	jl		.Lxorpart4
	vpxord		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rcx
	jl		.Lxorpart4
	vpxord		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rcx
	jl		.Lxorpart4
	vpxord		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rcx
	jl		.Lxorpart4
	vpxord		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rcx
	jl		.Lxorpart4
	vpxord		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rcx
	jl		.Lxorpart4
	vpxord		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rcx,%rax
	and		$0xf,%rcx
	jz		.Ldone4
	mov		%rax,%r9
	and		$~0xf,%r9

	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix into AVX registers eight times. Compared to the
	# AVX2 variant, it mostly benefits from the new rotate instructions
	# and the additional registers provided by AVX-512VL.
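	#
	# Here the layout is word-sliced: each of ymm0-15 holds one of the
	# 16 state words, broadcast to all eight 32-bit lanes (one lane per
	# block), and CTR8BL turns the counter lanes of ymm12 into the eight
	# per-block counters. The initial state is saved in ymm16-31, which
	# needs the extended register file provided by AVX-512VL.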

	vzeroupper

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd		CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4

	sub		$2,%r8d
	jnz		.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpaddd		%ymm16,%ymm0,%ymm0
	vpaddd		%ymm17,%ymm1,%ymm1
	vpaddd		%ymm18,%ymm2,%ymm2
	vpaddd		%ymm19,%ymm3,%ymm3
	vpaddd		%ymm20,%ymm4,%ymm4
	vpaddd		%ymm21,%ymm5,%ymm5
	vpaddd		%ymm22,%ymm6,%ymm6
	vpaddd		%ymm23,%ymm7,%ymm7
	vpaddd		%ymm24,%ymm8,%ymm8
	vpaddd		%ymm25,%ymm9,%ymm9
	vpaddd		%ymm26,%ymm10,%ymm10
	vpaddd		%ymm27,%ymm11,%ymm11
	vpaddd		%ymm28,%ymm12,%ymm12
	vpaddd		%ymm29,%ymm13,%ymm13
	vpaddd		%ymm30,%ymm14,%ymm14
	vpaddd		%ymm31,%ymm15,%ymm15

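	# After adding the saved state, register n still holds word n of all
	# eight blocks. The 32-, 64- and 128-bit interleave steps below
	# transpose that layout so each ymm register ends up holding eight
	# consecutive words (32 bytes) of a single block's keystream, which
	# is then xor'ed against the input one 32-byte chunk at a time.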
	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	cmp		$0x0020,%rcx
	jl		.Lxorpart8
	vpxord		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp		$0x0040,%rcx
	jl		.Lxorpart8
	vpxord		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	cmp		$0x0060,%rcx
	jl		.Lxorpart8
	vpxord		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp		$0x0080,%rcx
	jl		.Lxorpart8
	vpxord		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp		$0x00a0,%rcx
	jl		.Lxorpart8
	vpxord		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp		$0x00c0,%rcx
	jl		.Lxorpart8
	vpxord		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	cmp		$0x00e0,%rcx
	jl		.Lxorpart8
	vpxord		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp		$0x0100,%rcx
	jl		.Lxorpart8
	vpxord		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	cmp		$0x0120,%rcx
	jl		.Lxorpart8
	vpxord		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	cmp		$0x0140,%rcx
	jl		.Lxorpart8
	vpxord		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	cmp		$0x0160,%rcx
	jl		.Lxorpart8
	vpxord		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	cmp		$0x0180,%rcx
	jl		.Lxorpart8
	vpxord		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	cmp		$0x01a0,%rcx
	jl		.Lxorpart8
	vpxord		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	cmp		$0x01c0,%rcx
	jl		.Lxorpart8
	vpxord		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	cmp		$0x01e0,%rcx
	jl		.Lxorpart8
	vpxord		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	cmp		$0x0200,%rcx
	jl		.Lxorpart8
	vpxord		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
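	# Same byte-masking scheme as .Lxorpart2, but at 32-byte granularity:
	# %k1 selects the low (len & 31) bytes of the final ymm-sized chunk.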
	mov		%rcx,%rax
	and		$0x1f,%rcx
	jz		.Ldone8
	mov		%rax,%r9
	and		$~0x1f,%r9

	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord		%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

	jmp		.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)