cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

chacha-avx2-x86_64.S (25009B)


      1/* SPDX-License-Identifier: GPL-2.0-or-later */
      2/*
      3 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
      4 *
      5 * Copyright (C) 2015 Martin Willi
      6 */
      7
      8#include <linux/linkage.h>
      9
     10.section	.rodata.cst32.ROT8, "aM", @progbits, 32
     11.align 32
     12ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
     13	.octa 0x0e0d0c0f0a09080b0605040702010003
     14
     15.section	.rodata.cst32.ROT16, "aM", @progbits, 32
     16.align 32
     17ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
     18	.octa 0x0d0c0f0e09080b0a0504070601000302
     19
     20.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
     21.align 32
     22CTRINC:	.octa 0x00000003000000020000000100000000
     23	.octa 0x00000007000000060000000500000004
     24
     25.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
     26.align 32
     27CTR2BL:	.octa 0x00000000000000000000000000000000
     28	.octa 0x00000000000000000000000000000001
     29
     30.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
     31.align 32
     32CTR4BL:	.octa 0x00000000000000000000000000000002
     33	.octa 0x00000000000000000000000000000003
     34
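ROT8 and ROT16 above are vpshufb masks: rotating a 32-bit word left by 8 or 16 bits only permutes its bytes, so a single byte shuffle replaces the shift/shift/or sequence that the 7- and 12-bit rotates still need. CTR2BL, CTR4BL and CTRINC are added to the state row whose first word is the block counter (word 12), giving the parallel block copies consecutive counter values. A standalone C sketch (not part of this file or of any kernel build, little-endian host assumed) that checks the ROT16 mask against a plain 32-bit rotation:

/*
 * Standalone sketch: verifies that the ROT16 byte-shuffle mask above is
 * equivalent to rotl32(x, 16) on each 32-bit word of a 128-bit lane.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint32_t rotl32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* vpshufb semantics within one 128-bit lane: out[i] = in[mask[i] & 0x0f] */
static void pshufb_lane(uint8_t out[16], const uint8_t in[16],
			const uint8_t mask[16])
{
	for (int i = 0; i < 16; i++)
		out[i] = in[mask[i] & 0x0f];
}

int main(void)
{
	/* ROT16 mask, least significant byte first (same bytes as the .octa) */
	static const uint8_t rot16[16] = {
		0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
		0x0a, 0x0b, 0x08, 0x09, 0x0e, 0x0f, 0x0c, 0x0d,
	};
	uint32_t words[4] = { 0x01234567, 0x89abcdef, 0xdeadbeef, 0x00c0ffee };
	uint32_t res[4];
	uint8_t in[16], out[16];

	memcpy(in, words, sizeof(in));
	pshufb_lane(out, in, rot16);
	memcpy(res, out, sizeof(res));
	for (int i = 0; i < 4; i++)
		assert(res[i] == rotl32(words[i], 16));
	return 0;
}

The same check with the ROT8 byte order (0x03, 0x00, 0x01, 0x02 per dword) matches rotl32(x, 8).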
     35.text
     36
     37SYM_FUNC_START(chacha_2block_xor_avx2)
     38	# %rdi: Input state matrix, s
     39	# %rsi: up to 2 data blocks output, o
     40	# %rdx: up to 2 data blocks input, i
     41	# %rcx: input/output length in bytes
     42	# %r8d: nrounds
     43
     44	# This function encrypts two ChaCha blocks by loading the state
     45	# matrix twice across four AVX registers. It performs matrix operations
     46	# on four words in each matrix in parallel, but requires shuffling to
     47	# rearrange the words after each round.
     48
     49	vzeroupper
     50
     51	# x0..3[0-2] = s0..3
     52	vbroadcasti128	0x00(%rdi),%ymm0
     53	vbroadcasti128	0x10(%rdi),%ymm1
     54	vbroadcasti128	0x20(%rdi),%ymm2
     55	vbroadcasti128	0x30(%rdi),%ymm3
     56
     57	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
     58
     59	vmovdqa		%ymm0,%ymm8
     60	vmovdqa		%ymm1,%ymm9
     61	vmovdqa		%ymm2,%ymm10
     62	vmovdqa		%ymm3,%ymm11
     63
     64	vmovdqa		ROT8(%rip),%ymm4
     65	vmovdqa		ROT16(%rip),%ymm5
     66
     67	mov		%rcx,%rax
     68
     69.Ldoubleround:
     70
     71	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
     72	vpaddd		%ymm1,%ymm0,%ymm0
     73	vpxor		%ymm0,%ymm3,%ymm3
     74	vpshufb		%ymm5,%ymm3,%ymm3
     75
     76	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
     77	vpaddd		%ymm3,%ymm2,%ymm2
     78	vpxor		%ymm2,%ymm1,%ymm1
     79	vmovdqa		%ymm1,%ymm6
     80	vpslld		$12,%ymm6,%ymm6
     81	vpsrld		$20,%ymm1,%ymm1
     82	vpor		%ymm6,%ymm1,%ymm1
     83
     84	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
     85	vpaddd		%ymm1,%ymm0,%ymm0
     86	vpxor		%ymm0,%ymm3,%ymm3
     87	vpshufb		%ymm4,%ymm3,%ymm3
     88
     89	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
     90	vpaddd		%ymm3,%ymm2,%ymm2
     91	vpxor		%ymm2,%ymm1,%ymm1
     92	vmovdqa		%ymm1,%ymm7
     93	vpslld		$7,%ymm7,%ymm7
     94	vpsrld		$25,%ymm1,%ymm1
     95	vpor		%ymm7,%ymm1,%ymm1
     96
     97	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
     98	vpshufd		$0x39,%ymm1,%ymm1
     99	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
    100	vpshufd		$0x4e,%ymm2,%ymm2
    101	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
    102	vpshufd		$0x93,%ymm3,%ymm3
    103
    104	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
    105	vpaddd		%ymm1,%ymm0,%ymm0
    106	vpxor		%ymm0,%ymm3,%ymm3
    107	vpshufb		%ymm5,%ymm3,%ymm3
    108
    109	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
    110	vpaddd		%ymm3,%ymm2,%ymm2
    111	vpxor		%ymm2,%ymm1,%ymm1
    112	vmovdqa		%ymm1,%ymm6
    113	vpslld		$12,%ymm6,%ymm6
    114	vpsrld		$20,%ymm1,%ymm1
    115	vpor		%ymm6,%ymm1,%ymm1
    116
    117	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
    118	vpaddd		%ymm1,%ymm0,%ymm0
    119	vpxor		%ymm0,%ymm3,%ymm3
    120	vpshufb		%ymm4,%ymm3,%ymm3
    121
    122	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
    123	vpaddd		%ymm3,%ymm2,%ymm2
    124	vpxor		%ymm2,%ymm1,%ymm1
    125	vmovdqa		%ymm1,%ymm7
    126	vpslld		$7,%ymm7,%ymm7
    127	vpsrld		$25,%ymm1,%ymm1
    128	vpor		%ymm7,%ymm1,%ymm1
    129
    130	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
    131	vpshufd		$0x93,%ymm1,%ymm1
    132	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
    133	vpshufd		$0x4e,%ymm2,%ymm2
    134	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
    135	vpshufd		$0x39,%ymm3,%ymm3
    136
    137	sub		$2,%r8d
    138	jnz		.Ldoubleround
    139
    140	# o0 = i0 ^ (x0 + s0)
    141	vpaddd		%ymm8,%ymm0,%ymm7
    142	cmp		$0x10,%rax
    143	jl		.Lxorpart2
    144	vpxor		0x00(%rdx),%xmm7,%xmm6
    145	vmovdqu		%xmm6,0x00(%rsi)
    146	vextracti128	$1,%ymm7,%xmm0
    147	# o1 = i1 ^ (x1 + s1)
    148	vpaddd		%ymm9,%ymm1,%ymm7
    149	cmp		$0x20,%rax
    150	jl		.Lxorpart2
    151	vpxor		0x10(%rdx),%xmm7,%xmm6
    152	vmovdqu		%xmm6,0x10(%rsi)
    153	vextracti128	$1,%ymm7,%xmm1
    154	# o2 = i2 ^ (x2 + s2)
    155	vpaddd		%ymm10,%ymm2,%ymm7
    156	cmp		$0x30,%rax
    157	jl		.Lxorpart2
    158	vpxor		0x20(%rdx),%xmm7,%xmm6
    159	vmovdqu		%xmm6,0x20(%rsi)
    160	vextracti128	$1,%ymm7,%xmm2
    161	# o3 = i3 ^ (x3 + s3)
    162	vpaddd		%ymm11,%ymm3,%ymm7
    163	cmp		$0x40,%rax
    164	jl		.Lxorpart2
    165	vpxor		0x30(%rdx),%xmm7,%xmm6
    166	vmovdqu		%xmm6,0x30(%rsi)
    167	vextracti128	$1,%ymm7,%xmm3
    168
    169	# xor and write second block
    170	vmovdqa		%xmm0,%xmm7
    171	cmp		$0x50,%rax
    172	jl		.Lxorpart2
    173	vpxor		0x40(%rdx),%xmm7,%xmm6
    174	vmovdqu		%xmm6,0x40(%rsi)
    175
    176	vmovdqa		%xmm1,%xmm7
    177	cmp		$0x60,%rax
    178	jl		.Lxorpart2
    179	vpxor		0x50(%rdx),%xmm7,%xmm6
    180	vmovdqu		%xmm6,0x50(%rsi)
    181
    182	vmovdqa		%xmm2,%xmm7
    183	cmp		$0x70,%rax
    184	jl		.Lxorpart2
    185	vpxor		0x60(%rdx),%xmm7,%xmm6
    186	vmovdqu		%xmm6,0x60(%rsi)
    187
    188	vmovdqa		%xmm3,%xmm7
    189	cmp		$0x80,%rax
    190	jl		.Lxorpart2
    191	vpxor		0x70(%rdx),%xmm7,%xmm6
    192	vmovdqu		%xmm6,0x70(%rsi)
    193
    194.Ldone2:
    195	vzeroupper
    196	RET
    197
    198.Lxorpart2:
    199	# xor remaining bytes from partial register into output
    200	mov		%rax,%r9
    201	and		$0x0f,%r9
    202	jz		.Ldone2
    203	and		$~0x0f,%rax
    204
    205	mov		%rsi,%r11
    206
    207	lea		8(%rsp),%r10
    208	sub		$0x10,%rsp
    209	and		$~31,%rsp
    210
    211	lea		(%rdx,%rax),%rsi
    212	mov		%rsp,%rdi
    213	mov		%r9,%rcx
    214	rep movsb
    215
    216	vpxor		0x00(%rsp),%xmm7,%xmm7
    217	vmovdqa		%xmm7,0x00(%rsp)
    218
    219	mov		%rsp,%rsi
    220	lea		(%r11,%rax),%rdi
    221	mov		%r9,%rcx
    222	rep movsb
    223
    224	lea		-8(%r10),%rsp
    225	jmp		.Ldone2
    226
    227SYM_FUNC_END(chacha_2block_xor_avx2)
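The .Ldoubleround loop in chacha_2block_xor_avx2 is the vector form of the standard ChaCha double round: ymm0..ymm3 each hold one row of the 4x4 state, duplicated across the two 128-bit lanes for the two blocks, the vpshufb/shift sequences implement the 16/12/8/7-bit rotates, and the vpshufd shuffles rotate rows 1-3 so that the second half of the loop works on the diagonals. A scalar C reference of what one loop iteration computes per block (a sketch for orientation, not code from this repository):

#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* One ChaCha quarter-round: the add/xor/rotate pattern annotated in the
 * assembly comments (rotates by 16, 12, 8, 7). */
#define QUARTERROUND(a, b, c, d) do {		\
	a += b; d = ROTL32(d ^ a, 16);		\
	c += d; b = ROTL32(b ^ c, 12);		\
	a += b; d = ROTL32(d ^ a, 8);		\
	c += d; b = ROTL32(b ^ c, 7);		\
} while (0)

/* One double round over a 4x4 state of 32-bit words (x[0..15]). */
void chacha_doubleround_ref(uint32_t x[16])
{
	/* column round: quarter-rounds down the columns */
	QUARTERROUND(x[0], x[4], x[8],  x[12]);
	QUARTERROUND(x[1], x[5], x[9],  x[13]);
	QUARTERROUND(x[2], x[6], x[10], x[14]);
	QUARTERROUND(x[3], x[7], x[11], x[15]);
	/* diagonal round: the vpshufd rotations of rows 1-3 select these */
	QUARTERROUND(x[0], x[5], x[10], x[15]);
	QUARTERROUND(x[1], x[6], x[11], x[12]);
	QUARTERROUND(x[2], x[7], x[8],  x[13]);
	QUARTERROUND(x[3], x[4], x[9],  x[14]);
}

With nrounds = 20 (ChaCha20), the sub $2,%r8d / jnz pair runs this ten times before the original state words are added back and the result is XORed with the input.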
    228
    229SYM_FUNC_START(chacha_4block_xor_avx2)
    230	# %rdi: Input state matrix, s
    231	# %rsi: up to 4 data blocks output, o
    232	# %rdx: up to 4 data blocks input, i
    233	# %rcx: input/output length in bytes
    234	# %r8d: nrounds
    235
     236	# This function encrypts four ChaCha blocks by loading the state
     237	# matrix four times across eight AVX registers. It performs matrix
     238	# operations on four words in two matrices in parallel, sequenced
     239	# with the operations on the four words of the other two matrices.
     240	# Since the word shuffling has a rather high latency, the arithmetic
     241	# on two matrix-pairs can be interleaved without much slowdown.
    242
    243	vzeroupper
    244
    245	# x0..3[0-4] = s0..3
    246	vbroadcasti128	0x00(%rdi),%ymm0
    247	vbroadcasti128	0x10(%rdi),%ymm1
    248	vbroadcasti128	0x20(%rdi),%ymm2
    249	vbroadcasti128	0x30(%rdi),%ymm3
    250
    251	vmovdqa		%ymm0,%ymm4
    252	vmovdqa		%ymm1,%ymm5
    253	vmovdqa		%ymm2,%ymm6
    254	vmovdqa		%ymm3,%ymm7
    255
    256	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
    257	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
    258
    259	vmovdqa		%ymm0,%ymm11
    260	vmovdqa		%ymm1,%ymm12
    261	vmovdqa		%ymm2,%ymm13
    262	vmovdqa		%ymm3,%ymm14
    263	vmovdqa		%ymm7,%ymm15
    264
    265	vmovdqa		ROT8(%rip),%ymm8
    266	vmovdqa		ROT16(%rip),%ymm9
    267
    268	mov		%rcx,%rax
    269
    270.Ldoubleround4:
    271
    272	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
    273	vpaddd		%ymm1,%ymm0,%ymm0
    274	vpxor		%ymm0,%ymm3,%ymm3
    275	vpshufb		%ymm9,%ymm3,%ymm3
    276
    277	vpaddd		%ymm5,%ymm4,%ymm4
    278	vpxor		%ymm4,%ymm7,%ymm7
    279	vpshufb		%ymm9,%ymm7,%ymm7
    280
    281	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
    282	vpaddd		%ymm3,%ymm2,%ymm2
    283	vpxor		%ymm2,%ymm1,%ymm1
    284	vmovdqa		%ymm1,%ymm10
    285	vpslld		$12,%ymm10,%ymm10
    286	vpsrld		$20,%ymm1,%ymm1
    287	vpor		%ymm10,%ymm1,%ymm1
    288
    289	vpaddd		%ymm7,%ymm6,%ymm6
    290	vpxor		%ymm6,%ymm5,%ymm5
    291	vmovdqa		%ymm5,%ymm10
    292	vpslld		$12,%ymm10,%ymm10
    293	vpsrld		$20,%ymm5,%ymm5
    294	vpor		%ymm10,%ymm5,%ymm5
    295
    296	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
    297	vpaddd		%ymm1,%ymm0,%ymm0
    298	vpxor		%ymm0,%ymm3,%ymm3
    299	vpshufb		%ymm8,%ymm3,%ymm3
    300
    301	vpaddd		%ymm5,%ymm4,%ymm4
    302	vpxor		%ymm4,%ymm7,%ymm7
    303	vpshufb		%ymm8,%ymm7,%ymm7
    304
    305	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
    306	vpaddd		%ymm3,%ymm2,%ymm2
    307	vpxor		%ymm2,%ymm1,%ymm1
    308	vmovdqa		%ymm1,%ymm10
    309	vpslld		$7,%ymm10,%ymm10
    310	vpsrld		$25,%ymm1,%ymm1
    311	vpor		%ymm10,%ymm1,%ymm1
    312
    313	vpaddd		%ymm7,%ymm6,%ymm6
    314	vpxor		%ymm6,%ymm5,%ymm5
    315	vmovdqa		%ymm5,%ymm10
    316	vpslld		$7,%ymm10,%ymm10
    317	vpsrld		$25,%ymm5,%ymm5
    318	vpor		%ymm10,%ymm5,%ymm5
    319
    320	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
    321	vpshufd		$0x39,%ymm1,%ymm1
    322	vpshufd		$0x39,%ymm5,%ymm5
    323	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
    324	vpshufd		$0x4e,%ymm2,%ymm2
    325	vpshufd		$0x4e,%ymm6,%ymm6
    326	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
    327	vpshufd		$0x93,%ymm3,%ymm3
    328	vpshufd		$0x93,%ymm7,%ymm7
    329
    330	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
    331	vpaddd		%ymm1,%ymm0,%ymm0
    332	vpxor		%ymm0,%ymm3,%ymm3
    333	vpshufb		%ymm9,%ymm3,%ymm3
    334
    335	vpaddd		%ymm5,%ymm4,%ymm4
    336	vpxor		%ymm4,%ymm7,%ymm7
    337	vpshufb		%ymm9,%ymm7,%ymm7
    338
    339	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
    340	vpaddd		%ymm3,%ymm2,%ymm2
    341	vpxor		%ymm2,%ymm1,%ymm1
    342	vmovdqa		%ymm1,%ymm10
    343	vpslld		$12,%ymm10,%ymm10
    344	vpsrld		$20,%ymm1,%ymm1
    345	vpor		%ymm10,%ymm1,%ymm1
    346
    347	vpaddd		%ymm7,%ymm6,%ymm6
    348	vpxor		%ymm6,%ymm5,%ymm5
    349	vmovdqa		%ymm5,%ymm10
    350	vpslld		$12,%ymm10,%ymm10
    351	vpsrld		$20,%ymm5,%ymm5
    352	vpor		%ymm10,%ymm5,%ymm5
    353
    354	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
    355	vpaddd		%ymm1,%ymm0,%ymm0
    356	vpxor		%ymm0,%ymm3,%ymm3
    357	vpshufb		%ymm8,%ymm3,%ymm3
    358
    359	vpaddd		%ymm5,%ymm4,%ymm4
    360	vpxor		%ymm4,%ymm7,%ymm7
    361	vpshufb		%ymm8,%ymm7,%ymm7
    362
    363	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
    364	vpaddd		%ymm3,%ymm2,%ymm2
    365	vpxor		%ymm2,%ymm1,%ymm1
    366	vmovdqa		%ymm1,%ymm10
    367	vpslld		$7,%ymm10,%ymm10
    368	vpsrld		$25,%ymm1,%ymm1
    369	vpor		%ymm10,%ymm1,%ymm1
    370
    371	vpaddd		%ymm7,%ymm6,%ymm6
    372	vpxor		%ymm6,%ymm5,%ymm5
    373	vmovdqa		%ymm5,%ymm10
    374	vpslld		$7,%ymm10,%ymm10
    375	vpsrld		$25,%ymm5,%ymm5
    376	vpor		%ymm10,%ymm5,%ymm5
    377
    378	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
    379	vpshufd		$0x93,%ymm1,%ymm1
    380	vpshufd		$0x93,%ymm5,%ymm5
    381	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
    382	vpshufd		$0x4e,%ymm2,%ymm2
    383	vpshufd		$0x4e,%ymm6,%ymm6
    384	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
    385	vpshufd		$0x39,%ymm3,%ymm3
    386	vpshufd		$0x39,%ymm7,%ymm7
    387
    388	sub		$2,%r8d
    389	jnz		.Ldoubleround4
    390
    391	# o0 = i0 ^ (x0 + s0), first block
    392	vpaddd		%ymm11,%ymm0,%ymm10
    393	cmp		$0x10,%rax
    394	jl		.Lxorpart4
    395	vpxor		0x00(%rdx),%xmm10,%xmm9
    396	vmovdqu		%xmm9,0x00(%rsi)
    397	vextracti128	$1,%ymm10,%xmm0
    398	# o1 = i1 ^ (x1 + s1), first block
    399	vpaddd		%ymm12,%ymm1,%ymm10
    400	cmp		$0x20,%rax
    401	jl		.Lxorpart4
    402	vpxor		0x10(%rdx),%xmm10,%xmm9
    403	vmovdqu		%xmm9,0x10(%rsi)
    404	vextracti128	$1,%ymm10,%xmm1
    405	# o2 = i2 ^ (x2 + s2), first block
    406	vpaddd		%ymm13,%ymm2,%ymm10
    407	cmp		$0x30,%rax
    408	jl		.Lxorpart4
    409	vpxor		0x20(%rdx),%xmm10,%xmm9
    410	vmovdqu		%xmm9,0x20(%rsi)
    411	vextracti128	$1,%ymm10,%xmm2
    412	# o3 = i3 ^ (x3 + s3), first block
    413	vpaddd		%ymm14,%ymm3,%ymm10
    414	cmp		$0x40,%rax
    415	jl		.Lxorpart4
    416	vpxor		0x30(%rdx),%xmm10,%xmm9
    417	vmovdqu		%xmm9,0x30(%rsi)
    418	vextracti128	$1,%ymm10,%xmm3
    419
    420	# xor and write second block
    421	vmovdqa		%xmm0,%xmm10
    422	cmp		$0x50,%rax
    423	jl		.Lxorpart4
    424	vpxor		0x40(%rdx),%xmm10,%xmm9
    425	vmovdqu		%xmm9,0x40(%rsi)
    426
    427	vmovdqa		%xmm1,%xmm10
    428	cmp		$0x60,%rax
    429	jl		.Lxorpart4
    430	vpxor		0x50(%rdx),%xmm10,%xmm9
    431	vmovdqu		%xmm9,0x50(%rsi)
    432
    433	vmovdqa		%xmm2,%xmm10
    434	cmp		$0x70,%rax
    435	jl		.Lxorpart4
    436	vpxor		0x60(%rdx),%xmm10,%xmm9
    437	vmovdqu		%xmm9,0x60(%rsi)
    438
    439	vmovdqa		%xmm3,%xmm10
    440	cmp		$0x80,%rax
    441	jl		.Lxorpart4
    442	vpxor		0x70(%rdx),%xmm10,%xmm9
    443	vmovdqu		%xmm9,0x70(%rsi)
    444
    445	# o0 = i0 ^ (x0 + s0), third block
    446	vpaddd		%ymm11,%ymm4,%ymm10
    447	cmp		$0x90,%rax
    448	jl		.Lxorpart4
    449	vpxor		0x80(%rdx),%xmm10,%xmm9
    450	vmovdqu		%xmm9,0x80(%rsi)
    451	vextracti128	$1,%ymm10,%xmm4
    452	# o1 = i1 ^ (x1 + s1), third block
    453	vpaddd		%ymm12,%ymm5,%ymm10
    454	cmp		$0xa0,%rax
    455	jl		.Lxorpart4
    456	vpxor		0x90(%rdx),%xmm10,%xmm9
    457	vmovdqu		%xmm9,0x90(%rsi)
    458	vextracti128	$1,%ymm10,%xmm5
    459	# o2 = i2 ^ (x2 + s2), third block
    460	vpaddd		%ymm13,%ymm6,%ymm10
    461	cmp		$0xb0,%rax
    462	jl		.Lxorpart4
    463	vpxor		0xa0(%rdx),%xmm10,%xmm9
    464	vmovdqu		%xmm9,0xa0(%rsi)
    465	vextracti128	$1,%ymm10,%xmm6
    466	# o3 = i3 ^ (x3 + s3), third block
    467	vpaddd		%ymm15,%ymm7,%ymm10
    468	cmp		$0xc0,%rax
    469	jl		.Lxorpart4
    470	vpxor		0xb0(%rdx),%xmm10,%xmm9
    471	vmovdqu		%xmm9,0xb0(%rsi)
    472	vextracti128	$1,%ymm10,%xmm7
    473
    474	# xor and write fourth block
    475	vmovdqa		%xmm4,%xmm10
    476	cmp		$0xd0,%rax
    477	jl		.Lxorpart4
    478	vpxor		0xc0(%rdx),%xmm10,%xmm9
    479	vmovdqu		%xmm9,0xc0(%rsi)
    480
    481	vmovdqa		%xmm5,%xmm10
    482	cmp		$0xe0,%rax
    483	jl		.Lxorpart4
    484	vpxor		0xd0(%rdx),%xmm10,%xmm9
    485	vmovdqu		%xmm9,0xd0(%rsi)
    486
    487	vmovdqa		%xmm6,%xmm10
    488	cmp		$0xf0,%rax
    489	jl		.Lxorpart4
    490	vpxor		0xe0(%rdx),%xmm10,%xmm9
    491	vmovdqu		%xmm9,0xe0(%rsi)
    492
    493	vmovdqa		%xmm7,%xmm10
    494	cmp		$0x100,%rax
    495	jl		.Lxorpart4
    496	vpxor		0xf0(%rdx),%xmm10,%xmm9
    497	vmovdqu		%xmm9,0xf0(%rsi)
    498
    499.Ldone4:
    500	vzeroupper
    501	RET
    502
    503.Lxorpart4:
    504	# xor remaining bytes from partial register into output
    505	mov		%rax,%r9
    506	and		$0x0f,%r9
    507	jz		.Ldone4
    508	and		$~0x0f,%rax
    509
    510	mov		%rsi,%r11
    511
    512	lea		8(%rsp),%r10
    513	sub		$0x10,%rsp
    514	and		$~31,%rsp
    515
    516	lea		(%rdx,%rax),%rsi
    517	mov		%rsp,%rdi
    518	mov		%r9,%rcx
    519	rep movsb
    520
    521	vpxor		0x00(%rsp),%xmm10,%xmm10
    522	vmovdqa		%xmm10,0x00(%rsp)
    523
    524	mov		%rsp,%rsi
    525	lea		(%r11,%rax),%rdi
    526	mov		%r9,%rcx
    527	rep movsb
    528
    529	lea		-8(%r10),%rsp
    530	jmp		.Ldone4
    531
    532SYM_FUNC_END(chacha_4block_xor_avx2)
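The .Lxorpart2 and .Lxorpart4 paths above handle a length that is not a multiple of 16 bytes: the length is rounded down to whole 16-byte units, the leftover bytes are copied with rep movsb into an aligned scratch slot on the stack, XORed there against the partial register of keystream, and copied back to the output. (.Lxorpart8 in the next function does the same at 32-byte, i.e. ymm, granularity.) A byte-level C sketch of that tail handling, with a hypothetical helper name and a local buffer standing in for the stack slot:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Sketch of the .Lxorpart2/.Lxorpart4 tail handling. 'keystream16' plays
 * the role of the xmm register holding the next 16 bytes of keystream at
 * the point where the cmp/jl length check bailed out. Hypothetical helper,
 * not kernel code.
 */
void xor_partial_tail(uint8_t *dst, const uint8_t *src, size_t len,
		      const uint8_t keystream16[16])
{
	size_t rem = len & 0x0f;		/* bytes past the last full 16 */
	size_t full = len & ~(size_t)0x0f;
	uint8_t buf[16] = { 0 };		/* stands in for the stack slot */

	if (!rem)
		return;				/* nothing partial to do (.Ldone) */

	memcpy(buf, src + full, rem);		/* rep movsb: input tail -> scratch */
	for (int i = 0; i < 16; i++)		/* vpxor against the keystream xmm */
		buf[i] ^= keystream16[i];
	memcpy(dst + full, buf, rem);		/* rep movsb: scratch -> output tail */
}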
    533
    534SYM_FUNC_START(chacha_8block_xor_avx2)
    535	# %rdi: Input state matrix, s
    536	# %rsi: up to 8 data blocks output, o
    537	# %rdx: up to 8 data blocks input, i
    538	# %rcx: input/output length in bytes
    539	# %r8d: nrounds
    540
     541	# This function encrypts eight consecutive ChaCha blocks by loading
     542	# the state matrix into AVX registers eight times. As we need some
     543	# scratch registers, we save the first four registers on the stack. The
     544	# algorithm performs each operation on the corresponding word of each
     545	# state matrix, hence requires no word shuffling. For the final XORing
     546	# step we transpose the matrix by interleaving 32-, 64- and then
     547	# 128-bit words, which allows us to do the XOR in AVX registers.
     548	# Rotation by 8 or 16 bits is done with the slightly faster byte
     549	# shuffle; rotation by 7 or 12 bits uses the traditional shift+OR.
    550
    551	vzeroupper
    552	# 4 * 32 byte stack, 32-byte aligned
    553	lea		8(%rsp),%r10
    554	and		$~31, %rsp
    555	sub		$0x80, %rsp
    556	mov		%rcx,%rax
    557
    558	# x0..15[0-7] = s[0..15]
    559	vpbroadcastd	0x00(%rdi),%ymm0
    560	vpbroadcastd	0x04(%rdi),%ymm1
    561	vpbroadcastd	0x08(%rdi),%ymm2
    562	vpbroadcastd	0x0c(%rdi),%ymm3
    563	vpbroadcastd	0x10(%rdi),%ymm4
    564	vpbroadcastd	0x14(%rdi),%ymm5
    565	vpbroadcastd	0x18(%rdi),%ymm6
    566	vpbroadcastd	0x1c(%rdi),%ymm7
    567	vpbroadcastd	0x20(%rdi),%ymm8
    568	vpbroadcastd	0x24(%rdi),%ymm9
    569	vpbroadcastd	0x28(%rdi),%ymm10
    570	vpbroadcastd	0x2c(%rdi),%ymm11
    571	vpbroadcastd	0x30(%rdi),%ymm12
    572	vpbroadcastd	0x34(%rdi),%ymm13
    573	vpbroadcastd	0x38(%rdi),%ymm14
    574	vpbroadcastd	0x3c(%rdi),%ymm15
    575	# x0..3 on stack
    576	vmovdqa		%ymm0,0x00(%rsp)
    577	vmovdqa		%ymm1,0x20(%rsp)
    578	vmovdqa		%ymm2,0x40(%rsp)
    579	vmovdqa		%ymm3,0x60(%rsp)
    580
    581	vmovdqa		CTRINC(%rip),%ymm1
    582	vmovdqa		ROT8(%rip),%ymm2
    583	vmovdqa		ROT16(%rip),%ymm3
    584
    585	# x12 += counter values 0-3
    586	vpaddd		%ymm1,%ymm12,%ymm12
    587
    588.Ldoubleround8:
    589	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
    590	vpaddd		0x00(%rsp),%ymm4,%ymm0
    591	vmovdqa		%ymm0,0x00(%rsp)
    592	vpxor		%ymm0,%ymm12,%ymm12
    593	vpshufb		%ymm3,%ymm12,%ymm12
    594	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
    595	vpaddd		0x20(%rsp),%ymm5,%ymm0
    596	vmovdqa		%ymm0,0x20(%rsp)
    597	vpxor		%ymm0,%ymm13,%ymm13
    598	vpshufb		%ymm3,%ymm13,%ymm13
    599	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
    600	vpaddd		0x40(%rsp),%ymm6,%ymm0
    601	vmovdqa		%ymm0,0x40(%rsp)
    602	vpxor		%ymm0,%ymm14,%ymm14
    603	vpshufb		%ymm3,%ymm14,%ymm14
    604	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
    605	vpaddd		0x60(%rsp),%ymm7,%ymm0
    606	vmovdqa		%ymm0,0x60(%rsp)
    607	vpxor		%ymm0,%ymm15,%ymm15
    608	vpshufb		%ymm3,%ymm15,%ymm15
    609
    610	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
    611	vpaddd		%ymm12,%ymm8,%ymm8
    612	vpxor		%ymm8,%ymm4,%ymm4
    613	vpslld		$12,%ymm4,%ymm0
    614	vpsrld		$20,%ymm4,%ymm4
    615	vpor		%ymm0,%ymm4,%ymm4
    616	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
    617	vpaddd		%ymm13,%ymm9,%ymm9
    618	vpxor		%ymm9,%ymm5,%ymm5
    619	vpslld		$12,%ymm5,%ymm0
    620	vpsrld		$20,%ymm5,%ymm5
    621	vpor		%ymm0,%ymm5,%ymm5
    622	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
    623	vpaddd		%ymm14,%ymm10,%ymm10
    624	vpxor		%ymm10,%ymm6,%ymm6
    625	vpslld		$12,%ymm6,%ymm0
    626	vpsrld		$20,%ymm6,%ymm6
    627	vpor		%ymm0,%ymm6,%ymm6
    628	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
    629	vpaddd		%ymm15,%ymm11,%ymm11
    630	vpxor		%ymm11,%ymm7,%ymm7
    631	vpslld		$12,%ymm7,%ymm0
    632	vpsrld		$20,%ymm7,%ymm7
    633	vpor		%ymm0,%ymm7,%ymm7
    634
    635	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
    636	vpaddd		0x00(%rsp),%ymm4,%ymm0
    637	vmovdqa		%ymm0,0x00(%rsp)
    638	vpxor		%ymm0,%ymm12,%ymm12
    639	vpshufb		%ymm2,%ymm12,%ymm12
    640	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
    641	vpaddd		0x20(%rsp),%ymm5,%ymm0
    642	vmovdqa		%ymm0,0x20(%rsp)
    643	vpxor		%ymm0,%ymm13,%ymm13
    644	vpshufb		%ymm2,%ymm13,%ymm13
    645	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
    646	vpaddd		0x40(%rsp),%ymm6,%ymm0
    647	vmovdqa		%ymm0,0x40(%rsp)
    648	vpxor		%ymm0,%ymm14,%ymm14
    649	vpshufb		%ymm2,%ymm14,%ymm14
    650	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
    651	vpaddd		0x60(%rsp),%ymm7,%ymm0
    652	vmovdqa		%ymm0,0x60(%rsp)
    653	vpxor		%ymm0,%ymm15,%ymm15
    654	vpshufb		%ymm2,%ymm15,%ymm15
    655
    656	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
    657	vpaddd		%ymm12,%ymm8,%ymm8
    658	vpxor		%ymm8,%ymm4,%ymm4
    659	vpslld		$7,%ymm4,%ymm0
    660	vpsrld		$25,%ymm4,%ymm4
    661	vpor		%ymm0,%ymm4,%ymm4
    662	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
    663	vpaddd		%ymm13,%ymm9,%ymm9
    664	vpxor		%ymm9,%ymm5,%ymm5
    665	vpslld		$7,%ymm5,%ymm0
    666	vpsrld		$25,%ymm5,%ymm5
    667	vpor		%ymm0,%ymm5,%ymm5
    668	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
    669	vpaddd		%ymm14,%ymm10,%ymm10
    670	vpxor		%ymm10,%ymm6,%ymm6
    671	vpslld		$7,%ymm6,%ymm0
    672	vpsrld		$25,%ymm6,%ymm6
    673	vpor		%ymm0,%ymm6,%ymm6
    674	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
    675	vpaddd		%ymm15,%ymm11,%ymm11
    676	vpxor		%ymm11,%ymm7,%ymm7
    677	vpslld		$7,%ymm7,%ymm0
    678	vpsrld		$25,%ymm7,%ymm7
    679	vpor		%ymm0,%ymm7,%ymm7
    680
    681	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
    682	vpaddd		0x00(%rsp),%ymm5,%ymm0
    683	vmovdqa		%ymm0,0x00(%rsp)
    684	vpxor		%ymm0,%ymm15,%ymm15
    685	vpshufb		%ymm3,%ymm15,%ymm15
     686	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
    687	vpaddd		0x20(%rsp),%ymm6,%ymm0
    688	vmovdqa		%ymm0,0x20(%rsp)
    689	vpxor		%ymm0,%ymm12,%ymm12
    690	vpshufb		%ymm3,%ymm12,%ymm12
    691	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
    692	vpaddd		0x40(%rsp),%ymm7,%ymm0
    693	vmovdqa		%ymm0,0x40(%rsp)
    694	vpxor		%ymm0,%ymm13,%ymm13
    695	vpshufb		%ymm3,%ymm13,%ymm13
    696	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
    697	vpaddd		0x60(%rsp),%ymm4,%ymm0
    698	vmovdqa		%ymm0,0x60(%rsp)
    699	vpxor		%ymm0,%ymm14,%ymm14
    700	vpshufb		%ymm3,%ymm14,%ymm14
    701
    702	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
    703	vpaddd		%ymm15,%ymm10,%ymm10
    704	vpxor		%ymm10,%ymm5,%ymm5
    705	vpslld		$12,%ymm5,%ymm0
    706	vpsrld		$20,%ymm5,%ymm5
    707	vpor		%ymm0,%ymm5,%ymm5
    708	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
    709	vpaddd		%ymm12,%ymm11,%ymm11
    710	vpxor		%ymm11,%ymm6,%ymm6
    711	vpslld		$12,%ymm6,%ymm0
    712	vpsrld		$20,%ymm6,%ymm6
    713	vpor		%ymm0,%ymm6,%ymm6
    714	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
    715	vpaddd		%ymm13,%ymm8,%ymm8
    716	vpxor		%ymm8,%ymm7,%ymm7
    717	vpslld		$12,%ymm7,%ymm0
    718	vpsrld		$20,%ymm7,%ymm7
    719	vpor		%ymm0,%ymm7,%ymm7
    720	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
    721	vpaddd		%ymm14,%ymm9,%ymm9
    722	vpxor		%ymm9,%ymm4,%ymm4
    723	vpslld		$12,%ymm4,%ymm0
    724	vpsrld		$20,%ymm4,%ymm4
    725	vpor		%ymm0,%ymm4,%ymm4
    726
    727	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
    728	vpaddd		0x00(%rsp),%ymm5,%ymm0
    729	vmovdqa		%ymm0,0x00(%rsp)
    730	vpxor		%ymm0,%ymm15,%ymm15
    731	vpshufb		%ymm2,%ymm15,%ymm15
    732	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
    733	vpaddd		0x20(%rsp),%ymm6,%ymm0
    734	vmovdqa		%ymm0,0x20(%rsp)
    735	vpxor		%ymm0,%ymm12,%ymm12
    736	vpshufb		%ymm2,%ymm12,%ymm12
    737	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
    738	vpaddd		0x40(%rsp),%ymm7,%ymm0
    739	vmovdqa		%ymm0,0x40(%rsp)
    740	vpxor		%ymm0,%ymm13,%ymm13
    741	vpshufb		%ymm2,%ymm13,%ymm13
    742	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
    743	vpaddd		0x60(%rsp),%ymm4,%ymm0
    744	vmovdqa		%ymm0,0x60(%rsp)
    745	vpxor		%ymm0,%ymm14,%ymm14
    746	vpshufb		%ymm2,%ymm14,%ymm14
    747
    748	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
    749	vpaddd		%ymm15,%ymm10,%ymm10
    750	vpxor		%ymm10,%ymm5,%ymm5
    751	vpslld		$7,%ymm5,%ymm0
    752	vpsrld		$25,%ymm5,%ymm5
    753	vpor		%ymm0,%ymm5,%ymm5
    754	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
    755	vpaddd		%ymm12,%ymm11,%ymm11
    756	vpxor		%ymm11,%ymm6,%ymm6
    757	vpslld		$7,%ymm6,%ymm0
    758	vpsrld		$25,%ymm6,%ymm6
    759	vpor		%ymm0,%ymm6,%ymm6
    760	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
    761	vpaddd		%ymm13,%ymm8,%ymm8
    762	vpxor		%ymm8,%ymm7,%ymm7
    763	vpslld		$7,%ymm7,%ymm0
    764	vpsrld		$25,%ymm7,%ymm7
    765	vpor		%ymm0,%ymm7,%ymm7
    766	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
    767	vpaddd		%ymm14,%ymm9,%ymm9
    768	vpxor		%ymm9,%ymm4,%ymm4
    769	vpslld		$7,%ymm4,%ymm0
    770	vpsrld		$25,%ymm4,%ymm4
    771	vpor		%ymm0,%ymm4,%ymm4
    772
    773	sub		$2,%r8d
    774	jnz		.Ldoubleround8
    775
    776	# x0..15[0-3] += s[0..15]
    777	vpbroadcastd	0x00(%rdi),%ymm0
    778	vpaddd		0x00(%rsp),%ymm0,%ymm0
    779	vmovdqa		%ymm0,0x00(%rsp)
    780	vpbroadcastd	0x04(%rdi),%ymm0
    781	vpaddd		0x20(%rsp),%ymm0,%ymm0
    782	vmovdqa		%ymm0,0x20(%rsp)
    783	vpbroadcastd	0x08(%rdi),%ymm0
    784	vpaddd		0x40(%rsp),%ymm0,%ymm0
    785	vmovdqa		%ymm0,0x40(%rsp)
    786	vpbroadcastd	0x0c(%rdi),%ymm0
    787	vpaddd		0x60(%rsp),%ymm0,%ymm0
    788	vmovdqa		%ymm0,0x60(%rsp)
    789	vpbroadcastd	0x10(%rdi),%ymm0
    790	vpaddd		%ymm0,%ymm4,%ymm4
    791	vpbroadcastd	0x14(%rdi),%ymm0
    792	vpaddd		%ymm0,%ymm5,%ymm5
    793	vpbroadcastd	0x18(%rdi),%ymm0
    794	vpaddd		%ymm0,%ymm6,%ymm6
    795	vpbroadcastd	0x1c(%rdi),%ymm0
    796	vpaddd		%ymm0,%ymm7,%ymm7
    797	vpbroadcastd	0x20(%rdi),%ymm0
    798	vpaddd		%ymm0,%ymm8,%ymm8
    799	vpbroadcastd	0x24(%rdi),%ymm0
    800	vpaddd		%ymm0,%ymm9,%ymm9
    801	vpbroadcastd	0x28(%rdi),%ymm0
    802	vpaddd		%ymm0,%ymm10,%ymm10
    803	vpbroadcastd	0x2c(%rdi),%ymm0
    804	vpaddd		%ymm0,%ymm11,%ymm11
    805	vpbroadcastd	0x30(%rdi),%ymm0
    806	vpaddd		%ymm0,%ymm12,%ymm12
    807	vpbroadcastd	0x34(%rdi),%ymm0
    808	vpaddd		%ymm0,%ymm13,%ymm13
    809	vpbroadcastd	0x38(%rdi),%ymm0
    810	vpaddd		%ymm0,%ymm14,%ymm14
    811	vpbroadcastd	0x3c(%rdi),%ymm0
    812	vpaddd		%ymm0,%ymm15,%ymm15
    813
    814	# x12 += counter values 0-3
    815	vpaddd		%ymm1,%ymm12,%ymm12
    816
    817	# interleave 32-bit words in state n, n+1
    818	vmovdqa		0x00(%rsp),%ymm0
    819	vmovdqa		0x20(%rsp),%ymm1
    820	vpunpckldq	%ymm1,%ymm0,%ymm2
    821	vpunpckhdq	%ymm1,%ymm0,%ymm1
    822	vmovdqa		%ymm2,0x00(%rsp)
    823	vmovdqa		%ymm1,0x20(%rsp)
    824	vmovdqa		0x40(%rsp),%ymm0
    825	vmovdqa		0x60(%rsp),%ymm1
    826	vpunpckldq	%ymm1,%ymm0,%ymm2
    827	vpunpckhdq	%ymm1,%ymm0,%ymm1
    828	vmovdqa		%ymm2,0x40(%rsp)
    829	vmovdqa		%ymm1,0x60(%rsp)
    830	vmovdqa		%ymm4,%ymm0
    831	vpunpckldq	%ymm5,%ymm0,%ymm4
    832	vpunpckhdq	%ymm5,%ymm0,%ymm5
    833	vmovdqa		%ymm6,%ymm0
    834	vpunpckldq	%ymm7,%ymm0,%ymm6
    835	vpunpckhdq	%ymm7,%ymm0,%ymm7
    836	vmovdqa		%ymm8,%ymm0
    837	vpunpckldq	%ymm9,%ymm0,%ymm8
    838	vpunpckhdq	%ymm9,%ymm0,%ymm9
    839	vmovdqa		%ymm10,%ymm0
    840	vpunpckldq	%ymm11,%ymm0,%ymm10
    841	vpunpckhdq	%ymm11,%ymm0,%ymm11
    842	vmovdqa		%ymm12,%ymm0
    843	vpunpckldq	%ymm13,%ymm0,%ymm12
    844	vpunpckhdq	%ymm13,%ymm0,%ymm13
    845	vmovdqa		%ymm14,%ymm0
    846	vpunpckldq	%ymm15,%ymm0,%ymm14
    847	vpunpckhdq	%ymm15,%ymm0,%ymm15
    848
    849	# interleave 64-bit words in state n, n+2
    850	vmovdqa		0x00(%rsp),%ymm0
    851	vmovdqa		0x40(%rsp),%ymm2
    852	vpunpcklqdq	%ymm2,%ymm0,%ymm1
    853	vpunpckhqdq	%ymm2,%ymm0,%ymm2
    854	vmovdqa		%ymm1,0x00(%rsp)
    855	vmovdqa		%ymm2,0x40(%rsp)
    856	vmovdqa		0x20(%rsp),%ymm0
    857	vmovdqa		0x60(%rsp),%ymm2
    858	vpunpcklqdq	%ymm2,%ymm0,%ymm1
    859	vpunpckhqdq	%ymm2,%ymm0,%ymm2
    860	vmovdqa		%ymm1,0x20(%rsp)
    861	vmovdqa		%ymm2,0x60(%rsp)
    862	vmovdqa		%ymm4,%ymm0
    863	vpunpcklqdq	%ymm6,%ymm0,%ymm4
    864	vpunpckhqdq	%ymm6,%ymm0,%ymm6
    865	vmovdqa		%ymm5,%ymm0
    866	vpunpcklqdq	%ymm7,%ymm0,%ymm5
    867	vpunpckhqdq	%ymm7,%ymm0,%ymm7
    868	vmovdqa		%ymm8,%ymm0
    869	vpunpcklqdq	%ymm10,%ymm0,%ymm8
    870	vpunpckhqdq	%ymm10,%ymm0,%ymm10
    871	vmovdqa		%ymm9,%ymm0
    872	vpunpcklqdq	%ymm11,%ymm0,%ymm9
    873	vpunpckhqdq	%ymm11,%ymm0,%ymm11
    874	vmovdqa		%ymm12,%ymm0
    875	vpunpcklqdq	%ymm14,%ymm0,%ymm12
    876	vpunpckhqdq	%ymm14,%ymm0,%ymm14
    877	vmovdqa		%ymm13,%ymm0
    878	vpunpcklqdq	%ymm15,%ymm0,%ymm13
    879	vpunpckhqdq	%ymm15,%ymm0,%ymm15
    880
    881	# interleave 128-bit words in state n, n+4
    882	# xor/write first four blocks
    883	vmovdqa		0x00(%rsp),%ymm1
    884	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
    885	cmp		$0x0020,%rax
    886	jl		.Lxorpart8
    887	vpxor		0x0000(%rdx),%ymm0,%ymm0
    888	vmovdqu		%ymm0,0x0000(%rsi)
    889	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
    890
    891	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
    892	cmp		$0x0040,%rax
    893	jl		.Lxorpart8
    894	vpxor		0x0020(%rdx),%ymm0,%ymm0
    895	vmovdqu		%ymm0,0x0020(%rsi)
    896	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
    897
    898	vmovdqa		0x40(%rsp),%ymm1
    899	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
    900	cmp		$0x0060,%rax
    901	jl		.Lxorpart8
    902	vpxor		0x0040(%rdx),%ymm0,%ymm0
    903	vmovdqu		%ymm0,0x0040(%rsi)
    904	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
    905
    906	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
    907	cmp		$0x0080,%rax
    908	jl		.Lxorpart8
    909	vpxor		0x0060(%rdx),%ymm0,%ymm0
    910	vmovdqu		%ymm0,0x0060(%rsi)
    911	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
    912
    913	vmovdqa		0x20(%rsp),%ymm1
    914	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
    915	cmp		$0x00a0,%rax
    916	jl		.Lxorpart8
    917	vpxor		0x0080(%rdx),%ymm0,%ymm0
    918	vmovdqu		%ymm0,0x0080(%rsi)
    919	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
    920
    921	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
    922	cmp		$0x00c0,%rax
    923	jl		.Lxorpart8
    924	vpxor		0x00a0(%rdx),%ymm0,%ymm0
    925	vmovdqu		%ymm0,0x00a0(%rsi)
    926	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
    927
    928	vmovdqa		0x60(%rsp),%ymm1
    929	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
    930	cmp		$0x00e0,%rax
    931	jl		.Lxorpart8
    932	vpxor		0x00c0(%rdx),%ymm0,%ymm0
    933	vmovdqu		%ymm0,0x00c0(%rsi)
    934	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
    935
    936	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
    937	cmp		$0x0100,%rax
    938	jl		.Lxorpart8
    939	vpxor		0x00e0(%rdx),%ymm0,%ymm0
    940	vmovdqu		%ymm0,0x00e0(%rsi)
    941	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
    942
    943	# xor remaining blocks, write to output
    944	vmovdqa		%ymm4,%ymm0
    945	cmp		$0x0120,%rax
    946	jl		.Lxorpart8
    947	vpxor		0x0100(%rdx),%ymm0,%ymm0
    948	vmovdqu		%ymm0,0x0100(%rsi)
    949
    950	vmovdqa		%ymm12,%ymm0
    951	cmp		$0x0140,%rax
    952	jl		.Lxorpart8
    953	vpxor		0x0120(%rdx),%ymm0,%ymm0
    954	vmovdqu		%ymm0,0x0120(%rsi)
    955
    956	vmovdqa		%ymm6,%ymm0
    957	cmp		$0x0160,%rax
    958	jl		.Lxorpart8
    959	vpxor		0x0140(%rdx),%ymm0,%ymm0
    960	vmovdqu		%ymm0,0x0140(%rsi)
    961
    962	vmovdqa		%ymm14,%ymm0
    963	cmp		$0x0180,%rax
    964	jl		.Lxorpart8
    965	vpxor		0x0160(%rdx),%ymm0,%ymm0
    966	vmovdqu		%ymm0,0x0160(%rsi)
    967
    968	vmovdqa		%ymm5,%ymm0
    969	cmp		$0x01a0,%rax
    970	jl		.Lxorpart8
    971	vpxor		0x0180(%rdx),%ymm0,%ymm0
    972	vmovdqu		%ymm0,0x0180(%rsi)
    973
    974	vmovdqa		%ymm13,%ymm0
    975	cmp		$0x01c0,%rax
    976	jl		.Lxorpart8
    977	vpxor		0x01a0(%rdx),%ymm0,%ymm0
    978	vmovdqu		%ymm0,0x01a0(%rsi)
    979
    980	vmovdqa		%ymm7,%ymm0
    981	cmp		$0x01e0,%rax
    982	jl		.Lxorpart8
    983	vpxor		0x01c0(%rdx),%ymm0,%ymm0
    984	vmovdqu		%ymm0,0x01c0(%rsi)
    985
    986	vmovdqa		%ymm15,%ymm0
    987	cmp		$0x0200,%rax
    988	jl		.Lxorpart8
    989	vpxor		0x01e0(%rdx),%ymm0,%ymm0
    990	vmovdqu		%ymm0,0x01e0(%rsi)
    991
    992.Ldone8:
    993	vzeroupper
    994	lea		-8(%r10),%rsp
    995	RET
    996
    997.Lxorpart8:
    998	# xor remaining bytes from partial register into output
    999	mov		%rax,%r9
   1000	and		$0x1f,%r9
   1001	jz		.Ldone8
   1002	and		$~0x1f,%rax
   1003
   1004	mov		%rsi,%r11
   1005
   1006	lea		(%rdx,%rax),%rsi
   1007	mov		%rsp,%rdi
   1008	mov		%r9,%rcx
   1009	rep movsb
   1010
   1011	vpxor		0x00(%rsp),%ymm0,%ymm0
   1012	vmovdqa		%ymm0,0x00(%rsp)
   1013
   1014	mov		%rsp,%rsi
   1015	lea		(%r11,%rax),%rdi
   1016	mov		%r9,%rcx
   1017	rep movsb
   1018
   1019	jmp		.Ldone8
   1020
   1021SYM_FUNC_END(chacha_8block_xor_avx2)
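All three routines take the same arguments (state matrix, destination, source, byte length, round count, per the register comments at the top of each function), accept lengths shorter than their nominal 2/4/8 blocks via the cmp/jl cascades and .Lxorpart paths, and read the state without updating it, so the caller advances the block counter itself. A hedged C sketch of a length-based dispatcher; the prototypes are inferred from the register comments and the wrapper is illustrative only, not the kernel's actual glue code:

#include <stdint.h>

typedef uint8_t u8;
typedef uint32_t u32;

#define CHACHA_BLOCK_SIZE 64
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Prototypes inferred from the register comments in this file. */
void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
			    unsigned int len, int nrounds);
void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
			    unsigned int len, int nrounds);
void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
			    unsigned int len, int nrounds);

/*
 * Illustrative dispatcher (not the kernel's glue code): process whole
 * 8-block chunks, then hand any remainder to the narrowest routine that
 * covers it. state[12] is the ChaCha block counter, which is why the
 * routines add CTR2BL/CTR4BL/CTRINC to row 3 of the state.
 */
void chacha_xor_avx2_sketch(u32 *state, u8 *dst, const u8 *src,
			    unsigned int bytes, int nrounds)
{
	while (bytes >= CHACHA_BLOCK_SIZE * 8) {
		chacha_8block_xor_avx2(state, dst, src,
				       CHACHA_BLOCK_SIZE * 8, nrounds);
		bytes -= CHACHA_BLOCK_SIZE * 8;
		src += CHACHA_BLOCK_SIZE * 8;
		dst += CHACHA_BLOCK_SIZE * 8;
		state[12] += 8;
	}
	if (!bytes)
		return;
	if (bytes > CHACHA_BLOCK_SIZE * 4)
		chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
	else if (bytes > CHACHA_BLOCK_SIZE * 2)
		chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
	else
		chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
	state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
}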