cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

chacha-ssse3-x86_64.S (17216B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

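# ROT8 and ROT16 are pshufb shuffle masks: applied to a register of packed
# 32-bit words, they rotate each word left by 8 and 16 bits respectively.
# CTRINC holds the per-lane increments {0, 1, 2, 3} that the four-block
# function adds to the block counter so each lane gets its own counter.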
.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round.  8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
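/*
 * For reference, a rough C-style sketch of the double round implemented
 * below (xmm0-xmm3 hold rows a, b, c, d of the state, one 32-bit word per
 * lane; the pshufd shuffles implement the diagonal half by rotating rows
 * b, c and d between the two passes):
 *
 *	a += b; d = rol32(d ^ a, 16);
 *	c += d; b = rol32(b ^ c, 12);
 *	a += b; d = rol32(d ^ a, 8);
 *	c += d; b = rol32(b ^ c, 7);
 */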
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
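	#
	# Conceptually (a C-style sketch, not the exact kernel interface):
	#
	#	x = s;
	#	chacha_permute(x, nrounds);
	#	keystream = x + s;		/* word-wise add of original state */
	#	o[0..len-1] = i[0..len-1] ^ keystream[0..len-1];
	#
	# Only len bytes are produced; a trailing partial 16-byte chunk is
	# handled through the .Lxorpart bounce buffer below.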
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	mov		%rcx,%rax
	call		chacha_permute

	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
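	#
	# Bounce-buffer handling for the partial tail: %xmm0 holds the 16
	# keystream bytes covering the current position and %rax the total
	# length.  The remaining len % 16 input bytes are copied into aligned
	# stack scratch, XORed with the keystream there, and copied back out,
	# so the caller's buffers are never accessed past len bytes.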
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
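	#
	# HChaCha runs the same permutation as the block functions but does
	# not add the original state back afterwards; it returns rows 0 and 3
	# of the permuted state (words 0-3 and 12-15), which XChaCha uses as a
	# derived subkey.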
	FRAME_BEGIN

	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3

	mov		%edx,%r8d
	call		chacha_permute

	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading the
	# state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XOR
	# step we transpose the matrix by interleaving 32- and then 64-bit
	# words, which allows us to do XOR in SSE registers. 8/16-bit word
	# rotation is done with the slightly better performing SSSE3 byte
	# shuffling, 7/12-bit word rotation uses traditional shift+OR.
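	#
	# Data layout during the loop (a sketch): state words 0-3 live in the
	# four 16-byte stack slots and words 4-15 in %xmm4-%xmm15, with lane j
	# of each vector holding that word for block j:
	#
	#	xN.lane[j] = block[j].word[N]	for j = 0..3
	#
	# so every paddd/pxor/pshufb below advances all four blocks at once;
	# %xmm0 is scratch and %xmm1-%xmm3 hold CTRINC/ROT8/ROT16.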

	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
	mov		%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

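	# The two interleave passes below (32-bit, then 64-bit) amount to a
	# 4x4 transpose from per-word back to per-block layout: afterwards
	# each stack slot or register holds 16 contiguous keystream bytes of
	# a single block, ready to be XORed with the input.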
	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
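	# After the transpose the keystream is consumed in block order:
	# block 0 from 0x00(%rsp)/%xmm4/%xmm8/%xmm12, block 1 from
	# 0x20(%rsp)/%xmm6/%xmm10/%xmm14, block 2 from 0x10(%rsp)/%xmm5/
	# %xmm9/%xmm13, block 3 from 0x30(%rsp)/%xmm7/%xmm11/%xmm15.  Each
	# 16-byte store is guarded by a length check that branches to
	# .Lxorpart4 for a partial tail.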
	movdqa		0x00(%rsp),%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart4
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)

	movdqu		%xmm4,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart4
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)

	movdqu		%xmm8,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart4
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x20(%rsi)

	movdqu		%xmm12,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart4
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	cmp		$0x50,%rax
	jl		.Lxorpart4
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)

	movdqu		%xmm6,%xmm0
	cmp		$0x60,%rax
	jl		.Lxorpart4
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x50(%rsi)

	movdqu		%xmm10,%xmm0
	cmp		$0x70,%rax
	jl		.Lxorpart4
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x60(%rsi)

	movdqu		%xmm14,%xmm0
	cmp		$0x80,%rax
	jl		.Lxorpart4
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	cmp		$0x90,%rax
	jl		.Lxorpart4
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)

	movdqu		%xmm5,%xmm0
	cmp		$0xa0,%rax
	jl		.Lxorpart4
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x90(%rsi)

	movdqu		%xmm9,%xmm0
	cmp		$0xb0,%rax
	jl		.Lxorpart4
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xa0(%rsi)

	movdqu		%xmm13,%xmm0
	cmp		$0xc0,%rax
	jl		.Lxorpart4
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	cmp		$0xd0,%rax
	jl		.Lxorpart4
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)

	movdqu		%xmm7,%xmm0
	cmp		$0xe0,%rax
	jl		.Lxorpart4
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xd0(%rsi)

	movdqu		%xmm11,%xmm0
	cmp		$0xf0,%rax
	jl		.Lxorpart4
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xe0(%rsi)

	movdqu		%xmm15,%xmm0
	cmp		$0x100,%rax
	jl		.Lxorpart4
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xf0(%rsi)

.Ldone4:
	lea		-8(%r10),%rsp
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
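	#
	# Same bounce-buffer scheme as .Lxorpart above: %xmm0 already holds
	# the keystream chunk for the partial position, and the aligned
	# scratch area at %rsp (whose saved contents are no longer needed) is
	# reused to XOR and copy out only the remaining len % 16 bytes.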
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)