chacha-neon-core.S - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
chacha-neon-core.S (15074B)
      1/*
      2 * ChaCha/XChaCha NEON helper functions
      3 *
      4 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
      5 *
      6 * This program is free software; you can redistribute it and/or modify
      7 * it under the terms of the GNU General Public License version 2 as
      8 * published by the Free Software Foundation.
      9 *
     10 * Based on:
     11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
     12 *
     13 * Copyright (C) 2015 Martin Willi
     14 *
     15 * This program is free software; you can redistribute it and/or modify
     16 * it under the terms of the GNU General Public License as published by
     17 * the Free Software Foundation; either version 2 of the License, or
     18 * (at your option) any later version.
     19 */
     20
     21 /*
     22  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
     23  *
     24  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
     25  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
     26  * (c)  vrev32.16			(16-bit rotations only)
     27  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
     28  *					 needs index vector)
     29  *
     30  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
     31  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
     32  * cycles of (b) on both Cortex-A7 and Cortex-A53.
     33  *
     34  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
     35  * and doesn't need a temporary register.
     36  *
     37  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
     38  * is twice as fast as (a), even when doing (a) on multiple registers
     39  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
     40  * parallelizes better when temporary registers are scarce.
     41  *
     42  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
     43  * (a), so the need to load the rotation table actually makes the vtbl method
     44  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
     45  * seems to be a good compromise to get a more significant speed boost on some
     46  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
     47  */
     48
     49#include <linux/linkage.h>
     50#include <asm/cache.h>
     51
     52	.text
     53	.fpu		neon
     54	.align		5
     55
     56/*
     57 * chacha_permute - permute one block
     58 *
     59 * Permute one 64-byte block where the state matrix is stored in the four NEON
     60 * registers q0-q3.  It performs matrix operations on four words in parallel,
     61 * but requires shuffling to rearrange the words after each round.
     62 *
     63 * The round count is given in r3.  Each loop iteration performs two rounds and subtracts 2 from r3, leaving the loop only when r3 is exactly zero, so r3 must be even and nonzero on entry.
     64 *
     65 * Clobbers: r3, ip, q4-q5, and the condition flags (subs in the loop).  The permuted state is left in q0-q3.
     66 */
     67chacha_permute:
     68
     69	adr		ip, .Lrol8_table	// ip = address of the 8-byte rotate-by-8 index table
     70	vld1.8		{d10}, [ip, :64]	// d10 = vtbl indices, kept live for every round
     71
     72.Ldoubleround:
     73	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
     74	vadd.i32	q0, q0, q1
     75	veor		q3, q3, q0
     76	vrev32.16	q3, q3			// swapping the 16-bit halves of each word is a rotate by 16
     77
     78	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
     79	vadd.i32	q2, q2, q3
     80	veor		q4, q1, q2		// q4 = temporary holding x1 ^ x2
     81	vshl.u32	q1, q4, #12		// q1 = q4 << 12
     82	vsri.u32	q1, q4, #20		// insert q4 >> 20 into the low bits, completing the rotate
     83
     84	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
     85	vadd.i32	q0, q0, q1
     86	veor		q3, q3, q0
     87	vtbl.8		d6, {d6}, d10		// byte permute of the low half of q3 ...
     88	vtbl.8		d7, {d7}, d10		// ... and of the high half: rotate each word by 8
     89
     90	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
     91	vadd.i32	q2, q2, q3
     92	veor		q4, q1, q2
     93	vshl.u32	q1, q4, #7
     94	vsri.u32	q1, q4, #25
     95
     96	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
     97	vext.8		q1, q1, q1, #4		// rotate the words of q1 by 4 bytes
     98	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
     99	vext.8		q2, q2, q2, #8		// rotate the words of q2 by 8 bytes
    100	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
    101	vext.8		q3, q3, q3, #12		// rotate the words of q3 by 12 bytes
    102
    103	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
    104	vadd.i32	q0, q0, q1
    105	veor		q3, q3, q0
    106	vrev32.16	q3, q3
    107
    108	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
    109	vadd.i32	q2, q2, q3
    110	veor		q4, q1, q2
    111	vshl.u32	q1, q4, #12
    112	vsri.u32	q1, q4, #20
    113
    114	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
    115	vadd.i32	q0, q0, q1
    116	veor		q3, q3, q0
    117	vtbl.8		d6, {d6}, d10
    118	vtbl.8		d7, {d7}, d10
    119
    120	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
    121	vadd.i32	q2, q2, q3
    122	veor		q4, q1, q2
    123	vshl.u32	q1, q4, #7
    124	vsri.u32	q1, q4, #25
    125
    126	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
    127	vext.8		q1, q1, q1, #12		// 4 + 12 = 16 bytes: undoes the earlier rotation of q1
    128	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
    129	vext.8		q2, q2, q2, #8		// 8 + 8 = 16 bytes: undoes the earlier rotation of q2
    130	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
    131	vext.8		q3, q3, q3, #4		// 12 + 4 = 16 bytes: undoes the earlier rotation of q3
    132
    133	subs		r3, r3, #2		// two rounds were done in this iteration
    134	bne		.Ldoubleround		// repeat until the round count reaches zero
    135
    136	bx		lr			// return with the permuted state in q0-q3
    137ENDPROC(chacha_permute)
    138
    139ENTRY(chacha_block_xor_neon)
    140	// r0: Input state matrix, s
    141	// r1: 1 data block output, o
    142	// r2: 1 data block input, i
    143	// r3: nrounds
    144	push		{lr}			// the bl below overwrites lr
    145
    146	// x0..3 = s0..3
    147	add		ip, r0, #0x20		// ip = address of the second 32 bytes of the state
    148	vld1.32		{q0-q1}, [r0]
    149	vld1.32		{q2-q3}, [ip]
    150
    151	vmov		q8, q0			// keep a copy of the original state in q8-q11
    152	vmov		q9, q1
    153	vmov		q10, q2
    154	vmov		q11, q3
    155
    156	bl		chacha_permute		// permutes q0-q3 in place, consumes r3, clobbers ip and q4-q5
    157
    158	add		ip, r2, #0x20		// ip = input + 32
    159	vld1.8		{q4-q5}, [r2]		// q4-q7 = the 64 input bytes
    160	vld1.8		{q6-q7}, [ip]
    161
    162	// o0 = i0 ^ (x0 + s0)
    163	vadd.i32	q0, q0, q8
    164	veor		q0, q0, q4
    165
    166	// o1 = i1 ^ (x1 + s1)
    167	vadd.i32	q1, q1, q9
    168	veor		q1, q1, q5
    169
    170	// o2 = i2 ^ (x2 + s2)
    171	vadd.i32	q2, q2, q10
    172	veor		q2, q2, q6
    173
    174	// o3 = i3 ^ (x3 + s3)
    175	vadd.i32	q3, q3, q11
    176	veor		q3, q3, q7
    177
    178	add		ip, r1, #0x20		// ip = output + 32
    179	vst1.8		{q0-q1}, [r1]
    180	vst1.8		{q2-q3}, [ip]
    181
    182	pop		{pc}			// return: the saved lr is popped straight into pc
    183ENDPROC(chacha_block_xor_neon)
    184
    185ENTRY(hchacha_block_neon)
    186	// r0: Input state matrix, s
    187	// r1: output (8 32-bit words)
    188	// r2: nrounds
    189	push		{lr}			// the bl below overwrites lr
    190
    191	vld1.32		{q0-q1}, [r0]!		// load s0..7; r0 advances by 32 bytes
    192	vld1.32		{q2-q3}, [r0]		// load s8..15
    193
    194	mov		r3, r2			// chacha_permute takes the round count in r3
    195	bl		chacha_permute
    196
    197	vst1.32		{q0}, [r1]!		// output words 0-3 = permuted x0..3; r1 advances by 16 bytes
    198	vst1.32		{q3}, [r1]		// output words 4-7 = permuted x12..15; q1 and q2 are not stored, and the original state is not added back
    199
    200	pop		{pc}			// return: the saved lr is popped straight into pc
    201ENDPROC(hchacha_block_neon)
    202
    203	.align		4
    204.Lctrinc:	.word	0, 1, 2, 3	// per-block counter offsets added to x12 by chacha_4block_xor_neon; loaded with a :128 alignment hint, hence the .align 4 above
    205.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6	// vtbl byte indices that rotate each 32-bit word of a d register by 8 bits; sits 16 bytes after .Lctrinc, so the :64 loads of it are aligned
    206
    207	.align		5
    208ENTRY(chacha_4block_xor_neon)
    209	push		{r4, lr}		// two words pushed: the caller stack area now starts at sp + 8
    210	mov		r4, sp			// preserve the stack pointer
    211	sub		ip, sp, #0x20		// allocate a 32 byte buffer
    212	bic		ip, ip, #0x1f		// aligned to 32 bytes
    213	mov		sp, ip
    214
    215	// r0: Input state matrix, s
    216	// r1: 4 data blocks output, o
    217	// r2: 4 data blocks input, i
    218	// r3: nrounds  (a fifth, stacked argument holds the number of bytes; it is fetched by the ldr from [r4, #8] further down)
    219
    220	//
    221	// This function encrypts four consecutive ChaCha blocks by loading
    222	// the state matrix in NEON registers four times. The algorithm performs
    223	// each operation on the corresponding word of each state matrix, hence
    224	// requires no word shuffling. The words are re-interleaved before the
    225	// final addition of the original state and the XORing step.
    226	//
    227
    228	// x0..15[0-3] = s0..15[0-3]
    229	add		ip, r0, #0x20
    230	vld1.32		{q0-q1}, [r0]
    231	vld1.32		{q2-q3}, [ip]
    232
    233	adr		lr, .Lctrinc		// lr was saved on entry, so it is free as a scratch pointer
    234	vdup.32		q15, d7[1]
    235	vdup.32		q14, d7[0]
    236	vld1.32		{q4}, [lr, :128]	// q4 = {0, 1, 2, 3}; used by the vadd below before q4 is reused for x4
    237	vdup.32		q13, d6[1]
    238	vdup.32		q12, d6[0]
    239	vdup.32		q11, d5[1]
    240	vdup.32		q10, d5[0]
    241	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
    242	vdup.32		q9, d4[1]
    243	vdup.32		q8, d4[0]
    244	vdup.32		q7, d3[1]
    245	vdup.32		q6, d3[0]
    246	vdup.32		q5, d2[1]
    247	vdup.32		q4, d2[0]
    248	vdup.32		q3, d1[1]
    249	vdup.32		q2, d1[0]
    250	vdup.32		q1, d0[1]
    251	vdup.32		q0, d0[0]
    252
    253	adr		ip, .Lrol8_table	// ip = rol8 index table, loaded into d16 inside the loop
    254	b		1f			// first pass: x8..9 are still in q8-q9, so skip the reload
    255
    256.Ldoubleround4:
    257	vld1.32		{q8-q9}, [sp, :256]	// reload x8..9 from the stack buffer
    2581:
    259	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
    260	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
    261	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
    262	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
    263	vadd.i32	q0, q0, q4
    264	vadd.i32	q1, q1, q5
    265	vadd.i32	q2, q2, q6
    266	vadd.i32	q3, q3, q7
    267
    268	veor		q12, q12, q0
    269	veor		q13, q13, q1
    270	veor		q14, q14, q2
    271	veor		q15, q15, q3
    272
    273	vrev32.16	q12, q12
    274	vrev32.16	q13, q13
    275	vrev32.16	q14, q14
    276	vrev32.16	q15, q15
    277
    278	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
    279	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
    280	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
    281	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
    282	vadd.i32	q8, q8, q12
    283	vadd.i32	q9, q9, q13
    284	vadd.i32	q10, q10, q14
    285	vadd.i32	q11, q11, q15
    286
    287	vst1.32		{q8-q9}, [sp, :256]	// spill x8..9: q8-q9 double as temporaries below
    288
    289	veor		q8, q4, q8
    290	veor		q9, q5, q9
    291	vshl.u32	q4, q8, #12
    292	vshl.u32	q5, q9, #12
    293	vsri.u32	q4, q8, #20
    294	vsri.u32	q5, q9, #20
    295
    296	veor		q8, q6, q10
    297	veor		q9, q7, q11
    298	vshl.u32	q6, q8, #12
    299	vshl.u32	q7, q9, #12
    300	vsri.u32	q6, q8, #20
    301	vsri.u32	q7, q9, #20
    302
    303	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
    304	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
    305	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
    306	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
    307	vld1.8		{d16}, [ip, :64]	// d16 = rol8 indices (low half of q8, which is free here)
    308	vadd.i32	q0, q0, q4
    309	vadd.i32	q1, q1, q5
    310	vadd.i32	q2, q2, q6
    311	vadd.i32	q3, q3, q7
    312
    313	veor		q12, q12, q0
    314	veor		q13, q13, q1
    315	veor		q14, q14, q2
    316	veor		q15, q15, q3
    317
    318	vtbl.8		d24, {d24}, d16
    319	vtbl.8		d25, {d25}, d16
    320	vtbl.8		d26, {d26}, d16
    321	vtbl.8		d27, {d27}, d16
    322	vtbl.8		d28, {d28}, d16
    323	vtbl.8		d29, {d29}, d16
    324	vtbl.8		d30, {d30}, d16
    325	vtbl.8		d31, {d31}, d16
    326
    327	vld1.32		{q8-q9}, [sp, :256]	// restore x8..9
    328
    329	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
    330	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
    331	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
    332	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
    333	vadd.i32	q8, q8, q12
    334	vadd.i32	q9, q9, q13
    335	vadd.i32	q10, q10, q14
    336	vadd.i32	q11, q11, q15
    337
    338	vst1.32		{q8-q9}, [sp, :256]	// spill x8..9 again
    339
    340	veor		q8, q4, q8
    341	veor		q9, q5, q9
    342	vshl.u32	q4, q8, #7
    343	vshl.u32	q5, q9, #7
    344	vsri.u32	q4, q8, #25
    345	vsri.u32	q5, q9, #25
    346
    347	veor		q8, q6, q10
    348	veor		q9, q7, q11
    349	vshl.u32	q6, q8, #7
    350	vshl.u32	q7, q9, #7
    351	vsri.u32	q6, q8, #25
    352	vsri.u32	q7, q9, #25
    353
    354	vld1.32		{q8-q9}, [sp, :256]	// restore x8..9
    355
    356	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
    357	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
    358	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
    359	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
    360	vadd.i32	q0, q0, q5
    361	vadd.i32	q1, q1, q6
    362	vadd.i32	q2, q2, q7
    363	vadd.i32	q3, q3, q4
    364
    365	veor		q15, q15, q0
    366	veor		q12, q12, q1
    367	veor		q13, q13, q2
    368	veor		q14, q14, q3
    369
    370	vrev32.16	q15, q15
    371	vrev32.16	q12, q12
    372	vrev32.16	q13, q13
    373	vrev32.16	q14, q14
    374
    375	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
    376	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
    377	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
    378	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
    379	vadd.i32	q10, q10, q15
    380	vadd.i32	q11, q11, q12
    381	vadd.i32	q8, q8, q13
    382	vadd.i32	q9, q9, q14
    383
    384	vst1.32		{q8-q9}, [sp, :256]	// spill x8..9
    385
    386	veor		q8, q7, q8
    387	veor		q9, q4, q9
    388	vshl.u32	q7, q8, #12
    389	vshl.u32	q4, q9, #12
    390	vsri.u32	q7, q8, #20
    391	vsri.u32	q4, q9, #20
    392
    393	veor		q8, q5, q10
    394	veor		q9, q6, q11
    395	vshl.u32	q5, q8, #12
    396	vshl.u32	q6, q9, #12
    397	vsri.u32	q5, q8, #20
    398	vsri.u32	q6, q9, #20
    399
    400	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
    401	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
    402	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
    403	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
    404	vld1.8		{d16}, [ip, :64]	// d16 = rol8 indices again (q8 held a temporary)
    405	vadd.i32	q0, q0, q5
    406	vadd.i32	q1, q1, q6
    407	vadd.i32	q2, q2, q7
    408	vadd.i32	q3, q3, q4
    409
    410	veor		q15, q15, q0
    411	veor		q12, q12, q1
    412	veor		q13, q13, q2
    413	veor		q14, q14, q3
    414
    415	vtbl.8		d30, {d30}, d16
    416	vtbl.8		d31, {d31}, d16
    417	vtbl.8		d24, {d24}, d16
    418	vtbl.8		d25, {d25}, d16
    419	vtbl.8		d26, {d26}, d16
    420	vtbl.8		d27, {d27}, d16
    421	vtbl.8		d28, {d28}, d16
    422	vtbl.8		d29, {d29}, d16
    423
    424	vld1.32		{q8-q9}, [sp, :256]	// restore x8..9
    425
    426	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
    427	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
    428	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
    429	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
    430	vadd.i32	q10, q10, q15
    431	vadd.i32	q11, q11, q12
    432	vadd.i32	q8, q8, q13
    433	vadd.i32	q9, q9, q14
    434
    435	vst1.32		{q8-q9}, [sp, :256]	// x8..9 stay on the stack until the next reload
    436
    437	veor		q8, q7, q8
    438	veor		q9, q4, q9
    439	vshl.u32	q7, q8, #7
    440	vshl.u32	q4, q9, #7
    441	vsri.u32	q7, q8, #25
    442	vsri.u32	q4, q9, #25
    443
    444	veor		q8, q5, q10
    445	veor		q9, q6, q11
    446	vshl.u32	q5, q8, #7
    447	vshl.u32	q6, q9, #7
    448	vsri.u32	q5, q8, #25
    449	vsri.u32	q6, q9, #25
    450
    451	subs		r3, r3, #2		// two rounds were done in this iteration
    452	bne		.Ldoubleround4		// loop until the round count reaches exactly zero
    453
    454	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
    455	// x8..9[0-3] are on the stack.
    456
    457	// Re-interleave the words in the first two rows of each block (x0..7).
    458	// Also add the counter values 0-3 to x12[0-3].
    459	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
    460	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
    461	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
    462	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
    463	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
    464	  vadd.u32	q12, q8			// x12 += counter values 0-3
    465	vswp		d1, d4
    466	vswp		d3, d6
    467	  vld1.32	{q8-q9}, [r0]!		// load s0..7
    468	vswp		d9, d12
    469	vswp		d11, d14
    470
    471	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
    472	// after XORing the first 32 bytes.
    473	vswp		q1, q4
    474
    475	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
    476
    477	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
    478	vadd.u32	q0, q0, q8
    479	vadd.u32	q2, q2, q8
    480	vadd.u32	q4, q4, q8
    481	vadd.u32	q3, q3, q8
    482
    483	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
    484	vadd.u32	q1, q1, q9
    485	vadd.u32	q6, q6, q9
    486	vadd.u32	q5, q5, q9
    487	vadd.u32	q7, q7, q9
    488
    489	// XOR first 32 bytes using keystream from first two rows of first block
    490	vld1.8		{q8-q9}, [r2]!
    491	veor		q8, q8, q0
    492	veor		q9, q9, q1
    493	vst1.8		{q8-q9}, [r1]!
    494
    495	// Re-interleave the words in the last two rows of each block (x8..15).
    496	vld1.32		{q8-q9}, [sp, :256]
    497	  mov		sp, r4		// restore original stack pointer
    498	  ldr		r4, [r4, #8]	// load number of bytes
    499	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
    500	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
    501	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
    502	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
    503	  vld1.32	{q0-q1}, [r0]	// load s8..15
    504	vswp		d25, d28
    505	vswp		d27, d30
    506	vswp		d17, d20
    507	vswp		d19, d22
    508
    509	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
    510
    511	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
    512	vadd.u32	q8,  q8,  q0
    513	vadd.u32	q10, q10, q0
    514	vadd.u32	q9,  q9,  q0
    515	vadd.u32	q11, q11, q0
    516
    517	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
    518	vadd.u32	q12, q12, q1
    519	vadd.u32	q14, q14, q1
    520	vadd.u32	q13, q13, q1
    521	vadd.u32	q15, q15, q1
    522
    523	// XOR the rest of the data with the keystream
    524
    525	vld1.8		{q0-q1}, [r2]!
    526	subs		r4, r4, #96		// r4 = number of bytes - 96
    527	veor		q0, q0, q8
    528	veor		q1, q1, q12
    529	ble		.Lle96			// at most 96 bytes in total: q0-q1 is held back and stored by the tail code
    530	vst1.8		{q0-q1}, [r1]!
    531
    532	vld1.8		{q0-q1}, [r2]!
    533	subs		r4, r4, #32		// r4 = number of bytes - 128
    534	veor		q0, q0, q2
    535	veor		q1, q1, q6
    536	ble		.Lle128
    537	vst1.8		{q0-q1}, [r1]!
    538
    539	vld1.8		{q0-q1}, [r2]!
    540	subs		r4, r4, #32		// r4 = number of bytes - 160
    541	veor		q0, q0, q10
    542	veor		q1, q1, q14
    543	ble		.Lle160
    544	vst1.8		{q0-q1}, [r1]!
    545
    546	vld1.8		{q0-q1}, [r2]!
    547	subs		r4, r4, #32		// r4 = number of bytes - 192
    548	veor		q0, q0, q4
    549	veor		q1, q1, q5
    550	ble		.Lle192
    551	vst1.8		{q0-q1}, [r1]!
    552
    553	vld1.8		{q0-q1}, [r2]!
    554	subs		r4, r4, #32		// r4 = number of bytes - 224
    555	veor		q0, q0, q9
    556	veor		q1, q1, q13
    557	ble		.Lle224
    558	vst1.8		{q0-q1}, [r1]!
    559
    560	vld1.8		{q0-q1}, [r2]!
    561	subs		r4, r4, #32		// r4 = number of bytes - 256
    562	veor		q0, q0, q3
    563	veor		q1, q1, q7
    564	blt		.Llt256			// fewer than 256 bytes: the last 32-byte chunk is partial
    565.Lout:
    566	vst1.8		{q0-q1}, [r1]!		// store the pending chunk held in q0-q1
    567
    568	vld1.8		{q0-q1}, [r2]		// final full 32-byte chunk, keystream in q11/q15
    569	veor		q0, q0, q11
    570	veor		q1, q1, q15
    571	vst1.8		{q0-q1}, [r1]
    572
    573	pop		{r4, pc}		// sp was already restored from r4 above
    574
    575.Lle192:
    576	vmov		q4, q9			// q4-q5 = keystream for the chunk after the pending one
    577	vmov		q5, q13
    578
    579.Lle160:
    580	// nothing to do (the keystream for the next chunk is already in q4-q5)
    581
    582.Lfinalblock:
    583	// Process the final block if processing less than 4 full blocks.
    584	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
    585	// previous 32 byte output block that still needs to be written at
    586	// [r1] in q0-q1.
    587	beq		.Lfullblock		// flags still come from the last subs: zero means the final chunk is a full 32 bytes
    588
    589.Lpartialblock:
    590	adr		lr, .Lpermute + 32	// start of the second copy of the index table
    591	add		r2, r2, r4		// r4 < 0 here: step the input pointer back so the 32-byte load ends at the last data byte
    592	add		lr, lr, r4		// same negative offset into the index table
    593	add		r4, r4, r1
    594
    595	vld1.8		{q2-q3}, [lr]		// q2-q3 = rotated byte indices for vtbl
    596	vld1.8		{q6-q7}, [r2]		// q6-q7 = last 32 bytes of input
    597
    598	add		r4, r4, #32		// r4 = r1 + 32 + (negative remainder) = address of the last 32 output bytes
    599
    600	vtbl.8		d4, {q4-q5}, d4		// pick keystream bytes from q4-q5 to line up with the shifted window
    601	vtbl.8		d5, {q4-q5}, d5
    602	vtbl.8		d6, {q4-q5}, d6
    603	vtbl.8		d7, {q4-q5}, d7
    604
    605	veor		q6, q6, q2
    606	veor		q7, q7, q3
    607
    608	vst1.8		{q6-q7}, [r4]	// overlapping stores
    609	vst1.8		{q0-q1}, [r1]	// the held-back chunk is written last, so it wins in the overlapping region
    610	pop		{r4, pc}
    611
    612.Lfullblock:
    613	vmov		q11, q4			// .Lout expects the final keystream chunk in q11/q15
    614	vmov		q15, q5
    615	b		.Lout
    616.Lle96:
    617	vmov		q4, q2			// q4-q5 = keystream for the chunk after the pending one
    618	vmov		q5, q6
    619	b		.Lfinalblock
    620.Lle128:
    621	vmov		q4, q10
    622	vmov		q5, q14
    623	b		.Lfinalblock
    624.Lle224:
    625	vmov		q4, q3
    626	vmov		q5, q7
    627	b		.Lfinalblock
    628.Llt256:
    629	vmov		q4, q11
    630	vmov		q5, q15
    631	b		.Lpartialblock		// reached via blt, so the remainder is known to be nonzero
    632ENDPROC(chacha_4block_xor_neon)
    633
    634	.align		L1_CACHE_SHIFT
    635.Lpermute:	// byte indices 0x00..0x1f stored twice; the partial-block path of chacha_4block_xor_neon loads 32 bytes from .Lpermute + 32 + r4 (r4 negative) to get a rotated vtbl index vector
    636	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
    637	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
    638	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
    639	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
    640	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07	// second copy starts here (.Lpermute + 32)
    641	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
    642	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
    643	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f