cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

chacha-neon-core.S (19011B)


/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
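/*
 * For reference, one ChaCha double round in the C-style notation used in
 * the comments below (illustration only, not part of the build):
 *
 *	a += b; d = rotl32(d ^ a, 16);
 *	c += d; b = rotl32(b ^ c, 12);
 *	a += b; d = rotl32(d ^ a, 8);
 *	c += d; b = rotl32(b ^ c, 7);
 *
 * The quarter round is applied to the four columns of the 4x4 state in
 * parallel, and then, after the ext-based word rotations, to the four
 * diagonals.
 */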
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds
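	// (From C, the accompanying glue code declares this routine roughly as
	//	void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
	//				   int nrounds);
	// exact types and qualifiers depend on the kernel version.  Note that
	// the state at x0 is only read, never written back.)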

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)

SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds
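	// (Only words 0-3 and 12-15 of the permuted state are written out,
	// and no feed-forward addition of the input state is performed: this
	// is the HChaCha construction used for XChaCha subkey derivation.)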

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)

	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
SYM_FUNC_START(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. For the final XOR step we
	// transpose the matrix by interleaving 32- and then 64-bit words,
	// which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
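	// Data layout sketch: after the ld4r loads below, each vN holds state
	// word N replicated across its four 32-bit lanes, i.e. lane i of vN
	// belongs to NEON block i, while a0-a15 hold the words of the scalar
	// block.  Until the counter offsets from CTRINC are applied, all five
	// blocks are identical.
	//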
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s
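	// (The scalar copies above were taken before this add, so the scalar
	// block keeps the original block counter while the four NEON lanes
	// get counter offsets +1..+4.)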

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	  ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	  ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	  ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	  ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	  ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	  ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	  ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	  ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	  mov		w6, v16.s[0]
	  mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	  mov		w8, v18.s[0]
	  mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	  add		a0, a0, w6
	  add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	  add		a2, a2, w8
	  add		a3, a3, w9
CPU_BE(	  rev		a0, a0		)
CPU_BE(	  rev		a1, a1		)
CPU_BE(	  rev		a2, a2		)
CPU_BE(	  rev		a3, a3		)

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	  mov		w6, v20.s[0]
	  mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	  mov		w8, v22.s[0]
	  mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	  add		a4, a4, w6
	  add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	  add		a6, a6, w8
	  add		a7, a7, w9
CPU_BE(	  rev		a4, a4		)
CPU_BE(	  rev		a5, a5		)
CPU_BE(	  rev		a6, a6		)
CPU_BE(	  rev		a7, a7		)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	  mov		w6, v24.s[0]
	  mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	  mov		w8, v26.s[0]
	  mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	  add		a8, a8, w6
	  add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	  add		a10, a10, w8
	  add		a11, a11, w9
CPU_BE(	  rev		a8, a8		)
CPU_BE(	  rev		a9, a9		)
CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	  mov		w6, v28.s[0]
	  mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	  mov		w8, v30.s[0]
	  mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	  add		a12, a12, w6
	  add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	  add		a14, a14, w8
	  add		a15, a15, w9
CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	  ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	  ldp		w8, w9, [x2, #-56]
	  eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	  eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	  eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	  eor		a3, a3, w9
	  ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	  ldp		w8, w9, [x2, #-40]
	  eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	  eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	  eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	  eor		a7, a7, w9
	  ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	  ldp		w8, w9, [x2, #-24]
	  eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	  eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	  eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	  eor		a11, a11, w9
	  ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	  ldp		w8, w9, [x2, #-8]
	  eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	  eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	  eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	  eor		a15, a15, w9

	add		x3, x2, x4
	sub		x3, x3, #128		// start of last block

	subs		x5, x4, #128
	csel		x2, x2, x3, ge
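	// If fewer than 128 bytes were requested, rewind x2 to the start of
	// the last 64 input bytes so that the vector loads below cannot read
	// past the end of the buffer; the same csel pattern repeats at the
	// 192, 256 and 320 byte marks, matched by the overlapping stores in
	// the .Lt* tail paths.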

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	  stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	  stp		a2, a3, [x1, #-56]

	subs		x6, x4, #192
	ld1		{v16.16b-v19.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	  stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	  stp		a6, a7, [x1, #-40]

	subs		x7, x4, #256
	ld1		{v20.16b-v23.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	  stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	  stp		a10, a11, [x1, #-24]

	subs		x8, x4, #320
	ld1		{v24.16b-v27.16b}, [x2], #64
	csel		x2, x2, x3, ge

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	  stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	  stp		a14, a15, [x1, #-8]

	tbnz		x5, #63, .Lt128
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b

	tbnz		x6, #63, .Lt192

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b

	st1		{v16.16b-v19.16b}, [x1], #64
	tbnz		x7, #63, .Lt256

	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b

	st1		{v20.16b-v23.16b}, [x1], #64
	tbnz		x8, #63, .Lt320

	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b

	st1		{v24.16b-v27.16b}, [x1], #64
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

	// fewer than 192 bytes of in/output
.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b

0:	eor		v20.16b, v20.16b, v28.16b
	eor		v21.16b, v21.16b, v29.16b
	eor		v22.16b, v22.16b, v30.16b
	eor		v23.16b, v23.16b, v31.16b
	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
1:	st1		{v16.16b-v19.16b}, [x1]
	b		.Lout

	// fewer than 128 bytes of in/output
.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
	add		x5, x5, x1
	sub		x1, x1, #64
	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
	b		0b

	// fewer than 256 bytes of in/output
.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x6, x6, x1
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
2:	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
	ld1		{v4.16b-v7.16b}, [x10]
	add		x7, x7, x1
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
3:	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout
SYM_FUNC_END(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
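// The table below holds the byte values -64..63.  chacha_4block_xor_neon
// points x10 at .Lpermute + (byte count % 64) and feeds the 64 bytes found
// there to tbl as indices; the leading out-of-range (negative) byte values
// select zero, which shifts the keystream towards the end of a 64-byte
// window so that it lines up with the final partial block handled by the
// overlapping stores above.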
.Lpermute:
	.set		.Li, 0
	.rept		128
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

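// CTRINC holds the per-block counter offsets of the four NEON blocks (the
// scalar block keeps offset 0).  ROT8 is the tbl index vector used for the
// rotate-left-by-8 steps: within each 32-bit word it selects source bytes
// { 3, 0, 1, 2 }.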
CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f