cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

blake2b-neon-core.S (10122B)


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction.  This is the most efficient way to implement these
	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
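
These tables work because vtbl.8 builds each result byte by indexing into the source register, and byte i of a 64-bit lane holds bits 8i..8i+7: picking source byte (i + 3) mod 8 for result byte i is exactly ror64(x, 24), and (i + 2) mod 8 gives ror64(x, 16). A quick host-side C check of both tables (an illustration, not part of this file; the byte view is modelled with a little-endian memcpy):

#include <stdint.h>
#include <string.h>
#include <assert.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/* Apply a vtbl.8-style byte permutation to one 64-bit lane. */
static uint64_t vtbl8_lane(uint64_t x, const uint8_t table[8])
{
	uint8_t src[8], dst[8];
	uint64_t out;
	int i;

	memcpy(src, &x, 8);		/* byte 0 = least significant byte */
	for (i = 0; i < 8; i++)
		dst[i] = src[table[i]];
	memcpy(&out, dst, 8);
	return out;
}

int main(void)
{
	static const uint8_t ror24_table[8] = { 3, 4, 5, 6, 7, 0, 1, 2 };
	static const uint8_t ror16_table[8] = { 2, 3, 4, 5, 6, 7, 0, 1 };
	uint64_t x = 0x0123456789abcdefULL;

	assert(vtbl8_lane(x, ror24_table) == ror64(x, 24));
	assert(vtbl8_lane(x, ror16_table) == ror64(x, 16));
	return 0;
}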
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers.  The macro arguments s0-s15 give the order in which the message
// words are used in this round.  'final' is 1 if this is the final round.
.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm
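
Semantically, each expansion of _blake2b_round is one full BLAKE2b round: the standard G function applied to the four columns of v, then to the four diagonals, with s0-s15 supplying that round's sigma permutation of the message words. A C reference for comparison (the helper names blake2b_g and blake2b_round_ref are illustrative, not kernel identifiers; ror64 is as defined in the sketch above):

#include <stdint.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

static void blake2b_g(uint64_t v[16], int a, int b, int c, int d,
		      uint64_t x, uint64_t y)
{
	v[a] += v[b] + x;
	v[d] = ror64(v[d] ^ v[a], 32);
	v[c] += v[d];
	v[b] = ror64(v[b] ^ v[c], 24);
	v[a] += v[b] + y;
	v[d] = ror64(v[d] ^ v[a], 16);
	v[c] += v[d];
	v[b] = ror64(v[b] ^ v[c], 63);
}

static void blake2b_round_ref(uint64_t v[16], const uint64_t m[16],
			      const unsigned char s[16])
{
	/* Mix the columns, as in the first half of the macro. */
	blake2b_g(v, 0, 4,  8, 12, m[s[0]],  m[s[1]]);
	blake2b_g(v, 1, 5,  9, 13, m[s[2]],  m[s[3]]);
	blake2b_g(v, 2, 6, 10, 14, m[s[4]],  m[s[5]]);
	blake2b_g(v, 3, 7, 11, 15, m[s[6]],  m[s[7]]);
	/* Mix the diagonals, as in the second half. */
	blake2b_g(v, 0, 5, 10, 15, m[s[8]],  m[s[9]]);
	blake2b_g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
	blake2b_g(v, 2, 7,  8, 13, m[s[12]], m[s[13]]);
	blake2b_g(v, 3, 4,  9, 14, m[s[14]], m[s[15]]);
}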

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
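As a sketch of the caller's view: the first three fields follow the layout documented above, and typical glue code passes the 128-byte block size as 'inc' so the counter advances by one block per compressed block. The trailing struct members and the wrapper below are assumptions for illustration, not taken from this file:

#include <linux/types.h>

struct blake2b_state {
	u64 h[8];		/* chaining value (inout) */
	u64 t[2];		/* 128-bit byte counter (inout) */
	u64 f[2];		/* finalization flags (in) */
	u8 buf[128];		/* assumed: partial-block buffer, unused here */
	unsigned int buflen;	/* assumed: unused here */
};

void blake2b_compress_neon(struct blake2b_state *state,
			   const u8 *block, size_t nblocks, u32 inc);

/* Hypothetical glue-code call: hash nblocks full 128-byte blocks. */
static void compress_blocks(struct blake2b_state *state,
			    const u8 *data, size_t nblocks)
{
	blake2b_compress_neon(state, data, nblocks, 128);
}
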
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31	// Clear low 5 bits to round down to a 32-byte boundary
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	  adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	  adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-q15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [BLOCK]!
	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	  mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	  mov		ip, STATE
	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)
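
In C terms, the fast path at .Lnext_block plus the .Lslow_inc_ctr fallback together add 'inc' into the 128-bit block counter t[0..1]. A minimal sketch of the net effect (the helper name is illustrative; the assembly branches on carry out of the low 32 bits, but the end result is the same):

#include <stdint.h>

static void blake2b_increment_counter(uint64_t t[2], uint32_t inc)
{
	t[0] += inc;
	if (t[0] < inc)		/* carry out of the low 64 bits */
		t[1]++;
}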