cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

chacha-scalar-core.S (10472B)


/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
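/*
 * For example, the quarterround's "d ^= a; d = rol(d, 16)" step is written
 * below as "eor d, a, d, ror #drot": the pending right-rotation of 'd' is
 * applied by the barrel shifter at the moment 'd' is consumed, and the
 * result is left un-rotated, so 'drot' becomes 16 (= 32 - 16) for the next
 * use of 'd'.
 */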

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14
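/*
 * Byte-swap the four given words when building big-endian, so the keystream
 * is always produced in ChaCha's little-endian byte order.  A no-op on
 * little-endian builds.
 */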
.macro _le32_bswap_4x	a, b, c, d,  tmp
#ifdef __ARMEB__
	rev_l		\a,  \tmp
	rev_l		\b,  \tmp
	rev_l		\c,  \tmp
	rev_l		\d,  \tmp
#endif
.endm
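/*
 * Load/store a pair of adjacent words: a single ldrd/strd on ARMv6 and
 * later, two plain ldr/str otherwise.
 */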
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm
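/*
 * One half of a ChaCha round: two independent quarterrounds,
 * (a1, b1, c1, d1) and (a2, b2, c2, d2), interleaved so their dependency
 * chains can overlap.
 */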
.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	 _doubleround
	.endr
.endm
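/*
 * Generate 64-byte keystream blocks and XOR them with the data: permute the
 * state, add back the original state (the ChaCha feed-forward), byte-swap on
 * big-endian, then XOR with the input.  A fast path handles full, word-aligned
 * 64-byte blocks directly; everything else goes through a keystream buffer on
 * the stack.
 */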
.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds
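	// Drop the two scratch words (the x8-x9 spill slot); x10-x11 stay
	// just below the saved state.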
	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]		// OUT
	ldr		r12, [sp, #100]		// IN
	ldr		r11, [sp, #104]		// LEN

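	// r10 := OUT | IN, so a single test covers the alignment of both.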
	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
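	// Each four-word group below follows the same pattern: add the original
	// state words (feed-forward), byte-swap the sums on big-endian so the
	// keystream stays little-endian, then load four words of input, XOR,
	// and store four words of output.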
	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	  ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	  subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

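	// LEN was exactly 64: nothing left to process.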
	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	  mov		r14, sp			// r14 := &x0 of the saved state

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	  sub		sp, #16			// reallocate (unused0-unused1, x10-x11) slots

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
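	// Same feed-forward and byte-swap pattern as the fast path, but the
	// keystream words are stored to the stack buffer instead of being
	// XORed with the data directly.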
	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	  add		r8, sp, #64		// r8 := &saved (x8-x9,x12-x15,x10-x11)
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	  mov		r0, sp			// r0 := &ks0 (keystream to XOR from)
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

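	// Before ARMv6, unaligned ldr/str cannot be relied on, so misaligned
	// buffers must take the byte-at-a-time loop; on ARMv6+ the word loop
	// below handles unaligned IN/OUT too.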
.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64			// r9 := LEN - 64
	add		sp, #96			// free keystream buffer + 8 spare words
	bgt		.Lprepare_for_next_block\@	// more data remains

.Ldone\@:
.endm	// _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 */
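// Arguments per the AAPCS: r0 = dst, r1 = src, r2 = bytes, r3 = state;
// the fifth argument, nrounds, is passed on the stack.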
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]		// ip := nrounds (5th argument)
	cmp		ip, #12			// ChaCha12? (flags consumed by 'beq 1f' below)

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48		// X12 := &state[12]
	ldm		X12, {X12,X13,X14,X15}	// load x12-x15
	push		{X12,X13,X14,X15}	// save x12-x15 at the top of the state area
	sub		sp, sp, #64		// room for (unused0-unused1, x10-x11 copy, x0-x11)

	__ldrd		X8_X10, X9_X11, r3, 40	// load x10-x11
	__strd		X8_X10, X9_X11, sp, 8	// the extra copy, just below the state
	__strd		X8_X10, X9_X11, sp, 56	// x10-x11 in their state slots
	ldm		r3, {X0-X9_X11}		// load x0-x9
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f			// nrounds == 12?
	_chacha		20

0:	add		sp, #76			// skip x0-x15 and (OUT, IN, LEN)
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
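// HChaCha: run only the permutation (no feed-forward addition) and return
// words 0-3 and 12-15 of the final state.  r0 = state, r1 = out, r2 = nrounds.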
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0			// r14 := state
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8			// make the x8-x9 spill slot below x10-x11

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	  pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)