cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

chacha-core.S (10173B)


/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32

#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3
/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * It must be incremented every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used to handle the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

#define IS_UNALIGNED	$s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define	CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
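
/* On big-endian, CPU_TO_LE32 is a full 32-bit byte swap: wsbh swaps
 * the bytes within each halfword, and rotr 16 then swaps the two
 * halfwords. A rough C equivalent (an illustrative sketch, not part
 * of this file's build):
 *
 *	static inline uint32_t cpu_to_le32_sketch(uint32_t n)
 *	{
 *		return (n >> 24) | ((n >> 8) & 0x0000ff00) |
 *		       ((n << 8) & 0x00ff0000) | (n << 24);
 *	}
 *
 * On little-endian the macro expands to nothing.
 */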

#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)
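/* Example expansion (illustrative): for x = 12,
 * CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(12), _b) expands
 * PLUS_ONE(12) to PLUS_ONE_12 and then to 13 before pasting, giving
 * the label .Lchacha_mips_xor_aligned_13_b. The CONCAT3 indirection
 * is what forces the argument to expand before ## is applied.
 */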

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);
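
/* Per word, both store macros implement the ChaCha feed-forward and
 * output XOR. A rough C equivalent for word i (an illustrative
 * sketch; "counter" stands for the live block counter in NONCE_0):
 *
 *	x[i] += (i == 12) ? counter : state[i];
 *	out[i] = cpu_to_le32(x[i]) ^ in[i];
 *
 * Word 12 uses the register copy of the counter because the value
 * in memory is deliberately left untouched.
 */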

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn.
 * Every jump table entry must be equal in size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S;    \
	rotl	X(W), S;    \
	rotl	X(Y), S;    \
	rotl	X(Z), S;
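
/* One AXR invocation is a single add/xor/rotate step applied to four
 * lanes at once, so four invocations make up the four column (or
 * diagonal) quarter-rounds of a ChaCha double round. Per lane, a
 * rough C equivalent (an illustrative sketch):
 *
 *	#define ROL32(v, s) (((v) << (s)) | ((v) >> (32 - (s))))
 *	// AXR(a,..., k,..., v,..., s) does, for each lane:
 *	x[a] += x[k];
 *	x[v] = ROL32(x[v] ^ x[a], s);
 *
 * With the rotation amounts 16, 12, 8, 7 this is the standard
 * quarterround(a, b, c, d) sequence.
 */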

.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return if BYTES == 0. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test whether IN or OUT is unaligned:
	 * IS_UNALIGNED = (IN | OUT) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* If src/dst is unaligned, branch to the unaligned path. */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set the number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0 means there is no full block left. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Placed here to fill the delay slot. */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle the last bytes. */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to its location in the state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of whole words, as a byte count */
	andi	$at, BYTES, MASK_U32

	/* Load the upper half of the jump table address */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate the lower-half jump table offset */
	ins	T0, $at, 1, 6

	/* Add the offset to STATE */
	addu	T1, STATE, $at

	/* Add the lower half of the jump table address */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read the value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte counter as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)
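
/* Dispatch math, worked through (illustrative): suppose 13 bytes
 * remain, so BYTES = 13 and $at = 13 & MASK_U32 = 12, i.e. three
 * whole words. Each table entry above occupies exactly 8 bytes (a
 * branch plus its delay-slot addu under .set noreorder), and
 * "ins T0, $at, 1, 6" writes $at << 1 = 24 into T0's low bits,
 * selecting entry 3. subu then leaves BYTES = 12 - 13 = -1, marking
 * one trailing byte for .Lchacha_mips_xor_bytes.
 */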


.Loop_chacha_unaligned:
	/* Set the number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0 means there is no full block left. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to its location in the state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Placed here to fill the delay slot */
	addiu	NONCE_0, 1
	.set reorder

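/* Tail handling below, roughly (an illustrative C sketch): SAVED_X
 * holds the keystream word covering the trailing bytes, normalized
 * so its least significant byte is consumed first, and BYTES is the
 * negative remainder. For r = -BYTES bytes (1 to 3):
 *
 *	uint32_t ks = saved_x;
 *	for (int i = 0; i < r; i++, ks >>= 8)
 *		out[i] = in[i] ^ (uint8_t)ks;
 */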
.Lchacha_mips_xor_bytes:
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of whole words, as a byte count */
	andi	$at, BYTES, MASK_U32

	/* Load the upper half of the jump table address */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate the lower-half jump table offset */
	ins	T0, $at, 1, 6

	/* Add the offset to STATE */
	addu	T1, STATE, $at

	/* Add the lower half of the jump table address */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read the value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte counter as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
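/* The offset computation here is identical to the aligned dispatch;
 * see the worked example after the aligned jump table above.
 */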
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at
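
/* hchacha output selection, roughly (an illustrative C sketch):
 * unlike the full block function there is no feed-forward add; the
 * result is just words 0..3 and 12..15 of the permuted state:
 *
 *	for (int i = 0; i < 4; i++) {
 *		out[i]     = x[i];
 *		out[i + 4] = x[i + 12];
 *	}
 */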