ev6-memset.S - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
ev6-memset.S (16394B)
      1/* SPDX-License-Identifier: GPL-2.0 */
      2/*
      3 * arch/alpha/lib/ev6-memset.S
      4 *
      5 * This is an efficient (and relatively small) implementation of the C library
      6 * "memset()" function for the 21264 implementation of Alpha.
      7 *
      8 * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
      9 *
     10 * Much of the information about 21264 scheduling/coding comes from:
     11 *	Compiler Writer's Guide for the Alpha 21264
     12 *	abbreviated as 'CWG' in other comments here
     13 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
     14 * Scheduling notation:
     15 *	E	- either cluster
     16 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
     17 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
     18 * The algorithm for the leading and trailing quadwords remains the same,
     19 * however the loop has been unrolled to enable better memory throughput,
     20 * and the code has been replicated for each of the entry points: __memset
     21 * and __memset16 to permit better scheduling to eliminate the stalling
     22 * encountered during the mask replication.
     23 * A future enhancement might be to put in a byte store loop for really
     24 * small (say < 32 bytes) memset()s.  Whether or not that change would be
     25 * a win in the kernel would depend upon the contextual usage.
     26 * WARNING: Maintaining this is going to be more work than the above version,
     27 * as fixes will need to be made in multiple places.  The performance gain
     28 * is worth it.
     29 */
     30#include <asm/export.h>
     31	.set noat
     32	.set noreorder
     33.text
     34	.globl memset
     35	.globl __memset
     36	.globl ___memset
     37	.globl __memset16
     38	.globl __constant_c_memset
     39
     40	.ent ___memset
     41.align 5
     42___memset:
     43	.frame $30,0,$26,0
     44	.prologue 0
     45
     46	/*
	 * ___memset: fill $18 bytes starting at $16 with the low byte of $17.
	 *
	 * On entry:
	 *	$16	destination address
	 *	$17	fill value; only the low 8 bits are used
	 *	$18	byte count; a count <= 0 stores nothing
	 * On exit:
	 *	$0	the original destination address
	 * $1-$7 are used as scratch; $16, $17 and $18 may be overwritten.
	 *
     47	 * Serious stalling happens.  The only way to mitigate this is to
     48	 * undertake a major re-write to interleave the constant materialization
     49	 * with other parts of the fall-through code.  This is important, even
     50	 * though it makes maintenance tougher.
     51	 * Do this later.
     52	 */
     53	and $17,255,$1		# E : 00000000000000ch
     54	insbl $17,1,$2		# U : 000000000000ch00
     55	bis $16,$16,$0		# E : return value
     56	ble $18,end_b		# U : count <= 0: nothing to store
     57
     58	addq $18,$16,$6		# E : end address (dest + count), one past last byte
     59	bis	$1,$2,$17	# E : 000000000000chch
     60	insbl	$1,2,$3		# U : 0000000000ch0000
     61	insbl	$1,3,$4		# U : 00000000ch000000
     62
     63	or	$3,$4,$3	# E : 00000000chch0000
     64	inswl	$17,4,$5	# U : 0000chch00000000
     65	xor	$16,$6,$1	# E : will complete write be within one quadword?
     66	inswl	$17,6,$2	# U : chch000000000000
     67
     68	or	$17,$3,$17	# E : 00000000chchchch
     69	or	$2,$5,$2	# E : chchchch00000000
     70	bic	$1,7,$1		# E : fit within a single quadword?
     71	and	$16,7,$3	# E : Target addr misalignment
     72
     73	or	$17,$2,$17	# E : chchchchchchchch
     74	beq	$1,within_quad_b # U : start and end address share one quadword
     75	nop			# E :
     76	beq	$3,aligned_b	# U : target is 0mod8
     77
     78	/*
     79	 * Target address is misaligned, and won't fit within a quadword
     80	 */
     81	ldq_u $4,0($16)		# L : Fetch first partial
     82	bis $16,$16,$5		# E : Save the address
     83	insql $17,$16,$2	# U : Insert new bytes
     84	subq $3,8,$3		# E : $3 = misalignment - 8 (negative)
     85
     86	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
     87	mskql $4,$16,$4		# U : clear relevant parts of the quad
     88	subq $16,$3,$16		# E : $16 is new aligned destination
     89	bis $2,$4,$1		# E : Final bytes
     90
     91	nop
     92	stq_u $1,0($5)		# L : Store result
     93	nop
     94	nop
     95
     96.align 4
     97aligned_b:
     98	/*
     99	 * We are now guaranteed to be quad aligned, with at least
    100	 * one partial quad to write.
    101	 */
    102
    103	sra $18,3,$3		# U : Number of remaining quads to write
    104	and $18,7,$18		# E : Number of trailing bytes to write
    105	bis $16,$16,$5		# E : Save dest address
    106	beq $3,no_quad_b	# U : tail stuff only
    107
    108	/*
    109	 * it's worth the effort to unroll this and use wh64 if possible
    110	 * Lifted a bunch of code from clear_user.S
    111	 * At this point, entry values are:
    112	 * $16	Current destination address
    113	 * $5	A copy of $16
    114	 * $6	End address (original destination + original count)
    115	 * $18	Number trailer bytes
    116	 * $3	Number quads to write
    117	 */
    118
    119	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
    120	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
    121	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
    122	blt	$4, loop_b	# U : fewer than 16 quads: use the simple loop
    123
    124	/*
    125	 * We know we've got at least 16 quads, minimum of one trip
    126	 * through unrolled loop.  Do a quad at a time to get us 0mod64
    127	 * aligned.
	 *
	 * NOTE(review): $1 is (dest & 0x3f) - 0x40 with dest quad aligned,
	 * so it lies in -64..-8 and the beq below looks never taken; a
	 * destination that is already 0mod64 makes eight trips through
	 * $alignmod64_b.  That loop is also what loads $4 with the address
	 * used by the first wh64.
    128	 */
    129
    130	nop			# E :
    131	nop			# E :
    132	nop			# E :
    133	beq	$1, $bigalign_b	# U :
    134
    135$alignmod64_b:
    136	stq	$17, 0($5)	# L :
    137	subq	$3, 1, $3	# E : For consistency later
    138	addq	$1, 8, $1	# E : Increment towards zero for alignment
    139	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
    140
    141	nop
    142	nop
    143	addq	$5, 8, $5	# E : Inc address
    144	blt	$1, $alignmod64_b # U : repeat until the bias counter reaches zero
    145
    146$bigalign_b:
    147	/*
    148	 * $3 - number quads left to go
    149	 * $5 - target address (aligned 0mod64)
    150	 * $17 - mask of stuff to store
    151	 * Scratch registers available: $7, $2, $4, $1
    152	 * we know that we'll be taking a minimum of one trip through the loop
    153 	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
    154	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
    155	 * The wh64 is issued for the starting destination address for trip +2
    156	 * through the loop, and if there are less than two trips left, the target
    157	 * address will be for the current trip.
    158	 */
    159
    160$do_wh64_b:
    161	wh64	($4)		# L1 : memory subsystem write hint
    162	subq	$3, 24, $2	# E : For determining future wh64 addresses
    163	stq	$17, 0($5)	# L :
    164	nop			# E :
    165
    166	addq	$5, 128, $4	# E : speculative target of next wh64
    167	stq	$17, 8($5)	# L :
    168	stq	$17, 16($5)	# L :
    169	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
    170
    171	stq	$17, 24($5)	# L :
    172	stq	$17, 32($5)	# L :
    173	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
    174	nop
    175
    176	stq	$17, 40($5)	# L :
    177	stq	$17, 48($5)	# L :
    178	subq	$3, 16, $2	# E : Repeat the loop at least once more?
    179	nop
    180
    181	stq	$17, 56($5)	# L :
    182	addq	$5, 64, $5	# E :
    183	subq	$3, 8, $3	# E :
    184	bge	$2, $do_wh64_b	# U : loop while 8 or more quads remain
    185
    186	nop
    187	nop
    188	nop
    189	beq	$3, no_quad_b	# U : Might have finished already
    190
    191.align 4
    192	/*
    193	 * Simple loop for trailing quadwords, or for small amounts
    194	 * of data (where we can't use an unrolled loop and wh64)
    195	 */
    196loop_b:
    197	stq $17,0($5)		# L :
    198	subq $3,1,$3		# E : Decrement number quads left
    199	addq $5,8,$5		# E : Inc address
    200	bne $3,loop_b		# U : more?
    201
    202no_quad_b:
    203	/*
    204	 * Write 0..7 trailing bytes.
    205	 */
    206	nop			# E :
    207	beq $18,end_b		# U : All done?
    208	ldq $7,0($5)		# L : fetch the quad that holds the tail
    209	mskqh $7,$6,$2		# U : Mask final quad (keep old bytes from end offset up)
    210
    211	insqh $17,$6,$4		# U : New bits (pattern in the bytes below end offset)
    212	bis $2,$4,$1		# E : Put it all together
    213	stq $1,0($5)		# L : And back to memory
    214	ret $31,($26),1		# L0 :
    215
    216within_quad_b:
    217	ldq_u $1,0($16)		# L : fetch the quad containing the whole range
    218	insql $17,$16,$2	# U : New bits
    219	mskql $1,$16,$4		# U : Clear old
    220	bis $2,$4,$2		# E : New result
    221
    222	mskql $2,$6,$4		# U : keep the bytes below the end offset
    223	mskqh $1,$6,$2		# U : old bytes from the end offset up
    224	bis $2,$4,$1		# E : merge the two halves
    225	stq_u $1,0($16)		# L : store the merged quad
    226
    227end_b:
    228	nop
    229	nop
    230	nop
    231	ret $31,($26),1		# L0 :
    232	.end ___memset
    233	EXPORT_SYMBOL(___memset)
    234
    235	/*
    236	 * This is the original body of code, prior to replication and
    237	 * rescheduling.  Leave it here, as there may be calls to this
    238	 * entry point.
    239	 */
    240.align 4
    241	.ent __constant_c_memset
    242__constant_c_memset:
    243	.frame $30,0,$26,0
    244	.prologue 0
    245
	/*
	 * __constant_c_memset: fill $18 bytes starting at $16 from the
	 * quadword pattern in $17.
	 *
	 * On entry:
	 *	$16	destination address
	 *	$17	64-bit store pattern, used as is (no replication is
	 *		done in this routine)
	 *	$18	byte count; a count <= 0 stores nothing
	 * On exit:
	 *	$0	the original destination address
	 * $1-$7 are used as scratch; $16 and $18 may be overwritten.
	 */
    246	addq $18,$16,$6		# E : end address (dest + count), one past last byte
    247	bis $16,$16,$0		# E : return value
    248	xor $16,$6,$1		# E : will complete write be within one quadword?
    249	ble $18,end		# U : count <= 0: nothing to store
    250
    251	bic $1,7,$1		# E : fit within a single quadword
    252	beq $1,within_one_quad	# U : start and end address share one quadword
    253	and $16,7,$3		# E : Target addr misalignment
    254	beq $3,aligned		# U : target is 0mod8
    255
    256	/*
    257	 * Target address is misaligned, and won't fit within a quadword
    258	 */
    259	ldq_u $4,0($16)		# L : Fetch first partial
    260	bis $16,$16,$5		# E : Save the address
    261	insql $17,$16,$2	# U : Insert new bytes
    262	subq $3,8,$3		# E : $3 = misalignment - 8 (negative)
    263
    264	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
    265	mskql $4,$16,$4		# U : clear relevant parts of the quad
    266	subq $16,$3,$16		# E : $16 is new aligned destination
    267	bis $2,$4,$1		# E : Final bytes
    268
    269	nop
    270	stq_u $1,0($5)		# L : Store result
    271	nop
    272	nop
    273
    274.align 4
    275aligned:
    276	/*
    277	 * We are now guaranteed to be quad aligned, with at least
    278	 * one partial quad to write.
    279	 */
    280
    281	sra $18,3,$3		# U : Number of remaining quads to write
    282	and $18,7,$18		# E : Number of trailing bytes to write
    283	bis $16,$16,$5		# E : Save dest address
    284	beq $3,no_quad		# U : tail stuff only
    285
    286	/*
    287	 * it's worth the effort to unroll this and use wh64 if possible
    288	 * Lifted a bunch of code from clear_user.S
    289	 * At this point, entry values are:
    290	 * $16	Current destination address
    291	 * $5	A copy of $16
    292	 * $6	End address (original destination + original count)
    293	 * $18	Number trailer bytes
    294	 * $3	Number quads to write
    295	 */
    296
    297	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
    298	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
    299	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
    300	blt	$4, loop	# U : fewer than 16 quads: use the simple loop
    301
    302	/*
    303	 * We know we've got at least 16 quads, minimum of one trip
    304	 * through unrolled loop.  Do a quad at a time to get us 0mod64
    305	 * aligned.
	 *
	 * NOTE(review): $1 is (dest & 0x3f) - 0x40 with dest quad aligned,
	 * so it lies in -64..-8 and the beq below looks never taken; a
	 * destination that is already 0mod64 makes eight trips through
	 * $alignmod64.  That loop is also what loads $4 with the address
	 * used by the first wh64.
    306	 */
    307
    308	nop			# E :
    309	nop			# E :
    310	nop			# E :
    311	beq	$1, $bigalign	# U :
    312
    313$alignmod64:
    314	stq	$17, 0($5)	# L :
    315	subq	$3, 1, $3	# E : For consistency later
    316	addq	$1, 8, $1	# E : Increment towards zero for alignment
    317	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
    318
    319	nop
    320	nop
    321	addq	$5, 8, $5	# E : Inc address
    322	blt	$1, $alignmod64	# U : repeat until the bias counter reaches zero
    323
    324$bigalign:
    325	/*
    326	 * $3 - number quads left to go
    327	 * $5 - target address (aligned 0mod64)
    328	 * $17 - mask of stuff to store
    329	 * Scratch registers available: $7, $2, $4, $1
    330	 * we know that we'll be taking a minimum of one trip through the loop
    331 	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
    332	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
    333	 * The wh64 is issued for the starting destination address for trip +2
    334	 * through the loop, and if there are less than two trips left, the target
    335	 * address will be for the current trip.
    336	 */
    337
    338$do_wh64:
    339	wh64	($4)		# L1 : memory subsystem write hint
    340	subq	$3, 24, $2	# E : For determining future wh64 addresses
    341	stq	$17, 0($5)	# L :
    342	nop			# E :
    343
    344	addq	$5, 128, $4	# E : speculative target of next wh64
    345	stq	$17, 8($5)	# L :
    346	stq	$17, 16($5)	# L :
    347	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
    348
    349	stq	$17, 24($5)	# L :
    350	stq	$17, 32($5)	# L :
    351	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
    352	nop
    353
    354	stq	$17, 40($5)	# L :
    355	stq	$17, 48($5)	# L :
    356	subq	$3, 16, $2	# E : Repeat the loop at least once more?
    357	nop
    358
    359	stq	$17, 56($5)	# L :
    360	addq	$5, 64, $5	# E :
    361	subq	$3, 8, $3	# E :
    362	bge	$2, $do_wh64	# U : loop while 8 or more quads remain
    363
    364	nop
    365	nop
    366	nop
    367	beq	$3, no_quad	# U : Might have finished already
    368
    369.align 4
    370	/*
    371	 * Simple loop for trailing quadwords, or for small amounts
    372	 * of data (where we can't use an unrolled loop and wh64)
    373	 */
    374loop:
    375	stq $17,0($5)		# L :
    376	subq $3,1,$3		# E : Decrement number quads left
    377	addq $5,8,$5		# E : Inc address
    378	bne $3,loop		# U : more?
    379
    380no_quad:
    381	/*
    382	 * Write 0..7 trailing bytes.
    383	 */
    384	nop			# E :
    385	beq $18,end		# U : All done?
    386	ldq $7,0($5)		# L : fetch the quad that holds the tail
    387	mskqh $7,$6,$2		# U : Mask final quad (keep old bytes from end offset up)
    388
    389	insqh $17,$6,$4		# U : New bits (pattern in the bytes below end offset)
    390	bis $2,$4,$1		# E : Put it all together
    391	stq $1,0($5)		# L : And back to memory
    392	ret $31,($26),1		# L0 :
    393
    394within_one_quad:
    395	ldq_u $1,0($16)		# L : fetch the quad containing the whole range
    396	insql $17,$16,$2	# U : New bits
    397	mskql $1,$16,$4		# U : Clear old
    398	bis $2,$4,$2		# E : New result
    399
    400	mskql $2,$6,$4		# U : keep the bytes below the end offset
    401	mskqh $1,$6,$2		# U : old bytes from the end offset up
    402	bis $2,$4,$1		# E : merge the two halves
    403	stq_u $1,0($16)		# L : store the merged quad
    404
    405end:
    406	nop
    407	nop
    408	nop
    409	ret $31,($26),1		# L0 :
    410	.end __constant_c_memset
    411	EXPORT_SYMBOL(__constant_c_memset)
    412
    413	/*
    414	 * This is a replica of the __constant_c_memset code, rescheduled to mask
    415	 * stalls.  Note that the label names also had to change (_w suffix).
    416	 */
    417	.align 5
    418	.ent __memset16
    419
    420__memset16:
    421	.frame $30,0,$26,0
    422	.prologue 0
    423
	/*
	 * __memset16: fill $18 bytes starting at $16 with the low 16 bits
	 * of $17 repeated four times per quadword.
	 *
	 * On entry:
	 *	$16	destination address
	 *	$17	fill value; only the low 16 bits are used
	 *	$18	count, handled as a number of bytes; a count <= 0
	 *		stores nothing
	 * On exit:
	 *	$0	the original destination address
	 * $1-$7 are used as scratch; $16, $17 and $18 may be overwritten.
	 */
    424	inswl $17,0,$5		# U : 000000000000c1c2
    425	inswl $17,2,$2		# U : 00000000c1c20000
    426	bis $16,$16,$0		# E : return value
    427	addq	$18,$16,$6	# E : end address (dest + count), one past last byte
    428
    429	ble $18, end_w		# U : count <= 0: nothing to store
    430	inswl	$17,4,$3	# U : 0000c1c200000000
    431	inswl	$17,6,$4	# U : c1c2000000000000
    432	xor	$16,$6,$1	# E : will complete write be within one quadword?
    433
    434	or	$2,$5,$2	# E : 00000000c1c2c1c2
    435	or	$3,$4,$17	# E : c1c2c1c200000000
    436	bic	$1,7,$1		# E : fit within a single quadword
    437	and	$16,7,$3	# E : Target addr misalignment
    438
    439	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
    440	beq $1,within_quad_w	# U : start and end address share one quadword
    441	nop
    442	beq $3,aligned_w	# U : target is 0mod8
    443
    444	/*
    445	 * Target address is misaligned, and won't fit within a quadword
    446	 */
    447	ldq_u $4,0($16)		# L : Fetch first partial
    448	bis $16,$16,$5		# E : Save the address
    449	insql $17,$16,$2	# U : Insert new bytes
    450	subq $3,8,$3		# E : $3 = misalignment - 8 (negative)
    451
    452	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
    453	mskql $4,$16,$4		# U : clear relevant parts of the quad
    454	subq $16,$3,$16		# E : $16 is new aligned destination
    455	bis $2,$4,$1		# E : Final bytes
    456
    457	nop
    458	stq_u $1,0($5)		# L : Store result
    459	nop
    460	nop
    461
    462.align 4
    463aligned_w:
    464	/*
    465	 * We are now guaranteed to be quad aligned, with at least
    466	 * one partial quad to write.
    467	 */
    468
    469	sra $18,3,$3		# U : Number of remaining quads to write
    470	and $18,7,$18		# E : Number of trailing bytes to write
    471	bis $16,$16,$5		# E : Save dest address
    472	beq $3,no_quad_w	# U : tail stuff only
    473
    474	/*
    475	 * it's worth the effort to unroll this and use wh64 if possible
    476	 * Lifted a bunch of code from clear_user.S
    477	 * At this point, entry values are:
    478	 * $16	Current destination address
    479	 * $5	A copy of $16
    480	 * $6	End address (original destination + original count)
    481	 * $18	Number trailer bytes
    482	 * $3	Number quads to write
    483	 */
    484
    485	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
    486	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
    487	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
    488	blt	$4, loop_w	# U : fewer than 16 quads: use the simple loop
    489
    490	/*
    491	 * We know we've got at least 16 quads, minimum of one trip
    492	 * through unrolled loop.  Do a quad at a time to get us 0mod64
    493	 * aligned.
	 *
	 * NOTE(review): $1 is (dest & 0x3f) - 0x40 with dest quad aligned,
	 * so it lies in -64..-8 and the beq below looks never taken; a
	 * destination that is already 0mod64 makes eight trips through
	 * $alignmod64_w.  That loop is also what loads $4 with the address
	 * used by the first wh64.
    494	 */
    495
    496	nop			# E :
    497	nop			# E :
    498	nop			# E :
    499	beq	$1, $bigalign_w	# U :
    500
    501$alignmod64_w:
    502	stq	$17, 0($5)	# L :
    503	subq	$3, 1, $3	# E : For consistency later
    504	addq	$1, 8, $1	# E : Increment towards zero for alignment
    505	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
    506
    507	nop
    508	nop
    509	addq	$5, 8, $5	# E : Inc address
    510	blt	$1, $alignmod64_w	# U : repeat until the bias counter reaches zero
    511
    512$bigalign_w:
    513	/*
    514	 * $3 - number quads left to go
    515	 * $5 - target address (aligned 0mod64)
    516	 * $17 - mask of stuff to store
    517	 * Scratch registers available: $7, $2, $4, $1
    518	 * we know that we'll be taking a minimum of one trip through the loop
    519 	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
    520	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
    521	 * The wh64 is issued for the starting destination address for trip +2
    522	 * through the loop, and if there are less than two trips left, the target
    523	 * address will be for the current trip.
    524	 */
    525
    526$do_wh64_w:
    527	wh64	($4)		# L1 : memory subsystem write hint
    528	subq	$3, 24, $2	# E : For determining future wh64 addresses
    529	stq	$17, 0($5)	# L :
    530	nop			# E :
    531
    532	addq	$5, 128, $4	# E : speculative target of next wh64
    533	stq	$17, 8($5)	# L :
    534	stq	$17, 16($5)	# L :
    535	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
    536
    537	stq	$17, 24($5)	# L :
    538	stq	$17, 32($5)	# L :
    539	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
    540	nop
    541
    542	stq	$17, 40($5)	# L :
    543	stq	$17, 48($5)	# L :
    544	subq	$3, 16, $2	# E : Repeat the loop at least once more?
    545	nop
    546
    547	stq	$17, 56($5)	# L :
    548	addq	$5, 64, $5	# E :
    549	subq	$3, 8, $3	# E :
    550	bge	$2, $do_wh64_w	# U : loop while 8 or more quads remain
    551
    552	nop
    553	nop
    554	nop
    555	beq	$3, no_quad_w	# U : Might have finished already
    556
    557.align 4
    558	/*
    559	 * Simple loop for trailing quadwords, or for small amounts
    560	 * of data (where we can't use an unrolled loop and wh64)
    561	 */
    562loop_w:
    563	stq $17,0($5)		# L :
    564	subq $3,1,$3		# E : Decrement number quads left
    565	addq $5,8,$5		# E : Inc address
    566	bne $3,loop_w		# U : more?
    567
    568no_quad_w:
    569	/*
    570	 * Write 0..7 trailing bytes.
    571	 */
    572	nop			# E :
    573	beq $18,end_w		# U : All done?
    574	ldq $7,0($5)		# L : fetch the quad that holds the tail
    575	mskqh $7,$6,$2		# U : Mask final quad (keep old bytes from end offset up)
    576
    577	insqh $17,$6,$4		# U : New bits (pattern in the bytes below end offset)
    578	bis $2,$4,$1		# E : Put it all together
    579	stq $1,0($5)		# L : And back to memory
    580	ret $31,($26),1		# L0 :
    581
    582within_quad_w:
    583	ldq_u $1,0($16)		# L : fetch the quad containing the whole range
    584	insql $17,$16,$2	# U : New bits
    585	mskql $1,$16,$4		# U : Clear old
    586	bis $2,$4,$2		# E : New result
    587
    588	mskql $2,$6,$4		# U : keep the bytes below the end offset
    589	mskqh $1,$6,$2		# U : old bytes from the end offset up
    590	bis $2,$4,$1		# E : merge the two halves
    591	stq_u $1,0($16)		# L : store the merged quad
    592
    593end_w:
    594	nop
    595	nop
    596	nop
    597	ret $31,($26),1		# L0 :
    598
    599	.end __memset16
    600	EXPORT_SYMBOL(__memset16)
    601
    602memset = ___memset		/* memset is another name for ___memset */
    603__memset = ___memset		/* __memset is another name for ___memset */
    604	EXPORT_SYMBOL(memset)
    605	EXPORT_SYMBOL(__memset)