cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

ev6-memcpy.S (6378B)


      1/* SPDX-License-Identifier: GPL-2.0 */
      2/*
      3 * arch/alpha/lib/ev6-memcpy.S
      4 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
      5 *
      6 * Reasonably optimized memcpy() routine for the Alpha 21264
      7 *
      8 *	- memory accessed as aligned quadwords only
      9 *	- uses bcmpge to compare 8 bytes in parallel
     10 *
     11 * Much of the information about 21264 scheduling/coding comes from:
     12 *	Compiler Writer's Guide for the Alpha 21264
     13 *	abbreviated as 'CWG' in other comments here
     14 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
     15 * Scheduling notation:
     16 *	E	- either cluster
     17 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
     18 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
     19 *
     20 * Temp usage notes:
     21 *	$1,$2,		- scratch
     22 */
     23#include <asm/export.h>
     24	.set noreorder
     25	.set noat
     26
     27	.align	4
     28	.globl memcpy
     29	.ent memcpy
     30memcpy:
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * Alpha calling convention:
 *   In:      $16 = dest, $17 = src, $18 = byte count n
 *   Out:     $0  = original dest (saved in the first instruction)
 *   Scratch: $1-$7; $16/$17/$18 are consumed as running pointers/count.
 *
 * Strategy (as implemented below):
 *   - If src and dest have the same alignment mod 8: byte-copy until
 *     dest is 0mod8, then move quadwords; counts > 127 go through a
 *     64-bytes-per-trip unrolled loop primed with wh64 write hints.
 *   - Otherwise ($misaligned): byte-copy until dest is 0mod8, then
 *     build each aligned 8-byte store from two unaligned loads using
 *     the ldq_u/extql/extqh merging idiom.
 */
     31	.frame $30,0,$26,0
     32	.prologue 0
     33
     34	mov	$16, $0			# E : copy dest to return
     35	ble	$18, $nomoredata	# U : done with the copy?
     36	xor	$16, $17, $1		# E : are source and dest alignments the same?
     37	and	$1, 7, $1		# E : are they the same mod 8?
     38
     39	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
     40	/* source and dest are same mod 8 address */
     41	and	$16, 7, $1		# E : Are both 0mod8?
     42	beq	$1, $both_0mod8		# U : Yes
     43	nop				# E :
     44
     45	/*
     46	 * source and dest are same misalignment.  move a byte at a time
     47	 * until a 0mod8 alignment for both is reached.
     48	 * At least one byte more to move
     49	 */
     50
     51$head_align:
     52	ldbu	$1, 0($17)		# L : grab a byte
     53	subq	$18, 1, $18		# E : count--
     54	addq	$17, 1, $17		# E : src++
     55	stb	$1, 0($16)		# L :
     56	addq	$16, 1, $16		# E : dest++
     57	and	$16, 7, $1		# E : Are we at 0mod8 yet?
     58	ble	$18, $nomoredata	# U : done with the copy?
     59	bne	$1, $head_align		# U :
     60
     61$both_0mod8:
     62	cmple	$18, 127, $1		# E : Can we unroll the loop?
     63	bne	$1, $no_unroll		# U :
     64	and	$16, 63, $1		# E : get mod64 alignment
     65	beq	$1, $do_unroll		# U : no single quads to fiddle
     66
     67$single_head_quad:
     68	ldq	$1, 0($17)		# L : get 8 bytes
     69	subq	$18, 8, $18		# E : count -= 8
     70	addq	$17, 8, $17		# E : src += 8
     71	nop				# E :
     72
     73	stq	$1, 0($16)		# L : store
     74	addq	$16, 8, $16		# E : dest += 8
     75	and	$16, 63, $1		# E : get mod64 alignment
     76	bne	$1, $single_head_quad	# U : still not fully aligned
     77
     78$do_unroll:
     79	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
     80	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
     81	bne	$1, $tail_quads		# U : Nope
     82	nop				# E : 
     83
     84$unroll_body:
     85	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
     86					# ($7) are about to be over-written
     87	ldq	$6, 0($17)		# L0 : bytes 0..7
     88	nop				# E :
     89	nop				# E :
     90
     91	ldq	$4, 8($17)		# L : bytes 8..15
     92	ldq	$5, 16($17)		# L : bytes 16..23
     93	addq	$7, 64, $7		# E : Update next wh64 address
     94	nop				# E :
     95
     96	ldq	$3, 24($17)		# L : bytes 24..31
     97	addq	$16, 64, $1		# E : fallback value for wh64
     98	nop				# E :
     99	nop				# E :
    100
    101	addq	$17, 32, $17		# E : src += 32 bytes
    102	stq	$6, 0($16)		# L : bytes 0..7
    103	nop				# E :
    104	nop				# E :
    105
    106	stq	$4, 8($16)		# L : bytes 8..15
    107	stq	$5, 16($16)		# L : bytes 16..23
    108	subq	$18, 192, $2		# E : At least two more trips to go?
    109	nop				# E :
    110
    111	stq	$3, 24($16)		# L : bytes 24..31
    112	addq	$16, 32, $16		# E : dest += 32 bytes
    113	nop				# E :
    114	nop				# E :
    115
    116	ldq	$6, 0($17)		# L : bytes 0..7
    117	ldq	$4, 8($17)		# L : bytes 8..15
    118	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
    119					# fallback wh64 address if < 2 more trips
    120	nop				# E :
    121
    122	ldq	$5, 16($17)		# L : bytes 16..23
    123	ldq	$3, 24($17)		# L : bytes 24..31
    124	addq	$16, 32, $16		# E : dest += 32
    125	subq	$18, 64, $18		# E : count -= 64
    126
    127	addq	$17, 32, $17		# E : src += 32
    128	stq	$6, -32($16)		# L : bytes 0..7
    129	stq	$4, -24($16)		# L : bytes 8..15
    130	cmple	$18, 63, $1		# E : At least one more trip?
    131
    132	stq	$5, -16($16)		# L : bytes 16..23
    133	stq	$3, -8($16)		# L : bytes 24..31
    134	nop				# E :
    135	beq	$1, $unroll_body	# U : loop again while >= 64 bytes remain
    136
    137$tail_quads:
    138$no_unroll:
    139	.align 4
    140	subq	$18, 8, $18		# E : At least a quad left?
    141	blt	$18, $less_than_8	# U : Nope
    142	nop				# E :
    143	nop				# E :
    144
    145$move_a_quad:
    146	ldq	$1, 0($17)		# L : fetch 8
    147	subq	$18, 8, $18		# E : count -= 8
    148	addq	$17, 8, $17		# E : src += 8
    149	nop				# E :
    150
    151	stq	$1, 0($16)		# L : store 8
    152	addq	$16, 8, $16		# E : dest += 8
    153	bge	$18, $move_a_quad	# U :
    154	nop				# E :
    155
    156$less_than_8:
    157	.align 4
    158	addq	$18, 8, $18		# E : add back for trailing bytes
    159	ble	$18, $nomoredata	# U : All-done
    160	nop				# E :
    161	nop				# E :
    162
    163	/* Trailing bytes */
    164$tail_bytes:
    165	subq	$18, 1, $18		# E : count--
    166	ldbu	$1, 0($17)		# L : fetch a byte
    167	addq	$17, 1, $17		# E : src++
    168	nop				# E :
    169
    170	stb	$1, 0($16)		# L : store a byte
    171	addq	$16, 1, $16		# E : dest++
    172	bgt	$18, $tail_bytes	# U : more to be done?
    173	nop				# E :
    174
    175	/* branching to exit takes 3 extra cycles, so replicate exit here */
    176	ret	$31, ($26), 1		# L0 :
    177	nop				# E :
    178	nop				# E :
    179	nop				# E :
    180
    181$misaligned:
    182	mov	$0, $4			# E : dest temp
    183	and	$0, 7, $1		# E : dest alignment mod8
    184	beq	$1, $dest_0mod8		# U : life doesn't totally suck
    185	nop
    186
    187$aligndest:
    188	ble	$18, $nomoredata	# U :
    189	ldbu	$1, 0($17)		# L : fetch a byte
    190	subq	$18, 1, $18		# E : count--
    191	addq	$17, 1, $17		# E : src++
    192
    193	stb	$1, 0($4)		# L : store it
    194	addq	$4, 1, $4		# E : dest++
    195	and	$4, 7, $1		# E : dest 0mod8 yet?
    196	bne	$1, $aligndest		# U : go until we are aligned.
    197
    198	/* Source has unknown alignment, but dest is known to be 0mod8 */
    199$dest_0mod8:
    200	subq	$18, 8, $18		# E : At least a quad left?
    201	blt	$18, $misalign_tail	# U : Nope
    202	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
    203	nop				# E :
    204
    205$mis_quad:
    206	ldq_u	$16, 8($17)		# L : Fetch next 8 ($16 is free as
    207					# scratch here: dest now lives in $4)
    207	extql	$3, $17, $3		# U : masking
    208	extqh	$16, $17, $1		# U : masking
    209	bis	$3, $1, $1		# E : merged bytes to store
    210
    211	subq	$18, 8, $18		# E : count -= 8
    212	addq	$17, 8, $17		# E : src += 8
    213	stq	$1, 0($4)		# L : store 8 (aligned)
    214	mov	$16, $3			# E : "rotate" source data
    215
    216	addq	$4, 8, $4		# E : dest += 8
    217	bge	$18, $mis_quad		# U : More quads to move
    218	nop
    219	nop
    220
    221$misalign_tail:
    222	addq	$18, 8, $18		# E : account for tail stuff
    223	ble	$18, $nomoredata	# U :
    224	nop
    225	nop
    226
    227$misalign_byte:
    228	ldbu	$1, 0($17)		# L : fetch 1
    229	subq	$18, 1, $18		# E : count--
    230	addq	$17, 1, $17		# E : src++
    231	nop				# E :
    232
    233	stb	$1, 0($4)		# L : store
    234	addq	$4, 1, $4		# E : dest++
    235	bgt	$18, $misalign_byte	# U : more to go?
    236	nop
    237
    238
    239$nomoredata:
    240	ret	$31, ($26), 1		# L0 :
    241	nop				# E :
    242	nop				# E :
    243	nop				# E :
    244
    245	.end memcpy
    246	EXPORT_SYMBOL(memcpy)
    247
    248/* For backwards module compatibility.  */
    249__memcpy = memcpy
    250.globl __memcpy