ev6-copy_page.S - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
ev6-copy_page.S (4341B)
      1/* SPDX-License-Identifier: GPL-2.0 */
      2/*
      3 * arch/alpha/lib/ev6-copy_page.S
      4 *
      5 * Copy an entire page.
      6 */
      7
      8/* The following comparison of this routine vs the normal copy_page.S
      9   was written by an unnamed ev6 hardware designer and forwarded to me
     10   via Steven Hobbs <hobbs@steven.zko.dec.com>.
     11 
     12   First Problem: STQ overflows.
     13   -----------------------------
     14
     15	It would be nice if EV6 handled every resource overflow efficiently,
     16	but for some, including store queue overflows, it doesn't.  Such an
     17	overflow causes a trap and a restart of the pipe.
     18
     19	To get around this we sometimes use (to borrow a term from a VSSAD
     20	researcher) "aeration".  The idea is to slow the rate at which the
     21	processor receives valid instructions by inserting nops in the fetch
     22	path.  In doing so, you can prevent the overflow and actually make
     23	the code run faster.  You can, of course, take advantage of the fact
     24	that the processor can fetch at most 4 aligned instructions per cycle.
     25
     26	I inserted enough nops to force it to take 10 cycles to fetch the
     27	loop code.  In theory, EV6 should be able to execute this loop in
     28	9 cycles but I was not able to get it to run that fast -- the initial
     29	conditions were such that I could not reach this optimum rate on
     30	(chaotic) EV6.  I wrote the code such that everything would issue
     31	in order. 
     32
     33   Second Problem: Dcache index matches.
     34   -------------------------------------
     35
     36	If you are going to use this routine on random aligned pages, there
     37	is a 25% chance that the pages will be at the same dcache indices.
     38	This results in many nasty memory traps without care.
     39
     40	The solution is to schedule the prefetches to avoid the memory
     41	conflicts.  I schedule the wh64 prefetches farther ahead of the
     42	read prefetches to avoid this problem.
     43
     44   Third Problem: Needs more prefetching.
     45   --------------------------------------
     46
     47	In order to improve the code I added deeper prefetching to take the
     48	most advantage of EV6's bandwidth.
     49
     50	I also prefetched the read stream. Note that adding the read prefetch
     51	forced me to add another cycle to the inner-most kernel - up to 11
     52	from the original 8 cycles per iteration.  We could improve performance
     53	further by unrolling the loop and doing multiple prefetches per cycle.
     54
     55   I think that the code below will be very robust and fast code for the
     56   purposes of copying aligned pages.  It is slower when both source and
     57   destination pages are in the dcache, but it is my guess that this is
     58   less important than the dcache miss case.  */
     59
     60#include <asm/export.h>
     61	.text
     62	.align 4	/* NOTE(review): presumably 2^4 = 16-byte alignment, so the entry point starts a four-instruction group - confirm against the assembler's Alpha .align semantics.  */
     63	.global copy_page
     64	.ent copy_page
     65copy_page:	/* Copy 128 consecutive 64-byte cache lines (118 in the main
	   loop plus 10 in the cleanup loop, 8192 bytes in total) from the
	   address in $17 to the address in $16.

	   Registers written: $0-$7 (copied data; $1 and $2 also serve as
	   wh64 address temporaries in the preamble), $16 and $17 (each
	   advanced by 64 per line copied), $18 (loop counter) and $19
	   (address for the in-loop wh64).  The stack is not touched.

	   The instructions are laid out in groups of four separated by
	   blank lines.  The unop/nop fillers are part of the schedule
	   explained in the comment at the top of the file, so do not
	   remove or reorder them without re-measuring.  */
     66	.prologue 0
     67
     68	/* Preamble: prefetch the first 5 source cache lines (the ldl
	   instructions that load into $31) and write-hint the first 10
	   destination cache lines (wh64), before any data is moved.  It
	   also sets up the main-loop counter in $18 and the running
	   write-hint address in $19.  */
     69	wh64	($16)	/* write-hint destination line 0 */
     70	ldl	$31,0($17)	/* prefetch source line 0 */
     71	ldl	$31,64($17)	/* prefetch source line 1 */
     72	lda	$1,1*64($16)	/* $1 = address of destination line 1 */
     73
     74	wh64	($1)
     75	ldl	$31,128($17)	/* prefetch source line 2 */
     76	ldl	$31,192($17)	/* prefetch source line 3 */
     77	lda	$1,2*64($16)
     78
     79	wh64	($1)
     80	ldl	$31,256($17)	/* prefetch source line 4 */
     81	lda	$18,118	/* $18 = number of main-loop iterations */
     82	lda	$1,3*64($16)
     83
     84	wh64	($1)
     85	nop
     86	lda	$1,4*64($16)	/* from here on, two wh64 addresses are formed per group */
     87	lda	$2,5*64($16)
     88
     89	wh64	($1)
     90	wh64	($2)
     91	lda	$1,6*64($16)
     92	lda	$2,7*64($16)
     93
     94	wh64	($1)
     95	wh64	($2)
     96	lda	$1,8*64($16)
     97	lda	$2,9*64($16)
     98
     99	wh64	($1)
    100	wh64	($2)
    101	lda	$19,10*64($16)	/* $19 = destination line 10, the first line not yet write-hinted */
    102	nop
    103
    104	/* Main prefetching/write-hinting loop.  Each of the 118 iterations
	   copies one 64-byte line through $0-$7, prefetches the source
	   line five lines ahead (320($17)) and write-hints the destination
	   line ten lines ahead ($19).  The body is 11 groups of four
	   instruction slots; most of the slots are unop padding.  */
    1051:	ldq	$0,0($17)
    106	ldq	$1,8($17)
    107	unop
    108	unop
    109
    110	unop
    111	unop
    112	ldq	$2,16($17)
    113	ldq	$3,24($17)
    114
    115	ldq	$4,32($17)
    116	ldq	$5,40($17)
    117	unop
    118	unop
    119
    120	unop
    121	unop
    122	ldq	$6,48($17)
    123	ldq	$7,56($17)
    124
    125	ldl	$31,320($17)	/* prefetch the source line 5 lines (320 bytes) ahead */
    126	unop
    127	unop
    128	unop
    129
    130	/* This gives the extra cycle of aeration above the minimum.  */
    131	unop			
    132	unop
    133	unop
    134	unop
    135
    136	wh64	($19)	/* write-hint the destination line 10 lines ahead of $16 */
    137	unop
    138	unop
    139	unop
    140
    141	stq	$0,0($16)
    142	subq	$18,1,$18	/* one fewer iteration remaining */
    143	stq	$1,8($16)
    144	unop
    145
    146	unop
    147	stq	$2,16($16)
    148	addq	$17,64,$17	/* advance the source pointer; all loads of this line are already issued above */
    149	stq	$3,24($16)
    150
    151	stq	$4,32($16)
    152	stq	$5,40($16)
    153	addq	$19,64,$19	/* keep the write-hint address 10 lines ahead of $16 */
    154	unop
    155
    156	stq	$6,48($16)
    157	stq	$7,56($16)
    158	addq	$16,64,$16	/* advance the destination pointer after the last store of the line */
    159	bne	$18, 1b	/* repeat until the counter reaches zero */
    160
    161	/* Prefetch the final 5 cache lines of the read stream.  $17 now
	   points at the first of the 10 remaining source lines, so offsets
	   320..576 address the last five of them.  $18 is reloaded with
	   the cleanup-loop count.  */
    162	lda	$18,10	/* $18 = number of cleanup-loop iterations */
    163	ldl	$31,320($17)
    164	ldl	$31,384($17)
    165	ldl	$31,448($17)
    166
    167	ldl	$31,512($17)
    168	ldl	$31,576($17)
    169	nop
    170	nop
    171
    172	/* Non-prefetching, non-write-hinting cleanup loop for the
    173	   final 10 cache lines.  Every one of these lines was already
	   prefetched (source) and write-hinted (destination) by the code
	   above, so the loop contains only the 8 loads, the 8 stores, the
	   pointer/counter updates and the branch, with no padding.  */
    1742:	ldq	$0,0($17)
    175	ldq	$1,8($17)
    176	ldq	$2,16($17)
    177	ldq	$3,24($17)
    178
    179	ldq	$4,32($17)
    180	ldq	$5,40($17)
    181	ldq	$6,48($17)
    182	ldq	$7,56($17)
    183
    184	stq	$0,0($16)
    185	subq	$18,1,$18	/* one fewer line remaining */
    186	stq	$1,8($16)
    187	addq	$17,64,$17	/* advance the source pointer; all loads of this line are already issued above */
    188
    189	stq	$2,16($16)
    190	stq	$3,24($16)
    191	stq	$4,32($16)
    192	stq	$5,40($16)
    193
    194	stq	$6,48($16)
    195	stq	$7,56($16)
    196	addq	$16,64,$16	/* advance the destination pointer after the last store of the line */
    197	bne	$18, 2b	/* repeat until all 10 lines are copied */
    198
    199	ret	/* return to the caller; no result register is set by this routine */
    200	nop	/* this and the next two fillers presumably pad the final group to four instructions - confirm */
    201	unop
    202	nop
    203
    204	.end copy_page
    205	EXPORT_SYMBOL(copy_page)	/* export the symbol; the macro presumably comes from the asm/export.h include at the top of the file - confirm */