cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

ev6-divide.S (6508B)


/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division.
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - scratch (tmp1)
 *	$4 - scratch (tmp2, divide only)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

     59#include <asm/export.h>
     60#define halt .long 0
     61
     62/*
     63 * Select function type and registers
     64 */
     65#define mask	$0
     66#define divisor	$1
     67#define compare $28
     68#define tmp1	$3
     69#define tmp2	$4
     70
     71#ifdef DIV
     72#define DIV_ONLY(x,y...) x,##y
     73#define MOD_ONLY(x,y...)
     74#define func(x) __div##x
     75#define modulus $2
     76#define quotient $27
     77#define GETSIGN(x) xor $24,$25,x
     78#define STACK 48
     79#else
     80#define DIV_ONLY(x,y...)
     81#define MOD_ONLY(x,y...) x,##y
     82#define func(x) __rem##x
     83#define modulus $27
     84#define quotient $2
     85#define GETSIGN(x) bis $24,$24,x
     86#define STACK 32
     87#endif
     88
     89/*
     90 * For 32-bit operations, we need to extend to 64-bit
     91 */
     92#ifdef INTSIZE
     93#define ufunction func(lu)
     94#define sfunction func(l)
     95#define LONGIFY(x) zapnot x,15,x
     96#define SLONGIFY(x) addl x,0,x
     97#else
     98#define ufunction func(qu)
     99#define sfunction func(q)
    100#define LONGIFY(x)
    101#define SLONGIFY(x)
    102#endif
    103
    104.set noat
    105.align	4
    106.globl	ufunction
    107.ent	ufunction
    108ufunction:
    109	subq	$30,STACK,$30		# E :
    110	.frame	$30,STACK,$23
    111	.prologue 0
    112
    1137:	stq	$1, 0($30)		# L :
    114	bis	$25,$25,divisor		# E :
    115	stq	$2, 8($30)		# L : L U L U
    116
    117	bis	$24,$24,modulus		# E :
    118	stq	$0,16($30)		# L :
    119	bis	$31,$31,quotient	# E :
    120	LONGIFY(divisor)		# E : U L L U
    121
    122	stq	tmp1,24($30)		# L :
    123	LONGIFY(modulus)		# E :
    124	bis	$31,1,mask		# E :
    125	DIV_ONLY(stq tmp2,32($30))	# L : L U U L
    126
    127	beq	divisor, 9f			/* div by zero */
    128	/*
    129	 * In spite of the DIV_ONLY being either a non-instruction
    130	 * or an actual stq, the addition of the .align directive
    131	 * below ensures that label 1 is going to be nicely aligned
    132	 */
    133
    134	.align	4
    135#ifdef INTSIZE
    136	/*
    137	 * shift divisor left, using 3-bit shifts for
    138	 * 32-bit divides as we can't overflow. Three-bit
    139	 * shifts will result in looping three times less
    140	 * here, but can result in two loops more later.
    141	 * Thus using a large shift isn't worth it (and
    142	 * s8add pairs better than a sll..)
    143	 */
    1441:	cmpult	divisor,modulus,compare	# E :
    145	s8addq	divisor,$31,divisor	# E :
    146	s8addq	mask,$31,mask		# E :
    147	bne	compare,1b		# U : U L U L
    148#else
    1491:	cmpult	divisor,modulus,compare	# E :
    150	nop				# E :
    151	nop				# E :
    152	blt     divisor, 2f		# U : U L U L
    153
    154	addq	divisor,divisor,divisor	# E :
    155	addq	mask,mask,mask		# E :
    156	unop				# E :
    157	bne	compare,1b		# U : U L U L
    158#endif
    159
    160	/* ok, start to go right again.. */
    1612:
    162	/*
    163	 * Keep things nicely bundled... use a nop instead of not
    164	 * having an instruction for DIV_ONLY
    165	 */
    166#ifdef DIV
    167	DIV_ONLY(addq quotient,mask,tmp2) # E :
    168#else
    169	nop				# E :
    170#endif
    171	srl	mask,1,mask		# U :
    172	cmpule	divisor,modulus,compare	# E :
    173	subq	modulus,divisor,tmp1	# E :
    174
    175#ifdef DIV
    176	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
    177	nop				# E : as part of the cmovne
    178	srl	divisor,1,divisor	# U :
    179	nop				# E : L U L U
    180
    181	nop				# E :
    182	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
    183	nop				# E : as part of the cmovne
    184	bne	mask,2b			# U : U L U L
    185#else
    186	srl	divisor,1,divisor	# U :
    187	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
    188	nop				# E : as part of the cmovne
    189	bne	mask,2b			# U : U L L U
    190#endif
    191
    1929:	ldq	$1, 0($30)		# L :
    193	ldq	$2, 8($30)		# L :
    194	nop				# E :
    195	nop				# E : U U L L
    196
    197	ldq	$0,16($30)		# L :
    198	ldq	tmp1,24($30)		# L :
    199	nop				# E :
    200	nop				# E :
    201
    202#ifdef DIV
    203	DIV_ONLY(ldq tmp2,32($30))	# L :
    204#else
    205	nop				# E :
    206#endif
    207	addq	$30,STACK,$30		# E :
    208	ret	$31,($23),1		# L0 : L U U L
    209	.end	ufunction
    210EXPORT_SYMBOL(ufunction)
    211
    212/*
    213 * Uhh.. Ugly signed division. I'd rather not have it at all, but
    214 * it's needed in some circumstances. There are different ways to
    215 * handle this, really. This does:
    216 * 	-a / b = a / -b = -(a / b)
    217 *	-a % b = -(a % b)
    218 *	a % -b = a % b
    219 * which is probably not the best solution, but at least should
    220 * have the property that (x/y)*y + (x%y) = x.
    221 */
    222.align 4
    223.globl	sfunction
    224.ent	sfunction
    225sfunction:
    226	subq	$30,STACK,$30		# E :
    227	.frame	$30,STACK,$23
    228	.prologue 0
    229	bis	$24,$25,$28		# E :
    230	SLONGIFY($28)			# E :
    231	bge	$28,7b			# U :
    232
    233	stq	$24,0($30)		# L :
    234	subq	$31,$24,$28		# E :
    235	stq	$25,8($30)		# L :
    236	nop				# E : U L U L
    237
    238	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot
    239	nop				# E : as part of the cmov
    240	stq	$23,16($30)		# L :
    241	subq	$31,$25,$28		# E : U L U L
    242
    243	stq	tmp1,24($30)		# L :
    244	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot
    245	nop				# E :
    246	bsr	$23,ufunction		# L0: L U L U
    247
    248	ldq	$24,0($30)		# L :
    249	ldq	$25,8($30)		# L :
    250	GETSIGN($28)			# E :
    251	subq	$31,$27,tmp1		# E : U U L L
    252
    253	SLONGIFY($28)			# E :
    254	ldq	$23,16($30)		# L :
    255	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
    256	nop				# E : U L L U : as part of the cmov
    257
    258	ldq	tmp1,24($30)		# L :
    259	nop				# E : as part of the cmov
    260	addq	$30,STACK,$30		# E :
    261	ret	$31,($23),1		# L0 : L U U L
    262	.end	sfunction
    263EXPORT_SYMBOL(sfunction)