cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

stan.S (13349B)


      1|
      2|	stan.sa 3.3 7/29/91
      3|
      4|	The entry point stan computes the tangent of
      5|	an input argument;
      6|	stand does the same except for denormalized input.
      7|
      8|	Input: Double-extended number X in location pointed to
      9|		by address register a0.
     10|
     11|	Output: The value tan(X) returned in floating-point register Fp0.
     12|
     13|	Accuracy and Monotonicity: The returned result is within 3 ulp in
     14|		64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
     15|		result is subsequently rounded to double precision. The
     16|		result is provably monotonic in double precision.
     17|
     18|	Speed: The program sTAN takes approximately 170 cycles for
     19|		input argument X such that |X| < 15Pi, which is the usual
     20|		situation.
     21|
     22|	Algorithm:
     23|
     24|	1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
     25|
     26|	2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
     27|		k = N mod 2, so in particular, k = 0 or 1.
     28|
     29|	3. If k is odd, go to 5.
     30|
     31|	4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
     32|		rational function U/V where
     33|		U = r + r*s*(P1 + s*(P2 + s*P3)), and
     34|		V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))),  s = r*r.
     35|		Exit.
     36|
     37|	4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
     38|		rational function U/V where
     39|		U = r + r*s*(P1 + s*(P2 + s*P3)), and
     40|		V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
     41|		-Cot(r) = -V/U. Exit.
     42|
     43|	6. If |X| > 1, go to 8.
     44|
     45|	7. (|X|<2**(-40)) Tan(X) = X. Exit.
     46|
     47|	8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
     48|
     49
     50|		Copyright (C) Motorola, Inc. 1990
     51|			All Rights Reserved
     52|
     53|       For details on the license for this file, please see the
     54|       file, README, in this same directory.
     55
     56|STAN	idnt	2,1 | Motorola 040 Floating Point Software Package
     57
     58	|section	8
     59
     60#include "fpsp.h"
     61
     62BOUNDS1:	.long 0x3FD78000,0x4004BC7E
     63TWOBYPI:	.long 0x3FE45F30,0x6DC9C883
     64
     65TANQ4:	.long 0x3EA0B759,0xF50F8688
     66TANP3:	.long 0xBEF2BAA5,0xA8924F04
     67
     68TANQ3:	.long 0xBF346F59,0xB39BA65F,0x00000000,0x00000000
     69
     70TANP2:	.long 0x3FF60000,0xE073D3FC,0x199C4A00,0x00000000
     71
     72TANQ2:	.long 0x3FF90000,0xD23CD684,0x15D95FA1,0x00000000
     73
     74TANP1:	.long 0xBFFC0000,0x8895A6C5,0xFB423BCA,0x00000000
     75
     76TANQ1:	.long 0xBFFD0000,0xEEF57E0D,0xA84BC8CE,0x00000000
     77
     78INVTWOPI: .long 0x3FFC0000,0xA2F9836E,0x4E44152A,0x00000000
     79
     80TWOPI1:	.long 0x40010000,0xC90FDAA2,0x00000000,0x00000000
     81TWOPI2:	.long 0x3FDF0000,0x85A308D4,0x00000000,0x00000000
     82
     83|--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
     84|--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
     85|--MOST 69 BITS LONG.
     86	.global	PITBL
     87PITBL:
     88  .long  0xC0040000,0xC90FDAA2,0x2168C235,0x21800000
     89  .long  0xC0040000,0xC2C75BCD,0x105D7C23,0xA0D00000
     90  .long  0xC0040000,0xBC7EDCF7,0xFF523611,0xA1E80000
     91  .long  0xC0040000,0xB6365E22,0xEE46F000,0x21480000
     92  .long  0xC0040000,0xAFEDDF4D,0xDD3BA9EE,0xA1200000
     93  .long  0xC0040000,0xA9A56078,0xCC3063DD,0x21FC0000
     94  .long  0xC0040000,0xA35CE1A3,0xBB251DCB,0x21100000
     95  .long  0xC0040000,0x9D1462CE,0xAA19D7B9,0xA1580000
     96  .long  0xC0040000,0x96CBE3F9,0x990E91A8,0x21E00000
     97  .long  0xC0040000,0x90836524,0x88034B96,0x20B00000
     98  .long  0xC0040000,0x8A3AE64F,0x76F80584,0xA1880000
     99  .long  0xC0040000,0x83F2677A,0x65ECBF73,0x21C40000
    100  .long  0xC0030000,0xFB53D14A,0xA9C2F2C2,0x20000000
    101  .long  0xC0030000,0xEEC2D3A0,0x87AC669F,0x21380000
    102  .long  0xC0030000,0xE231D5F6,0x6595DA7B,0xA1300000
    103  .long  0xC0030000,0xD5A0D84C,0x437F4E58,0x9FC00000
    104  .long  0xC0030000,0xC90FDAA2,0x2168C235,0x21000000
    105  .long  0xC0030000,0xBC7EDCF7,0xFF523611,0xA1680000
    106  .long  0xC0030000,0xAFEDDF4D,0xDD3BA9EE,0xA0A00000
    107  .long  0xC0030000,0xA35CE1A3,0xBB251DCB,0x20900000
    108  .long  0xC0030000,0x96CBE3F9,0x990E91A8,0x21600000
    109  .long  0xC0030000,0x8A3AE64F,0x76F80584,0xA1080000
    110  .long  0xC0020000,0xFB53D14A,0xA9C2F2C2,0x1F800000
    111  .long  0xC0020000,0xE231D5F6,0x6595DA7B,0xA0B00000
    112  .long  0xC0020000,0xC90FDAA2,0x2168C235,0x20800000
    113  .long  0xC0020000,0xAFEDDF4D,0xDD3BA9EE,0xA0200000
    114  .long  0xC0020000,0x96CBE3F9,0x990E91A8,0x20E00000
    115  .long  0xC0010000,0xFB53D14A,0xA9C2F2C2,0x1F000000
    116  .long  0xC0010000,0xC90FDAA2,0x2168C235,0x20000000
    117  .long  0xC0010000,0x96CBE3F9,0x990E91A8,0x20600000
    118  .long  0xC0000000,0xC90FDAA2,0x2168C235,0x1F800000
    119  .long  0xBFFF0000,0xC90FDAA2,0x2168C235,0x1F000000
    120  .long  0x00000000,0x00000000,0x00000000,0x00000000
    121  .long  0x3FFF0000,0xC90FDAA2,0x2168C235,0x9F000000
    122  .long  0x40000000,0xC90FDAA2,0x2168C235,0x9F800000
    123  .long  0x40010000,0x96CBE3F9,0x990E91A8,0xA0600000
    124  .long  0x40010000,0xC90FDAA2,0x2168C235,0xA0000000
    125  .long  0x40010000,0xFB53D14A,0xA9C2F2C2,0x9F000000
    126  .long  0x40020000,0x96CBE3F9,0x990E91A8,0xA0E00000
    127  .long  0x40020000,0xAFEDDF4D,0xDD3BA9EE,0x20200000
    128  .long  0x40020000,0xC90FDAA2,0x2168C235,0xA0800000
    129  .long  0x40020000,0xE231D5F6,0x6595DA7B,0x20B00000
    130  .long  0x40020000,0xFB53D14A,0xA9C2F2C2,0x9F800000
    131  .long  0x40030000,0x8A3AE64F,0x76F80584,0x21080000
    132  .long  0x40030000,0x96CBE3F9,0x990E91A8,0xA1600000
    133  .long  0x40030000,0xA35CE1A3,0xBB251DCB,0xA0900000
    134  .long  0x40030000,0xAFEDDF4D,0xDD3BA9EE,0x20A00000
    135  .long  0x40030000,0xBC7EDCF7,0xFF523611,0x21680000
    136  .long  0x40030000,0xC90FDAA2,0x2168C235,0xA1000000
    137  .long  0x40030000,0xD5A0D84C,0x437F4E58,0x1FC00000
    138  .long  0x40030000,0xE231D5F6,0x6595DA7B,0x21300000
    139  .long  0x40030000,0xEEC2D3A0,0x87AC669F,0xA1380000
    140  .long  0x40030000,0xFB53D14A,0xA9C2F2C2,0xA0000000
    141  .long  0x40040000,0x83F2677A,0x65ECBF73,0xA1C40000
    142  .long  0x40040000,0x8A3AE64F,0x76F80584,0x21880000
    143  .long  0x40040000,0x90836524,0x88034B96,0xA0B00000
    144  .long  0x40040000,0x96CBE3F9,0x990E91A8,0xA1E00000
    145  .long  0x40040000,0x9D1462CE,0xAA19D7B9,0x21580000
    146  .long  0x40040000,0xA35CE1A3,0xBB251DCB,0xA1100000
    147  .long  0x40040000,0xA9A56078,0xCC3063DD,0xA1FC0000
    148  .long  0x40040000,0xAFEDDF4D,0xDD3BA9EE,0x21200000
    149  .long  0x40040000,0xB6365E22,0xEE46F000,0xA1480000
    150  .long  0x40040000,0xBC7EDCF7,0xFF523611,0x21E80000
    151  .long  0x40040000,0xC2C75BCD,0x105D7C23,0x20D00000
    152  .long  0x40040000,0xC90FDAA2,0x2168C235,0xA1800000
    153
    154	.set	INARG,FP_SCR4
    155
    156	.set	TWOTO63,L_SCR1
    157	.set	ENDFLAG,L_SCR2
    158	.set	N,L_SCR3
    159
    160	| xref	t_frcinx
    161	|xref	t_extdnrm
    162
    163	.global	stand
    164stand:
    165|--TAN(X) = X FOR DENORMALIZED X
    166
    167	bra		t_extdnrm
    168
    169	.global	stan
    170stan:
    171	fmovex		(%a0),%fp0	| ...LOAD INPUT
    172
    173	movel		(%a0),%d0
    174	movew		4(%a0),%d0
    175	andil		#0x7FFFFFFF,%d0
    176
    177	cmpil		#0x3FD78000,%d0		| ...|X| >= 2**(-40)?
    178	bges		TANOK1
    179	bra		TANSM
    180TANOK1:
    181	cmpil		#0x4004BC7E,%d0		| ...|X| < 15 PI?
    182	blts		TANMAIN
    183	bra		REDUCEX
    184
    185
    186TANMAIN:
    187|--THIS IS THE USUAL CASE, |X| <= 15 PI.
    188|--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
    189	fmovex		%fp0,%fp1
    190	fmuld		TWOBYPI,%fp1	| ...X*2/PI
    191
    192|--HIDE THE NEXT TWO INSTRUCTIONS
    193	leal		PITBL+0x200,%a1 | ...TABLE OF N*PI/2, N = -32,...,32
    194
    195|--FP1 IS NOW READY
    196	fmovel		%fp1,%d0		| ...CONVERT TO INTEGER
    197
    198	asll		#4,%d0
    199	addal		%d0,%a1		| ...ADDRESS N*PIBY2 IN Y1, Y2
    200
    201	fsubx		(%a1)+,%fp0	| ...X-Y1
    202|--HIDE THE NEXT ONE
    203
    204	fsubs		(%a1),%fp0	| ...FP0 IS R = (X-Y1)-Y2
    205
    206	rorl		#5,%d0
    207	andil		#0x80000000,%d0	| ...D0 WAS ODD IFF D0 < 0
    208
    209TANCONT:
    210
    211	cmpil		#0,%d0
    212	blt		NODD
    213
    214	fmovex		%fp0,%fp1
    215	fmulx		%fp1,%fp1		| ...S = R*R
    216
    217	fmoved		TANQ4,%fp3
    218	fmoved		TANP3,%fp2
    219
    220	fmulx		%fp1,%fp3		| ...SQ4
    221	fmulx		%fp1,%fp2		| ...SP3
    222
    223	faddd		TANQ3,%fp3	| ...Q3+SQ4
    224	faddx		TANP2,%fp2	| ...P2+SP3
    225
    226	fmulx		%fp1,%fp3		| ...S(Q3+SQ4)
    227	fmulx		%fp1,%fp2		| ...S(P2+SP3)
    228
    229	faddx		TANQ2,%fp3	| ...Q2+S(Q3+SQ4)
    230	faddx		TANP1,%fp2	| ...P1+S(P2+SP3)
    231
    232	fmulx		%fp1,%fp3		| ...S(Q2+S(Q3+SQ4))
    233	fmulx		%fp1,%fp2		| ...S(P1+S(P2+SP3))
    234
    235	faddx		TANQ1,%fp3	| ...Q1+S(Q2+S(Q3+SQ4))
    236	fmulx		%fp0,%fp2		| ...RS(P1+S(P2+SP3))
    237
    238	fmulx		%fp3,%fp1		| ...S(Q1+S(Q2+S(Q3+SQ4)))
    239
    240
    241	faddx		%fp2,%fp0		| ...R+RS(P1+S(P2+SP3))
    242
    243
    244	fadds		#0x3F800000,%fp1	| ...1+S(Q1+...)
    245
    246	fmovel		%d1,%fpcr		|restore users exceptions
    247	fdivx		%fp1,%fp0		|last inst - possible exception set
    248
    249	bra		t_frcinx
    250
    251NODD:
    252	fmovex		%fp0,%fp1
    253	fmulx		%fp0,%fp0		| ...S = R*R
    254
    255	fmoved		TANQ4,%fp3
    256	fmoved		TANP3,%fp2
    257
    258	fmulx		%fp0,%fp3		| ...SQ4
    259	fmulx		%fp0,%fp2		| ...SP3
    260
    261	faddd		TANQ3,%fp3	| ...Q3+SQ4
    262	faddx		TANP2,%fp2	| ...P2+SP3
    263
    264	fmulx		%fp0,%fp3		| ...S(Q3+SQ4)
    265	fmulx		%fp0,%fp2		| ...S(P2+SP3)
    266
    267	faddx		TANQ2,%fp3	| ...Q2+S(Q3+SQ4)
    268	faddx		TANP1,%fp2	| ...P1+S(P2+SP3)
    269
    270	fmulx		%fp0,%fp3		| ...S(Q2+S(Q3+SQ4))
    271	fmulx		%fp0,%fp2		| ...S(P1+S(P2+SP3))
    272
    273	faddx		TANQ1,%fp3	| ...Q1+S(Q2+S(Q3+SQ4))
    274	fmulx		%fp1,%fp2		| ...RS(P1+S(P2+SP3))
    275
    276	fmulx		%fp3,%fp0		| ...S(Q1+S(Q2+S(Q3+SQ4)))
    277
    278
    279	faddx		%fp2,%fp1		| ...R+RS(P1+S(P2+SP3))
    280	fadds		#0x3F800000,%fp0	| ...1+S(Q1+...)
    281
    282
    283	fmovex		%fp1,-(%sp)
    284	eoril		#0x80000000,(%sp)
    285
    286	fmovel		%d1,%fpcr		|restore users exceptions
    287	fdivx		(%sp)+,%fp0	|last inst - possible exception set
    288
    289	bra		t_frcinx
    290
    291TANBORS:
    292|--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
    293|--IF |X| < 2**(-40), RETURN X OR 1.
    294	cmpil		#0x3FFF8000,%d0
    295	bgts		REDUCEX
    296
    297TANSM:
    298
    299	fmovex		%fp0,-(%sp)
    300	fmovel		%d1,%fpcr		 |restore users exceptions
    301	fmovex		(%sp)+,%fp0	|last inst - possible exception set
    302
    303	bra		t_frcinx
    304
    305
    306REDUCEX:
    307|--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
    308|--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
    309|--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
    310
    311	fmovemx	%fp2-%fp5,-(%a7)	| ...save FP2 through FP5
    312	movel		%d2,-(%a7)
    313        fmoves         #0x00000000,%fp1
    314
    315|--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
    316|--there is a danger of unwanted overflow in first LOOP iteration.  In this
    317|--case, reduce argument by one remainder step to make subsequent reduction
    318|--safe.
    319	cmpil	#0x7ffeffff,%d0		|is argument dangerously large?
    320	bnes	LOOP
    321	movel	#0x7ffe0000,FP_SCR2(%a6)	|yes
    322|					;create 2**16383*PI/2
    323	movel	#0xc90fdaa2,FP_SCR2+4(%a6)
    324	clrl	FP_SCR2+8(%a6)
    325	ftstx	%fp0			|test sign of argument
    326	movel	#0x7fdc0000,FP_SCR3(%a6)	|create low half of 2**16383*
    327|					;PI/2 at FP_SCR3
    328	movel	#0x85a308d3,FP_SCR3+4(%a6)
    329	clrl   FP_SCR3+8(%a6)
    330	fblt	red_neg
    331	orw	#0x8000,FP_SCR2(%a6)	|positive arg
    332	orw	#0x8000,FP_SCR3(%a6)
    333red_neg:
    334	faddx  FP_SCR2(%a6),%fp0		|high part of reduction is exact
    335	fmovex  %fp0,%fp1		|save high result in fp1
    336	faddx  FP_SCR3(%a6),%fp0		|low part of reduction
    337	fsubx  %fp0,%fp1			|determine low component of result
    338	faddx  FP_SCR3(%a6),%fp1		|fp0/fp1 are reduced argument.
    339
    340|--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
    341|--integer quotient will be stored in N
    342|--Intermediate remainder is 66-bit long; (R,r) in (FP0,FP1)
    343
    344LOOP:
    345	fmovex		%fp0,INARG(%a6)	| ...+-2**K * F, 1 <= F < 2
    346	movew		INARG(%a6),%d0
    347        movel          %d0,%a1		| ...save a copy of D0
    348	andil		#0x00007FFF,%d0
    349	subil		#0x00003FFF,%d0	| ...D0 IS K
    350	cmpil		#28,%d0
    351	bles		LASTLOOP
    352CONTLOOP:
    353	subil		#27,%d0	 | ...D0 IS L := K-27
    354	movel		#0,ENDFLAG(%a6)
    355	bras		WORK
    356LASTLOOP:
    357	clrl		%d0		| ...D0 IS L := 0
    358	movel		#1,ENDFLAG(%a6)
    359
    360WORK:
    361|--FIND THE REMAINDER OF (R,r) W.R.T.	2**L * (PI/2). L IS SO CHOSEN
    362|--THAT	INT( X * (2/PI) / 2**(L) ) < 2**29.
    363
    364|--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
    365|--2**L * (PIby2_1), 2**L * (PIby2_2)
    366
    367	movel		#0x00003FFE,%d2	| ...BIASED EXPO OF 2/PI
    368	subl		%d0,%d2		| ...BIASED EXPO OF 2**(-L)*(2/PI)
    369
    370	movel		#0xA2F9836E,FP_SCR1+4(%a6)
    371	movel		#0x4E44152A,FP_SCR1+8(%a6)
    372	movew		%d2,FP_SCR1(%a6)	| ...FP_SCR1 is 2**(-L)*(2/PI)
    373
    374	fmovex		%fp0,%fp2
    375	fmulx		FP_SCR1(%a6),%fp2
    376|--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
    377|--FLOATING POINT FORMAT, THE TWO FMOVE'S	FMOVE.L FP <--> N
    378|--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
    379|--(SIGN(INARG)*2**63	+	FP2) - SIGN(INARG)*2**63 WILL GIVE
    380|--US THE DESIRED VALUE IN FLOATING POINT.
    381
    382|--HIDE SIX CYCLES OF INSTRUCTION
    383        movel		%a1,%d2
    384        swap		%d2
    385	andil		#0x80000000,%d2
    386	oril		#0x5F000000,%d2	| ...D2 IS SIGN(INARG)*2**63 IN SGL
    387	movel		%d2,TWOTO63(%a6)
    388
    389	movel		%d0,%d2
    390	addil		#0x00003FFF,%d2	| ...BIASED EXPO OF 2**L * (PI/2)
    391
    392|--FP2 IS READY
    393	fadds		TWOTO63(%a6),%fp2	| ...THE FRACTIONAL PART OF FP1 IS ROUNDED
    394
    395|--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1  and  2**(L)*Piby2_2
    396        movew		%d2,FP_SCR2(%a6)
    397	clrw           FP_SCR2+2(%a6)
    398	movel		#0xC90FDAA2,FP_SCR2+4(%a6)
    399	clrl		FP_SCR2+8(%a6)		| ...FP_SCR2 is  2**(L) * Piby2_1
    400
    401|--FP2 IS READY
    402	fsubs		TWOTO63(%a6),%fp2		| ...FP2 is N
    403
    404	addil		#0x00003FDD,%d0
    405        movew		%d0,FP_SCR3(%a6)
    406	clrw           FP_SCR3+2(%a6)
    407	movel		#0x85A308D3,FP_SCR3+4(%a6)
    408	clrl		FP_SCR3+8(%a6)		| ...FP_SCR3 is 2**(L) * Piby2_2
    409
    410	movel		ENDFLAG(%a6),%d0
    411
    412|--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
    413|--P2 = 2**(L) * Piby2_2
    414	fmovex		%fp2,%fp4
    415	fmulx		FP_SCR2(%a6),%fp4		| ...W = N*P1
    416	fmovex		%fp2,%fp5
    417	fmulx		FP_SCR3(%a6),%fp5		| ...w = N*P2
    418	fmovex		%fp4,%fp3
    419|--we want P+p = W+w  but  |p| <= half ulp of P
    420|--Then, we need to compute  A := R-P   and  a := r-p
    421	faddx		%fp5,%fp3			| ...FP3 is P
    422	fsubx		%fp3,%fp4			| ...W-P
    423
    424	fsubx		%fp3,%fp0			| ...FP0 is A := R - P
    425        faddx		%fp5,%fp4			| ...FP4 is p = (W-P)+w
    426
    427	fmovex		%fp0,%fp3			| ...FP3 A
    428	fsubx		%fp4,%fp1			| ...FP1 is a := r - p
    429
    430|--Now we need to normalize (A,a) to  "new (R,r)" where R+r = A+a but
    431|--|r| <= half ulp of R.
    432	faddx		%fp1,%fp0			| ...FP0 is R := A+a
    433|--No need to calculate r if this is the last loop
    434	cmpil		#0,%d0
    435	bgt		RESTORE
    436
    437|--Need to calculate r
    438	fsubx		%fp0,%fp3			| ...A-R
    439	faddx		%fp3,%fp1			| ...FP1 is r := (A-R)+a
    440	bra		LOOP
    441
    442RESTORE:
    443        fmovel		%fp2,N(%a6)
    444	movel		(%a7)+,%d2
    445	fmovemx	(%a7)+,%fp2-%fp5
    446
    447
    448	movel		N(%a6),%d0
    449        rorl		#1,%d0
    450
    451
    452	bra		TANCONT
    453
    454	|end