cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

setox.S (28388B)


      1|
      2|	setox.sa 3.1 12/10/90
      3|
      4|	The entry point setox computes the exponential of a value.
      5|	setoxd does the same except the input value is a denormalized
      6|	number.	setoxm1 computes exp(X)-1, and setoxm1d computes
      7|	exp(X)-1 for denormalized X.
      8|
      9|	INPUT
     10|	-----
     11|	Double-extended value in memory location pointed to by address
     12|	register a0.
     13|
     14|	OUTPUT
     15|	------
     16|	exp(X) or exp(X)-1 returned in floating-point register fp0.
     17|
     18|	ACCURACY and MONOTONICITY
     19|	-------------------------
     20|	The returned result is within 0.85 ulps in 64 significant bit, i.e.
     21|	within 0.5001 ulp to 53 bits if the result is subsequently rounded
     22|	to double precision. The result is provably monotonic in double
     23|	precision.
     24|
     25|	SPEED
     26|	-----
     27|	Two timings are measured, both in the copy-back mode. The
     28|	first one is measured when the function is invoked the first time
     29|	(so the instructions and data are not in cache), and the
     30|	second one is measured when the function is reinvoked at the same
     31|	input argument.
     32|
     33|	The program setox takes approximately 210/190 cycles for input
     34|	argument X whose magnitude is less than 16380 log2, which
     35|	is the usual situation.	For the less common arguments,
     36|	depending on their values, the program may run faster or slower --
     37|	but no worse than 10% slower even in the extreme cases.
     38|
     39|	The program setoxm1 takes approximately ??? / ??? cycles for input
     40|	argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
     41|	approximately ??? / ??? cycles. For the less common arguments,
     42|	depending on their values, the program may run faster or slower --
     43|	but no worse than 10% slower even in the extreme cases.
     44|
     45|	ALGORITHM and IMPLEMENTATION NOTES
     46|	----------------------------------
     47|
     48|	setoxd
     49|	------
     50|	Step 1.	Set ans := 1.0
     51|
     52|	Step 2.	Return	ans := ans + sign(X)*2^(-126). Exit.
     53|	Notes:	This will always generate one exception -- inexact.
     54|
     55|
     56|	setox
     57|	-----
     58|
     59|	Step 1.	Filter out extreme cases of input argument.
     60|		1.1	If |X| >= 2^(-65), go to Step 1.3.
     61|		1.2	Go to Step 7.
     62|		1.3	If |X| < 16380 log(2), go to Step 2.
     63|		1.4	Go to Step 8.
     64|	Notes:	The usual case should take the branches 1.1 -> 1.3 -> 2.
     65|		 To avoid the use of floating-point comparisons, a
     66|		 compact representation of |X| is used. This format is a
     67|		 32-bit integer, the upper (more significant) 16 bits are
     68|		 the sign and biased exponent field of |X|; the lower 16
     69|		 bits are the 16 most significant fraction (including the
     70|		 explicit bit) bits of |X|. Consequently, the comparisons
     71|		 in Steps 1.1 and 1.3 can be performed by integer comparison.
     72|		 Note also that the constant 16380 log(2) used in Step 1.3
     73|		 is also in the compact form. Thus taking the branch
     74|		 to Step 2 guarantees |X| < 16380 log(2). There is no harm
     75|		 to have a small number of cases where |X| is less than,
     76|		 but close to, 16380 log(2) and the branch to Step 9 is
     77|		 taken.
     78|
     79|	Step 2.	Calculate N = round-to-nearest-int( X * 64/log2 ).
     80|		2.1	Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
     81|		2.2	N := round-to-nearest-integer( X * 64/log2 ).
     82|		2.3	Calculate	J = N mod 64; so J = 0,1,2,..., or 63.
     83|		2.4	Calculate	M = (N - J)/64; so N = 64M + J.
     84|		2.5	Calculate the address of the stored value of 2^(J/64).
     85|		2.6	Create the value Scale = 2^M.
     86|	Notes:	The calculation in 2.2 is really performed by
     87|
     88|			Z := X * constant
     89|			N := round-to-nearest-integer(Z)
     90|
     91|		 where
     92|
     93|			constant := single-precision( 64/log 2 ).
     94|
     95|		 Using a single-precision constant avoids memory access.
     96|		 Another effect of using a single-precision "constant" is
     97|		 that the calculated value Z is
     98|
     99|			Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
    100|
    101|		 This error has to be considered later in Steps 3 and 4.
    102|
    103|	Step 3.	Calculate X - N*log2/64.
    104|		3.1	R := X + N*L1, where L1 := single-precision(-log2/64).
    105|		3.2	R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
    106|	Notes:	a) The way L1 and L2 are chosen ensures L1+L2 approximate
    107|		 the value	-log2/64	to 88 bits of accuracy.
    108|		 b) N*L1 is exact because N is no longer than 22 bits and
    109|		 L1 is no longer than 24 bits.
    110|		 c) The calculation X+N*L1 is also exact due to cancellation.
    111|		 Thus, R is practically X+N(L1+L2) to full 64 bits.
    112|		 d) It is important to estimate how large can |R| be after
    113|		 Step 3.2.
    114|
    115|			N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
    116|			X*64/log2 (1+eps)	=	N + f,	|f| <= 0.5
    117|			X*64/log2 - N	=	f - eps*X 64/log2
    118|			X - N*log2/64	=	f*log2/64 - eps*X
    119|
    120|
    121|		 Now |X| <= 16446 log2, thus
    122|
    123|			|X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
    124|					<= 0.57 log2/64.
    125|		 This bound will be used in Step 4.
    126|
    127|	Step 4.	Approximate exp(R)-1 by a polynomial
    128|			p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
    129|	Notes:	a) In order to reduce memory access, the coefficients are
    130|		 made as "short" as possible: A1 (which is 1/2), A4 and A5
    131|		 are single precision; A2 and A3 are double precision.
    132|		 b) Even with the restrictions above,
    133|			|p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
    134|		 Note that 0.0062 is slightly bigger than 0.57 log2/64.
    135|		 c) To fully utilize the pipeline, p is separated into
    136|		 two independent pieces of roughly equal complexities
    137|			p = [ R + R*S*(A2 + S*A4) ]	+
    138|				[ S*(A1 + S*(A3 + S*A5)) ]
    139|		 where S = R*R.
    140|
    141|	Step 5.	Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
    142|				ans := T + ( T*p + t)
    143|		 where T and t are the stored values for 2^(J/64).
    144|	Notes:	2^(J/64) is stored as T and t where T+t approximates
    145|		 2^(J/64) to roughly 85 bits; T is in extended precision
    146|		 and t is in single precision. Note also that T is rounded
    147|		 to 62 bits so that the last two bits of T are zero. The
    148|		 reason for such a special form is that T-1, T-2, and T-8
    149|		 will all be exact --- a property that will give much
    150|		 more accurate computation of the function EXPM1.
    151|
    152|	Step 6.	Reconstruction of exp(X)
    153|			exp(X) = 2^M * 2^(J/64) * exp(R).
    154|		6.1	If AdjFlag = 0, go to 6.3
    155|		6.2	ans := ans * AdjScale
    156|		6.3	Restore the user FPCR
    157|		6.4	Return ans := ans * Scale. Exit.
    158|	Notes:	If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
    159|		 |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
    160|		 neither overflow nor underflow. If AdjFlag = 1, that
    161|		 means that
    162|			X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
    163|		 Hence, exp(X) may overflow or underflow or neither.
    164|		 When that is the case, AdjScale = 2^(M1) where M1 is
    165|		 approximately M. Thus 6.2 will never cause over/underflow.
    166|		 Possible exception in 6.4 is overflow or underflow.
    167|		 The inexact exception is not generated in 6.4. Although
    168|		 one can argue that the inexact flag should always be
    169|		 raised, to simulate that exception cost to much than the
    170|		 flag is worth in practical uses.
    171|
    172|	Step 7.	Return 1 + X.
    173|		7.1	ans := X
    174|		7.2	Restore user FPCR.
    175|		7.3	Return ans := 1 + ans. Exit
    176|	Notes:	For non-zero X, the inexact exception will always be
    177|		 raised by 7.3. That is the only exception raised by 7.3.
    178|		 Note also that we use the FMOVEM instruction to move X
    179|		 in Step 7.1 to avoid unnecessary trapping. (Although
    180|		 the FMOVEM may not seem relevant since X is normalized,
    181|		 the precaution will be useful in the library version of
    182|		 this code where the separate entry for denormalized inputs
    183|		 will be done away with.)
    184|
    185|	Step 8.	Handle exp(X) where |X| >= 16380log2.
    186|		8.1	If |X| > 16480 log2, go to Step 9.
    187|		(mimic 2.2 - 2.6)
    188|		8.2	N := round-to-integer( X * 64/log2 )
    189|		8.3	Calculate J = N mod 64, J = 0,1,...,63
    190|		8.4	K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
    191|		8.5	Calculate the address of the stored value 2^(J/64).
    192|		8.6	Create the values Scale = 2^M, AdjScale = 2^M1.
    193|		8.7	Go to Step 3.
    194|	Notes:	Refer to notes for 2.2 - 2.6.
    195|
    196|	Step 9.	Handle exp(X), |X| > 16480 log2.
    197|		9.1	If X < 0, go to 9.3
    198|		9.2	ans := Huge, go to 9.4
    199|		9.3	ans := Tiny.
    200|		9.4	Restore user FPCR.
    201|		9.5	Return ans := ans * ans. Exit.
    202|	Notes:	Exp(X) will surely overflow or underflow, depending on
    203|		 X's sign. "Huge" and "Tiny" are respectively large/tiny
    204|		 extended-precision numbers whose square over/underflow
    205|		 with an inexact result. Thus, 9.5 always raises the
    206|		 inexact together with either overflow or underflow.
    207|
    208|
    209|	setoxm1d
    210|	--------
    211|
    212|	Step 1.	Set ans := 0
    213|
    214|	Step 2.	Return	ans := X + ans. Exit.
    215|	Notes:	This will return X with the appropriate rounding
    216|		 precision prescribed by the user FPCR.
    217|
    218|	setoxm1
    219|	-------
    220|
    221|	Step 1.	Check |X|
    222|		1.1	If |X| >= 1/4, go to Step 1.3.
    223|		1.2	Go to Step 7.
    224|		1.3	If |X| < 70 log(2), go to Step 2.
    225|		1.4	Go to Step 10.
    226|	Notes:	The usual case should take the branches 1.1 -> 1.3 -> 2.
    227|		 However, it is conceivable |X| can be small very often
    228|		 because EXPM1 is intended to evaluate exp(X)-1 accurately
    229|		 when |X| is small. For further details on the comparisons,
    230|		 see the notes on Step 1 of setox.
    231|
    232|	Step 2.	Calculate N = round-to-nearest-int( X * 64/log2 ).
    233|		2.1	N := round-to-nearest-integer( X * 64/log2 ).
    234|		2.2	Calculate	J = N mod 64; so J = 0,1,2,..., or 63.
    235|		2.3	Calculate	M = (N - J)/64; so N = 64M + J.
    236|		2.4	Calculate the address of the stored value of 2^(J/64).
    237|		2.5	Create the values Sc = 2^M and OnebySc := -2^(-M).
    238|	Notes:	See the notes on Step 2 of setox.
    239|
    240|	Step 3.	Calculate X - N*log2/64.
    241|		3.1	R := X + N*L1, where L1 := single-precision(-log2/64).
    242|		3.2	R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
    243|	Notes:	Applying the analysis of Step 3 of setox in this case
    244|		 shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
    245|		 this case).
    246|
    247|	Step 4.	Approximate exp(R)-1 by a polynomial
    248|			p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
    249|	Notes:	a) In order to reduce memory access, the coefficients are
    250|		 made as "short" as possible: A1 (which is 1/2), A5 and A6
    251|		 are single precision; A2, A3 and A4 are double precision.
    252|		 b) Even with the restriction above,
    253|			|p - (exp(R)-1)| <	|R| * 2^(-72.7)
    254|		 for all |R| <= 0.0055.
    255|		 c) To fully utilize the pipeline, p is separated into
    256|		 two independent pieces of roughly equal complexity
    257|			p = [ R*S*(A2 + S*(A4 + S*A6)) ]	+
    258|				[ R + S*(A1 + S*(A3 + S*A5)) ]
    259|		 where S = R*R.
    260|
    261|	Step 5.	Compute 2^(J/64)*p by
    262|				p := T*p
    263|		 where T and t are the stored values for 2^(J/64).
    264|	Notes:	2^(J/64) is stored as T and t where T+t approximates
    265|		 2^(J/64) to roughly 85 bits; T is in extended precision
    266|		 and t is in single precision. Note also that T is rounded
    267|		 to 62 bits so that the last two bits of T are zero. The
    268|		 reason for such a special form is that T-1, T-2, and T-8
    269|		 will all be exact --- a property that will be exploited
    270|		 in Step 6 below. The total relative error in p is no
    271|		 bigger than 2^(-67.7) compared to the final result.
    272|
    273|	Step 6.	Reconstruction of exp(X)-1
    274|			exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
    275|		6.1	If M <= 63, go to Step 6.3.
    276|		6.2	ans := T + (p + (t + OnebySc)). Go to 6.6
    277|		6.3	If M >= -3, go to 6.5.
    278|		6.4	ans := (T + (p + t)) + OnebySc. Go to 6.6
    279|		6.5	ans := (T + OnebySc) + (p + t).
    280|		6.6	Restore user FPCR.
    281|		6.7	Return ans := Sc * ans. Exit.
    282|	Notes:	The various arrangements of the expressions give accurate
    283|		 evaluations.
    284|
    285|	Step 7.	exp(X)-1 for |X| < 1/4.
    286|		7.1	If |X| >= 2^(-65), go to Step 9.
    287|		7.2	Go to Step 8.
    288|
    289|	Step 8.	Calculate exp(X)-1, |X| < 2^(-65).
    290|		8.1	If |X| < 2^(-16312), goto 8.3
    291|		8.2	Restore FPCR; return ans := X - 2^(-16382). Exit.
    292|		8.3	X := X * 2^(140).
    293|		8.4	Restore FPCR; ans := ans - 2^(-16382).
    294|		 Return ans := ans*2^(140). Exit
    295|	Notes:	The idea is to return "X - tiny" under the user
    296|		 precision and rounding modes. To avoid unnecessary
    297|		 inefficiency, we stay away from denormalized numbers the
    298|		 best we can. For |X| >= 2^(-16312), the straightforward
    299|		 8.2 generates the inexact exception as the case warrants.
    300|
    301|	Step 9.	Calculate exp(X)-1, |X| < 1/4, by a polynomial
    302|			p = X + X*X*(B1 + X*(B2 + ... + X*B12))
    303|	Notes:	a) In order to reduce memory access, the coefficients are
    304|		 made as "short" as possible: B1 (which is 1/2), B9 to B12
    305|		 are single precision; B3 to B8 are double precision; and
    306|		 B2 is double extended.
    307|		 b) Even with the restriction above,
    308|			|p - (exp(X)-1)| < |X| 2^(-70.6)
    309|		 for all |X| <= 0.251.
    310|		 Note that 0.251 is slightly bigger than 1/4.
    311|		 c) To fully preserve accuracy, the polynomial is computed
    312|		 as	X + ( S*B1 +	Q ) where S = X*X and
    313|			Q	=	X*S*(B2 + X*(B3 + ... + X*B12))
    314|		 d) To fully utilize the pipeline, Q is separated into
    315|		 two independent pieces of roughly equal complexity
    316|			Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
    317|				[ S*S*(B3 + S*(B5 + ... + S*B11)) ]
    318|
    319|	Step 10.	Calculate exp(X)-1 for |X| >= 70 log 2.
    320|		10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
    321|		 purposes. Therefore, go to Step 1 of setox.
    322|		10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
    323|		 ans := -1
    324|		 Restore user FPCR
    325|		 Return ans := ans + 2^(-126). Exit.
    326|	Notes:	10.2 will always create an inexact and return -1 + tiny
    327|		 in the user rounding precision and mode.
    328|
    329|
    330
    331|		Copyright (C) Motorola, Inc. 1990
    332|			All Rights Reserved
    333|
    334|       For details on the license for this file, please see the
    335|       file, README, in this same directory.
    336
    337|setox	idnt	2,1 | Motorola 040 Floating Point Software Package
    338
    339	|section	8
    340
    341#include "fpsp.h"
    342
    343L2:	.long	0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
    344
    345EXPA3:	.long	0x3FA55555,0x55554431
    346EXPA2:	.long	0x3FC55555,0x55554018
    347
    348HUGE:	.long	0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
    349TINY:	.long	0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
    350
    351EM1A4:	.long	0x3F811111,0x11174385
    352EM1A3:	.long	0x3FA55555,0x55554F5A
    353
    354EM1A2:	.long	0x3FC55555,0x55555555,0x00000000,0x00000000
    355
    356EM1B8:	.long	0x3EC71DE3,0xA5774682
    357EM1B7:	.long	0x3EFA01A0,0x19D7CB68
    358
    359EM1B6:	.long	0x3F2A01A0,0x1A019DF3
    360EM1B5:	.long	0x3F56C16C,0x16C170E2
    361
    362EM1B4:	.long	0x3F811111,0x11111111
    363EM1B3:	.long	0x3FA55555,0x55555555
    364
    365EM1B2:	.long	0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
    366	.long	0x00000000
    367
    368TWO140:	.long	0x48B00000,0x00000000
    369TWON140:	.long	0x37300000,0x00000000
    370
    371EXPTBL:
    372	.long	0x3FFF0000,0x80000000,0x00000000,0x00000000
    373	.long	0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
    374	.long	0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
    375	.long	0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
    376	.long	0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
    377	.long	0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
    378	.long	0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
    379	.long	0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
    380	.long	0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
    381	.long	0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
    382	.long	0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
    383	.long	0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
    384	.long	0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
    385	.long	0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
    386	.long	0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
    387	.long	0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
    388	.long	0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
    389	.long	0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
    390	.long	0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
    391	.long	0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
    392	.long	0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
    393	.long	0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
    394	.long	0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
    395	.long	0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
    396	.long	0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
    397	.long	0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
    398	.long	0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
    399	.long	0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
    400	.long	0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
    401	.long	0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
    402	.long	0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
    403	.long	0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
    404	.long	0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
    405	.long	0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
    406	.long	0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
    407	.long	0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
    408	.long	0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
    409	.long	0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
    410	.long	0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
    411	.long	0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
    412	.long	0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
    413	.long	0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
    414	.long	0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
    415	.long	0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
    416	.long	0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
    417	.long	0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
    418	.long	0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
    419	.long	0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
    420	.long	0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
    421	.long	0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
    422	.long	0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
    423	.long	0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
    424	.long	0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
    425	.long	0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
    426	.long	0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
    427	.long	0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
    428	.long	0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
    429	.long	0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
    430	.long	0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
    431	.long	0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
    432	.long	0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
    433	.long	0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
    434	.long	0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
    435	.long	0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
    436
    437	.set	ADJFLAG,L_SCR2
    438	.set	SCALE,FP_SCR1
    439	.set	ADJSCALE,FP_SCR2
    440	.set	SC,FP_SCR3
    441	.set	ONEBYSC,FP_SCR4
    442
    443	| xref	t_frcinx
    444	|xref	t_extdnrm
    445	|xref	t_unfl
    446	|xref	t_ovfl
    447
    448	.global	setoxd
    449setoxd:
    450|--entry point for EXP(X), X is denormalized
    451	movel		(%a0),%d0
    452	andil		#0x80000000,%d0
    453	oril		#0x00800000,%d0		| ...sign(X)*2^(-126)
    454	movel		%d0,-(%sp)
    455	fmoves		#0x3F800000,%fp0
    456	fmovel		%d1,%fpcr
    457	fadds		(%sp)+,%fp0
    458	bra		t_frcinx
    459
    460	.global	setox
    461setox:
    462|--entry point for EXP(X), here X is finite, non-zero, and not NaN's
    463
    464|--Step 1.
    465	movel		(%a0),%d0	 | ...load part of input X
    466	andil		#0x7FFF0000,%d0	| ...biased expo. of X
    467	cmpil		#0x3FBE0000,%d0	| ...2^(-65)
    468	bges		EXPC1		| ...normal case
    469	bra		EXPSM
    470
    471EXPC1:
    472|--The case |X| >= 2^(-65)
    473	movew		4(%a0),%d0	| ...expo. and partial sig. of |X|
    474	cmpil		#0x400CB167,%d0	| ...16380 log2 trunc. 16 bits
    475	blts		EXPMAIN	 | ...normal case
    476	bra		EXPBIG
    477
    478EXPMAIN:
    479|--Step 2.
    480|--This is the normal branch:	2^(-65) <= |X| < 16380 log2.
    481	fmovex		(%a0),%fp0	| ...load input from (a0)
    482
    483	fmovex		%fp0,%fp1
    484	fmuls		#0x42B8AA3B,%fp0	| ...64/log2 * X
    485	fmovemx	%fp2-%fp2/%fp3,-(%a7)		| ...save fp2
    486	movel		#0,ADJFLAG(%a6)
    487	fmovel		%fp0,%d0		| ...N = int( X * 64/log2 )
    488	lea		EXPTBL,%a1
    489	fmovel		%d0,%fp0		| ...convert to floating-format
    490
    491	movel		%d0,L_SCR1(%a6)	| ...save N temporarily
    492	andil		#0x3F,%d0		| ...D0 is J = N mod 64
    493	lsll		#4,%d0
    494	addal		%d0,%a1		| ...address of 2^(J/64)
    495	movel		L_SCR1(%a6),%d0
    496	asrl		#6,%d0		| ...D0 is M
    497	addiw		#0x3FFF,%d0	| ...biased expo. of 2^(M)
    498	movew		L2,L_SCR1(%a6)	| ...prefetch L2, no need in CB
    499
    500EXPCONT1:
    501|--Step 3.
    502|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
    503|--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
    504	fmovex		%fp0,%fp2
    505	fmuls		#0xBC317218,%fp0	| ...N * L1, L1 = lead(-log2/64)
    506	fmulx		L2,%fp2		| ...N * L2, L1+L2 = -log2/64
    507	faddx		%fp1,%fp0		| ...X + N*L1
    508	faddx		%fp2,%fp0		| ...fp0 is R, reduced arg.
    509|	MOVE.W		#$3FA5,EXPA3	...load EXPA3 in cache
    510
    511|--Step 4.
    512|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
    513|-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
    514|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
    515|--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
    516
    517	fmovex		%fp0,%fp1
    518	fmulx		%fp1,%fp1		| ...fp1 IS S = R*R
    519
    520	fmoves		#0x3AB60B70,%fp2	| ...fp2 IS A5
    521|	MOVE.W		#0,2(%a1)	...load 2^(J/64) in cache
    522
    523	fmulx		%fp1,%fp2		| ...fp2 IS S*A5
    524	fmovex		%fp1,%fp3
    525	fmuls		#0x3C088895,%fp3	| ...fp3 IS S*A4
    526
    527	faddd		EXPA3,%fp2	| ...fp2 IS A3+S*A5
    528	faddd		EXPA2,%fp3	| ...fp3 IS A2+S*A4
    529
    530	fmulx		%fp1,%fp2		| ...fp2 IS S*(A3+S*A5)
    531	movew		%d0,SCALE(%a6)	| ...SCALE is 2^(M) in extended
    532	clrw		SCALE+2(%a6)
    533	movel		#0x80000000,SCALE+4(%a6)
    534	clrl		SCALE+8(%a6)
    535
    536	fmulx		%fp1,%fp3		| ...fp3 IS S*(A2+S*A4)
    537
    538	fadds		#0x3F000000,%fp2	| ...fp2 IS A1+S*(A3+S*A5)
    539	fmulx		%fp0,%fp3		| ...fp3 IS R*S*(A2+S*A4)
    540
    541	fmulx		%fp1,%fp2		| ...fp2 IS S*(A1+S*(A3+S*A5))
    542	faddx		%fp3,%fp0		| ...fp0 IS R+R*S*(A2+S*A4),
    543|					...fp3 released
    544
    545	fmovex		(%a1)+,%fp1	| ...fp1 is lead. pt. of 2^(J/64)
    546	faddx		%fp2,%fp0		| ...fp0 is EXP(R) - 1
    547|					...fp2 released
    548
    549|--Step 5
    550|--final reconstruction process
    551|--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
    552
    553	fmulx		%fp1,%fp0		| ...2^(J/64)*(Exp(R)-1)
    554	fmovemx	(%a7)+,%fp2-%fp2/%fp3	| ...fp2 restored
    555	fadds		(%a1),%fp0	| ...accurate 2^(J/64)
    556
    557	faddx		%fp1,%fp0		| ...2^(J/64) + 2^(J/64)*...
    558	movel		ADJFLAG(%a6),%d0
    559
    560|--Step 6
    561	tstl		%d0
    562	beqs		NORMAL
    563ADJUST:
    564	fmulx		ADJSCALE(%a6),%fp0
    565NORMAL:
    566	fmovel		%d1,%FPCR		| ...restore user FPCR
    567	fmulx		SCALE(%a6),%fp0	| ...multiply 2^(M)
    568	bra		t_frcinx
    569
    570EXPSM:
    571|--Step 7
    572	fmovemx	(%a0),%fp0-%fp0	| ...in case X is denormalized
    573	fmovel		%d1,%FPCR
    574	fadds		#0x3F800000,%fp0	| ...1+X in user mode
    575	bra		t_frcinx
    576
    577EXPBIG:
    578|--Step 8
    579	cmpil		#0x400CB27C,%d0	| ...16480 log2
    580	bgts		EXP2BIG
    581|--Steps 8.2 -- 8.6
    582	fmovex		(%a0),%fp0	| ...load input from (a0)
    583
    584	fmovex		%fp0,%fp1
    585	fmuls		#0x42B8AA3B,%fp0	| ...64/log2 * X
    586	fmovemx	 %fp2-%fp2/%fp3,-(%a7)		| ...save fp2
    587	movel		#1,ADJFLAG(%a6)
    588	fmovel		%fp0,%d0		| ...N = int( X * 64/log2 )
    589	lea		EXPTBL,%a1
    590	fmovel		%d0,%fp0		| ...convert to floating-format
    591	movel		%d0,L_SCR1(%a6)			| ...save N temporarily
    592	andil		#0x3F,%d0		 | ...D0 is J = N mod 64
    593	lsll		#4,%d0
    594	addal		%d0,%a1			| ...address of 2^(J/64)
    595	movel		L_SCR1(%a6),%d0
    596	asrl		#6,%d0			| ...D0 is K
    597	movel		%d0,L_SCR1(%a6)			| ...save K temporarily
    598	asrl		#1,%d0			| ...D0 is M1
    599	subl		%d0,L_SCR1(%a6)			| ...a1 is M
    600	addiw		#0x3FFF,%d0		| ...biased expo. of 2^(M1)
    601	movew		%d0,ADJSCALE(%a6)		| ...ADJSCALE := 2^(M1)
    602	clrw		ADJSCALE+2(%a6)
    603	movel		#0x80000000,ADJSCALE+4(%a6)
    604	clrl		ADJSCALE+8(%a6)
    605	movel		L_SCR1(%a6),%d0			| ...D0 is M
    606	addiw		#0x3FFF,%d0		| ...biased expo. of 2^(M)
    607	bra		EXPCONT1		| ...go back to Step 3
    608
    609EXP2BIG:
    610|--Step 9
    611	fmovel		%d1,%FPCR
    612	movel		(%a0),%d0
    613	bclrb		#sign_bit,(%a0)		| ...setox always returns positive
    614	cmpil		#0,%d0
    615	blt		t_unfl
    616	bra		t_ovfl
    617
    618	.global	setoxm1d
    619setoxm1d:
    620|--entry point for EXPM1(X), here X is denormalized
    621|--Step 0.
    622	bra		t_extdnrm
    623
    624
    625	.global	setoxm1
    626setoxm1:
    627|--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
    628
    629|--Step 1.
    630|--Step 1.1
    631	movel		(%a0),%d0	 | ...load part of input X
    632	andil		#0x7FFF0000,%d0	| ...biased expo. of X
    633	cmpil		#0x3FFD0000,%d0	| ...1/4
    634	bges		EM1CON1	 | ...|X| >= 1/4
    635	bra		EM1SM
    636
    637EM1CON1:
    638|--Step 1.3
    639|--The case |X| >= 1/4
    640	movew		4(%a0),%d0	| ...expo. and partial sig. of |X|
    641	cmpil		#0x4004C215,%d0	| ...70log2 rounded up to 16 bits
    642	bles		EM1MAIN	 | ...1/4 <= |X| <= 70log2
    643	bra		EM1BIG
    644
    645EM1MAIN:
    646|--Step 2.
    647|--This is the case:	1/4 <= |X| <= 70 log2.
    648	fmovex		(%a0),%fp0	| ...load input from (a0)
    649
    650	fmovex		%fp0,%fp1
    651	fmuls		#0x42B8AA3B,%fp0	| ...64/log2 * X
    652	fmovemx	%fp2-%fp2/%fp3,-(%a7)		| ...save fp2
    653|	MOVE.W		#$3F81,EM1A4		...prefetch in CB mode
    654	fmovel		%fp0,%d0		| ...N = int( X * 64/log2 )
    655	lea		EXPTBL,%a1
    656	fmovel		%d0,%fp0		| ...convert to floating-format
    657
    658	movel		%d0,L_SCR1(%a6)			| ...save N temporarily
    659	andil		#0x3F,%d0		 | ...D0 is J = N mod 64
    660	lsll		#4,%d0
    661	addal		%d0,%a1			| ...address of 2^(J/64)
    662	movel		L_SCR1(%a6),%d0
    663	asrl		#6,%d0			| ...D0 is M
    664	movel		%d0,L_SCR1(%a6)			| ...save a copy of M
    665|	MOVE.W		#$3FDC,L2		...prefetch L2 in CB mode
    666
    667|--Step 3.
    668|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
    669|--a0 points to 2^(J/64), D0 and a1 both contain M
    670	fmovex		%fp0,%fp2
    671	fmuls		#0xBC317218,%fp0	| ...N * L1, L1 = lead(-log2/64)
    672	fmulx		L2,%fp2		| ...N * L2, L1+L2 = -log2/64
    673	faddx		%fp1,%fp0	 | ...X + N*L1
    674	faddx		%fp2,%fp0	 | ...fp0 is R, reduced arg.
    675|	MOVE.W		#$3FC5,EM1A2		...load EM1A2 in cache
    676	addiw		#0x3FFF,%d0		| ...D0 is biased expo. of 2^M
    677
    678|--Step 4.
    679|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
    680|-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
    681|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
    682|--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
    683
    684	fmovex		%fp0,%fp1
    685	fmulx		%fp1,%fp1		| ...fp1 IS S = R*R
    686
    687	fmoves		#0x3950097B,%fp2	| ...fp2 IS a6
    688|	MOVE.W		#0,2(%a1)	...load 2^(J/64) in cache
    689
    690	fmulx		%fp1,%fp2		| ...fp2 IS S*A6
    691	fmovex		%fp1,%fp3
    692	fmuls		#0x3AB60B6A,%fp3	| ...fp3 IS S*A5
    693
    694	faddd		EM1A4,%fp2	| ...fp2 IS A4+S*A6
    695	faddd		EM1A3,%fp3	| ...fp3 IS A3+S*A5
    696	movew		%d0,SC(%a6)		| ...SC is 2^(M) in extended
    697	clrw		SC+2(%a6)
    698	movel		#0x80000000,SC+4(%a6)
    699	clrl		SC+8(%a6)
    700
    701	fmulx		%fp1,%fp2		| ...fp2 IS S*(A4+S*A6)
    702	movel		L_SCR1(%a6),%d0		| ...D0 is	M
    703	negw		%d0		| ...D0 is -M
    704	fmulx		%fp1,%fp3		| ...fp3 IS S*(A3+S*A5)
    705	addiw		#0x3FFF,%d0	| ...biased expo. of 2^(-M)
    706	faddd		EM1A2,%fp2	| ...fp2 IS A2+S*(A4+S*A6)
    707	fadds		#0x3F000000,%fp3	| ...fp3 IS A1+S*(A3+S*A5)
    708
    709	fmulx		%fp1,%fp2		| ...fp2 IS S*(A2+S*(A4+S*A6))
    710	oriw		#0x8000,%d0	| ...signed/expo. of -2^(-M)
    711	movew		%d0,ONEBYSC(%a6)	| ...OnebySc is -2^(-M)
    712	clrw		ONEBYSC+2(%a6)
    713	movel		#0x80000000,ONEBYSC+4(%a6)
    714	clrl		ONEBYSC+8(%a6)
    715	fmulx		%fp3,%fp1		| ...fp1 IS S*(A1+S*(A3+S*A5))
    716|					...fp3 released
    717
    718	fmulx		%fp0,%fp2		| ...fp2 IS R*S*(A2+S*(A4+S*A6))
    719	faddx		%fp1,%fp0		| ...fp0 IS R+S*(A1+S*(A3+S*A5))
    720|					...fp1 released
    721
    722	faddx		%fp2,%fp0		| ...fp0 IS EXP(R)-1
    723|					...fp2 released
    724	fmovemx	(%a7)+,%fp2-%fp2/%fp3	| ...fp2 restored
    725
    726|--Step 5
    727|--Compute 2^(J/64)*p
    728
    729	fmulx		(%a1),%fp0	| ...2^(J/64)*(Exp(R)-1)
    730
    731|--Step 6
    732|--Step 6.1
    733	movel		L_SCR1(%a6),%d0		| ...retrieve M
    734	cmpil		#63,%d0
    735	bles		MLE63
    736|--Step 6.2	M >= 64
    737	fmoves		12(%a1),%fp1	| ...fp1 is t
    738	faddx		ONEBYSC(%a6),%fp1	| ...fp1 is t+OnebySc
    739	faddx		%fp1,%fp0		| ...p+(t+OnebySc), fp1 released
    740	faddx		(%a1),%fp0	| ...T+(p+(t+OnebySc))
    741	bras		EM1SCALE
    742MLE63:
    743|--Step 6.3	M <= 63
    744	cmpil		#-3,%d0
    745	bges		MGEN3
    746MLTN3:
    747|--Step 6.4	M <= -4
    748	fadds		12(%a1),%fp0	| ...p+t
    749	faddx		(%a1),%fp0	| ...T+(p+t)
    750	faddx		ONEBYSC(%a6),%fp0	| ...OnebySc + (T+(p+t))
    751	bras		EM1SCALE
    752MGEN3:
    753|--Step 6.5	-3 <= M <= 63
    754	fmovex		(%a1)+,%fp1	| ...fp1 is T
    755	fadds		(%a1),%fp0	| ...fp0 is p+t
    756	faddx		ONEBYSC(%a6),%fp1	| ...fp1 is T+OnebySc
    757	faddx		%fp1,%fp0		| ...(T+OnebySc)+(p+t)
    758
    759EM1SCALE:
    760|--Step 6.6
    761	fmovel		%d1,%FPCR
    762	fmulx		SC(%a6),%fp0
    763
    764	bra		t_frcinx
    765
    766EM1SM:
    767|--Step 7	|X| < 1/4.
    768	cmpil		#0x3FBE0000,%d0	| ...2^(-65)
    769	bges		EM1POLY
    770
    771EM1TINY:
    772|--Step 8	|X| < 2^(-65)
    773	cmpil		#0x00330000,%d0	| ...2^(-16312)
    774	blts		EM12TINY
    775|--Step 8.2
    776	movel		#0x80010000,SC(%a6)	| ...SC is -2^(-16382)
    777	movel		#0x80000000,SC+4(%a6)
    778	clrl		SC+8(%a6)
    779	fmovex		(%a0),%fp0
    780	fmovel		%d1,%FPCR
    781	faddx		SC(%a6),%fp0
    782
    783	bra		t_frcinx
    784
    785EM12TINY:
    786|--Step 8.3
    787	fmovex		(%a0),%fp0
    788	fmuld		TWO140,%fp0
    789	movel		#0x80010000,SC(%a6)
    790	movel		#0x80000000,SC+4(%a6)
    791	clrl		SC+8(%a6)
    792	faddx		SC(%a6),%fp0
    793	fmovel		%d1,%FPCR
    794	fmuld		TWON140,%fp0
    795
    796	bra		t_frcinx
    797
    798EM1POLY:
    799|--Step 9	exp(X)-1 by a simple polynomial
    800	fmovex		(%a0),%fp0	| ...fp0 is X
    801	fmulx		%fp0,%fp0		| ...fp0 is S := X*X
    802	fmovemx	%fp2-%fp2/%fp3,-(%a7)	| ...save fp2
    803	fmoves		#0x2F30CAA8,%fp1	| ...fp1 is B12
    804	fmulx		%fp0,%fp1		| ...fp1 is S*B12
    805	fmoves		#0x310F8290,%fp2	| ...fp2 is B11
    806	fadds		#0x32D73220,%fp1	| ...fp1 is B10+S*B12
    807
    808	fmulx		%fp0,%fp2		| ...fp2 is S*B11
    809	fmulx		%fp0,%fp1		| ...fp1 is S*(B10 + ...
    810
    811	fadds		#0x3493F281,%fp2	| ...fp2 is B9+S*...
    812	faddd		EM1B8,%fp1	| ...fp1 is B8+S*...
    813
    814	fmulx		%fp0,%fp2		| ...fp2 is S*(B9+...
    815	fmulx		%fp0,%fp1		| ...fp1 is S*(B8+...
    816
    817	faddd		EM1B7,%fp2	| ...fp2 is B7+S*...
    818	faddd		EM1B6,%fp1	| ...fp1 is B6+S*...
    819
    820	fmulx		%fp0,%fp2		| ...fp2 is S*(B7+...
    821	fmulx		%fp0,%fp1		| ...fp1 is S*(B6+...
    822
    823	faddd		EM1B5,%fp2	| ...fp2 is B5+S*...
    824	faddd		EM1B4,%fp1	| ...fp1 is B4+S*...
    825
    826	fmulx		%fp0,%fp2		| ...fp2 is S*(B5+...
    827	fmulx		%fp0,%fp1		| ...fp1 is S*(B4+...
    828
    829	faddd		EM1B3,%fp2	| ...fp2 is B3+S*...
    830	faddx		EM1B2,%fp1	| ...fp1 is B2+S*...
    831
    832	fmulx		%fp0,%fp2		| ...fp2 is S*(B3+...
    833	fmulx		%fp0,%fp1		| ...fp1 is S*(B2+...
    834
    835	fmulx		%fp0,%fp2		| ...fp2 is S*S*(B3+...)
    836	fmulx		(%a0),%fp1	| ...fp1 is X*S*(B2...
    837
    838	fmuls		#0x3F000000,%fp0	| ...fp0 is S*B1
    839	faddx		%fp2,%fp1		| ...fp1 is Q
    840|					...fp2 released
    841
    842	fmovemx	(%a7)+,%fp2-%fp2/%fp3	| ...fp2 restored
    843
    844	faddx		%fp1,%fp0		| ...fp0 is S*B1+Q
    845|					...fp1 released
    846
    847	fmovel		%d1,%FPCR
    848	faddx		(%a0),%fp0
    849
    850	bra		t_frcinx
    851
    852EM1BIG:
    853|--Step 10	|X| > 70 log2
    854	movel		(%a0),%d0
    855	cmpil		#0,%d0
    856	bgt		EXPC1
    857|--Step 10.2
    858	fmoves		#0xBF800000,%fp0	| ...fp0 is -1
    859	fmovel		%d1,%FPCR
    860	fadds		#0x00800000,%fp0	| ...-1 + 2^(-126)
    861
    862	bra		t_frcinx
    863
    864	|end