cscg22-gearboy

CSCG 2022 Challenge 'Gearboy'
git clone https://git.sinitax.com/sinitax/cscg22-gearboy
Log | Files | Refs | sfeed.txt

e_sqrt.c (15284B)


      1/* @(#)e_sqrt.c 5.1 93/09/24 */
      2/*
      3 * ====================================================
      4 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
      5 *
      6 * Developed at SunPro, a Sun Microsystems, Inc. business.
      7 * Permission to use, copy, modify, and distribute this
      8 * software is freely granted, provided that this notice
      9 * is preserved.
     10 * ====================================================
     11 */
     12
     13#if defined(LIBM_SCCS) && !defined(lint)
     14static const char rcsid[] =
     15    "$NetBSD: e_sqrt.c,v 1.8 1995/05/10 20:46:17 jtc Exp $";
     16#endif
     17
     18/* __ieee754_sqrt(x)
     19 * Return correctly rounded sqrt.
     20 *           ------------------------------------------
     21 *	     |  Use the hardware sqrt if you have one |
     22 *           ------------------------------------------
     23 * Method:
     24 *   Bit by bit method using integer arithmetic. (Slow, but portable)
     25 *   1. Normalization
     26 *	Scale x to y in [1,4) with even powers of 2:
     27 *	find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
     28 *		sqrt(x) = 2^k * sqrt(y)
     29 *   2. Bit by bit computation
     30 *	Let q  = sqrt(y) truncated to i bit after binary point (q = 1),
     31 *	     i							 0
     32 *                                     i+1         2
     33 *	    s  = 2*q , and	y  =  2   * ( y - q  ).		(1)
     34 *	     i      i            i                 i
     35 *
     36 *	To compute q    from q , one checks whether
     37 *		    i+1       i
     38 *
     39 *			      -(i+1) 2
     40 *			(q + 2      ) <= y.			(2)
     41 *     			  i
     42 *							      -(i+1)
     43 *	If (2) is false, then q   = q ; otherwise q   = q  + 2      .
     44 *		 	       i+1   i             i+1   i
     45 *
     46 *	With some algebric manipulation, it is not difficult to see
     47 *	that (2) is equivalent to
     48 *                             -(i+1)
     49 *			s  +  2       <= y			(3)
     50 *			 i                i
     51 *
     52 *	The advantage of (3) is that s  and y  can be computed by
     53 *				      i      i
     54 *	the following recurrence formula:
     55 *	    if (3) is false
     56 *
     57 *	    s     =  s  ,	y    = y   ;			(4)
     58 *	     i+1      i		 i+1    i
     59 *
     60 *	    otherwise,
     61 *                         -i                     -(i+1)
     62 *	    s	  =  s  + 2  ,  y    = y  -  s  - 2  		(5)
     63 *           i+1      i          i+1    i     i
     64 *
     65 *	One may easily use induction to prove (4) and (5).
     66 *	Note. Since the left hand side of (3) contain only i+2 bits,
     67 *	      it does not necessary to do a full (53-bit) comparison
     68 *	      in (3).
     69 *   3. Final rounding
     70 *	After generating the 53 bits result, we compute one more bit.
     71 *	Together with the remainder, we can decide whether the
     72 *	result is exact, bigger than 1/2ulp, or less than 1/2ulp
     73 *	(it will never equal to 1/2ulp).
     74 *	The rounding mode can be detected by checking whether
     75 *	huge + tiny is equal to huge, and whether huge - tiny is
     76 *	equal to huge for some floating point number "huge" and "tiny".
     77 *
     78 * Special cases:
     79 *	sqrt(+-0) = +-0 	... exact
     80 *	sqrt(inf) = inf
     81 *	sqrt(-ve) = NaN		... with invalid signal
     82 *	sqrt(NaN) = NaN		... with invalid signal for signaling NaN
     83 *
     84 * Other methods : see the appended file at the end of the program below.
     85 *---------------
     86 */
     87
     88#include "math_libm.h"
     89#include "math_private.h"
     90
     91#ifdef __STDC__
     92static const double one = 1.0, tiny = 1.0e-300;
     93#else
     94static double one = 1.0, tiny = 1.0e-300;
     95#endif
     96
     97#ifdef __STDC__
     98double attribute_hidden
     99__ieee754_sqrt(double x)
    100#else
    101double attribute_hidden
    102__ieee754_sqrt(x)
    103     double x;
    104#endif
    105{
    106    double z;
    107    int32_t sign = (int) 0x80000000;
    108    int32_t ix0, s0, q, m, t, i;
    109    u_int32_t r, t1, s1, ix1, q1;
    110
    111    EXTRACT_WORDS(ix0, ix1, x);
    112
    113    /* take care of Inf and NaN */
    114    if ((ix0 & 0x7ff00000) == 0x7ff00000) {
    115        return x * x + x;       /* sqrt(NaN)=NaN, sqrt(+inf)=+inf
    116                                   sqrt(-inf)=sNaN */
    117    }
    118    /* take care of zero */
    119    if (ix0 <= 0) {
    120        if (((ix0 & (~sign)) | ix1) == 0)
    121            return x;           /* sqrt(+-0) = +-0 */
    122        else if (ix0 < 0)
    123            return (x - x) / (x - x);   /* sqrt(-ve) = sNaN */
    124    }
    125    /* normalize x */
    126    m = (ix0 >> 20);
    127    if (m == 0) {               /* subnormal x */
    128        while (ix0 == 0) {
    129            m -= 21;
    130            ix0 |= (ix1 >> 11);
    131            ix1 <<= 21;
    132        }
    133        for (i = 0; (ix0 & 0x00100000) == 0; i++)
    134            ix0 <<= 1;
    135        m -= i - 1;
    136        ix0 |= (ix1 >> (32 - i));
    137        ix1 <<= i;
    138    }
    139    m -= 1023;                  /* unbias exponent */
    140    ix0 = (ix0 & 0x000fffff) | 0x00100000;
    141    if (m & 1) {                /* odd m, double x to make it even */
    142        ix0 += ix0 + ((ix1 & sign) >> 31);
    143        ix1 += ix1;
    144    }
    145    m >>= 1;                    /* m = [m/2] */
    146
    147    /* generate sqrt(x) bit by bit */
    148    ix0 += ix0 + ((ix1 & sign) >> 31);
    149    ix1 += ix1;
    150    q = q1 = s0 = s1 = 0;       /* [q,q1] = sqrt(x) */
    151    r = 0x00200000;             /* r = moving bit from right to left */
    152
    153    while (r != 0) {
    154        t = s0 + r;
    155        if (t <= ix0) {
    156            s0 = t + r;
    157            ix0 -= t;
    158            q += r;
    159        }
    160        ix0 += ix0 + ((ix1 & sign) >> 31);
    161        ix1 += ix1;
    162        r >>= 1;
    163    }
    164
    165    r = sign;
    166    while (r != 0) {
    167        t1 = s1 + r;
    168        t = s0;
    169        if ((t < ix0) || ((t == ix0) && (t1 <= ix1))) {
    170            s1 = t1 + r;
    171            if (((t1 & sign) == sign) && (s1 & sign) == 0)
    172                s0 += 1;
    173            ix0 -= t;
    174            if (ix1 < t1)
    175                ix0 -= 1;
    176            ix1 -= t1;
    177            q1 += r;
    178        }
    179        ix0 += ix0 + ((ix1 & sign) >> 31);
    180        ix1 += ix1;
    181        r >>= 1;
    182    }
    183
    184    /* use floating add to find out rounding direction */
    185    if ((ix0 | ix1) != 0) {
    186        z = one - tiny;         /* trigger inexact flag */
    187        if (z >= one) {
    188            z = one + tiny;
    189            if (q1 == (u_int32_t) 0xffffffff) {
    190                q1 = 0;
    191                q += 1;
    192            } else if (z > one) {
    193                if (q1 == (u_int32_t) 0xfffffffe)
    194                    q += 1;
    195                q1 += 2;
    196            } else
    197                q1 += (q1 & 1);
    198        }
    199    }
    200    ix0 = (q >> 1) + 0x3fe00000;
    201    ix1 = q1 >> 1;
    202    if ((q & 1) == 1)
    203        ix1 |= sign;
    204    ix0 += (m << 20);
    205    INSERT_WORDS(z, ix0, ix1);
    206    return z;
    207}
    208
    209/*
    210Other methods  (use floating-point arithmetic)
    211-------------
    212(This is a copy of a drafted paper by Prof W. Kahan
    213and K.C. Ng, written in May, 1986)
    214
    215	Two algorithms are given here to implement sqrt(x)
    216	(IEEE double precision arithmetic) in software.
    217	Both supply sqrt(x) correctly rounded. The first algorithm (in
    218	Section A) uses newton iterations and involves four divisions.
    219	The second one uses reciproot iterations to avoid division, but
    220	requires more multiplications. Both algorithms need the ability
    221	to chop results of arithmetic operations instead of round them,
    222	and the INEXACT flag to indicate when an arithmetic operation
    223	is executed exactly with no roundoff error, all part of the
    224	standard (IEEE 754-1985). The ability to perform shift, add,
    225	subtract and logical AND operations upon 32-bit words is needed
    226	too, though not part of the standard.
    227
    228A.  sqrt(x) by Newton Iteration
    229
    230   (1)	Initial approximation
    231
    232	Let x0 and x1 be the leading and the trailing 32-bit words of
    233	a floating point number x (in IEEE double format) respectively
    234
    235	    1    11		     52				  ...widths
    236	   ------------------------------------------------------
    237	x: |s|	  e     |	      f				|
    238	   ------------------------------------------------------
    239	      msb    lsb  msb				      lsb ...order
    240
    241
    242	     ------------------------  	     ------------------------
    243	x0:  |s|   e    |    f1     |	 x1: |          f2           |
    244	     ------------------------  	     ------------------------
    245
    246	By performing shifts and subtracts on x0 and x1 (both regarded
    247	as integers), we obtain an 8-bit approximation of sqrt(x) as
    248	follows.
    249
    250		k  := (x0>>1) + 0x1ff80000;
    251		y0 := k - T1[31&(k>>15)].	... y ~ sqrt(x) to 8 bits
    252	Here k is a 32-bit integer and T1[] is an integer array containing
    253	correction terms. Now magically the floating value of y (y's
    254	leading 32-bit word is y0, the value of its trailing word is 0)
    255	approximates sqrt(x) to almost 8-bit.
    256
    257	Value of T1:
    258	static int T1[32]= {
    259	0,	1024,	3062,	5746,	9193,	13348,	18162,	23592,
    260	29598,	36145,	43202,	50740,	58733,	67158,	75992,	85215,
    261	83599,	71378,	60428,	50647,	41945,	34246,	27478,	21581,
    262	16499,	12183,	8588,	5674,	3403,	1742,	661,	130,};
    263
    264    (2)	Iterative refinement
    265
    266	Apply Heron's rule three times to y, we have y approximates
    267	sqrt(x) to within 1 ulp (Unit in the Last Place):
    268
    269		y := (y+x/y)/2		... almost 17 sig. bits
    270		y := (y+x/y)/2		... almost 35 sig. bits
    271		y := y-(y-x/y)/2	... within 1 ulp
    272
    273
    274	Remark 1.
    275	    Another way to improve y to within 1 ulp is:
    276
    277		y := (y+x/y)		... almost 17 sig. bits to 2*sqrt(x)
    278		y := y - 0x00100006	... almost 18 sig. bits to sqrt(x)
    279
    280				2
    281			    (x-y )*y
    282		y := y + 2* ----------	...within 1 ulp
    283			       2
    284			     3y  + x
    285
    286
    287	This formula has one division fewer than the one above; however,
    288	it requires more multiplications and additions. Also x must be
    289	scaled in advance to avoid spurious overflow in evaluating the
    290	expression 3y*y+x. Hence it is not recommended uless division
    291	is slow. If division is very slow, then one should use the
    292	reciproot algorithm given in section B.
    293
    294    (3) Final adjustment
    295
    296	By twiddling y's last bit it is possible to force y to be
    297	correctly rounded according to the prevailing rounding mode
    298	as follows. Let r and i be copies of the rounding mode and
    299	inexact flag before entering the square root program. Also we
    300	use the expression y+-ulp for the next representable floating
    301	numbers (up and down) of y. Note that y+-ulp = either fixed
    302	point y+-1, or multiply y by nextafter(1,+-inf) in chopped
    303	mode.
    304
    305		I := FALSE;	... reset INEXACT flag I
    306		R := RZ;	... set rounding mode to round-toward-zero
    307		z := x/y;	... chopped quotient, possibly inexact
    308		If(not I) then {	... if the quotient is exact
    309		    if(z=y) {
    310		        I := i;	 ... restore inexact flag
    311		        R := r;  ... restore rounded mode
    312		        return sqrt(x):=y.
    313		    } else {
    314			z := z - ulp;	... special rounding
    315		    }
    316		}
    317		i := TRUE;		... sqrt(x) is inexact
    318		If (r=RN) then z=z+ulp	... rounded-to-nearest
    319		If (r=RP) then {	... round-toward-+inf
    320		    y = y+ulp; z=z+ulp;
    321		}
    322		y := y+z;		... chopped sum
    323		y0:=y0-0x00100000;	... y := y/2 is correctly rounded.
    324	        I := i;	 		... restore inexact flag
    325	        R := r;  		... restore rounded mode
    326	        return sqrt(x):=y.
    327
    328    (4)	Special cases
    329
    330	Square root of +inf, +-0, or NaN is itself;
    331	Square root of a negative number is NaN with invalid signal.
    332
    333
    334B.  sqrt(x) by Reciproot Iteration
    335
    336   (1)	Initial approximation
    337
    338	Let x0 and x1 be the leading and the trailing 32-bit words of
    339	a floating point number x (in IEEE double format) respectively
    340	(see section A). By performing shifs and subtracts on x0 and y0,
    341	we obtain a 7.8-bit approximation of 1/sqrt(x) as follows.
    342
    343	    k := 0x5fe80000 - (x0>>1);
    344	    y0:= k - T2[63&(k>>14)].	... y ~ 1/sqrt(x) to 7.8 bits
    345
    346	Here k is a 32-bit integer and T2[] is an integer array
    347	containing correction terms. Now magically the floating
    348	value of y (y's leading 32-bit word is y0, the value of
    349	its trailing word y1 is set to zero) approximates 1/sqrt(x)
    350	to almost 7.8-bit.
    351
    352	Value of T2:
    353	static int T2[64]= {
    354	0x1500,	0x2ef8,	0x4d67,	0x6b02,	0x87be,	0xa395,	0xbe7a,	0xd866,
    355	0xf14a,	0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,
    356	0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,
    357	0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,
    358	0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,
    359	0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,
    360	0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,
    361	0x1527f,0x1334a,0x11051,0xe951,	0xbe01,	0x8e0d,	0x5924,	0x1edd,};
    362
    363    (2)	Iterative refinement
    364
    365	Apply Reciproot iteration three times to y and multiply the
    366	result by x to get an approximation z that matches sqrt(x)
    367	to about 1 ulp. To be exact, we will have
    368		-1ulp < sqrt(x)-z<1.0625ulp.
    369
    370	... set rounding mode to Round-to-nearest
    371	   y := y*(1.5-0.5*x*y*y)	... almost 15 sig. bits to 1/sqrt(x)
    372	   y := y*((1.5-2^-30)+0.5*x*y*y)... about 29 sig. bits to 1/sqrt(x)
    373	... special arrangement for better accuracy
    374	   z := x*y			... 29 bits to sqrt(x), with z*y<1
    375	   z := z + 0.5*z*(1-z*y)	... about 1 ulp to sqrt(x)
    376
    377	Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that
    378	(a) the term z*y in the final iteration is always less than 1;
    379	(b) the error in the final result is biased upward so that
    380		-1 ulp < sqrt(x) - z < 1.0625 ulp
    381	    instead of |sqrt(x)-z|<1.03125ulp.
    382
    383    (3)	Final adjustment
    384
    385	By twiddling y's last bit it is possible to force y to be
    386	correctly rounded according to the prevailing rounding mode
    387	as follows. Let r and i be copies of the rounding mode and
    388	inexact flag before entering the square root program. Also we
    389	use the expression y+-ulp for the next representable floating
    390	numbers (up and down) of y. Note that y+-ulp = either fixed
    391	point y+-1, or multiply y by nextafter(1,+-inf) in chopped
    392	mode.
    393
    394	R := RZ;		... set rounding mode to round-toward-zero
    395	switch(r) {
    396	    case RN:		... round-to-nearest
    397	       if(x<= z*(z-ulp)...chopped) z = z - ulp; else
    398	       if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp;
    399	       break;
    400	    case RZ:case RM:	... round-to-zero or round-to--inf
    401	       R:=RP;		... reset rounding mod to round-to-+inf
    402	       if(x<z*z ... rounded up) z = z - ulp; else
    403	       if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp;
    404	       break;
    405	    case RP:		... round-to-+inf
    406	       if(x>(z+ulp)*(z+ulp)...chopped) z = z+2*ulp; else
    407	       if(x>z*z ...chopped) z = z+ulp;
    408	       break;
    409	}
    410
    411	Remark 3. The above comparisons can be done in fixed point. For
    412	example, to compare x and w=z*z chopped, it suffices to compare
    413	x1 and w1 (the trailing parts of x and w), regarding them as
    414	two's complement integers.
    415
    416	...Is z an exact square root?
    417	To determine whether z is an exact square root of x, let z1 be the
    418	trailing part of z, and also let x0 and x1 be the leading and
    419	trailing parts of x.
    420
    421	If ((z1&0x03ffffff)!=0)	... not exact if trailing 26 bits of z!=0
    422	    I := 1;		... Raise Inexact flag: z is not exact
    423	else {
    424	    j := 1 - [(x0>>20)&1]	... j = logb(x) mod 2
    425	    k := z1 >> 26;		... get z's 25-th and 26-th
    426					    fraction bits
    427	    I := i or (k&j) or ((k&(j+j+1))!=(x1&3));
    428	}
    429	R:= r		... restore rounded mode
    430	return sqrt(x):=z.
    431
    432	If multiplication is cheaper then the foregoing red tape, the
    433	Inexact flag can be evaluated by
    434
    435	    I := i;
    436	    I := (z*z!=x) or I.
    437
    438	Note that z*z can overwrite I; this value must be sensed if it is
    439	True.
    440
    441	Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be
    442	zero.
    443
    444		    --------------------
    445		z1: |        f2        |
    446		    --------------------
    447		bit 31		   bit 0
    448
    449	Further more, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd
    450	or even of logb(x) have the following relations:
    451
    452	-------------------------------------------------
    453	bit 27,26 of z1		bit 1,0 of x1	logb(x)
    454	-------------------------------------------------
    455	00			00		odd and even
    456	01			01		even
    457	10			10		odd
    458	10			00		even
    459	11			01		even
    460	-------------------------------------------------
    461
    462    (4)	Special cases (see (4) of Section A).
    463
    464 */