cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

softfloat.c (118936B)


      1/*
      2===============================================================================
      3
      4This C source file is part of the SoftFloat IEC/IEEE Floating-point
      5Arithmetic Package, Release 2.
      6
      7Written by John R. Hauser.  This work was made possible in part by the
      8International Computer Science Institute, located at Suite 600, 1947 Center
      9Street, Berkeley, California 94704.  Funding was partially provided by the
     10National Science Foundation under grant MIP-9311980.  The original version
     11of this code was written as part of a project to build a fixed-point vector
     12processor in collaboration with the University of California at Berkeley,
     13overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     14is available through the web page
     15http://www.jhauser.us/arithmetic/SoftFloat-2b/SoftFloat-source.txt
     16
     17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
     18has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
     19TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
     20PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
     21AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
     22
     23Derivative works are acceptable, even for commercial purposes, so long as
     24(1) they include prominent notice that the work is derivative, and (2) they
     25include prominent notice akin to these three paragraphs for those parts of
     26this code that are retained.
     27
     28===============================================================================
     29*/
     30
     31#include <asm/div64.h>
     32
     33#include "fpa11.h"
     34//#include "milieu.h"
     35//#include "softfloat.h"
     36
     37/*
     38-------------------------------------------------------------------------------
     39Primitive arithmetic functions, including multi-word arithmetic, and
     40division and square root approximations.  (Can be specialized to target if
     41desired.)
     42-------------------------------------------------------------------------------
     43*/
     44#include "softfloat-macros"
     45
     46/*
     47-------------------------------------------------------------------------------
     48Functions and definitions to determine:  (1) whether tininess for underflow
     49is detected before or after rounding by default, (2) what (if anything)
     50happens when exceptions are raised, (3) how signaling NaNs are distinguished
     51from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
     52are propagated from function inputs to output.  These details are target-
     53specific.
     54-------------------------------------------------------------------------------
     55*/
     56#include "softfloat-specialize"
     57
     58/*
     59-------------------------------------------------------------------------------
     60Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
     61and 7, and returns the properly rounded 32-bit integer corresponding to the
     62input.  If `zSign' is nonzero, the input is negated before being converted
     63to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
     64input is simply rounded to an integer, with the inexact exception raised if
     65the input cannot be represented exactly as an integer.  If the fixed-point
     66input is too large, however, the invalid exception is raised and the largest
     67positive or negative integer is returned.
     68-------------------------------------------------------------------------------
     69*/
     70static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
     71{
     72    int8 roundingMode;
     73    flag roundNearestEven;
     74    int8 roundIncrement, roundBits;
     75    int32 z;
     76
     77    roundingMode = roundData->mode;
     78    roundNearestEven = ( roundingMode == float_round_nearest_even );
     79    roundIncrement = 0x40;
     80    if ( ! roundNearestEven ) {
     81        if ( roundingMode == float_round_to_zero ) {
     82            roundIncrement = 0;
     83        }
     84        else {
     85            roundIncrement = 0x7F;
     86            if ( zSign ) {
     87                if ( roundingMode == float_round_up ) roundIncrement = 0;
     88            }
     89            else {
     90                if ( roundingMode == float_round_down ) roundIncrement = 0;
     91            }
     92        }
     93    }
     94    roundBits = absZ & 0x7F;
     95    absZ = ( absZ + roundIncrement )>>7;
     96    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
     97    z = absZ;
     98    if ( zSign ) z = - z;
     99    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
    100        roundData->exception |= float_flag_invalid;
    101        return zSign ? 0x80000000 : 0x7FFFFFFF;
    102    }
    103    if ( roundBits ) roundData->exception |= float_flag_inexact;
    104    return z;
    105
    106}
    107
    108/*
    109-------------------------------------------------------------------------------
    110Returns the fraction bits of the single-precision floating-point value `a'.
    111-------------------------------------------------------------------------------
    112*/
    113INLINE bits32 extractFloat32Frac( float32 a )
    114{
    115
    116    return a & 0x007FFFFF;
    117
    118}
    119
    120/*
    121-------------------------------------------------------------------------------
    122Returns the exponent bits of the single-precision floating-point value `a'.
    123-------------------------------------------------------------------------------
    124*/
    125INLINE int16 extractFloat32Exp( float32 a )
    126{
    127
    128    return ( a>>23 ) & 0xFF;
    129
    130}
    131
    132/*
    133-------------------------------------------------------------------------------
    134Returns the sign bit of the single-precision floating-point value `a'.
    135-------------------------------------------------------------------------------
    136*/
    137#if 0	/* in softfloat.h */
    138INLINE flag extractFloat32Sign( float32 a )
    139{
    140
    141    return a>>31;
    142
    143}
    144#endif
    145
    146/*
    147-------------------------------------------------------------------------------
    148Normalizes the subnormal single-precision floating-point value represented
    149by the denormalized significand `aSig'.  The normalized exponent and
    150significand are stored at the locations pointed to by `zExpPtr' and
    151`zSigPtr', respectively.
    152-------------------------------------------------------------------------------
    153*/
    154static void
    155 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
    156{
    157    int8 shiftCount;
    158
    159    shiftCount = countLeadingZeros32( aSig ) - 8;
    160    *zSigPtr = aSig<<shiftCount;
    161    *zExpPtr = 1 - shiftCount;
    162
    163}
    164
    165/*
    166-------------------------------------------------------------------------------
    167Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    168single-precision floating-point value, returning the result.  After being
    169shifted into the proper positions, the three fields are simply added
    170together to form the result.  This means that any integer portion of `zSig'
    171will be added into the exponent.  Since a properly normalized significand
    172will have an integer portion equal to 1, the `zExp' input should be 1 less
    173than the desired result exponent whenever `zSig' is a complete, normalized
    174significand.
    175-------------------------------------------------------------------------------
    176*/
    177INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
    178{
    179#if 0
    180   float32 f;
    181   __asm__("@ packFloat32				\n\
    182   	    mov %0, %1, asl #31				\n\
    183   	    orr %0, %2, asl #23				\n\
    184   	    orr %0, %3"
    185   	    : /* no outputs */
    186   	    : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
    187   	    : "cc");
    188   return f;
    189#else
    190    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
    191#endif 
    192}
    193
    194/*
    195-------------------------------------------------------------------------------
    196Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    197and significand `zSig', and returns the proper single-precision floating-
    198point value corresponding to the abstract input.  Ordinarily, the abstract
    199value is simply rounded and packed into the single-precision format, with
    200the inexact exception raised if the abstract input cannot be represented
    201exactly.  If the abstract value is too large, however, the overflow and
    202inexact exceptions are raised and an infinity or maximal finite value is
    203returned.  If the abstract value is too small, the input value is rounded to
    204a subnormal number, and the underflow and inexact exceptions are raised if
    205the abstract input cannot be represented exactly as a subnormal single-
    206precision floating-point number.
    207    The input significand `zSig' has its binary point between bits 30
    208and 29, which is 7 bits to the left of the usual location.  This shifted
    209significand must be normalized or smaller.  If `zSig' is not normalized,
    210`zExp' must be 0; in that case, the result returned is a subnormal number,
    211and it must not require rounding.  In the usual case that `zSig' is
    212normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    213The handling of underflow and overflow follows the IEC/IEEE Standard for
    214Binary Floating-point Arithmetic.
    215-------------------------------------------------------------------------------
    216*/
    217static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
    218{
    219    int8 roundingMode;
    220    flag roundNearestEven;
    221    int8 roundIncrement, roundBits;
    222    flag isTiny;
    223
    224    roundingMode = roundData->mode;
    225    roundNearestEven = ( roundingMode == float_round_nearest_even );
    226    roundIncrement = 0x40;
    227    if ( ! roundNearestEven ) {
    228        if ( roundingMode == float_round_to_zero ) {
    229            roundIncrement = 0;
    230        }
    231        else {
    232            roundIncrement = 0x7F;
    233            if ( zSign ) {
    234                if ( roundingMode == float_round_up ) roundIncrement = 0;
    235            }
    236            else {
    237                if ( roundingMode == float_round_down ) roundIncrement = 0;
    238            }
    239        }
    240    }
    241    roundBits = zSig & 0x7F;
    242    if ( 0xFD <= (bits16) zExp ) {
    243        if (    ( 0xFD < zExp )
    244             || (    ( zExp == 0xFD )
    245                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
    246           ) {
    247            roundData->exception |= float_flag_overflow | float_flag_inexact;
    248            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
    249        }
    250        if ( zExp < 0 ) {
    251            isTiny =
    252                   ( float_detect_tininess == float_tininess_before_rounding )
    253                || ( zExp < -1 )
    254                || ( zSig + roundIncrement < 0x80000000 );
    255            shift32RightJamming( zSig, - zExp, &zSig );
    256            zExp = 0;
    257            roundBits = zSig & 0x7F;
    258            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
    259        }
    260    }
    261    if ( roundBits ) roundData->exception |= float_flag_inexact;
    262    zSig = ( zSig + roundIncrement )>>7;
    263    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    264    if ( zSig == 0 ) zExp = 0;
    265    return packFloat32( zSign, zExp, zSig );
    266
    267}
    268
    269/*
    270-------------------------------------------------------------------------------
    271Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    272and significand `zSig', and returns the proper single-precision floating-
    273point value corresponding to the abstract input.  This routine is just like
    274`roundAndPackFloat32' except that `zSig' does not have to be normalized in
    275any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
    276point exponent.
    277-------------------------------------------------------------------------------
    278*/
    279static float32
    280 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
    281{
    282    int8 shiftCount;
    283
    284    shiftCount = countLeadingZeros32( zSig ) - 1;
    285    return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
    286
    287}
    288
    289/*
    290-------------------------------------------------------------------------------
    291Returns the fraction bits of the double-precision floating-point value `a'.
    292-------------------------------------------------------------------------------
    293*/
    294INLINE bits64 extractFloat64Frac( float64 a )
    295{
    296
    297    return a & LIT64( 0x000FFFFFFFFFFFFF );
    298
    299}
    300
    301/*
    302-------------------------------------------------------------------------------
    303Returns the exponent bits of the double-precision floating-point value `a'.
    304-------------------------------------------------------------------------------
    305*/
    306INLINE int16 extractFloat64Exp( float64 a )
    307{
    308
    309    return ( a>>52 ) & 0x7FF;
    310
    311}
    312
    313/*
    314-------------------------------------------------------------------------------
    315Returns the sign bit of the double-precision floating-point value `a'.
    316-------------------------------------------------------------------------------
    317*/
    318#if 0	/* in softfloat.h */
    319INLINE flag extractFloat64Sign( float64 a )
    320{
    321
    322    return a>>63;
    323
    324}
    325#endif
    326
    327/*
    328-------------------------------------------------------------------------------
    329Normalizes the subnormal double-precision floating-point value represented
    330by the denormalized significand `aSig'.  The normalized exponent and
    331significand are stored at the locations pointed to by `zExpPtr' and
    332`zSigPtr', respectively.
    333-------------------------------------------------------------------------------
    334*/
    335static void
    336 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
    337{
    338    int8 shiftCount;
    339
    340    shiftCount = countLeadingZeros64( aSig ) - 11;
    341    *zSigPtr = aSig<<shiftCount;
    342    *zExpPtr = 1 - shiftCount;
    343
    344}
    345
    346/*
    347-------------------------------------------------------------------------------
    348Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
    349double-precision floating-point value, returning the result.  After being
    350shifted into the proper positions, the three fields are simply added
    351together to form the result.  This means that any integer portion of `zSig'
    352will be added into the exponent.  Since a properly normalized significand
    353will have an integer portion equal to 1, the `zExp' input should be 1 less
    354than the desired result exponent whenever `zSig' is a complete, normalized
    355significand.
    356-------------------------------------------------------------------------------
    357*/
    358INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
    359{
    360
    361    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
    362
    363}
    364
    365/*
    366-------------------------------------------------------------------------------
    367Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    368and significand `zSig', and returns the proper double-precision floating-
    369point value corresponding to the abstract input.  Ordinarily, the abstract
    370value is simply rounded and packed into the double-precision format, with
    371the inexact exception raised if the abstract input cannot be represented
    372exactly.  If the abstract value is too large, however, the overflow and
    373inexact exceptions are raised and an infinity or maximal finite value is
    374returned.  If the abstract value is too small, the input value is rounded to
    375a subnormal number, and the underflow and inexact exceptions are raised if
    376the abstract input cannot be represented exactly as a subnormal double-
    377precision floating-point number.
    378    The input significand `zSig' has its binary point between bits 62
    379and 61, which is 10 bits to the left of the usual location.  This shifted
    380significand must be normalized or smaller.  If `zSig' is not normalized,
    381`zExp' must be 0; in that case, the result returned is a subnormal number,
    382and it must not require rounding.  In the usual case that `zSig' is
    383normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
    384The handling of underflow and overflow follows the IEC/IEEE Standard for
    385Binary Floating-point Arithmetic.
    386-------------------------------------------------------------------------------
    387*/
    388static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
    389{
    390    int8 roundingMode;
    391    flag roundNearestEven;
    392    int16 roundIncrement, roundBits;
    393    flag isTiny;
    394
    395    roundingMode = roundData->mode;
    396    roundNearestEven = ( roundingMode == float_round_nearest_even );
    397    roundIncrement = 0x200;
    398    if ( ! roundNearestEven ) {
    399        if ( roundingMode == float_round_to_zero ) {
    400            roundIncrement = 0;
    401        }
    402        else {
    403            roundIncrement = 0x3FF;
    404            if ( zSign ) {
    405                if ( roundingMode == float_round_up ) roundIncrement = 0;
    406            }
    407            else {
    408                if ( roundingMode == float_round_down ) roundIncrement = 0;
    409            }
    410        }
    411    }
    412    roundBits = zSig & 0x3FF;
    413    if ( 0x7FD <= (bits16) zExp ) {
    414        if (    ( 0x7FD < zExp )
    415             || (    ( zExp == 0x7FD )
    416                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
    417           ) {
    418            //register int lr = __builtin_return_address(0);
    419            //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
    420            roundData->exception |= float_flag_overflow | float_flag_inexact;
    421            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
    422        }
    423        if ( zExp < 0 ) {
    424            isTiny =
    425                   ( float_detect_tininess == float_tininess_before_rounding )
    426                || ( zExp < -1 )
    427                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
    428            shift64RightJamming( zSig, - zExp, &zSig );
    429            zExp = 0;
    430            roundBits = zSig & 0x3FF;
    431            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
    432        }
    433    }
    434    if ( roundBits ) roundData->exception |= float_flag_inexact;
    435    zSig = ( zSig + roundIncrement )>>10;
    436    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    437    if ( zSig == 0 ) zExp = 0;
    438    return packFloat64( zSign, zExp, zSig );
    439
    440}
    441
    442/*
    443-------------------------------------------------------------------------------
    444Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    445and significand `zSig', and returns the proper double-precision floating-
    446point value corresponding to the abstract input.  This routine is just like
    447`roundAndPackFloat64' except that `zSig' does not have to be normalized in
    448any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
    449point exponent.
    450-------------------------------------------------------------------------------
    451*/
    452static float64
    453 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
    454{
    455    int8 shiftCount;
    456
    457    shiftCount = countLeadingZeros64( zSig ) - 1;
    458    return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
    459
    460}
    461
    462#ifdef FLOATX80
    463
    464/*
    465-------------------------------------------------------------------------------
    466Returns the fraction bits of the extended double-precision floating-point
    467value `a'.
    468-------------------------------------------------------------------------------
    469*/
    470INLINE bits64 extractFloatx80Frac( floatx80 a )
    471{
    472
    473    return a.low;
    474
    475}
    476
    477/*
    478-------------------------------------------------------------------------------
    479Returns the exponent bits of the extended double-precision floating-point
    480value `a'.
    481-------------------------------------------------------------------------------
    482*/
    483INLINE int32 extractFloatx80Exp( floatx80 a )
    484{
    485
    486    return a.high & 0x7FFF;
    487
    488}
    489
    490/*
    491-------------------------------------------------------------------------------
    492Returns the sign bit of the extended double-precision floating-point value
    493`a'.
    494-------------------------------------------------------------------------------
    495*/
    496INLINE flag extractFloatx80Sign( floatx80 a )
    497{
    498
    499    return a.high>>15;
    500
    501}
    502
    503/*
    504-------------------------------------------------------------------------------
    505Normalizes the subnormal extended double-precision floating-point value
    506represented by the denormalized significand `aSig'.  The normalized exponent
    507and significand are stored at the locations pointed to by `zExpPtr' and
    508`zSigPtr', respectively.
    509-------------------------------------------------------------------------------
    510*/
    511static void
    512 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
    513{
    514    int8 shiftCount;
    515
    516    shiftCount = countLeadingZeros64( aSig );
    517    *zSigPtr = aSig<<shiftCount;
    518    *zExpPtr = 1 - shiftCount;
    519
    520}
    521
    522/*
    523-------------------------------------------------------------------------------
    524Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
    525extended double-precision floating-point value, returning the result.
    526-------------------------------------------------------------------------------
    527*/
    528INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
    529{
    530    floatx80 z;
    531
    532    z.low = zSig;
    533    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    534    z.__padding = 0;
    535    return z;
    536
    537}
    538
    539/*
    540-------------------------------------------------------------------------------
    541Takes an abstract floating-point value having sign `zSign', exponent `zExp',
    542and extended significand formed by the concatenation of `zSig0' and `zSig1',
    543and returns the proper extended double-precision floating-point value
    544corresponding to the abstract input.  Ordinarily, the abstract value is
    545rounded and packed into the extended double-precision format, with the
    546inexact exception raised if the abstract input cannot be represented
    547exactly.  If the abstract value is too large, however, the overflow and
    548inexact exceptions are raised and an infinity or maximal finite value is
    549returned.  If the abstract value is too small, the input value is rounded to
    550a subnormal number, and the underflow and inexact exceptions are raised if
    551the abstract input cannot be represented exactly as a subnormal extended
    552double-precision floating-point number.
    553    If `roundingPrecision' is 32 or 64, the result is rounded to the same
    554number of bits as single or double precision, respectively.  Otherwise, the
    555result is rounded to the full precision of the extended double-precision
    556format.
    557    The input significand must be normalized or smaller.  If the input
    558significand is not normalized, `zExp' must be 0; in that case, the result
    559returned is a subnormal number, and it must not require rounding.  The
    560handling of underflow and overflow follows the IEC/IEEE Standard for Binary
    561Floating-point Arithmetic.
    562-------------------------------------------------------------------------------
    563*/
    564static floatx80
    565 roundAndPackFloatx80(
    566     struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    567 )
    568{
    569    int8 roundingMode, roundingPrecision;
    570    flag roundNearestEven, increment, isTiny;
    571    int64 roundIncrement, roundMask, roundBits;
    572
    573    roundingMode = roundData->mode;
    574    roundingPrecision = roundData->precision;
    575    roundNearestEven = ( roundingMode == float_round_nearest_even );
    576    if ( roundingPrecision == 80 ) goto precision80;
    577    if ( roundingPrecision == 64 ) {
    578        roundIncrement = LIT64( 0x0000000000000400 );
    579        roundMask = LIT64( 0x00000000000007FF );
    580    }
    581    else if ( roundingPrecision == 32 ) {
    582        roundIncrement = LIT64( 0x0000008000000000 );
    583        roundMask = LIT64( 0x000000FFFFFFFFFF );
    584    }
    585    else {
    586        goto precision80;
    587    }
    588    zSig0 |= ( zSig1 != 0 );
    589    if ( ! roundNearestEven ) {
    590        if ( roundingMode == float_round_to_zero ) {
    591            roundIncrement = 0;
    592        }
    593        else {
    594            roundIncrement = roundMask;
    595            if ( zSign ) {
    596                if ( roundingMode == float_round_up ) roundIncrement = 0;
    597            }
    598            else {
    599                if ( roundingMode == float_round_down ) roundIncrement = 0;
    600            }
    601        }
    602    }
    603    roundBits = zSig0 & roundMask;
    604    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    605        if (    ( 0x7FFE < zExp )
    606             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
    607           ) {
    608            goto overflow;
    609        }
    610        if ( zExp <= 0 ) {
    611            isTiny =
    612                   ( float_detect_tininess == float_tininess_before_rounding )
    613                || ( zExp < 0 )
    614                || ( zSig0 <= zSig0 + roundIncrement );
    615            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
    616            zExp = 0;
    617            roundBits = zSig0 & roundMask;
    618            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
    619            if ( roundBits ) roundData->exception |= float_flag_inexact;
    620            zSig0 += roundIncrement;
    621            if ( (sbits64) zSig0 < 0 ) zExp = 1;
    622            roundIncrement = roundMask + 1;
    623            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    624                roundMask |= roundIncrement;
    625            }
    626            zSig0 &= ~ roundMask;
    627            return packFloatx80( zSign, zExp, zSig0 );
    628        }
    629    }
    630    if ( roundBits ) roundData->exception |= float_flag_inexact;
    631    zSig0 += roundIncrement;
    632    if ( zSig0 < roundIncrement ) {
    633        ++zExp;
    634        zSig0 = LIT64( 0x8000000000000000 );
    635    }
    636    roundIncrement = roundMask + 1;
    637    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
    638        roundMask |= roundIncrement;
    639    }
    640    zSig0 &= ~ roundMask;
    641    if ( zSig0 == 0 ) zExp = 0;
    642    return packFloatx80( zSign, zExp, zSig0 );
    643 precision80:
    644    increment = ( (sbits64) zSig1 < 0 );
    645    if ( ! roundNearestEven ) {
    646        if ( roundingMode == float_round_to_zero ) {
    647            increment = 0;
    648        }
    649        else {
    650            if ( zSign ) {
    651                increment = ( roundingMode == float_round_down ) && zSig1;
    652            }
    653            else {
    654                increment = ( roundingMode == float_round_up ) && zSig1;
    655            }
    656        }
    657    }
    658    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
    659        if (    ( 0x7FFE < zExp )
    660             || (    ( zExp == 0x7FFE )
    661                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
    662                  && increment
    663                )
    664           ) {
    665            roundMask = 0;
    666 overflow:
    667            roundData->exception |= float_flag_overflow | float_flag_inexact;
    668            if (    ( roundingMode == float_round_to_zero )
    669                 || ( zSign && ( roundingMode == float_round_up ) )
    670                 || ( ! zSign && ( roundingMode == float_round_down ) )
    671               ) {
    672                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
    673            }
    674            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    675        }
    676        if ( zExp <= 0 ) {
    677            isTiny =
    678                   ( float_detect_tininess == float_tininess_before_rounding )
    679                || ( zExp < 0 )
    680                || ! increment
    681                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
    682            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
    683            zExp = 0;
    684            if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
    685            if ( zSig1 ) roundData->exception |= float_flag_inexact;
    686            if ( roundNearestEven ) {
    687                increment = ( (sbits64) zSig1 < 0 );
    688            }
    689            else {
    690                if ( zSign ) {
    691                    increment = ( roundingMode == float_round_down ) && zSig1;
    692                }
    693                else {
    694                    increment = ( roundingMode == float_round_up ) && zSig1;
    695                }
    696            }
    697            if ( increment ) {
    698                ++zSig0;
    699                zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
    700                if ( (sbits64) zSig0 < 0 ) zExp = 1;
    701            }
    702            return packFloatx80( zSign, zExp, zSig0 );
    703        }
    704    }
    705    if ( zSig1 ) roundData->exception |= float_flag_inexact;
    706    if ( increment ) {
    707        ++zSig0;
    708        if ( zSig0 == 0 ) {
    709            ++zExp;
    710            zSig0 = LIT64( 0x8000000000000000 );
    711        }
    712        else {
    713            zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
    714        }
    715    }
    716    else {
    717        if ( zSig0 == 0 ) zExp = 0;
    718    }
    719    
    720    return packFloatx80( zSign, zExp, zSig0 );
    721}
    722
    723/*
    724-------------------------------------------------------------------------------
    725Takes an abstract floating-point value having sign `zSign', exponent
    726`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
    727and returns the proper extended double-precision floating-point value
    728corresponding to the abstract input.  This routine is just like
    729`roundAndPackFloatx80' except that the input significand does not have to be
    730normalized.
    731-------------------------------------------------------------------------------
    732*/
    733static floatx80
    734 normalizeRoundAndPackFloatx80(
    735     struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
    736 )
    737{
    738    int8 shiftCount;
    739
    740    if ( zSig0 == 0 ) {
    741        zSig0 = zSig1;
    742        zSig1 = 0;
    743        zExp -= 64;
    744    }
    745    shiftCount = countLeadingZeros64( zSig0 );
    746    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    747    zExp -= shiftCount;
    748    return
    749        roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
    750
    751}
    752
    753#endif
    754
    755/*
    756-------------------------------------------------------------------------------
    757Returns the result of converting the 32-bit two's complement integer `a' to
    758the single-precision floating-point format.  The conversion is performed
    759according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
    760-------------------------------------------------------------------------------
    761*/
    762float32 int32_to_float32(struct roundingData *roundData, int32 a)
    763{
    764    flag zSign;
    765
    766    if ( a == 0 ) return 0;
    767    if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
    768    zSign = ( a < 0 );
    769    return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
    770
    771}
    772
    773/*
    774-------------------------------------------------------------------------------
    775Returns the result of converting the 32-bit two's complement integer `a' to
    776the double-precision floating-point format.  The conversion is performed
    777according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
    778-------------------------------------------------------------------------------
    779*/
    780float64 int32_to_float64( int32 a )
    781{
    782    flag aSign;
    783    uint32 absA;
    784    int8 shiftCount;
    785    bits64 zSig;
    786
    787    if ( a == 0 ) return 0;
    788    aSign = ( a < 0 );
    789    absA = aSign ? - a : a;
    790    shiftCount = countLeadingZeros32( absA ) + 21;
    791    zSig = absA;
    792    return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
    793
    794}
    795
    796#ifdef FLOATX80
    797
    798/*
    799-------------------------------------------------------------------------------
    800Returns the result of converting the 32-bit two's complement integer `a'
    801to the extended double-precision floating-point format.  The conversion
    802is performed according to the IEC/IEEE Standard for Binary Floating-point
    803Arithmetic.
    804-------------------------------------------------------------------------------
    805*/
    806floatx80 int32_to_floatx80( int32 a )
    807{
    808    flag zSign;
    809    uint32 absA;
    810    int8 shiftCount;
    811    bits64 zSig;
    812
    813    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    814    zSign = ( a < 0 );
    815    absA = zSign ? - a : a;
    816    shiftCount = countLeadingZeros32( absA ) + 32;
    817    zSig = absA;
    818    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
    819
    820}
    821
    822#endif
    823
    824/*
    825-------------------------------------------------------------------------------
    826Returns the result of converting the single-precision floating-point value
    827`a' to the 32-bit two's complement integer format.  The conversion is
    828performed according to the IEC/IEEE Standard for Binary Floating-point
    829Arithmetic---which means in particular that the conversion is rounded
    830according to the current rounding mode.  If `a' is a NaN, the largest
    831positive integer is returned.  Otherwise, if the conversion overflows, the
    832largest integer with the same sign as `a' is returned.
    833-------------------------------------------------------------------------------
    834*/
    835int32 float32_to_int32( struct roundingData *roundData, float32 a )
    836{
    837    flag aSign;
    838    int16 aExp, shiftCount;
    839    bits32 aSig;
    840    bits64 zSig;
    841
    842    aSig = extractFloat32Frac( a );
    843    aExp = extractFloat32Exp( a );
    844    aSign = extractFloat32Sign( a );
    845    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
    846    if ( aExp ) aSig |= 0x00800000;
    847    shiftCount = 0xAF - aExp;
    848    zSig = aSig;
    849    zSig <<= 32;
    850    if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
    851    return roundAndPackInt32( roundData, aSign, zSig );
    852
    853}
    854
    855/*
    856-------------------------------------------------------------------------------
    857Returns the result of converting the single-precision floating-point value
    858`a' to the 32-bit two's complement integer format.  The conversion is
    859performed according to the IEC/IEEE Standard for Binary Floating-point
    860Arithmetic, except that the conversion is always rounded toward zero.  If
    861`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
    862conversion overflows, the largest integer with the same sign as `a' is
    863returned.
    864-------------------------------------------------------------------------------
    865*/
    866int32 float32_to_int32_round_to_zero( float32 a )
    867{
    868    flag aSign;
    869    int16 aExp, shiftCount;
    870    bits32 aSig;
    871    int32 z;
    872
    873    aSig = extractFloat32Frac( a );
    874    aExp = extractFloat32Exp( a );
    875    aSign = extractFloat32Sign( a );
    876    shiftCount = aExp - 0x9E;
    877    if ( 0 <= shiftCount ) {
    878        if ( a == 0xCF000000 ) return 0x80000000;
    879        float_raise( float_flag_invalid );
    880        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
    881        return 0x80000000;
    882    }
    883    else if ( aExp <= 0x7E ) {
    884        if ( aExp | aSig ) float_raise( float_flag_inexact );
    885        return 0;
    886    }
    887    aSig = ( aSig | 0x00800000 )<<8;
    888    z = aSig>>( - shiftCount );
    889    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
    890        float_raise( float_flag_inexact );
    891    }
    892    return aSign ? - z : z;
    893
    894}
    895
    896/*
    897-------------------------------------------------------------------------------
    898Returns the result of converting the single-precision floating-point value
    899`a' to the double-precision floating-point format.  The conversion is
    900performed according to the IEC/IEEE Standard for Binary Floating-point
    901Arithmetic.
    902-------------------------------------------------------------------------------
    903*/
    904float64 float32_to_float64( float32 a )
    905{
    906    flag aSign;
    907    int16 aExp;
    908    bits32 aSig;
    909
    910    aSig = extractFloat32Frac( a );
    911    aExp = extractFloat32Exp( a );
    912    aSign = extractFloat32Sign( a );
    913    if ( aExp == 0xFF ) {
    914        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
    915        return packFloat64( aSign, 0x7FF, 0 );
    916    }
    917    if ( aExp == 0 ) {
    918        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
    919        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    920        --aExp;
    921    }
    922    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
    923
    924}
    925
    926#ifdef FLOATX80
    927
    928/*
    929-------------------------------------------------------------------------------
    930Returns the result of converting the single-precision floating-point value
    931`a' to the extended double-precision floating-point format.  The conversion
    932is performed according to the IEC/IEEE Standard for Binary Floating-point
    933Arithmetic.
    934-------------------------------------------------------------------------------
    935*/
    936floatx80 float32_to_floatx80( float32 a )
    937{
    938    flag aSign;
    939    int16 aExp;
    940    bits32 aSig;
    941
    942    aSig = extractFloat32Frac( a );
    943    aExp = extractFloat32Exp( a );
    944    aSign = extractFloat32Sign( a );
    945    if ( aExp == 0xFF ) {
    946        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
    947        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    948    }
    949    if ( aExp == 0 ) {
    950        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
    951        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    952    }
    953    aSig |= 0x00800000;
    954    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
    955
    956}
    957
    958#endif
    959
    960/*
    961-------------------------------------------------------------------------------
    962Rounds the single-precision floating-point value `a' to an integer, and
    963returns the result as a single-precision floating-point value.  The
    964operation is performed according to the IEC/IEEE Standard for Binary
    965Floating-point Arithmetic.
    966-------------------------------------------------------------------------------
    967*/
    968float32 float32_round_to_int( struct roundingData *roundData, float32 a )
    969{
    970    flag aSign;
    971    int16 aExp;
    972    bits32 lastBitMask, roundBitsMask;
    973    int8 roundingMode;
    974    float32 z;
    975
    976    aExp = extractFloat32Exp( a );
    977    if ( 0x96 <= aExp ) {
    978        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
    979            return propagateFloat32NaN( a, a );
    980        }
    981        return a;
    982    }
    983    roundingMode = roundData->mode;
    984    if ( aExp <= 0x7E ) {
    985        if ( (bits32) ( a<<1 ) == 0 ) return a;
    986        roundData->exception |= float_flag_inexact;
    987        aSign = extractFloat32Sign( a );
    988        switch ( roundingMode ) {
    989         case float_round_nearest_even:
    990            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
    991                return packFloat32( aSign, 0x7F, 0 );
    992            }
    993            break;
    994         case float_round_down:
    995            return aSign ? 0xBF800000 : 0;
    996         case float_round_up:
    997            return aSign ? 0x80000000 : 0x3F800000;
    998        }
    999        return packFloat32( aSign, 0, 0 );
   1000    }
   1001    lastBitMask = 1;
   1002    lastBitMask <<= 0x96 - aExp;
   1003    roundBitsMask = lastBitMask - 1;
   1004    z = a;
   1005    if ( roundingMode == float_round_nearest_even ) {
   1006        z += lastBitMask>>1;
   1007        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1008    }
   1009    else if ( roundingMode != float_round_to_zero ) {
   1010        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   1011            z += roundBitsMask;
   1012        }
   1013    }
   1014    z &= ~ roundBitsMask;
   1015    if ( z != a ) roundData->exception |= float_flag_inexact;
   1016    return z;
   1017
   1018}
   1019
   1020/*
   1021-------------------------------------------------------------------------------
   1022Returns the result of adding the absolute values of the single-precision
   1023floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
   1024before being returned.  `zSign' is ignored if the result is a NaN.  The
   1025addition is performed according to the IEC/IEEE Standard for Binary
   1026Floating-point Arithmetic.
   1027-------------------------------------------------------------------------------
   1028*/
   1029static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
   1030{
   1031    int16 aExp, bExp, zExp;
   1032    bits32 aSig, bSig, zSig;
   1033    int16 expDiff;
   1034
   1035    aSig = extractFloat32Frac( a );
   1036    aExp = extractFloat32Exp( a );
   1037    bSig = extractFloat32Frac( b );
   1038    bExp = extractFloat32Exp( b );
   1039    expDiff = aExp - bExp;
   1040    aSig <<= 6;
   1041    bSig <<= 6;
   1042    if ( 0 < expDiff ) {
   1043        if ( aExp == 0xFF ) {
   1044            if ( aSig ) return propagateFloat32NaN( a, b );
   1045            return a;
   1046        }
   1047        if ( bExp == 0 ) {
   1048            --expDiff;
   1049        }
   1050        else {
   1051            bSig |= 0x20000000;
   1052        }
   1053        shift32RightJamming( bSig, expDiff, &bSig );
   1054        zExp = aExp;
   1055    }
   1056    else if ( expDiff < 0 ) {
   1057        if ( bExp == 0xFF ) {
   1058            if ( bSig ) return propagateFloat32NaN( a, b );
   1059            return packFloat32( zSign, 0xFF, 0 );
   1060        }
   1061        if ( aExp == 0 ) {
   1062            ++expDiff;
   1063        }
   1064        else {
   1065            aSig |= 0x20000000;
   1066        }
   1067        shift32RightJamming( aSig, - expDiff, &aSig );
   1068        zExp = bExp;
   1069    }
   1070    else {
   1071        if ( aExp == 0xFF ) {
   1072            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1073            return a;
   1074        }
   1075        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
   1076        zSig = 0x40000000 + aSig + bSig;
   1077        zExp = aExp;
   1078        goto roundAndPack;
   1079    }
   1080    aSig |= 0x20000000;
   1081    zSig = ( aSig + bSig )<<1;
   1082    --zExp;
   1083    if ( (sbits32) zSig < 0 ) {
   1084        zSig = aSig + bSig;
   1085        ++zExp;
   1086    }
   1087 roundAndPack:
   1088    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
   1089
   1090}
   1091
   1092/*
   1093-------------------------------------------------------------------------------
   1094Returns the result of subtracting the absolute values of the single-
   1095precision floating-point values `a' and `b'.  If `zSign' is true, the
   1096difference is negated before being returned.  `zSign' is ignored if the
   1097result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1098Standard for Binary Floating-point Arithmetic.
   1099-------------------------------------------------------------------------------
   1100*/
   1101static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
   1102{
   1103    int16 aExp, bExp, zExp;
   1104    bits32 aSig, bSig, zSig;
   1105    int16 expDiff;
   1106
   1107    aSig = extractFloat32Frac( a );
   1108    aExp = extractFloat32Exp( a );
   1109    bSig = extractFloat32Frac( b );
   1110    bExp = extractFloat32Exp( b );
   1111    expDiff = aExp - bExp;
   1112    aSig <<= 7;
   1113    bSig <<= 7;
   1114    if ( 0 < expDiff ) goto aExpBigger;
   1115    if ( expDiff < 0 ) goto bExpBigger;
   1116    if ( aExp == 0xFF ) {
   1117        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
   1118        roundData->exception |= float_flag_invalid;
   1119        return float32_default_nan;
   1120    }
   1121    if ( aExp == 0 ) {
   1122        aExp = 1;
   1123        bExp = 1;
   1124    }
   1125    if ( bSig < aSig ) goto aBigger;
   1126    if ( aSig < bSig ) goto bBigger;
   1127    return packFloat32( roundData->mode == float_round_down, 0, 0 );
   1128 bExpBigger:
   1129    if ( bExp == 0xFF ) {
   1130        if ( bSig ) return propagateFloat32NaN( a, b );
   1131        return packFloat32( zSign ^ 1, 0xFF, 0 );
   1132    }
   1133    if ( aExp == 0 ) {
   1134        ++expDiff;
   1135    }
   1136    else {
   1137        aSig |= 0x40000000;
   1138    }
   1139    shift32RightJamming( aSig, - expDiff, &aSig );
   1140    bSig |= 0x40000000;
   1141 bBigger:
   1142    zSig = bSig - aSig;
   1143    zExp = bExp;
   1144    zSign ^= 1;
   1145    goto normalizeRoundAndPack;
   1146 aExpBigger:
   1147    if ( aExp == 0xFF ) {
   1148        if ( aSig ) return propagateFloat32NaN( a, b );
   1149        return a;
   1150    }
   1151    if ( bExp == 0 ) {
   1152        --expDiff;
   1153    }
   1154    else {
   1155        bSig |= 0x40000000;
   1156    }
   1157    shift32RightJamming( bSig, expDiff, &bSig );
   1158    aSig |= 0x40000000;
   1159 aBigger:
   1160    zSig = aSig - bSig;
   1161    zExp = aExp;
   1162 normalizeRoundAndPack:
   1163    --zExp;
   1164    return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
   1165
   1166}
   1167
   1168/*
   1169-------------------------------------------------------------------------------
   1170Returns the result of adding the single-precision floating-point values `a'
   1171and `b'.  The operation is performed according to the IEC/IEEE Standard for
   1172Binary Floating-point Arithmetic.
   1173-------------------------------------------------------------------------------
   1174*/
   1175float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
   1176{
   1177    flag aSign, bSign;
   1178
   1179    aSign = extractFloat32Sign( a );
   1180    bSign = extractFloat32Sign( b );
   1181    if ( aSign == bSign ) {
   1182        return addFloat32Sigs( roundData, a, b, aSign );
   1183    }
   1184    else {
   1185        return subFloat32Sigs( roundData, a, b, aSign );
   1186    }
   1187
   1188}
   1189
   1190/*
   1191-------------------------------------------------------------------------------
   1192Returns the result of subtracting the single-precision floating-point values
   1193`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1194for Binary Floating-point Arithmetic.
   1195-------------------------------------------------------------------------------
   1196*/
   1197float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
   1198{
   1199    flag aSign, bSign;
   1200
   1201    aSign = extractFloat32Sign( a );
   1202    bSign = extractFloat32Sign( b );
   1203    if ( aSign == bSign ) {
   1204        return subFloat32Sigs( roundData, a, b, aSign );
   1205    }
   1206    else {
   1207        return addFloat32Sigs( roundData, a, b, aSign );
   1208    }
   1209
   1210}
   1211
   1212/*
   1213-------------------------------------------------------------------------------
   1214Returns the result of multiplying the single-precision floating-point values
   1215`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   1216for Binary Floating-point Arithmetic.
   1217-------------------------------------------------------------------------------
   1218*/
   1219float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
   1220{
   1221    flag aSign, bSign, zSign;
   1222    int16 aExp, bExp, zExp;
   1223    bits32 aSig, bSig;
   1224    bits64 zSig64;
   1225    bits32 zSig;
   1226
   1227    aSig = extractFloat32Frac( a );
   1228    aExp = extractFloat32Exp( a );
   1229    aSign = extractFloat32Sign( a );
   1230    bSig = extractFloat32Frac( b );
   1231    bExp = extractFloat32Exp( b );
   1232    bSign = extractFloat32Sign( b );
   1233    zSign = aSign ^ bSign;
   1234    if ( aExp == 0xFF ) {
   1235        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1236            return propagateFloat32NaN( a, b );
   1237        }
   1238        if ( ( bExp | bSig ) == 0 ) {
   1239            roundData->exception |= float_flag_invalid;
   1240            return float32_default_nan;
   1241        }
   1242        return packFloat32( zSign, 0xFF, 0 );
   1243    }
   1244    if ( bExp == 0xFF ) {
   1245        if ( bSig ) return propagateFloat32NaN( a, b );
   1246        if ( ( aExp | aSig ) == 0 ) {
   1247            roundData->exception |= float_flag_invalid;
   1248            return float32_default_nan;
   1249        }
   1250        return packFloat32( zSign, 0xFF, 0 );
   1251    }
   1252    if ( aExp == 0 ) {
   1253        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1254        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1255    }
   1256    if ( bExp == 0 ) {
   1257        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
   1258        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1259    }
   1260    zExp = aExp + bExp - 0x7F;
   1261    aSig = ( aSig | 0x00800000 )<<7;
   1262    bSig = ( bSig | 0x00800000 )<<8;
   1263    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
   1264    zSig = zSig64;
   1265    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
   1266        zSig <<= 1;
   1267        --zExp;
   1268    }
   1269    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
   1270
   1271}
   1272
   1273/*
   1274-------------------------------------------------------------------------------
   1275Returns the result of dividing the single-precision floating-point value `a'
   1276by the corresponding value `b'.  The operation is performed according to the
   1277IEC/IEEE Standard for Binary Floating-point Arithmetic.
   1278-------------------------------------------------------------------------------
   1279*/
   1280float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
   1281{
   1282    flag aSign, bSign, zSign;
   1283    int16 aExp, bExp, zExp;
   1284    bits32 aSig, bSig, zSig;
   1285
   1286    aSig = extractFloat32Frac( a );
   1287    aExp = extractFloat32Exp( a );
   1288    aSign = extractFloat32Sign( a );
   1289    bSig = extractFloat32Frac( b );
   1290    bExp = extractFloat32Exp( b );
   1291    bSign = extractFloat32Sign( b );
   1292    zSign = aSign ^ bSign;
   1293    if ( aExp == 0xFF ) {
   1294        if ( aSig ) return propagateFloat32NaN( a, b );
   1295        if ( bExp == 0xFF ) {
   1296            if ( bSig ) return propagateFloat32NaN( a, b );
   1297            roundData->exception |= float_flag_invalid;
   1298            return float32_default_nan;
   1299        }
   1300        return packFloat32( zSign, 0xFF, 0 );
   1301    }
   1302    if ( bExp == 0xFF ) {
   1303        if ( bSig ) return propagateFloat32NaN( a, b );
   1304        return packFloat32( zSign, 0, 0 );
   1305    }
   1306    if ( bExp == 0 ) {
   1307        if ( bSig == 0 ) {
   1308            if ( ( aExp | aSig ) == 0 ) {
   1309                roundData->exception |= float_flag_invalid;
   1310                return float32_default_nan;
   1311            }
   1312            roundData->exception |= float_flag_divbyzero;
   1313            return packFloat32( zSign, 0xFF, 0 );
   1314        }
   1315        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1316    }
   1317    if ( aExp == 0 ) {
   1318        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
   1319        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1320    }
   1321    zExp = aExp - bExp + 0x7D;
   1322    aSig = ( aSig | 0x00800000 )<<7;
   1323    bSig = ( bSig | 0x00800000 )<<8;
   1324    if ( bSig <= ( aSig + aSig ) ) {
   1325        aSig >>= 1;
   1326        ++zExp;
   1327    }
   1328    {
   1329        bits64 tmp = ( (bits64) aSig )<<32;
   1330        do_div( tmp, bSig );
   1331        zSig = tmp;
   1332    }
   1333    if ( ( zSig & 0x3F ) == 0 ) {
   1334        zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
   1335    }
   1336    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
   1337
   1338}
   1339
   1340/*
   1341-------------------------------------------------------------------------------
   1342Returns the remainder of the single-precision floating-point value `a'
   1343with respect to the corresponding value `b'.  The operation is performed
   1344according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   1345-------------------------------------------------------------------------------
   1346*/
   1347float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
   1348{
   1349    flag aSign, bSign, zSign;
   1350    int16 aExp, bExp, expDiff;
   1351    bits32 aSig, bSig;
   1352    bits32 q;
   1353    bits64 aSig64, bSig64, q64;
   1354    bits32 alternateASig;
   1355    sbits32 sigMean;
   1356
   1357    aSig = extractFloat32Frac( a );
   1358    aExp = extractFloat32Exp( a );
   1359    aSign = extractFloat32Sign( a );
   1360    bSig = extractFloat32Frac( b );
   1361    bExp = extractFloat32Exp( b );
   1362    bSign = extractFloat32Sign( b );
   1363    if ( aExp == 0xFF ) {
   1364        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
   1365            return propagateFloat32NaN( a, b );
   1366        }
   1367        roundData->exception |= float_flag_invalid;
   1368        return float32_default_nan;
   1369    }
   1370    if ( bExp == 0xFF ) {
   1371        if ( bSig ) return propagateFloat32NaN( a, b );
   1372        return a;
   1373    }
   1374    if ( bExp == 0 ) {
   1375        if ( bSig == 0 ) {
   1376            roundData->exception |= float_flag_invalid;
   1377            return float32_default_nan;
   1378        }
   1379        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
   1380    }
   1381    if ( aExp == 0 ) {
   1382        if ( aSig == 0 ) return a;
   1383        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1384    }
   1385    expDiff = aExp - bExp;
   1386    aSig |= 0x00800000;
   1387    bSig |= 0x00800000;
   1388    if ( expDiff < 32 ) {
   1389        aSig <<= 8;
   1390        bSig <<= 8;
   1391        if ( expDiff < 0 ) {
   1392            if ( expDiff < -1 ) return a;
   1393            aSig >>= 1;
   1394        }
   1395        q = ( bSig <= aSig );
   1396        if ( q ) aSig -= bSig;
   1397        if ( 0 < expDiff ) {
   1398            bits64 tmp = ( (bits64) aSig )<<32;
   1399            do_div( tmp, bSig );
   1400            q = tmp;
   1401            q >>= 32 - expDiff;
   1402            bSig >>= 2;
   1403            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   1404        }
   1405        else {
   1406            aSig >>= 2;
   1407            bSig >>= 2;
   1408        }
   1409    }
   1410    else {
   1411        if ( bSig <= aSig ) aSig -= bSig;
   1412        aSig64 = ( (bits64) aSig )<<40;
   1413        bSig64 = ( (bits64) bSig )<<40;
   1414        expDiff -= 64;
   1415        while ( 0 < expDiff ) {
   1416            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   1417            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   1418            aSig64 = - ( ( bSig * q64 )<<38 );
   1419            expDiff -= 62;
   1420        }
   1421        expDiff += 64;
   1422        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
   1423        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
   1424        q = q64>>( 64 - expDiff );
   1425        bSig <<= 6;
   1426        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
   1427    }
   1428    do {
   1429        alternateASig = aSig;
   1430        ++q;
   1431        aSig -= bSig;
   1432    } while ( 0 <= (sbits32) aSig );
   1433    sigMean = aSig + alternateASig;
   1434    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   1435        aSig = alternateASig;
   1436    }
   1437    zSign = ( (sbits32) aSig < 0 );
   1438    if ( zSign ) aSig = - aSig;
   1439    return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
   1440
   1441}
   1442
   1443/*
   1444-------------------------------------------------------------------------------
   1445Returns the square root of the single-precision floating-point value `a'.
   1446The operation is performed according to the IEC/IEEE Standard for Binary
   1447Floating-point Arithmetic.
   1448-------------------------------------------------------------------------------
   1449*/
   1450float32 float32_sqrt( struct roundingData *roundData, float32 a )
   1451{
   1452    flag aSign;
   1453    int16 aExp, zExp;
   1454    bits32 aSig, zSig;
   1455    bits64 rem, term;
   1456
   1457    aSig = extractFloat32Frac( a );
   1458    aExp = extractFloat32Exp( a );
   1459    aSign = extractFloat32Sign( a );
   1460    if ( aExp == 0xFF ) {
   1461        if ( aSig ) return propagateFloat32NaN( a, 0 );
   1462        if ( ! aSign ) return a;
   1463        roundData->exception |= float_flag_invalid;
   1464        return float32_default_nan;
   1465    }
   1466    if ( aSign ) {
   1467        if ( ( aExp | aSig ) == 0 ) return a;
   1468        roundData->exception |= float_flag_invalid;
   1469        return float32_default_nan;
   1470    }
   1471    if ( aExp == 0 ) {
   1472        if ( aSig == 0 ) return 0;
   1473        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
   1474    }
   1475    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
   1476    aSig = ( aSig | 0x00800000 )<<8;
   1477    zSig = estimateSqrt32( aExp, aSig ) + 2;
   1478    if ( ( zSig & 0x7F ) <= 5 ) {
   1479        if ( zSig < 2 ) {
   1480            zSig = 0xFFFFFFFF;
   1481        }
   1482        else {
   1483            aSig >>= aExp & 1;
   1484            term = ( (bits64) zSig ) * zSig;
   1485            rem = ( ( (bits64) aSig )<<32 ) - term;
   1486            while ( (sbits64) rem < 0 ) {
   1487                --zSig;
   1488                rem += ( ( (bits64) zSig )<<1 ) | 1;
   1489            }
   1490            zSig |= ( rem != 0 );
   1491        }
   1492    }
   1493    shift32RightJamming( zSig, 1, &zSig );
   1494    return roundAndPackFloat32( roundData, 0, zExp, zSig );
   1495
   1496}
   1497
   1498/*
   1499-------------------------------------------------------------------------------
   1500Returns 1 if the single-precision floating-point value `a' is equal to the
   1501corresponding value `b', and 0 otherwise.  The comparison is performed
   1502according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   1503-------------------------------------------------------------------------------
   1504*/
   1505flag float32_eq( float32 a, float32 b )
   1506{
   1507
   1508    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   1509         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   1510       ) {
   1511        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
   1512            float_raise( float_flag_invalid );
   1513        }
   1514        return 0;
   1515    }
   1516    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   1517
   1518}
   1519
   1520/*
   1521-------------------------------------------------------------------------------
   1522Returns 1 if the single-precision floating-point value `a' is less than or
   1523equal to the corresponding value `b', and 0 otherwise.  The comparison is
   1524performed according to the IEC/IEEE Standard for Binary Floating-point
   1525Arithmetic.
   1526-------------------------------------------------------------------------------
   1527*/
   1528flag float32_le( float32 a, float32 b )
   1529{
   1530    flag aSign, bSign;
   1531
   1532    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   1533         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   1534       ) {
   1535        float_raise( float_flag_invalid );
   1536        return 0;
   1537    }
   1538    aSign = extractFloat32Sign( a );
   1539    bSign = extractFloat32Sign( b );
   1540    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   1541    return ( a == b ) || ( aSign ^ ( a < b ) );
   1542
   1543}
   1544
   1545/*
   1546-------------------------------------------------------------------------------
   1547Returns 1 if the single-precision floating-point value `a' is less than
   1548the corresponding value `b', and 0 otherwise.  The comparison is performed
   1549according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   1550-------------------------------------------------------------------------------
   1551*/
   1552flag float32_lt( float32 a, float32 b )
   1553{
   1554    flag aSign, bSign;
   1555
   1556    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   1557         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   1558       ) {
   1559        float_raise( float_flag_invalid );
   1560        return 0;
   1561    }
   1562    aSign = extractFloat32Sign( a );
   1563    bSign = extractFloat32Sign( b );
   1564    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   1565    return ( a != b ) && ( aSign ^ ( a < b ) );
   1566
   1567}
   1568
   1569/*
   1570-------------------------------------------------------------------------------
   1571Returns 1 if the single-precision floating-point value `a' is equal to the
   1572corresponding value `b', and 0 otherwise.  The invalid exception is raised
   1573if either operand is a NaN.  Otherwise, the comparison is performed
   1574according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   1575-------------------------------------------------------------------------------
   1576*/
   1577flag float32_eq_signaling( float32 a, float32 b )
   1578{
   1579
   1580    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   1581         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   1582       ) {
   1583        float_raise( float_flag_invalid );
   1584        return 0;
   1585    }
   1586    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
   1587
   1588}
   1589
   1590/*
   1591-------------------------------------------------------------------------------
   1592Returns 1 if the single-precision floating-point value `a' is less than or
   1593equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   1594cause an exception.  Otherwise, the comparison is performed according to the
   1595IEC/IEEE Standard for Binary Floating-point Arithmetic.
   1596-------------------------------------------------------------------------------
   1597*/
   1598flag float32_le_quiet( float32 a, float32 b )
   1599{
   1600    flag aSign, bSign;
   1601    //int16 aExp, bExp;
   1602
   1603    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   1604         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   1605       ) {
   1606        /* Do nothing, even if NaN as we're quiet */
   1607        return 0;
   1608    }
   1609    aSign = extractFloat32Sign( a );
   1610    bSign = extractFloat32Sign( b );
   1611    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
   1612    return ( a == b ) || ( aSign ^ ( a < b ) );
   1613
   1614}
   1615
   1616/*
   1617-------------------------------------------------------------------------------
   1618Returns 1 if the single-precision floating-point value `a' is less than
   1619the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   1620exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   1621Standard for Binary Floating-point Arithmetic.
   1622-------------------------------------------------------------------------------
   1623*/
   1624flag float32_lt_quiet( float32 a, float32 b )
   1625{
   1626    flag aSign, bSign;
   1627
   1628    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
   1629         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
   1630       ) {
   1631        /* Do nothing, even if NaN as we're quiet */
   1632        return 0;
   1633    }
   1634    aSign = extractFloat32Sign( a );
   1635    bSign = extractFloat32Sign( b );
   1636    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
   1637    return ( a != b ) && ( aSign ^ ( a < b ) );
   1638
   1639}
   1640
   1641/*
   1642-------------------------------------------------------------------------------
   1643Returns the result of converting the double-precision floating-point value
   1644`a' to the 32-bit two's complement integer format.  The conversion is
   1645performed according to the IEC/IEEE Standard for Binary Floating-point
   1646Arithmetic---which means in particular that the conversion is rounded
   1647according to the current rounding mode.  If `a' is a NaN, the largest
   1648positive integer is returned.  Otherwise, if the conversion overflows, the
   1649largest integer with the same sign as `a' is returned.
   1650-------------------------------------------------------------------------------
   1651*/
   1652int32 float64_to_int32( struct roundingData *roundData, float64 a )
   1653{
   1654    flag aSign;
   1655    int16 aExp, shiftCount;
   1656    bits64 aSig;
   1657
   1658    aSig = extractFloat64Frac( a );
   1659    aExp = extractFloat64Exp( a );
   1660    aSign = extractFloat64Sign( a );
   1661    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   1662    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   1663    shiftCount = 0x42C - aExp;
   1664    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   1665    return roundAndPackInt32( roundData, aSign, aSig );
   1666
   1667}
   1668
   1669/*
   1670-------------------------------------------------------------------------------
   1671Returns the result of converting the double-precision floating-point value
   1672`a' to the 32-bit two's complement integer format.  The conversion is
   1673performed according to the IEC/IEEE Standard for Binary Floating-point
   1674Arithmetic, except that the conversion is always rounded toward zero.  If
   1675`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1676conversion overflows, the largest integer with the same sign as `a' is
   1677returned.
   1678-------------------------------------------------------------------------------
   1679*/
   1680int32 float64_to_int32_round_to_zero( float64 a )
   1681{
   1682    flag aSign;
   1683    int16 aExp, shiftCount;
   1684    bits64 aSig, savedASig;
   1685    int32 z;
   1686
   1687    aSig = extractFloat64Frac( a );
   1688    aExp = extractFloat64Exp( a );
   1689    aSign = extractFloat64Sign( a );
   1690    shiftCount = 0x433 - aExp;
   1691    if ( shiftCount < 21 ) {
   1692        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   1693        goto invalid;
   1694    }
   1695    else if ( 52 < shiftCount ) {
   1696        if ( aExp || aSig ) float_raise( float_flag_inexact );
   1697        return 0;
   1698    }
   1699    aSig |= LIT64( 0x0010000000000000 );
   1700    savedASig = aSig;
   1701    aSig >>= shiftCount;
   1702    z = aSig;
   1703    if ( aSign ) z = - z;
   1704    if ( ( z < 0 ) ^ aSign ) {
   1705 invalid:
   1706        float_raise( float_flag_invalid );
   1707        return aSign ? 0x80000000 : 0x7FFFFFFF;
   1708    }
   1709    if ( ( aSig<<shiftCount ) != savedASig ) {
   1710        float_raise( float_flag_inexact );
   1711    }
   1712    return z;
   1713
   1714}
   1715
   1716/*
   1717-------------------------------------------------------------------------------
   1718Returns the result of converting the double-precision floating-point value
   1719`a' to the 32-bit two's complement unsigned integer format.  The conversion
   1720is performed according to the IEC/IEEE Standard for Binary Floating-point
   1721Arithmetic---which means in particular that the conversion is rounded
   1722according to the current rounding mode.  If `a' is a NaN, the largest
   1723positive integer is returned.  Otherwise, if the conversion overflows, the
   1724largest positive integer is returned.
   1725-------------------------------------------------------------------------------
   1726*/
   1727int32 float64_to_uint32( struct roundingData *roundData, float64 a )
   1728{
   1729    flag aSign;
   1730    int16 aExp, shiftCount;
   1731    bits64 aSig;
   1732
   1733    aSig = extractFloat64Frac( a );
   1734    aExp = extractFloat64Exp( a );
   1735    aSign = 0; //extractFloat64Sign( a );
   1736    //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   1737    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
   1738    shiftCount = 0x42C - aExp;
   1739    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
   1740    return roundAndPackInt32( roundData, aSign, aSig );
   1741}
   1742
   1743/*
   1744-------------------------------------------------------------------------------
   1745Returns the result of converting the double-precision floating-point value
   1746`a' to the 32-bit two's complement integer format.  The conversion is
   1747performed according to the IEC/IEEE Standard for Binary Floating-point
   1748Arithmetic, except that the conversion is always rounded toward zero.  If
   1749`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
   1750conversion overflows, the largest positive integer is returned.
   1751-------------------------------------------------------------------------------
   1752*/
   1753int32 float64_to_uint32_round_to_zero( float64 a )
   1754{
   1755    flag aSign;
   1756    int16 aExp, shiftCount;
   1757    bits64 aSig, savedASig;
   1758    int32 z;
   1759
   1760    aSig = extractFloat64Frac( a );
   1761    aExp = extractFloat64Exp( a );
   1762    aSign = extractFloat64Sign( a );
   1763    shiftCount = 0x433 - aExp;
   1764    if ( shiftCount < 21 ) {
   1765        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
   1766        goto invalid;
   1767    }
   1768    else if ( 52 < shiftCount ) {
   1769        if ( aExp || aSig ) float_raise( float_flag_inexact );
   1770        return 0;
   1771    }
   1772    aSig |= LIT64( 0x0010000000000000 );
   1773    savedASig = aSig;
   1774    aSig >>= shiftCount;
   1775    z = aSig;
   1776    if ( aSign ) z = - z;
   1777    if ( ( z < 0 ) ^ aSign ) {
   1778 invalid:
   1779        float_raise( float_flag_invalid );
   1780        return aSign ? 0x80000000 : 0x7FFFFFFF;
   1781    }
   1782    if ( ( aSig<<shiftCount ) != savedASig ) {
   1783        float_raise( float_flag_inexact );
   1784    }
   1785    return z;
   1786}
   1787
   1788/*
   1789-------------------------------------------------------------------------------
   1790Returns the result of converting the double-precision floating-point value
   1791`a' to the single-precision floating-point format.  The conversion is
   1792performed according to the IEC/IEEE Standard for Binary Floating-point
   1793Arithmetic.
   1794-------------------------------------------------------------------------------
   1795*/
   1796float32 float64_to_float32( struct roundingData *roundData, float64 a )
   1797{
   1798    flag aSign;
   1799    int16 aExp;
   1800    bits64 aSig;
   1801    bits32 zSig;
   1802
   1803    aSig = extractFloat64Frac( a );
   1804    aExp = extractFloat64Exp( a );
   1805    aSign = extractFloat64Sign( a );
   1806    if ( aExp == 0x7FF ) {
   1807        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
   1808        return packFloat32( aSign, 0xFF, 0 );
   1809    }
   1810    shift64RightJamming( aSig, 22, &aSig );
   1811    zSig = aSig;
   1812    if ( aExp || zSig ) {
   1813        zSig |= 0x40000000;
   1814        aExp -= 0x381;
   1815    }
   1816    return roundAndPackFloat32( roundData, aSign, aExp, zSig );
   1817
   1818}
   1819
   1820#ifdef FLOATX80
   1821
   1822/*
   1823-------------------------------------------------------------------------------
   1824Returns the result of converting the double-precision floating-point value
   1825`a' to the extended double-precision floating-point format.  The conversion
   1826is performed according to the IEC/IEEE Standard for Binary Floating-point
   1827Arithmetic.
   1828-------------------------------------------------------------------------------
   1829*/
   1830floatx80 float64_to_floatx80( float64 a )
   1831{
   1832    flag aSign;
   1833    int16 aExp;
   1834    bits64 aSig;
   1835
   1836    aSig = extractFloat64Frac( a );
   1837    aExp = extractFloat64Exp( a );
   1838    aSign = extractFloat64Sign( a );
   1839    if ( aExp == 0x7FF ) {
   1840        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
   1841        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   1842    }
   1843    if ( aExp == 0 ) {
   1844        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
   1845        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   1846    }
   1847    return
   1848        packFloatx80(
   1849            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
   1850
   1851}
   1852
   1853#endif
   1854
   1855/*
   1856-------------------------------------------------------------------------------
   1857Rounds the double-precision floating-point value `a' to an integer, and
   1858returns the result as a double-precision floating-point value.  The
   1859operation is performed according to the IEC/IEEE Standard for Binary
   1860Floating-point Arithmetic.
   1861-------------------------------------------------------------------------------
   1862*/
   1863float64 float64_round_to_int( struct roundingData *roundData, float64 a )
   1864{
   1865    flag aSign;
   1866    int16 aExp;
   1867    bits64 lastBitMask, roundBitsMask;
   1868    int8 roundingMode;
   1869    float64 z;
   1870
   1871    aExp = extractFloat64Exp( a );
   1872    if ( 0x433 <= aExp ) {
   1873        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
   1874            return propagateFloat64NaN( a, a );
   1875        }
   1876        return a;
   1877    }
   1878    if ( aExp <= 0x3FE ) {
   1879        if ( (bits64) ( a<<1 ) == 0 ) return a;
   1880        roundData->exception |= float_flag_inexact;
   1881        aSign = extractFloat64Sign( a );
   1882        switch ( roundData->mode ) {
   1883         case float_round_nearest_even:
   1884            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
   1885                return packFloat64( aSign, 0x3FF, 0 );
   1886            }
   1887            break;
   1888         case float_round_down:
   1889            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
   1890         case float_round_up:
   1891            return
   1892            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
   1893        }
   1894        return packFloat64( aSign, 0, 0 );
   1895    }
   1896    lastBitMask = 1;
   1897    lastBitMask <<= 0x433 - aExp;
   1898    roundBitsMask = lastBitMask - 1;
   1899    z = a;
   1900    roundingMode = roundData->mode;
   1901    if ( roundingMode == float_round_nearest_even ) {
   1902        z += lastBitMask>>1;
   1903        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
   1904    }
   1905    else if ( roundingMode != float_round_to_zero ) {
   1906        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   1907            z += roundBitsMask;
   1908        }
   1909    }
   1910    z &= ~ roundBitsMask;
   1911    if ( z != a ) roundData->exception |= float_flag_inexact;
   1912    return z;
   1913
   1914}
   1915
   1916/*
   1917-------------------------------------------------------------------------------
   1918Returns the result of adding the absolute values of the double-precision
   1919floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
   1920before being returned.  `zSign' is ignored if the result is a NaN.  The
   1921addition is performed according to the IEC/IEEE Standard for Binary
   1922Floating-point Arithmetic.
   1923-------------------------------------------------------------------------------
   1924*/
   1925static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
   1926{
   1927    int16 aExp, bExp, zExp;
   1928    bits64 aSig, bSig, zSig;
   1929    int16 expDiff;
   1930
   1931    aSig = extractFloat64Frac( a );
   1932    aExp = extractFloat64Exp( a );
   1933    bSig = extractFloat64Frac( b );
   1934    bExp = extractFloat64Exp( b );
   1935    expDiff = aExp - bExp;
   1936    aSig <<= 9;
   1937    bSig <<= 9;
   1938    if ( 0 < expDiff ) {
   1939        if ( aExp == 0x7FF ) {
   1940            if ( aSig ) return propagateFloat64NaN( a, b );
   1941            return a;
   1942        }
   1943        if ( bExp == 0 ) {
   1944            --expDiff;
   1945        }
   1946        else {
   1947            bSig |= LIT64( 0x2000000000000000 );
   1948        }
   1949        shift64RightJamming( bSig, expDiff, &bSig );
   1950        zExp = aExp;
   1951    }
   1952    else if ( expDiff < 0 ) {
   1953        if ( bExp == 0x7FF ) {
   1954            if ( bSig ) return propagateFloat64NaN( a, b );
   1955            return packFloat64( zSign, 0x7FF, 0 );
   1956        }
   1957        if ( aExp == 0 ) {
   1958            ++expDiff;
   1959        }
   1960        else {
   1961            aSig |= LIT64( 0x2000000000000000 );
   1962        }
   1963        shift64RightJamming( aSig, - expDiff, &aSig );
   1964        zExp = bExp;
   1965    }
   1966    else {
   1967        if ( aExp == 0x7FF ) {
   1968            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   1969            return a;
   1970        }
   1971        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
   1972        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
   1973        zExp = aExp;
   1974        goto roundAndPack;
   1975    }
   1976    aSig |= LIT64( 0x2000000000000000 );
   1977    zSig = ( aSig + bSig )<<1;
   1978    --zExp;
   1979    if ( (sbits64) zSig < 0 ) {
   1980        zSig = aSig + bSig;
   1981        ++zExp;
   1982    }
   1983 roundAndPack:
   1984    return roundAndPackFloat64( roundData, zSign, zExp, zSig );
   1985
   1986}
   1987
   1988/*
   1989-------------------------------------------------------------------------------
   1990Returns the result of subtracting the absolute values of the double-
   1991precision floating-point values `a' and `b'.  If `zSign' is true, the
   1992difference is negated before being returned.  `zSign' is ignored if the
   1993result is a NaN.  The subtraction is performed according to the IEC/IEEE
   1994Standard for Binary Floating-point Arithmetic.
   1995-------------------------------------------------------------------------------
   1996*/
   1997static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
   1998{
   1999    int16 aExp, bExp, zExp;
   2000    bits64 aSig, bSig, zSig;
   2001    int16 expDiff;
   2002
   2003    aSig = extractFloat64Frac( a );
   2004    aExp = extractFloat64Exp( a );
   2005    bSig = extractFloat64Frac( b );
   2006    bExp = extractFloat64Exp( b );
   2007    expDiff = aExp - bExp;
   2008    aSig <<= 10;
   2009    bSig <<= 10;
   2010    if ( 0 < expDiff ) goto aExpBigger;
   2011    if ( expDiff < 0 ) goto bExpBigger;
   2012    if ( aExp == 0x7FF ) {
   2013        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
   2014        roundData->exception |= float_flag_invalid;
   2015        return float64_default_nan;
   2016    }
   2017    if ( aExp == 0 ) {
   2018        aExp = 1;
   2019        bExp = 1;
   2020    }
   2021    if ( bSig < aSig ) goto aBigger;
   2022    if ( aSig < bSig ) goto bBigger;
   2023    return packFloat64( roundData->mode == float_round_down, 0, 0 );
   2024 bExpBigger:
   2025    if ( bExp == 0x7FF ) {
   2026        if ( bSig ) return propagateFloat64NaN( a, b );
   2027        return packFloat64( zSign ^ 1, 0x7FF, 0 );
   2028    }
   2029    if ( aExp == 0 ) {
   2030        ++expDiff;
   2031    }
   2032    else {
   2033        aSig |= LIT64( 0x4000000000000000 );
   2034    }
   2035    shift64RightJamming( aSig, - expDiff, &aSig );
   2036    bSig |= LIT64( 0x4000000000000000 );
   2037 bBigger:
   2038    zSig = bSig - aSig;
   2039    zExp = bExp;
   2040    zSign ^= 1;
   2041    goto normalizeRoundAndPack;
   2042 aExpBigger:
   2043    if ( aExp == 0x7FF ) {
   2044        if ( aSig ) return propagateFloat64NaN( a, b );
   2045        return a;
   2046    }
   2047    if ( bExp == 0 ) {
   2048        --expDiff;
   2049    }
   2050    else {
   2051        bSig |= LIT64( 0x4000000000000000 );
   2052    }
   2053    shift64RightJamming( bSig, expDiff, &bSig );
   2054    aSig |= LIT64( 0x4000000000000000 );
   2055 aBigger:
   2056    zSig = aSig - bSig;
   2057    zExp = aExp;
   2058 normalizeRoundAndPack:
   2059    --zExp;
   2060    return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
   2061
   2062}
   2063
   2064/*
   2065-------------------------------------------------------------------------------
   2066Returns the result of adding the double-precision floating-point values `a'
   2067and `b'.  The operation is performed according to the IEC/IEEE Standard for
   2068Binary Floating-point Arithmetic.
   2069-------------------------------------------------------------------------------
   2070*/
   2071float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
   2072{
   2073    flag aSign, bSign;
   2074
   2075    aSign = extractFloat64Sign( a );
   2076    bSign = extractFloat64Sign( b );
   2077    if ( aSign == bSign ) {
   2078        return addFloat64Sigs( roundData, a, b, aSign );
   2079    }
   2080    else {
   2081        return subFloat64Sigs( roundData, a, b, aSign );
   2082    }
   2083
   2084}
   2085
   2086/*
   2087-------------------------------------------------------------------------------
   2088Returns the result of subtracting the double-precision floating-point values
   2089`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2090for Binary Floating-point Arithmetic.
   2091-------------------------------------------------------------------------------
   2092*/
   2093float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
   2094{
   2095    flag aSign, bSign;
   2096
   2097    aSign = extractFloat64Sign( a );
   2098    bSign = extractFloat64Sign( b );
   2099    if ( aSign == bSign ) {
   2100        return subFloat64Sigs( roundData, a, b, aSign );
   2101    }
   2102    else {
   2103        return addFloat64Sigs( roundData, a, b, aSign );
   2104    }
   2105
   2106}
   2107
   2108/*
   2109-------------------------------------------------------------------------------
   2110Returns the result of multiplying the double-precision floating-point values
   2111`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
   2112for Binary Floating-point Arithmetic.
   2113-------------------------------------------------------------------------------
   2114*/
   2115float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
   2116{
   2117    flag aSign, bSign, zSign;
   2118    int16 aExp, bExp, zExp;
   2119    bits64 aSig, bSig, zSig0, zSig1;
   2120
   2121    aSig = extractFloat64Frac( a );
   2122    aExp = extractFloat64Exp( a );
   2123    aSign = extractFloat64Sign( a );
   2124    bSig = extractFloat64Frac( b );
   2125    bExp = extractFloat64Exp( b );
   2126    bSign = extractFloat64Sign( b );
   2127    zSign = aSign ^ bSign;
   2128    if ( aExp == 0x7FF ) {
   2129        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2130            return propagateFloat64NaN( a, b );
   2131        }
   2132        if ( ( bExp | bSig ) == 0 ) {
   2133            roundData->exception |= float_flag_invalid;
   2134            return float64_default_nan;
   2135        }
   2136        return packFloat64( zSign, 0x7FF, 0 );
   2137    }
   2138    if ( bExp == 0x7FF ) {
   2139        if ( bSig ) return propagateFloat64NaN( a, b );
   2140        if ( ( aExp | aSig ) == 0 ) {
   2141            roundData->exception |= float_flag_invalid;
   2142            return float64_default_nan;
   2143        }
   2144        return packFloat64( zSign, 0x7FF, 0 );
   2145    }
   2146    if ( aExp == 0 ) {
   2147        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2148        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2149    }
   2150    if ( bExp == 0 ) {
   2151        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
   2152        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2153    }
   2154    zExp = aExp + bExp - 0x3FF;
   2155    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2156    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2157    mul64To128( aSig, bSig, &zSig0, &zSig1 );
   2158    zSig0 |= ( zSig1 != 0 );
   2159    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
   2160        zSig0 <<= 1;
   2161        --zExp;
   2162    }
   2163    return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
   2164
   2165}
   2166
   2167/*
   2168-------------------------------------------------------------------------------
   2169Returns the result of dividing the double-precision floating-point value `a'
   2170by the corresponding value `b'.  The operation is performed according to
   2171the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2172-------------------------------------------------------------------------------
   2173*/
   2174float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
   2175{
   2176    flag aSign, bSign, zSign;
   2177    int16 aExp, bExp, zExp;
   2178    bits64 aSig, bSig, zSig;
   2179    bits64 rem0, rem1;
   2180    bits64 term0, term1;
   2181
   2182    aSig = extractFloat64Frac( a );
   2183    aExp = extractFloat64Exp( a );
   2184    aSign = extractFloat64Sign( a );
   2185    bSig = extractFloat64Frac( b );
   2186    bExp = extractFloat64Exp( b );
   2187    bSign = extractFloat64Sign( b );
   2188    zSign = aSign ^ bSign;
   2189    if ( aExp == 0x7FF ) {
   2190        if ( aSig ) return propagateFloat64NaN( a, b );
   2191        if ( bExp == 0x7FF ) {
   2192            if ( bSig ) return propagateFloat64NaN( a, b );
   2193            roundData->exception |= float_flag_invalid;
   2194            return float64_default_nan;
   2195        }
   2196        return packFloat64( zSign, 0x7FF, 0 );
   2197    }
   2198    if ( bExp == 0x7FF ) {
   2199        if ( bSig ) return propagateFloat64NaN( a, b );
   2200        return packFloat64( zSign, 0, 0 );
   2201    }
   2202    if ( bExp == 0 ) {
   2203        if ( bSig == 0 ) {
   2204            if ( ( aExp | aSig ) == 0 ) {
   2205                roundData->exception |= float_flag_invalid;
   2206                return float64_default_nan;
   2207            }
   2208            roundData->exception |= float_flag_divbyzero;
   2209            return packFloat64( zSign, 0x7FF, 0 );
   2210        }
   2211        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2212    }
   2213    if ( aExp == 0 ) {
   2214        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
   2215        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2216    }
   2217    zExp = aExp - bExp + 0x3FD;
   2218    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
   2219    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2220    if ( bSig <= ( aSig + aSig ) ) {
   2221        aSig >>= 1;
   2222        ++zExp;
   2223    }
   2224    zSig = estimateDiv128To64( aSig, 0, bSig );
   2225    if ( ( zSig & 0x1FF ) <= 2 ) {
   2226        mul64To128( bSig, zSig, &term0, &term1 );
   2227        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   2228        while ( (sbits64) rem0 < 0 ) {
   2229            --zSig;
   2230            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   2231        }
   2232        zSig |= ( rem1 != 0 );
   2233    }
   2234    return roundAndPackFloat64( roundData, zSign, zExp, zSig );
   2235
   2236}
   2237
   2238/*
   2239-------------------------------------------------------------------------------
   2240Returns the remainder of the double-precision floating-point value `a'
   2241with respect to the corresponding value `b'.  The operation is performed
   2242according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2243-------------------------------------------------------------------------------
   2244*/
   2245float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
   2246{
   2247    flag aSign, bSign, zSign;
   2248    int16 aExp, bExp, expDiff;
   2249    bits64 aSig, bSig;
   2250    bits64 q, alternateASig;
   2251    sbits64 sigMean;
   2252
   2253    aSig = extractFloat64Frac( a );
   2254    aExp = extractFloat64Exp( a );
   2255    aSign = extractFloat64Sign( a );
   2256    bSig = extractFloat64Frac( b );
   2257    bExp = extractFloat64Exp( b );
   2258    bSign = extractFloat64Sign( b );
   2259    if ( aExp == 0x7FF ) {
   2260        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
   2261            return propagateFloat64NaN( a, b );
   2262        }
   2263        roundData->exception |= float_flag_invalid;
   2264        return float64_default_nan;
   2265    }
   2266    if ( bExp == 0x7FF ) {
   2267        if ( bSig ) return propagateFloat64NaN( a, b );
   2268        return a;
   2269    }
   2270    if ( bExp == 0 ) {
   2271        if ( bSig == 0 ) {
   2272            roundData->exception |= float_flag_invalid;
   2273            return float64_default_nan;
   2274        }
   2275        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
   2276    }
   2277    if ( aExp == 0 ) {
   2278        if ( aSig == 0 ) return a;
   2279        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2280    }
   2281    expDiff = aExp - bExp;
   2282    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
   2283    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
   2284    if ( expDiff < 0 ) {
   2285        if ( expDiff < -1 ) return a;
   2286        aSig >>= 1;
   2287    }
   2288    q = ( bSig <= aSig );
   2289    if ( q ) aSig -= bSig;
   2290    expDiff -= 64;
   2291    while ( 0 < expDiff ) {
   2292        q = estimateDiv128To64( aSig, 0, bSig );
   2293        q = ( 2 < q ) ? q - 2 : 0;
   2294        aSig = - ( ( bSig>>2 ) * q );
   2295        expDiff -= 62;
   2296    }
   2297    expDiff += 64;
   2298    if ( 0 < expDiff ) {
   2299        q = estimateDiv128To64( aSig, 0, bSig );
   2300        q = ( 2 < q ) ? q - 2 : 0;
   2301        q >>= 64 - expDiff;
   2302        bSig >>= 2;
   2303        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
   2304    }
   2305    else {
   2306        aSig >>= 2;
   2307        bSig >>= 2;
   2308    }
   2309    do {
   2310        alternateASig = aSig;
   2311        ++q;
   2312        aSig -= bSig;
   2313    } while ( 0 <= (sbits64) aSig );
   2314    sigMean = aSig + alternateASig;
   2315    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
   2316        aSig = alternateASig;
   2317    }
   2318    zSign = ( (sbits64) aSig < 0 );
   2319    if ( zSign ) aSig = - aSig;
   2320    return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
   2321
   2322}
   2323
   2324/*
   2325-------------------------------------------------------------------------------
   2326Returns the square root of the double-precision floating-point value `a'.
   2327The operation is performed according to the IEC/IEEE Standard for Binary
   2328Floating-point Arithmetic.
   2329-------------------------------------------------------------------------------
   2330*/
   2331float64 float64_sqrt( struct roundingData *roundData, float64 a )
   2332{
   2333    flag aSign;
   2334    int16 aExp, zExp;
   2335    bits64 aSig, zSig;
   2336    bits64 rem0, rem1, term0, term1; //, shiftedRem;
   2337    //float64 z;
   2338
   2339    aSig = extractFloat64Frac( a );
   2340    aExp = extractFloat64Exp( a );
   2341    aSign = extractFloat64Sign( a );
   2342    if ( aExp == 0x7FF ) {
   2343        if ( aSig ) return propagateFloat64NaN( a, a );
   2344        if ( ! aSign ) return a;
   2345        roundData->exception |= float_flag_invalid;
   2346        return float64_default_nan;
   2347    }
   2348    if ( aSign ) {
   2349        if ( ( aExp | aSig ) == 0 ) return a;
   2350        roundData->exception |= float_flag_invalid;
   2351        return float64_default_nan;
   2352    }
   2353    if ( aExp == 0 ) {
   2354        if ( aSig == 0 ) return 0;
   2355        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
   2356    }
   2357    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
   2358    aSig |= LIT64( 0x0010000000000000 );
   2359    zSig = estimateSqrt32( aExp, aSig>>21 );
   2360    zSig <<= 31;
   2361    aSig <<= 9 - ( aExp & 1 );
   2362    zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
   2363    if ( ( zSig & 0x3FF ) <= 5 ) {
   2364        if ( zSig < 2 ) {
   2365            zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
   2366        }
   2367        else {
   2368            aSig <<= 2;
   2369            mul64To128( zSig, zSig, &term0, &term1 );
   2370            sub128( aSig, 0, term0, term1, &rem0, &rem1 );
   2371            while ( (sbits64) rem0 < 0 ) {
   2372                --zSig;
   2373                shortShift128Left( 0, zSig, 1, &term0, &term1 );
   2374                term1 |= 1;
   2375                add128( rem0, rem1, term0, term1, &rem0, &rem1 );
   2376            }
   2377            zSig |= ( ( rem0 | rem1 ) != 0 );
   2378        }
   2379    }
   2380    shift64RightJamming( zSig, 1, &zSig );
   2381    return roundAndPackFloat64( roundData, 0, zExp, zSig );
   2382
   2383}
   2384
   2385/*
   2386-------------------------------------------------------------------------------
   2387Returns 1 if the double-precision floating-point value `a' is equal to the
   2388corresponding value `b', and 0 otherwise.  The comparison is performed
   2389according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2390-------------------------------------------------------------------------------
   2391*/
   2392flag float64_eq( float64 a, float64 b )
   2393{
   2394
   2395    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   2396         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   2397       ) {
   2398        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
   2399            float_raise( float_flag_invalid );
   2400        }
   2401        return 0;
   2402    }
   2403    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
   2404
   2405}
   2406
   2407/*
   2408-------------------------------------------------------------------------------
   2409Returns 1 if the double-precision floating-point value `a' is less than or
   2410equal to the corresponding value `b', and 0 otherwise.  The comparison is
   2411performed according to the IEC/IEEE Standard for Binary Floating-point
   2412Arithmetic.
   2413-------------------------------------------------------------------------------
   2414*/
   2415flag float64_le( float64 a, float64 b )
   2416{
   2417    flag aSign, bSign;
   2418
   2419    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   2420         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   2421       ) {
   2422        float_raise( float_flag_invalid );
   2423        return 0;
   2424    }
   2425    aSign = extractFloat64Sign( a );
   2426    bSign = extractFloat64Sign( b );
   2427    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
   2428    return ( a == b ) || ( aSign ^ ( a < b ) );
   2429
   2430}
   2431
   2432/*
   2433-------------------------------------------------------------------------------
   2434Returns 1 if the double-precision floating-point value `a' is less than
   2435the corresponding value `b', and 0 otherwise.  The comparison is performed
   2436according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2437-------------------------------------------------------------------------------
   2438*/
   2439flag float64_lt( float64 a, float64 b )
   2440{
   2441    flag aSign, bSign;
   2442
   2443    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   2444         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   2445       ) {
   2446        float_raise( float_flag_invalid );
   2447        return 0;
   2448    }
   2449    aSign = extractFloat64Sign( a );
   2450    bSign = extractFloat64Sign( b );
   2451    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
   2452    return ( a != b ) && ( aSign ^ ( a < b ) );
   2453
   2454}
   2455
   2456/*
   2457-------------------------------------------------------------------------------
   2458Returns 1 if the double-precision floating-point value `a' is equal to the
   2459corresponding value `b', and 0 otherwise.  The invalid exception is raised
   2460if either operand is a NaN.  Otherwise, the comparison is performed
   2461according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2462-------------------------------------------------------------------------------
   2463*/
   2464flag float64_eq_signaling( float64 a, float64 b )
   2465{
   2466
   2467    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   2468         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   2469       ) {
   2470        float_raise( float_flag_invalid );
   2471        return 0;
   2472    }
   2473    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
   2474
   2475}
   2476
   2477/*
   2478-------------------------------------------------------------------------------
   2479Returns 1 if the double-precision floating-point value `a' is less than or
   2480equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
   2481cause an exception.  Otherwise, the comparison is performed according to the
   2482IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2483-------------------------------------------------------------------------------
   2484*/
   2485flag float64_le_quiet( float64 a, float64 b )
   2486{
   2487    flag aSign, bSign;
   2488    //int16 aExp, bExp;
   2489
   2490    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   2491         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   2492       ) {
   2493        /* Do nothing, even if NaN as we're quiet */
   2494        return 0;
   2495    }
   2496    aSign = extractFloat64Sign( a );
   2497    bSign = extractFloat64Sign( b );
   2498    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
   2499    return ( a == b ) || ( aSign ^ ( a < b ) );
   2500
   2501}
   2502
   2503/*
   2504-------------------------------------------------------------------------------
   2505Returns 1 if the double-precision floating-point value `a' is less than
   2506the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
   2507exception.  Otherwise, the comparison is performed according to the IEC/IEEE
   2508Standard for Binary Floating-point Arithmetic.
   2509-------------------------------------------------------------------------------
   2510*/
   2511flag float64_lt_quiet( float64 a, float64 b )
   2512{
   2513    flag aSign, bSign;
   2514
   2515    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
   2516         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
   2517       ) {
   2518        /* Do nothing, even if NaN as we're quiet */
   2519        return 0;
   2520    }
   2521    aSign = extractFloat64Sign( a );
   2522    bSign = extractFloat64Sign( b );
   2523    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
   2524    return ( a != b ) && ( aSign ^ ( a < b ) );
   2525
   2526}
   2527
   2528#ifdef FLOATX80
   2529
   2530/*
   2531-------------------------------------------------------------------------------
   2532Returns the result of converting the extended double-precision floating-
   2533point value `a' to the 32-bit two's complement integer format.  The
   2534conversion is performed according to the IEC/IEEE Standard for Binary
   2535Floating-point Arithmetic---which means in particular that the conversion
   2536is rounded according to the current rounding mode.  If `a' is a NaN, the
   2537largest positive integer is returned.  Otherwise, if the conversion
   2538overflows, the largest integer with the same sign as `a' is returned.
   2539-------------------------------------------------------------------------------
   2540*/
   2541int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
   2542{
   2543    flag aSign;
   2544    int32 aExp, shiftCount;
   2545    bits64 aSig;
   2546
   2547    aSig = extractFloatx80Frac( a );
   2548    aExp = extractFloatx80Exp( a );
   2549    aSign = extractFloatx80Sign( a );
   2550    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   2551    shiftCount = 0x4037 - aExp;
   2552    if ( shiftCount <= 0 ) shiftCount = 1;
   2553    shift64RightJamming( aSig, shiftCount, &aSig );
   2554    return roundAndPackInt32( roundData, aSign, aSig );
   2555
   2556}
   2557
   2558/*
   2559-------------------------------------------------------------------------------
   2560Returns the result of converting the extended double-precision floating-
   2561point value `a' to the 32-bit two's complement integer format.  The
   2562conversion is performed according to the IEC/IEEE Standard for Binary
   2563Floating-point Arithmetic, except that the conversion is always rounded
   2564toward zero.  If `a' is a NaN, the largest positive integer is returned.
   2565Otherwise, if the conversion overflows, the largest integer with the same
   2566sign as `a' is returned.
   2567-------------------------------------------------------------------------------
   2568*/
   2569int32 floatx80_to_int32_round_to_zero( floatx80 a )
   2570{
   2571    flag aSign;
   2572    int32 aExp, shiftCount;
   2573    bits64 aSig, savedASig;
   2574    int32 z;
   2575
   2576    aSig = extractFloatx80Frac( a );
   2577    aExp = extractFloatx80Exp( a );
   2578    aSign = extractFloatx80Sign( a );
   2579    shiftCount = 0x403E - aExp;
   2580    if ( shiftCount < 32 ) {
   2581        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
   2582        goto invalid;
   2583    }
   2584    else if ( 63 < shiftCount ) {
   2585        if ( aExp || aSig ) float_raise( float_flag_inexact );
   2586        return 0;
   2587    }
   2588    savedASig = aSig;
   2589    aSig >>= shiftCount;
   2590    z = aSig;
   2591    if ( aSign ) z = - z;
   2592    if ( ( z < 0 ) ^ aSign ) {
   2593 invalid:
   2594        float_raise( float_flag_invalid );
   2595        return aSign ? 0x80000000 : 0x7FFFFFFF;
   2596    }
   2597    if ( ( aSig<<shiftCount ) != savedASig ) {
   2598        float_raise( float_flag_inexact );
   2599    }
   2600    return z;
   2601
   2602}
   2603
   2604/*
   2605-------------------------------------------------------------------------------
   2606Returns the result of converting the extended double-precision floating-
   2607point value `a' to the single-precision floating-point format.  The
   2608conversion is performed according to the IEC/IEEE Standard for Binary
   2609Floating-point Arithmetic.
   2610-------------------------------------------------------------------------------
   2611*/
   2612float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
   2613{
   2614    flag aSign;
   2615    int32 aExp;
   2616    bits64 aSig;
   2617
   2618    aSig = extractFloatx80Frac( a );
   2619    aExp = extractFloatx80Exp( a );
   2620    aSign = extractFloatx80Sign( a );
   2621    if ( aExp == 0x7FFF ) {
   2622        if ( (bits64) ( aSig<<1 ) ) {
   2623            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
   2624        }
   2625        return packFloat32( aSign, 0xFF, 0 );
   2626    }
   2627    shift64RightJamming( aSig, 33, &aSig );
   2628    if ( aExp || aSig ) aExp -= 0x3F81;
   2629    return roundAndPackFloat32( roundData, aSign, aExp, aSig );
   2630
   2631}
   2632
   2633/*
   2634-------------------------------------------------------------------------------
   2635Returns the result of converting the extended double-precision floating-
   2636point value `a' to the double-precision floating-point format.  The
   2637conversion is performed according to the IEC/IEEE Standard for Binary
   2638Floating-point Arithmetic.
   2639-------------------------------------------------------------------------------
   2640*/
   2641float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
   2642{
   2643    flag aSign;
   2644    int32 aExp;
   2645    bits64 aSig, zSig;
   2646
   2647    aSig = extractFloatx80Frac( a );
   2648    aExp = extractFloatx80Exp( a );
   2649    aSign = extractFloatx80Sign( a );
   2650    if ( aExp == 0x7FFF ) {
   2651        if ( (bits64) ( aSig<<1 ) ) {
   2652            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
   2653        }
   2654        return packFloat64( aSign, 0x7FF, 0 );
   2655    }
   2656    shift64RightJamming( aSig, 1, &zSig );
   2657    if ( aExp || aSig ) aExp -= 0x3C01;
   2658    return roundAndPackFloat64( roundData, aSign, aExp, zSig );
   2659
   2660}
   2661
   2662/*
   2663-------------------------------------------------------------------------------
   2664Rounds the extended double-precision floating-point value `a' to an integer,
   2665and returns the result as an extended quadruple-precision floating-point
   2666value.  The operation is performed according to the IEC/IEEE Standard for
   2667Binary Floating-point Arithmetic.
   2668-------------------------------------------------------------------------------
   2669*/
   2670floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
   2671{
   2672    flag aSign;
   2673    int32 aExp;
   2674    bits64 lastBitMask, roundBitsMask;
   2675    int8 roundingMode;
   2676    floatx80 z;
   2677
   2678    aExp = extractFloatx80Exp( a );
   2679    if ( 0x403E <= aExp ) {
   2680        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
   2681            return propagateFloatx80NaN( a, a );
   2682        }
   2683        return a;
   2684    }
   2685    if ( aExp <= 0x3FFE ) {
   2686        if (    ( aExp == 0 )
   2687             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
   2688            return a;
   2689        }
   2690        roundData->exception |= float_flag_inexact;
   2691        aSign = extractFloatx80Sign( a );
   2692        switch ( roundData->mode ) {
   2693         case float_round_nearest_even:
   2694            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
   2695               ) {
   2696                return
   2697                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
   2698            }
   2699            break;
   2700         case float_round_down:
   2701            return
   2702                  aSign ?
   2703                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
   2704                : packFloatx80( 0, 0, 0 );
   2705         case float_round_up:
   2706            return
   2707                  aSign ? packFloatx80( 1, 0, 0 )
   2708                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
   2709        }
   2710        return packFloatx80( aSign, 0, 0 );
   2711    }
   2712    lastBitMask = 1;
   2713    lastBitMask <<= 0x403E - aExp;
   2714    roundBitsMask = lastBitMask - 1;
   2715    z = a;
   2716    roundingMode = roundData->mode;
   2717    if ( roundingMode == float_round_nearest_even ) {
   2718        z.low += lastBitMask>>1;
   2719        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
   2720    }
   2721    else if ( roundingMode != float_round_to_zero ) {
   2722        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
   2723            z.low += roundBitsMask;
   2724        }
   2725    }
   2726    z.low &= ~ roundBitsMask;
   2727    if ( z.low == 0 ) {
   2728        ++z.high;
   2729        z.low = LIT64( 0x8000000000000000 );
   2730    }
   2731    if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
   2732    return z;
   2733
   2734}
   2735
   2736/*
   2737-------------------------------------------------------------------------------
   2738Returns the result of adding the absolute values of the extended double-
   2739precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
   2740negated before being returned.  `zSign' is ignored if the result is a NaN.
   2741The addition is performed according to the IEC/IEEE Standard for Binary
   2742Floating-point Arithmetic.
   2743-------------------------------------------------------------------------------
   2744*/
   2745static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
   2746{
   2747    int32 aExp, bExp, zExp;
   2748    bits64 aSig, bSig, zSig0, zSig1;
   2749    int32 expDiff;
   2750
   2751    aSig = extractFloatx80Frac( a );
   2752    aExp = extractFloatx80Exp( a );
   2753    bSig = extractFloatx80Frac( b );
   2754    bExp = extractFloatx80Exp( b );
   2755    expDiff = aExp - bExp;
   2756    if ( 0 < expDiff ) {
   2757        if ( aExp == 0x7FFF ) {
   2758            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   2759            return a;
   2760        }
   2761        if ( bExp == 0 ) --expDiff;
   2762        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   2763        zExp = aExp;
   2764    }
   2765    else if ( expDiff < 0 ) {
   2766        if ( bExp == 0x7FFF ) {
   2767            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   2768            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2769        }
   2770        if ( aExp == 0 ) ++expDiff;
   2771        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   2772        zExp = bExp;
   2773    }
   2774    else {
   2775        if ( aExp == 0x7FFF ) {
   2776            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   2777                return propagateFloatx80NaN( a, b );
   2778            }
   2779            return a;
   2780        }
   2781        zSig1 = 0;
   2782        zSig0 = aSig + bSig;
   2783        if ( aExp == 0 ) {
   2784            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
   2785            goto roundAndPack;
   2786        }
   2787        zExp = aExp;
   2788        goto shiftRight1;
   2789    }
   2790    
   2791    zSig0 = aSig + bSig;
   2792
   2793    if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 
   2794 shiftRight1:
   2795    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
   2796    zSig0 |= LIT64( 0x8000000000000000 );
   2797    ++zExp;
   2798 roundAndPack:
   2799    return
   2800        roundAndPackFloatx80(
   2801            roundData, zSign, zExp, zSig0, zSig1 );
   2802
   2803}
   2804
   2805/*
   2806-------------------------------------------------------------------------------
   2807Returns the result of subtracting the absolute values of the extended
   2808double-precision floating-point values `a' and `b'.  If `zSign' is true,
   2809the difference is negated before being returned.  `zSign' is ignored if the
   2810result is a NaN.  The subtraction is performed according to the IEC/IEEE
   2811Standard for Binary Floating-point Arithmetic.
   2812-------------------------------------------------------------------------------
   2813*/
   2814static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
   2815{
   2816    int32 aExp, bExp, zExp;
   2817    bits64 aSig, bSig, zSig0, zSig1;
   2818    int32 expDiff;
   2819    floatx80 z;
   2820
   2821    aSig = extractFloatx80Frac( a );
   2822    aExp = extractFloatx80Exp( a );
   2823    bSig = extractFloatx80Frac( b );
   2824    bExp = extractFloatx80Exp( b );
   2825    expDiff = aExp - bExp;
   2826    if ( 0 < expDiff ) goto aExpBigger;
   2827    if ( expDiff < 0 ) goto bExpBigger;
   2828    if ( aExp == 0x7FFF ) {
   2829        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
   2830            return propagateFloatx80NaN( a, b );
   2831        }
   2832        roundData->exception |= float_flag_invalid;
   2833        z.low = floatx80_default_nan_low;
   2834        z.high = floatx80_default_nan_high;
   2835        z.__padding = 0;
   2836        return z;
   2837    }
   2838    if ( aExp == 0 ) {
   2839        aExp = 1;
   2840        bExp = 1;
   2841    }
   2842    zSig1 = 0;
   2843    if ( bSig < aSig ) goto aBigger;
   2844    if ( aSig < bSig ) goto bBigger;
   2845    return packFloatx80( roundData->mode == float_round_down, 0, 0 );
   2846 bExpBigger:
   2847    if ( bExp == 0x7FFF ) {
   2848        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   2849        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2850    }
   2851    if ( aExp == 0 ) ++expDiff;
   2852    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
   2853 bBigger:
   2854    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
   2855    zExp = bExp;
   2856    zSign ^= 1;
   2857    goto normalizeRoundAndPack;
   2858 aExpBigger:
   2859    if ( aExp == 0x7FFF ) {
   2860        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   2861        return a;
   2862    }
   2863    if ( bExp == 0 ) --expDiff;
   2864    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
   2865 aBigger:
   2866    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
   2867    zExp = aExp;
   2868 normalizeRoundAndPack:
   2869    return
   2870        normalizeRoundAndPackFloatx80(
   2871            roundData, zSign, zExp, zSig0, zSig1 );
   2872
   2873}
   2874
   2875/*
   2876-------------------------------------------------------------------------------
   2877Returns the result of adding the extended double-precision floating-point
   2878values `a' and `b'.  The operation is performed according to the IEC/IEEE
   2879Standard for Binary Floating-point Arithmetic.
   2880-------------------------------------------------------------------------------
   2881*/
   2882floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
   2883{
   2884    flag aSign, bSign;
   2885    
   2886    aSign = extractFloatx80Sign( a );
   2887    bSign = extractFloatx80Sign( b );
   2888    if ( aSign == bSign ) {
   2889        return addFloatx80Sigs( roundData, a, b, aSign );
   2890    }
   2891    else {
   2892        return subFloatx80Sigs( roundData, a, b, aSign );
   2893    }
   2894    
   2895}
   2896
   2897/*
   2898-------------------------------------------------------------------------------
   2899Returns the result of subtracting the extended double-precision floating-
   2900point values `a' and `b'.  The operation is performed according to the
   2901IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2902-------------------------------------------------------------------------------
   2903*/
   2904floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
   2905{
   2906    flag aSign, bSign;
   2907
   2908    aSign = extractFloatx80Sign( a );
   2909    bSign = extractFloatx80Sign( b );
   2910    if ( aSign == bSign ) {
   2911        return subFloatx80Sigs( roundData, a, b, aSign );
   2912    }
   2913    else {
   2914        return addFloatx80Sigs( roundData, a, b, aSign );
   2915    }
   2916
   2917}
   2918
   2919/*
   2920-------------------------------------------------------------------------------
   2921Returns the result of multiplying the extended double-precision floating-
   2922point values `a' and `b'.  The operation is performed according to the
   2923IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2924-------------------------------------------------------------------------------
   2925*/
   2926floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
   2927{
   2928    flag aSign, bSign, zSign;
   2929    int32 aExp, bExp, zExp;
   2930    bits64 aSig, bSig, zSig0, zSig1;
   2931    floatx80 z;
   2932
   2933    aSig = extractFloatx80Frac( a );
   2934    aExp = extractFloatx80Exp( a );
   2935    aSign = extractFloatx80Sign( a );
   2936    bSig = extractFloatx80Frac( b );
   2937    bExp = extractFloatx80Exp( b );
   2938    bSign = extractFloatx80Sign( b );
   2939    zSign = aSign ^ bSign;
   2940    if ( aExp == 0x7FFF ) {
   2941        if (    (bits64) ( aSig<<1 )
   2942             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   2943            return propagateFloatx80NaN( a, b );
   2944        }
   2945        if ( ( bExp | bSig ) == 0 ) goto invalid;
   2946        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2947    }
   2948    if ( bExp == 0x7FFF ) {
   2949        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   2950        if ( ( aExp | aSig ) == 0 ) {
   2951 invalid:
   2952            roundData->exception |= float_flag_invalid;
   2953            z.low = floatx80_default_nan_low;
   2954            z.high = floatx80_default_nan_high;
   2955            z.__padding = 0;
   2956            return z;
   2957        }
   2958        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   2959    }
   2960    if ( aExp == 0 ) {
   2961        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   2962        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   2963    }
   2964    if ( bExp == 0 ) {
   2965        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
   2966        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   2967    }
   2968    zExp = aExp + bExp - 0x3FFE;
   2969    mul64To128( aSig, bSig, &zSig0, &zSig1 );
   2970    if ( 0 < (sbits64) zSig0 ) {
   2971        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
   2972        --zExp;
   2973    }
   2974    return
   2975        roundAndPackFloatx80(
   2976            roundData, zSign, zExp, zSig0, zSig1 );
   2977
   2978}
   2979
   2980/*
   2981-------------------------------------------------------------------------------
   2982Returns the result of dividing the extended double-precision floating-point
   2983value `a' by the corresponding value `b'.  The operation is performed
   2984according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   2985-------------------------------------------------------------------------------
   2986*/
   2987floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
   2988{
   2989    flag aSign, bSign, zSign;
   2990    int32 aExp, bExp, zExp;
   2991    bits64 aSig, bSig, zSig0, zSig1;
   2992    bits64 rem0, rem1, rem2, term0, term1, term2;
   2993    floatx80 z;
   2994
   2995    aSig = extractFloatx80Frac( a );
   2996    aExp = extractFloatx80Exp( a );
   2997    aSign = extractFloatx80Sign( a );
   2998    bSig = extractFloatx80Frac( b );
   2999    bExp = extractFloatx80Exp( b );
   3000    bSign = extractFloatx80Sign( b );
   3001    zSign = aSign ^ bSign;
   3002    if ( aExp == 0x7FFF ) {
   3003        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3004        if ( bExp == 0x7FFF ) {
   3005            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3006            goto invalid;
   3007        }
   3008        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3009    }
   3010    if ( bExp == 0x7FFF ) {
   3011        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3012        return packFloatx80( zSign, 0, 0 );
   3013    }
   3014    if ( bExp == 0 ) {
   3015        if ( bSig == 0 ) {
   3016            if ( ( aExp | aSig ) == 0 ) {
   3017 invalid:
   3018                roundData->exception |= float_flag_invalid;
   3019                z.low = floatx80_default_nan_low;
   3020                z.high = floatx80_default_nan_high;
   3021                z.__padding = 0;
   3022                return z;
   3023            }
   3024            roundData->exception |= float_flag_divbyzero;
   3025            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
   3026        }
   3027        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3028    }
   3029    if ( aExp == 0 ) {
   3030        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
   3031        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
   3032    }
   3033    zExp = aExp - bExp + 0x3FFE;
   3034    rem1 = 0;
   3035    if ( bSig <= aSig ) {
   3036        shift128Right( aSig, 0, 1, &aSig, &rem1 );
   3037        ++zExp;
   3038    }
   3039    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
   3040    mul64To128( bSig, zSig0, &term0, &term1 );
   3041    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
   3042    while ( (sbits64) rem0 < 0 ) {
   3043        --zSig0;
   3044        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
   3045    }
   3046    zSig1 = estimateDiv128To64( rem1, 0, bSig );
   3047    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
   3048        mul64To128( bSig, zSig1, &term1, &term2 );
   3049        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3050        while ( (sbits64) rem1 < 0 ) {
   3051            --zSig1;
   3052            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
   3053        }
   3054        zSig1 |= ( ( rem1 | rem2 ) != 0 );
   3055    }
   3056    return
   3057        roundAndPackFloatx80(
   3058            roundData, zSign, zExp, zSig0, zSig1 );
   3059
   3060}
   3061
   3062/*
   3063-------------------------------------------------------------------------------
   3064Returns the remainder of the extended double-precision floating-point value
   3065`a' with respect to the corresponding value `b'.  The operation is performed
   3066according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   3067-------------------------------------------------------------------------------
   3068*/
   3069floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
   3070{
   3071    flag aSign, bSign, zSign;
   3072    int32 aExp, bExp, expDiff;
   3073    bits64 aSig0, aSig1, bSig;
   3074    bits64 q, term0, term1, alternateASig0, alternateASig1;
   3075    floatx80 z;
   3076
   3077    aSig0 = extractFloatx80Frac( a );
   3078    aExp = extractFloatx80Exp( a );
   3079    aSign = extractFloatx80Sign( a );
   3080    bSig = extractFloatx80Frac( b );
   3081    bExp = extractFloatx80Exp( b );
   3082    bSign = extractFloatx80Sign( b );
   3083    if ( aExp == 0x7FFF ) {
   3084        if (    (bits64) ( aSig0<<1 )
   3085             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
   3086            return propagateFloatx80NaN( a, b );
   3087        }
   3088        goto invalid;
   3089    }
   3090    if ( bExp == 0x7FFF ) {
   3091        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
   3092        return a;
   3093    }
   3094    if ( bExp == 0 ) {
   3095        if ( bSig == 0 ) {
   3096 invalid:
   3097            roundData->exception |= float_flag_invalid;
   3098            z.low = floatx80_default_nan_low;
   3099            z.high = floatx80_default_nan_high;
   3100            z.__padding = 0;
   3101            return z;
   3102        }
   3103        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
   3104    }
   3105    if ( aExp == 0 ) {
   3106        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
   3107        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3108    }
   3109    bSig |= LIT64( 0x8000000000000000 );
   3110    zSign = aSign;
   3111    expDiff = aExp - bExp;
   3112    aSig1 = 0;
   3113    if ( expDiff < 0 ) {
   3114        if ( expDiff < -1 ) return a;
   3115        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
   3116        expDiff = 0;
   3117    }
   3118    q = ( bSig <= aSig0 );
   3119    if ( q ) aSig0 -= bSig;
   3120    expDiff -= 64;
   3121    while ( 0 < expDiff ) {
   3122        q = estimateDiv128To64( aSig0, aSig1, bSig );
   3123        q = ( 2 < q ) ? q - 2 : 0;
   3124        mul64To128( bSig, q, &term0, &term1 );
   3125        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3126        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
   3127        expDiff -= 62;
   3128    }
   3129    expDiff += 64;
   3130    if ( 0 < expDiff ) {
   3131        q = estimateDiv128To64( aSig0, aSig1, bSig );
   3132        q = ( 2 < q ) ? q - 2 : 0;
   3133        q >>= 64 - expDiff;
   3134        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
   3135        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3136        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
   3137        while ( le128( term0, term1, aSig0, aSig1 ) ) {
   3138            ++q;
   3139            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
   3140        }
   3141    }
   3142    else {
   3143        term1 = 0;
   3144        term0 = bSig;
   3145    }
   3146    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
   3147    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3148         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
   3149              && ( q & 1 ) )
   3150       ) {
   3151        aSig0 = alternateASig0;
   3152        aSig1 = alternateASig1;
   3153        zSign = ! zSign;
   3154    }
   3155
   3156    return
   3157        normalizeRoundAndPackFloatx80(
   3158            roundData, zSign, bExp + expDiff, aSig0, aSig1 );
   3159
   3160}
   3161
   3162/*
   3163-------------------------------------------------------------------------------
   3164Returns the square root of the extended double-precision floating-point
   3165value `a'.  The operation is performed according to the IEC/IEEE Standard
   3166for Binary Floating-point Arithmetic.
   3167-------------------------------------------------------------------------------
   3168*/
   3169floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
   3170{
   3171    flag aSign;
   3172    int32 aExp, zExp;
   3173    bits64 aSig0, aSig1, zSig0, zSig1;
   3174    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
   3175    bits64 shiftedRem0, shiftedRem1;
   3176    floatx80 z;
   3177
   3178    aSig0 = extractFloatx80Frac( a );
   3179    aExp = extractFloatx80Exp( a );
   3180    aSign = extractFloatx80Sign( a );
   3181    if ( aExp == 0x7FFF ) {
   3182        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
   3183        if ( ! aSign ) return a;
   3184        goto invalid;
   3185    }
   3186    if ( aSign ) {
   3187        if ( ( aExp | aSig0 ) == 0 ) return a;
   3188 invalid:
   3189        roundData->exception |= float_flag_invalid;
   3190        z.low = floatx80_default_nan_low;
   3191        z.high = floatx80_default_nan_high;
   3192        z.__padding = 0;
   3193        return z;
   3194    }
   3195    if ( aExp == 0 ) {
   3196        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
   3197        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
   3198    }
   3199    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
   3200    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
   3201    zSig0 <<= 31;
   3202    aSig1 = 0;
   3203    shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
   3204    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
   3205    if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
   3206    shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
   3207    mul64To128( zSig0, zSig0, &term0, &term1 );
   3208    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
   3209    while ( (sbits64) rem0 < 0 ) {
   3210        --zSig0;
   3211        shortShift128Left( 0, zSig0, 1, &term0, &term1 );
   3212        term1 |= 1;
   3213        add128( rem0, rem1, term0, term1, &rem0, &rem1 );
   3214    }
   3215    shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
   3216    zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
   3217    if ( (bits64) ( zSig1<<1 ) <= 10 ) {
   3218        if ( zSig1 == 0 ) zSig1 = 1;
   3219        mul64To128( zSig0, zSig1, &term1, &term2 );
   3220        shortShift128Left( term1, term2, 1, &term1, &term2 );
   3221        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
   3222        mul64To128( zSig1, zSig1, &term2, &term3 );
   3223        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
   3224        while ( (sbits64) rem1 < 0 ) {
   3225            --zSig1;
   3226            shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
   3227            term3 |= 1;
   3228            add192(
   3229                rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
   3230        }
   3231        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
   3232    }
   3233    return
   3234        roundAndPackFloatx80(
   3235            roundData, 0, zExp, zSig0, zSig1 );
   3236
   3237}
   3238
   3239/*
   3240-------------------------------------------------------------------------------
   3241Returns 1 if the extended double-precision floating-point value `a' is
   3242equal to the corresponding value `b', and 0 otherwise.  The comparison is
   3243performed according to the IEC/IEEE Standard for Binary Floating-point
   3244Arithmetic.
   3245-------------------------------------------------------------------------------
   3246*/
   3247flag floatx80_eq( floatx80 a, floatx80 b )
   3248{
   3249
   3250    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   3251              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   3252         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   3253              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   3254       ) {
   3255        if (    floatx80_is_signaling_nan( a )
   3256             || floatx80_is_signaling_nan( b ) ) {
   3257            float_raise( float_flag_invalid );
   3258        }
   3259        return 0;
   3260    }
   3261    return
   3262           ( a.low == b.low )
   3263        && (    ( a.high == b.high )
   3264             || (    ( a.low == 0 )
   3265                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   3266           );
   3267
   3268}
   3269
   3270/*
   3271-------------------------------------------------------------------------------
   3272Returns 1 if the extended double-precision floating-point value `a' is
   3273less than or equal to the corresponding value `b', and 0 otherwise.  The
   3274comparison is performed according to the IEC/IEEE Standard for Binary
   3275Floating-point Arithmetic.
   3276-------------------------------------------------------------------------------
   3277*/
   3278flag floatx80_le( floatx80 a, floatx80 b )
   3279{
   3280    flag aSign, bSign;
   3281
   3282    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   3283              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   3284         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   3285              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   3286       ) {
   3287        float_raise( float_flag_invalid );
   3288        return 0;
   3289    }
   3290    aSign = extractFloatx80Sign( a );
   3291    bSign = extractFloatx80Sign( b );
   3292    if ( aSign != bSign ) {
   3293        return
   3294               aSign
   3295            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   3296                 == 0 );
   3297    }
   3298    return
   3299          aSign ? le128( b.high, b.low, a.high, a.low )
   3300        : le128( a.high, a.low, b.high, b.low );
   3301
   3302}
   3303
   3304/*
   3305-------------------------------------------------------------------------------
   3306Returns 1 if the extended double-precision floating-point value `a' is
   3307less than the corresponding value `b', and 0 otherwise.  The comparison
   3308is performed according to the IEC/IEEE Standard for Binary Floating-point
   3309Arithmetic.
   3310-------------------------------------------------------------------------------
   3311*/
   3312flag floatx80_lt( floatx80 a, floatx80 b )
   3313{
   3314    flag aSign, bSign;
   3315
   3316    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   3317              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   3318         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   3319              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   3320       ) {
   3321        float_raise( float_flag_invalid );
   3322        return 0;
   3323    }
   3324    aSign = extractFloatx80Sign( a );
   3325    bSign = extractFloatx80Sign( b );
   3326    if ( aSign != bSign ) {
   3327        return
   3328               aSign
   3329            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   3330                 != 0 );
   3331    }
   3332    return
   3333          aSign ? lt128( b.high, b.low, a.high, a.low )
   3334        : lt128( a.high, a.low, b.high, b.low );
   3335
   3336}
   3337
   3338/*
   3339-------------------------------------------------------------------------------
   3340Returns 1 if the extended double-precision floating-point value `a' is equal
   3341to the corresponding value `b', and 0 otherwise.  The invalid exception is
   3342raised if either operand is a NaN.  Otherwise, the comparison is performed
   3343according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   3344-------------------------------------------------------------------------------
   3345*/
   3346flag floatx80_eq_signaling( floatx80 a, floatx80 b )
   3347{
   3348
   3349    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   3350              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   3351         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   3352              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   3353       ) {
   3354        float_raise( float_flag_invalid );
   3355        return 0;
   3356    }
   3357    return
   3358           ( a.low == b.low )
   3359        && (    ( a.high == b.high )
   3360             || (    ( a.low == 0 )
   3361                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
   3362           );
   3363
   3364}
   3365
   3366/*
   3367-------------------------------------------------------------------------------
   3368Returns 1 if the extended double-precision floating-point value `a' is less
   3369than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
   3370do not cause an exception.  Otherwise, the comparison is performed according
   3371to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
   3372-------------------------------------------------------------------------------
   3373*/
   3374flag floatx80_le_quiet( floatx80 a, floatx80 b )
   3375{
   3376    flag aSign, bSign;
   3377
   3378    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   3379              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   3380         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   3381              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   3382       ) {
   3383        /* Do nothing, even if NaN as we're quiet */
   3384        return 0;
   3385    }
   3386    aSign = extractFloatx80Sign( a );
   3387    bSign = extractFloatx80Sign( b );
   3388    if ( aSign != bSign ) {
   3389        return
   3390               aSign
   3391            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   3392                 == 0 );
   3393    }
   3394    return
   3395          aSign ? le128( b.high, b.low, a.high, a.low )
   3396        : le128( a.high, a.low, b.high, b.low );
   3397
   3398}
   3399
   3400/*
   3401-------------------------------------------------------------------------------
   3402Returns 1 if the extended double-precision floating-point value `a' is less
   3403than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
   3404an exception.  Otherwise, the comparison is performed according to the
   3405IEC/IEEE Standard for Binary Floating-point Arithmetic.
   3406-------------------------------------------------------------------------------
   3407*/
   3408flag floatx80_lt_quiet( floatx80 a, floatx80 b )
   3409{
   3410    flag aSign, bSign;
   3411
   3412    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
   3413              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
   3414         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
   3415              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
   3416       ) {
   3417        /* Do nothing, even if NaN as we're quiet */
   3418        return 0;
   3419    }
   3420    aSign = extractFloatx80Sign( a );
   3421    bSign = extractFloatx80Sign( b );
   3422    if ( aSign != bSign ) {
   3423        return
   3424               aSign
   3425            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
   3426                 != 0 );
   3427    }
   3428    return
   3429          aSign ? lt128( b.high, b.low, a.high, a.low )
   3430        : lt128( a.high, a.low, b.high, b.low );
   3431
   3432}
   3433
   3434#endif
   3435