cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

softfloat-macros.h (28306B)


      1/*
      2 * QEMU float support macros
      3 *
      4 * The code in this source file is derived from release 2a of the SoftFloat
      5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
      6 * some later contributions) are provided under that license, as detailed below.
      7 * It has subsequently been modified by contributors to the QEMU Project,
      8 * so some portions are provided under:
      9 *  the SoftFloat-2a license
     10 *  the BSD license
     11 *  GPL-v2-or-later
     12 *
     13 * Any future contributions to this file after December 1st 2014 will be
     14 * taken to be licensed under the Softfloat-2a license unless specifically
     15 * indicated otherwise.
     16 */
     17
     18/*
     19===============================================================================
     20This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
     21Arithmetic Package, Release 2a.
     22
     23Written by John R. Hauser.  This work was made possible in part by the
     24International Computer Science Institute, located at Suite 600, 1947 Center
     25Street, Berkeley, California 94704.  Funding was partially provided by the
     26National Science Foundation under grant MIP-9311980.  The original version
     27of this code was written as part of a project to build a fixed-point vector
     28processor in collaboration with the University of California at Berkeley,
     29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
     30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
     31arithmetic/SoftFloat.html'.
     32
     33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
     34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
     35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
     36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
     37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
     38
     39Derivative works are acceptable, even for commercial purposes, so long as
     40(1) they include prominent notice that the work is derivative, and (2) they
     41include prominent notice akin to these four paragraphs for those parts of
     42this code that are retained.
     43
     44===============================================================================
     45*/
     46
     47/* BSD licensing:
     48 * Copyright (c) 2006, Fabrice Bellard
     49 * All rights reserved.
     50 *
     51 * Redistribution and use in source and binary forms, with or without
     52 * modification, are permitted provided that the following conditions are met:
     53 *
     54 * 1. Redistributions of source code must retain the above copyright notice,
     55 * this list of conditions and the following disclaimer.
     56 *
     57 * 2. Redistributions in binary form must reproduce the above copyright notice,
     58 * this list of conditions and the following disclaimer in the documentation
     59 * and/or other materials provided with the distribution.
     60 *
     61 * 3. Neither the name of the copyright holder nor the names of its contributors
     62 * may be used to endorse or promote products derived from this software without
     63 * specific prior written permission.
     64 *
     65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
     69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
     75 * THE POSSIBILITY OF SUCH DAMAGE.
     76 */
     77
     78/* Portions of this work are licensed under the terms of the GNU GPL,
     79 * version 2 or later. See the COPYING file in the top-level directory.
     80 */
     81
     82#ifndef FPU_SOFTFLOAT_MACROS_H
     83#define FPU_SOFTFLOAT_MACROS_H
     84
     85#include "fpu/softfloat-types.h"
     86#include "qemu/host-utils.h"
     87
     88/**
     89 * shl_double: double-word merging left shift
     90 * @l: left or most-significant word
     91 * @r: right or least-significant word
     92 * @c: shift count
     93 *
     94 * Shift @l left by @c bits, shifting in bits from @r.
     95 */
static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
{
#if defined(__x86_64__)
    /* SHLD shifts %0 (l) left by %2 (c) bits, filling the vacated low bits
     * from the top of %1 (r).  "%b2" selects the byte form of the count
     * operand; the "ci" constraint allows either %cl or an immediate. */
    asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
    return l;
#else
    /* c == 0 must be special-cased: "r >> (64 - 0)" would be a shift by the
     * full width of the type, which is undefined behavior in C. */
    return c ? (l << c) | (r >> (64 - c)) : l;
#endif
}
    105
    106/**
    107 * shr_double: double-word merging right shift
    108 * @l: left or most-significant word
    109 * @r: right or least-significant word
    110 * @c: shift count
    111 *
    112 * Shift @r right by @c bits, shifting in bits from @l.
    113 */
static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
{
#if defined(__x86_64__)
    /* SHRD shifts %0 (r) right by %2 (c) bits, filling the vacated high
     * bits from the bottom of %1 (l).  "%b2" selects the byte form of the
     * count; the "ci" constraint allows either %cl or an immediate. */
    asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
    return r;
#else
    /* c == 0 must be special-cased: "l << (64 - 0)" would be a shift by the
     * full width of the type, which is undefined behavior in C. */
    return c ? (r >> c) | (l << (64 - c)) : r;
#endif
}
    123
    124/*----------------------------------------------------------------------------
    125| Shifts `a' right by the number of bits given in `count'.  If any nonzero
    126| bits are shifted off, they are ``jammed'' into the least significant bit of
    127| the result by setting the least significant bit to 1.  The value of `count'
    128| can be arbitrarily large; in particular, if `count' is greater than 32, the
    129| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
    130| The result is stored in the location pointed to by `zPtr'.
    131*----------------------------------------------------------------------------*/
    132
    133static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
    134{
    135    uint32_t z;
    136
    137    if ( count == 0 ) {
    138        z = a;
    139    }
    140    else if ( count < 32 ) {
    141        z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
    142    }
    143    else {
    144        z = ( a != 0 );
    145    }
    146    *zPtr = z;
    147
    148}
    149
    150/*----------------------------------------------------------------------------
    151| Shifts `a' right by the number of bits given in `count'.  If any nonzero
    152| bits are shifted off, they are ``jammed'' into the least significant bit of
    153| the result by setting the least significant bit to 1.  The value of `count'
    154| can be arbitrarily large; in particular, if `count' is greater than 64, the
    155| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
    156| The result is stored in the location pointed to by `zPtr'.
    157*----------------------------------------------------------------------------*/
    158
    159static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
    160{
    161    uint64_t z;
    162
    163    if ( count == 0 ) {
    164        z = a;
    165    }
    166    else if ( count < 64 ) {
    167        z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
    168    }
    169    else {
    170        z = ( a != 0 );
    171    }
    172    *zPtr = z;
    173
    174}
    175
    176/*----------------------------------------------------------------------------
    177| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
    178| _plus_ the number of bits given in `count'.  The shifted result is at most
    179| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
    180| bits shifted off form a second 64-bit result as follows:  The _last_ bit
    181| shifted off is the most-significant bit of the extra result, and the other
    182| 63 bits of the extra result are all zero if and only if _all_but_the_last_
    183| bits shifted off were all zero.  This extra result is stored in the location
    184| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
    185|     (This routine makes more sense if `a0' and `a1' are considered to form a
    186| fixed-point value with binary point between `a0' and `a1'.  This fixed-point
    187| value is shifted right by the number of bits given in `count', and the
    188| integer part of the result is returned at the location pointed to by
    189| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
    190| described above, and is returned at the location pointed to by `z1Ptr'.)
    191*----------------------------------------------------------------------------*/
    192
    193static inline void
    194 shift64ExtraRightJamming(
    195     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
    196{
    197    uint64_t z0, z1;
    198    int8_t negCount = ( - count ) & 63;
    199
    200    if ( count == 0 ) {
    201        z1 = a1;
    202        z0 = a0;
    203    }
    204    else if ( count < 64 ) {
    205        z1 = ( a0<<negCount ) | ( a1 != 0 );
    206        z0 = a0>>count;
    207    }
    208    else {
    209        if ( count == 64 ) {
    210            z1 = a0 | ( a1 != 0 );
    211        }
    212        else {
    213            z1 = ( ( a0 | a1 ) != 0 );
    214        }
    215        z0 = 0;
    216    }
    217    *z1Ptr = z1;
    218    *z0Ptr = z0;
    219
    220}
    221
    222/*----------------------------------------------------------------------------
    223| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
    224| number of bits given in `count'.  Any bits shifted off are lost.  The value
    225| of `count' can be arbitrarily large; in particular, if `count' is greater
    226| than 128, the result will be 0.  The result is broken into two 64-bit pieces
    227| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
    228*----------------------------------------------------------------------------*/
    229
    230static inline void
    231 shift128Right(
    232     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
    233{
    234    uint64_t z0, z1;
    235    int8_t negCount = ( - count ) & 63;
    236
    237    if ( count == 0 ) {
    238        z1 = a1;
    239        z0 = a0;
    240    }
    241    else if ( count < 64 ) {
    242        z1 = ( a0<<negCount ) | ( a1>>count );
    243        z0 = a0>>count;
    244    }
    245    else {
    246        z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
    247        z0 = 0;
    248    }
    249    *z1Ptr = z1;
    250    *z0Ptr = z0;
    251
    252}
    253
    254/*----------------------------------------------------------------------------
    255| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
    256| number of bits given in `count'.  If any nonzero bits are shifted off, they
    257| are ``jammed'' into the least significant bit of the result by setting the
    258| least significant bit to 1.  The value of `count' can be arbitrarily large;
    259| in particular, if `count' is greater than 128, the result will be either
    260| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
    261| nonzero.  The result is broken into two 64-bit pieces which are stored at
    262| the locations pointed to by `z0Ptr' and `z1Ptr'.
    263*----------------------------------------------------------------------------*/
    264
    265static inline void
    266 shift128RightJamming(
    267     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
    268{
    269    uint64_t z0, z1;
    270    int8_t negCount = ( - count ) & 63;
    271
    272    if ( count == 0 ) {
    273        z1 = a1;
    274        z0 = a0;
    275    }
    276    else if ( count < 64 ) {
    277        z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
    278        z0 = a0>>count;
    279    }
    280    else {
    281        if ( count == 64 ) {
    282            z1 = a0 | ( a1 != 0 );
    283        }
    284        else if ( count < 128 ) {
    285            z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
    286        }
    287        else {
    288            z1 = ( ( a0 | a1 ) != 0 );
    289        }
    290        z0 = 0;
    291    }
    292    *z1Ptr = z1;
    293    *z0Ptr = z0;
    294
    295}
    296
    297/*----------------------------------------------------------------------------
    298| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
    299| by 64 _plus_ the number of bits given in `count'.  The shifted result is
    300| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
    301| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
    302| off form a third 64-bit result as follows:  The _last_ bit shifted off is
    303| the most-significant bit of the extra result, and the other 63 bits of the
    304| extra result are all zero if and only if _all_but_the_last_ bits shifted off
    305| were all zero.  This extra result is stored in the location pointed to by
    306| `z2Ptr'.  The value of `count' can be arbitrarily large.
    307|     (This routine makes more sense if `a0', `a1', and `a2' are considered
    308| to form a fixed-point value with binary point between `a1' and `a2'.  This
    309| fixed-point value is shifted right by the number of bits given in `count',
    310| and the integer part of the result is returned at the locations pointed to
    311| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
    312| corrupted as described above, and is returned at the location pointed to by
    313| `z2Ptr'.)
    314*----------------------------------------------------------------------------*/
    315
static inline void
 shift128ExtraRightJamming(
     uint64_t a0,
     uint64_t a1,
     uint64_t a2,
     int count,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr
 )
{
    uint64_t z0, z1, z2;
    int8_t negCount = ( - count ) & 63;

    if ( count == 0 ) {
        z2 = a2;
        z1 = a1;
        z0 = a0;
    }
    else {
        if ( count < 64 ) {
            /* Bits shifted out of a1 become the top of the extra word. */
            z2 = a1<<negCount;
            z1 = ( a0<<negCount ) | ( a1>>count );
            z0 = a0>>count;
        }
        else {
            if ( count == 64 ) {
                /* Whole-word shift: a1 becomes the extra word verbatim. */
                z2 = a1;
                z1 = a0;
            }
            else {
                /* a1 is shifted out entirely; fold its nonzero-ness into
                 * a2 so the sticky-bit OR below accounts for it. */
                a2 |= a1;
                if ( count < 128 ) {
                    z2 = a0<<negCount;
                    z1 = a0>>( count & 63 );
                }
                else {
                    /* count == 128: a0 becomes the extra word verbatim;
                     * beyond 128 only its nonzero-ness survives. */
                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
                    z1 = 0;
                }
            }
            z0 = 0;
        }
        /* Jam: any nonzero bits shifted below the extra word set its LSB. */
        z2 |= ( a2 != 0 );
    }
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;

}
    366
    367/*----------------------------------------------------------------------------
    368| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
    369| number of bits given in `count'.  Any bits shifted off are lost.  The value
    370| of `count' must be less than 64.  The result is broken into two 64-bit
    371| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
    372*----------------------------------------------------------------------------*/
    373
    374static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
    375                                     uint64_t *z0Ptr, uint64_t *z1Ptr)
    376{
    377    *z1Ptr = a1 << count;
    378    *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
    379}
    380
    381/*----------------------------------------------------------------------------
    382| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
    383| number of bits given in `count'.  Any bits shifted off are lost.  The value
    384| of `count' may be greater than 64.  The result is broken into two 64-bit
    385| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
    386*----------------------------------------------------------------------------*/
    387
    388static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
    389                                uint64_t *z0Ptr, uint64_t *z1Ptr)
    390{
    391    if (count < 64) {
    392        *z1Ptr = a1 << count;
    393        *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
    394    } else {
    395        *z1Ptr = 0;
    396        *z0Ptr = a1 << (count - 64);
    397    }
    398}
    399
    400/*----------------------------------------------------------------------------
    401| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
    402| by the number of bits given in `count'.  Any bits shifted off are lost.
    403| The value of `count' must be less than 64.  The result is broken into three
    404| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
    405| `z1Ptr', and `z2Ptr'.
    406*----------------------------------------------------------------------------*/
    407
    408static inline void
    409 shortShift192Left(
    410     uint64_t a0,
    411     uint64_t a1,
    412     uint64_t a2,
    413     int count,
    414     uint64_t *z0Ptr,
    415     uint64_t *z1Ptr,
    416     uint64_t *z2Ptr
    417 )
    418{
    419    uint64_t z0, z1, z2;
    420    int8_t negCount;
    421
    422    z2 = a2<<count;
    423    z1 = a1<<count;
    424    z0 = a0<<count;
    425    if ( 0 < count ) {
    426        negCount = ( ( - count ) & 63 );
    427        z1 |= a2>>negCount;
    428        z0 |= a1>>negCount;
    429    }
    430    *z2Ptr = z2;
    431    *z1Ptr = z1;
    432    *z0Ptr = z0;
    433
    434}
    435
    436/*----------------------------------------------------------------------------
    437| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
    438| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
    439| any carry out is lost.  The result is broken into two 64-bit pieces which
    440| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
    441*----------------------------------------------------------------------------*/
    442
    443static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
    444                          uint64_t *z0Ptr, uint64_t *z1Ptr)
    445{
    446    bool c = 0;
    447    *z1Ptr = uadd64_carry(a1, b1, &c);
    448    *z0Ptr = uadd64_carry(a0, b0, &c);
    449}
    450
    451/*----------------------------------------------------------------------------
    452| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
    453| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
    454| modulo 2^192, so any carry out is lost.  The result is broken into three
    455| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
    456| `z1Ptr', and `z2Ptr'.
    457*----------------------------------------------------------------------------*/
    458
    459static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
    460                          uint64_t b0, uint64_t b1, uint64_t b2,
    461                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
    462{
    463    bool c = 0;
    464    *z2Ptr = uadd64_carry(a2, b2, &c);
    465    *z1Ptr = uadd64_carry(a1, b1, &c);
    466    *z0Ptr = uadd64_carry(a0, b0, &c);
    467}
    468
    469/*----------------------------------------------------------------------------
    470| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
    471| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
    472| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
    473| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
    474| `z1Ptr'.
    475*----------------------------------------------------------------------------*/
    476
    477static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
    478                          uint64_t *z0Ptr, uint64_t *z1Ptr)
    479{
    480    bool c = 0;
    481    *z1Ptr = usub64_borrow(a1, b1, &c);
    482    *z0Ptr = usub64_borrow(a0, b0, &c);
    483}
    484
    485/*----------------------------------------------------------------------------
    486| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
    487| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
    488| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
    489| result is broken into three 64-bit pieces which are stored at the locations
    490| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
    491*----------------------------------------------------------------------------*/
    492
    493static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
    494                          uint64_t b0, uint64_t b1, uint64_t b2,
    495                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
    496{
    497    bool c = 0;
    498    *z2Ptr = usub64_borrow(a2, b2, &c);
    499    *z1Ptr = usub64_borrow(a1, b1, &c);
    500    *z0Ptr = usub64_borrow(a0, b0, &c);
    501}
    502
    503/*----------------------------------------------------------------------------
    504| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
    505| into two 64-bit pieces which are stored at the locations pointed to by
    506| `z0Ptr' and `z1Ptr'.
    507*----------------------------------------------------------------------------*/
    508
static inline void
mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    /* mulu64 (from qemu/host-utils.h) computes the full 128-bit unsigned
     * product of a and b: low half into *z1Ptr, high half into *z0Ptr. */
    mulu64(z1Ptr, z0Ptr, a, b);
}
    514
    515/*----------------------------------------------------------------------------
    516| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
    517| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
    518| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
    519| `z2Ptr'.
    520*----------------------------------------------------------------------------*/
    521
    522static inline void
    523mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
    524                uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
    525{
    526    uint64_t z0, z1, m1;
    527
    528    mul64To128(a1, b, &m1, z2Ptr);
    529    mul64To128(a0, b, &z0, &z1);
    530    add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
    531}
    532
    533/*----------------------------------------------------------------------------
    534| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
    535| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
    536| product.  The product is broken into four 64-bit pieces which are stored at
    537| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
    538*----------------------------------------------------------------------------*/
    539
    540static inline void mul128To256(uint64_t a0, uint64_t a1,
    541                               uint64_t b0, uint64_t b1,
    542                               uint64_t *z0Ptr, uint64_t *z1Ptr,
    543                               uint64_t *z2Ptr, uint64_t *z3Ptr)
    544{
    545    uint64_t z0, z1, z2;
    546    uint64_t m0, m1, m2, n1, n2;
    547
    548    mul64To128(a1, b0, &m1, &m2);
    549    mul64To128(a0, b1, &n1, &n2);
    550    mul64To128(a1, b1, &z2, z3Ptr);
    551    mul64To128(a0, b0, &z0, &z1);
    552
    553    add192( 0, m1, m2,  0, n1, n2, &m0, &m1, &m2);
    554    add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
    555}
    556
    557/*----------------------------------------------------------------------------
    558| Returns an approximation to the 64-bit integer quotient obtained by dividing
    559| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
    560| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
    561| toward zero, the approximation returned lies between q and q + 2 inclusive.
    562| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
    563| unsigned integer is returned.
    564*----------------------------------------------------------------------------*/
    565
static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
{
    uint64_t b0, b1;
    uint64_t rem0, rem1, term0, term1;
    uint64_t z;

    /* Quotient would not fit in 64 bits: saturate to all-ones. */
    if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
    b0 = b>>32;
    /* Estimate the high 32 quotient bits using only the divisor's top
     * 32 bits, saturating if even that estimate would overflow. */
    z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
    mul64To128( b, z, &term0, &term1 );
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
    /* The estimate may be too large; step it down while the remainder is
     * negative (rem0's sign bit acts as the borrow indicator). */
    while ( ( (int64_t) rem0 ) < 0 ) {
        z -= UINT64_C(0x100000000);
        b1 = b<<32;
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
    }
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
    /* Low 32 quotient bits from the updated remainder, again saturated. */
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
    return z;

}
    587
    588/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
    589 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
    590 *
    591 * Licensed under the GPLv2/LGPLv3
    592 */
static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
                                  uint64_t n0, uint64_t d)
{
#if defined(__x86_64__)
    uint64_t q;
    /* DIVQ divides the 128-bit value rdx:rax (n1:n0) by d, leaving the
     * quotient in rax and the remainder in rdx. */
    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
    return q;
#elif defined(__s390x__) && !defined(__clang__)
    /* Need to use a TImode type to get an even register pair for DLGR.  */
    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
    *r = n >> 64;
    return n;
#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
    /* From Power ISA 2.06, programming note for divdeu.  */
    uint64_t q1, q2, Q, r1, r2, R;
    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
        : "=&r"(q1), "=r"(q2)
        : "r"(n1), "r"(n0), "r"(d));
    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
    r2 = n0 - (q2 * d);
    Q = q1 + q2;
    R = r1 + r2;
    if (R >= d || R < r2) { /* overflow implies R > d */
        Q += 1;
        R -= d;
    }
    *r = R;
    return Q;
#else
    /* Generic fallback: schoolbook division producing two 32-bit quotient
     * digits.  NOTE(review): like GMP's __udiv_qrnnd this appears to
     * assume the divisor is normalized (high bit of d set) and n1 < d so
     * the quotient fits in 64 bits -- confirm at call sites. */
    uint64_t d0, d1, q0, q1, r1, r0, m;

    d0 = (uint32_t)d;
    d1 = d >> 32;

    /* First (high) quotient digit, estimated from the divisor's top half. */
    r1 = n1 % d1;
    q1 = n1 / d1;
    m = q1 * d0;
    r1 = (r1 << 32) | (n0 >> 32);
    if (r1 < m) {
        /* Estimate was too large; correct downward (at most twice). */
        q1 -= 1;
        r1 += d;
        if (r1 >= d) {
            if (r1 < m) {
                q1 -= 1;
                r1 += d;
            }
        }
    }
    r1 -= m;

    /* Second (low) quotient digit from the updated remainder. */
    r0 = r1 % d1;
    q0 = r1 / d1;
    m = q0 * d0;
    r0 = (r0 << 32) | (uint32_t)n0;
    if (r0 < m) {
        q0 -= 1;
        r0 += d;
        if (r0 >= d) {
            if (r0 < m) {
                q0 -= 1;
                r0 += d;
            }
        }
    }
    r0 -= m;

    *r = r0;
    return (q1 << 32) | q0;
#endif
}
    664
    665/*----------------------------------------------------------------------------
    666| Returns an approximation to the square root of the 32-bit significand given
    667| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
    668| `aExp' (the least significant bit) is 1, the integer returned approximates
    669| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
    670| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
    671| case, the approximation returned lies strictly within +/-2 of the exact
    672| value.
    673*----------------------------------------------------------------------------*/
    674
static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
{
    /* Table-driven seed adjustments, indexed by the top fraction bits
     * (bits 30..27 of a); one table per exponent parity. */
    static const uint16_t sqrtOddAdjustments[] = {
        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
    };
    static const uint16_t sqrtEvenAdjustments[] = {
        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
    };
    int8_t index;
    uint32_t z;

    index = ( a>>27 ) & 15;
    if ( aExp & 1 ) {
        /* Odd exponent: build a ~16-bit seed, refine once, and halve the
         * radicand so the final step below uses a consistent scaling. */
        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
        z = ( ( a / z )<<14 ) + ( z<<15 );
        a >>= 1;
    }
    else {
        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
        z = a / z + z;
        /* Clamp so the <<15 cannot overflow 32 bits. */
        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
        if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
    }
    /* Final refinement: fixed-point average of z and a/z. */
    return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );

}
    703
    704/*----------------------------------------------------------------------------
    705| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
    706| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
    707| Otherwise, returns 0.
    708*----------------------------------------------------------------------------*/
    709
    710static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
    711{
    712    return a0 == b0 && a1 == b1;
    713}
    714
    715/*----------------------------------------------------------------------------
    716| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
    717| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
    718| Otherwise, returns 0.
    719*----------------------------------------------------------------------------*/
    720
    721static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
    722{
    723    return a0 < b0 || (a0 == b0 && a1 <= b1);
    724}
    725
    726/*----------------------------------------------------------------------------
    727| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
    728| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
    729| returns 0.
    730*----------------------------------------------------------------------------*/
    731
    732static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
    733{
    734    return a0 < b0 || (a0 == b0 && a1 < b1);
    735}
    736
    737/*----------------------------------------------------------------------------
    738| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
    739| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
    740| Otherwise, returns 0.
    741*----------------------------------------------------------------------------*/
    742
    743static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
    744{
    745    return a0 != b0 || a1 != b1;
    746}
    747
    748/*
    749 * Similarly, comparisons of 192-bit values.
    750 */
    751
    752static inline bool eq192(uint64_t a0, uint64_t a1, uint64_t a2,
    753                         uint64_t b0, uint64_t b1, uint64_t b2)
    754{
    755    return ((a0 ^ b0) | (a1 ^ b1) | (a2 ^ b2)) == 0;
    756}
    757
    758static inline bool le192(uint64_t a0, uint64_t a1, uint64_t a2,
    759                         uint64_t b0, uint64_t b1, uint64_t b2)
    760{
    761    if (a0 != b0) {
    762        return a0 < b0;
    763    }
    764    if (a1 != b1) {
    765        return a1 < b1;
    766    }
    767    return a2 <= b2;
    768}
    769
    770static inline bool lt192(uint64_t a0, uint64_t a1, uint64_t a2,
    771                         uint64_t b0, uint64_t b1, uint64_t b2)
    772{
    773    if (a0 != b0) {
    774        return a0 < b0;
    775    }
    776    if (a1 != b1) {
    777        return a1 < b1;
    778    }
    779    return a2 < b2;
    780}
    781
    782#endif