cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

ops_sse.h (77409B)


      1/*
      2 *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
      3 *
      4 *  Copyright (c) 2005 Fabrice Bellard
      5 *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski@intel.com>
      6 *
      7 * This library is free software; you can redistribute it and/or
      8 * modify it under the terms of the GNU Lesser General Public
      9 * License as published by the Free Software Foundation; either
     10 * version 2.1 of the License, or (at your option) any later version.
     11 *
     12 * This library is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 * Lesser General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU Lesser General Public
     18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
     19 */
     20
     21#include "crypto/aes.h"
     22
     23#if SHIFT == 0
     24#define Reg MMXReg
     25#define XMM_ONLY(...)
     26#define B(n) MMX_B(n)
     27#define W(n) MMX_W(n)
     28#define L(n) MMX_L(n)
     29#define Q(n) MMX_Q(n)
     30#define SUFFIX _mmx
     31#else
     32#define Reg ZMMReg
     33#define XMM_ONLY(...) __VA_ARGS__
     34#define B(n) ZMM_B(n)
     35#define W(n) ZMM_W(n)
     36#define L(n) ZMM_L(n)
     37#define Q(n) ZMM_Q(n)
     38#define SUFFIX _xmm
     39#endif
     40
     41void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
     42{
     43    int shift;
     44
     45    if (s->Q(0) > 15) {
     46        d->Q(0) = 0;
     47#if SHIFT == 1
     48        d->Q(1) = 0;
     49#endif
     50    } else {
     51        shift = s->B(0);
     52        d->W(0) >>= shift;
     53        d->W(1) >>= shift;
     54        d->W(2) >>= shift;
     55        d->W(3) >>= shift;
     56#if SHIFT == 1
     57        d->W(4) >>= shift;
     58        d->W(5) >>= shift;
     59        d->W(6) >>= shift;
     60        d->W(7) >>= shift;
     61#endif
     62    }
     63}
     64
     65void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
     66{
     67    int shift;
     68
     69    if (s->Q(0) > 15) {
     70        shift = 15;
     71    } else {
     72        shift = s->B(0);
     73    }
     74    d->W(0) = (int16_t)d->W(0) >> shift;
     75    d->W(1) = (int16_t)d->W(1) >> shift;
     76    d->W(2) = (int16_t)d->W(2) >> shift;
     77    d->W(3) = (int16_t)d->W(3) >> shift;
     78#if SHIFT == 1
     79    d->W(4) = (int16_t)d->W(4) >> shift;
     80    d->W(5) = (int16_t)d->W(5) >> shift;
     81    d->W(6) = (int16_t)d->W(6) >> shift;
     82    d->W(7) = (int16_t)d->W(7) >> shift;
     83#endif
     84}
     85
     86void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
     87{
     88    int shift;
     89
     90    if (s->Q(0) > 15) {
     91        d->Q(0) = 0;
     92#if SHIFT == 1
     93        d->Q(1) = 0;
     94#endif
     95    } else {
     96        shift = s->B(0);
     97        d->W(0) <<= shift;
     98        d->W(1) <<= shift;
     99        d->W(2) <<= shift;
    100        d->W(3) <<= shift;
    101#if SHIFT == 1
    102        d->W(4) <<= shift;
    103        d->W(5) <<= shift;
    104        d->W(6) <<= shift;
    105        d->W(7) <<= shift;
    106#endif
    107    }
    108}
    109
    110void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    111{
    112    int shift;
    113
    114    if (s->Q(0) > 31) {
    115        d->Q(0) = 0;
    116#if SHIFT == 1
    117        d->Q(1) = 0;
    118#endif
    119    } else {
    120        shift = s->B(0);
    121        d->L(0) >>= shift;
    122        d->L(1) >>= shift;
    123#if SHIFT == 1
    124        d->L(2) >>= shift;
    125        d->L(3) >>= shift;
    126#endif
    127    }
    128}
    129
    130void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    131{
    132    int shift;
    133
    134    if (s->Q(0) > 31) {
    135        shift = 31;
    136    } else {
    137        shift = s->B(0);
    138    }
    139    d->L(0) = (int32_t)d->L(0) >> shift;
    140    d->L(1) = (int32_t)d->L(1) >> shift;
    141#if SHIFT == 1
    142    d->L(2) = (int32_t)d->L(2) >> shift;
    143    d->L(3) = (int32_t)d->L(3) >> shift;
    144#endif
    145}
    146
    147void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    148{
    149    int shift;
    150
    151    if (s->Q(0) > 31) {
    152        d->Q(0) = 0;
    153#if SHIFT == 1
    154        d->Q(1) = 0;
    155#endif
    156    } else {
    157        shift = s->B(0);
    158        d->L(0) <<= shift;
    159        d->L(1) <<= shift;
    160#if SHIFT == 1
    161        d->L(2) <<= shift;
    162        d->L(3) <<= shift;
    163#endif
    164    }
    165}
    166
    167void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    168{
    169    int shift;
    170
    171    if (s->Q(0) > 63) {
    172        d->Q(0) = 0;
    173#if SHIFT == 1
    174        d->Q(1) = 0;
    175#endif
    176    } else {
    177        shift = s->B(0);
    178        d->Q(0) >>= shift;
    179#if SHIFT == 1
    180        d->Q(1) >>= shift;
    181#endif
    182    }
    183}
    184
    185void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    186{
    187    int shift;
    188
    189    if (s->Q(0) > 63) {
    190        d->Q(0) = 0;
    191#if SHIFT == 1
    192        d->Q(1) = 0;
    193#endif
    194    } else {
    195        shift = s->B(0);
    196        d->Q(0) <<= shift;
    197#if SHIFT == 1
    198        d->Q(1) <<= shift;
    199#endif
    200    }
    201}
    202
    203#if SHIFT == 1
    204void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    205{
    206    int shift, i;
    207
    208    shift = s->L(0);
    209    if (shift > 16) {
    210        shift = 16;
    211    }
    212    for (i = 0; i < 16 - shift; i++) {
    213        d->B(i) = d->B(i + shift);
    214    }
    215    for (i = 16 - shift; i < 16; i++) {
    216        d->B(i) = 0;
    217    }
    218}
    219
    220void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    221{
    222    int shift, i;
    223
    224    shift = s->L(0);
    225    if (shift > 16) {
    226        shift = 16;
    227    }
    228    for (i = 15; i >= shift; i--) {
    229        d->B(i) = d->B(i - shift);
    230    }
    231    for (i = 0; i < shift; i++) {
    232        d->B(i) = 0;
    233    }
    234}
    235#endif
    236
    237#define SSE_HELPER_B(name, F)                                   \
    238    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    239    {                                                           \
    240        d->B(0) = F(d->B(0), s->B(0));                          \
    241        d->B(1) = F(d->B(1), s->B(1));                          \
    242        d->B(2) = F(d->B(2), s->B(2));                          \
    243        d->B(3) = F(d->B(3), s->B(3));                          \
    244        d->B(4) = F(d->B(4), s->B(4));                          \
    245        d->B(5) = F(d->B(5), s->B(5));                          \
    246        d->B(6) = F(d->B(6), s->B(6));                          \
    247        d->B(7) = F(d->B(7), s->B(7));                          \
    248        XMM_ONLY(                                               \
    249                 d->B(8) = F(d->B(8), s->B(8));                 \
    250                 d->B(9) = F(d->B(9), s->B(9));                 \
    251                 d->B(10) = F(d->B(10), s->B(10));              \
    252                 d->B(11) = F(d->B(11), s->B(11));              \
    253                 d->B(12) = F(d->B(12), s->B(12));              \
    254                 d->B(13) = F(d->B(13), s->B(13));              \
    255                 d->B(14) = F(d->B(14), s->B(14));              \
    256                 d->B(15) = F(d->B(15), s->B(15));              \
    257                                                        )       \
    258            }
    259
    260#define SSE_HELPER_W(name, F)                                   \
    261    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    262    {                                                           \
    263        d->W(0) = F(d->W(0), s->W(0));                          \
    264        d->W(1) = F(d->W(1), s->W(1));                          \
    265        d->W(2) = F(d->W(2), s->W(2));                          \
    266        d->W(3) = F(d->W(3), s->W(3));                          \
    267        XMM_ONLY(                                               \
    268                 d->W(4) = F(d->W(4), s->W(4));                 \
    269                 d->W(5) = F(d->W(5), s->W(5));                 \
    270                 d->W(6) = F(d->W(6), s->W(6));                 \
    271                 d->W(7) = F(d->W(7), s->W(7));                 \
    272                                                        )       \
    273            }
    274
    275#define SSE_HELPER_L(name, F)                                   \
    276    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    277    {                                                           \
    278        d->L(0) = F(d->L(0), s->L(0));                          \
    279        d->L(1) = F(d->L(1), s->L(1));                          \
    280        XMM_ONLY(                                               \
    281                 d->L(2) = F(d->L(2), s->L(2));                 \
    282                 d->L(3) = F(d->L(3), s->L(3));                 \
    283                                                        )       \
    284            }
    285
    286#define SSE_HELPER_Q(name, F)                                   \
    287    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    288    {                                                           \
    289        d->Q(0) = F(d->Q(0), s->Q(0));                          \
    290        XMM_ONLY(                                               \
    291                 d->Q(1) = F(d->Q(1), s->Q(1));                 \
    292                                                        )       \
    293            }
    294
    295#if SHIFT == 0
    296static inline int satub(int x)
    297{
    298    if (x < 0) {
    299        return 0;
    300    } else if (x > 255) {
    301        return 255;
    302    } else {
    303        return x;
    304    }
    305}
    306
    307static inline int satuw(int x)
    308{
    309    if (x < 0) {
    310        return 0;
    311    } else if (x > 65535) {
    312        return 65535;
    313    } else {
    314        return x;
    315    }
    316}
    317
    318static inline int satsb(int x)
    319{
    320    if (x < -128) {
    321        return -128;
    322    } else if (x > 127) {
    323        return 127;
    324    } else {
    325        return x;
    326    }
    327}
    328
    329static inline int satsw(int x)
    330{
    331    if (x < -32768) {
    332        return -32768;
    333    } else if (x > 32767) {
    334        return 32767;
    335    } else {
    336        return x;
    337    }
    338}
    339
    340#define FADD(a, b) ((a) + (b))
    341#define FADDUB(a, b) satub((a) + (b))
    342#define FADDUW(a, b) satuw((a) + (b))
    343#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
    344#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))
    345
    346#define FSUB(a, b) ((a) - (b))
    347#define FSUBUB(a, b) satub((a) - (b))
    348#define FSUBUW(a, b) satuw((a) - (b))
    349#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
    350#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
    351#define FMINUB(a, b) ((a) < (b)) ? (a) : (b)
    352#define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b)
    353#define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
    354#define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)
    355
    356#define FAND(a, b) ((a) & (b))
    357#define FANDN(a, b) ((~(a)) & (b))
    358#define FOR(a, b) ((a) | (b))
    359#define FXOR(a, b) ((a) ^ (b))
    360
    361#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
    362#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
    363#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
    364#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)
    365
    366#define FMULLW(a, b) ((a) * (b))
    367#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
    368#define FMULHUW(a, b) ((a) * (b) >> 16)
    369#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)
    370
    371#define FAVG(a, b) (((a) + (b) + 1) >> 1)
    372#endif
    373
/* Wrapping add per byte / word / dword / qword lane. */
SSE_HELPER_B(helper_paddb, FADD)
SSE_HELPER_W(helper_paddw, FADD)
SSE_HELPER_L(helper_paddl, FADD)
SSE_HELPER_Q(helper_paddq, FADD)

/* Wrapping subtract per byte / word / dword / qword lane. */
SSE_HELPER_B(helper_psubb, FSUB)
SSE_HELPER_W(helper_psubw, FSUB)
SSE_HELPER_L(helper_psubl, FSUB)
SSE_HELPER_Q(helper_psubq, FSUB)

/* Saturating add/subtract on byte lanes (unsigned and signed). */
SSE_HELPER_B(helper_paddusb, FADDUB)
SSE_HELPER_B(helper_paddsb, FADDSB)
SSE_HELPER_B(helper_psubusb, FSUBUB)
SSE_HELPER_B(helper_psubsb, FSUBSB)

/* Saturating add/subtract on word lanes (unsigned and signed). */
SSE_HELPER_W(helper_paddusw, FADDUW)
SSE_HELPER_W(helper_paddsw, FADDSW)
SSE_HELPER_W(helper_psubusw, FSUBUW)
SSE_HELPER_W(helper_psubsw, FSUBSW)

/* Unsigned byte min/max. */
SSE_HELPER_B(helper_pminub, FMINUB)
SSE_HELPER_B(helper_pmaxub, FMAXUB)

/* Signed word min/max. */
SSE_HELPER_W(helper_pminsw, FMINSW)
SSE_HELPER_W(helper_pmaxsw, FMAXSW)

/* Bitwise logic, performed on 64-bit lanes. */
SSE_HELPER_Q(helper_pand, FAND)
SSE_HELPER_Q(helper_pandn, FANDN)
SSE_HELPER_Q(helper_por, FOR)
SSE_HELPER_Q(helper_pxor, FXOR)

/* Signed greater-than compare; lanes become all-ones or zero. */
SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)

/* Equality compare; lanes become all-ones or zero. */
SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
SSE_HELPER_L(helper_pcmpeql, FCMPEQ)

/* Word multiplies: low half, rounded high half (SHIFT == 0 only), high halves. */
SSE_HELPER_W(helper_pmullw, FMULLW)
#if SHIFT == 0
SSE_HELPER_W(helper_pmulhrw, FMULHRW)
#endif
SSE_HELPER_W(helper_pmulhuw, FMULHUW)
SSE_HELPER_W(helper_pmulhw, FMULHW)

/* Rounded average of byte and word lanes. */
SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)
    422
    423void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    424{
    425    d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
    426#if SHIFT == 1
    427    d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
    428#endif
    429}
    430
    431void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    432{
    433    int i;
    434
    435    for (i = 0; i < (2 << SHIFT); i++) {
    436        d->L(i) = (int16_t)s->W(2 * i) * (int16_t)d->W(2 * i) +
    437            (int16_t)s->W(2 * i + 1) * (int16_t)d->W(2 * i + 1);
    438    }
    439}
    440
    441#if SHIFT == 0
    442static inline int abs1(int a)
    443{
    444    if (a < 0) {
    445        return -a;
    446    } else {
    447        return a;
    448    }
    449}
    450#endif
    451void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
    452{
    453    unsigned int val;
    454
    455    val = 0;
    456    val += abs1(d->B(0) - s->B(0));
    457    val += abs1(d->B(1) - s->B(1));
    458    val += abs1(d->B(2) - s->B(2));
    459    val += abs1(d->B(3) - s->B(3));
    460    val += abs1(d->B(4) - s->B(4));
    461    val += abs1(d->B(5) - s->B(5));
    462    val += abs1(d->B(6) - s->B(6));
    463    val += abs1(d->B(7) - s->B(7));
    464    d->Q(0) = val;
    465#if SHIFT == 1
    466    val = 0;
    467    val += abs1(d->B(8) - s->B(8));
    468    val += abs1(d->B(9) - s->B(9));
    469    val += abs1(d->B(10) - s->B(10));
    470    val += abs1(d->B(11) - s->B(11));
    471    val += abs1(d->B(12) - s->B(12));
    472    val += abs1(d->B(13) - s->B(13));
    473    val += abs1(d->B(14) - s->B(14));
    474    val += abs1(d->B(15) - s->B(15));
    475    d->Q(1) = val;
    476#endif
    477}
    478
    479void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
    480                                  target_ulong a0)
    481{
    482    int i;
    483
    484    for (i = 0; i < (8 << SHIFT); i++) {
    485        if (s->B(i) & 0x80) {
    486            cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
    487        }
    488    }
    489}
    490
    491void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
    492{
    493    d->L(0) = val;
    494    d->L(1) = 0;
    495#if SHIFT == 1
    496    d->Q(1) = 0;
    497#endif
    498}
    499
    500#ifdef TARGET_X86_64
    501void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
    502{
    503    d->Q(0) = val;
    504#if SHIFT == 1
    505    d->Q(1) = 0;
    506#endif
    507}
    508#endif
    509
    510#if SHIFT == 0
    511void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
    512{
    513    Reg r;
    514
    515    r.W(0) = s->W(order & 3);
    516    r.W(1) = s->W((order >> 2) & 3);
    517    r.W(2) = s->W((order >> 4) & 3);
    518    r.W(3) = s->W((order >> 6) & 3);
    519    *d = r;
    520}
    521#else
    522void helper_shufps(Reg *d, Reg *s, int order)
    523{
    524    Reg r;
    525
    526    r.L(0) = d->L(order & 3);
    527    r.L(1) = d->L((order >> 2) & 3);
    528    r.L(2) = s->L((order >> 4) & 3);
    529    r.L(3) = s->L((order >> 6) & 3);
    530    *d = r;
    531}
    532
    533void helper_shufpd(Reg *d, Reg *s, int order)
    534{
    535    Reg r;
    536
    537    r.Q(0) = d->Q(order & 1);
    538    r.Q(1) = s->Q((order >> 1) & 1);
    539    *d = r;
    540}
    541
    542void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
    543{
    544    Reg r;
    545
    546    r.L(0) = s->L(order & 3);
    547    r.L(1) = s->L((order >> 2) & 3);
    548    r.L(2) = s->L((order >> 4) & 3);
    549    r.L(3) = s->L((order >> 6) & 3);
    550    *d = r;
    551}
    552
    553void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
    554{
    555    Reg r;
    556
    557    r.W(0) = s->W(order & 3);
    558    r.W(1) = s->W((order >> 2) & 3);
    559    r.W(2) = s->W((order >> 4) & 3);
    560    r.W(3) = s->W((order >> 6) & 3);
    561    r.Q(1) = s->Q(1);
    562    *d = r;
    563}
    564
    565void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
    566{
    567    Reg r;
    568
    569    r.Q(0) = s->Q(0);
    570    r.W(4) = s->W(4 + (order & 3));
    571    r.W(5) = s->W(4 + ((order >> 2) & 3));
    572    r.W(6) = s->W(4 + ((order >> 4) & 3));
    573    r.W(7) = s->W(4 + ((order >> 6) & 3));
    574    *d = r;
    575}
    576#endif
    577
    578#if SHIFT == 1
    579/* FPU ops */
    580/* XXX: not accurate */
    581
    582#define SSE_HELPER_S(name, F)                                           \
    583    void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)        \
    584    {                                                                   \
    585        d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
    586        d->ZMM_S(1) = F(32, d->ZMM_S(1), s->ZMM_S(1));                  \
    587        d->ZMM_S(2) = F(32, d->ZMM_S(2), s->ZMM_S(2));                  \
    588        d->ZMM_S(3) = F(32, d->ZMM_S(3), s->ZMM_S(3));                  \
    589    }                                                                   \
    590                                                                        \
    591    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
    592    {                                                                   \
    593        d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
    594    }                                                                   \
    595                                                                        \
    596    void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)        \
    597    {                                                                   \
    598        d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
    599        d->ZMM_D(1) = F(64, d->ZMM_D(1), s->ZMM_D(1));                  \
    600    }                                                                   \
    601                                                                        \
    602    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
    603    {                                                                   \
    604        d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
    605    }
    606
    607#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
    608#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
    609#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
    610#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
    611#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
    612
    613/* Note that the choice of comparison op here is important to get the
    614 * special cases right: for min and max Intel specifies that (-0,0),
    615 * (NaN, anything) and (anything, NaN) return the second argument.
    616 */
    617#define FPU_MIN(size, a, b)                                     \
    618    (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
    619#define FPU_MAX(size, a, b)                                     \
    620    (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))
    621
    622SSE_HELPER_S(add, FPU_ADD)
    623SSE_HELPER_S(sub, FPU_SUB)
    624SSE_HELPER_S(mul, FPU_MUL)
    625SSE_HELPER_S(div, FPU_DIV)
    626SSE_HELPER_S(min, FPU_MIN)
    627SSE_HELPER_S(max, FPU_MAX)
    628SSE_HELPER_S(sqrt, FPU_SQRT)
    629
    630
    631/* float to float conversions */
    632void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s)
    633{
    634    float32 s0, s1;
    635
    636    s0 = s->ZMM_S(0);
    637    s1 = s->ZMM_S(1);
    638    d->ZMM_D(0) = float32_to_float64(s0, &env->sse_status);
    639    d->ZMM_D(1) = float32_to_float64(s1, &env->sse_status);
    640}
    641
    642void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s)
    643{
    644    d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
    645    d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), &env->sse_status);
    646    d->Q(1) = 0;
    647}
    648
    649void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s)
    650{
    651    d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
    652}
    653
    654void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
    655{
    656    d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
    657}
    658
    659/* integer to float */
    660void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s)
    661{
    662    d->ZMM_S(0) = int32_to_float32(s->ZMM_L(0), &env->sse_status);
    663    d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), &env->sse_status);
    664    d->ZMM_S(2) = int32_to_float32(s->ZMM_L(2), &env->sse_status);
    665    d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), &env->sse_status);
    666}
    667
    668void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s)
    669{
    670    int32_t l0, l1;
    671
    672    l0 = (int32_t)s->ZMM_L(0);
    673    l1 = (int32_t)s->ZMM_L(1);
    674    d->ZMM_D(0) = int32_to_float64(l0, &env->sse_status);
    675    d->ZMM_D(1) = int32_to_float64(l1, &env->sse_status);
    676}
    677
    678void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
    679{
    680    d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
    681    d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
    682}
    683
    684void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
    685{
    686    d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
    687    d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
    688}
    689
    690void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
    691{
    692    d->ZMM_S(0) = int32_to_float32(val, &env->sse_status);
    693}
    694
    695void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
    696{
    697    d->ZMM_D(0) = int32_to_float64(val, &env->sse_status);
    698}
    699
    700#ifdef TARGET_X86_64
    701void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
    702{
    703    d->ZMM_S(0) = int64_to_float32(val, &env->sse_status);
    704}
    705
    706void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
    707{
    708    d->ZMM_D(0) = int64_to_float64(val, &env->sse_status);
    709}
    710#endif
    711
    712/* float to integer */
    713
    714/*
    715 * x86 mandates that we return the indefinite integer value for the result
    716 * of any float-to-integer conversion that raises the 'invalid' exception.
    717 * Wrap the softfloat functions to get this behaviour.
    718 */
    719#define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE)              \
    720    static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s)        \
    721    {                                                                   \
    722        int oldflags, newflags;                                         \
    723        RETTYPE r;                                                      \
    724                                                                        \
    725        oldflags = get_float_exception_flags(s);                        \
    726        set_float_exception_flags(0, s);                                \
    727        r = FN(a, s);                                                   \
    728        newflags = get_float_exception_flags(s);                        \
    729        if (newflags & float_flag_invalid) {                            \
    730            r = INDEFVALUE;                                             \
    731        }                                                               \
    732        set_float_exception_flags(newflags | oldflags, s);              \
    733        return r;                                                       \
    734    }
    735
    736WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
    737WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
    738WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
    739WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
    740WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
    741WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
    742WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
    743WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
    744
    745void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    746{
    747    d->ZMM_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
    748    d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
    749    d->ZMM_L(2) = x86_float32_to_int32(s->ZMM_S(2), &env->sse_status);
    750    d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), &env->sse_status);
    751}
    752
    753void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    754{
    755    d->ZMM_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
    756    d->ZMM_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
    757    d->ZMM_Q(1) = 0;
    758}
    759
    760void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
    761{
    762    d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
    763    d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
    764}
    765
    766void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
    767{
    768    d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
    769    d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
    770}
    771
    772int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
    773{
    774    return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
    775}
    776
    777int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
    778{
    779    return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
    780}
    781
    782#ifdef TARGET_X86_64
    783int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
    784{
    785    return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status);
    786}
    787
    788int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
    789{
    790    return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status);
    791}
    792#endif
    793
    794/* float to integer truncated */
    795void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    796{
    797    d->ZMM_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
    798    d->ZMM_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
    799    d->ZMM_L(2) = x86_float32_to_int32_round_to_zero(s->ZMM_S(2), &env->sse_status);
    800    d->ZMM_L(3) = x86_float32_to_int32_round_to_zero(s->ZMM_S(3), &env->sse_status);
    801}
    802
    803void helper_cvttpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    804{
    805    d->ZMM_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
    806    d->ZMM_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
    807    d->ZMM_Q(1) = 0;
    808}
    809
    810void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
    811{
    812    d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
    813    d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
    814}
    815
    816void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
    817{
    818    d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
    819    d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
    820}
    821
    822int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
    823{
    824    return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
    825}
    826
    827int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
    828{
    829    return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
    830}
    831
    832#ifdef TARGET_X86_64
    833int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
    834{
    835    return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
    836}
    837
    838int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
    839{
    840    return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
    841}
    842#endif
    843
    844void helper_rsqrtps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    845{
    846    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    847    d->ZMM_S(0) = float32_div(float32_one,
    848                              float32_sqrt(s->ZMM_S(0), &env->sse_status),
    849                              &env->sse_status);
    850    d->ZMM_S(1) = float32_div(float32_one,
    851                              float32_sqrt(s->ZMM_S(1), &env->sse_status),
    852                              &env->sse_status);
    853    d->ZMM_S(2) = float32_div(float32_one,
    854                              float32_sqrt(s->ZMM_S(2), &env->sse_status),
    855                              &env->sse_status);
    856    d->ZMM_S(3) = float32_div(float32_one,
    857                              float32_sqrt(s->ZMM_S(3), &env->sse_status),
    858                              &env->sse_status);
    859    set_float_exception_flags(old_flags, &env->sse_status);
    860}
    861
    862void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    863{
    864    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    865    d->ZMM_S(0) = float32_div(float32_one,
    866                              float32_sqrt(s->ZMM_S(0), &env->sse_status),
    867                              &env->sse_status);
    868    set_float_exception_flags(old_flags, &env->sse_status);
    869}
    870
    871void helper_rcpps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    872{
    873    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    874    d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
    875    d->ZMM_S(1) = float32_div(float32_one, s->ZMM_S(1), &env->sse_status);
    876    d->ZMM_S(2) = float32_div(float32_one, s->ZMM_S(2), &env->sse_status);
    877    d->ZMM_S(3) = float32_div(float32_one, s->ZMM_S(3), &env->sse_status);
    878    set_float_exception_flags(old_flags, &env->sse_status);
    879}
    880
    881void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    882{
    883    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    884    d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
    885    set_float_exception_flags(old_flags, &env->sse_status);
    886}
    887
    888static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
    889{
    890    uint64_t mask;
    891
    892    if (len == 0) {
    893        mask = ~0LL;
    894    } else {
    895        mask = (1ULL << len) - 1;
    896    }
    897    return (src >> shift) & mask;
    898}
    899
    900void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    901{
    902    d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1), s->ZMM_B(0));
    903}
    904
    905void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
    906{
    907    d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
    908}
    909
    910static inline uint64_t helper_insertq(uint64_t src, int shift, int len)
    911{
    912    uint64_t mask;
    913
    914    if (len == 0) {
    915        mask = ~0ULL;
    916    } else {
    917        mask = (1ULL << len) - 1;
    918    }
    919    return (src & ~(mask << shift)) | ((src & mask) << shift);
    920}
    921
    922void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    923{
    924    d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8));
    925}
    926
    927void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length)
    928{
    929    d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
    930}
    931
    932void helper_haddps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    933{
    934    ZMMReg r;
    935
    936    r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
    937    r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
    938    r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
    939    r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
    940    *d = r;
    941}
    942
    943void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    944{
    945    ZMMReg r;
    946
    947    r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
    948    r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
    949    *d = r;
    950}
    951
    952void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    953{
    954    ZMMReg r;
    955
    956    r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
    957    r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
    958    r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
    959    r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
    960    *d = r;
    961}
    962
    963void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    964{
    965    ZMMReg r;
    966
    967    r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
    968    r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
    969    *d = r;
    970}
    971
    972void helper_addsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    973{
    974    d->ZMM_S(0) = float32_sub(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
    975    d->ZMM_S(1) = float32_add(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status);
    976    d->ZMM_S(2) = float32_sub(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status);
    977    d->ZMM_S(3) = float32_add(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status);
    978}
    979
    980void helper_addsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
    981{
    982    d->ZMM_D(0) = float64_sub(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
    983    d->ZMM_D(1) = float64_add(d->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
    984}
    985
    986/* XXX: unordered */
    987#define SSE_HELPER_CMP(name, F)                                         \
    988    void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)        \
    989    {                                                                   \
    990        d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
    991        d->ZMM_L(1) = F(32, d->ZMM_S(1), s->ZMM_S(1));                  \
    992        d->ZMM_L(2) = F(32, d->ZMM_S(2), s->ZMM_S(2));                  \
    993        d->ZMM_L(3) = F(32, d->ZMM_S(3), s->ZMM_S(3));                  \
    994    }                                                                   \
    995                                                                        \
    996    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
    997    {                                                                   \
    998        d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
    999    }                                                                   \
   1000                                                                        \
   1001    void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)        \
   1002    {                                                                   \
   1003        d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
   1004        d->ZMM_Q(1) = F(64, d->ZMM_D(1), s->ZMM_D(1));                  \
   1005    }                                                                   \
   1006                                                                        \
   1007    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
   1008    {                                                                   \
   1009        d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
   1010    }
   1011
   1012#define FPU_CMPEQ(size, a, b)                                           \
   1013    (float ## size ## _eq_quiet(a, b, &env->sse_status) ? -1 : 0)
   1014#define FPU_CMPLT(size, a, b)                                           \
   1015    (float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0)
   1016#define FPU_CMPLE(size, a, b)                                           \
   1017    (float ## size ## _le(a, b, &env->sse_status) ? -1 : 0)
   1018#define FPU_CMPUNORD(size, a, b)                                        \
   1019    (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? -1 : 0)
   1020#define FPU_CMPNEQ(size, a, b)                                          \
   1021    (float ## size ## _eq_quiet(a, b, &env->sse_status) ? 0 : -1)
   1022#define FPU_CMPNLT(size, a, b)                                          \
   1023    (float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1)
   1024#define FPU_CMPNLE(size, a, b)                                          \
   1025    (float ## size ## _le(a, b, &env->sse_status) ? 0 : -1)
   1026#define FPU_CMPORD(size, a, b)                                          \
   1027    (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? 0 : -1)
   1028
   1029SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
   1030SSE_HELPER_CMP(cmplt, FPU_CMPLT)
   1031SSE_HELPER_CMP(cmple, FPU_CMPLE)
   1032SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
   1033SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
   1034SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
   1035SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
   1036SSE_HELPER_CMP(cmpord, FPU_CMPORD)
   1037
   1038static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
   1039
   1040void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
   1041{
   1042    FloatRelation ret;
   1043    float32 s0, s1;
   1044
   1045    s0 = d->ZMM_S(0);
   1046    s1 = s->ZMM_S(0);
   1047    ret = float32_compare_quiet(s0, s1, &env->sse_status);
   1048    CC_SRC = comis_eflags[ret + 1];
   1049}
   1050
   1051void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
   1052{
   1053    FloatRelation ret;
   1054    float32 s0, s1;
   1055
   1056    s0 = d->ZMM_S(0);
   1057    s1 = s->ZMM_S(0);
   1058    ret = float32_compare(s0, s1, &env->sse_status);
   1059    CC_SRC = comis_eflags[ret + 1];
   1060}
   1061
   1062void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
   1063{
   1064    FloatRelation ret;
   1065    float64 d0, d1;
   1066
   1067    d0 = d->ZMM_D(0);
   1068    d1 = s->ZMM_D(0);
   1069    ret = float64_compare_quiet(d0, d1, &env->sse_status);
   1070    CC_SRC = comis_eflags[ret + 1];
   1071}
   1072
   1073void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
   1074{
   1075    FloatRelation ret;
   1076    float64 d0, d1;
   1077
   1078    d0 = d->ZMM_D(0);
   1079    d1 = s->ZMM_D(0);
   1080    ret = float64_compare(d0, d1, &env->sse_status);
   1081    CC_SRC = comis_eflags[ret + 1];
   1082}
   1083
   1084uint32_t helper_movmskps(CPUX86State *env, Reg *s)
   1085{
   1086    int b0, b1, b2, b3;
   1087
   1088    b0 = s->ZMM_L(0) >> 31;
   1089    b1 = s->ZMM_L(1) >> 31;
   1090    b2 = s->ZMM_L(2) >> 31;
   1091    b3 = s->ZMM_L(3) >> 31;
   1092    return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
   1093}
   1094
   1095uint32_t helper_movmskpd(CPUX86State *env, Reg *s)
   1096{
   1097    int b0, b1;
   1098
   1099    b0 = s->ZMM_L(1) >> 31;
   1100    b1 = s->ZMM_L(3) >> 31;
   1101    return b0 | (b1 << 1);
   1102}
   1103
   1104#endif
   1105
   1106uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
   1107{
   1108    uint32_t val;
   1109
   1110    val = 0;
   1111    val |= (s->B(0) >> 7);
   1112    val |= (s->B(1) >> 6) & 0x02;
   1113    val |= (s->B(2) >> 5) & 0x04;
   1114    val |= (s->B(3) >> 4) & 0x08;
   1115    val |= (s->B(4) >> 3) & 0x10;
   1116    val |= (s->B(5) >> 2) & 0x20;
   1117    val |= (s->B(6) >> 1) & 0x40;
   1118    val |= (s->B(7)) & 0x80;
   1119#if SHIFT == 1
   1120    val |= (s->B(8) << 1) & 0x0100;
   1121    val |= (s->B(9) << 2) & 0x0200;
   1122    val |= (s->B(10) << 3) & 0x0400;
   1123    val |= (s->B(11) << 4) & 0x0800;
   1124    val |= (s->B(12) << 5) & 0x1000;
   1125    val |= (s->B(13) << 6) & 0x2000;
   1126    val |= (s->B(14) << 7) & 0x4000;
   1127    val |= (s->B(15) << 8) & 0x8000;
   1128#endif
   1129    return val;
   1130}
   1131
   1132void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1133{
   1134    Reg r;
   1135
   1136    r.B(0) = satsb((int16_t)d->W(0));
   1137    r.B(1) = satsb((int16_t)d->W(1));
   1138    r.B(2) = satsb((int16_t)d->W(2));
   1139    r.B(3) = satsb((int16_t)d->W(3));
   1140#if SHIFT == 1
   1141    r.B(4) = satsb((int16_t)d->W(4));
   1142    r.B(5) = satsb((int16_t)d->W(5));
   1143    r.B(6) = satsb((int16_t)d->W(6));
   1144    r.B(7) = satsb((int16_t)d->W(7));
   1145#endif
   1146    r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
   1147    r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
   1148    r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
   1149    r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
   1150#if SHIFT == 1
   1151    r.B(12) = satsb((int16_t)s->W(4));
   1152    r.B(13) = satsb((int16_t)s->W(5));
   1153    r.B(14) = satsb((int16_t)s->W(6));
   1154    r.B(15) = satsb((int16_t)s->W(7));
   1155#endif
   1156    *d = r;
   1157}
   1158
   1159void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1160{
   1161    Reg r;
   1162
   1163    r.B(0) = satub((int16_t)d->W(0));
   1164    r.B(1) = satub((int16_t)d->W(1));
   1165    r.B(2) = satub((int16_t)d->W(2));
   1166    r.B(3) = satub((int16_t)d->W(3));
   1167#if SHIFT == 1
   1168    r.B(4) = satub((int16_t)d->W(4));
   1169    r.B(5) = satub((int16_t)d->W(5));
   1170    r.B(6) = satub((int16_t)d->W(6));
   1171    r.B(7) = satub((int16_t)d->W(7));
   1172#endif
   1173    r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
   1174    r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
   1175    r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
   1176    r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
   1177#if SHIFT == 1
   1178    r.B(12) = satub((int16_t)s->W(4));
   1179    r.B(13) = satub((int16_t)s->W(5));
   1180    r.B(14) = satub((int16_t)s->W(6));
   1181    r.B(15) = satub((int16_t)s->W(7));
   1182#endif
   1183    *d = r;
   1184}
   1185
   1186void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1187{
   1188    Reg r;
   1189
   1190    r.W(0) = satsw(d->L(0));
   1191    r.W(1) = satsw(d->L(1));
   1192#if SHIFT == 1
   1193    r.W(2) = satsw(d->L(2));
   1194    r.W(3) = satsw(d->L(3));
   1195#endif
   1196    r.W((2 << SHIFT) + 0) = satsw(s->L(0));
   1197    r.W((2 << SHIFT) + 1) = satsw(s->L(1));
   1198#if SHIFT == 1
   1199    r.W(6) = satsw(s->L(2));
   1200    r.W(7) = satsw(s->L(3));
   1201#endif
   1202    *d = r;
   1203}
   1204
   1205#define UNPCK_OP(base_name, base)                                       \
   1206                                                                        \
   1207    void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
   1208                                                        Reg *d, Reg *s) \
   1209    {                                                                   \
   1210        Reg r;                                                          \
   1211                                                                        \
   1212        r.B(0) = d->B((base << (SHIFT + 2)) + 0);                       \
   1213        r.B(1) = s->B((base << (SHIFT + 2)) + 0);                       \
   1214        r.B(2) = d->B((base << (SHIFT + 2)) + 1);                       \
   1215        r.B(3) = s->B((base << (SHIFT + 2)) + 1);                       \
   1216        r.B(4) = d->B((base << (SHIFT + 2)) + 2);                       \
   1217        r.B(5) = s->B((base << (SHIFT + 2)) + 2);                       \
   1218        r.B(6) = d->B((base << (SHIFT + 2)) + 3);                       \
   1219        r.B(7) = s->B((base << (SHIFT + 2)) + 3);                       \
   1220        XMM_ONLY(                                                       \
   1221                 r.B(8) = d->B((base << (SHIFT + 2)) + 4);              \
   1222                 r.B(9) = s->B((base << (SHIFT + 2)) + 4);              \
   1223                 r.B(10) = d->B((base << (SHIFT + 2)) + 5);             \
   1224                 r.B(11) = s->B((base << (SHIFT + 2)) + 5);             \
   1225                 r.B(12) = d->B((base << (SHIFT + 2)) + 6);             \
   1226                 r.B(13) = s->B((base << (SHIFT + 2)) + 6);             \
   1227                 r.B(14) = d->B((base << (SHIFT + 2)) + 7);             \
   1228                 r.B(15) = s->B((base << (SHIFT + 2)) + 7);             \
   1229                                                                      ) \
   1230            *d = r;                                                     \
   1231    }                                                                   \
   1232                                                                        \
   1233    void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
   1234                                                        Reg *d, Reg *s) \
   1235    {                                                                   \
   1236        Reg r;                                                          \
   1237                                                                        \
   1238        r.W(0) = d->W((base << (SHIFT + 1)) + 0);                       \
   1239        r.W(1) = s->W((base << (SHIFT + 1)) + 0);                       \
   1240        r.W(2) = d->W((base << (SHIFT + 1)) + 1);                       \
   1241        r.W(3) = s->W((base << (SHIFT + 1)) + 1);                       \
   1242        XMM_ONLY(                                                       \
   1243                 r.W(4) = d->W((base << (SHIFT + 1)) + 2);              \
   1244                 r.W(5) = s->W((base << (SHIFT + 1)) + 2);              \
   1245                 r.W(6) = d->W((base << (SHIFT + 1)) + 3);              \
   1246                 r.W(7) = s->W((base << (SHIFT + 1)) + 3);              \
   1247                                                                      ) \
   1248            *d = r;                                                     \
   1249    }                                                                   \
   1250                                                                        \
   1251    void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
   1252                                                        Reg *d, Reg *s) \
   1253    {                                                                   \
   1254        Reg r;                                                          \
   1255                                                                        \
   1256        r.L(0) = d->L((base << SHIFT) + 0);                             \
   1257        r.L(1) = s->L((base << SHIFT) + 0);                             \
   1258        XMM_ONLY(                                                       \
   1259                 r.L(2) = d->L((base << SHIFT) + 1);                    \
   1260                 r.L(3) = s->L((base << SHIFT) + 1);                    \
   1261                                                                      ) \
   1262            *d = r;                                                     \
   1263    }                                                                   \
   1264                                                                        \
   1265    XMM_ONLY(                                                           \
   1266             void glue(helper_punpck ## base_name ## qdq, SUFFIX)(CPUX86State \
   1267                                                                  *env, \
   1268                                                                  Reg *d, \
   1269                                                                  Reg *s) \
   1270             {                                                          \
   1271                 Reg r;                                                 \
   1272                                                                        \
   1273                 r.Q(0) = d->Q(base);                                   \
   1274                 r.Q(1) = s->Q(base);                                   \
   1275                 *d = r;                                                \
   1276             }                                                          \
   1277                                                                        )
   1278
   1279UNPCK_OP(l, 0)
   1280UNPCK_OP(h, 1)
   1281
   1282/* 3DNow! float ops */
   1283#if SHIFT == 0
   1284void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
   1285{
   1286    d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
   1287    d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
   1288}
   1289
   1290void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
   1291{
   1292    d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
   1293    d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
   1294}
   1295
   1296void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
   1297{
   1298    d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
   1299    d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
   1300}
   1301
   1302void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
   1303{
   1304    d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
   1305                                                       &env->mmx_status));
   1306    d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
   1307                                                       &env->mmx_status));
   1308}
   1309
   1310void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
   1311{
   1312    MMXReg r;
   1313
   1314    r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1315    r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1316    *d = r;
   1317}
   1318
   1319void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
   1320{
   1321    d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1322    d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1323}
   1324
   1325void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
   1326{
   1327    d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
   1328                                   &env->mmx_status) ? -1 : 0;
   1329    d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
   1330                                   &env->mmx_status) ? -1 : 0;
   1331}
   1332
   1333void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
   1334{
   1335    d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
   1336                             &env->mmx_status) ? -1 : 0;
   1337    d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
   1338                             &env->mmx_status) ? -1 : 0;
   1339}
   1340
   1341void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
   1342{
   1343    d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
   1344                             &env->mmx_status) ? -1 : 0;
   1345    d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
   1346                             &env->mmx_status) ? -1 : 0;
   1347}
   1348
   1349void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
   1350{
   1351    if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
   1352        d->MMX_S(0) = s->MMX_S(0);
   1353    }
   1354    if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
   1355        d->MMX_S(1) = s->MMX_S(1);
   1356    }
   1357}
   1358
   1359void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
   1360{
   1361    if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
   1362        d->MMX_S(0) = s->MMX_S(0);
   1363    }
   1364    if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
   1365        d->MMX_S(1) = s->MMX_S(1);
   1366    }
   1367}
   1368
   1369void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
   1370{
   1371    d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1372    d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1373}
   1374
   1375void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
   1376{
   1377    MMXReg r;
   1378
   1379    r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1380    r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1381    *d = r;
   1382}
   1383
   1384void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
   1385{
   1386    MMXReg r;
   1387
   1388    r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1389    r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1390    *d = r;
   1391}
   1392
   1393void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
   1394{
   1395    d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
   1396    d->MMX_S(1) = d->MMX_S(0);
   1397}
   1398
   1399void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
   1400{
   1401    d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
   1402    d->MMX_S(1) = float32_div(float32_one,
   1403                              float32_sqrt(d->MMX_S(1), &env->mmx_status),
   1404                              &env->mmx_status);
   1405    d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
   1406    d->MMX_L(0) = d->MMX_L(1);
   1407}
   1408
   1409void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
   1410{
   1411    d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1412    d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1413}
   1414
   1415void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
   1416{
   1417    d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
   1418    d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
   1419}
   1420
   1421void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
   1422{
   1423    MMXReg r;
   1424
   1425    r.MMX_L(0) = s->MMX_L(1);
   1426    r.MMX_L(1) = s->MMX_L(0);
   1427    *d = r;
   1428}
   1429#endif
   1430
   1431/* SSSE3 op helpers */
   1432void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1433{
   1434    int i;
   1435    Reg r;
   1436
   1437    for (i = 0; i < (8 << SHIFT); i++) {
   1438        r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
   1439    }
   1440
   1441    *d = r;
   1442}
   1443
   1444void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1445{
   1446
   1447    Reg r;
   1448
   1449    r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
   1450    r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
   1451    XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
   1452    XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
   1453    r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
   1454    r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
   1455    XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
   1456    XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
   1457
   1458    *d = r;
   1459}
   1460
   1461void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1462{
   1463    Reg r;
   1464
   1465    r.L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
   1466    XMM_ONLY(r.L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
   1467    r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
   1468    XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
   1469
   1470    *d = r;
   1471}
   1472
   1473void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1474{
   1475    Reg r;
   1476
   1477    r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
   1478    r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
   1479    XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
   1480    XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
   1481    r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
   1482    r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
   1483    XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
   1484    XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
   1485
   1486    *d = r;
   1487}
   1488
   1489void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1490{
   1491    d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) +
   1492                    (int8_t)s->B(1) * (uint8_t)d->B(1));
   1493    d->W(1) = satsw((int8_t)s->B(2) * (uint8_t)d->B(2) +
   1494                    (int8_t)s->B(3) * (uint8_t)d->B(3));
   1495    d->W(2) = satsw((int8_t)s->B(4) * (uint8_t)d->B(4) +
   1496                    (int8_t)s->B(5) * (uint8_t)d->B(5));
   1497    d->W(3) = satsw((int8_t)s->B(6) * (uint8_t)d->B(6) +
   1498                    (int8_t)s->B(7) * (uint8_t)d->B(7));
   1499#if SHIFT == 1
   1500    d->W(4) = satsw((int8_t)s->B(8) * (uint8_t)d->B(8) +
   1501                    (int8_t)s->B(9) * (uint8_t)d->B(9));
   1502    d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
   1503                    (int8_t)s->B(11) * (uint8_t)d->B(11));
   1504    d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
   1505                    (int8_t)s->B(13) * (uint8_t)d->B(13));
   1506    d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
   1507                    (int8_t)s->B(15) * (uint8_t)d->B(15));
   1508#endif
   1509}
   1510
   1511void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1512{
   1513    d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
   1514    d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
   1515    XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
   1516    XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
   1517    d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
   1518    d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
   1519    XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
   1520    XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
   1521}
   1522
   1523void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1524{
   1525    d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
   1526    XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
   1527    d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
   1528    XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
   1529}
   1530
   1531void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1532{
   1533    d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
   1534    d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
   1535    XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
   1536    XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
   1537    d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
   1538    d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
   1539    XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
   1540    XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
   1541}
   1542
   1543#define FABSB(_, x) (x > INT8_MAX  ? -(int8_t)x : x)
   1544#define FABSW(_, x) (x > INT16_MAX ? -(int16_t)x : x)
   1545#define FABSL(_, x) (x > INT32_MAX ? -(int32_t)x : x)
   1546SSE_HELPER_B(helper_pabsb, FABSB)
   1547SSE_HELPER_W(helper_pabsw, FABSW)
   1548SSE_HELPER_L(helper_pabsd, FABSL)
   1549
   1550#define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
   1551SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
   1552
   1553#define FSIGNB(d, s) (s <= INT8_MAX  ? s ? d : 0 : -(int8_t)d)
   1554#define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d)
   1555#define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d)
   1556SSE_HELPER_B(helper_psignb, FSIGNB)
   1557SSE_HELPER_W(helper_psignw, FSIGNW)
   1558SSE_HELPER_L(helper_psignd, FSIGNL)
   1559
   1560void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   1561                                  int32_t shift)
   1562{
   1563    Reg r;
   1564
   1565    /* XXX could be checked during translation */
   1566    if (shift >= (16 << SHIFT)) {
   1567        r.Q(0) = 0;
   1568        XMM_ONLY(r.Q(1) = 0);
   1569    } else {
   1570        shift <<= 3;
   1571#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
   1572#if SHIFT == 0
   1573        r.Q(0) = SHR(s->Q(0), shift - 0) |
   1574            SHR(d->Q(0), shift -  64);
   1575#else
   1576        r.Q(0) = SHR(s->Q(0), shift - 0) |
   1577            SHR(s->Q(1), shift -  64) |
   1578            SHR(d->Q(0), shift - 128) |
   1579            SHR(d->Q(1), shift - 192);
   1580        r.Q(1) = SHR(s->Q(0), shift + 64) |
   1581            SHR(s->Q(1), shift -   0) |
   1582            SHR(d->Q(0), shift -  64) |
   1583            SHR(d->Q(1), shift - 128);
   1584#endif
   1585#undef SHR
   1586    }
   1587
   1588    *d = r;
   1589}
   1590
   1591#define XMM0 (env->xmm_regs[0])
   1592
   1593#if SHIFT == 1
   1594#define SSE_HELPER_V(name, elem, num, F)                                \
   1595    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)           \
   1596    {                                                                   \
   1597        d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));           \
   1598        d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));           \
   1599        if (num > 2) {                                                  \
   1600            d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));       \
   1601            d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));       \
   1602            if (num > 4) {                                              \
   1603                d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));   \
   1604                d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));   \
   1605                d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));   \
   1606                d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7));   \
   1607                if (num > 8) {                                          \
   1608                    d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8)); \
   1609                    d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9)); \
   1610                    d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10)); \
   1611                    d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11)); \
   1612                    d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12)); \
   1613                    d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13)); \
   1614                    d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14)); \
   1615                    d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15)); \
   1616                }                                                       \
   1617            }                                                           \
   1618        }                                                               \
   1619    }
   1620
   1621#define SSE_HELPER_I(name, elem, num, F)                                \
   1622    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t imm) \
   1623    {                                                                   \
   1624        d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));       \
   1625        d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));       \
   1626        if (num > 2) {                                                  \
   1627            d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1));   \
   1628            d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1));   \
   1629            if (num > 4) {                                              \
   1630                d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1)); \
   1631                d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & 1)); \
   1632                d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1)); \
   1633                d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1)); \
   1634                if (num > 8) {                                          \
   1635                    d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1)); \
   1636                    d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1)); \
   1637                    d->elem(10) = F(d->elem(10), s->elem(10),           \
   1638                                    ((imm >> 10) & 1));                 \
   1639                    d->elem(11) = F(d->elem(11), s->elem(11),           \
   1640                                    ((imm >> 11) & 1));                 \
   1641                    d->elem(12) = F(d->elem(12), s->elem(12),           \
   1642                                    ((imm >> 12) & 1));                 \
   1643                    d->elem(13) = F(d->elem(13), s->elem(13),           \
   1644                                    ((imm >> 13) & 1));                 \
   1645                    d->elem(14) = F(d->elem(14), s->elem(14),           \
   1646                                    ((imm >> 14) & 1));                 \
   1647                    d->elem(15) = F(d->elem(15), s->elem(15),           \
   1648                                    ((imm >> 15) & 1));                 \
   1649                }                                                       \
   1650            }                                                           \
   1651        }                                                               \
   1652    }
   1653
   1654/* SSE4.1 op helpers */
   1655#define FBLENDVB(d, s, m) ((m & 0x80) ? s : d)
   1656#define FBLENDVPS(d, s, m) ((m & 0x80000000) ? s : d)
   1657#define FBLENDVPD(d, s, m) ((m & 0x8000000000000000LL) ? s : d)
   1658SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
   1659SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
   1660SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
   1661
   1662void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1663{
   1664    uint64_t zf = (s->Q(0) &  d->Q(0)) | (s->Q(1) &  d->Q(1));
   1665    uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
   1666
   1667    CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
   1668}
   1669
   1670#define SSE_HELPER_F(name, elem, num, F)        \
   1671    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)     \
   1672    {                                           \
   1673        if (num > 2) {                          \
   1674            if (num > 4) {                      \
   1675                d->elem(7) = F(7);              \
   1676                d->elem(6) = F(6);              \
   1677                d->elem(5) = F(5);              \
   1678                d->elem(4) = F(4);              \
   1679            }                                   \
   1680            d->elem(3) = F(3);                  \
   1681            d->elem(2) = F(2);                  \
   1682        }                                       \
   1683        d->elem(1) = F(1);                      \
   1684        d->elem(0) = F(0);                      \
   1685    }
   1686
   1687SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
   1688SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
   1689SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
   1690SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
   1691SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
   1692SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
   1693SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
   1694SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
   1695SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
   1696SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
   1697SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
   1698SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
   1699
   1700void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1701{
   1702    d->Q(0) = (int64_t)(int32_t) d->L(0) * (int32_t) s->L(0);
   1703    d->Q(1) = (int64_t)(int32_t) d->L(2) * (int32_t) s->L(2);
   1704}
   1705
   1706#define FCMPEQQ(d, s) (d == s ? -1 : 0)
   1707SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
   1708
   1709void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1710{
   1711    Reg r;
   1712
   1713    r.W(0) = satuw((int32_t) d->L(0));
   1714    r.W(1) = satuw((int32_t) d->L(1));
   1715    r.W(2) = satuw((int32_t) d->L(2));
   1716    r.W(3) = satuw((int32_t) d->L(3));
   1717    r.W(4) = satuw((int32_t) s->L(0));
   1718    r.W(5) = satuw((int32_t) s->L(1));
   1719    r.W(6) = satuw((int32_t) s->L(2));
   1720    r.W(7) = satuw((int32_t) s->L(3));
   1721    *d = r;
   1722}
   1723
   1724#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
   1725#define FMINSD(d, s) MIN((int32_t)d, (int32_t)s)
   1726#define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s)
   1727#define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s)
   1728SSE_HELPER_B(helper_pminsb, FMINSB)
   1729SSE_HELPER_L(helper_pminsd, FMINSD)
   1730SSE_HELPER_W(helper_pminuw, MIN)
   1731SSE_HELPER_L(helper_pminud, MIN)
   1732SSE_HELPER_B(helper_pmaxsb, FMAXSB)
   1733SSE_HELPER_L(helper_pmaxsd, FMAXSD)
   1734SSE_HELPER_W(helper_pmaxuw, MAX)
   1735SSE_HELPER_L(helper_pmaxud, MAX)
   1736
   1737#define FMULLD(d, s) ((int32_t)d * (int32_t)s)
   1738SSE_HELPER_L(helper_pmulld, FMULLD)
   1739
   1740void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   1741{
   1742    int idx = 0;
   1743
   1744    if (s->W(1) < s->W(idx)) {
   1745        idx = 1;
   1746    }
   1747    if (s->W(2) < s->W(idx)) {
   1748        idx = 2;
   1749    }
   1750    if (s->W(3) < s->W(idx)) {
   1751        idx = 3;
   1752    }
   1753    if (s->W(4) < s->W(idx)) {
   1754        idx = 4;
   1755    }
   1756    if (s->W(5) < s->W(idx)) {
   1757        idx = 5;
   1758    }
   1759    if (s->W(6) < s->W(idx)) {
   1760        idx = 6;
   1761    }
   1762    if (s->W(7) < s->W(idx)) {
   1763        idx = 7;
   1764    }
   1765
   1766    d->W(0) = s->W(idx);
   1767    d->W(1) = idx;
   1768    d->L(1) = 0;
   1769    d->Q(1) = 0;
   1770}
   1771
   1772void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   1773                                  uint32_t mode)
   1774{
   1775    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
   1776    signed char prev_rounding_mode;
   1777
   1778    prev_rounding_mode = env->sse_status.float_rounding_mode;
   1779    if (!(mode & (1 << 2))) {
   1780        switch (mode & 3) {
   1781        case 0:
   1782            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
   1783            break;
   1784        case 1:
   1785            set_float_rounding_mode(float_round_down, &env->sse_status);
   1786            break;
   1787        case 2:
   1788            set_float_rounding_mode(float_round_up, &env->sse_status);
   1789            break;
   1790        case 3:
   1791            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
   1792            break;
   1793        }
   1794    }
   1795
   1796    d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
   1797    d->ZMM_S(1) = float32_round_to_int(s->ZMM_S(1), &env->sse_status);
   1798    d->ZMM_S(2) = float32_round_to_int(s->ZMM_S(2), &env->sse_status);
   1799    d->ZMM_S(3) = float32_round_to_int(s->ZMM_S(3), &env->sse_status);
   1800
   1801    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
   1802        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
   1803                                  ~float_flag_inexact,
   1804                                  &env->sse_status);
   1805    }
   1806    env->sse_status.float_rounding_mode = prev_rounding_mode;
   1807}
   1808
   1809void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   1810                                  uint32_t mode)
   1811{
   1812    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
   1813    signed char prev_rounding_mode;
   1814
   1815    prev_rounding_mode = env->sse_status.float_rounding_mode;
   1816    if (!(mode & (1 << 2))) {
   1817        switch (mode & 3) {
   1818        case 0:
   1819            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
   1820            break;
   1821        case 1:
   1822            set_float_rounding_mode(float_round_down, &env->sse_status);
   1823            break;
   1824        case 2:
   1825            set_float_rounding_mode(float_round_up, &env->sse_status);
   1826            break;
   1827        case 3:
   1828            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
   1829            break;
   1830        }
   1831    }
   1832
   1833    d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
   1834    d->ZMM_D(1) = float64_round_to_int(s->ZMM_D(1), &env->sse_status);
   1835
   1836    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
   1837        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
   1838                                  ~float_flag_inexact,
   1839                                  &env->sse_status);
   1840    }
   1841    env->sse_status.float_rounding_mode = prev_rounding_mode;
   1842}
   1843
   1844void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   1845                                  uint32_t mode)
   1846{
   1847    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
   1848    signed char prev_rounding_mode;
   1849
   1850    prev_rounding_mode = env->sse_status.float_rounding_mode;
   1851    if (!(mode & (1 << 2))) {
   1852        switch (mode & 3) {
   1853        case 0:
   1854            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
   1855            break;
   1856        case 1:
   1857            set_float_rounding_mode(float_round_down, &env->sse_status);
   1858            break;
   1859        case 2:
   1860            set_float_rounding_mode(float_round_up, &env->sse_status);
   1861            break;
   1862        case 3:
   1863            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
   1864            break;
   1865        }
   1866    }
   1867
   1868    d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
   1869
   1870    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
   1871        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
   1872                                  ~float_flag_inexact,
   1873                                  &env->sse_status);
   1874    }
   1875    env->sse_status.float_rounding_mode = prev_rounding_mode;
   1876}
   1877
   1878void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   1879                                  uint32_t mode)
   1880{
   1881    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
   1882    signed char prev_rounding_mode;
   1883
   1884    prev_rounding_mode = env->sse_status.float_rounding_mode;
   1885    if (!(mode & (1 << 2))) {
   1886        switch (mode & 3) {
   1887        case 0:
   1888            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
   1889            break;
   1890        case 1:
   1891            set_float_rounding_mode(float_round_down, &env->sse_status);
   1892            break;
   1893        case 2:
   1894            set_float_rounding_mode(float_round_up, &env->sse_status);
   1895            break;
   1896        case 3:
   1897            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
   1898            break;
   1899        }
   1900    }
   1901
   1902    d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
   1903
   1904    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
   1905        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
   1906                                  ~float_flag_inexact,
   1907                                  &env->sse_status);
   1908    }
   1909    env->sse_status.float_rounding_mode = prev_rounding_mode;
   1910}
   1911
   1912#define FBLENDP(d, s, m) (m ? s : d)
   1913SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
   1914SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
   1915SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
   1916
   1917void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
   1918{
   1919    float32 iresult = float32_zero;
   1920
   1921    if (mask & (1 << 4)) {
   1922        iresult = float32_add(iresult,
   1923                              float32_mul(d->ZMM_S(0), s->ZMM_S(0),
   1924                                          &env->sse_status),
   1925                              &env->sse_status);
   1926    }
   1927    if (mask & (1 << 5)) {
   1928        iresult = float32_add(iresult,
   1929                              float32_mul(d->ZMM_S(1), s->ZMM_S(1),
   1930                                          &env->sse_status),
   1931                              &env->sse_status);
   1932    }
   1933    if (mask & (1 << 6)) {
   1934        iresult = float32_add(iresult,
   1935                              float32_mul(d->ZMM_S(2), s->ZMM_S(2),
   1936                                          &env->sse_status),
   1937                              &env->sse_status);
   1938    }
   1939    if (mask & (1 << 7)) {
   1940        iresult = float32_add(iresult,
   1941                              float32_mul(d->ZMM_S(3), s->ZMM_S(3),
   1942                                          &env->sse_status),
   1943                              &env->sse_status);
   1944    }
   1945    d->ZMM_S(0) = (mask & (1 << 0)) ? iresult : float32_zero;
   1946    d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero;
   1947    d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero;
   1948    d->ZMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero;
   1949}
   1950
   1951void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
   1952{
   1953    float64 iresult = float64_zero;
   1954
   1955    if (mask & (1 << 4)) {
   1956        iresult = float64_add(iresult,
   1957                              float64_mul(d->ZMM_D(0), s->ZMM_D(0),
   1958                                          &env->sse_status),
   1959                              &env->sse_status);
   1960    }
   1961    if (mask & (1 << 5)) {
   1962        iresult = float64_add(iresult,
   1963                              float64_mul(d->ZMM_D(1), s->ZMM_D(1),
   1964                                          &env->sse_status),
   1965                              &env->sse_status);
   1966    }
   1967    d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero;
   1968    d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero;
   1969}
   1970
   1971void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   1972                                  uint32_t offset)
   1973{
   1974    int s0 = (offset & 3) << 2;
   1975    int d0 = (offset & 4) << 0;
   1976    int i;
   1977    Reg r;
   1978
   1979    for (i = 0; i < 8; i++, d0++) {
   1980        r.W(i) = 0;
   1981        r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
   1982        r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
   1983        r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
   1984        r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
   1985    }
   1986
   1987    *d = r;
   1988}
   1989
   1990/* SSE4.2 op helpers */
   1991#define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0)
   1992SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
   1993
   1994static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
   1995{
   1996    int val;
   1997
   1998    /* Presence of REX.W is indicated by a bit higher than 7 set */
   1999    if (ctrl >> 8) {
   2000        val = abs1((int64_t)env->regs[reg]);
   2001    } else {
   2002        val = abs1((int32_t)env->regs[reg]);
   2003    }
   2004
   2005    if (ctrl & 1) {
   2006        if (val > 8) {
   2007            return 8;
   2008        }
   2009    } else {
   2010        if (val > 16) {
   2011            return 16;
   2012        }
   2013    }
   2014    return val;
   2015}
   2016
   2017static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
   2018{
   2019    int val = 0;
   2020
   2021    if (ctrl & 1) {
   2022        while (val < 8 && r->W(val)) {
   2023            val++;
   2024        }
   2025    } else {
   2026        while (val < 16 && r->B(val)) {
   2027            val++;
   2028        }
   2029    }
   2030
   2031    return val;
   2032}
   2033
   2034static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
   2035{
   2036    switch ((ctrl >> 0) & 3) {
   2037    case 0:
   2038        return r->B(i);
   2039    case 1:
   2040        return r->W(i);
   2041    case 2:
   2042        return (int8_t)r->B(i);
   2043    case 3:
   2044    default:
   2045        return (int16_t)r->W(i);
   2046    }
   2047}
   2048
   2049static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
   2050                                 int8_t ctrl, int valids, int validd)
   2051{
   2052    unsigned int res = 0;
   2053    int v;
   2054    int j, i;
   2055    int upper = (ctrl & 1) ? 7 : 15;
   2056
   2057    valids--;
   2058    validd--;
   2059
   2060    CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
   2061
   2062    switch ((ctrl >> 2) & 3) {
   2063    case 0:
   2064        for (j = valids; j >= 0; j--) {
   2065            res <<= 1;
   2066            v = pcmp_val(s, ctrl, j);
   2067            for (i = validd; i >= 0; i--) {
   2068                res |= (v == pcmp_val(d, ctrl, i));
   2069            }
   2070        }
   2071        break;
   2072    case 1:
   2073        for (j = valids; j >= 0; j--) {
   2074            res <<= 1;
   2075            v = pcmp_val(s, ctrl, j);
   2076            for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
   2077                res |= (pcmp_val(d, ctrl, i - 0) >= v &&
   2078                        pcmp_val(d, ctrl, i - 1) <= v);
   2079            }
   2080        }
   2081        break;
   2082    case 2:
   2083        res = (1 << (upper - MAX(valids, validd))) - 1;
   2084        res <<= MAX(valids, validd) - MIN(valids, validd);
   2085        for (i = MIN(valids, validd); i >= 0; i--) {
   2086            res <<= 1;
   2087            v = pcmp_val(s, ctrl, i);
   2088            res |= (v == pcmp_val(d, ctrl, i));
   2089        }
   2090        break;
   2091    case 3:
   2092        if (validd == -1) {
   2093            res = (2 << upper) - 1;
   2094            break;
   2095        }
   2096        for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
   2097            res <<= 1;
   2098            v = 1;
   2099            for (i = MIN(valids - j, validd); i >= 0; i--) {
   2100                v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
   2101            }
   2102            res |= v;
   2103        }
   2104        break;
   2105    }
   2106
   2107    switch ((ctrl >> 4) & 3) {
   2108    case 1:
   2109        res ^= (2 << upper) - 1;
   2110        break;
   2111    case 3:
   2112        res ^= (1 << (valids + 1)) - 1;
   2113        break;
   2114    }
   2115
   2116    if (res) {
   2117        CC_SRC |= CC_C;
   2118    }
   2119    if (res & 1) {
   2120        CC_SRC |= CC_O;
   2121    }
   2122
   2123    return res;
   2124}
   2125
   2126void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2127                                    uint32_t ctrl)
   2128{
   2129    unsigned int res = pcmpxstrx(env, d, s, ctrl,
   2130                                 pcmp_elen(env, R_EDX, ctrl),
   2131                                 pcmp_elen(env, R_EAX, ctrl));
   2132
   2133    if (res) {
   2134        env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
   2135    } else {
   2136        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
   2137    }
   2138}
   2139
   2140void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2141                                    uint32_t ctrl)
   2142{
   2143    int i;
   2144    unsigned int res = pcmpxstrx(env, d, s, ctrl,
   2145                                 pcmp_elen(env, R_EDX, ctrl),
   2146                                 pcmp_elen(env, R_EAX, ctrl));
   2147
   2148    if ((ctrl >> 6) & 1) {
   2149        if (ctrl & 1) {
   2150            for (i = 0; i < 8; i++, res >>= 1) {
   2151                env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
   2152            }
   2153        } else {
   2154            for (i = 0; i < 16; i++, res >>= 1) {
   2155                env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
   2156            }
   2157        }
   2158    } else {
   2159        env->xmm_regs[0].Q(1) = 0;
   2160        env->xmm_regs[0].Q(0) = res;
   2161    }
   2162}
   2163
   2164void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2165                                    uint32_t ctrl)
   2166{
   2167    unsigned int res = pcmpxstrx(env, d, s, ctrl,
   2168                                 pcmp_ilen(s, ctrl),
   2169                                 pcmp_ilen(d, ctrl));
   2170
   2171    if (res) {
   2172        env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
   2173    } else {
   2174        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
   2175    }
   2176}
   2177
   2178void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2179                                    uint32_t ctrl)
   2180{
   2181    int i;
   2182    unsigned int res = pcmpxstrx(env, d, s, ctrl,
   2183                                 pcmp_ilen(s, ctrl),
   2184                                 pcmp_ilen(d, ctrl));
   2185
   2186    if ((ctrl >> 6) & 1) {
   2187        if (ctrl & 1) {
   2188            for (i = 0; i < 8; i++, res >>= 1) {
   2189                env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
   2190            }
   2191        } else {
   2192            for (i = 0; i < 16; i++, res >>= 1) {
   2193                env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
   2194            }
   2195        }
   2196    } else {
   2197        env->xmm_regs[0].Q(1) = 0;
   2198        env->xmm_regs[0].Q(0) = res;
   2199    }
   2200}
   2201
   2202#define CRCPOLY        0x1edc6f41
   2203#define CRCPOLY_BITREV 0x82f63b78
   2204target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
   2205{
   2206    target_ulong crc = (msg & ((target_ulong) -1 >>
   2207                               (TARGET_LONG_BITS - len))) ^ crc1;
   2208
   2209    while (len--) {
   2210        crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
   2211    }
   2212
   2213    return crc;
   2214}
   2215
   2216void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2217                                    uint32_t ctrl)
   2218{
   2219    uint64_t ah, al, b, resh, resl;
   2220
   2221    ah = 0;
   2222    al = d->Q((ctrl & 1) != 0);
   2223    b = s->Q((ctrl & 16) != 0);
   2224    resh = resl = 0;
   2225
   2226    while (b) {
   2227        if (b & 1) {
   2228            resl ^= al;
   2229            resh ^= ah;
   2230        }
   2231        ah = (ah << 1) | (al >> 63);
   2232        al <<= 1;
   2233        b >>= 1;
   2234    }
   2235
   2236    d->Q(0) = resl;
   2237    d->Q(1) = resh;
   2238}
   2239
   2240void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   2241{
   2242    int i;
   2243    Reg st = *d;
   2244    Reg rk = *s;
   2245
   2246    for (i = 0 ; i < 4 ; i++) {
   2247        d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^
   2248                                    AES_Td1[st.B(AES_ishifts[4*i+1])] ^
   2249                                    AES_Td2[st.B(AES_ishifts[4*i+2])] ^
   2250                                    AES_Td3[st.B(AES_ishifts[4*i+3])]);
   2251    }
   2252}
   2253
   2254void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   2255{
   2256    int i;
   2257    Reg st = *d;
   2258    Reg rk = *s;
   2259
   2260    for (i = 0; i < 16; i++) {
   2261        d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]);
   2262    }
   2263}
   2264
   2265void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   2266{
   2267    int i;
   2268    Reg st = *d;
   2269    Reg rk = *s;
   2270
   2271    for (i = 0 ; i < 4 ; i++) {
   2272        d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^
   2273                                    AES_Te1[st.B(AES_shifts[4*i+1])] ^
   2274                                    AES_Te2[st.B(AES_shifts[4*i+2])] ^
   2275                                    AES_Te3[st.B(AES_shifts[4*i+3])]);
   2276    }
   2277}
   2278
   2279void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   2280{
   2281    int i;
   2282    Reg st = *d;
   2283    Reg rk = *s;
   2284
   2285    for (i = 0; i < 16; i++) {
   2286        d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]);
   2287    }
   2288
   2289}
   2290
   2291void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
   2292{
   2293    int i;
   2294    Reg tmp = *s;
   2295
   2296    for (i = 0 ; i < 4 ; i++) {
   2297        d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^
   2298                          AES_imc[tmp.B(4*i+1)][1] ^
   2299                          AES_imc[tmp.B(4*i+2)][2] ^
   2300                          AES_imc[tmp.B(4*i+3)][3]);
   2301    }
   2302}
   2303
   2304void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
   2305                                          uint32_t ctrl)
   2306{
   2307    int i;
   2308    Reg tmp = *s;
   2309
   2310    for (i = 0 ; i < 4 ; i++) {
   2311        d->B(i) = AES_sbox[tmp.B(i + 4)];
   2312        d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
   2313    }
   2314    d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
   2315    d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
   2316}
   2317#endif
   2318
   2319#undef SHIFT
   2320#undef XMM_ONLY
   2321#undef Reg
   2322#undef B
   2323#undef W
   2324#undef L
   2325#undef Q
   2326#undef SUFFIX