cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

neon_helper.c (45008B)


      1/*
      2 * ARM NEON vector operations.
      3 *
      4 * Copyright (c) 2007, 2008 CodeSourcery.
      5 * Written by Paul Brook
      6 *
      7 * This code is licensed under the GNU GPL v2.
      8 */
      9#include "qemu/osdep.h"
     10
     11#include "cpu.h"
     12#include "exec/helper-proto.h"
     13#include "fpu/softfloat.h"
     14#include "vec_internal.h"
     15
     16#define SIGNBIT (uint32_t)0x80000000
     17#define SIGNBIT64 ((uint64_t)1 << 63)
     18
     19#define SET_QC() env->vfp.qc[0] = 1
     20
/*
 * Lane-structure generators.  NEON_TYPEn(name, type) defines a struct type
 * neon_<name> holding n lanes of `type` in fields v1..vn.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
/*
 * On big-endian hosts the fields are declared in reverse order so that,
 * when the struct is overlaid on a uint32_t (see NEON_UNPACK/NEON_PACK),
 * v1 still corresponds to the least significant bits of the integer.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Four 8-bit, two 16-bit or one 32-bit lane(s): each type is 32 bits wide. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
/* The generators are only needed for the six typedefs above. */
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
     67
/*
 * Copy from a uint32_t to a vector structure type.
 * The bits of `val` are reinterpreted through a union; `dest` must be an
 * lvalue of type `vtype`.
 */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/*
 * Copy from a vector structure type to a uint32_t.
 * Inverse of NEON_UNPACK: `val` is a `vtype` value and `dest` receives its
 * bits as a uint32_t.
 */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)
     87
/*
 * Elementwise expansion: NEON_DOn applies NEON_FN(dest, src1, src2) to each
 * of the n lanes of vdest/vsrc1/vsrc2.  NEON_FN is defined immediately
 * before each group of helper instantiations and #undef'd afterwards.
 */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Function body shared by NEON_VOP and NEON_VOP_ENV: unpack the uint32_t
 * parameters arg1/arg2 into lane structures, run NEON_FN on every lane and
 * return the repacked result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name>(arg1, arg2) operating on n lanes of vtype. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/*
 * As NEON_VOP, but the helper takes a leading CPUARMState *env so that
 * NEON_FN may reference env (for example through SET_QC()).
 */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
    119
    120/* Pairwise operations.  */
    121/* For 32-bit elements each segment only contains a single element, so
    122   the elementwise and pairwise operations are the same.  */
    123#define NEON_PDO2 \
    124    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    125    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
    126#define NEON_PDO4 \
    127    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    128    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    129    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    130    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
    131
    132#define NEON_POP(name, vtype, n) \
    133uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
    134{ \
    135    uint32_t res; \
    136    vtype vsrc1; \
    137    vtype vsrc2; \
    138    vtype vdest; \
    139    NEON_UNPACK(vtype, vsrc1, arg1); \
    140    NEON_UNPACK(vtype, vsrc2, arg2); \
    141    NEON_PDO##n; \
    142    NEON_PACK(vtype, res, vdest); \
    143    return res; \
    144}
    145
/* Unary operators.  */
/*
 * Define helper neon_<name>(arg) applying NEON_FN to each of the n lanes.
 * The NEON_DOn expansion still passes vsrc2.vK as a third argument, but no
 * vsrc2 is declared here: NEON_FN for unary operations must therefore
 * ignore its third parameter so that the token never reaches the compiler.
 * The parameter `arg` is reused to hold the packed result.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
    157
    158
/*
 * Unsigned saturating add for 8/16-bit lanes.  The sum is formed in 32
 * bits; if it does not survive truncation to `type` the lane is set to all
 * ones and SET_QC() is invoked.
 */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT
    174
    175uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
    176{
    177    uint32_t res = a + b;
    178    if (res < a) {
    179        SET_QC();
    180        res = ~0;
    181    }
    182    return res;
    183}
    184
    185uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    186{
    187    uint64_t res;
    188
    189    res = src1 + src2;
    190    if (res < src1) {
    191        SET_QC();
    192        res = ~(uint64_t)0;
    193    }
    194    return res;
    195}
    196
/*
 * Signed saturating add for 8/16-bit lanes.  The sum is formed in 32 bits;
 * if it does not fit in `type`, SET_QC() is invoked and the result is
 * clamped.  The clamp direction follows the sign of src2: a positive src2
 * gives the largest value of `type`, otherwise 1 << (width - 1) is stored,
 * which becomes the most negative value once truncated into the lane.
 */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT
    216
    217uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
    218{
    219    uint32_t res = a + b;
    220    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
    221        SET_QC();
    222        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    223    }
    224    return res;
    225}
    226
    227uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    228{
    229    uint64_t res;
    230
    231    res = src1 + src2;
    232    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
    233        SET_QC();
    234        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    235    }
    236    return res;
    237}
    238
/* Unsigned saturating accumulate of signed value
 *
 * Op1/Rn is treated as signed
 * Op2/Rd is treated as unsigned
 *
 * Explicit casting is used to ensure the correct sign extension of
 * inputs. The result is treated as an unsigned value and saturated as such.
 *
 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 *
 * USATACC(bits, shift) processes the `bits`-wide lane starting at bit
 * `shift` of the enclosing function's a and b, clamps the sum to
 * [0, UINT<bits>_MAX] (setting QC when clamping) and deposits it into r.
 */

#define USATACC(bits, shift) \
    do { \
        va = sextract32(a, shift, bits);                                \
        vb = extract32(b, shift, bits);                                 \
        vr = va + vb;                                                   \
        if (vr > UINT##bits##_MAX) {                                    \
            SET_QC();                                                   \
            vr = UINT##bits##_MAX;                                      \
        } else if (vr < 0) {                                            \
            SET_QC();                                                   \
            vr = 0;                                                     \
        }                                                               \
        r = deposit32(r, shift, bits, vr);                              \
   } while (0)
    265
    266uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
    267{
    268    int16_t va, vb, vr;
    269    uint32_t r = 0;
    270
    271    USATACC(8, 0);
    272    USATACC(8, 8);
    273    USATACC(8, 16);
    274    USATACC(8, 24);
    275    return r;
    276}
    277
    278uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
    279{
    280    int32_t va, vb, vr;
    281    uint64_t r = 0;
    282
    283    USATACC(16, 0);
    284    USATACC(16, 16);
    285    return r;
    286}
    287
    288#undef USATACC
    289
    290uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
    291{
    292    int64_t va = (int32_t)a;
    293    int64_t vb = (uint32_t)b;
    294    int64_t vr = va + vb;
    295    if (vr > UINT32_MAX) {
    296        SET_QC();
    297        vr = UINT32_MAX;
    298    } else if (vr < 0) {
    299        SET_QC();
    300        vr = 0;
    301    }
    302    return vr;
    303}
    304
    305uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
    306{
    307    uint64_t res;
    308    res = a + b;
    309    /* We only need to look at the pattern of SIGN bits to detect
    310     * +ve/-ve saturation
    311     */
    312    if (~a & b & ~res & SIGNBIT64) {
    313        SET_QC();
    314        res = UINT64_MAX;
    315    } else if (a & ~b & res & SIGNBIT64) {
    316        SET_QC();
    317        res = 0;
    318    }
    319    return res;
    320}
    321
/* Signed saturating accumulate of unsigned value
 *
 * Op1/Rn is treated as unsigned
 * Op2/Rd is treated as signed
 *
 * The result is treated as a signed value and saturated as such
 *
 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 *
 * SSATACC(bits, shift) processes the `bits`-wide lane starting at bit
 * `shift` of the enclosing function's a and b, clamps the sum to
 * [INT<bits>_MIN, INT<bits>_MAX] (setting QC when clamping) and deposits it
 * into r.
 */

#define SSATACC(bits, shift) \
    do { \
        va = extract32(a, shift, bits);                                 \
        vb = sextract32(b, shift, bits);                                \
        vr = va + vb;                                                   \
        if (vr > INT##bits##_MAX) {                                     \
            SET_QC();                                                   \
            vr = INT##bits##_MAX;                                       \
        } else if (vr < INT##bits##_MIN) {                              \
            SET_QC();                                                   \
            vr = INT##bits##_MIN;                                       \
        }                                                               \
        r = deposit32(r, shift, bits, vr);                              \
    } while (0)
    347
    348uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
    349{
    350    int16_t va, vb, vr;
    351    uint32_t r = 0;
    352
    353    SSATACC(8, 0);
    354    SSATACC(8, 8);
    355    SSATACC(8, 16);
    356    SSATACC(8, 24);
    357    return r;
    358}
    359
    360uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
    361{
    362    int32_t va, vb, vr;
    363    uint32_t r = 0;
    364
    365    SSATACC(16, 0);
    366    SSATACC(16, 16);
    367
    368    return r;
    369}
    370
    371#undef SSATACC
    372
    373uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
    374{
    375    int64_t res;
    376    int64_t op1 = (uint32_t)a;
    377    int64_t op2 = (int32_t)b;
    378    res = op1 + op2;
    379    if (res > INT32_MAX) {
    380        SET_QC();
    381        res = INT32_MAX;
    382    } else if (res < INT32_MIN) {
    383        SET_QC();
    384        res = INT32_MIN;
    385    }
    386    return res;
    387}
    388
    389uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
    390{
    391    uint64_t res;
    392    res = a + b;
    393    /* We only need to look at the pattern of SIGN bits to detect an overflow */
    394    if (((a & res)
    395         | (~b & res)
    396         | (a & ~b)) & SIGNBIT64) {
    397        SET_QC();
    398        res = INT64_MAX;
    399    }
    400    return res;
    401}
    402
    403
/*
 * Unsigned saturating subtract for 8/16-bit lanes.  The difference is
 * formed modulo 2^32; a borrow leaves a value that does not survive
 * truncation to `type`, in which case the lane becomes 0 and SET_QC() is
 * invoked.
 */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT
    419
    420uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
    421{
    422    uint32_t res = a - b;
    423    if (res > a) {
    424        SET_QC();
    425        res = 0;
    426    }
    427    return res;
    428}
    429
    430uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    431{
    432    uint64_t res;
    433
    434    if (src1 < src2) {
    435        SET_QC();
    436        res = 0;
    437    } else {
    438        res = src1 - src2;
    439    }
    440    return res;
    441}
    442
/*
 * Signed saturating subtract for 8/16-bit lanes.  The difference is formed
 * in 32 bits; if it does not fit in `type`, SET_QC() is invoked and the
 * result is clamped.  The clamp direction follows the sign of src2: a
 * negative src2 gives the largest value of `type`, otherwise
 * 1 << (width - 1) is stored, which becomes the most negative value once
 * truncated into the lane.
 */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT
    462
    463uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
    464{
    465    uint32_t res = a - b;
    466    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
    467        SET_QC();
    468        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    469    }
    470    return res;
    471}
    472
    473uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    474{
    475    uint64_t res;
    476
    477    res = src1 - src2;
    478    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
    479        SET_QC();
    480        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    481    }
    482    return res;
    483}
    484
/*
 * Halving add for 8/16-bit lanes: (src1 + src2) >> 1.  The lane values are
 * promoted to int before the addition, so the intermediate sum cannot
 * overflow for these widths.
 */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN
    491
    492int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
    493{
    494    int32_t dest;
    495
    496    dest = (src1 >> 1) + (src2 >> 1);
    497    if (src1 & src2 & 1)
    498        dest++;
    499    return dest;
    500}
    501
    502uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
    503{
    504    uint32_t dest;
    505
    506    dest = (src1 >> 1) + (src2 >> 1);
    507    if (src1 & src2 & 1)
    508        dest++;
    509    return dest;
    510}
    511
/*
 * Rounding halving add for 8/16-bit lanes: (src1 + src2 + 1) >> 1, computed
 * after promotion of the lane values to int.
 */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN
    518
    519int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
    520{
    521    int32_t dest;
    522
    523    dest = (src1 >> 1) + (src2 >> 1);
    524    if ((src1 | src2) & 1)
    525        dest++;
    526    return dest;
    527}
    528
    529uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
    530{
    531    uint32_t dest;
    532
    533    dest = (src1 >> 1) + (src2 >> 1);
    534    if ((src1 | src2) & 1)
    535        dest++;
    536    return dest;
    537}
    538
/*
 * Halving subtract for 8/16-bit lanes: (src1 - src2) >> 1, computed after
 * promotion of the lane values to int.
 */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN
    545
    546int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
    547{
    548    int32_t dest;
    549
    550    dest = (src1 >> 1) - (src2 >> 1);
    551    if ((~src1) & src2 & 1)
    552        dest--;
    553    return dest;
    554}
    555
    556uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
    557{
    558    uint32_t dest;
    559
    560    dest = (src1 >> 1) - (src2 >> 1);
    561    if ((~src1) & src2 & 1)
    562        dest--;
    563    return dest;
    564}
    565
/* Pairwise minimum: each result lane is the smaller of two adjacent lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum: each result lane is the larger of two adjacent lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
    579
/*
 * Variable shift helpers without saturation reporting (the last argument
 * is NULL).  In every helper only the low 8 bits of the shift operand are
 * used, interpreted as a signed count via the (int8_t) cast.
 *
 * NOTE(review): do_uqrshl_bhs(), do_sqrshl_bhs(), do_uqrshl_d() and
 * do_sqrshl_d() are declared outside this file.  From the call patterns
 * here their arguments appear to be (value, shift, element width in bits,
 * rounding flag, saturation-flag pointer), with the width omitted for the
 * _d variants -- confirm against their declarations.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed variants with the fourth argument set to true. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned variants with the fourth argument set to true. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
    629
/*
 * Unsigned variable shifts that pass env->vfp.qc as the last argument and
 * false as the fourth.  Only the low 8 bits of the shift operand are used,
 * interpreted as a signed count.
 * NOTE(review): presumably the callee records saturation through the qc
 * pointer -- confirm against the declaration of do_uqrshl_bhs()/_d().
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
    649
/*
 * Signed variable shifts that pass env->vfp.qc as the last argument and
 * false as the fourth.  Only the low 8 bits of the shift operand are used,
 * interpreted as a signed count.
 * NOTE(review): presumably the callee records saturation through the qc
 * pointer -- confirm against the declaration of do_sqrshl_bhs()/_d().
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
    669
/*
 * Variable shifts on signed lane types routed through do_suqrshl_bhs()/_d()
 * with env->vfp.qc as the last argument and false as the fourth.  Only the
 * low 8 bits of the shift operand are used, interpreted as a signed count.
 * NOTE(review): the do_suqrshl_* helpers are declared outside this file;
 * from the "qshlu" naming they presumably take a signed input and saturate
 * to an unsigned range -- confirm against their declarations.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
    689
/*
 * Unsigned variable shifts that pass env->vfp.qc as the last argument and
 * true as the fourth.  Only the low 8 bits of the shift operand are used,
 * interpreted as a signed count.
 * NOTE(review): presumably the fourth argument enables rounding and the
 * callee records saturation through the qc pointer -- confirm against the
 * declaration of do_uqrshl_bhs()/_d().
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
    709
/*
 * Signed variable shifts that pass env->vfp.qc as the last argument and
 * true as the fourth.  Only the low 8 bits of the shift operand are used,
 * interpreted as a signed count.
 * NOTE(review): presumably the fourth argument enables rounding and the
 * callee records saturation through the qc pointer -- confirm against the
 * declaration of do_sqrshl_bhs()/_d().
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
    729
    730uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
    731{
    732    uint32_t mask;
    733    mask = (a ^ b) & 0x80808080u;
    734    a &= ~0x80808080u;
    735    b &= ~0x80808080u;
    736    return (a + b) ^ mask;
    737}
    738
    739uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
    740{
    741    uint32_t mask;
    742    mask = (a ^ b) & 0x80008000u;
    743    a &= ~0x80008000u;
    744    b &= ~0x80008000u;
    745    return (a + b) ^ mask;
    746}
    747
/* Pairwise add: each result lane is the sum of two adjacent source lanes. */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise subtract; the result is truncated to the lane width. */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply; the result is truncated to the lane width. */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise bit test: all ones if the operands share any set bit, else 0. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
    768
    769/* Count Leading Sign/Zero Bits.  */
    770static inline int do_clz8(uint8_t x)
    771{
    772    int n;
    773    for (n = 8; x; n--)
    774        x >>= 1;
    775    return n;
    776}
    777
    778static inline int do_clz16(uint16_t x)
    779{
    780    int n;
    781    for (n = 16; x; n--)
    782        x >>= 1;
    783    return n;
    784}
    785
/* Lane-wise count of leading zero bits; the third NEON_FN argument is unused. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Lane-wise count of leading sign bits.  Negative inputs are complemented
 * so their leading ones become leading zeros; one is subtracted from the
 * leading-zero count so the sign bit itself is not counted.
 */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
    801
    802uint32_t HELPER(neon_cls_s32)(uint32_t x)
    803{
    804    int count;
    805    if ((int32_t)x < 0)
    806        x = ~x;
    807    for (count = 32; x; count--)
    808        x = x >> 1;
    809    return count - 1;
    810}
    811
    812/* Bit count.  */
    813uint32_t HELPER(neon_cnt_u8)(uint32_t x)
    814{
    815    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
    816    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
    817    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
    818    return x;
    819}
    820
    821/* Reverse bits in each 8 bit word */
    822uint32_t HELPER(neon_rbit_u8)(uint32_t x)
    823{
    824    x =  ((x & 0xf0f0f0f0) >> 4)
    825       | ((x & 0x0f0f0f0f) << 4);
    826    x =  ((x & 0x88888888) >> 3)
    827       | ((x & 0x44444444) >> 1)
    828       | ((x & 0x22222222) << 1)
    829       | ((x & 0x11111111) << 3);
    830    return x;
    831}
    832
/*
 * Saturating doubling multiply returning the high half, 16-bit lanes.
 *
 * The 16x16 product is held in a uint32_t.  Doubling it overflows exactly
 * when bits 31 and 30 differ; in that case SET_QC() is invoked and the
 * value is replaced instead of shifted.  For 16-bit inputs the only
 * product that triggers this is INT16_MIN * INT16_MIN (0x40000000), which
 * yields 0x7fffffff.
 *
 * With `round` non-zero, 1 << 15 is added; if that makes the value compare
 * smaller as a signed number, SET_QC() is invoked and the value becomes
 * SIGNBIT - 1.  The lane receives the upper 16 bits of the result.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16
    858
/*
 * Saturating doubling multiply returning the high half, 32-bit lanes.
 *
 * The 32x32 product is held in a uint64_t.  Doubling it overflows exactly
 * when bits 63 and 62 differ; in that case SET_QC() is invoked and the
 * value is replaced instead of shifted.  For 32-bit inputs the only
 * product that triggers this is INT32_MIN * INT32_MIN, which yields
 * 0x7fffffffffffffff.
 *
 * With `round` non-zero, 1 << 31 is added; if that makes the value compare
 * smaller as a signed number, SET_QC() is invoked and the value becomes
 * SIGNBIT64 - 1.  The lane receives the upper 32 bits of the result.
 */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
    884
    885uint32_t HELPER(neon_narrow_u8)(uint64_t x)
    886{
    887    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
    888           | ((x >> 24) & 0xff000000u);
    889}
    890
    891uint32_t HELPER(neon_narrow_u16)(uint64_t x)
    892{
    893    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
    894}
    895
    896uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
    897{
    898    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
    899            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
    900}
    901
    902uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
    903{
    904    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
    905}
    906
    907uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
    908{
    909    x &= 0xff80ff80ff80ff80ull;
    910    x += 0x0080008000800080ull;
    911    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
    912            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
    913}
    914
    915uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
    916{
    917    x &= 0xffff8000ffff8000ull;
    918    x += 0x0000800000008000ull;
    919    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
    920}
    921
    922uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
    923{
    924    uint16_t s;
    925    uint8_t d;
    926    uint32_t res = 0;
    927#define SAT8(n) \
    928    s = x >> n; \
    929    if (s & 0x8000) { \
    930        SET_QC(); \
    931    } else { \
    932        if (s > 0xff) { \
    933            d = 0xff; \
    934            SET_QC(); \
    935        } else  { \
    936            d = s; \
    937        } \
    938        res |= (uint32_t)d << (n / 2); \
    939    }
    940
    941    SAT8(0);
    942    SAT8(16);
    943    SAT8(32);
    944    SAT8(48);
    945#undef SAT8
    946    return res;
    947}
    948
    949uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
    950{
    951    uint16_t s;
    952    uint8_t d;
    953    uint32_t res = 0;
    954#define SAT8(n) \
    955    s = x >> n; \
    956    if (s > 0xff) { \
    957        d = 0xff; \
    958        SET_QC(); \
    959    } else  { \
    960        d = s; \
    961    } \
    962    res |= (uint32_t)d << (n / 2);
    963
    964    SAT8(0);
    965    SAT8(16);
    966    SAT8(32);
    967    SAT8(48);
    968#undef SAT8
    969    return res;
    970}
    971
    972uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
    973{
    974    int16_t s;
    975    uint8_t d;
    976    uint32_t res = 0;
    977#define SAT8(n) \
    978    s = x >> n; \
    979    if (s != (int8_t)s) { \
    980        d = (s >> 15) ^ 0x7f; \
    981        SET_QC(); \
    982    } else  { \
    983        d = s; \
    984    } \
    985    res |= (uint32_t)d << (n / 2);
    986
    987    SAT8(0);
    988    SAT8(16);
    989    SAT8(32);
    990    SAT8(48);
    991#undef SAT8
    992    return res;
    993}
    994
    995uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
    996{
    997    uint32_t high;
    998    uint32_t low;
    999    low = x;
   1000    if (low & 0x80000000) {
   1001        low = 0;
   1002        SET_QC();
   1003    } else if (low > 0xffff) {
   1004        low = 0xffff;
   1005        SET_QC();
   1006    }
   1007    high = x >> 32;
   1008    if (high & 0x80000000) {
   1009        high = 0;
   1010        SET_QC();
   1011    } else if (high > 0xffff) {
   1012        high = 0xffff;
   1013        SET_QC();
   1014    }
   1015    return low | (high << 16);
   1016}
   1017
   1018uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
   1019{
   1020    uint32_t high;
   1021    uint32_t low;
   1022    low = x;
   1023    if (low > 0xffff) {
   1024        low = 0xffff;
   1025        SET_QC();
   1026    }
   1027    high = x >> 32;
   1028    if (high > 0xffff) {
   1029        high = 0xffff;
   1030        SET_QC();
   1031    }
   1032    return low | (high << 16);
   1033}
   1034
   1035uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
   1036{
   1037    int32_t low;
   1038    int32_t high;
   1039    low = x;
   1040    if (low != (int16_t)low) {
   1041        low = (low >> 31) ^ 0x7fff;
   1042        SET_QC();
   1043    }
   1044    high = x >> 32;
   1045    if (high != (int16_t)high) {
   1046        high = (high >> 31) ^ 0x7fff;
   1047        SET_QC();
   1048    }
   1049    return (uint16_t)low | (high << 16);
   1050}
   1051
   1052uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
   1053{
   1054    if (x & 0x8000000000000000ull) {
   1055        SET_QC();
   1056        return 0;
   1057    }
   1058    if (x > 0xffffffffu) {
   1059        SET_QC();
   1060        return 0xffffffffu;
   1061    }
   1062    return x;
   1063}
   1064
   1065uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
   1066{
   1067    if (x > 0xffffffffu) {
   1068        SET_QC();
   1069        return 0xffffffffu;
   1070    }
   1071    return x;
   1072}
   1073
   1074uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
   1075{
   1076    if ((int64_t)x != (int32_t)x) {
   1077        SET_QC();
   1078        return ((int64_t)x >> 63) ^ 0x7fffffff;
   1079    }
   1080    return x;
   1081}
   1082
   1083uint64_t HELPER(neon_widen_u8)(uint32_t x)
   1084{
   1085    uint64_t tmp;
   1086    uint64_t ret;
   1087    ret = (uint8_t)x;
   1088    tmp = (uint8_t)(x >> 8);
   1089    ret |= tmp << 16;
   1090    tmp = (uint8_t)(x >> 16);
   1091    ret |= tmp << 32;
   1092    tmp = (uint8_t)(x >> 24);
   1093    ret |= tmp << 48;
   1094    return ret;
   1095}
   1096
   1097uint64_t HELPER(neon_widen_s8)(uint32_t x)
   1098{
   1099    uint64_t tmp;
   1100    uint64_t ret;
   1101    ret = (uint16_t)(int8_t)x;
   1102    tmp = (uint16_t)(int8_t)(x >> 8);
   1103    ret |= tmp << 16;
   1104    tmp = (uint16_t)(int8_t)(x >> 16);
   1105    ret |= tmp << 32;
   1106    tmp = (uint16_t)(int8_t)(x >> 24);
   1107    ret |= tmp << 48;
   1108    return ret;
   1109}
   1110
   1111uint64_t HELPER(neon_widen_u16)(uint32_t x)
   1112{
   1113    uint64_t high = (uint16_t)(x >> 16);
   1114    return ((uint16_t)x) | (high << 32);
   1115}
   1116
   1117uint64_t HELPER(neon_widen_s16)(uint32_t x)
   1118{
   1119    uint64_t high = (int16_t)(x >> 16);
   1120    return ((uint32_t)(int16_t)x) | (high << 32);
   1121}
   1122
   1123uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
   1124{
   1125    uint64_t mask;
   1126    mask = (a ^ b) & 0x8000800080008000ull;
   1127    a &= ~0x8000800080008000ull;
   1128    b &= ~0x8000800080008000ull;
   1129    return (a + b) ^ mask;
   1130}
   1131
   1132uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
   1133{
   1134    uint64_t mask;
   1135    mask = (a ^ b) & 0x8000000080000000ull;
   1136    a &= ~0x8000000080000000ull;
   1137    b &= ~0x8000000080000000ull;
   1138    return (a + b) ^ mask;
   1139}
   1140
   1141uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
   1142{
   1143    uint64_t tmp;
   1144    uint64_t tmp2;
   1145
   1146    tmp = a & 0x0000ffff0000ffffull;
   1147    tmp += (a >> 16) & 0x0000ffff0000ffffull;
   1148    tmp2 = b & 0xffff0000ffff0000ull;
   1149    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
   1150    return    ( tmp         & 0xffff)
   1151            | ((tmp  >> 16) & 0xffff0000ull)
   1152            | ((tmp2 << 16) & 0xffff00000000ull)
   1153            | ( tmp2        & 0xffff000000000000ull);
   1154}
   1155
   1156uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
   1157{
   1158    uint32_t low = a + (a >> 32);
   1159    uint32_t high = b + (b >> 32);
   1160    return low + ((uint64_t)high << 32);
   1161}
   1162
   1163uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
   1164{
   1165    uint64_t mask;
   1166    mask = (a ^ ~b) & 0x8000800080008000ull;
   1167    a |= 0x8000800080008000ull;
   1168    b &= ~0x8000800080008000ull;
   1169    return (a - b) ^ mask;
   1170}
   1171
   1172uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
   1173{
   1174    uint64_t mask;
   1175    mask = (a ^ ~b) & 0x8000000080000000ull;
   1176    a |= 0x8000000080000000ull;
   1177    b &= ~0x8000000080000000ull;
   1178    return (a - b) ^ mask;
   1179}
   1180
   1181uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
   1182{
   1183    uint32_t x, y;
   1184    uint32_t low, high;
   1185
   1186    x = a;
   1187    y = b;
   1188    low = x + y;
   1189    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1190        SET_QC();
   1191        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1192    }
   1193    x = a >> 32;
   1194    y = b >> 32;
   1195    high = x + y;
   1196    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1197        SET_QC();
   1198        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1199    }
   1200    return low | ((uint64_t)high << 32);
   1201}
   1202
   1203uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
   1204{
   1205    uint64_t result;
   1206
   1207    result = a + b;
   1208    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
   1209        SET_QC();
   1210        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
   1211    }
   1212    return result;
   1213}
   1214
   1215/* We have to do the arithmetic in a larger type than
   1216 * the input type, because for example with a signed 32 bit
   1217 * op the absolute difference can overflow a signed 32 bit value.
   1218 */
   1219#define DO_ABD(dest, x, y, intype, arithtype) do {            \
   1220    arithtype tmp_x = (intype)(x);                            \
   1221    arithtype tmp_y = (intype)(y);                            \
   1222    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
   1223    } while(0)
   1224
   1225uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
   1226{
   1227    uint64_t tmp;
   1228    uint64_t result;
   1229    DO_ABD(result, a, b, uint8_t, uint32_t);
   1230    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
   1231    result |= tmp << 16;
   1232    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
   1233    result |= tmp << 32;
   1234    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
   1235    result |= tmp << 48;
   1236    return result;
   1237}
   1238
   1239uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
   1240{
   1241    uint64_t tmp;
   1242    uint64_t result;
   1243    DO_ABD(result, a, b, int8_t, int32_t);
   1244    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
   1245    result |= tmp << 16;
   1246    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
   1247    result |= tmp << 32;
   1248    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
   1249    result |= tmp << 48;
   1250    return result;
   1251}
   1252
   1253uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
   1254{
   1255    uint64_t tmp;
   1256    uint64_t result;
   1257    DO_ABD(result, a, b, uint16_t, uint32_t);
   1258    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1259    return result | (tmp << 32);
   1260}
   1261
   1262uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
   1263{
   1264    uint64_t tmp;
   1265    uint64_t result;
   1266    DO_ABD(result, a, b, int16_t, int32_t);
   1267    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
   1268    return result | (tmp << 32);
   1269}
   1270
   1271uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
   1272{
   1273    uint64_t result;
   1274    DO_ABD(result, a, b, uint32_t, uint64_t);
   1275    return result;
   1276}
   1277
   1278uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
   1279{
   1280    uint64_t result;
   1281    DO_ABD(result, a, b, int32_t, int64_t);
   1282    return result;
   1283}
   1284#undef DO_ABD
   1285
   1286/* Widening multiply. Named type is the source type.  */
   1287#define DO_MULL(dest, x, y, type1, type2) do { \
   1288    type1 tmp_x = x; \
   1289    type1 tmp_y = y; \
   1290    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
   1291    } while(0)
   1292
   1293uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
   1294{
   1295    uint64_t tmp;
   1296    uint64_t result;
   1297
   1298    DO_MULL(result, a, b, uint8_t, uint16_t);
   1299    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
   1300    result |= tmp << 16;
   1301    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
   1302    result |= tmp << 32;
   1303    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
   1304    result |= tmp << 48;
   1305    return result;
   1306}
   1307
   1308uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
   1309{
   1310    uint64_t tmp;
   1311    uint64_t result;
   1312
   1313    DO_MULL(result, a, b, int8_t, uint16_t);
   1314    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
   1315    result |= tmp << 16;
   1316    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
   1317    result |= tmp << 32;
   1318    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
   1319    result |= tmp << 48;
   1320    return result;
   1321}
   1322
   1323uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
   1324{
   1325    uint64_t tmp;
   1326    uint64_t result;
   1327
   1328    DO_MULL(result, a, b, uint16_t, uint32_t);
   1329    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1330    return result | (tmp << 32);
   1331}
   1332
   1333uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
   1334{
   1335    uint64_t tmp;
   1336    uint64_t result;
   1337
   1338    DO_MULL(result, a, b, int16_t, uint32_t);
   1339    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
   1340    return result | (tmp << 32);
   1341}
   1342
   1343uint64_t HELPER(neon_negl_u16)(uint64_t x)
   1344{
   1345    uint16_t tmp;
   1346    uint64_t result;
   1347    result = (uint16_t)-x;
   1348    tmp = -(x >> 16);
   1349    result |= (uint64_t)tmp << 16;
   1350    tmp = -(x >> 32);
   1351    result |= (uint64_t)tmp << 32;
   1352    tmp = -(x >> 48);
   1353    result |= (uint64_t)tmp << 48;
   1354    return result;
   1355}
   1356
   1357uint64_t HELPER(neon_negl_u32)(uint64_t x)
   1358{
   1359    uint32_t low = -x;
   1360    uint32_t high = -(x >> 32);
   1361    return low | ((uint64_t)high << 32);
   1362}
   1363
   1364/* Saturating sign manipulation.  */
   1365/* ??? Make these use NEON_VOP1 */
   1366#define DO_QABS8(x) do { \
   1367    if (x == (int8_t)0x80) { \
   1368        x = 0x7f; \
   1369        SET_QC(); \
   1370    } else if (x < 0) { \
   1371        x = -x; \
   1372    }} while (0)
   1373uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
   1374{
   1375    neon_s8 vec;
   1376    NEON_UNPACK(neon_s8, vec, x);
   1377    DO_QABS8(vec.v1);
   1378    DO_QABS8(vec.v2);
   1379    DO_QABS8(vec.v3);
   1380    DO_QABS8(vec.v4);
   1381    NEON_PACK(neon_s8, x, vec);
   1382    return x;
   1383}
   1384#undef DO_QABS8
   1385
   1386#define DO_QNEG8(x) do { \
   1387    if (x == (int8_t)0x80) { \
   1388        x = 0x7f; \
   1389        SET_QC(); \
   1390    } else { \
   1391        x = -x; \
   1392    }} while (0)
   1393uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
   1394{
   1395    neon_s8 vec;
   1396    NEON_UNPACK(neon_s8, vec, x);
   1397    DO_QNEG8(vec.v1);
   1398    DO_QNEG8(vec.v2);
   1399    DO_QNEG8(vec.v3);
   1400    DO_QNEG8(vec.v4);
   1401    NEON_PACK(neon_s8, x, vec);
   1402    return x;
   1403}
   1404#undef DO_QNEG8
   1405
   1406#define DO_QABS16(x) do { \
   1407    if (x == (int16_t)0x8000) { \
   1408        x = 0x7fff; \
   1409        SET_QC(); \
   1410    } else if (x < 0) { \
   1411        x = -x; \
   1412    }} while (0)
   1413uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
   1414{
   1415    neon_s16 vec;
   1416    NEON_UNPACK(neon_s16, vec, x);
   1417    DO_QABS16(vec.v1);
   1418    DO_QABS16(vec.v2);
   1419    NEON_PACK(neon_s16, x, vec);
   1420    return x;
   1421}
   1422#undef DO_QABS16
   1423
   1424#define DO_QNEG16(x) do { \
   1425    if (x == (int16_t)0x8000) { \
   1426        x = 0x7fff; \
   1427        SET_QC(); \
   1428    } else { \
   1429        x = -x; \
   1430    }} while (0)
   1431uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
   1432{
   1433    neon_s16 vec;
   1434    NEON_UNPACK(neon_s16, vec, x);
   1435    DO_QNEG16(vec.v1);
   1436    DO_QNEG16(vec.v2);
   1437    NEON_PACK(neon_s16, x, vec);
   1438    return x;
   1439}
   1440#undef DO_QNEG16
   1441
   1442uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
   1443{
   1444    if (x == SIGNBIT) {
   1445        SET_QC();
   1446        x = ~SIGNBIT;
   1447    } else if ((int32_t)x < 0) {
   1448        x = -x;
   1449    }
   1450    return x;
   1451}
   1452
   1453uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
   1454{
   1455    if (x == SIGNBIT) {
   1456        SET_QC();
   1457        x = ~SIGNBIT;
   1458    } else {
   1459        x = -x;
   1460    }
   1461    return x;
   1462}
   1463
   1464uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
   1465{
   1466    if (x == SIGNBIT64) {
   1467        SET_QC();
   1468        x = ~SIGNBIT64;
   1469    } else if ((int64_t)x < 0) {
   1470        x = -x;
   1471    }
   1472    return x;
   1473}
   1474
   1475uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
   1476{
   1477    if (x == SIGNBIT64) {
   1478        SET_QC();
   1479        x = ~SIGNBIT64;
   1480    } else {
   1481        x = -x;
   1482    }
   1483    return x;
   1484}
   1485
   1486/* NEON Float helpers.  */
   1487
   1488/* Floating point comparisons produce an integer result.
   1489 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
   1490 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
   1491 */
   1492uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
   1493{
   1494    float_status *fpst = fpstp;
   1495    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
   1496}
   1497
   1498uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
   1499{
   1500    float_status *fpst = fpstp;
   1501    return -float32_le(make_float32(b), make_float32(a), fpst);
   1502}
   1503
   1504uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
   1505{
   1506    float_status *fpst = fpstp;
   1507    return -float32_lt(make_float32(b), make_float32(a), fpst);
   1508}
   1509
   1510uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
   1511{
   1512    float_status *fpst = fpstp;
   1513    float32 f0 = float32_abs(make_float32(a));
   1514    float32 f1 = float32_abs(make_float32(b));
   1515    return -float32_le(f1, f0, fpst);
   1516}
   1517
   1518uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
   1519{
   1520    float_status *fpst = fpstp;
   1521    float32 f0 = float32_abs(make_float32(a));
   1522    float32 f1 = float32_abs(make_float32(b));
   1523    return -float32_lt(f1, f0, fpst);
   1524}
   1525
   1526uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
   1527{
   1528    float_status *fpst = fpstp;
   1529    float64 f0 = float64_abs(make_float64(a));
   1530    float64 f1 = float64_abs(make_float64(b));
   1531    return -float64_le(f1, f0, fpst);
   1532}
   1533
   1534uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
   1535{
   1536    float_status *fpst = fpstp;
   1537    float64 f0 = float64_abs(make_float64(a));
   1538    float64 f1 = float64_abs(make_float64(b));
   1539    return -float64_lt(f1, f0, fpst);
   1540}
   1541
   1542#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
   1543
   1544void HELPER(neon_qunzip8)(void *vd, void *vm)
   1545{
   1546    uint64_t *rd = vd, *rm = vm;
   1547    uint64_t zd0 = rd[0], zd1 = rd[1];
   1548    uint64_t zm0 = rm[0], zm1 = rm[1];
   1549
   1550    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
   1551        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
   1552        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
   1553        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
   1554    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
   1555        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
   1556        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
   1557        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
   1558    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
   1559        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
   1560        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
   1561        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
   1562    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
   1563        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
   1564        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
   1565        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
   1566
   1567    rm[0] = m0;
   1568    rm[1] = m1;
   1569    rd[0] = d0;
   1570    rd[1] = d1;
   1571}
   1572
   1573void HELPER(neon_qunzip16)(void *vd, void *vm)
   1574{
   1575    uint64_t *rd = vd, *rm = vm;
   1576    uint64_t zd0 = rd[0], zd1 = rd[1];
   1577    uint64_t zm0 = rm[0], zm1 = rm[1];
   1578
   1579    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
   1580        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
   1581    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
   1582        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
   1583    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
   1584        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
   1585    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
   1586        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
   1587
   1588    rm[0] = m0;
   1589    rm[1] = m1;
   1590    rd[0] = d0;
   1591    rd[1] = d1;
   1592}
   1593
   1594void HELPER(neon_qunzip32)(void *vd, void *vm)
   1595{
   1596    uint64_t *rd = vd, *rm = vm;
   1597    uint64_t zd0 = rd[0], zd1 = rd[1];
   1598    uint64_t zm0 = rm[0], zm1 = rm[1];
   1599
   1600    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
   1601    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
   1602    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
   1603    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
   1604
   1605    rm[0] = m0;
   1606    rm[1] = m1;
   1607    rd[0] = d0;
   1608    rd[1] = d1;
   1609}
   1610
   1611void HELPER(neon_unzip8)(void *vd, void *vm)
   1612{
   1613    uint64_t *rd = vd, *rm = vm;
   1614    uint64_t zd = rd[0], zm = rm[0];
   1615
   1616    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
   1617        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
   1618        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
   1619        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
   1620    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
   1621        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
   1622        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
   1623        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
   1624
   1625    rm[0] = m0;
   1626    rd[0] = d0;
   1627}
   1628
   1629void HELPER(neon_unzip16)(void *vd, void *vm)
   1630{
   1631    uint64_t *rd = vd, *rm = vm;
   1632    uint64_t zd = rd[0], zm = rm[0];
   1633
   1634    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
   1635        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
   1636    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
   1637        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
   1638
   1639    rm[0] = m0;
   1640    rd[0] = d0;
   1641}
   1642
   1643void HELPER(neon_qzip8)(void *vd, void *vm)
   1644{
   1645    uint64_t *rd = vd, *rm = vm;
   1646    uint64_t zd0 = rd[0], zd1 = rd[1];
   1647    uint64_t zm0 = rm[0], zm1 = rm[1];
   1648
   1649    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
   1650        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
   1651        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
   1652        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
   1653    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
   1654        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
   1655        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
   1656        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
   1657    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
   1658        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
   1659        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
   1660        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
   1661    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
   1662        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
   1663        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
   1664        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
   1665
   1666    rm[0] = m0;
   1667    rm[1] = m1;
   1668    rd[0] = d0;
   1669    rd[1] = d1;
   1670}
   1671
   1672void HELPER(neon_qzip16)(void *vd, void *vm)
   1673{
   1674    uint64_t *rd = vd, *rm = vm;
   1675    uint64_t zd0 = rd[0], zd1 = rd[1];
   1676    uint64_t zm0 = rm[0], zm1 = rm[1];
   1677
   1678    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
   1679        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
   1680    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
   1681        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
   1682    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
   1683        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
   1684    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
   1685        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
   1686
   1687    rm[0] = m0;
   1688    rm[1] = m1;
   1689    rd[0] = d0;
   1690    rd[1] = d1;
   1691}
   1692
   1693void HELPER(neon_qzip32)(void *vd, void *vm)
   1694{
   1695    uint64_t *rd = vd, *rm = vm;
   1696    uint64_t zd0 = rd[0], zd1 = rd[1];
   1697    uint64_t zm0 = rm[0], zm1 = rm[1];
   1698
   1699    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
   1700    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
   1701    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
   1702    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
   1703
   1704    rm[0] = m0;
   1705    rm[1] = m1;
   1706    rd[0] = d0;
   1707    rd[1] = d1;
   1708}
   1709
   1710void HELPER(neon_zip8)(void *vd, void *vm)
   1711{
   1712    uint64_t *rd = vd, *rm = vm;
   1713    uint64_t zd = rd[0], zm = rm[0];
   1714
   1715    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
   1716        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
   1717        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
   1718        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
   1719    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
   1720        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
   1721        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
   1722        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
   1723
   1724    rm[0] = m0;
   1725    rd[0] = d0;
   1726}
   1727
   1728void HELPER(neon_zip16)(void *vd, void *vm)
   1729{
   1730    uint64_t *rd = vd, *rm = vm;
   1731    uint64_t zd = rd[0], zm = rm[0];
   1732
   1733    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
   1734        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
   1735    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
   1736        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
   1737
   1738    rm[0] = m0;
   1739    rd[0] = d0;
   1740}