cachepc-qemu

Fork of AMDESE/qemu with changes for the cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

vec_helper.c (93480B)


      1/*
      2 * ARM AdvSIMD / SVE Vector Operations
      3 *
      4 * Copyright (c) 2018 Linaro
      5 *
      6 * This library is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * This library is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
     18 */
     19
     20#include "qemu/osdep.h"
     21#include "cpu.h"
     22#include "exec/helper-proto.h"
     23#include "tcg/tcg-gvec-desc.h"
     24#include "fpu/softfloat.h"
     25#include "qemu/int128.h"
     26#include "vec_internal.h"
     27
     28/*
     29 * Data for expanding active predicate bits to bytes, for byte elements.
     30 *
     31 *  for (i = 0; i < 256; ++i) {
     32 *      unsigned long m = 0;
     33 *      for (j = 0; j < 8; j++) {
     34 *          if ((i >> j) & 1) {
     35 *              m |= 0xfful << (j << 3);
     36 *          }
     37 *      }
     38 *      printf("0x%016lx,\n", m);
     39 *  }
     40 */
     41const uint64_t expand_pred_b_data[256] = {
     42    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
     43    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
     44    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
     45    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
     46    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
     47    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
     48    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
     49    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
     50    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
     51    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
     52    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
     53    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
     54    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
     55    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
     56    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
     57    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
     58    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
     59    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
     60    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
     61    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
     62    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
     63    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
     64    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
     65    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
     66    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
     67    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
     68    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
     69    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
     70    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
     71    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
     72    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
     73    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
     74    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
     75    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
     76    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
     77    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
     78    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
     79    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
     80    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
     81    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
     82    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
     83    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
     84    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
     85    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
     86    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
     87    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
     88    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
     89    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
     90    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
     91    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
     92    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
     93    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
     94    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
     95    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
     96    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
     97    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
     98    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
     99    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    100    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    101    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    102    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    103    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    104    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    105    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    106    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    107    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    108    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    109    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    110    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    111    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    112    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    113    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    114    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    115    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    116    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    117    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    118    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    119    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    120    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    121    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    122    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    123    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    124    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    125    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    126    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    127    0xffffffffffffffff,
    128};
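
/*
 * For example, a predicate value of 0x05 has bits 0 and 2 set, so
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff: byte lanes 0 and 2
 * of the expanded 64-bit mask are selected.
 */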
    129
    130/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
    131int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
    132                     bool neg, bool round)
    133{
    134    /*
    135     * Simplify:
    136     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
    137     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
    138     */
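    /*
     * Worked example: with src1 = src2 = INT8_MIN, src3 = 0 and round set,
     * the product is 16384, adding (round << 6) gives 16448, and
     * 16448 >> 7 is 128, which does not fit in int8_t, so the result
     * saturates to INT8_MAX below.
     */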
    139    int32_t ret = (int32_t)src1 * src2;
    140    if (neg) {
    141        ret = -ret;
    142    }
    143    ret += ((int32_t)src3 << 7) + (round << 6);
    144    ret >>= 7;
    145
    146    if (ret != (int8_t)ret) {
    147        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    148    }
    149    return ret;
    150}
    151
    152void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
    153                             void *va, uint32_t desc)
    154{
    155    intptr_t i, opr_sz = simd_oprsz(desc);
    156    int8_t *d = vd, *n = vn, *m = vm, *a = va;
    157
    158    for (i = 0; i < opr_sz; ++i) {
    159        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    160    }
    161}
    162
    163void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
    164                             void *va, uint32_t desc)
    165{
    166    intptr_t i, opr_sz = simd_oprsz(desc);
    167    int8_t *d = vd, *n = vn, *m = vm, *a = va;
    168
    169    for (i = 0; i < opr_sz; ++i) {
    170        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    171    }
    172}
    173
    174void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
    175{
    176    intptr_t i, opr_sz = simd_oprsz(desc);
    177    int8_t *d = vd, *n = vn, *m = vm;
    178
    179    for (i = 0; i < opr_sz; ++i) {
    180        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    181    }
    182}
    183
    184void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
    185{
    186    intptr_t i, opr_sz = simd_oprsz(desc);
    187    int8_t *d = vd, *n = vn, *m = vm;
    188
    189    for (i = 0; i < opr_sz; ++i) {
    190        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    191    }
    192}
    193
    194/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
    195int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
    196                      bool neg, bool round, uint32_t *sat)
    197{
    198    /* Simplify similarly to do_sqrdmlah_b above.  */
    199    int32_t ret = (int32_t)src1 * src2;
    200    if (neg) {
    201        ret = -ret;
    202    }
    203    ret += ((int32_t)src3 << 15) + (round << 14);
    204    ret >>= 15;
    205
    206    if (ret != (int16_t)ret) {
    207        *sat = 1;
    208        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    209    }
    210    return ret;
    211}
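
/*
 * Note: unlike the 8-bit variant above, this one reports saturation through
 * *sat.  The Neon helpers below point it at env->vfp.qc[0] (the cumulative
 * saturation flag), while the SVE2 helpers pass a local that is discarded.
 */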
    212
    213uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
    214                                  uint32_t src2, uint32_t src3)
    215{
    216    uint32_t *sat = &env->vfp.qc[0];
    217    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    218    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
    219                                false, true, sat);
    220    return deposit32(e1, 16, 16, e2);
    221}
    222
    223void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
    224                              void *vq, uint32_t desc)
    225{
    226    uintptr_t opr_sz = simd_oprsz(desc);
    227    int16_t *d = vd;
    228    int16_t *n = vn;
    229    int16_t *m = vm;
    230    uintptr_t i;
    231
    232    for (i = 0; i < opr_sz / 2; ++i) {
    233        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    234    }
    235    clear_tail(d, opr_sz, simd_maxsz(desc));
    236}
    237
    238uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
    239                                  uint32_t src2, uint32_t src3)
    240{
    241    uint32_t *sat = &env->vfp.qc[0];
    242    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    243    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
    244                                true, true, sat);
    245    return deposit32(e1, 16, 16, e2);
    246}
    247
    248void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
    249                              void *vq, uint32_t desc)
    250{
    251    uintptr_t opr_sz = simd_oprsz(desc);
    252    int16_t *d = vd;
    253    int16_t *n = vn;
    254    int16_t *m = vm;
    255    uintptr_t i;
    256
    257    for (i = 0; i < opr_sz / 2; ++i) {
    258        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    259    }
    260    clear_tail(d, opr_sz, simd_maxsz(desc));
    261}
    262
    263void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
    264                            void *vq, uint32_t desc)
    265{
    266    intptr_t i, opr_sz = simd_oprsz(desc);
    267    int16_t *d = vd, *n = vn, *m = vm;
    268
    269    for (i = 0; i < opr_sz / 2; ++i) {
    270        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    271    }
    272    clear_tail(d, opr_sz, simd_maxsz(desc));
    273}
    274
    275void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
    276                             void *vq, uint32_t desc)
    277{
    278    intptr_t i, opr_sz = simd_oprsz(desc);
    279    int16_t *d = vd, *n = vn, *m = vm;
    280
    281    for (i = 0; i < opr_sz / 2; ++i) {
    282        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    283    }
    284    clear_tail(d, opr_sz, simd_maxsz(desc));
    285}
    286
    287void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
    288                             void *va, uint32_t desc)
    289{
    290    intptr_t i, opr_sz = simd_oprsz(desc);
    291    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    292    uint32_t discard;
    293
    294    for (i = 0; i < opr_sz / 2; ++i) {
    295        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    296    }
    297}
    298
    299void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
    300                             void *va, uint32_t desc)
    301{
    302    intptr_t i, opr_sz = simd_oprsz(desc);
    303    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    304    uint32_t discard;
    305
    306    for (i = 0; i < opr_sz / 2; ++i) {
    307        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    308    }
    309}
    310
    311void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
    312{
    313    intptr_t i, opr_sz = simd_oprsz(desc);
    314    int16_t *d = vd, *n = vn, *m = vm;
    315    uint32_t discard;
    316
    317    for (i = 0; i < opr_sz / 2; ++i) {
    318        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    319    }
    320}
    321
    322void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
    323{
    324    intptr_t i, opr_sz = simd_oprsz(desc);
    325    int16_t *d = vd, *n = vn, *m = vm;
    326    uint32_t discard;
    327
    328    for (i = 0; i < opr_sz / 2; ++i) {
    329        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    330    }
    331}
    332
    333void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
    334{
    335    intptr_t i, j, opr_sz = simd_oprsz(desc);
    336    int idx = simd_data(desc);
    337    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    338    uint32_t discard;
    339
    340    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
    341        int16_t mm = m[i];
    342        for (j = 0; j < 16 / 2; ++j) {
    343            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
    344        }
    345    }
    346}
    347
    348void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
    349{
    350    intptr_t i, j, opr_sz = simd_oprsz(desc);
    351    int idx = simd_data(desc);
    352    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    353    uint32_t discard;
    354
    355    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
    356        int16_t mm = m[i];
    357        for (j = 0; j < 16 / 2; ++j) {
    358            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
    359        }
    360    }
    361}
    362
    363/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
    364int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
    365                      bool neg, bool round, uint32_t *sat)
    366{
    367    /* Simplify similarly to do_sqrdmlah_b above.  */
    368    int64_t ret = (int64_t)src1 * src2;
    369    if (neg) {
    370        ret = -ret;
    371    }
    372    ret += ((int64_t)src3 << 31) + (round << 30);
    373    ret >>= 31;
    374
    375    if (ret != (int32_t)ret) {
    376        *sat = 1;
    377        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    378    }
    379    return ret;
    380}
    381
    382uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
    383                                  int32_t src2, int32_t src3)
    384{
    385    uint32_t *sat = &env->vfp.qc[0];
    386    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
    387}
    388
    389void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
    390                              void *vq, uint32_t desc)
    391{
    392    uintptr_t opr_sz = simd_oprsz(desc);
    393    int32_t *d = vd;
    394    int32_t *n = vn;
    395    int32_t *m = vm;
    396    uintptr_t i;
    397
    398    for (i = 0; i < opr_sz / 4; ++i) {
    399        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    400    }
    401    clear_tail(d, opr_sz, simd_maxsz(desc));
    402}
    403
    404uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
    405                                  int32_t src2, int32_t src3)
    406{
    407    uint32_t *sat = &env->vfp.qc[0];
    408    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
    409}
    410
    411void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
    412                              void *vq, uint32_t desc)
    413{
    414    uintptr_t opr_sz = simd_oprsz(desc);
    415    int32_t *d = vd;
    416    int32_t *n = vn;
    417    int32_t *m = vm;
    418    uintptr_t i;
    419
    420    for (i = 0; i < opr_sz / 4; ++i) {
    421        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    422    }
    423    clear_tail(d, opr_sz, simd_maxsz(desc));
    424}
    425
    426void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
    427                            void *vq, uint32_t desc)
    428{
    429    intptr_t i, opr_sz = simd_oprsz(desc);
    430    int32_t *d = vd, *n = vn, *m = vm;
    431
    432    for (i = 0; i < opr_sz / 4; ++i) {
    433        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    434    }
    435    clear_tail(d, opr_sz, simd_maxsz(desc));
    436}
    437
    438void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
    439                             void *vq, uint32_t desc)
    440{
    441    intptr_t i, opr_sz = simd_oprsz(desc);
    442    int32_t *d = vd, *n = vn, *m = vm;
    443
    444    for (i = 0; i < opr_sz / 4; ++i) {
    445        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    446    }
    447    clear_tail(d, opr_sz, simd_maxsz(desc));
    448}
    449
    450void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
    451                             void *va, uint32_t desc)
    452{
    453    intptr_t i, opr_sz = simd_oprsz(desc);
    454    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    455    uint32_t discard;
    456
    457    for (i = 0; i < opr_sz / 4; ++i) {
    458        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    459    }
    460}
    461
    462void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
    463                             void *va, uint32_t desc)
    464{
    465    intptr_t i, opr_sz = simd_oprsz(desc);
    466    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    467    uint32_t discard;
    468
    469    for (i = 0; i < opr_sz / 4; ++i) {
    470        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    471    }
    472}
    473
    474void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
    475{
    476    intptr_t i, opr_sz = simd_oprsz(desc);
    477    int32_t *d = vd, *n = vn, *m = vm;
    478    uint32_t discard;
    479
    480    for (i = 0; i < opr_sz / 4; ++i) {
    481        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    482    }
    483}
    484
    485void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
    486{
    487    intptr_t i, opr_sz = simd_oprsz(desc);
    488    int32_t *d = vd, *n = vn, *m = vm;
    489    uint32_t discard;
    490
    491    for (i = 0; i < opr_sz / 4; ++i) {
    492        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    493    }
    494}
    495
    496void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
    497{
    498    intptr_t i, j, opr_sz = simd_oprsz(desc);
    499    int idx = simd_data(desc);
    500    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    501    uint32_t discard;
    502
    503    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
    504        int32_t mm = m[i];
    505        for (j = 0; j < 16 / 4; ++j) {
    506            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
    507        }
    508    }
    509}
    510
    511void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
    512{
    513    intptr_t i, j, opr_sz = simd_oprsz(desc);
    514    int idx = simd_data(desc);
    515    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    516    uint32_t discard;
    517
    518    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
    519        int32_t mm = m[i];
    520        for (j = 0; j < 16 / 4; ++j) {
    521            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
    522        }
    523    }
    524}
    525
    526/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
    527static int64_t do_sat128_d(Int128 r)
    528{
    529    int64_t ls = int128_getlo(r);
    530    int64_t hs = int128_gethi(r);
    531
    532    if (unlikely(hs != (ls >> 63))) {
    533        return hs < 0 ? INT64_MIN : INT64_MAX;
    534    }
    535    return ls;
    536}
    537
    538int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
    539{
    540    uint64_t l, h;
    541    Int128 r, t;
    542
    543    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    544    muls64(&l, &h, m, n);
    545    r = int128_make128(l, h);
    546    if (neg) {
    547        r = int128_neg(r);
    548    }
    549    if (a) {
    550        t = int128_exts64(a);
    551        t = int128_lshift(t, 63);
    552        r = int128_add(r, t);
    553    }
    554    if (round) {
    555        t = int128_exts64(1ll << 62);
    556        r = int128_add(r, t);
    557    }
    558    r = int128_rshift(r, 63);
    559
    560    return do_sat128_d(r);
    561}
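
/*
 * The constants mirror the 8-bit form scaled up: for an N-bit element the
 * computation is ((a << (N-1)) + (n * m) + (round << (N-2))) >> (N-1), so
 * here the rounding term is 1 << 62 and the final shift is 63, carried out
 * in 128-bit arithmetic because the 64x64 product needs all 128 bits.
 */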
    562
    563void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
    564                             void *va, uint32_t desc)
    565{
    566    intptr_t i, opr_sz = simd_oprsz(desc);
    567    int64_t *d = vd, *n = vn, *m = vm, *a = va;
    568
    569    for (i = 0; i < opr_sz / 8; ++i) {
    570        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    571    }
    572}
    573
    574void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
    575                             void *va, uint32_t desc)
    576{
    577    intptr_t i, opr_sz = simd_oprsz(desc);
    578    int64_t *d = vd, *n = vn, *m = vm, *a = va;
    579
    580    for (i = 0; i < opr_sz / 8; ++i) {
    581        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    582    }
    583}
    584
    585void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
    586{
    587    intptr_t i, opr_sz = simd_oprsz(desc);
    588    int64_t *d = vd, *n = vn, *m = vm;
    589
    590    for (i = 0; i < opr_sz / 8; ++i) {
    591        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    592    }
    593}
    594
    595void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
    596{
    597    intptr_t i, opr_sz = simd_oprsz(desc);
    598    int64_t *d = vd, *n = vn, *m = vm;
    599
    600    for (i = 0; i < opr_sz / 8; ++i) {
    601        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    602    }
    603}
    604
    605void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
    606{
    607    intptr_t i, j, opr_sz = simd_oprsz(desc);
    608    int idx = simd_data(desc);
    609    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
    610
    611    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
    612        int64_t mm = m[i];
    613        for (j = 0; j < 16 / 8; ++j) {
    614            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
    615        }
    616    }
    617}
    618
    619void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
    620{
    621    intptr_t i, j, opr_sz = simd_oprsz(desc);
    622    int idx = simd_data(desc);
    623    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
    624
    625    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
    626        int64_t mm = m[i];
    627        for (j = 0; j < 16 / 8; ++j) {
    628            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
    629        }
    630    }
    631}
    632
    633/* Integer 8 and 16-bit dot-product.
    634 *
    635 * Note that for the loops herein, host endianness does not matter
    636 * with respect to the ordering of data within the quad-width lanes.
    637 * All elements are treated equally, no matter where they are.
    638 */
    639
    640#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
    641void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
    642{                                                                         \
    643    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    644    TYPED *d = vd, *a = va;                                               \
    645    TYPEN *n = vn;                                                        \
    646    TYPEM *m = vm;                                                        \
    647    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
    648        d[i] = (a[i] +                                                    \
    649                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
    650                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
    651                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
    652                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    653    }                                                                     \
    654    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
    655}
    656
    657DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
    658DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
    659DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
    660DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
    661DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
    662
    663#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
    664void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
    665{                                                                         \
    666    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    667    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    668    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    669    intptr_t index = simd_data(desc);                                     \
    670    TYPED *d = vd, *a = va;                                               \
    671    TYPEN *n = vn;                                                        \
    672    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    673    do {                                                                  \
    674        TYPED m0 = m_indexed[i * 4 + 0];                                  \
    675        TYPED m1 = m_indexed[i * 4 + 1];                                  \
    676        TYPED m2 = m_indexed[i * 4 + 2];                                  \
    677        TYPED m3 = m_indexed[i * 4 + 3];                                  \
    678        do {                                                              \
    679            d[i] = (a[i] +                                                \
    680                    n[i * 4 + 0] * m0 +                                   \
    681                    n[i * 4 + 1] * m1 +                                   \
    682                    n[i * 4 + 2] * m2 +                                   \
    683                    n[i * 4 + 3] * m3);                                   \
    684        } while (++i < segend);                                           \
    685        segend = i + 4;                                                   \
    686    } while (i < opr_sz_n);                                               \
    687    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
    688}
    689
    690DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
    691DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
    692DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
    693DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
    694DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
    695DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
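
/*
 * In the indexed forms the index selects one group of four TYPEM elements
 * within each 128-bit segment of vm, and that group is reused for every
 * output lane of the segment.  For example, gvec_sdot_idx_b with index 1
 * computes every int32 lane of a segment against the second group of four
 * bytes in that segment of vm.
 */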
    696
    697void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
    698                         void *vfpst, uint32_t desc)
    699{
    700    uintptr_t opr_sz = simd_oprsz(desc);
    701    float16 *d = vd;
    702    float16 *n = vn;
    703    float16 *m = vm;
    704    float_status *fpst = vfpst;
    705    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    706    uint32_t neg_imag = neg_real ^ 1;
    707    uintptr_t i;
    708
    709    /* Shift boolean to the sign bit so we can xor to negate.  */
    710    neg_real <<= 15;
    711    neg_imag <<= 15;
    712
    713    for (i = 0; i < opr_sz / 2; i += 2) {
    714        float16 e0 = n[H2(i)];
    715        float16 e1 = m[H2(i + 1)] ^ neg_imag;
    716        float16 e2 = n[H2(i + 1)];
    717        float16 e3 = m[H2(i)] ^ neg_real;
    718
    719        d[H2(i)] = float16_add(e0, e1, fpst);
    720        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    721    }
    722    clear_tail(d, opr_sz, simd_maxsz(desc));
    723}
    724
    725void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
    726                         void *vfpst, uint32_t desc)
    727{
    728    uintptr_t opr_sz = simd_oprsz(desc);
    729    float32 *d = vd;
    730    float32 *n = vn;
    731    float32 *m = vm;
    732    float_status *fpst = vfpst;
    733    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    734    uint32_t neg_imag = neg_real ^ 1;
    735    uintptr_t i;
    736
    737    /* Shift boolean to the sign bit so we can xor to negate.  */
    738    neg_real <<= 31;
    739    neg_imag <<= 31;
    740
    741    for (i = 0; i < opr_sz / 4; i += 2) {
    742        float32 e0 = n[H4(i)];
    743        float32 e1 = m[H4(i + 1)] ^ neg_imag;
    744        float32 e2 = n[H4(i + 1)];
    745        float32 e3 = m[H4(i)] ^ neg_real;
    746
    747        d[H4(i)] = float32_add(e0, e1, fpst);
    748        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    749    }
    750    clear_tail(d, opr_sz, simd_maxsz(desc));
    751}
    752
    753void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
    754                         void *vfpst, uint32_t desc)
    755{
    756    uintptr_t opr_sz = simd_oprsz(desc);
    757    float64 *d = vd;
    758    float64 *n = vn;
    759    float64 *m = vm;
    760    float_status *fpst = vfpst;
    761    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    762    uint64_t neg_imag = neg_real ^ 1;
    763    uintptr_t i;
    764
    765    /* Shift boolean to the sign bit so we can xor to negate.  */
    766    neg_real <<= 63;
    767    neg_imag <<= 63;
    768
    769    for (i = 0; i < opr_sz / 8; i += 2) {
    770        float64 e0 = n[i];
    771        float64 e1 = m[i + 1] ^ neg_imag;
    772        float64 e2 = n[i + 1];
    773        float64 e3 = m[i] ^ neg_real;
    774
    775        d[i] = float64_add(e0, e1, fpst);
    776        d[i + 1] = float64_add(e2, e3, fpst);
    777    }
    778    clear_tail(d, opr_sz, simd_maxsz(desc));
    779}
    780
    781void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
    782                         void *vfpst, uint32_t desc)
    783{
    784    uintptr_t opr_sz = simd_oprsz(desc);
    785    float16 *d = vd, *n = vn, *m = vm, *a = va;
    786    float_status *fpst = vfpst;
    787    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    788    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    789    uint32_t neg_real = flip ^ neg_imag;
    790    uintptr_t i;
    791
    792    /* Shift boolean to the sign bit so we can xor to negate.  */
    793    neg_real <<= 15;
    794    neg_imag <<= 15;
    795
    796    for (i = 0; i < opr_sz / 2; i += 2) {
    797        float16 e2 = n[H2(i + flip)];
    798        float16 e1 = m[H2(i + flip)] ^ neg_real;
    799        float16 e4 = e2;
    800        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
    801
    802        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
    803        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    804    }
    805    clear_tail(d, opr_sz, simd_maxsz(desc));
    806}
    807
    808void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
    809                             void *vfpst, uint32_t desc)
    810{
    811    uintptr_t opr_sz = simd_oprsz(desc);
    812    float16 *d = vd, *n = vn, *m = vm, *a = va;
    813    float_status *fpst = vfpst;
    814    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    815    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    816    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    817    uint32_t neg_real = flip ^ neg_imag;
    818    intptr_t elements = opr_sz / sizeof(float16);
    819    intptr_t eltspersegment = 16 / sizeof(float16);
    820    intptr_t i, j;
    821
    822    /* Shift boolean to the sign bit so we can xor to negate.  */
    823    neg_real <<= 15;
    824    neg_imag <<= 15;
    825
    826    for (i = 0; i < elements; i += eltspersegment) {
    827        float16 mr = m[H2(i + 2 * index + 0)];
    828        float16 mi = m[H2(i + 2 * index + 1)];
    829        float16 e1 = neg_real ^ (flip ? mi : mr);
    830        float16 e3 = neg_imag ^ (flip ? mr : mi);
    831
    832        for (j = i; j < i + eltspersegment; j += 2) {
    833            float16 e2 = n[H2(j + flip)];
    834            float16 e4 = e2;
    835
    836            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
    837            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
    838        }
    839    }
    840    clear_tail(d, opr_sz, simd_maxsz(desc));
    841}
    842
    843void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
    844                         void *vfpst, uint32_t desc)
    845{
    846    uintptr_t opr_sz = simd_oprsz(desc);
    847    float32 *d = vd, *n = vn, *m = vm, *a = va;
    848    float_status *fpst = vfpst;
    849    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    850    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    851    uint32_t neg_real = flip ^ neg_imag;
    852    uintptr_t i;
    853
    854    /* Shift boolean to the sign bit so we can xor to negate.  */
    855    neg_real <<= 31;
    856    neg_imag <<= 31;
    857
    858    for (i = 0; i < opr_sz / 4; i += 2) {
    859        float32 e2 = n[H4(i + flip)];
    860        float32 e1 = m[H4(i + flip)] ^ neg_real;
    861        float32 e4 = e2;
    862        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
    863
    864        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
    865        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    866    }
    867    clear_tail(d, opr_sz, simd_maxsz(desc));
    868}
    869
    870void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
    871                             void *vfpst, uint32_t desc)
    872{
    873    uintptr_t opr_sz = simd_oprsz(desc);
    874    float32 *d = vd, *n = vn, *m = vm, *a = va;
    875    float_status *fpst = vfpst;
    876    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    877    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    878    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    879    uint32_t neg_real = flip ^ neg_imag;
    880    intptr_t elements = opr_sz / sizeof(float32);
    881    intptr_t eltspersegment = 16 / sizeof(float32);
    882    intptr_t i, j;
    883
    884    /* Shift boolean to the sign bit so we can xor to negate.  */
    885    neg_real <<= 31;
    886    neg_imag <<= 31;
    887
    888    for (i = 0; i < elements; i += eltspersegment) {
    889        float32 mr = m[H4(i + 2 * index + 0)];
    890        float32 mi = m[H4(i + 2 * index + 1)];
    891        float32 e1 = neg_real ^ (flip ? mi : mr);
    892        float32 e3 = neg_imag ^ (flip ? mr : mi);
    893
    894        for (j = i; j < i + eltspersegment; j += 2) {
    895            float32 e2 = n[H4(j + flip)];
    896            float32 e4 = e2;
    897
    898            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
    899            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
    900        }
    901    }
    902    clear_tail(d, opr_sz, simd_maxsz(desc));
    903}
    904
    905void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
    906                         void *vfpst, uint32_t desc)
    907{
    908    uintptr_t opr_sz = simd_oprsz(desc);
    909    float64 *d = vd, *n = vn, *m = vm, *a = va;
    910    float_status *fpst = vfpst;
    911    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    912    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    913    uint64_t neg_real = flip ^ neg_imag;
    914    uintptr_t i;
    915
    916    /* Shift boolean to the sign bit so we can xor to negate.  */
    917    neg_real <<= 63;
    918    neg_imag <<= 63;
    919
    920    for (i = 0; i < opr_sz / 8; i += 2) {
    921        float64 e2 = n[i + flip];
    922        float64 e1 = m[i + flip] ^ neg_real;
    923        float64 e4 = e2;
    924        float64 e3 = m[i + 1 - flip] ^ neg_imag;
    925
    926        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
    927        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    928    }
    929    clear_tail(d, opr_sz, simd_maxsz(desc));
    930}
    931
    932/*
    933 * Floating point comparisons producing an integer result (all 1s or all 0s).
    934 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
    935 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
    936 */
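/*
 * Negating the 0/1 boolean gives 0 or an all-ones value of the element
 * width (e.g. 0xffff for float16), which is the mask form Neon expects;
 * GE and GT are obtained by swapping the operands of the signalling
 * le/lt comparisons.
 */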
    937static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
    938{
    939    return -float16_eq_quiet(op1, op2, stat);
    940}
    941
    942static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
    943{
    944    return -float32_eq_quiet(op1, op2, stat);
    945}
    946
    947static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
    948{
    949    return -float16_le(op2, op1, stat);
    950}
    951
    952static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
    953{
    954    return -float32_le(op2, op1, stat);
    955}
    956
    957static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
    958{
    959    return -float16_lt(op2, op1, stat);
    960}
    961
    962static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
    963{
    964    return -float32_lt(op2, op1, stat);
    965}
    966
    967static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
    968{
    969    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
    970}
    971
    972static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
    973{
    974    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
    975}
    976
    977static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
    978{
    979    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
    980}
    981
    982static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
    983{
    984    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
    985}
    986
    987static int16_t vfp_tosszh(float16 x, void *fpstp)
    988{
    989    float_status *fpst = fpstp;
    990    if (float16_is_any_nan(x)) {
    991        float_raise(float_flag_invalid, fpst);
    992        return 0;
    993    }
    994    return float16_to_int16_round_to_zero(x, fpst);
    995}
    996
    997static uint16_t vfp_touszh(float16 x, void *fpstp)
    998{
    999    float_status *fpst = fpstp;
   1000    if (float16_is_any_nan(x)) {
   1001        float_raise(float_flag_invalid, fpst);
   1002        return 0;
   1003    }
   1004    return float16_to_uint16_round_to_zero(x, fpst);
   1005}
   1006
   1007#define DO_2OP(NAME, FUNC, TYPE) \
   1008void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
   1009{                                                                 \
   1010    intptr_t i, oprsz = simd_oprsz(desc);                         \
   1011    TYPE *d = vd, *n = vn;                                        \
   1012    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
   1013        d[i] = FUNC(n[i], stat);                                  \
   1014    }                                                             \
   1015    clear_tail(d, oprsz, simd_maxsz(desc));                       \
   1016}
   1017
   1018DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
   1019DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
   1020DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
   1021
   1022DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
   1023DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
   1024DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
   1025
   1026DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
   1027DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
   1028
   1029DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
   1030DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
   1031DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
   1032DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
   1033DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
   1034DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
   1035DO_2OP(gvec_tosszh, vfp_tosszh, float16)
   1036DO_2OP(gvec_touszh, vfp_touszh, float16)
   1037
   1038#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
   1039    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
   1040    {                                                           \
   1041        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
   1042    }
   1043
   1044#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
   1045    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
   1046    {                                                           \
   1047        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
   1048    }
   1049
   1050#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
   1051    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
   1052    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
   1053    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
   1054    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
   1055
   1056DO_2OP_CMP0(cgt, cgt, FWD)
   1057DO_2OP_CMP0(cge, cge, FWD)
   1058DO_2OP_CMP0(ceq, ceq, FWD)
   1059DO_2OP_CMP0(clt, cgt, REV)
   1060DO_2OP_CMP0(cle, cge, REV)
   1061
   1062#undef DO_2OP
   1063#undef DO_2OP_CMP0
   1064
   1065/* Floating-point trigonometric starting value.
   1066 * See the ARM ARM pseudocode function FPTrigSMul.
   1067 */
   1068static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
   1069{
   1070    float16 result = float16_mul(op1, op1, stat);
   1071    if (!float16_is_any_nan(result)) {
   1072        result = float16_set_sign(result, op2 & 1);
   1073    }
   1074    return result;
   1075}
   1076
   1077static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
   1078{
   1079    float32 result = float32_mul(op1, op1, stat);
   1080    if (!float32_is_any_nan(result)) {
   1081        result = float32_set_sign(result, op2 & 1);
   1082    }
   1083    return result;
   1084}
   1085
   1086static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
   1087{
   1088    float64 result = float64_mul(op1, op1, stat);
   1089    if (!float64_is_any_nan(result)) {
   1090        result = float64_set_sign(result, op2 & 1);
   1091    }
   1092    return result;
   1093}
   1094
   1095static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
   1096{
   1097    return float16_abs(float16_sub(op1, op2, stat));
   1098}
   1099
   1100static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
   1101{
   1102    return float32_abs(float32_sub(op1, op2, stat));
   1103}
   1104
   1105/*
   1106 * Reciprocal step. These are the AArch32 version which uses a
   1107 * non-fused multiply-and-subtract.
   1108 */
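/*
 * The step computes 2 - (op1 * op2): one Newton-Raphson refinement step
 * for a reciprocal estimate (x' = x * (2 - d * x)).  The infinity-times-zero
 * special case returns 2.0 rather than propagating the default NaN.
 */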
   1109static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
   1110{
   1111    op1 = float16_squash_input_denormal(op1, stat);
   1112    op2 = float16_squash_input_denormal(op2, stat);
   1113
   1114    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
   1115        (float16_is_infinity(op2) && float16_is_zero(op1))) {
   1116        return float16_two;
   1117    }
   1118    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
   1119}
   1120
   1121static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
   1122{
   1123    op1 = float32_squash_input_denormal(op1, stat);
   1124    op2 = float32_squash_input_denormal(op2, stat);
   1125
   1126    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
   1127        (float32_is_infinity(op2) && float32_is_zero(op1))) {
   1128        return float32_two;
   1129    }
   1130    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
   1131}
   1132
   1133/* Reciprocal square-root step. AArch32 non-fused semantics. */
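/*
 * Here the step is (3 - op1 * op2) / 2, the corresponding Newton-Raphson
 * refinement for a reciprocal square-root estimate
 * (x' = x * (3 - d * x * x) / 2); infinity times zero again yields the
 * exact step value, 1.5.
 */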
   1134static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
   1135{
   1136    op1 = float16_squash_input_denormal(op1, stat);
   1137    op2 = float16_squash_input_denormal(op2, stat);
   1138
   1139    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
   1140        (float16_is_infinity(op2) && float16_is_zero(op1))) {
   1141        return float16_one_point_five;
   1142    }
   1143    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
   1144    return float16_div(op1, float16_two, stat);
   1145}
   1146
   1147static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
   1148{
   1149    op1 = float32_squash_input_denormal(op1, stat);
   1150    op2 = float32_squash_input_denormal(op2, stat);
   1151
   1152    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
   1153        (float32_is_infinity(op2) && float32_is_zero(op1))) {
   1154        return float32_one_point_five;
   1155    }
   1156    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
   1157    return float32_div(op1, float32_two, stat);
   1158}
   1159
   1160#define DO_3OP(NAME, FUNC, TYPE) \
   1161void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
   1162{                                                                          \
   1163    intptr_t i, oprsz = simd_oprsz(desc);                                  \
   1164    TYPE *d = vd, *n = vn, *m = vm;                                        \
   1165    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
   1166        d[i] = FUNC(n[i], m[i], stat);                                     \
   1167    }                                                                      \
   1168    clear_tail(d, oprsz, simd_maxsz(desc));                                \
   1169}
   1170
   1171DO_3OP(gvec_fadd_h, float16_add, float16)
   1172DO_3OP(gvec_fadd_s, float32_add, float32)
   1173DO_3OP(gvec_fadd_d, float64_add, float64)
   1174
   1175DO_3OP(gvec_fsub_h, float16_sub, float16)
   1176DO_3OP(gvec_fsub_s, float32_sub, float32)
   1177DO_3OP(gvec_fsub_d, float64_sub, float64)
   1178
   1179DO_3OP(gvec_fmul_h, float16_mul, float16)
   1180DO_3OP(gvec_fmul_s, float32_mul, float32)
   1181DO_3OP(gvec_fmul_d, float64_mul, float64)
   1182
   1183DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
   1184DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
   1185DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
   1186
   1187DO_3OP(gvec_fabd_h, float16_abd, float16)
   1188DO_3OP(gvec_fabd_s, float32_abd, float32)
   1189
   1190DO_3OP(gvec_fceq_h, float16_ceq, float16)
   1191DO_3OP(gvec_fceq_s, float32_ceq, float32)
   1192
   1193DO_3OP(gvec_fcge_h, float16_cge, float16)
   1194DO_3OP(gvec_fcge_s, float32_cge, float32)
   1195
   1196DO_3OP(gvec_fcgt_h, float16_cgt, float16)
   1197DO_3OP(gvec_fcgt_s, float32_cgt, float32)
   1198
   1199DO_3OP(gvec_facge_h, float16_acge, float16)
   1200DO_3OP(gvec_facge_s, float32_acge, float32)
   1201
   1202DO_3OP(gvec_facgt_h, float16_acgt, float16)
   1203DO_3OP(gvec_facgt_s, float32_acgt, float32)
   1204
   1205DO_3OP(gvec_fmax_h, float16_max, float16)
   1206DO_3OP(gvec_fmax_s, float32_max, float32)
   1207
   1208DO_3OP(gvec_fmin_h, float16_min, float16)
   1209DO_3OP(gvec_fmin_s, float32_min, float32)
   1210
   1211DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
   1212DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
   1213
   1214DO_3OP(gvec_fminnum_h, float16_minnum, float16)
   1215DO_3OP(gvec_fminnum_s, float32_minnum, float32)
   1216
   1217DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
   1218DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
   1219
   1220DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
   1221DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
   1222
   1223#ifdef TARGET_AARCH64
   1224
   1225DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
   1226DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
   1227DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
   1228
   1229DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
   1230DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
   1231DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
   1232
   1233#endif
   1234#undef DO_3OP
   1235
   1236/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
   1237static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
   1238                                 float_status *stat)
   1239{
   1240    return float16_add(dest, float16_mul(op1, op2, stat), stat);
   1241}
   1242
   1243static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
   1244                                 float_status *stat)
   1245{
   1246    return float32_add(dest, float32_mul(op1, op2, stat), stat);
   1247}
   1248
   1249static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
   1250                                 float_status *stat)
   1251{
   1252    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
   1253}
   1254
   1255static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
   1256                                 float_status *stat)
   1257{
   1258    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
   1259}
   1260
   1261/* Fused versions; these have the semantics Neon VFMA/VFMS want */
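/*
 * These round only once: float*_muladd computes op1 * op2 + dest as a fused
 * operation, and the VFMS variants negate op1 (the product), not the
 * accumulator.
 */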
   1262static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
   1263                                float_status *stat)
   1264{
   1265    return float16_muladd(op1, op2, dest, 0, stat);
   1266}
   1267
   1268static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
   1269                                 float_status *stat)
   1270{
   1271    return float32_muladd(op1, op2, dest, 0, stat);
   1272}
   1273
   1274static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
   1275                                 float_status *stat)
   1276{
   1277    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
   1278}
   1279
   1280static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
   1281                                 float_status *stat)
   1282{
   1283    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
   1284}
   1285
   1286#define DO_MULADD(NAME, FUNC, TYPE)                                     \
   1287void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
   1288{                                                                          \
   1289    intptr_t i, oprsz = simd_oprsz(desc);                                  \
   1290    TYPE *d = vd, *n = vn, *m = vm;                                        \
   1291    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
   1292        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
   1293    }                                                                      \
   1294    clear_tail(d, oprsz, simd_maxsz(desc));                                \
   1295}
   1296
   1297DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
   1298DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
   1299
   1300DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
   1301DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
   1302
   1303DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
   1304DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
   1305
   1306DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
   1307DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
   1308
   1309/* For the indexed ops, SVE applies the index per 128-bit vector segment.
   1310 * For AdvSIMD, there is of course only one such vector segment.
   1311 */
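/*
 * For example, gvec_mul_idx_s with idx = 1 on a 256-bit SVE vector
 * multiplies lanes 0..3 by m[1] and lanes 4..7 by m[5]: the index is
 * applied relative to the start of each 16-byte segment of vm.
 */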
   1312
   1313#define DO_MUL_IDX(NAME, TYPE, H) \
   1314void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
   1315{                                                                          \
   1316    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
   1317    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
   1318    intptr_t idx = simd_data(desc);                                        \
   1319    TYPE *d = vd, *n = vn, *m = vm;                                        \
   1320    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
   1321        TYPE mm = m[H(i + idx)];                                           \
   1322        for (j = 0; j < segment; j++) {                                    \
   1323            d[i + j] = n[i + j] * mm;                                      \
   1324        }                                                                  \
   1325    }                                                                      \
   1326    clear_tail(d, oprsz, simd_maxsz(desc));                                \
   1327}
   1328
   1329DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
   1330DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
   1331DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
   1332
   1333#undef DO_MUL_IDX
   1334
   1335#define DO_MLA_IDX(NAME, TYPE, OP, H) \
   1336void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
   1337{                                                                          \
   1338    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
   1339    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
   1340    intptr_t idx = simd_data(desc);                                        \
   1341    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
   1342    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
   1343        TYPE mm = m[H(i + idx)];                                           \
   1344        for (j = 0; j < segment; j++) {                                    \
   1345            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
   1346        }                                                                  \
   1347    }                                                                      \
   1348    clear_tail(d, oprsz, simd_maxsz(desc));                                \
   1349}
   1350
   1351DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
   1352DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
   1353DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
   1354
   1355DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
   1356DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
   1357DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
   1358
   1359#undef DO_MLA_IDX
   1360
   1361#define DO_FMUL_IDX(NAME, ADD, TYPE, H)                                    \
   1362void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
   1363{                                                                          \
   1364    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
   1365    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
   1366    intptr_t idx = simd_data(desc);                                        \
   1367    TYPE *d = vd, *n = vn, *m = vm;                                        \
   1368    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
   1369        TYPE mm = m[H(i + idx)];                                           \
   1370        for (j = 0; j < segment; j++) {                                    \
   1371            d[i + j] = TYPE##_##ADD(d[i + j],                              \
   1372                                    TYPE##_mul(n[i + j], mm, stat), stat); \
   1373        }                                                                  \
   1374    }                                                                      \
   1375    clear_tail(d, oprsz, simd_maxsz(desc));                                \
   1376}
   1377
   1378#define float16_nop(N, M, S) (M)
   1379#define float32_nop(N, M, S) (M)
   1380#define float64_nop(N, M, S) (M)
   1381
   1382DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
   1383DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
   1384DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
   1385
   1386/*
   1387 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
    1388 * the fused ops below they accumulate both from and into Vd.
   1389 */
   1390DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
   1391DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
   1392DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
   1393DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
   1394
   1395#undef float16_nop
   1396#undef float32_nop
   1397#undef float64_nop
   1398#undef DO_FMUL_IDX
   1399
   1400#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
   1401void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
   1402                  void *stat, uint32_t desc)                               \
   1403{                                                                          \
   1404    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
   1405    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
   1406    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
   1407    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
   1408    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
   1409    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
   1410    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
   1411        TYPE mm = m[H(i + idx)];                                           \
   1412        for (j = 0; j < segment; j++) {                                    \
   1413            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
   1414                                     mm, a[i + j], 0, stat);               \
   1415        }                                                                  \
   1416    }                                                                      \
   1417    clear_tail(d, oprsz, simd_maxsz(desc));                                \
   1418}
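/*
 * Sketch of the descriptor layout decoded above (the packing on the
 * caller's side is assumed here, inferred from the decode):
 *
 *     desc = simd_desc(oprsz, maxsz, (index << 1) | negate_op1);
 *
 * i.e. bit 0 of the data field selects FMLS-style negation of op1 and
 * the remaining bits hold the element index.
 */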
   1419
   1420DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
   1421DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
   1422DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
   1423
   1424#undef DO_FMLA_IDX
   1425
   1426#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
   1427void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
   1428{                                                                          \
   1429    intptr_t i, oprsz = simd_oprsz(desc);                                  \
   1430    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
   1431    bool q = false;                                                        \
   1432    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
   1433        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
   1434        if (dd < MIN) {                                                    \
   1435            dd = MIN;                                                      \
   1436            q = true;                                                      \
   1437        } else if (dd > MAX) {                                             \
   1438            dd = MAX;                                                      \
   1439            q = true;                                                      \
   1440        }                                                                  \
   1441        d[i] = dd;                                                         \
   1442    }                                                                      \
   1443    if (q) {                                                               \
   1444        uint32_t *qc = vq;                                                 \
   1445        qc[0] = 1;                                                         \
   1446    }                                                                      \
   1447    clear_tail(d, oprsz, simd_maxsz(desc));                                \
   1448}
   1449
   1450DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
   1451DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
   1452DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
   1453
   1454DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
   1455DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
   1456DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
   1457
   1458DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
   1459DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
   1460DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
   1461
   1462DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
   1463DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
   1464DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
   1465
   1466#undef DO_SAT
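/*
 * Worked example: gvec_uqadd_b with n[i] == 200 and m[i] == 100
 * computes the widened sum 300, which exceeds UINT8_MAX, so the lane
 * saturates to 255 and the 32-bit QC word pointed to by vq is set.
 */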
   1467
   1468void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
   1469                          void *vm, uint32_t desc)
   1470{
   1471    intptr_t i, oprsz = simd_oprsz(desc);
   1472    uint64_t *d = vd, *n = vn, *m = vm;
   1473    bool q = false;
   1474
   1475    for (i = 0; i < oprsz / 8; i++) {
   1476        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
   1477        if (dd < nn) {
   1478            dd = UINT64_MAX;
   1479            q = true;
   1480        }
   1481        d[i] = dd;
   1482    }
   1483    if (q) {
   1484        uint32_t *qc = vq;
   1485        qc[0] = 1;
   1486    }
   1487    clear_tail(d, oprsz, simd_maxsz(desc));
   1488}
   1489
   1490void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
   1491                          void *vm, uint32_t desc)
   1492{
   1493    intptr_t i, oprsz = simd_oprsz(desc);
   1494    uint64_t *d = vd, *n = vn, *m = vm;
   1495    bool q = false;
   1496
   1497    for (i = 0; i < oprsz / 8; i++) {
   1498        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
   1499        if (nn < mm) {
   1500            dd = 0;
   1501            q = true;
   1502        }
   1503        d[i] = dd;
   1504    }
   1505    if (q) {
   1506        uint32_t *qc = vq;
   1507        qc[0] = 1;
   1508    }
   1509    clear_tail(d, oprsz, simd_maxsz(desc));
   1510}
   1511
   1512void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
   1513                          void *vm, uint32_t desc)
   1514{
   1515    intptr_t i, oprsz = simd_oprsz(desc);
   1516    int64_t *d = vd, *n = vn, *m = vm;
   1517    bool q = false;
   1518
   1519    for (i = 0; i < oprsz / 8; i++) {
   1520        int64_t nn = n[i], mm = m[i], dd = nn + mm;
   1521        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
   1522            dd = (nn >> 63) ^ ~INT64_MIN;
   1523            q = true;
   1524        }
   1525        d[i] = dd;
   1526    }
   1527    if (q) {
   1528        uint32_t *qc = vq;
   1529        qc[0] = 1;
   1530    }
   1531    clear_tail(d, oprsz, simd_maxsz(desc));
   1532}
   1533
   1534void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
   1535                          void *vm, uint32_t desc)
   1536{
   1537    intptr_t i, oprsz = simd_oprsz(desc);
   1538    int64_t *d = vd, *n = vn, *m = vm;
   1539    bool q = false;
   1540
   1541    for (i = 0; i < oprsz / 8; i++) {
   1542        int64_t nn = n[i], mm = m[i], dd = nn - mm;
   1543        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
   1544            dd = (nn >> 63) ^ ~INT64_MIN;
   1545            q = true;
   1546        }
   1547        d[i] = dd;
   1548    }
   1549    if (q) {
   1550        uint32_t *qc = vq;
   1551        qc[0] = 1;
   1552    }
   1553    clear_tail(d, oprsz, simd_maxsz(desc));
   1554}
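/*
 * Worked example of the signed-overflow tests above: in gvec_sqadd_d
 * with nn == INT64_MAX and mm == 1, dd wraps to INT64_MIN, so
 * (dd ^ nn) has the sign bit set while (nn ^ mm) does not; the test
 * fires and dd becomes (nn >> 63) ^ ~INT64_MIN == INT64_MAX.  The
 * subtraction helpers use (nn ^ mm) rather than ~(nn ^ mm) because
 * nn - mm can only overflow when the operands have opposite signs.
 */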
   1555
   1556
   1557#define DO_SRA(NAME, TYPE)                              \
   1558void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
   1559{                                                       \
   1560    intptr_t i, oprsz = simd_oprsz(desc);               \
   1561    int shift = simd_data(desc);                        \
   1562    TYPE *d = vd, *n = vn;                              \
   1563    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
   1564        d[i] += n[i] >> shift;                          \
   1565    }                                                   \
   1566    clear_tail(d, oprsz, simd_maxsz(desc));             \
   1567}
   1568
   1569DO_SRA(gvec_ssra_b, int8_t)
   1570DO_SRA(gvec_ssra_h, int16_t)
   1571DO_SRA(gvec_ssra_s, int32_t)
   1572DO_SRA(gvec_ssra_d, int64_t)
   1573
   1574DO_SRA(gvec_usra_b, uint8_t)
   1575DO_SRA(gvec_usra_h, uint16_t)
   1576DO_SRA(gvec_usra_s, uint32_t)
   1577DO_SRA(gvec_usra_d, uint64_t)
   1578
   1579#undef DO_SRA
   1580
   1581#define DO_RSHR(NAME, TYPE)                             \
   1582void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
   1583{                                                       \
   1584    intptr_t i, oprsz = simd_oprsz(desc);               \
   1585    int shift = simd_data(desc);                        \
   1586    TYPE *d = vd, *n = vn;                              \
   1587    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
   1588        TYPE tmp = n[i] >> (shift - 1);                 \
   1589        d[i] = (tmp >> 1) + (tmp & 1);                  \
   1590    }                                                   \
   1591    clear_tail(d, oprsz, simd_maxsz(desc));             \
   1592}
   1593
   1594DO_RSHR(gvec_srshr_b, int8_t)
   1595DO_RSHR(gvec_srshr_h, int16_t)
   1596DO_RSHR(gvec_srshr_s, int32_t)
   1597DO_RSHR(gvec_srshr_d, int64_t)
   1598
   1599DO_RSHR(gvec_urshr_b, uint8_t)
   1600DO_RSHR(gvec_urshr_h, uint16_t)
   1601DO_RSHR(gvec_urshr_s, uint32_t)
   1602DO_RSHR(gvec_urshr_d, uint64_t)
   1603
   1604#undef DO_RSHR
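/*
 * Worked example: gvec_urshr_b with shift == 2 and n[i] == 7 computes
 * tmp == 3 and d[i] == (3 >> 1) + (3 & 1) == 2.  The two-step form is
 * equivalent to adding the rounding constant 1 << (shift - 1) before
 * shifting, without risking overflow of the lane type.
 */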
   1605
   1606#define DO_RSRA(NAME, TYPE)                             \
   1607void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
   1608{                                                       \
   1609    intptr_t i, oprsz = simd_oprsz(desc);               \
   1610    int shift = simd_data(desc);                        \
   1611    TYPE *d = vd, *n = vn;                              \
   1612    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
   1613        TYPE tmp = n[i] >> (shift - 1);                 \
   1614        d[i] += (tmp >> 1) + (tmp & 1);                 \
   1615    }                                                   \
   1616    clear_tail(d, oprsz, simd_maxsz(desc));             \
   1617}
   1618
   1619DO_RSRA(gvec_srsra_b, int8_t)
   1620DO_RSRA(gvec_srsra_h, int16_t)
   1621DO_RSRA(gvec_srsra_s, int32_t)
   1622DO_RSRA(gvec_srsra_d, int64_t)
   1623
   1624DO_RSRA(gvec_ursra_b, uint8_t)
   1625DO_RSRA(gvec_ursra_h, uint16_t)
   1626DO_RSRA(gvec_ursra_s, uint32_t)
   1627DO_RSRA(gvec_ursra_d, uint64_t)
   1628
   1629#undef DO_RSRA
   1630
   1631#define DO_SRI(NAME, TYPE)                              \
   1632void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
   1633{                                                       \
   1634    intptr_t i, oprsz = simd_oprsz(desc);               \
   1635    int shift = simd_data(desc);                        \
   1636    TYPE *d = vd, *n = vn;                              \
   1637    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
   1638        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
   1639    }                                                   \
   1640    clear_tail(d, oprsz, simd_maxsz(desc));             \
   1641}
   1642
   1643DO_SRI(gvec_sri_b, uint8_t)
   1644DO_SRI(gvec_sri_h, uint16_t)
   1645DO_SRI(gvec_sri_s, uint32_t)
   1646DO_SRI(gvec_sri_d, uint64_t)
   1647
   1648#undef DO_SRI
   1649
   1650#define DO_SLI(NAME, TYPE)                              \
   1651void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
   1652{                                                       \
   1653    intptr_t i, oprsz = simd_oprsz(desc);               \
   1654    int shift = simd_data(desc);                        \
   1655    TYPE *d = vd, *n = vn;                              \
   1656    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
   1657        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
   1658    }                                                   \
   1659    clear_tail(d, oprsz, simd_maxsz(desc));             \
   1660}
   1661
   1662DO_SLI(gvec_sli_b, uint8_t)
   1663DO_SLI(gvec_sli_h, uint16_t)
   1664DO_SLI(gvec_sli_s, uint32_t)
   1665DO_SLI(gvec_sli_d, uint64_t)
   1666
   1667#undef DO_SLI
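/*
 * Worked example: with byte elements and shift == 4, SRI computes
 * deposit64(d[i], 0, 4, n[i] >> 4), so the top four bits of n land in
 * the bottom four bits of d and the top four bits of d are preserved;
 * SLI computes deposit64(d[i], 4, 4, n[i]), shifting n left by four
 * while keeping the bottom four bits of d.
 */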
   1668
   1669/*
   1670 * Convert float16 to float32, raising no exceptions and
   1671 * preserving exceptional values, including SNaN.
   1672 * This is effectively an unpack+repack operation.
   1673 */
   1674static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
   1675{
   1676    const int f16_bias = 15;
   1677    const int f32_bias = 127;
   1678    uint32_t sign = extract32(f16, 15, 1);
   1679    uint32_t exp = extract32(f16, 10, 5);
   1680    uint32_t frac = extract32(f16, 0, 10);
   1681
   1682    if (exp == 0x1f) {
   1683        /* Inf or NaN */
   1684        exp = 0xff;
   1685    } else if (exp == 0) {
   1686        /* Zero or denormal.  */
   1687        if (frac != 0) {
   1688            if (fz16) {
   1689                frac = 0;
   1690            } else {
   1691                /*
   1692                 * Denormal; these are all normal float32.
   1693                 * Shift the fraction so that the msb is at bit 11,
   1694                 * then remove bit 11 as the implicit bit of the
   1695                 * normalized float32.  Note that we still go through
   1696                 * the shift for normal numbers below, to put the
   1697                 * float32 fraction at the right place.
   1698                 */
   1699                int shift = clz32(frac) - 21;
   1700                frac = (frac << shift) & 0x3ff;
   1701                exp = f32_bias - f16_bias - shift + 1;
   1702            }
   1703        }
   1704    } else {
   1705        /* Normal number; adjust the bias.  */
   1706        exp += f32_bias - f16_bias;
   1707    }
   1708    sign <<= 31;
   1709    exp <<= 23;
   1710    frac <<= 23 - 10;
   1711
   1712    return sign | exp | frac;
   1713}
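/*
 * Worked example: 0x3c00 (1.0 as float16) has sign 0, exp 15, frac 0;
 * the bias adjustment gives exp 127 and the repack yields 0x3f800000,
 * i.e. 1.0 as float32.  The smallest subnormal, 0x0001, renormalizes
 * with shift == 10 to frac 0 and exp 103, giving 0x33800000 == 2^-24,
 * all without raising any FP exception flags.
 */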
   1714
   1715static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
   1716{
   1717    /*
   1718     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
   1719     * Load the 2nd qword iff is_q & is_2.
   1720     * Shift to the 2nd dword iff !is_q & is_2.
   1721     * For !is_q & !is_2, the upper bits of the result are garbage.
   1722     */
   1723    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
   1724}
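/*
 * For example: with is_q == 0 and is_2 == 1 (the second-half variant
 * on a 64-bit vector), this evaluates to ptr[0] >> 32, i.e. halfwords
 * 2 and 3; with is_q == 1 and is_2 == 1 it loads ptr[1], i.e.
 * halfwords 4..7, with no shift applied.
 */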
   1725
   1726/*
   1727 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
    1728 * as there are not yet SVE versions that might use blocking.
   1729 */
   1730
   1731static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
   1732                     uint32_t desc, bool fz16)
   1733{
   1734    intptr_t i, oprsz = simd_oprsz(desc);
   1735    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
   1736    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
   1737    int is_q = oprsz == 16;
   1738    uint64_t n_4, m_4;
   1739
   1740    /* Pre-load all of the f16 data, avoiding overlap issues.  */
   1741    n_4 = load4_f16(vn, is_q, is_2);
   1742    m_4 = load4_f16(vm, is_q, is_2);
   1743
   1744    /* Negate all inputs for FMLSL at once.  */
   1745    if (is_s) {
   1746        n_4 ^= 0x8000800080008000ull;
   1747    }
   1748
   1749    for (i = 0; i < oprsz / 4; i++) {
   1750        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
   1751        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
   1752        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
   1753    }
   1754    clear_tail(d, oprsz, simd_maxsz(desc));
   1755}
   1756
   1757void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
   1758                            void *venv, uint32_t desc)
   1759{
   1760    CPUARMState *env = venv;
   1761    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
   1762             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
   1763}
   1764
   1765void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
   1766                            void *venv, uint32_t desc)
   1767{
   1768    CPUARMState *env = venv;
   1769    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
   1770             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
   1771}
   1772
   1773void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
   1774                               void *venv, uint32_t desc)
   1775{
   1776    intptr_t i, oprsz = simd_oprsz(desc);
   1777    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
   1778    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
   1779    CPUARMState *env = venv;
   1780    float_status *status = &env->vfp.fp_status;
   1781    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
   1782
   1783    for (i = 0; i < oprsz; i += sizeof(float32)) {
   1784        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
   1785        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
   1786        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
   1787        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
   1788        float32 aa = *(float32 *)(va + H1_4(i));
   1789
   1790        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
   1791    }
   1792}
   1793
   1794static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
   1795                         uint32_t desc, bool fz16)
   1796{
   1797    intptr_t i, oprsz = simd_oprsz(desc);
   1798    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
   1799    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
   1800    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
   1801    int is_q = oprsz == 16;
   1802    uint64_t n_4;
   1803    float32 m_1;
   1804
   1805    /* Pre-load all of the f16 data, avoiding overlap issues.  */
   1806    n_4 = load4_f16(vn, is_q, is_2);
   1807
   1808    /* Negate all inputs for FMLSL at once.  */
   1809    if (is_s) {
   1810        n_4 ^= 0x8000800080008000ull;
   1811    }
   1812
   1813    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
   1814
   1815    for (i = 0; i < oprsz / 4; i++) {
   1816        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
   1817        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
   1818    }
   1819    clear_tail(d, oprsz, simd_maxsz(desc));
   1820}
   1821
   1822void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
   1823                                void *venv, uint32_t desc)
   1824{
   1825    CPUARMState *env = venv;
   1826    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
   1827                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
   1828}
   1829
   1830void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
   1831                                void *venv, uint32_t desc)
   1832{
   1833    CPUARMState *env = venv;
   1834    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
   1835                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
   1836}
   1837
   1838void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
   1839                               void *venv, uint32_t desc)
   1840{
   1841    intptr_t i, j, oprsz = simd_oprsz(desc);
   1842    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
   1843    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
   1844    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
   1845    CPUARMState *env = venv;
   1846    float_status *status = &env->vfp.fp_status;
   1847    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
   1848
   1849    for (i = 0; i < oprsz; i += 16) {
   1850        float16 mm_16 = *(float16 *)(vm + i + idx);
   1851        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
   1852
   1853        for (j = 0; j < 16; j += sizeof(float32)) {
   1854            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
   1855            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
   1856            float32 aa = *(float32 *)(va + H1_4(i + j));
   1857
   1858            *(float32 *)(vd + H1_4(i + j)) =
   1859                float32_muladd(nn, mm, aa, 0, status);
   1860        }
   1861    }
   1862}
   1863
   1864void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
   1865{
   1866    intptr_t i, opr_sz = simd_oprsz(desc);
   1867    int8_t *d = vd, *n = vn, *m = vm;
   1868
   1869    for (i = 0; i < opr_sz; ++i) {
   1870        int8_t mm = m[i];
   1871        int8_t nn = n[i];
   1872        int8_t res = 0;
   1873        if (mm >= 0) {
   1874            if (mm < 8) {
   1875                res = nn << mm;
   1876            }
   1877        } else {
   1878            res = nn >> (mm > -8 ? -mm : 7);
   1879        }
   1880        d[i] = res;
   1881    }
   1882    clear_tail(d, opr_sz, simd_maxsz(desc));
   1883}
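/*
 * For example: a shift amount of -3 in m[] shifts n[i] right
 * arithmetically by 3; amounts of -8 and below collapse to a shift by
 * 7 (leaving only the sign), and amounts of 8 and above produce 0.
 */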
   1884
   1885void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
   1886{
   1887    intptr_t i, opr_sz = simd_oprsz(desc);
   1888    int16_t *d = vd, *n = vn, *m = vm;
   1889
   1890    for (i = 0; i < opr_sz / 2; ++i) {
   1891        int8_t mm = m[i];   /* only 8 bits of shift are significant */
   1892        int16_t nn = n[i];
   1893        int16_t res = 0;
   1894        if (mm >= 0) {
   1895            if (mm < 16) {
   1896                res = nn << mm;
   1897            }
   1898        } else {
   1899            res = nn >> (mm > -16 ? -mm : 15);
   1900        }
   1901        d[i] = res;
   1902    }
   1903    clear_tail(d, opr_sz, simd_maxsz(desc));
   1904}
   1905
   1906void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
   1907{
   1908    intptr_t i, opr_sz = simd_oprsz(desc);
   1909    uint8_t *d = vd, *n = vn, *m = vm;
   1910
   1911    for (i = 0; i < opr_sz; ++i) {
   1912        int8_t mm = m[i];
   1913        uint8_t nn = n[i];
   1914        uint8_t res = 0;
   1915        if (mm >= 0) {
   1916            if (mm < 8) {
   1917                res = nn << mm;
   1918            }
   1919        } else {
   1920            if (mm > -8) {
   1921                res = nn >> -mm;
   1922            }
   1923        }
   1924        d[i] = res;
   1925    }
   1926    clear_tail(d, opr_sz, simd_maxsz(desc));
   1927}
   1928
   1929void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
   1930{
   1931    intptr_t i, opr_sz = simd_oprsz(desc);
   1932    uint16_t *d = vd, *n = vn, *m = vm;
   1933
   1934    for (i = 0; i < opr_sz / 2; ++i) {
   1935        int8_t mm = m[i];   /* only 8 bits of shift are significant */
   1936        uint16_t nn = n[i];
   1937        uint16_t res = 0;
   1938        if (mm >= 0) {
   1939            if (mm < 16) {
   1940                res = nn << mm;
   1941            }
   1942        } else {
   1943            if (mm > -16) {
   1944                res = nn >> -mm;
   1945            }
   1946        }
   1947        d[i] = res;
   1948    }
   1949    clear_tail(d, opr_sz, simd_maxsz(desc));
   1950}
   1951
   1952/*
   1953 * 8x8->8 polynomial multiply.
   1954 *
   1955 * Polynomial multiplication is like integer multiplication except the
   1956 * partial products are XORed, not added.
   1957 *
   1958 * TODO: expose this as a generic vector operation, as it is a common
   1959 * crypto building block.
   1960 */
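/*
 * Worked example: 0x06 * 0x03 as polynomials is
 * (x^2 + x) * (x + 1) = x^3 + x, i.e. (0x03 << 1) ^ (0x03 << 2) = 0x0a,
 * whereas the integer product would be 18.
 */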
   1961void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
   1962{
   1963    intptr_t i, j, opr_sz = simd_oprsz(desc);
   1964    uint64_t *d = vd, *n = vn, *m = vm;
   1965
   1966    for (i = 0; i < opr_sz / 8; ++i) {
   1967        uint64_t nn = n[i];
   1968        uint64_t mm = m[i];
   1969        uint64_t rr = 0;
   1970
   1971        for (j = 0; j < 8; ++j) {
   1972            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
   1973            rr ^= mm & mask;
   1974            mm = (mm << 1) & 0xfefefefefefefefeull;
   1975            nn >>= 1;
   1976        }
   1977        d[i] = rr;
   1978    }
   1979    clear_tail(d, opr_sz, simd_maxsz(desc));
   1980}
   1981
   1982/*
   1983 * 64x64->128 polynomial multiply.
    1984 * Because the lanes are not accessed in strict columns,
   1985 * this probably cannot be turned into a generic helper.
   1986 */
   1987void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
   1988{
   1989    intptr_t i, j, opr_sz = simd_oprsz(desc);
   1990    intptr_t hi = simd_data(desc);
   1991    uint64_t *d = vd, *n = vn, *m = vm;
   1992
   1993    for (i = 0; i < opr_sz / 8; i += 2) {
   1994        uint64_t nn = n[i + hi];
   1995        uint64_t mm = m[i + hi];
   1996        uint64_t rhi = 0;
   1997        uint64_t rlo = 0;
   1998
   1999        /* Bit 0 can only influence the low 64-bit result.  */
   2000        if (nn & 1) {
   2001            rlo = mm;
   2002        }
   2003
   2004        for (j = 1; j < 64; ++j) {
   2005            uint64_t mask = -((nn >> j) & 1);
   2006            rlo ^= (mm << j) & mask;
   2007            rhi ^= (mm >> (64 - j)) & mask;
   2008        }
   2009        d[i] = rlo;
   2010        d[i + 1] = rhi;
   2011    }
   2012    clear_tail(d, opr_sz, simd_maxsz(desc));
   2013}
   2014
   2015/*
   2016 * 8x8->16 polynomial multiply.
   2017 *
   2018 * The byte inputs are expanded to (or extracted from) half-words.
    2019 * Note that Neon and SVE2 get the inputs from different positions.
   2020 * This allows 4 bytes to be processed in parallel with uint64_t.
   2021 */
   2022
   2023static uint64_t expand_byte_to_half(uint64_t x)
   2024{
   2025    return  (x & 0x000000ff)
   2026         | ((x & 0x0000ff00) << 8)
   2027         | ((x & 0x00ff0000) << 16)
   2028         | ((x & 0xff000000) << 24);
   2029}
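/*
 * For example:
 *     expand_byte_to_half(0x44332211) == 0x0044003300220011
 * spreading four packed bytes into four zero-extended halfwords so
 * that pmull_h() can form four 8x8->16 products in one 64-bit value.
 */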
   2030
   2031uint64_t pmull_w(uint64_t op1, uint64_t op2)
   2032{
   2033    uint64_t result = 0;
   2034    int i;
   2035    for (i = 0; i < 16; ++i) {
   2036        uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
   2037        result ^= op2 & mask;
   2038        op1 >>= 1;
   2039        op2 <<= 1;
   2040    }
   2041    return result;
   2042}
   2043
   2044uint64_t pmull_h(uint64_t op1, uint64_t op2)
   2045{
   2046    uint64_t result = 0;
   2047    int i;
   2048    for (i = 0; i < 8; ++i) {
   2049        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
   2050        result ^= op2 & mask;
   2051        op1 >>= 1;
   2052        op2 <<= 1;
   2053    }
   2054    return result;
   2055}
   2056
   2057void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
   2058{
   2059    int hi = simd_data(desc);
   2060    uint64_t *d = vd, *n = vn, *m = vm;
   2061    uint64_t nn = n[hi], mm = m[hi];
   2062
   2063    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
   2064    nn >>= 32;
   2065    mm >>= 32;
   2066    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
   2067
   2068    clear_tail(d, 16, simd_maxsz(desc));
   2069}
   2070
   2071#ifdef TARGET_AARCH64
   2072void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
   2073{
   2074    int shift = simd_data(desc) * 8;
   2075    intptr_t i, opr_sz = simd_oprsz(desc);
   2076    uint64_t *d = vd, *n = vn, *m = vm;
   2077
   2078    for (i = 0; i < opr_sz / 8; ++i) {
   2079        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
   2080        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
   2081
   2082        d[i] = pmull_h(nn, mm);
   2083    }
   2084}
   2085
   2086static uint64_t pmull_d(uint64_t op1, uint64_t op2)
   2087{
   2088    uint64_t result = 0;
   2089    int i;
   2090
   2091    for (i = 0; i < 32; ++i) {
   2092        uint64_t mask = -((op1 >> i) & 1);
   2093        result ^= (op2 << i) & mask;
   2094    }
   2095    return result;
   2096}
   2097
   2098void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
   2099{
   2100    intptr_t sel = H4(simd_data(desc));
   2101    intptr_t i, opr_sz = simd_oprsz(desc);
   2102    uint32_t *n = vn, *m = vm;
   2103    uint64_t *d = vd;
   2104
   2105    for (i = 0; i < opr_sz / 8; ++i) {
   2106        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
   2107    }
   2108}
   2109#endif
   2110
   2111#define DO_CMP0(NAME, TYPE, OP)                         \
   2112void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
   2113{                                                       \
   2114    intptr_t i, opr_sz = simd_oprsz(desc);              \
   2115    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
   2116        TYPE nn = *(TYPE *)(vn + i);                    \
   2117        *(TYPE *)(vd + i) = -(nn OP 0);                 \
   2118    }                                                   \
   2119    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
   2120}
   2121
   2122DO_CMP0(gvec_ceq0_b, int8_t, ==)
   2123DO_CMP0(gvec_clt0_b, int8_t, <)
   2124DO_CMP0(gvec_cle0_b, int8_t, <=)
   2125DO_CMP0(gvec_cgt0_b, int8_t, >)
   2126DO_CMP0(gvec_cge0_b, int8_t, >=)
   2127
   2128DO_CMP0(gvec_ceq0_h, int16_t, ==)
   2129DO_CMP0(gvec_clt0_h, int16_t, <)
   2130DO_CMP0(gvec_cle0_h, int16_t, <=)
   2131DO_CMP0(gvec_cgt0_h, int16_t, >)
   2132DO_CMP0(gvec_cge0_h, int16_t, >=)
   2133
   2134#undef DO_CMP0
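/*
 * Note on the -(nn OP 0) idiom above: the comparison yields 0 or 1,
 * so its negation produces the all-zeros or all-ones element mask,
 * e.g. for nn == -5 and OP == '<', -(int8_t)1 == (int8_t)0xff.
 */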
   2135
   2136#define DO_ABD(NAME, TYPE)                                      \
   2137void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
   2138{                                                               \
   2139    intptr_t i, opr_sz = simd_oprsz(desc);                      \
   2140    TYPE *d = vd, *n = vn, *m = vm;                             \
   2141                                                                \
   2142    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
   2143        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
   2144    }                                                           \
   2145    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
   2146}
   2147
   2148DO_ABD(gvec_sabd_b, int8_t)
   2149DO_ABD(gvec_sabd_h, int16_t)
   2150DO_ABD(gvec_sabd_s, int32_t)
   2151DO_ABD(gvec_sabd_d, int64_t)
   2152
   2153DO_ABD(gvec_uabd_b, uint8_t)
   2154DO_ABD(gvec_uabd_h, uint16_t)
   2155DO_ABD(gvec_uabd_s, uint32_t)
   2156DO_ABD(gvec_uabd_d, uint64_t)
   2157
   2158#undef DO_ABD
   2159
   2160#define DO_ABA(NAME, TYPE)                                      \
   2161void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
   2162{                                                               \
   2163    intptr_t i, opr_sz = simd_oprsz(desc);                      \
   2164    TYPE *d = vd, *n = vn, *m = vm;                             \
   2165                                                                \
   2166    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
   2167        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
   2168    }                                                           \
   2169    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
   2170}
   2171
   2172DO_ABA(gvec_saba_b, int8_t)
   2173DO_ABA(gvec_saba_h, int16_t)
   2174DO_ABA(gvec_saba_s, int32_t)
   2175DO_ABA(gvec_saba_d, int64_t)
   2176
   2177DO_ABA(gvec_uaba_b, uint8_t)
   2178DO_ABA(gvec_uaba_h, uint16_t)
   2179DO_ABA(gvec_uaba_s, uint32_t)
   2180DO_ABA(gvec_uaba_d, uint64_t)
   2181
   2182#undef DO_ABA
   2183
   2184#define DO_NEON_PAIRWISE(NAME, OP)                                      \
   2185    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
   2186                         void *stat, uint32_t oprsz)                    \
   2187    {                                                                   \
   2188        float_status *fpst = stat;                                      \
   2189        float32 *d = vd;                                                \
   2190        float32 *n = vn;                                                \
   2191        float32 *m = vm;                                                \
   2192        float32 r0, r1;                                                 \
   2193                                                                        \
   2194        /* Read all inputs before writing outputs in case vm == vd */   \
   2195        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
   2196        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
   2197                                                                        \
   2198        d[H4(0)] = r0;                                                  \
   2199        d[H4(1)] = r1;                                                  \
   2200    }                                                                   \
   2201                                                                        \
   2202    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
   2203                         void *stat, uint32_t oprsz)                    \
   2204    {                                                                   \
   2205        float_status *fpst = stat;                                      \
   2206        float16 *d = vd;                                                \
   2207        float16 *n = vn;                                                \
   2208        float16 *m = vm;                                                \
   2209        float16 r0, r1, r2, r3;                                         \
   2210                                                                        \
   2211        /* Read all inputs before writing outputs in case vm == vd */   \
   2212        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
   2213        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
   2214        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
   2215        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
   2216                                                                        \
   2217        d[H2(0)] = r0;                                                  \
   2218        d[H2(1)] = r1;                                                  \
   2219        d[H2(2)] = r2;                                                  \
   2220        d[H2(3)] = r3;                                                  \
   2221    }
   2222
   2223DO_NEON_PAIRWISE(neon_padd, add)
   2224DO_NEON_PAIRWISE(neon_pmax, max)
   2225DO_NEON_PAIRWISE(neon_pmin, min)
   2226
   2227#undef DO_NEON_PAIRWISE
   2228
   2229#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
   2230    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
   2231    {                                                                   \
   2232        intptr_t i, oprsz = simd_oprsz(desc);                           \
   2233        int shift = simd_data(desc);                                    \
   2234        TYPE *d = vd, *n = vn;                                          \
   2235        float_status *fpst = stat;                                      \
   2236        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
   2237            d[i] = FUNC(n[i], shift, fpst);                             \
   2238        }                                                               \
   2239        clear_tail(d, oprsz, simd_maxsz(desc));                         \
   2240    }
   2241
   2242DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
   2243DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
   2244DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
   2245DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
   2246DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
   2247DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
   2248DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
   2249DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
   2250
   2251#undef DO_VCVT_FIXED
   2252
   2253#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
   2254    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
   2255    {                                                                   \
   2256        float_status *fpst = stat;                                      \
   2257        intptr_t i, oprsz = simd_oprsz(desc);                           \
   2258        uint32_t rmode = simd_data(desc);                               \
   2259        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
   2260        TYPE *d = vd, *n = vn;                                          \
   2261        set_float_rounding_mode(rmode, fpst);                           \
   2262        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
   2263            d[i] = FUNC(n[i], 0, fpst);                                 \
   2264        }                                                               \
   2265        set_float_rounding_mode(prev_rmode, fpst);                      \
   2266        clear_tail(d, oprsz, simd_maxsz(desc));                         \
   2267    }
   2268
   2269DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
   2270DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
   2271DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
   2272DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
   2273
   2274#undef DO_VCVT_RMODE
   2275
   2276#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
   2277    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
   2278    {                                                                   \
   2279        float_status *fpst = stat;                                      \
   2280        intptr_t i, oprsz = simd_oprsz(desc);                           \
   2281        uint32_t rmode = simd_data(desc);                               \
   2282        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
   2283        TYPE *d = vd, *n = vn;                                          \
   2284        set_float_rounding_mode(rmode, fpst);                           \
   2285        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
   2286            d[i] = FUNC(n[i], fpst);                                    \
   2287        }                                                               \
   2288        set_float_rounding_mode(prev_rmode, fpst);                      \
   2289        clear_tail(d, oprsz, simd_maxsz(desc));                         \
   2290    }
   2291
   2292DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
   2293DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
   2294
   2295#undef DO_VRINT_RMODE
   2296
   2297#ifdef TARGET_AARCH64
   2298void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
   2299{
   2300    const uint8_t *indices = vm;
   2301    CPUARMState *env = venv;
   2302    size_t oprsz = simd_oprsz(desc);
   2303    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
   2304    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
   2305    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
   2306    union {
   2307        uint8_t b[16];
   2308        uint64_t d[2];
   2309    } result;
   2310
   2311    /*
   2312     * We must construct the final result in a temp, lest the output
    2313     * overlap the input table.  For TBL, begin with zero; for TBX,
   2314     * begin with the original register contents.  Note that we always
   2315     * copy 16 bytes here to avoid an extra branch; clearing the high
   2316     * bits of the register for oprsz == 8 is handled below.
   2317     */
   2318    if (is_tbx) {
   2319        memcpy(&result, vd, 16);
   2320    } else {
   2321        memset(&result, 0, 16);
   2322    }
   2323
   2324    for (size_t i = 0; i < oprsz; ++i) {
   2325        uint32_t index = indices[H1(i)];
   2326
   2327        if (index < table_len) {
   2328            /*
   2329             * Convert index (a byte offset into the virtual table
   2330             * which is a series of 128-bit vectors concatenated)
   2331             * into the correct register element, bearing in mind
   2332             * that the table can wrap around from V31 to V0.
   2333             */
   2334            const uint8_t *table = (const uint8_t *)
   2335                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
   2336            result.b[H1(i)] = table[H1(index % 16)];
   2337        }
   2338    }
   2339
   2340    memcpy(vd, &result, 16);
   2341    clear_tail(vd, oprsz, simd_maxsz(desc));
   2342}
   2343#endif
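/*
 * Worked example of the index conversion in simd_tblx (assuming a
 * three-register table, i.e. table_len == 48): with rn == 30, index 37
 * selects register (30 + (37 >> 4)) % 32 == 0, so the lookup wraps
 * past V31 to V0 and reads byte 37 % 16 == 5 of that register.
 */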
   2344
   2345/*
   2346 * NxN -> N highpart multiply
   2347 *
   2348 * TODO: expose this as a generic vector operation.
   2349 */
   2350
   2351void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
   2352{
   2353    intptr_t i, opr_sz = simd_oprsz(desc);
   2354    int8_t *d = vd, *n = vn, *m = vm;
   2355
   2356    for (i = 0; i < opr_sz; ++i) {
   2357        d[i] = ((int32_t)n[i] * m[i]) >> 8;
   2358    }
   2359    clear_tail(d, opr_sz, simd_maxsz(desc));
   2360}
   2361
   2362void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
   2363{
   2364    intptr_t i, opr_sz = simd_oprsz(desc);
   2365    int16_t *d = vd, *n = vn, *m = vm;
   2366
   2367    for (i = 0; i < opr_sz / 2; ++i) {
   2368        d[i] = ((int32_t)n[i] * m[i]) >> 16;
   2369    }
   2370    clear_tail(d, opr_sz, simd_maxsz(desc));
   2371}
   2372
   2373void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
   2374{
   2375    intptr_t i, opr_sz = simd_oprsz(desc);
   2376    int32_t *d = vd, *n = vn, *m = vm;
   2377
   2378    for (i = 0; i < opr_sz / 4; ++i) {
   2379        d[i] = ((int64_t)n[i] * m[i]) >> 32;
   2380    }
   2381    clear_tail(d, opr_sz, simd_maxsz(desc));
   2382}
   2383
   2384void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
   2385{
   2386    intptr_t i, opr_sz = simd_oprsz(desc);
   2387    uint64_t *d = vd, *n = vn, *m = vm;
   2388    uint64_t discard;
   2389
   2390    for (i = 0; i < opr_sz / 8; ++i) {
   2391        muls64(&discard, &d[i], n[i], m[i]);
   2392    }
   2393    clear_tail(d, opr_sz, simd_maxsz(desc));
   2394}
   2395
   2396void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
   2397{
   2398    intptr_t i, opr_sz = simd_oprsz(desc);
   2399    uint8_t *d = vd, *n = vn, *m = vm;
   2400
   2401    for (i = 0; i < opr_sz; ++i) {
   2402        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
   2403    }
   2404    clear_tail(d, opr_sz, simd_maxsz(desc));
   2405}
   2406
   2407void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
   2408{
   2409    intptr_t i, opr_sz = simd_oprsz(desc);
   2410    uint16_t *d = vd, *n = vn, *m = vm;
   2411
   2412    for (i = 0; i < opr_sz / 2; ++i) {
   2413        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
   2414    }
   2415    clear_tail(d, opr_sz, simd_maxsz(desc));
   2416}
   2417
   2418void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
   2419{
   2420    intptr_t i, opr_sz = simd_oprsz(desc);
   2421    uint32_t *d = vd, *n = vn, *m = vm;
   2422
   2423    for (i = 0; i < opr_sz / 4; ++i) {
   2424        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
   2425    }
   2426    clear_tail(d, opr_sz, simd_maxsz(desc));
   2427}
   2428
   2429void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
   2430{
   2431    intptr_t i, opr_sz = simd_oprsz(desc);
   2432    uint64_t *d = vd, *n = vn, *m = vm;
   2433    uint64_t discard;
   2434
   2435    for (i = 0; i < opr_sz / 8; ++i) {
   2436        mulu64(&discard, &d[i], n[i], m[i]);
   2437    }
   2438    clear_tail(d, opr_sz, simd_maxsz(desc));
   2439}
   2440
   2441void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
   2442{
   2443    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
   2444    int shr = simd_data(desc);
   2445    uint64_t *d = vd, *n = vn, *m = vm;
   2446
   2447    for (i = 0; i < opr_sz; ++i) {
   2448        d[i] = ror64(n[i] ^ m[i], shr);
   2449    }
   2450    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
   2451}
   2452
   2453/*
   2454 * Integer matrix-multiply accumulate
   2455 */
   2456
   2457static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
   2458{
   2459    int8_t *n = vn, *m = vm;
   2460
   2461    for (intptr_t k = 0; k < 8; ++k) {
   2462        sum += n[H1(k)] * m[H1(k)];
   2463    }
   2464    return sum;
   2465}
   2466
   2467static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
   2468{
   2469    uint8_t *n = vn, *m = vm;
   2470
   2471    for (intptr_t k = 0; k < 8; ++k) {
   2472        sum += n[H1(k)] * m[H1(k)];
   2473    }
   2474    return sum;
   2475}
   2476
   2477static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
   2478{
   2479    uint8_t *n = vn;
   2480    int8_t *m = vm;
   2481
   2482    for (intptr_t k = 0; k < 8; ++k) {
   2483        sum += n[H1(k)] * m[H1(k)];
   2484    }
   2485    return sum;
   2486}
   2487
   2488static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
   2489                      uint32_t (*inner_loop)(uint32_t, void *, void *))
   2490{
   2491    intptr_t seg, opr_sz = simd_oprsz(desc);
   2492
   2493    for (seg = 0; seg < opr_sz; seg += 16) {
   2494        uint32_t *d = vd + seg;
   2495        uint32_t *a = va + seg;
   2496        uint32_t sum0, sum1, sum2, sum3;
   2497
   2498        /*
   2499         * Process the entire segment at once, writing back the
   2500         * results only after we've consumed all of the inputs.
   2501         *
   2502         * Key to indices by column:
   2503         *          i   j                  i             j
   2504         */
   2505        sum0 = a[H4(0 + 0)];
   2506        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
   2507        sum1 = a[H4(0 + 1)];
   2508        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
   2509        sum2 = a[H4(2 + 0)];
   2510        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
   2511        sum3 = a[H4(2 + 1)];
   2512        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
   2513
   2514        d[H4(0)] = sum0;
   2515        d[H4(1)] = sum1;
   2516        d[H4(2)] = sum2;
   2517        d[H4(3)] = sum3;
   2518    }
   2519    clear_tail(vd, opr_sz, simd_maxsz(desc));
   2520}
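/*
 * Viewed as matrices (with the second operand assumed to be stored
 * transposed, as for the *MMLA instructions): each 16-byte segment of
 * n and m holds a 2x8 matrix of bytes in row-major order, and each
 * segment of d and a holds the 2x2 32-bit result, so sum1 above is
 * row 0 of n dotted with row 1 of m.
 */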
   2521
   2522#define DO_MMLA_B(NAME, INNER) \
   2523    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
   2524    { do_mmla_b(vd, vn, vm, va, desc, INNER); }
   2525
   2526DO_MMLA_B(gvec_smmla_b, do_smmla_b)
   2527DO_MMLA_B(gvec_ummla_b, do_ummla_b)
   2528DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
   2529
   2530/*
   2531 * BFloat16 Dot Product
   2532 */
   2533
   2534static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
   2535{
   2536    /* FPCR is ignored for BFDOT and BFMMLA. */
   2537    float_status bf_status = {
   2538        .tininess_before_rounding = float_tininess_before_rounding,
   2539        .float_rounding_mode = float_round_to_odd_inf,
   2540        .flush_to_zero = true,
   2541        .flush_inputs_to_zero = true,
   2542        .default_nan_mode = true,
   2543    };
   2544    float32 t1, t2;
   2545
   2546    /*
   2547     * Extract each BFloat16 from the element pair, and shift
   2548     * them such that they become float32.
   2549     */
   2550    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
   2551    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
   2552    t1 = float32_add(t1, t2, &bf_status);
   2553    t1 = float32_add(sum, t1, &bf_status);
   2554
   2555    return t1;
   2556}
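/*
 * Worked example: with e1 == e2 == 0x3f803f80, each element pair holds
 * two bfloat16 1.0 values; shifting the low half up by 16 and masking
 * the high half both yield float32 1.0, so 2.0 is added to sum under
 * the fixed flush-to-zero, round-to-odd status above.
 */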
   2557
   2558void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
   2559{
   2560    intptr_t i, opr_sz = simd_oprsz(desc);
   2561    float32 *d = vd, *a = va;
   2562    uint32_t *n = vn, *m = vm;
   2563
   2564    for (i = 0; i < opr_sz / 4; ++i) {
   2565        d[i] = bfdotadd(a[i], n[i], m[i]);
   2566    }
   2567    clear_tail(d, opr_sz, simd_maxsz(desc));
   2568}
   2569
   2570void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
   2571                            void *va, uint32_t desc)
   2572{
   2573    intptr_t i, j, opr_sz = simd_oprsz(desc);
   2574    intptr_t index = simd_data(desc);
   2575    intptr_t elements = opr_sz / 4;
   2576    intptr_t eltspersegment = MIN(16 / 4, elements);
   2577    float32 *d = vd, *a = va;
   2578    uint32_t *n = vn, *m = vm;
   2579
   2580    for (i = 0; i < elements; i += eltspersegment) {
   2581        uint32_t m_idx = m[i + H4(index)];
   2582
   2583        for (j = i; j < i + eltspersegment; j++) {
   2584            d[j] = bfdotadd(a[j], n[j], m_idx);
   2585        }
   2586    }
   2587    clear_tail(d, opr_sz, simd_maxsz(desc));
   2588}
   2589
   2590void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
   2591{
   2592    intptr_t s, opr_sz = simd_oprsz(desc);
   2593    float32 *d = vd, *a = va;
   2594    uint32_t *n = vn, *m = vm;
   2595
   2596    for (s = 0; s < opr_sz / 4; s += 4) {
   2597        float32 sum00, sum01, sum10, sum11;
   2598
   2599        /*
   2600         * Process the entire segment at once, writing back the
   2601         * results only after we've consumed all of the inputs.
   2602         *
    2603         * Key to indices by column:
   2604         *               i   j           i   k             j   k
   2605         */
   2606        sum00 = a[s + H4(0 + 0)];
   2607        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
   2608        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
   2609
   2610        sum01 = a[s + H4(0 + 1)];
   2611        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
   2612        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
   2613
   2614        sum10 = a[s + H4(2 + 0)];
   2615        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
   2616        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
   2617
   2618        sum11 = a[s + H4(2 + 1)];
   2619        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
   2620        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
   2621
   2622        d[s + H4(0 + 0)] = sum00;
   2623        d[s + H4(0 + 1)] = sum01;
   2624        d[s + H4(2 + 0)] = sum10;
   2625        d[s + H4(2 + 1)] = sum11;
   2626    }
   2627    clear_tail(d, opr_sz, simd_maxsz(desc));
   2628}
   2629
   2630void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
   2631                         void *stat, uint32_t desc)
   2632{
   2633    intptr_t i, opr_sz = simd_oprsz(desc);
   2634    intptr_t sel = simd_data(desc);
   2635    float32 *d = vd, *a = va;
   2636    bfloat16 *n = vn, *m = vm;
   2637
   2638    for (i = 0; i < opr_sz / 4; ++i) {
   2639        float32 nn = n[H2(i * 2 + sel)] << 16;
   2640        float32 mm = m[H2(i * 2 + sel)] << 16;
   2641        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
   2642    }
   2643    clear_tail(d, opr_sz, simd_maxsz(desc));
   2644}
   2645
   2646void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
   2647                             void *va, void *stat, uint32_t desc)
   2648{
   2649    intptr_t i, j, opr_sz = simd_oprsz(desc);
   2650    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
   2651    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
   2652    intptr_t elements = opr_sz / 4;
   2653    intptr_t eltspersegment = MIN(16 / 4, elements);
   2654    float32 *d = vd, *a = va;
   2655    bfloat16 *n = vn, *m = vm;
   2656
   2657    for (i = 0; i < elements; i += eltspersegment) {
   2658        float32 m_idx = m[H2(2 * i + index)] << 16;
   2659
   2660        for (j = i; j < i + eltspersegment; j++) {
   2661            float32 n_j = n[H2(2 * j + sel)] << 16;
   2662            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
   2663        }
   2664    }
   2665    clear_tail(d, opr_sz, simd_maxsz(desc));
   2666}