sve_helper.c
1/* 2 * ARM SVE Operations 3 * 4 * Copyright (c) 2018 Linaro, Ltd. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20#include "qemu/osdep.h" 21#include "cpu.h" 22#include "internals.h" 23#include "exec/exec-all.h" 24#include "exec/cpu_ldst.h" 25#include "exec/helper-proto.h" 26#include "tcg/tcg-gvec-desc.h" 27#include "fpu/softfloat.h" 28#include "tcg/tcg.h" 29#include "vec_internal.h" 30 31 32/* Return a value for NZCV as per the ARM PredTest pseudofunction. 33 * 34 * The return value has bit 31 set if N is set, bit 1 set if Z is clear, 35 * and bit 0 set if C is set. Compare the definitions of these variables 36 * within CPUARMState. 37 */ 38 39/* For no G bits set, NZCV = C. */ 40#define PREDTEST_INIT 1 41 42/* This is an iterative function, called for each Pd and Pg word 43 * moving forward. 44 */ 45static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) 46{ 47 if (likely(g)) { 48 /* Compute N from first D & G. 49 Use bit 2 to signal first G bit seen. */ 50 if (!(flags & 4)) { 51 flags |= ((d & (g & -g)) != 0) << 31; 52 flags |= 4; 53 } 54 55 /* Accumulate Z from each D & G. */ 56 flags |= ((d & g) != 0) << 1; 57 58 /* Compute C from last !(D & G). Replace previous. */ 59 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); 60 } 61 return flags; 62} 63 64/* This is an iterative function, called for each Pd and Pg word 65 * moving backward. 66 */ 67static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) 68{ 69 if (likely(g)) { 70 /* Compute C from first (i.e last) !(D & G). 71 Use bit 2 to signal first G bit seen. */ 72 if (!(flags & 4)) { 73 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ 74 flags |= (d & pow2floor(g)) == 0; 75 } 76 77 /* Accumulate Z from each D & G. */ 78 flags |= ((d & g) != 0) << 1; 79 80 /* Compute N from last (i.e first) D & G. Replace previous. */ 81 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); 82 } 83 return flags; 84} 85 86/* The same for a single word predicate. */ 87uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) 88{ 89 return iter_predtest_fwd(d, g, PREDTEST_INIT); 90} 91 92/* The same for a multi-word predicate. */ 93uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) 94{ 95 uint32_t flags = PREDTEST_INIT; 96 uint64_t *d = vd, *g = vg; 97 uintptr_t i = 0; 98 99 do { 100 flags = iter_predtest_fwd(d[i], g[i], flags); 101 } while (++i < words); 102 103 return flags; 104} 105 106/* 107 * Expand active predicate bits to bytes, for byte elements. 108 * (The data table itself is in vec_helper.c as MVE also needs it.) 109 */ 110static inline uint64_t expand_pred_b(uint8_t byte) 111{ 112 return expand_pred_b_data[byte]; 113} 114 115/* Similarly for half-word elements. 
116 * for (i = 0; i < 256; ++i) { 117 * unsigned long m = 0; 118 * if (i & 0xaa) { 119 * continue; 120 * } 121 * for (j = 0; j < 8; j += 2) { 122 * if ((i >> j) & 1) { 123 * m |= 0xfffful << (j << 3); 124 * } 125 * } 126 * printf("[0x%x] = 0x%016lx,\n", i, m); 127 * } 128 */ 129static inline uint64_t expand_pred_h(uint8_t byte) 130{ 131 static const uint64_t word[] = { 132 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 133 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 134 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 135 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 136 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 137 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 138 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 139 [0x55] = 0xffffffffffffffff, 140 }; 141 return word[byte & 0x55]; 142} 143 144/* Similarly for single word elements. */ 145static inline uint64_t expand_pred_s(uint8_t byte) 146{ 147 static const uint64_t word[] = { 148 [0x01] = 0x00000000ffffffffull, 149 [0x10] = 0xffffffff00000000ull, 150 [0x11] = 0xffffffffffffffffull, 151 }; 152 return word[byte & 0x11]; 153} 154 155#define LOGICAL_PPPP(NAME, FUNC) \ 156void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 157{ \ 158 uintptr_t opr_sz = simd_oprsz(desc); \ 159 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 160 uintptr_t i; \ 161 for (i = 0; i < opr_sz / 8; ++i) { \ 162 d[i] = FUNC(n[i], m[i], g[i]); \ 163 } \ 164} 165 166#define DO_AND(N, M, G) (((N) & (M)) & (G)) 167#define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 168#define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 169#define DO_ORR(N, M, G) (((N) | (M)) & (G)) 170#define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 171#define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 172#define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 173#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 174 175LOGICAL_PPPP(sve_and_pppp, DO_AND) 176LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 177LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 178LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 179LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 180LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 181LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 182LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 183 184#undef DO_AND 185#undef DO_BIC 186#undef DO_EOR 187#undef DO_ORR 188#undef DO_ORN 189#undef DO_NOR 190#undef DO_NAND 191#undef DO_SEL 192#undef LOGICAL_PPPP 193 194/* Fully general three-operand expander, controlled by a predicate. 195 * This is complicated by the host-endian storage of the register file. 196 */ 197/* ??? I don't expect the compiler could ever vectorize this itself. 198 * With some tables we can convert bit masks to byte masks, and with 199 * extra care wrt byte/word ordering we could use gcc generic vectors 200 * and do 16 bytes at a time. 201 */ 202#define DO_ZPZZ(NAME, TYPE, H, OP) \ 203void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 204{ \ 205 intptr_t i, opr_sz = simd_oprsz(desc); \ 206 for (i = 0; i < opr_sz; ) { \ 207 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 208 do { \ 209 if (pg & 1) { \ 210 TYPE nn = *(TYPE *)(vn + H(i)); \ 211 TYPE mm = *(TYPE *)(vm + H(i)); \ 212 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 213 } \ 214 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 215 } while (i & 15); \ 216 } \ 217} 218 219/* Similarly, specialized for 64-bit operands. 
*/ 220#define DO_ZPZZ_D(NAME, TYPE, OP) \ 221void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 222{ \ 223 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 224 TYPE *d = vd, *n = vn, *m = vm; \ 225 uint8_t *pg = vg; \ 226 for (i = 0; i < opr_sz; i += 1) { \ 227 if (pg[H1(i)] & 1) { \ 228 TYPE nn = n[i], mm = m[i]; \ 229 d[i] = OP(nn, mm); \ 230 } \ 231 } \ 232} 233 234#define DO_AND(N, M) (N & M) 235#define DO_EOR(N, M) (N ^ M) 236#define DO_ORR(N, M) (N | M) 237#define DO_BIC(N, M) (N & ~M) 238#define DO_ADD(N, M) (N + M) 239#define DO_SUB(N, M) (N - M) 240#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 241#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 242#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 243#define DO_MUL(N, M) (N * M) 244 245 246/* 247 * We must avoid the C undefined behaviour cases: division by 248 * zero and signed division of INT_MIN by -1. Both of these 249 * have architecturally defined required results for Arm. 250 * We special case all signed divisions by -1 to avoid having 251 * to deduce the minimum integer for the type involved. 252 */ 253#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 254#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 255 256DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 257DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 258DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 259DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 260 261DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 262DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 263DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 264DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 265 266DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 267DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 268DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 269DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 270 271DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 272DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 273DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 274DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 275 276DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 277DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 278DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 279DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 280 281DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 282DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 283DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 284DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 285 286DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 287DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 288DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 289DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 290 291DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 292DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 293DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 294DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 295 296DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 297DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 298DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 299DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 300 301DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 302DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 303DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 304DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 305 306DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 307DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 308DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 309DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 310 311DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 312DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, 
H1_2, DO_ABD) 313DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 314DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 315 316/* Because the computation type is at least twice as large as required, 317 these work for both signed and unsigned source types. */ 318static inline uint8_t do_mulh_b(int32_t n, int32_t m) 319{ 320 return (n * m) >> 8; 321} 322 323static inline uint16_t do_mulh_h(int32_t n, int32_t m) 324{ 325 return (n * m) >> 16; 326} 327 328static inline uint32_t do_mulh_s(int64_t n, int64_t m) 329{ 330 return (n * m) >> 32; 331} 332 333static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 334{ 335 uint64_t lo, hi; 336 muls64(&lo, &hi, n, m); 337 return hi; 338} 339 340static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 341{ 342 uint64_t lo, hi; 343 mulu64(&lo, &hi, n, m); 344 return hi; 345} 346 347DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 348DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 349DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 350DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 351 352DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 353DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 354DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 355DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 356 357DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 358DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 359DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 360DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 361 362DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 363DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 364 365DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 366DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 367 368/* Note that all bits of the shift are significant 369 and not modulo the element size. */ 370#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 371#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 372#define DO_LSL(N, M) (M < sizeof(N) * 8 ? 
N << M : 0) 373 374DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 375DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 376DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 377 378DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 379DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 380DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 381 382DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 383DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 384DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 385 386DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 387DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 388DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 389 390static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 391{ 392 int8_t n1 = n, n2 = n >> 8; 393 return m + n1 + n2; 394} 395 396static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 397{ 398 int16_t n1 = n, n2 = n >> 16; 399 return m + n1 + n2; 400} 401 402static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 403{ 404 int32_t n1 = n, n2 = n >> 32; 405 return m + n1 + n2; 406} 407 408DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 409DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 410DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 411 412static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 413{ 414 uint8_t n1 = n, n2 = n >> 8; 415 return m + n1 + n2; 416} 417 418static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 419{ 420 uint16_t n1 = n, n2 = n >> 16; 421 return m + n1 + n2; 422} 423 424static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 425{ 426 uint32_t n1 = n, n2 = n >> 32; 427 return m + n1 + n2; 428} 429 430DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 431DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 432DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 433 434#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL) 435#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 436#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 437#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 438 439DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 440DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 441DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 442DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 443 444#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 445#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 446#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 447#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 448 449DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 450DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 451DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 452DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 453 454/* 455 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 456 * We pass in a pointer to a dummy saturation field to trigger 457 * the saturating arithmetic but discard the information about 458 * whether it has occurred. 
459 */ 460#define do_sqshl_b(n, m) \ 461 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 462#define do_sqshl_h(n, m) \ 463 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 464#define do_sqshl_s(n, m) \ 465 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 466#define do_sqshl_d(n, m) \ 467 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 468 469DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 470DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 471DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 472DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 473 474#define do_uqshl_b(n, m) \ 475 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 476#define do_uqshl_h(n, m) \ 477 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 478#define do_uqshl_s(n, m) \ 479 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 480#define do_uqshl_d(n, m) \ 481 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 482 483DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 484DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 485DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 486DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 487 488#define do_sqrshl_b(n, m) \ 489 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 490#define do_sqrshl_h(n, m) \ 491 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 492#define do_sqrshl_s(n, m) \ 493 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 494#define do_sqrshl_d(n, m) \ 495 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 496 497DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 498DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 499DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 500DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 501 502#undef do_sqrshl_d 503 504#define do_uqrshl_b(n, m) \ 505 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 506#define do_uqrshl_h(n, m) \ 507 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 508#define do_uqrshl_s(n, m) \ 509 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 510#define do_uqrshl_d(n, m) \ 511 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); }) 512 513DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 514DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 515DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 516DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 517 518#undef do_uqrshl_d 519 520#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 521#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 522 523DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 524DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 525DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 526DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 527 528DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 529DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 530DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 531DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 532 533#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 534#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 535 536DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 537DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 538DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 539DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 540 
541DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 542DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 543DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 544DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 545 546#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 547#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 548 549DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 550DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 551DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 552DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 553 554DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 555DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 556DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 557DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 558 559static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) 560{ 561 return val >= max ? max : val <= min ? min : val; 562} 563 564#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) 565#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) 566#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) 567 568static inline int64_t do_sqadd_d(int64_t n, int64_t m) 569{ 570 int64_t r = n + m; 571 if (((r ^ n) & ~(n ^ m)) < 0) { 572 /* Signed overflow. */ 573 return r < 0 ? INT64_MAX : INT64_MIN; 574 } 575 return r; 576} 577 578DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 579DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 580DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 581DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 582 583#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) 584#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) 585#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) 586 587static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 588{ 589 uint64_t r = n + m; 590 return r < n ? UINT64_MAX : r; 591} 592 593DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) 594DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) 595DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) 596DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) 597 598#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) 599#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) 600#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) 601 602static inline int64_t do_sqsub_d(int64_t n, int64_t m) 603{ 604 int64_t r = n - m; 605 if (((r ^ n) & (n ^ m)) < 0) { 606 /* Signed overflow. */ 607 return r < 0 ? INT64_MAX : INT64_MIN; 608 } 609 return r; 610} 611 612DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) 613DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) 614DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) 615DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) 616 617#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) 618#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) 619#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) 620 621static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) 622{ 623 return n > m ? 
n - m : 0; 624} 625 626DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) 627DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) 628DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) 629DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) 630 631#define DO_SUQADD_B(n, m) \ 632 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) 633#define DO_SUQADD_H(n, m) \ 634 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) 635#define DO_SUQADD_S(n, m) \ 636 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) 637 638static inline int64_t do_suqadd_d(int64_t n, uint64_t m) 639{ 640 uint64_t r = n + m; 641 642 if (n < 0) { 643 /* Note that m - abs(n) cannot underflow. */ 644 if (r > INT64_MAX) { 645 /* Result is either very large positive or negative. */ 646 if (m > -n) { 647 /* m > abs(n), so r is a very large positive. */ 648 return INT64_MAX; 649 } 650 /* Result is negative. */ 651 } 652 } else { 653 /* Both inputs are positive: check for overflow. */ 654 if (r < m || r > INT64_MAX) { 655 return INT64_MAX; 656 } 657 } 658 return r; 659} 660 661DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) 662DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) 663DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) 664DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) 665 666#define DO_USQADD_B(n, m) \ 667 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) 668#define DO_USQADD_H(n, m) \ 669 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) 670#define DO_USQADD_S(n, m) \ 671 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) 672 673static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) 674{ 675 uint64_t r = n + m; 676 677 if (m < 0) { 678 return n < -m ? 0 : r; 679 } 680 return r < n ? UINT64_MAX : r; 681} 682 683DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) 684DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) 685DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) 686DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) 687 688#undef DO_ZPZZ 689#undef DO_ZPZZ_D 690 691/* 692 * Three operand expander, operating on element pairs. 693 * If the slot I is even, the elements from from VN {I, I+1}. 694 * If the slot I is odd, the elements from from VM {I-1, I}. 695 * Load all of the input elements in each pair before overwriting output. 696 */ 697#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 698void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 699{ \ 700 intptr_t i, opr_sz = simd_oprsz(desc); \ 701 for (i = 0; i < opr_sz; ) { \ 702 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 703 do { \ 704 TYPE n0 = *(TYPE *)(vn + H(i)); \ 705 TYPE m0 = *(TYPE *)(vm + H(i)); \ 706 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 707 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 708 if (pg & 1) { \ 709 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 710 } \ 711 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 712 if (pg & 1) { \ 713 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 714 } \ 715 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 716 } while (i & 15); \ 717 } \ 718} 719 720/* Similarly, specialized for 64-bit operands. 
*/ 721#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 722void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 723{ \ 724 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 725 TYPE *d = vd, *n = vn, *m = vm; \ 726 uint8_t *pg = vg; \ 727 for (i = 0; i < opr_sz; i += 2) { \ 728 TYPE n0 = n[i], n1 = n[i + 1]; \ 729 TYPE m0 = m[i], m1 = m[i + 1]; \ 730 if (pg[H1(i)] & 1) { \ 731 d[i] = OP(n0, n1); \ 732 } \ 733 if (pg[H1(i + 1)] & 1) { \ 734 d[i + 1] = OP(m0, m1); \ 735 } \ 736 } \ 737} 738 739DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 740DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 741DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 742DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 743 744DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 745DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 746DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 747DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 748 749DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 750DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 751DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 752DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 753 754DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 755DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 756DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 757DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 758 759DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 760DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 761DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 762DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 763 764#undef DO_ZPZZ_PAIR 765#undef DO_ZPZZ_PAIR_D 766 767#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 768void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 769 void *status, uint32_t desc) \ 770{ \ 771 intptr_t i, opr_sz = simd_oprsz(desc); \ 772 for (i = 0; i < opr_sz; ) { \ 773 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 774 do { \ 775 TYPE n0 = *(TYPE *)(vn + H(i)); \ 776 TYPE m0 = *(TYPE *)(vm + H(i)); \ 777 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 778 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 779 if (pg & 1) { \ 780 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 781 } \ 782 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 783 if (pg & 1) { \ 784 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 785 } \ 786 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 787 } while (i & 15); \ 788 } \ 789} 790 791DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 792DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 793DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) 794 795DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 796DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 797DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 798 799DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 800DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 801DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 802 803DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 804DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 805DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 806 807DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 808DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 809DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 810 811#undef DO_ZPZZ_PAIR_FP 812 813/* 
Three-operand expander, controlled by a predicate, in which the 814 * third operand is "wide". That is, for D = N op M, the same 64-bit 815 * value of M is used with all of the narrower values of N. 816 */ 817#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 818void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 819{ \ 820 intptr_t i, opr_sz = simd_oprsz(desc); \ 821 for (i = 0; i < opr_sz; ) { \ 822 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 823 TYPEW mm = *(TYPEW *)(vm + i); \ 824 do { \ 825 if (pg & 1) { \ 826 TYPE nn = *(TYPE *)(vn + H(i)); \ 827 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 828 } \ 829 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 830 } while (i & 7); \ 831 } \ 832} 833 834DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 835DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 836DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 837 838DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 839DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 840DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 841 842DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 843DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 844DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 845 846#undef DO_ZPZW 847 848/* Fully general two-operand expander, controlled by a predicate. 849 */ 850#define DO_ZPZ(NAME, TYPE, H, OP) \ 851void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 852{ \ 853 intptr_t i, opr_sz = simd_oprsz(desc); \ 854 for (i = 0; i < opr_sz; ) { \ 855 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 856 do { \ 857 if (pg & 1) { \ 858 TYPE nn = *(TYPE *)(vn + H(i)); \ 859 *(TYPE *)(vd + H(i)) = OP(nn); \ 860 } \ 861 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 862 } while (i & 15); \ 863 } \ 864} 865 866/* Similarly, specialized for 64-bit operands. 
*/ 867#define DO_ZPZ_D(NAME, TYPE, OP) \ 868void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 869{ \ 870 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 871 TYPE *d = vd, *n = vn; \ 872 uint8_t *pg = vg; \ 873 for (i = 0; i < opr_sz; i += 1) { \ 874 if (pg[H1(i)] & 1) { \ 875 TYPE nn = n[i]; \ 876 d[i] = OP(nn); \ 877 } \ 878 } \ 879} 880 881#define DO_CLS_B(N) (clrsb32(N) - 24) 882#define DO_CLS_H(N) (clrsb32(N) - 16) 883 884DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 885DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 886DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 887DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 888 889#define DO_CLZ_B(N) (clz32(N) - 24) 890#define DO_CLZ_H(N) (clz32(N) - 16) 891 892DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 893DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 894DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 895DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 896 897DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 898DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 899DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 900DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 901 902#define DO_CNOT(N) (N == 0) 903 904DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 905DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 906DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 907DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 908 909#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 910 911DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 912DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 913DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 914 915#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 916 917DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 918DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 919DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 920 921#define DO_NOT(N) (~N) 922 923DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 924DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 925DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 926DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 927 928#define DO_SXTB(N) ((int8_t)N) 929#define DO_SXTH(N) ((int16_t)N) 930#define DO_SXTS(N) ((int32_t)N) 931#define DO_UXTB(N) ((uint8_t)N) 932#define DO_UXTH(N) ((uint16_t)N) 933#define DO_UXTS(N) ((uint32_t)N) 934 935DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 936DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 937DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 938DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 939DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 940DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 941 942DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 943DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 944DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 945DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 946DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 947DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 948 949#define DO_ABS(N) (N < 0 ? 
-N : N) 950 951DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 952DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 953DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 954DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 955 956#define DO_NEG(N) (-N) 957 958DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 959DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 960DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 961DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 962 963DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 964DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 965DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 966 967DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 968DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 969 970DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 971 972DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 973DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 974DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 975DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 976 977#define DO_SQABS(X) \ 978 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 979 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 980 981DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 982DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 983DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 984DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 985 986#define DO_SQNEG(X) \ 987 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 988 x_ == min_ ? -min_ - 1 : -x_; }) 989 990DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 991DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 992DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 993DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 994 995DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 996DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 997 998/* Three-operand expander, unpredicated, in which the third operand is "wide". 999 */ 1000#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 1001void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1002{ \ 1003 intptr_t i, opr_sz = simd_oprsz(desc); \ 1004 for (i = 0; i < opr_sz; ) { \ 1005 TYPEW mm = *(TYPEW *)(vm + i); \ 1006 do { \ 1007 TYPE nn = *(TYPE *)(vn + H(i)); \ 1008 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 1009 i += sizeof(TYPE); \ 1010 } while (i & 7); \ 1011 } \ 1012} 1013 1014DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 1015DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 1016DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 1017 1018DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 1019DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 1020DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1021 1022DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1023DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1024DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1025 1026#undef DO_ZZW 1027 1028#undef DO_CLS_B 1029#undef DO_CLS_H 1030#undef DO_CLZ_B 1031#undef DO_CLZ_H 1032#undef DO_CNOT 1033#undef DO_FABS 1034#undef DO_FNEG 1035#undef DO_ABS 1036#undef DO_NEG 1037#undef DO_ZPZ 1038#undef DO_ZPZ_D 1039 1040/* 1041 * Three-operand expander, unpredicated, in which the two inputs are 1042 * selected from the top or bottom half of the wide column. 
1043 */ 1044#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1045void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1046{ \ 1047 intptr_t i, opr_sz = simd_oprsz(desc); \ 1048 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1049 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1050 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1051 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1052 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1053 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1054 } \ 1055} 1056 1057DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1058DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1059DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1060 1061DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1062DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1063DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1064 1065DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1066DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1067DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1068 1069DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1070DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1071DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1072 1073DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1074DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1075DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1076 1077DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1078DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1079DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1080 1081DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1082DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1083DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1084 1085DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1086DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1087DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1088 1089/* Note that the multiply cannot overflow, but the doubling can. 
*/ 1090static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1091{ 1092 int16_t val = n * m; 1093 return DO_SQADD_H(val, val); 1094} 1095 1096static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1097{ 1098 int32_t val = n * m; 1099 return DO_SQADD_S(val, val); 1100} 1101 1102static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1103{ 1104 int64_t val = n * m; 1105 return do_sqadd_d(val, val); 1106} 1107 1108DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1109DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1110DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1111 1112#undef DO_ZZZ_TB 1113 1114#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1115void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1116{ \ 1117 intptr_t i, opr_sz = simd_oprsz(desc); \ 1118 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1119 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1120 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1121 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1122 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1123 } \ 1124} 1125 1126DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1127DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1128DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1129 1130DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1131DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1132DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1133 1134DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1135DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1136DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1137 1138DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1139DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1140DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1141 1142#undef DO_ZZZ_WTB 1143 1144#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1145void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1146{ \ 1147 intptr_t i, opr_sz = simd_oprsz(desc); \ 1148 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1149 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1150 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1151 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1152 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1153 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1154 } \ 1155} 1156 1157DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1158DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1159DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1160DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1161 1162#undef DO_ZZZ_NTB 1163 1164#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1165void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1166{ \ 1167 intptr_t i, opr_sz = simd_oprsz(desc); \ 1168 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1169 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1170 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1171 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1172 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1173 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1174 } \ 1175} 1176 1177DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1178DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1179DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1180 1181DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 
1182DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1183DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1184 1185DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1186DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1187DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1188 1189DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1190DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1191DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1192 1193#define DO_NMUL(N, M) -(N * M) 1194 1195DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1196DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1197DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1198 1199DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1200DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1201DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1202 1203#undef DO_ZZZW_ACC 1204 1205#define DO_XTNB(NAME, TYPE, OP) \ 1206void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1207{ \ 1208 intptr_t i, opr_sz = simd_oprsz(desc); \ 1209 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1210 TYPE nn = *(TYPE *)(vn + i); \ 1211 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1212 *(TYPE *)(vd + i) = nn; \ 1213 } \ 1214} 1215 1216#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1217void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1218{ \ 1219 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1220 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1221 TYPE nn = *(TYPE *)(vn + i); \ 1222 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1223 } \ 1224} 1225 1226#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) 1227#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) 1228#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) 1229 1230DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) 1231DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) 1232DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) 1233 1234DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) 1235DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) 1236DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) 1237 1238#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) 1239#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) 1240#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) 1241 1242DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) 1243DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) 1244DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) 1245 1246DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) 1247DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) 1248DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) 1249 1250DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) 1251DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) 1252DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) 1253 1254DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) 1255DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) 1256DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) 1257 1258#undef DO_XTNB 1259#undef DO_XTNT 1260 1261void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1262{ 1263 intptr_t i, opr_sz = simd_oprsz(desc); 1264 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1265 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1266 uint32_t *a = va, *n = vn; 1267 uint64_t *d = vd, *m = vm; 
1268 1269 for (i = 0; i < opr_sz / 8; ++i) { 1270 uint32_t e1 = a[2 * i + H4(0)]; 1271 uint32_t e2 = n[2 * i + sel] ^ inv; 1272 uint64_t c = extract64(m[i], 32, 1); 1273 /* Compute and store the entire 33-bit result at once. */ 1274 d[i] = c + e1 + e2; 1275 } 1276} 1277 1278void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1279{ 1280 intptr_t i, opr_sz = simd_oprsz(desc); 1281 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1282 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1283 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1284 1285 for (i = 0; i < opr_sz / 8; i += 2) { 1286 Int128 e1 = int128_make64(a[i]); 1287 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1288 Int128 c = int128_make64(m[i + 1] & 1); 1289 Int128 r = int128_add(int128_add(e1, e2), c); 1290 d[i + 0] = int128_getlo(r); 1291 d[i + 1] = int128_gethi(r); 1292 } 1293} 1294 1295#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1296void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1297{ \ 1298 intptr_t i, opr_sz = simd_oprsz(desc); \ 1299 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1300 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1301 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1302 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1303 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1304 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1305 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1306 } \ 1307} 1308 1309DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1310 do_sqdmull_h, DO_SQADD_H) 1311DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1312 do_sqdmull_s, DO_SQADD_S) 1313DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1314 do_sqdmull_d, do_sqadd_d) 1315 1316DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1317 do_sqdmull_h, DO_SQSUB_H) 1318DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1319 do_sqdmull_s, DO_SQSUB_S) 1320DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1321 do_sqdmull_d, do_sqsub_d) 1322 1323#undef DO_SQDMLAL 1324 1325#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1326void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1327{ \ 1328 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1329 int rot = simd_data(desc); \ 1330 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1331 bool sub_r = rot == 1 || rot == 2; \ 1332 bool sub_i = rot >= 2; \ 1333 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1334 for (i = 0; i < opr_sz; i += 2) { \ 1335 TYPE elt1_a = n[H(i + sel_a)]; \ 1336 TYPE elt2_a = m[H(i + sel_a)]; \ 1337 TYPE elt2_b = m[H(i + sel_b)]; \ 1338 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1339 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1340 } \ 1341} 1342 1343#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1344 1345DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1346DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1347DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1348DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1349 1350#define DO_SQRDMLAH_B(N, M, A, S) \ 1351 do_sqrdmlah_b(N, M, A, S, true) 1352#define DO_SQRDMLAH_H(N, M, A, S) \ 1353 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1354#define DO_SQRDMLAH_S(N, M, A, S) \ 1355 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1356#define DO_SQRDMLAH_D(N, M, A, S) \ 1357 do_sqrdmlah_d(N, M, A, S, true) 1358 1359DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1360DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1361DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1362DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1363 1364#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1365void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1366{ \ 1367 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1368 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1369 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1370 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1371 bool sub_r = rot == 1 || rot == 2; \ 1372 bool sub_i = rot >= 2; \ 1373 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1374 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1375 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1376 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1377 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1378 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1379 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1380 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1381 } \ 1382 } \ 1383} 1384 1385DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1386DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1387 1388DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1389DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1390 1391#undef DO_CMLA 1392#undef DO_CMLA_FUNC 1393#undef DO_CMLA_IDX_FUNC 1394#undef DO_SQRDMLAH_B 1395#undef DO_SQRDMLAH_H 1396#undef DO_SQRDMLAH_S 1397#undef DO_SQRDMLAH_D 1398 1399/* Note N and M are 4 elements bundled into one unit. */ 1400static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1401 int sel_a, int sel_b, int sub_i) 1402{ 1403 for (int i = 0; i <= 1; i++) { 1404 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1405 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1406 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1407 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1408 1409 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1410 } 1411 return a; 1412} 1413 1414static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1415 int sel_a, int sel_b, int sub_i) 1416{ 1417 for (int i = 0; i <= 1; i++) { 1418 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1419 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1420 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1421 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1422 1423 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1424 } 1425 return a; 1426} 1427 1428void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1429 void *va, uint32_t desc) 1430{ 1431 int opr_sz = simd_oprsz(desc); 1432 int rot = simd_data(desc); 1433 int sel_a = rot & 1; 1434 int sel_b = sel_a ^ 1; 1435 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1436 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1437 1438 for (int e = 0; e < opr_sz / 4; e++) { 1439 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1440 } 1441} 1442 1443void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1444 void *va, uint32_t desc) 1445{ 1446 int opr_sz = simd_oprsz(desc); 1447 int rot = simd_data(desc); 1448 int sel_a = rot & 1; 1449 int sel_b = sel_a ^ 1; 1450 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1451 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1452 1453 for (int e = 0; e < opr_sz / 8; e++) { 1454 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1455 } 1456} 1457 1458void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1459 void *va, uint32_t desc) 1460{ 1461 int opr_sz = simd_oprsz(desc); 1462 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1463 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1464 int sel_a = rot & 1; 1465 int sel_b = sel_a ^ 1; 1466 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1467 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1468 1469 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1470 uint32_t seg_m = m[seg + idx]; 1471 for (int e = 0; e < 4; e++) { 1472 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1473 sel_a, sel_b, sub_i); 1474 } 1475 } 1476} 1477 1478void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1479 void *va, uint32_t desc) 1480{ 1481 int seg, opr_sz = simd_oprsz(desc); 1482 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1483 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1484 int sel_a = rot & 1; 1485 int sel_b = sel_a ^ 1; 1486 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1487 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1488 1489 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1490 uint64_t seg_m = m[seg + idx]; 1491 for (int e = 0; e < 2; e++) { 1492 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1493 sel_a, sel_b, sub_i); 1494 } 1495 } 1496} 1497 1498#define DO_ZZXZ(NAME, TYPE, H, OP) \ 1499void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1500{ \ 1501 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1502 intptr_t i, j, idx = simd_data(desc); \ 1503 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1504 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1505 TYPE mm = m[i]; \ 1506 for (j = 0; j < segment; j++) { \ 1507 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1508 } \ 1509 } \ 1510} 1511 1512#define DO_SQRDMLAH_H(N, M, A) \ 1513 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1514#define DO_SQRDMLAH_S(N, M, A) \ 1515 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1516#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1517 1518DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1519DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1520DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1521 1522#define DO_SQRDMLSH_H(N, M, A) \ 1523 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1524#define DO_SQRDMLSH_S(N, M, A) \ 1525 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1526#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1527 1528DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1529DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1530DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1531 1532#undef DO_ZZXZ 1533 1534#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1535void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t 
desc) \ 1536{ \ 1537 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1538 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1539 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1540 for (i = 0; i < oprsz; i += 16) { \ 1541 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1542 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1543 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1544 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1545 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1546 } \ 1547 } \ 1548} 1549 1550#define DO_MLA(N, M, A) (A + N * M) 1551 1552DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1553DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1554DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1555DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1556 1557#define DO_MLS(N, M, A) (A - N * M) 1558 1559DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1560DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1561DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1562DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1563 1564#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1565#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1566 1567DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1568DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1569 1570#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1571#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1572 1573DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1574DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1575 1576#undef DO_MLA 1577#undef DO_MLS 1578#undef DO_ZZXW 1579 1580#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1581void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1582{ \ 1583 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1584 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1585 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1586 for (i = 0; i < oprsz; i += 16) { \ 1587 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1588 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1589 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1590 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1591 } \ 1592 } \ 1593} 1594 1595DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1596DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1597 1598DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1599DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1600 1601DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1602DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1603 1604#undef DO_ZZX 1605 1606#define DO_BITPERM(NAME, TYPE, OP) \ 1607void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1608{ \ 1609 intptr_t i, opr_sz = simd_oprsz(desc); \ 1610 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1611 TYPE nn = *(TYPE *)(vn + i); \ 1612 TYPE mm = *(TYPE *)(vm + i); \ 1613 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1614 } \ 1615} 1616 1617static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1618{ 1619 uint64_t res = 0; 1620 int db, rb = 0; 1621 1622 for (db = 0; db < n; ++db) { 1623 if ((mask >> db) & 1) { 1624 res |= ((data >> db) & 1) << rb; 1625 ++rb; 1626 } 1627 } 1628 return res; 1629} 1630 
1631DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1632DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1633DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1634DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1635 1636static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1637{ 1638 uint64_t res = 0; 1639 int rb, db = 0; 1640 1641 for (rb = 0; rb < n; ++rb) { 1642 if ((mask >> rb) & 1) { 1643 res |= ((data >> db) & 1) << rb; 1644 ++db; 1645 } 1646 } 1647 return res; 1648} 1649 1650DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1651DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1652DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1653DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1654 1655static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1656{ 1657 uint64_t resm = 0, resu = 0; 1658 int db, rbm = 0, rbu = 0; 1659 1660 for (db = 0; db < n; ++db) { 1661 uint64_t val = (data >> db) & 1; 1662 if ((mask >> db) & 1) { 1663 resm |= val << rbm++; 1664 } else { 1665 resu |= val << rbu++; 1666 } 1667 } 1668 1669 return resm | (resu << rbm); 1670} 1671 1672DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1673DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1674DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1675DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1676 1677#undef DO_BITPERM 1678 1679#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1680void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1681{ \ 1682 intptr_t i, opr_sz = simd_oprsz(desc); \ 1683 int sub_r = simd_data(desc); \ 1684 if (sub_r) { \ 1685 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1686 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1687 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1688 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1689 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1690 acc_r = ADD_OP(acc_r, el2_i); \ 1691 acc_i = SUB_OP(acc_i, el2_r); \ 1692 *(TYPE *)(vd + H(i)) = acc_r; \ 1693 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1694 } \ 1695 } else { \ 1696 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1697 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1698 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1699 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1700 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1701 acc_r = SUB_OP(acc_r, el2_i); \ 1702 acc_i = ADD_OP(acc_i, el2_r); \ 1703 *(TYPE *)(vd + H(i)) = acc_r; \ 1704 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1705 } \ 1706 } \ 1707} 1708 1709DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1710DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1711DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1712DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1713 1714DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1715DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1716DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1717DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1718 1719#undef DO_CADD 1720 1721#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1722void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1723{ \ 1724 intptr_t i, opr_sz = simd_oprsz(desc); \ 1725 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1726 int shift = simd_data(desc) >> 1; \ 1727 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1728 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1729 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1730 } \ 1731} 1732 1733DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1734DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2) 1735DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1736 
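/*
 * The unsigned forms follow.  In both the signed and unsigned cases the
 * low bit of simd_data selects the odd (top) narrow elements when set and
 * the even (bottom) ones when clear, via SEL; the remaining bits give the
 * left-shift amount applied after widening.
 */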
1737DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1738DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1739DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1740 1741#undef DO_ZZI_SHLL 1742 1743/* Two-operand reduction expander, controlled by a predicate. 1744 * The difference between TYPERED and TYPERET has to do with 1745 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1746 * but TYPERET must be unsigned so that e.g. a 32-bit value 1747 * is not sign-extended to the ABI uint64_t return type. 1748 */ 1749/* ??? If we were to vectorize this by hand the reduction ordering 1750 * would change. For integer operands, this is perfectly fine. 1751 */ 1752#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1753uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1754{ \ 1755 intptr_t i, opr_sz = simd_oprsz(desc); \ 1756 TYPERED ret = INIT; \ 1757 for (i = 0; i < opr_sz; ) { \ 1758 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1759 do { \ 1760 if (pg & 1) { \ 1761 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1762 ret = OP(ret, nn); \ 1763 } \ 1764 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1765 } while (i & 15); \ 1766 } \ 1767 return (TYPERET)ret; \ 1768} 1769 1770#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1771uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1772{ \ 1773 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1774 TYPEE *n = vn; \ 1775 uint8_t *pg = vg; \ 1776 TYPER ret = INIT; \ 1777 for (i = 0; i < opr_sz; i += 1) { \ 1778 if (pg[H1(i)] & 1) { \ 1779 TYPEE nn = n[i]; \ 1780 ret = OP(ret, nn); \ 1781 } \ 1782 } \ 1783 return ret; \ 1784} 1785 1786DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1787DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1788DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1789DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1790 1791DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1792DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1793DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1794DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1795 1796DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1797DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1798DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1799DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1800 1801DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1802DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1803DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1804 1805DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1806DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1807DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1808DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1809 1810DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1811DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1812DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1813DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1814 1815DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1816DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1817DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 1818DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1819 1820DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, 
DO_MIN) 1821DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1822DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1823DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1824 1825DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1826DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1827DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1828DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1829 1830#undef DO_VPZ 1831#undef DO_VPZ_D 1832 1833/* Two vector operand, one scalar operand, unpredicated. */ 1834#define DO_ZZI(NAME, TYPE, OP) \ 1835void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1836{ \ 1837 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1838 TYPE s = s64, *d = vd, *n = vn; \ 1839 for (i = 0; i < opr_sz; ++i) { \ 1840 d[i] = OP(n[i], s); \ 1841 } \ 1842} 1843 1844#define DO_SUBR(X, Y) (Y - X) 1845 1846DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1847DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1848DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1849DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1850 1851DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1852DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1853DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1854DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1855 1856DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1857DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1858DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1859DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1860 1861DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1862DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1863DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1864DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1865 1866DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1867DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1868DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1869DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1870 1871#undef DO_ZZI 1872 1873#undef DO_AND 1874#undef DO_ORR 1875#undef DO_EOR 1876#undef DO_BIC 1877#undef DO_ADD 1878#undef DO_SUB 1879#undef DO_MAX 1880#undef DO_MIN 1881#undef DO_ABD 1882#undef DO_MUL 1883#undef DO_DIV 1884#undef DO_ASR 1885#undef DO_LSR 1886#undef DO_LSL 1887#undef DO_SUBR 1888 1889/* Similar to the ARM LastActiveElement pseudocode function, except the 1890 result is multiplied by the element size. This includes the not found 1891 indication; e.g. not found for esz=3 is -8. */ 1892static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1893{ 1894 uint64_t mask = pred_esz_masks[esz]; 1895 intptr_t i = words; 1896 1897 do { 1898 uint64_t this_g = g[--i] & mask; 1899 if (this_g) { 1900 return i * 64 + (63 - clz64(this_g)); 1901 } 1902 } while (i > 0); 1903 return (intptr_t)-1 << esz; 1904} 1905 1906uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1907{ 1908 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1909 uint32_t flags = PREDTEST_INIT; 1910 uint64_t *d = vd, *g = vg; 1911 intptr_t i = 0; 1912 1913 do { 1914 uint64_t this_d = d[i]; 1915 uint64_t this_g = g[i]; 1916 1917 if (this_g) { 1918 if (!(flags & 4)) { 1919 /* Set in D the first bit of G. 
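That is, force the lowest active bit of G to be set in D, as the PFIRST pseudocode requires.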
*/ 1920 this_d |= this_g & -this_g; 1921 d[i] = this_d; 1922 } 1923 flags = iter_predtest_fwd(this_d, this_g, flags); 1924 } 1925 } while (++i < words); 1926 1927 return flags; 1928} 1929 1930uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1931{ 1932 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1933 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1934 uint32_t flags = PREDTEST_INIT; 1935 uint64_t *d = vd, *g = vg, esz_mask; 1936 intptr_t i, next; 1937 1938 next = last_active_element(vd, words, esz) + (1 << esz); 1939 esz_mask = pred_esz_masks[esz]; 1940 1941 /* Similar to the pseudocode for pnext, but scaled by ESZ 1942 so that we find the correct bit. */ 1943 if (next < words * 64) { 1944 uint64_t mask = -1; 1945 1946 if (next & 63) { 1947 mask = ~((1ull << (next & 63)) - 1); 1948 next &= -64; 1949 } 1950 do { 1951 uint64_t this_g = g[next / 64] & esz_mask & mask; 1952 if (this_g != 0) { 1953 next = (next & -64) + ctz64(this_g); 1954 break; 1955 } 1956 next += 64; 1957 mask = -1; 1958 } while (next < words * 64); 1959 } 1960 1961 i = 0; 1962 do { 1963 uint64_t this_d = 0; 1964 if (i == next / 64) { 1965 this_d = 1ull << (next & 63); 1966 } 1967 d[i] = this_d; 1968 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1969 } while (++i < words); 1970 1971 return flags; 1972} 1973 1974/* 1975 * Copy Zn into Zd, and store zero into inactive elements. 1976 * If inv, store zeros into the active elements. 1977 */ 1978void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1979{ 1980 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1981 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1982 uint64_t *d = vd, *n = vn; 1983 uint8_t *pg = vg; 1984 1985 for (i = 0; i < opr_sz; i += 1) { 1986 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1987 } 1988} 1989 1990void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1991{ 1992 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1993 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1994 uint64_t *d = vd, *n = vn; 1995 uint8_t *pg = vg; 1996 1997 for (i = 0; i < opr_sz; i += 1) { 1998 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1999 } 2000} 2001 2002void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 2003{ 2004 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2005 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2006 uint64_t *d = vd, *n = vn; 2007 uint8_t *pg = vg; 2008 2009 for (i = 0; i < opr_sz; i += 1) { 2010 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 2011 } 2012} 2013 2014void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 2015{ 2016 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2017 uint64_t *d = vd, *n = vn; 2018 uint8_t *pg = vg; 2019 uint8_t inv = simd_data(desc); 2020 2021 for (i = 0; i < opr_sz; i += 1) { 2022 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2023 } 2024} 2025 2026/* Three-operand expander, immediate operand, controlled by a predicate. 2027 */ 2028#define DO_ZPZI(NAME, TYPE, H, OP) \ 2029void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2030{ \ 2031 intptr_t i, opr_sz = simd_oprsz(desc); \ 2032 TYPE imm = simd_data(desc); \ 2033 for (i = 0; i < opr_sz; ) { \ 2034 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2035 do { \ 2036 if (pg & 1) { \ 2037 TYPE nn = *(TYPE *)(vn + H(i)); \ 2038 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2039 } \ 2040 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2041 } while (i & 15); \ 2042 } \ 2043} 2044 2045/* Similarly, specialized for 64-bit operands. 
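Each 64-bit element is governed by bit 0 of the corresponding predicate byte, so the predicate can be indexed one byte per element.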
*/ 2046#define DO_ZPZI_D(NAME, TYPE, OP) \ 2047void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2048{ \ 2049 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2050 TYPE *d = vd, *n = vn; \ 2051 TYPE imm = simd_data(desc); \ 2052 uint8_t *pg = vg; \ 2053 for (i = 0; i < opr_sz; i += 1) { \ 2054 if (pg[H1(i)] & 1) { \ 2055 TYPE nn = n[i]; \ 2056 d[i] = OP(nn, imm); \ 2057 } \ 2058 } \ 2059} 2060 2061#define DO_SHR(N, M) (N >> M) 2062#define DO_SHL(N, M) (N << M) 2063 2064/* Arithmetic shift right for division. This rounds negative numbers 2065 toward zero as per signed division. Therefore before shifting, 2066 when N is negative, add 2**M-1. */ 2067#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2068 2069static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2070{ 2071 if (likely(sh < 64)) { 2072 return (x >> sh) + ((x >> (sh - 1)) & 1); 2073 } else if (sh == 64) { 2074 return x >> 63; 2075 } else { 2076 return 0; 2077 } 2078} 2079 2080static inline int64_t do_srshr(int64_t x, unsigned sh) 2081{ 2082 if (likely(sh < 64)) { 2083 return (x >> sh) + ((x >> (sh - 1)) & 1); 2084 } else { 2085 /* Rounding the sign bit always produces 0. */ 2086 return 0; 2087 } 2088} 2089 2090DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2091DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2092DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2093DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2094 2095DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2096DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2097DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2098DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2099 2100DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2101DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2102DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2103DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2104 2105DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2106DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2107DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2108DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2109 2110/* SVE2 bitwise shift by immediate */ 2111DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2112DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2113DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2114DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2115 2116DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2117DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2118DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2119DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2120 2121DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2122DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2123DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2124DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2125 2126DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2127DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2128DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2129DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2130 2131#define do_suqrshl_b(n, m) \ 2132 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2133#define do_suqrshl_h(n, m) \ 2134 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2135#define do_suqrshl_s(n, m) \ 2136 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2137#define do_suqrshl_d(n, m) \ 2138 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2139 2140DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2141DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2142DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 
2143DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2144 2145#undef DO_ASRD 2146#undef DO_ZPZI 2147#undef DO_ZPZI_D 2148 2149#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2150void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2151{ \ 2152 intptr_t i, opr_sz = simd_oprsz(desc); \ 2153 int shift = simd_data(desc); \ 2154 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2155 TYPEW nn = *(TYPEW *)(vn + i); \ 2156 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2157 } \ 2158} 2159 2160#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2161void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2162{ \ 2163 intptr_t i, opr_sz = simd_oprsz(desc); \ 2164 int shift = simd_data(desc); \ 2165 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2166 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2167 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2168 } \ 2169} 2170 2171DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2172DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2173DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2174 2175DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2176DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2177DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2178 2179DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2180DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2181DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2182 2183DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2184DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2185DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2186 2187#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2188#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2189#define DO_SQSHRUN_D(x, sh) \ 2190 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2191 2192DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2193DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2194DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2195 2196DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2197DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2198DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2199 2200#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2201#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2202#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2203 2204DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2205DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2206DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2207 2208DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2209DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2210DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2211 2212#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2213#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2214#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2215 2216DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2217DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2218DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2219 2220DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2221DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2222DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2223 2224#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2225#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2226#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2227 2228DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2229DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2230DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2231 2232DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2233DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2234DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2235 2236#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2237#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2238#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2239 2240DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2241DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2242DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2243 2244DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2245DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2246DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2247 2248#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2249#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2250#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2251 2252DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2253DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2254DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2255 2256DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2257DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 
2258DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2259 2260#undef DO_SHRNB 2261#undef DO_SHRNT 2262 2263#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2264void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2265{ \ 2266 intptr_t i, opr_sz = simd_oprsz(desc); \ 2267 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2268 TYPEW nn = *(TYPEW *)(vn + i); \ 2269 TYPEW mm = *(TYPEW *)(vm + i); \ 2270 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2271 } \ 2272} 2273 2274#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2275void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2276{ \ 2277 intptr_t i, opr_sz = simd_oprsz(desc); \ 2278 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2279 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2280 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2281 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2282 } \ 2283} 2284 2285#define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2286#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2287#define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2288#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2289 2290DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2291DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2292DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2293 2294DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2295DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2296DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2297 2298DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2299DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2300DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2301 2302DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2303DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2304DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2305 2306DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2307DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2308DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2309 2310DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2311DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2312DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2313 2314DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2315DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2316DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2317 2318DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2319DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2320DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2321 2322#undef DO_RSUBHN 2323#undef DO_SUBHN 2324#undef DO_RADDHN 2325#undef DO_ADDHN 2326 2327#undef DO_BINOPNB 2328 2329/* Fully general four-operand expander, controlled by a predicate. 
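 * The accumulator is passed to OP first, i.e. OP(aa, nn, mm), matching
 * DO_MLA(A, N, M) = A + N * M below.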
2330 */ 2331#define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2332void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2333 void *vg, uint32_t desc) \ 2334{ \ 2335 intptr_t i, opr_sz = simd_oprsz(desc); \ 2336 for (i = 0; i < opr_sz; ) { \ 2337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2338 do { \ 2339 if (pg & 1) { \ 2340 TYPE nn = *(TYPE *)(vn + H(i)); \ 2341 TYPE mm = *(TYPE *)(vm + H(i)); \ 2342 TYPE aa = *(TYPE *)(va + H(i)); \ 2343 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2344 } \ 2345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2346 } while (i & 15); \ 2347 } \ 2348} 2349 2350/* Similarly, specialized for 64-bit operands. */ 2351#define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2352void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2353 void *vg, uint32_t desc) \ 2354{ \ 2355 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2356 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2357 uint8_t *pg = vg; \ 2358 for (i = 0; i < opr_sz; i += 1) { \ 2359 if (pg[H1(i)] & 1) { \ 2360 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2361 d[i] = OP(aa, nn, mm); \ 2362 } \ 2363 } \ 2364} 2365 2366#define DO_MLA(A, N, M) (A + N * M) 2367#define DO_MLS(A, N, M) (A - N * M) 2368 2369DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2370DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2371 2372DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2373DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2374 2375DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2376DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2377 2378DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2379DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2380 2381#undef DO_MLA 2382#undef DO_MLS 2383#undef DO_ZPZZZ 2384#undef DO_ZPZZZ_D 2385 2386void HELPER(sve_index_b)(void *vd, uint32_t start, 2387 uint32_t incr, uint32_t desc) 2388{ 2389 intptr_t i, opr_sz = simd_oprsz(desc); 2390 uint8_t *d = vd; 2391 for (i = 0; i < opr_sz; i += 1) { 2392 d[H1(i)] = start + i * incr; 2393 } 2394} 2395 2396void HELPER(sve_index_h)(void *vd, uint32_t start, 2397 uint32_t incr, uint32_t desc) 2398{ 2399 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2400 uint16_t *d = vd; 2401 for (i = 0; i < opr_sz; i += 1) { 2402 d[H2(i)] = start + i * incr; 2403 } 2404} 2405 2406void HELPER(sve_index_s)(void *vd, uint32_t start, 2407 uint32_t incr, uint32_t desc) 2408{ 2409 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2410 uint32_t *d = vd; 2411 for (i = 0; i < opr_sz; i += 1) { 2412 d[H4(i)] = start + i * incr; 2413 } 2414} 2415 2416void HELPER(sve_index_d)(void *vd, uint64_t start, 2417 uint64_t incr, uint32_t desc) 2418{ 2419 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2420 uint64_t *d = vd; 2421 for (i = 0; i < opr_sz; i += 1) { 2422 d[i] = start + i * incr; 2423 } 2424} 2425 2426void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2427{ 2428 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2429 uint32_t sh = simd_data(desc); 2430 uint32_t *d = vd, *n = vn, *m = vm; 2431 for (i = 0; i < opr_sz; i += 1) { 2432 d[i] = n[i] + (m[i] << sh); 2433 } 2434} 2435 2436void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2437{ 2438 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2439 uint64_t sh = simd_data(desc); 2440 uint64_t *d = vd, *n = vn, *m = vm; 2441 for (i = 0; i < opr_sz; i += 1) { 2442 d[i] = n[i] + (m[i] << sh); 2443 } 2444} 2445 2446void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2447{ 2448 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2449 uint64_t sh = simd_data(desc); 2450 uint64_t *d = vd, *n = vn, *m = vm; 2451 for (i = 0; i < opr_sz; i += 1) { 2452 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh); 2453 
} 2454} 2455 2456void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2457{ 2458 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2459 uint64_t sh = simd_data(desc); 2460 uint64_t *d = vd, *n = vn, *m = vm; 2461 for (i = 0; i < opr_sz; i += 1) { 2462 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2463 } 2464} 2465 2466void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2467{ 2468 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2469 static const uint16_t coeff[] = { 2470 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2471 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2472 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2473 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2474 }; 2475 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2476 uint16_t *d = vd, *n = vn; 2477 2478 for (i = 0; i < opr_sz; i++) { 2479 uint16_t nn = n[i]; 2480 intptr_t idx = extract32(nn, 0, 5); 2481 uint16_t exp = extract32(nn, 5, 5); 2482 d[i] = coeff[idx] | (exp << 10); 2483 } 2484} 2485 2486void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2487{ 2488 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2489 static const uint32_t coeff[] = { 2490 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2491 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2492 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2493 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2494 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2495 0x1ef532, 0x20b051, 0x227043, 0x243516, 2496 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2497 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2498 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2499 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2500 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2501 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2502 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2503 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2504 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2505 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2506 }; 2507 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2508 uint32_t *d = vd, *n = vn; 2509 2510 for (i = 0; i < opr_sz; i++) { 2511 uint32_t nn = n[i]; 2512 intptr_t idx = extract32(nn, 0, 6); 2513 uint32_t exp = extract32(nn, 6, 8); 2514 d[i] = coeff[idx] | (exp << 23); 2515 } 2516} 2517 2518void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2519{ 2520 /* These constants are cut-and-paste directly from the ARM pseudocode. 
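Each entry is the 52-bit fraction field of 2.0^(i/64) (e.g. entry 32, 0x6A09E667F3BCD, is the fraction of sqrt(2)); the exponent taken from the input is then placed above it.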
*/ 2521 static const uint64_t coeff[] = { 2522 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2523 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2524 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2525 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2526 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2527 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2528 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2529 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2530 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2531 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2532 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2533 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2534 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2535 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2536 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2537 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2538 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2539 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2540 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2541 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2542 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2543 0xFA7C1819E90D8ull, 2544 }; 2545 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2546 uint64_t *d = vd, *n = vn; 2547 2548 for (i = 0; i < opr_sz; i++) { 2549 uint64_t nn = n[i]; 2550 intptr_t idx = extract32(nn, 0, 6); 2551 uint64_t exp = extract32(nn, 6, 11); 2552 d[i] = coeff[idx] | (exp << 52); 2553 } 2554} 2555 2556void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2557{ 2558 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2559 uint16_t *d = vd, *n = vn, *m = vm; 2560 for (i = 0; i < opr_sz; i += 1) { 2561 uint16_t nn = n[i]; 2562 uint16_t mm = m[i]; 2563 if (mm & 1) { 2564 nn = float16_one; 2565 } 2566 d[i] = nn ^ (mm & 2) << 14; 2567 } 2568} 2569 2570void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2571{ 2572 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2573 uint32_t *d = vd, *n = vn, *m = vm; 2574 for (i = 0; i < opr_sz; i += 1) { 2575 uint32_t nn = n[i]; 2576 uint32_t mm = m[i]; 2577 if (mm & 1) { 2578 nn = float32_one; 2579 } 2580 d[i] = nn ^ (mm & 2) << 30; 2581 } 2582} 2583 2584void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2585{ 2586 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2587 uint64_t *d = vd, *n = vn, *m = vm; 2588 for (i = 0; i < opr_sz; i += 1) { 2589 uint64_t nn = n[i]; 2590 uint64_t mm = m[i]; 2591 if (mm & 1) { 2592 nn = float64_one; 2593 } 2594 d[i] = nn ^ (mm & 2) << 62; 2595 } 2596} 2597 2598/* 2599 * Signed saturating addition with scalar operand. 
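 * The scalar B is added to every element of A, with the result saturated
 * to the signed range of the element type (e.g. [-128, 127] for bytes).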
2600 */ 2601 2602void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2603{ 2604 intptr_t i, oprsz = simd_oprsz(desc); 2605 2606 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2607 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2608 } 2609} 2610 2611void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2612{ 2613 intptr_t i, oprsz = simd_oprsz(desc); 2614 2615 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2616 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2617 } 2618} 2619 2620void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2621{ 2622 intptr_t i, oprsz = simd_oprsz(desc); 2623 2624 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2625 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2626 } 2627} 2628 2629void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2630{ 2631 intptr_t i, oprsz = simd_oprsz(desc); 2632 2633 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2634 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2635 } 2636} 2637 2638/* 2639 * Unsigned saturating addition with scalar operand. 2640 */ 2641 2642void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2643{ 2644 intptr_t i, oprsz = simd_oprsz(desc); 2645 2646 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2647 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2648 } 2649} 2650 2651void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2652{ 2653 intptr_t i, oprsz = simd_oprsz(desc); 2654 2655 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2656 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2657 } 2658} 2659 2660void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2661{ 2662 intptr_t i, oprsz = simd_oprsz(desc); 2663 2664 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2665 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2666 } 2667} 2668 2669void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2670{ 2671 intptr_t i, oprsz = simd_oprsz(desc); 2672 2673 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2674 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2675 } 2676} 2677 2678void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2679{ 2680 intptr_t i, oprsz = simd_oprsz(desc); 2681 2682 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2683 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2684 } 2685} 2686 2687/* Two operand predicated copy immediate with merge. All valid immediates 2688 * can fit within 17 signed bits in the simd_data field. 
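 * Active elements receive the immediate; inactive elements keep the value
 * taken from VN.  The _z variants below write zero to inactive elements
 * instead.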
2689 */ 2690void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2691 uint64_t mm, uint32_t desc) 2692{ 2693 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2694 uint64_t *d = vd, *n = vn; 2695 uint8_t *pg = vg; 2696 2697 mm = dup_const(MO_8, mm); 2698 for (i = 0; i < opr_sz; i += 1) { 2699 uint64_t nn = n[i]; 2700 uint64_t pp = expand_pred_b(pg[H1(i)]); 2701 d[i] = (mm & pp) | (nn & ~pp); 2702 } 2703} 2704 2705void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2706 uint64_t mm, uint32_t desc) 2707{ 2708 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2709 uint64_t *d = vd, *n = vn; 2710 uint8_t *pg = vg; 2711 2712 mm = dup_const(MO_16, mm); 2713 for (i = 0; i < opr_sz; i += 1) { 2714 uint64_t nn = n[i]; 2715 uint64_t pp = expand_pred_h(pg[H1(i)]); 2716 d[i] = (mm & pp) | (nn & ~pp); 2717 } 2718} 2719 2720void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2721 uint64_t mm, uint32_t desc) 2722{ 2723 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2724 uint64_t *d = vd, *n = vn; 2725 uint8_t *pg = vg; 2726 2727 mm = dup_const(MO_32, mm); 2728 for (i = 0; i < opr_sz; i += 1) { 2729 uint64_t nn = n[i]; 2730 uint64_t pp = expand_pred_s(pg[H1(i)]); 2731 d[i] = (mm & pp) | (nn & ~pp); 2732 } 2733} 2734 2735void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2736 uint64_t mm, uint32_t desc) 2737{ 2738 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2739 uint64_t *d = vd, *n = vn; 2740 uint8_t *pg = vg; 2741 2742 for (i = 0; i < opr_sz; i += 1) { 2743 uint64_t nn = n[i]; 2744 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2745 } 2746} 2747 2748void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2749{ 2750 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2751 uint64_t *d = vd; 2752 uint8_t *pg = vg; 2753 2754 val = dup_const(MO_8, val); 2755 for (i = 0; i < opr_sz; i += 1) { 2756 d[i] = val & expand_pred_b(pg[H1(i)]); 2757 } 2758} 2759 2760void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2761{ 2762 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2763 uint64_t *d = vd; 2764 uint8_t *pg = vg; 2765 2766 val = dup_const(MO_16, val); 2767 for (i = 0; i < opr_sz; i += 1) { 2768 d[i] = val & expand_pred_h(pg[H1(i)]); 2769 } 2770} 2771 2772void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2773{ 2774 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2775 uint64_t *d = vd; 2776 uint8_t *pg = vg; 2777 2778 val = dup_const(MO_32, val); 2779 for (i = 0; i < opr_sz; i += 1) { 2780 d[i] = val & expand_pred_s(pg[H1(i)]); 2781 } 2782} 2783 2784void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2785{ 2786 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2787 uint64_t *d = vd; 2788 uint8_t *pg = vg; 2789 2790 for (i = 0; i < opr_sz; i += 1) { 2791 d[i] = (pg[H1(i)] & 1 ? val : 0); 2792 } 2793} 2794 2795/* Big-endian hosts need to frob the byte indices. If the copy 2796 * happens to be 8-byte aligned, then no frobbing necessary. 
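 * swap_memmove() below chooses a 4-, 2- or 1-byte copy unit from the
 * alignment of the addresses and length, applying the H1_* index
 * adjustments so that the architectural byte order is preserved.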
2797 */ 2798static void swap_memmove(void *vd, void *vs, size_t n) 2799{ 2800 uintptr_t d = (uintptr_t)vd; 2801 uintptr_t s = (uintptr_t)vs; 2802 uintptr_t o = (d | s | n) & 7; 2803 size_t i; 2804 2805#ifndef HOST_WORDS_BIGENDIAN 2806 o = 0; 2807#endif 2808 switch (o) { 2809 case 0: 2810 memmove(vd, vs, n); 2811 break; 2812 2813 case 4: 2814 if (d < s || d >= s + n) { 2815 for (i = 0; i < n; i += 4) { 2816 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2817 } 2818 } else { 2819 for (i = n; i > 0; ) { 2820 i -= 4; 2821 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2822 } 2823 } 2824 break; 2825 2826 case 2: 2827 case 6: 2828 if (d < s || d >= s + n) { 2829 for (i = 0; i < n; i += 2) { 2830 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2831 } 2832 } else { 2833 for (i = n; i > 0; ) { 2834 i -= 2; 2835 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2836 } 2837 } 2838 break; 2839 2840 default: 2841 if (d < s || d >= s + n) { 2842 for (i = 0; i < n; i++) { 2843 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2844 } 2845 } else { 2846 for (i = n; i > 0; ) { 2847 i -= 1; 2848 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2849 } 2850 } 2851 break; 2852 } 2853} 2854 2855/* Similarly for memset of 0. */ 2856static void swap_memzero(void *vd, size_t n) 2857{ 2858 uintptr_t d = (uintptr_t)vd; 2859 uintptr_t o = (d | n) & 7; 2860 size_t i; 2861 2862 /* Usually, the first bit of a predicate is set, so N is 0. */ 2863 if (likely(n == 0)) { 2864 return; 2865 } 2866 2867#ifndef HOST_WORDS_BIGENDIAN 2868 o = 0; 2869#endif 2870 switch (o) { 2871 case 0: 2872 memset(vd, 0, n); 2873 break; 2874 2875 case 4: 2876 for (i = 0; i < n; i += 4) { 2877 *(uint32_t *)H1_4(d + i) = 0; 2878 } 2879 break; 2880 2881 case 2: 2882 case 6: 2883 for (i = 0; i < n; i += 2) { 2884 *(uint16_t *)H1_2(d + i) = 0; 2885 } 2886 break; 2887 2888 default: 2889 for (i = 0; i < n; i++) { 2890 *(uint8_t *)H1(d + i) = 0; 2891 } 2892 break; 2893 } 2894} 2895 2896void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2897{ 2898 intptr_t opr_sz = simd_oprsz(desc); 2899 size_t n_ofs = simd_data(desc); 2900 size_t n_siz = opr_sz - n_ofs; 2901 2902 if (vd != vm) { 2903 swap_memmove(vd, vn + n_ofs, n_siz); 2904 swap_memmove(vd + n_siz, vm, n_ofs); 2905 } else if (vd != vn) { 2906 swap_memmove(vd + n_siz, vd, n_ofs); 2907 swap_memmove(vd, vn + n_ofs, n_siz); 2908 } else { 2909 /* vd == vn == vm. Need temp space. 
*/ 2910 ARMVectorReg tmp; 2911 swap_memmove(&tmp, vm, n_ofs); 2912 swap_memmove(vd, vd + n_ofs, n_siz); 2913 memcpy(vd + n_siz, &tmp, n_ofs); 2914 } 2915} 2916 2917#define DO_INSR(NAME, TYPE, H) \ 2918void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2919{ \ 2920 intptr_t opr_sz = simd_oprsz(desc); \ 2921 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2922 *(TYPE *)(vd + H(0)) = val; \ 2923} 2924 2925DO_INSR(sve_insr_b, uint8_t, H1) 2926DO_INSR(sve_insr_h, uint16_t, H1_2) 2927DO_INSR(sve_insr_s, uint32_t, H1_4) 2928DO_INSR(sve_insr_d, uint64_t, H1_8) 2929 2930#undef DO_INSR 2931 2932void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2933{ 2934 intptr_t i, j, opr_sz = simd_oprsz(desc); 2935 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2936 uint64_t f = *(uint64_t *)(vn + i); 2937 uint64_t b = *(uint64_t *)(vn + j); 2938 *(uint64_t *)(vd + i) = bswap64(b); 2939 *(uint64_t *)(vd + j) = bswap64(f); 2940 } 2941} 2942 2943void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2944{ 2945 intptr_t i, j, opr_sz = simd_oprsz(desc); 2946 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2947 uint64_t f = *(uint64_t *)(vn + i); 2948 uint64_t b = *(uint64_t *)(vn + j); 2949 *(uint64_t *)(vd + i) = hswap64(b); 2950 *(uint64_t *)(vd + j) = hswap64(f); 2951 } 2952} 2953 2954void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2955{ 2956 intptr_t i, j, opr_sz = simd_oprsz(desc); 2957 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2958 uint64_t f = *(uint64_t *)(vn + i); 2959 uint64_t b = *(uint64_t *)(vn + j); 2960 *(uint64_t *)(vd + i) = rol64(b, 32); 2961 *(uint64_t *)(vd + j) = rol64(f, 32); 2962 } 2963} 2964 2965void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2966{ 2967 intptr_t i, j, opr_sz = simd_oprsz(desc); 2968 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2969 uint64_t f = *(uint64_t *)(vn + i); 2970 uint64_t b = *(uint64_t *)(vn + j); 2971 *(uint64_t *)(vd + i) = b; 2972 *(uint64_t *)(vd + j) = f; 2973 } 2974} 2975 2976typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2977 2978static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2979 bool is_tbx, tb_impl_fn *fn) 2980{ 2981 ARMVectorReg scratch; 2982 uintptr_t oprsz = simd_oprsz(desc); 2983 2984 if (unlikely(vd == vn)) { 2985 vn = memcpy(&scratch, vn, oprsz); 2986 } 2987 2988 fn(vd, vn, NULL, vm, oprsz, is_tbx); 2989} 2990 2991static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 2992 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 2993{ 2994 ARMVectorReg scratch; 2995 uintptr_t oprsz = simd_oprsz(desc); 2996 2997 if (unlikely(vd == vn0)) { 2998 vn0 = memcpy(&scratch, vn0, oprsz); 2999 if (vd == vn1) { 3000 vn1 = vn0; 3001 } 3002 } else if (unlikely(vd == vn1)) { 3003 vn1 = memcpy(&scratch, vn1, oprsz); 3004 } 3005 3006 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 3007} 3008 3009#define DO_TB(SUFF, TYPE, H) \ 3010static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 3011 void *vm, uintptr_t oprsz, bool is_tbx) \ 3012{ \ 3013 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 3014 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 3015 for (i = 0; i < nelem; ++i) { \ 3016 TYPE index = indexes[H1(i)], val = 0; \ 3017 if (index < nelem) { \ 3018 val = tbl0[H(index)]; \ 3019 } else { \ 3020 index -= nelem; \ 3021 if (tbl1 && index < nelem) { \ 3022 val = tbl1[H(index)]; \ 3023 } else if (is_tbx) { \ 3024 continue; \ 3025 } \ 3026 } \ 3027 d[H(i)] = val; \ 3028 } \ 3029} \ 
3030void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3031{ \ 3032 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3033} \ 3034void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3035 void *vm, uint32_t desc) \ 3036{ \ 3037 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3038} \ 3039void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3040{ \ 3041 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3042} 3043 3044DO_TB(b, uint8_t, H1) 3045DO_TB(h, uint16_t, H2) 3046DO_TB(s, uint32_t, H4) 3047DO_TB(d, uint64_t, H8) 3048 3049#undef DO_TB 3050 3051#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3052void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3053{ \ 3054 intptr_t i, opr_sz = simd_oprsz(desc); \ 3055 TYPED *d = vd; \ 3056 TYPES *n = vn; \ 3057 ARMVectorReg tmp; \ 3058 if (unlikely(vn - vd < opr_sz)) { \ 3059 n = memcpy(&tmp, n, opr_sz / 2); \ 3060 } \ 3061 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3062 d[HD(i)] = n[HS(i)]; \ 3063 } \ 3064} 3065 3066DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3067DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3068DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3069 3070DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3071DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3072DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3073 3074#undef DO_UNPK 3075 3076/* Mask of bits included in the even numbered predicates of width esz. 3077 * We also use this for expand_bits/compress_bits, and so extend the 3078 * same pattern out to 16-bit units. 3079 */ 3080static const uint64_t even_bit_esz_masks[5] = { 3081 0x5555555555555555ull, 3082 0x3333333333333333ull, 3083 0x0f0f0f0f0f0f0f0full, 3084 0x00ff00ff00ff00ffull, 3085 0x0000ffff0000ffffull, 3086}; 3087 3088/* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3089 * For N==0, this corresponds to the operation that in qemu/bitops.h 3090 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3091 * section 7-2 Shuffling Bits. 3092 */ 3093static uint64_t expand_bits(uint64_t x, int n) 3094{ 3095 int i; 3096 3097 x &= 0xffffffffu; 3098 for (i = 4; i >= n; i--) { 3099 int sh = 1 << i; 3100 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3101 } 3102 return x; 3103} 3104 3105/* Compress units of 2**(N+1) bits to units of 2**N bits. 3106 * For N==0, this corresponds to the operation that in qemu/bitops.h 3107 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3108 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 3109 */ 3110static uint64_t compress_bits(uint64_t x, int n) 3111{ 3112 int i; 3113 3114 for (i = n; i <= 4; i++) { 3115 int sh = 1 << i; 3116 x &= even_bit_esz_masks[i]; 3117 x = (x >> sh) | x; 3118 } 3119 return x & 0xffffffffu; 3120} 3121 3122void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3123{ 3124 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3125 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3126 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3127 int esize = 1 << esz; 3128 uint64_t *d = vd; 3129 intptr_t i; 3130 3131 if (oprsz <= 8) { 3132 uint64_t nn = *(uint64_t *)vn; 3133 uint64_t mm = *(uint64_t *)vm; 3134 int half = 4 * oprsz; 3135 3136 nn = extract64(nn, high * half, half); 3137 mm = extract64(mm, high * half, half); 3138 nn = expand_bits(nn, esz); 3139 mm = expand_bits(mm, esz); 3140 d[0] = nn | (mm << esize); 3141 } else { 3142 ARMPredicateReg tmp; 3143 3144 /* We produce output faster than we consume input. 
3145 Therefore we must be mindful of possible overlap. */ 3146 if (vd == vn) { 3147 vn = memcpy(&tmp, vn, oprsz); 3148 if (vd == vm) { 3149 vm = vn; 3150 } 3151 } else if (vd == vm) { 3152 vm = memcpy(&tmp, vm, oprsz); 3153 } 3154 if (high) { 3155 high = oprsz >> 1; 3156 } 3157 3158 if ((oprsz & 7) == 0) { 3159 uint32_t *n = vn, *m = vm; 3160 high >>= 2; 3161 3162 for (i = 0; i < oprsz / 8; i++) { 3163 uint64_t nn = n[H4(high + i)]; 3164 uint64_t mm = m[H4(high + i)]; 3165 3166 nn = expand_bits(nn, esz); 3167 mm = expand_bits(mm, esz); 3168 d[i] = nn | (mm << esize); 3169 } 3170 } else { 3171 uint8_t *n = vn, *m = vm; 3172 uint16_t *d16 = vd; 3173 3174 for (i = 0; i < oprsz / 2; i++) { 3175 uint16_t nn = n[H1(high + i)]; 3176 uint16_t mm = m[H1(high + i)]; 3177 3178 nn = expand_bits(nn, esz); 3179 mm = expand_bits(mm, esz); 3180 d16[H2(i)] = nn | (mm << esize); 3181 } 3182 } 3183 } 3184} 3185 3186void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3187{ 3188 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3189 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3190 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3191 uint64_t *d = vd, *n = vn, *m = vm; 3192 uint64_t l, h; 3193 intptr_t i; 3194 3195 if (oprsz <= 8) { 3196 l = compress_bits(n[0] >> odd, esz); 3197 h = compress_bits(m[0] >> odd, esz); 3198 d[0] = l | (h << (4 * oprsz)); 3199 } else { 3200 ARMPredicateReg tmp_m; 3201 intptr_t oprsz_16 = oprsz / 16; 3202 3203 if ((vm - vd) < (uintptr_t)oprsz) { 3204 m = memcpy(&tmp_m, vm, oprsz); 3205 } 3206 3207 for (i = 0; i < oprsz_16; i++) { 3208 l = n[2 * i + 0]; 3209 h = n[2 * i + 1]; 3210 l = compress_bits(l >> odd, esz); 3211 h = compress_bits(h >> odd, esz); 3212 d[i] = l | (h << 32); 3213 } 3214 3215 /* 3216 * For VL which is not a multiple of 512, the results from M do not 3217 * align nicely with the uint64_t for D. Put the aligned results 3218 * from M into TMP_M and then copy it into place afterward. 
3219 */ 3220 if (oprsz & 15) { 3221 int final_shift = (oprsz & 15) * 2; 3222 3223 l = n[2 * i + 0]; 3224 h = n[2 * i + 1]; 3225 l = compress_bits(l >> odd, esz); 3226 h = compress_bits(h >> odd, esz); 3227 d[i] = l | (h << final_shift); 3228 3229 for (i = 0; i < oprsz_16; i++) { 3230 l = m[2 * i + 0]; 3231 h = m[2 * i + 1]; 3232 l = compress_bits(l >> odd, esz); 3233 h = compress_bits(h >> odd, esz); 3234 tmp_m.p[i] = l | (h << 32); 3235 } 3236 l = m[2 * i + 0]; 3237 h = m[2 * i + 1]; 3238 l = compress_bits(l >> odd, esz); 3239 h = compress_bits(h >> odd, esz); 3240 tmp_m.p[i] = l | (h << final_shift); 3241 3242 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3243 } else { 3244 for (i = 0; i < oprsz_16; i++) { 3245 l = m[2 * i + 0]; 3246 h = m[2 * i + 1]; 3247 l = compress_bits(l >> odd, esz); 3248 h = compress_bits(h >> odd, esz); 3249 d[oprsz_16 + i] = l | (h << 32); 3250 } 3251 } 3252 } 3253} 3254 3255void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3256{ 3257 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3258 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3259 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3260 uint64_t *d = vd, *n = vn, *m = vm; 3261 uint64_t mask; 3262 int shr, shl; 3263 intptr_t i; 3264 3265 shl = 1 << esz; 3266 shr = 0; 3267 mask = even_bit_esz_masks[esz]; 3268 if (odd) { 3269 mask <<= shl; 3270 shr = shl; 3271 shl = 0; 3272 } 3273 3274 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3275 uint64_t nn = (n[i] & mask) >> shr; 3276 uint64_t mm = (m[i] & mask) << shl; 3277 d[i] = nn + mm; 3278 } 3279} 3280 3281/* Reverse units of 2**N bits. */ 3282static uint64_t reverse_bits_64(uint64_t x, int n) 3283{ 3284 int i, sh; 3285 3286 x = bswap64(x); 3287 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3288 uint64_t mask = even_bit_esz_masks[i]; 3289 x = ((x & mask) << sh) | ((x >> sh) & mask); 3290 } 3291 return x; 3292} 3293 3294static uint8_t reverse_bits_8(uint8_t x, int n) 3295{ 3296 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3297 int i, sh; 3298 3299 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3300 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3301 } 3302 return x; 3303} 3304 3305void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3306{ 3307 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3308 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3309 intptr_t i, oprsz_2 = oprsz / 2; 3310 3311 if (oprsz <= 8) { 3312 uint64_t l = *(uint64_t *)vn; 3313 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3314 *(uint64_t *)vd = l; 3315 } else if ((oprsz & 15) == 0) { 3316 for (i = 0; i < oprsz_2; i += 8) { 3317 intptr_t ih = oprsz - 8 - i; 3318 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3319 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3320 *(uint64_t *)(vd + i) = h; 3321 *(uint64_t *)(vd + ih) = l; 3322 } 3323 } else { 3324 for (i = 0; i < oprsz_2; i += 1) { 3325 intptr_t il = H1(i); 3326 intptr_t ih = H1(oprsz - 1 - i); 3327 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3328 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3329 *(uint8_t *)(vd + il) = h; 3330 *(uint8_t *)(vd + ih) = l; 3331 } 3332 } 3333} 3334 3335void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3336{ 3337 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3338 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3339 uint64_t *d = vd; 3340 intptr_t i; 3341 3342 if (oprsz <= 8) { 3343 uint64_t nn = *(uint64_t *)vn; 3344 int half = 4 * oprsz; 3345 3346 nn = extract64(nn, high 
* half, half); 3347 nn = expand_bits(nn, 0); 3348 d[0] = nn; 3349 } else { 3350 ARMPredicateReg tmp_n; 3351 3352 /* We produce output faster than we consume input. 3353 Therefore we must be mindful of possible overlap. */ 3354 if ((vn - vd) < (uintptr_t)oprsz) { 3355 vn = memcpy(&tmp_n, vn, oprsz); 3356 } 3357 if (high) { 3358 high = oprsz >> 1; 3359 } 3360 3361 if ((oprsz & 7) == 0) { 3362 uint32_t *n = vn; 3363 high >>= 2; 3364 3365 for (i = 0; i < oprsz / 8; i++) { 3366 uint64_t nn = n[H4(high + i)]; 3367 d[i] = expand_bits(nn, 0); 3368 } 3369 } else { 3370 uint16_t *d16 = vd; 3371 uint8_t *n = vn; 3372 3373 for (i = 0; i < oprsz / 2; i++) { 3374 uint16_t nn = n[H1(high + i)]; 3375 d16[H2(i)] = expand_bits(nn, 0); 3376 } 3377 } 3378 } 3379} 3380 3381#define DO_ZIP(NAME, TYPE, H) \ 3382void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3383{ \ 3384 intptr_t oprsz = simd_oprsz(desc); \ 3385 intptr_t i, oprsz_2 = oprsz / 2; \ 3386 ARMVectorReg tmp_n, tmp_m; \ 3387 /* We produce output faster than we consume input. \ 3388 Therefore we must be mindful of possible overlap. */ \ 3389 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3390 vn = memcpy(&tmp_n, vn, oprsz_2); \ 3391 } \ 3392 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3393 vm = memcpy(&tmp_m, vm, oprsz_2); \ 3394 } \ 3395 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3396 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \ 3397 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \ 3398 } \ 3399 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3400 memset(vd + oprsz - 16, 0, 16); \ 3401 } \ 3402} 3403 3404DO_ZIP(sve_zip_b, uint8_t, H1) 3405DO_ZIP(sve_zip_h, uint16_t, H1_2) 3406DO_ZIP(sve_zip_s, uint32_t, H1_4) 3407DO_ZIP(sve_zip_d, uint64_t, H1_8) 3408DO_ZIP(sve2_zip_q, Int128, ) 3409 3410#define DO_UZP(NAME, TYPE, H) \ 3411void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3412{ \ 3413 intptr_t oprsz = simd_oprsz(desc); \ 3414 intptr_t odd_ofs = simd_data(desc); \ 3415 intptr_t i, p; \ 3416 ARMVectorReg tmp_m; \ 3417 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3418 vm = memcpy(&tmp_m, vm, oprsz); \ 3419 } \ 3420 i = 0, p = odd_ofs; \ 3421 do { \ 3422 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3423 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3424 } while (p < oprsz); \ 3425 p -= oprsz; \ 3426 do { \ 3427 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3428 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3429 } while (p < oprsz); \ 3430 tcg_debug_assert(i == oprsz); \ 3431} 3432 3433DO_UZP(sve_uzp_b, uint8_t, H1) 3434DO_UZP(sve_uzp_h, uint16_t, H1_2) 3435DO_UZP(sve_uzp_s, uint32_t, H1_4) 3436DO_UZP(sve_uzp_d, uint64_t, H1_8) 3437DO_UZP(sve2_uzp_q, Int128, ) 3438 3439#define DO_TRN(NAME, TYPE, H) \ 3440void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3441{ \ 3442 intptr_t oprsz = simd_oprsz(desc); \ 3443 intptr_t odd_ofs = simd_data(desc); \ 3444 intptr_t i; \ 3445 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3446 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3447 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3448 *(TYPE *)(vd + H(i + 0)) = ae; \ 3449 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3450 } \ 3451 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3452 memset(vd + oprsz - 16, 0, 16); \ 3453 } \ 3454} 3455 3456DO_TRN(sve_trn_b, uint8_t, H1) 3457DO_TRN(sve_trn_h, uint16_t, H1_2) 3458DO_TRN(sve_trn_s, uint32_t, H1_4) 3459DO_TRN(sve_trn_d, uint64_t, H1_8) 3460DO_TRN(sve2_trn_q, Int128, ) 3461 3462#undef DO_ZIP 3463#undef DO_UZP 3464#undef DO_TRN 3465 
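/*
 * COMPACT: copy the active elements of the source, in order, to the
 * lowest-numbered elements of the destination and zero the remainder.
 * For example, 32-bit elements {1, 2, 3, 4} under predicate {0, 1, 0, 1}
 * become {2, 4, 0, 0}.  Only word and doubleword helpers are provided.
 */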
3466void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3467{ 3468 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3469 uint32_t *d = vd, *n = vn; 3470 uint8_t *pg = vg; 3471 3472 for (i = j = 0; i < opr_sz; i++) { 3473 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3474 d[H4(j)] = n[H4(i)]; 3475 j++; 3476 } 3477 } 3478 for (; j < opr_sz; j++) { 3479 d[H4(j)] = 0; 3480 } 3481} 3482 3483void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3484{ 3485 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3486 uint64_t *d = vd, *n = vn; 3487 uint8_t *pg = vg; 3488 3489 for (i = j = 0; i < opr_sz; i++) { 3490 if (pg[H1(i)] & 1) { 3491 d[j] = n[i]; 3492 j++; 3493 } 3494 } 3495 for (; j < opr_sz; j++) { 3496 d[j] = 0; 3497 } 3498} 3499 3500/* Similar to the ARM LastActiveElement pseudocode function, except the 3501 * result is multiplied by the element size. This includes the not found 3502 * indication; e.g. not found for esz=3 is -8. 3503 */ 3504int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3505{ 3506 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3507 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3508 3509 return last_active_element(vg, words, esz); 3510} 3511 3512void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3513{ 3514 intptr_t opr_sz = simd_oprsz(desc) / 8; 3515 int esz = simd_data(desc); 3516 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3517 intptr_t i, first_i, last_i; 3518 ARMVectorReg tmp; 3519 3520 first_i = last_i = 0; 3521 first_g = last_g = 0; 3522 3523 /* Find the extent of the active elements within VG. */ 3524 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3525 pg = *(uint64_t *)(vg + i) & mask; 3526 if (pg) { 3527 if (last_g == 0) { 3528 last_g = pg; 3529 last_i = i; 3530 } 3531 first_g = pg; 3532 first_i = i; 3533 } 3534 } 3535 3536 len = 0; 3537 if (first_g != 0) { 3538 first_i = first_i * 8 + ctz64(first_g); 3539 last_i = last_i * 8 + 63 - clz64(last_g); 3540 len = last_i - first_i + (1 << esz); 3541 if (vd == vm) { 3542 vm = memcpy(&tmp, vm, opr_sz * 8); 3543 } 3544 swap_memmove(vd, vn + first_i, len); 3545 } 3546 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3547} 3548 3549void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3550 void *vg, uint32_t desc) 3551{ 3552 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3553 uint64_t *d = vd, *n = vn, *m = vm; 3554 uint8_t *pg = vg; 3555 3556 for (i = 0; i < opr_sz; i += 1) { 3557 uint64_t nn = n[i], mm = m[i]; 3558 uint64_t pp = expand_pred_b(pg[H1(i)]); 3559 d[i] = (nn & pp) | (mm & ~pp); 3560 } 3561} 3562 3563void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3564 void *vg, uint32_t desc) 3565{ 3566 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3567 uint64_t *d = vd, *n = vn, *m = vm; 3568 uint8_t *pg = vg; 3569 3570 for (i = 0; i < opr_sz; i += 1) { 3571 uint64_t nn = n[i], mm = m[i]; 3572 uint64_t pp = expand_pred_h(pg[H1(i)]); 3573 d[i] = (nn & pp) | (mm & ~pp); 3574 } 3575} 3576 3577void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3578 void *vg, uint32_t desc) 3579{ 3580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3581 uint64_t *d = vd, *n = vn, *m = vm; 3582 uint8_t *pg = vg; 3583 3584 for (i = 0; i < opr_sz; i += 1) { 3585 uint64_t nn = n[i], mm = m[i]; 3586 uint64_t pp = expand_pred_s(pg[H1(i)]); 3587 d[i] = (nn & pp) | (mm & ~pp); 3588 } 3589} 3590 3591void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3592 void *vg, uint32_t desc) 3593{ 3594 intptr_t i, 
opr_sz = simd_oprsz(desc) / 8; 3595 uint64_t *d = vd, *n = vn, *m = vm; 3596 uint8_t *pg = vg; 3597 3598 for (i = 0; i < opr_sz; i += 1) { 3599 uint64_t nn = n[i], mm = m[i]; 3600 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3601 } 3602} 3603 3604/* Two operand comparison controlled by a predicate. 3605 * ??? It is very tempting to want to be able to expand this inline 3606 * with x86 instructions, e.g. 3607 * 3608 * vcmpeqw zm, zn, %ymm0 3609 * vpmovmskb %ymm0, %eax 3610 * and $0x5555, %eax 3611 * and pg, %eax 3612 * 3613 * or even aarch64, e.g. 3614 * 3615 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3616 * cmeq v0.8h, zn, zm 3617 * and v0.8h, v0.8h, mask 3618 * addv h0, v0.8h 3619 * and v0.8b, pg 3620 * 3621 * However, coming up with an abstraction that allows vector inputs and 3622 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3623 * scalar outputs, is tricky. 3624 */ 3625#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3626uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3627{ \ 3628 intptr_t opr_sz = simd_oprsz(desc); \ 3629 uint32_t flags = PREDTEST_INIT; \ 3630 intptr_t i = opr_sz; \ 3631 do { \ 3632 uint64_t out = 0, pg; \ 3633 do { \ 3634 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3635 TYPE nn = *(TYPE *)(vn + H(i)); \ 3636 TYPE mm = *(TYPE *)(vm + H(i)); \ 3637 out |= nn OP mm; \ 3638 } while (i & 63); \ 3639 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3640 out &= pg; \ 3641 *(uint64_t *)(vd + (i >> 3)) = out; \ 3642 flags = iter_predtest_bwd(out, pg, flags); \ 3643 } while (i > 0); \ 3644 return flags; \ 3645} 3646 3647#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3648 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3649#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3650 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3651#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3652 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3653#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3654 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3655 3656DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3657DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3658DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3659DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3660 3661DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3662DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3663DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3664DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3665 3666DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3667DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3668DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3669DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3670 3671DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3672DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3673DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3674DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3675 3676DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3677DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3678DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3679DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3680 3681DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3682DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3683DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3684DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3685 3686#undef DO_CMP_PPZZ_B 3687#undef DO_CMP_PPZZ_H 3688#undef DO_CMP_PPZZ_S 3689#undef DO_CMP_PPZZ_D 3690#undef DO_CMP_PPZZ 3691 3692/* Similar, but the second source is "wide". 
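 *
 * "Wide" means the second source holds 64-bit elements: each narrow
 * element of the first source is compared against the single wide
 * element that occupies the same 64-bit chunk.  Rough per-chunk sketch
 * (illustration only):
 *
 *   TYPEW mm = wide element for this chunk;
 *   for (each narrow element nn within the chunk) {
 *       result bit = nn OP mm;
 *   }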
*/ 3693#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3694uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3695{ \ 3696 intptr_t opr_sz = simd_oprsz(desc); \ 3697 uint32_t flags = PREDTEST_INIT; \ 3698 intptr_t i = opr_sz; \ 3699 do { \ 3700 uint64_t out = 0, pg; \ 3701 do { \ 3702 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3703 do { \ 3704 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3705 TYPE nn = *(TYPE *)(vn + H(i)); \ 3706 out |= nn OP mm; \ 3707 } while (i & 7); \ 3708 } while (i & 63); \ 3709 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3710 out &= pg; \ 3711 *(uint64_t *)(vd + (i >> 3)) = out; \ 3712 flags = iter_predtest_bwd(out, pg, flags); \ 3713 } while (i > 0); \ 3714 return flags; \ 3715} 3716 3717#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3718 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3719#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3720 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3721#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3722 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3723 3724DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3725DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3726DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3727 3728DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3729DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3730DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3731 3732DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3733DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3734DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3735 3736DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3737DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3738DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3739 3740DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3741DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3742DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3743 3744DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3745DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3746DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3747 3748DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3749DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3750DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3751 3752DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3753DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3754DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3755 3756DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3757DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3758DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3759 3760DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3761DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3762DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3763 3764#undef DO_CMP_PPZW_B 3765#undef DO_CMP_PPZW_H 3766#undef DO_CMP_PPZW_S 3767#undef DO_CMP_PPZW 3768 3769/* Similar, but the second source is immediate. 
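 *
 * The immediate comparison value is carried in the simd_data() field
 * of DESC and truncated to the element type, so the loop is the same
 * as the vector/vector form with MM held constant.  For example
 * (sketch only), a byte-sized signed greater-than compare against #5
 * reduces to "out |= (int8_t)nn > 5" for each active element.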
*/ 3770#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3771uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3772{ \ 3773 intptr_t opr_sz = simd_oprsz(desc); \ 3774 uint32_t flags = PREDTEST_INIT; \ 3775 TYPE mm = simd_data(desc); \ 3776 intptr_t i = opr_sz; \ 3777 do { \ 3778 uint64_t out = 0, pg; \ 3779 do { \ 3780 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3781 TYPE nn = *(TYPE *)(vn + H(i)); \ 3782 out |= nn OP mm; \ 3783 } while (i & 63); \ 3784 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3785 out &= pg; \ 3786 *(uint64_t *)(vd + (i >> 3)) = out; \ 3787 flags = iter_predtest_bwd(out, pg, flags); \ 3788 } while (i > 0); \ 3789 return flags; \ 3790} 3791 3792#define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3793 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3794#define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3795 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3796#define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3797 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3798#define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3799 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3800 3801DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3802DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3803DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3804DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3805 3806DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3807DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3808DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3809DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3810 3811DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3812DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3813DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3814DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3815 3816DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3817DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3818DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3819DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3820 3821DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3822DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3823DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3824DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3825 3826DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3827DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3828DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3829DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3830 3831DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3832DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3833DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3834DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3835 3836DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3837DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3838DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3839DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3840 3841DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3842DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3843DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3844DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3845 3846DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3847DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3848DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3849DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3850 3851#undef DO_CMP_PPZI_B 3852#undef DO_CMP_PPZI_H 3853#undef DO_CMP_PPZI_S 3854#undef DO_CMP_PPZI_D 3855#undef DO_CMP_PPZI 3856 3857/* Similar to the ARM LastActive pseudocode function. 
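 *
 * That is: locate the most significant bit set in G and report whether
 * the corresponding bit of D is also set; with no bits set in G the
 * result is false.  The loop below scans 64 bits at a time from the
 * top, and pow2floor() isolates the highest set bit of the first
 * non-zero guard word.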
*/ 3858static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3859{ 3860 intptr_t i; 3861 3862 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3863 uint64_t pg = *(uint64_t *)(vg + i); 3864 if (pg) { 3865 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3866 } 3867 } 3868 return 0; 3869} 3870 3871/* Compute a mask into RETB that is true for all G, up to and including 3872 * (if after) or excluding (if !after) the first G & N. 3873 * Return true if BRK found. 3874 */ 3875static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3876 bool brk, bool after) 3877{ 3878 uint64_t b; 3879 3880 if (brk) { 3881 b = 0; 3882 } else if ((g & n) == 0) { 3883 /* For all G, no N are set; break not found. */ 3884 b = g; 3885 } else { 3886 /* Break somewhere in N. Locate it. */ 3887 b = g & n; /* guard true, pred true */ 3888 b = b & -b; /* first such */ 3889 if (after) { 3890 b = b | (b - 1); /* break after same */ 3891 } else { 3892 b = b - 1; /* break before same */ 3893 } 3894 brk = true; 3895 } 3896 3897 *retb = b; 3898 return brk; 3899} 3900 3901/* Compute a zeroing BRK. */ 3902static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3903 intptr_t oprsz, bool after) 3904{ 3905 bool brk = false; 3906 intptr_t i; 3907 3908 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3909 uint64_t this_b, this_g = g[i]; 3910 3911 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3912 d[i] = this_b & this_g; 3913 } 3914} 3915 3916/* Likewise, but also compute flags. */ 3917static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3918 intptr_t oprsz, bool after) 3919{ 3920 uint32_t flags = PREDTEST_INIT; 3921 bool brk = false; 3922 intptr_t i; 3923 3924 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3925 uint64_t this_b, this_d, this_g = g[i]; 3926 3927 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3928 d[i] = this_d = this_b & this_g; 3929 flags = iter_predtest_fwd(this_d, this_g, flags); 3930 } 3931 return flags; 3932} 3933 3934/* Compute a merging BRK. */ 3935static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3936 intptr_t oprsz, bool after) 3937{ 3938 bool brk = false; 3939 intptr_t i; 3940 3941 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3942 uint64_t this_b, this_g = g[i]; 3943 3944 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3945 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3946 } 3947} 3948 3949/* Likewise, but also compute flags. */ 3950static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3951 intptr_t oprsz, bool after) 3952{ 3953 uint32_t flags = PREDTEST_INIT; 3954 bool brk = false; 3955 intptr_t i; 3956 3957 for (i = 0; i < oprsz / 8; ++i) { 3958 uint64_t this_b, this_d = d[i], this_g = g[i]; 3959 3960 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3961 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3962 flags = iter_predtest_fwd(this_d, this_g, flags); 3963 } 3964 return flags; 3965} 3966 3967static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3968{ 3969 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3970 * The compiler should turn this into 4 64-bit integer stores. 
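 *
 * The returned PREDTEST_INIT encodes N clear, Z set, C set, which is
 * the correct PredTest result for a destination with no active
 * elements.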
3971 */ 3972 memset(d, 0, sizeof(ARMPredicateReg)); 3973 return PREDTEST_INIT; 3974} 3975 3976void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 3977 uint32_t pred_desc) 3978{ 3979 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3980 if (last_active_pred(vn, vg, oprsz)) { 3981 compute_brk_z(vd, vm, vg, oprsz, true); 3982 } else { 3983 do_zero(vd, oprsz); 3984 } 3985} 3986 3987uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 3988 uint32_t pred_desc) 3989{ 3990 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3991 if (last_active_pred(vn, vg, oprsz)) { 3992 return compute_brks_z(vd, vm, vg, oprsz, true); 3993 } else { 3994 return do_zero(vd, oprsz); 3995 } 3996} 3997 3998void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 3999 uint32_t pred_desc) 4000{ 4001 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4002 if (last_active_pred(vn, vg, oprsz)) { 4003 compute_brk_z(vd, vm, vg, oprsz, false); 4004 } else { 4005 do_zero(vd, oprsz); 4006 } 4007} 4008 4009uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4010 uint32_t pred_desc) 4011{ 4012 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4013 if (last_active_pred(vn, vg, oprsz)) { 4014 return compute_brks_z(vd, vm, vg, oprsz, false); 4015 } else { 4016 return do_zero(vd, oprsz); 4017 } 4018} 4019 4020void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4021{ 4022 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4023 compute_brk_z(vd, vn, vg, oprsz, true); 4024} 4025 4026uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4027{ 4028 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4029 return compute_brks_z(vd, vn, vg, oprsz, true); 4030} 4031 4032void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4033{ 4034 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4035 compute_brk_z(vd, vn, vg, oprsz, false); 4036} 4037 4038uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4039{ 4040 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4041 return compute_brks_z(vd, vn, vg, oprsz, false); 4042} 4043 4044void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4045{ 4046 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4047 compute_brk_m(vd, vn, vg, oprsz, true); 4048} 4049 4050uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4051{ 4052 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4053 return compute_brks_m(vd, vn, vg, oprsz, true); 4054} 4055 4056void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4057{ 4058 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4059 compute_brk_m(vd, vn, vg, oprsz, false); 4060} 4061 4062uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4063{ 4064 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4065 return compute_brks_m(vd, vn, vg, oprsz, false); 4066} 4067 4068void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4069{ 4070 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4071 if (!last_active_pred(vn, vg, oprsz)) { 4072 do_zero(vd, oprsz); 4073 } 4074} 4075 4076/* As if PredTest(Ones(PL), D, esz). 
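 *
 * In other words, test D against an all-true governing predicate of
 * the given element size: every element participates, and any trailing
 * partial word (OPRSZ not a multiple of 8) is masked down to the valid
 * bits before the final iteration.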
*/ 4077static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4078 uint64_t esz_mask) 4079{ 4080 uint32_t flags = PREDTEST_INIT; 4081 intptr_t i; 4082 4083 for (i = 0; i < oprsz / 8; i++) { 4084 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4085 } 4086 if (oprsz & 7) { 4087 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4088 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4089 } 4090 return flags; 4091} 4092 4093uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4094{ 4095 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4096 if (last_active_pred(vn, vg, oprsz)) { 4097 return predtest_ones(vd, oprsz, -1); 4098 } else { 4099 return do_zero(vd, oprsz); 4100 } 4101} 4102 4103uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4104{ 4105 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4106 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4107 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4108 intptr_t i; 4109 4110 for (i = 0; i < words; ++i) { 4111 uint64_t t = n[i] & g[i] & mask; 4112 sum += ctpop64(t); 4113 } 4114 return sum; 4115} 4116 4117uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4118{ 4119 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4120 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4121 uint64_t esz_mask = pred_esz_masks[esz]; 4122 ARMPredicateReg *d = vd; 4123 uint32_t flags; 4124 intptr_t i; 4125 4126 /* Begin with a zero predicate register. */ 4127 flags = do_zero(d, oprsz); 4128 if (count == 0) { 4129 return flags; 4130 } 4131 4132 /* Set all of the requested bits. */ 4133 for (i = 0; i < count / 64; ++i) { 4134 d->p[i] = esz_mask; 4135 } 4136 if (count & 63) { 4137 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4138 } 4139 4140 return predtest_ones(d, oprsz, esz_mask); 4141} 4142 4143uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4144{ 4145 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4146 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4147 uint64_t esz_mask = pred_esz_masks[esz]; 4148 ARMPredicateReg *d = vd; 4149 intptr_t i, invcount, oprbits; 4150 uint64_t bits; 4151 4152 if (count == 0) { 4153 return do_zero(d, oprsz); 4154 } 4155 4156 oprbits = oprsz * 8; 4157 tcg_debug_assert(count <= oprbits); 4158 4159 bits = esz_mask; 4160 if (oprbits & 63) { 4161 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4162 } 4163 4164 invcount = oprbits - count; 4165 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4166 d->p[i] = bits; 4167 bits = esz_mask; 4168 } 4169 4170 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4171 4172 while (--i >= 0) { 4173 d->p[i] = 0; 4174 } 4175 4176 return predtest_ones(d, oprsz, esz_mask); 4177} 4178 4179/* Recursive reduction on a function; 4180 * C.f. the ARM ARM function ReducePredicated. 4181 * 4182 * While it would be possible to write this without the DATA temporary, 4183 * it is much simpler to process the predicate register this way. 4184 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4185 * little to gain with a more complex non-recursive form. 
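 *
 * Rough shape of the expansion (illustration only): the predicated
 * elements are first copied into a flat DATA array, with inactive
 * lanes and the tail up to MAXSZ replaced by the identity value, and
 * the array is then folded pairwise:
 *
 *   reduce(data, n):
 *     if (n == 1) return data[0];
 *     return OP(reduce(data, n / 2), reduce(data + n / 2, n / 2));
 *
 * MAXSZ is a power-of-two bound on the vector size, so the halving
 * always splits evenly and the identity padding keeps the extra lanes
 * harmless.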
4186 */ 4187#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4188static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4189{ \ 4190 if (n == 1) { \ 4191 return *data; \ 4192 } else { \ 4193 uintptr_t half = n / 2; \ 4194 TYPE lo = NAME##_reduce(data, status, half); \ 4195 TYPE hi = NAME##_reduce(data + half, status, half); \ 4196 return TYPE##_##FUNC(lo, hi, status); \ 4197 } \ 4198} \ 4199uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ 4200{ \ 4201 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4202 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4203 for (i = 0; i < oprsz; ) { \ 4204 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4205 do { \ 4206 TYPE nn = *(TYPE *)(vn + H(i)); \ 4207 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4208 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4209 } while (i & 15); \ 4210 } \ 4211 for (; i < maxsz; i += sizeof(TYPE)) { \ 4212 *(TYPE *)((void *)data + i) = IDENT; \ 4213 } \ 4214 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ 4215} 4216 4217DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) 4218DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) 4219DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) 4220 4221/* Identity is floatN_default_nan, without the function call. */ 4222DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) 4223DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) 4224DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) 4225 4226DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) 4227DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) 4228DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) 4229 4230DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) 4231DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) 4232DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) 4233 4234DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) 4235DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) 4236DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) 4237 4238#undef DO_REDUCE 4239 4240uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4241 void *status, uint32_t desc) 4242{ 4243 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4244 float16 result = nn; 4245 4246 do { 4247 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4248 do { 4249 if (pg & 1) { 4250 float16 mm = *(float16 *)(vm + H1_2(i)); 4251 result = float16_add(result, mm, status); 4252 } 4253 i += sizeof(float16), pg >>= sizeof(float16); 4254 } while (i & 15); 4255 } while (i < opr_sz); 4256 4257 return result; 4258} 4259 4260uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4261 void *status, uint32_t desc) 4262{ 4263 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4264 float32 result = nn; 4265 4266 do { 4267 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4268 do { 4269 if (pg & 1) { 4270 float32 mm = *(float32 *)(vm + H1_2(i)); 4271 result = float32_add(result, mm, status); 4272 } 4273 i += sizeof(float32), pg >>= sizeof(float32); 4274 } while (i & 15); 4275 } while (i < opr_sz); 4276 4277 return result; 4278} 4279 4280uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4281 void *status, uint32_t desc) 4282{ 4283 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4284 uint64_t *m = vm; 4285 uint8_t *pg = vg; 4286 4287 for (i = 0; i < opr_sz; i++) { 4288 if (pg[H1(i)] & 1) { 4289 nn = float64_add(nn, m[i], status); 4290 } 4291 } 
4292 4293 return nn; 4294} 4295 4296/* Fully general three-operand expander, controlled by a predicate, 4297 * With the extra float_status parameter. 4298 */ 4299#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4300void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4301 void *status, uint32_t desc) \ 4302{ \ 4303 intptr_t i = simd_oprsz(desc); \ 4304 uint64_t *g = vg; \ 4305 do { \ 4306 uint64_t pg = g[(i - 1) >> 6]; \ 4307 do { \ 4308 i -= sizeof(TYPE); \ 4309 if (likely((pg >> (i & 63)) & 1)) { \ 4310 TYPE nn = *(TYPE *)(vn + H(i)); \ 4311 TYPE mm = *(TYPE *)(vm + H(i)); \ 4312 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4313 } \ 4314 } while (i & 63); \ 4315 } while (i != 0); \ 4316} 4317 4318DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4319DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4320DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4321 4322DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4323DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4324DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4325 4326DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4327DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4328DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4329 4330DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4331DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4332DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4333 4334DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4335DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4336DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4337 4338DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4339DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4340DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4341 4342DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4343DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4344DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4345 4346DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4347DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4348DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4349 4350static inline float16 abd_h(float16 a, float16 b, float_status *s) 4351{ 4352 return float16_abs(float16_sub(a, b, s)); 4353} 4354 4355static inline float32 abd_s(float32 a, float32 b, float_status *s) 4356{ 4357 return float32_abs(float32_sub(a, b, s)); 4358} 4359 4360static inline float64 abd_d(float64 a, float64 b, float_status *s) 4361{ 4362 return float64_abs(float64_sub(a, b, s)); 4363} 4364 4365DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4366DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4367DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4368 4369static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4370{ 4371 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4372 return float64_scalbn(a, b_int, s); 4373} 4374 4375DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4376DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4377DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4378 4379DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4380DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4381DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4382 4383#undef DO_ZPZZ_FP 4384 4385/* Three-operand expander, with one scalar operand, controlled by 4386 * a predicate, with the extra float_status parameter. 
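 *
 * The scalar arrives in the 64-bit SCALAR argument and is truncated to
 * the element type, so e.g. (sketch only) the half-precision
 * FADD-with-immediate forms reach the helper with the fp16 encoding of
 * the immediate in the low 16 bits of SCALAR.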
4387 */ 4388#define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4389void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4390 void *status, uint32_t desc) \ 4391{ \ 4392 intptr_t i = simd_oprsz(desc); \ 4393 uint64_t *g = vg; \ 4394 TYPE mm = scalar; \ 4395 do { \ 4396 uint64_t pg = g[(i - 1) >> 6]; \ 4397 do { \ 4398 i -= sizeof(TYPE); \ 4399 if (likely((pg >> (i & 63)) & 1)) { \ 4400 TYPE nn = *(TYPE *)(vn + H(i)); \ 4401 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4402 } \ 4403 } while (i & 63); \ 4404 } while (i != 0); \ 4405} 4406 4407DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4408DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4409DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4410 4411DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4412DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4413DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4414 4415DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4416DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4417DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4418 4419static inline float16 subr_h(float16 a, float16 b, float_status *s) 4420{ 4421 return float16_sub(b, a, s); 4422} 4423 4424static inline float32 subr_s(float32 a, float32 b, float_status *s) 4425{ 4426 return float32_sub(b, a, s); 4427} 4428 4429static inline float64 subr_d(float64 a, float64 b, float_status *s) 4430{ 4431 return float64_sub(b, a, s); 4432} 4433 4434DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4435DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4436DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4437 4438DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4439DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4440DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4441 4442DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4443DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4444DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4445 4446DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4447DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4448DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4449 4450DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4451DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4452DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4453 4454/* Fully general two-operand expander, controlled by a predicate, 4455 * With the extra float_status parameter. 4456 */ 4457#define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4458void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ 4459{ \ 4460 intptr_t i = simd_oprsz(desc); \ 4461 uint64_t *g = vg; \ 4462 do { \ 4463 uint64_t pg = g[(i - 1) >> 6]; \ 4464 do { \ 4465 i -= sizeof(TYPE); \ 4466 if (likely((pg >> (i & 63)) & 1)) { \ 4467 TYPE nn = *(TYPE *)(vn + H(i)); \ 4468 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4469 } \ 4470 } while (i & 63); \ 4471 } while (i != 0); \ 4472} 4473 4474/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4475 * FZ16. When converting from fp16, this affects flushing input denormals; 4476 * when converting to fp16, this affects flushing output denormals. 
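 *
 * The wrappers below implement this by saving the relevant flush flag,
 * clearing it around the softfloat call, and restoring it afterwards;
 * the "true" argument to the conversion routines selects IEEE
 * half-precision rather than the ARM alternative format.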
4477 */ 4478static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4479{ 4480 bool save = get_flush_inputs_to_zero(fpst); 4481 float32 ret; 4482 4483 set_flush_inputs_to_zero(false, fpst); 4484 ret = float16_to_float32(f, true, fpst); 4485 set_flush_inputs_to_zero(save, fpst); 4486 return ret; 4487} 4488 4489static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4490{ 4491 bool save = get_flush_inputs_to_zero(fpst); 4492 float64 ret; 4493 4494 set_flush_inputs_to_zero(false, fpst); 4495 ret = float16_to_float64(f, true, fpst); 4496 set_flush_inputs_to_zero(save, fpst); 4497 return ret; 4498} 4499 4500static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4501{ 4502 bool save = get_flush_to_zero(fpst); 4503 float16 ret; 4504 4505 set_flush_to_zero(false, fpst); 4506 ret = float32_to_float16(f, true, fpst); 4507 set_flush_to_zero(save, fpst); 4508 return ret; 4509} 4510 4511static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4512{ 4513 bool save = get_flush_to_zero(fpst); 4514 float16 ret; 4515 4516 set_flush_to_zero(false, fpst); 4517 ret = float64_to_float16(f, true, fpst); 4518 set_flush_to_zero(save, fpst); 4519 return ret; 4520} 4521 4522static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4523{ 4524 if (float16_is_any_nan(f)) { 4525 float_raise(float_flag_invalid, s); 4526 return 0; 4527 } 4528 return float16_to_int16_round_to_zero(f, s); 4529} 4530 4531static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4532{ 4533 if (float16_is_any_nan(f)) { 4534 float_raise(float_flag_invalid, s); 4535 return 0; 4536 } 4537 return float16_to_int64_round_to_zero(f, s); 4538} 4539 4540static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4541{ 4542 if (float32_is_any_nan(f)) { 4543 float_raise(float_flag_invalid, s); 4544 return 0; 4545 } 4546 return float32_to_int64_round_to_zero(f, s); 4547} 4548 4549static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4550{ 4551 if (float64_is_any_nan(f)) { 4552 float_raise(float_flag_invalid, s); 4553 return 0; 4554 } 4555 return float64_to_int64_round_to_zero(f, s); 4556} 4557 4558static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4559{ 4560 if (float16_is_any_nan(f)) { 4561 float_raise(float_flag_invalid, s); 4562 return 0; 4563 } 4564 return float16_to_uint16_round_to_zero(f, s); 4565} 4566 4567static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4568{ 4569 if (float16_is_any_nan(f)) { 4570 float_raise(float_flag_invalid, s); 4571 return 0; 4572 } 4573 return float16_to_uint64_round_to_zero(f, s); 4574} 4575 4576static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4577{ 4578 if (float32_is_any_nan(f)) { 4579 float_raise(float_flag_invalid, s); 4580 return 0; 4581 } 4582 return float32_to_uint64_round_to_zero(f, s); 4583} 4584 4585static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4586{ 4587 if (float64_is_any_nan(f)) { 4588 float_raise(float_flag_invalid, s); 4589 return 0; 4590 } 4591 return float64_to_uint64_round_to_zero(f, s); 4592} 4593 4594DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4595DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4596DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4597DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4598DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4599DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) 4600DO_ZPZ_FP(sve_fcvt_sd, 
uint64_t, H1_8, float32_to_float64) 4601 4602DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4603DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4604DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4605DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4606DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4607DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4608DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4609 4610DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4611DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4612DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4613DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4614DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4615DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4616DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4617 4618DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4619DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4620DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4621 4622DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4623DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4624DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4625 4626DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4627DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4628DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4629 4630DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4631DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4632DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4633 4634DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4635DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4636DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4637DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4638DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4639DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4640DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4641 4642DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4643DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4644DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4645DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4646DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4647DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4648DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4649 4650static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4651{ 4652 /* Extract frac to the top of the uint32_t. */ 4653 uint32_t frac = (uint32_t)a << (16 + 6); 4654 int16_t exp = extract32(a, 10, 5); 4655 4656 if (unlikely(exp == 0)) { 4657 if (frac != 0) { 4658 if (!get_flush_inputs_to_zero(s)) { 4659 /* denormal: bias - fractional_zeros */ 4660 return -15 - clz32(frac); 4661 } 4662 /* flush to zero */ 4663 float_raise(float_flag_input_denormal, s); 4664 } 4665 } else if (unlikely(exp == 0x1f)) { 4666 if (frac == 0) { 4667 return INT16_MAX; /* infinity */ 4668 } 4669 } else { 4670 /* normal: exp - bias */ 4671 return exp - 15; 4672 } 4673 /* nan or zero */ 4674 float_raise(float_flag_invalid, s); 4675 return INT16_MIN; 4676} 4677 4678static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4679{ 4680 /* Extract frac to the top of the uint32_t. 
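     *
     * Worked example (illustration only): FLOGB of 1.0f (0x3f800000)
     * has exp = 127, frac = 0 and returns 0; FLOGB of 3.0f returns 1;
     * a zero or NaN input raises Invalid and returns INT32_MIN.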
*/ 4681 uint32_t frac = a << 9; 4682 int32_t exp = extract32(a, 23, 8); 4683 4684 if (unlikely(exp == 0)) { 4685 if (frac != 0) { 4686 if (!get_flush_inputs_to_zero(s)) { 4687 /* denormal: bias - fractional_zeros */ 4688 return -127 - clz32(frac); 4689 } 4690 /* flush to zero */ 4691 float_raise(float_flag_input_denormal, s); 4692 } 4693 } else if (unlikely(exp == 0xff)) { 4694 if (frac == 0) { 4695 return INT32_MAX; /* infinity */ 4696 } 4697 } else { 4698 /* normal: exp - bias */ 4699 return exp - 127; 4700 } 4701 /* nan or zero */ 4702 float_raise(float_flag_invalid, s); 4703 return INT32_MIN; 4704} 4705 4706static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4707{ 4708 /* Extract frac to the top of the uint64_t. */ 4709 uint64_t frac = a << 12; 4710 int64_t exp = extract64(a, 52, 11); 4711 4712 if (unlikely(exp == 0)) { 4713 if (frac != 0) { 4714 if (!get_flush_inputs_to_zero(s)) { 4715 /* denormal: bias - fractional_zeros */ 4716 return -1023 - clz64(frac); 4717 } 4718 /* flush to zero */ 4719 float_raise(float_flag_input_denormal, s); 4720 } 4721 } else if (unlikely(exp == 0x7ff)) { 4722 if (frac == 0) { 4723 return INT64_MAX; /* infinity */ 4724 } 4725 } else { 4726 /* normal: exp - bias */ 4727 return exp - 1023; 4728 } 4729 /* nan or zero */ 4730 float_raise(float_flag_invalid, s); 4731 return INT64_MIN; 4732} 4733 4734DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4735DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4736DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4737 4738#undef DO_ZPZ_FP 4739 4740static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4741 float_status *status, uint32_t desc, 4742 uint16_t neg1, uint16_t neg3) 4743{ 4744 intptr_t i = simd_oprsz(desc); 4745 uint64_t *g = vg; 4746 4747 do { 4748 uint64_t pg = g[(i - 1) >> 6]; 4749 do { 4750 i -= 2; 4751 if (likely((pg >> (i & 63)) & 1)) { 4752 float16 e1, e2, e3, r; 4753 4754 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4755 e2 = *(uint16_t *)(vm + H1_2(i)); 4756 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4757 r = float16_muladd(e1, e2, e3, 0, status); 4758 *(uint16_t *)(vd + H1_2(i)) = r; 4759 } 4760 } while (i & 63); 4761 } while (i != 0); 4762} 4763 4764void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4765 void *vg, void *status, uint32_t desc) 4766{ 4767 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); 4768} 4769 4770void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4771 void *vg, void *status, uint32_t desc) 4772{ 4773 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); 4774} 4775 4776void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4777 void *vg, void *status, uint32_t desc) 4778{ 4779 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); 4780} 4781 4782void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4783 void *vg, void *status, uint32_t desc) 4784{ 4785 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); 4786} 4787 4788static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4789 float_status *status, uint32_t desc, 4790 uint32_t neg1, uint32_t neg3) 4791{ 4792 intptr_t i = simd_oprsz(desc); 4793 uint64_t *g = vg; 4794 4795 do { 4796 uint64_t pg = g[(i - 1) >> 6]; 4797 do { 4798 i -= 4; 4799 if (likely((pg >> (i & 63)) & 1)) { 4800 float32 e1, e2, e3, r; 4801 4802 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4803 e2 = *(uint32_t *)(vm + H1_4(i)); 4804 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4805 r = 
float32_muladd(e1, e2, e3, 0, status); 4806 *(uint32_t *)(vd + H1_4(i)) = r; 4807 } 4808 } while (i & 63); 4809 } while (i != 0); 4810} 4811 4812void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4813 void *vg, void *status, uint32_t desc) 4814{ 4815 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); 4816} 4817 4818void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4819 void *vg, void *status, uint32_t desc) 4820{ 4821 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); 4822} 4823 4824void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4825 void *vg, void *status, uint32_t desc) 4826{ 4827 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); 4828} 4829 4830void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4831 void *vg, void *status, uint32_t desc) 4832{ 4833 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); 4834} 4835 4836static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4837 float_status *status, uint32_t desc, 4838 uint64_t neg1, uint64_t neg3) 4839{ 4840 intptr_t i = simd_oprsz(desc); 4841 uint64_t *g = vg; 4842 4843 do { 4844 uint64_t pg = g[(i - 1) >> 6]; 4845 do { 4846 i -= 8; 4847 if (likely((pg >> (i & 63)) & 1)) { 4848 float64 e1, e2, e3, r; 4849 4850 e1 = *(uint64_t *)(vn + i) ^ neg1; 4851 e2 = *(uint64_t *)(vm + i); 4852 e3 = *(uint64_t *)(va + i) ^ neg3; 4853 r = float64_muladd(e1, e2, e3, 0, status); 4854 *(uint64_t *)(vd + i) = r; 4855 } 4856 } while (i & 63); 4857 } while (i != 0); 4858} 4859 4860void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4861 void *vg, void *status, uint32_t desc) 4862{ 4863 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); 4864} 4865 4866void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4867 void *vg, void *status, uint32_t desc) 4868{ 4869 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); 4870} 4871 4872void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4873 void *vg, void *status, uint32_t desc) 4874{ 4875 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); 4876} 4877 4878void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4879 void *vg, void *status, uint32_t desc) 4880{ 4881 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); 4882} 4883 4884/* Two operand floating-point comparison controlled by a predicate. 4885 * Unlike the integer version, we are not allowed to optimistically 4886 * compare operands, since the comparison may have side effects wrt 4887 * the FPSR. 
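 *
 * Note the split below between signalling and quiet comparisons:
 * FCMGE/FCMGT and FACGE/FACGT use TYPE##_compare, which raises Invalid
 * Operation for any NaN operand, while FCMEQ/FCMNE/FCMUO use
 * TYPE##_compare_quiet, which signals only for signalling NaNs.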
4888 */ 4889#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 4890void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4891 void *status, uint32_t desc) \ 4892{ \ 4893 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4894 uint64_t *d = vd, *g = vg; \ 4895 do { \ 4896 uint64_t out = 0, pg = g[j]; \ 4897 do { \ 4898 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4899 if (likely((pg >> (i & 63)) & 1)) { \ 4900 TYPE nn = *(TYPE *)(vn + H(i)); \ 4901 TYPE mm = *(TYPE *)(vm + H(i)); \ 4902 out |= OP(TYPE, nn, mm, status); \ 4903 } \ 4904 } while (i & 63); \ 4905 d[j--] = out; \ 4906 } while (i > 0); \ 4907} 4908 4909#define DO_FPCMP_PPZZ_H(NAME, OP) \ 4910 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 4911#define DO_FPCMP_PPZZ_S(NAME, OP) \ 4912 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 4913#define DO_FPCMP_PPZZ_D(NAME, OP) \ 4914 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 4915 4916#define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 4917 DO_FPCMP_PPZZ_H(NAME, OP) \ 4918 DO_FPCMP_PPZZ_S(NAME, OP) \ 4919 DO_FPCMP_PPZZ_D(NAME, OP) 4920 4921#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 4922#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 4923#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 4924#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 4925#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 4926#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 4927#define DO_FCMUO(TYPE, X, Y, ST) \ 4928 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 4929#define DO_FACGE(TYPE, X, Y, ST) \ 4930 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 4931#define DO_FACGT(TYPE, X, Y, ST) \ 4932 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 4933 4934DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 4935DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 4936DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 4937DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 4938DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 4939DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 4940DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 4941 4942#undef DO_FPCMP_PPZZ_ALL 4943#undef DO_FPCMP_PPZZ_D 4944#undef DO_FPCMP_PPZZ_S 4945#undef DO_FPCMP_PPZZ_H 4946#undef DO_FPCMP_PPZZ 4947 4948/* One operand floating-point comparison against zero, controlled 4949 * by a predicate. 4950 */ 4951#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 4952void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4953 void *status, uint32_t desc) \ 4954{ \ 4955 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4956 uint64_t *d = vd, *g = vg; \ 4957 do { \ 4958 uint64_t out = 0, pg = g[j]; \ 4959 do { \ 4960 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4961 if ((pg >> (i & 63)) & 1) { \ 4962 TYPE nn = *(TYPE *)(vn + H(i)); \ 4963 out |= OP(TYPE, nn, 0, status); \ 4964 } \ 4965 } while (i & 63); \ 4966 d[j--] = out; \ 4967 } while (i > 0); \ 4968} 4969 4970#define DO_FPCMP_PPZ0_H(NAME, OP) \ 4971 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 4972#define DO_FPCMP_PPZ0_S(NAME, OP) \ 4973 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 4974#define DO_FPCMP_PPZ0_D(NAME, OP) \ 4975 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 4976 4977#define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 4978 DO_FPCMP_PPZ0_H(NAME, OP) \ 4979 DO_FPCMP_PPZ0_S(NAME, OP) \ 4980 DO_FPCMP_PPZ0_D(NAME, OP) 4981 4982DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 4983DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 4984DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 4985DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 4986DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 4987DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 4988 4989/* FP Trig Multiply-Add. 
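 *
 * Each lane computes d[i] = n[i] * |m[i]| + coeff[x + (m[i] < 0 ? 8 : 0)],
 * where X is the immediate from simd_data().  The two halves of each
 * coefficient table appear to hold the Taylor-series terms for sine
 * (1, -1/6, 1/120, ...) and cosine (1, -1/2, 1/24, ...) respectively
 * (observation, not a statement from the source).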
*/ 4990 4991void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 4992{ 4993 static const float16 coeff[16] = { 4994 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4995 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4996 }; 4997 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 4998 intptr_t x = simd_data(desc); 4999 float16 *d = vd, *n = vn, *m = vm; 5000 for (i = 0; i < opr_sz; i++) { 5001 float16 mm = m[i]; 5002 intptr_t xx = x; 5003 if (float16_is_neg(mm)) { 5004 mm = float16_abs(mm); 5005 xx += 8; 5006 } 5007 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); 5008 } 5009} 5010 5011void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5012{ 5013 static const float32 coeff[16] = { 5014 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5015 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5016 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5017 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5018 }; 5019 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5020 intptr_t x = simd_data(desc); 5021 float32 *d = vd, *n = vn, *m = vm; 5022 for (i = 0; i < opr_sz; i++) { 5023 float32 mm = m[i]; 5024 intptr_t xx = x; 5025 if (float32_is_neg(mm)) { 5026 mm = float32_abs(mm); 5027 xx += 8; 5028 } 5029 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); 5030 } 5031} 5032 5033void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) 5034{ 5035 static const float64 coeff[16] = { 5036 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5037 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5038 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5039 0x3de5d8408868552full, 0x0000000000000000ull, 5040 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5041 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5042 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5043 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5044 }; 5045 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5046 intptr_t x = simd_data(desc); 5047 float64 *d = vd, *n = vn, *m = vm; 5048 for (i = 0; i < opr_sz; i++) { 5049 float64 mm = m[i]; 5050 intptr_t xx = x; 5051 if (float64_is_neg(mm)) { 5052 mm = float64_abs(mm); 5053 xx += 8; 5054 } 5055 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs); 5056 } 5057} 5058 5059/* 5060 * FP Complex Add 5061 */ 5062 5063void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5064 void *vs, uint32_t desc) 5065{ 5066 intptr_t j, i = simd_oprsz(desc); 5067 uint64_t *g = vg; 5068 float16 neg_imag = float16_set_sign(0, simd_data(desc)); 5069 float16 neg_real = float16_chs(neg_imag); 5070 5071 do { 5072 uint64_t pg = g[(i - 1) >> 6]; 5073 do { 5074 float16 e0, e1, e2, e3; 5075 5076 /* I holds the real index; J holds the imag index. 
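             *
             * The second operand is effectively rotated by +/-90
             * degrees before the add: one rotation negates the
             * imaginary part added to the real lane, the other negates
             * the real part added to the imaginary lane, selected via
             * simd_data().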
*/ 5077 j = i - sizeof(float16); 5078 i -= 2 * sizeof(float16); 5079 5080 e0 = *(float16 *)(vn + H1_2(i)); 5081 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; 5082 e2 = *(float16 *)(vn + H1_2(j)); 5083 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; 5084 5085 if (likely((pg >> (i & 63)) & 1)) { 5086 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); 5087 } 5088 if (likely((pg >> (j & 63)) & 1)) { 5089 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); 5090 } 5091 } while (i & 63); 5092 } while (i != 0); 5093} 5094 5095void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5096 void *vs, uint32_t desc) 5097{ 5098 intptr_t j, i = simd_oprsz(desc); 5099 uint64_t *g = vg; 5100 float32 neg_imag = float32_set_sign(0, simd_data(desc)); 5101 float32 neg_real = float32_chs(neg_imag); 5102 5103 do { 5104 uint64_t pg = g[(i - 1) >> 6]; 5105 do { 5106 float32 e0, e1, e2, e3; 5107 5108 /* I holds the real index; J holds the imag index. */ 5109 j = i - sizeof(float32); 5110 i -= 2 * sizeof(float32); 5111 5112 e0 = *(float32 *)(vn + H1_2(i)); 5113 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; 5114 e2 = *(float32 *)(vn + H1_2(j)); 5115 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; 5116 5117 if (likely((pg >> (i & 63)) & 1)) { 5118 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); 5119 } 5120 if (likely((pg >> (j & 63)) & 1)) { 5121 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); 5122 } 5123 } while (i & 63); 5124 } while (i != 0); 5125} 5126 5127void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5128 void *vs, uint32_t desc) 5129{ 5130 intptr_t j, i = simd_oprsz(desc); 5131 uint64_t *g = vg; 5132 float64 neg_imag = float64_set_sign(0, simd_data(desc)); 5133 float64 neg_real = float64_chs(neg_imag); 5134 5135 do { 5136 uint64_t pg = g[(i - 1) >> 6]; 5137 do { 5138 float64 e0, e1, e2, e3; 5139 5140 /* I holds the real index; J holds the imag index. */ 5141 j = i - sizeof(float64); 5142 i -= 2 * sizeof(float64); 5143 5144 e0 = *(float64 *)(vn + H1_2(i)); 5145 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; 5146 e2 = *(float64 *)(vn + H1_2(j)); 5147 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; 5148 5149 if (likely((pg >> (i & 63)) & 1)) { 5150 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); 5151 } 5152 if (likely((pg >> (j & 63)) & 1)) { 5153 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); 5154 } 5155 } while (i & 63); 5156 } while (i != 0); 5157} 5158 5159/* 5160 * FP Complex Multiply 5161 */ 5162 5163void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5164 void *vg, void *status, uint32_t desc) 5165{ 5166 intptr_t j, i = simd_oprsz(desc); 5167 unsigned rot = simd_data(desc); 5168 bool flip = rot & 1; 5169 float16 neg_imag, neg_real; 5170 uint64_t *g = vg; 5171 5172 neg_imag = float16_set_sign(0, (rot & 2) != 0); 5173 neg_real = float16_set_sign(0, rot == 1 || rot == 2); 5174 5175 do { 5176 uint64_t pg = g[(i - 1) >> 6]; 5177 do { 5178 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5179 5180 /* I holds the real index; J holds the imag index. */ 5181 j = i - sizeof(float16); 5182 i -= 2 * sizeof(float16); 5183 5184 nr = *(float16 *)(vn + H1_2(i)); 5185 ni = *(float16 *)(vn + H1_2(j)); 5186 mr = *(float16 *)(vm + H1_2(i)); 5187 mi = *(float16 *)(vm + H1_2(j)); 5188 5189 e2 = (flip ? ni : nr); 5190 e1 = (flip ? mi : mr) ^ neg_real; 5191 e4 = e2; 5192 e3 = (flip ? 
mr : mi) ^ neg_imag; 5193 5194 if (likely((pg >> (i & 63)) & 1)) { 5195 d = *(float16 *)(va + H1_2(i)); 5196 d = float16_muladd(e2, e1, d, 0, status); 5197 *(float16 *)(vd + H1_2(i)) = d; 5198 } 5199 if (likely((pg >> (j & 63)) & 1)) { 5200 d = *(float16 *)(va + H1_2(j)); 5201 d = float16_muladd(e4, e3, d, 0, status); 5202 *(float16 *)(vd + H1_2(j)) = d; 5203 } 5204 } while (i & 63); 5205 } while (i != 0); 5206} 5207 5208void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5209 void *vg, void *status, uint32_t desc) 5210{ 5211 intptr_t j, i = simd_oprsz(desc); 5212 unsigned rot = simd_data(desc); 5213 bool flip = rot & 1; 5214 float32 neg_imag, neg_real; 5215 uint64_t *g = vg; 5216 5217 neg_imag = float32_set_sign(0, (rot & 2) != 0); 5218 neg_real = float32_set_sign(0, rot == 1 || rot == 2); 5219 5220 do { 5221 uint64_t pg = g[(i - 1) >> 6]; 5222 do { 5223 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5224 5225 /* I holds the real index; J holds the imag index. */ 5226 j = i - sizeof(float32); 5227 i -= 2 * sizeof(float32); 5228 5229 nr = *(float32 *)(vn + H1_2(i)); 5230 ni = *(float32 *)(vn + H1_2(j)); 5231 mr = *(float32 *)(vm + H1_2(i)); 5232 mi = *(float32 *)(vm + H1_2(j)); 5233 5234 e2 = (flip ? ni : nr); 5235 e1 = (flip ? mi : mr) ^ neg_real; 5236 e4 = e2; 5237 e3 = (flip ? mr : mi) ^ neg_imag; 5238 5239 if (likely((pg >> (i & 63)) & 1)) { 5240 d = *(float32 *)(va + H1_2(i)); 5241 d = float32_muladd(e2, e1, d, 0, status); 5242 *(float32 *)(vd + H1_2(i)) = d; 5243 } 5244 if (likely((pg >> (j & 63)) & 1)) { 5245 d = *(float32 *)(va + H1_2(j)); 5246 d = float32_muladd(e4, e3, d, 0, status); 5247 *(float32 *)(vd + H1_2(j)) = d; 5248 } 5249 } while (i & 63); 5250 } while (i != 0); 5251} 5252 5253void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5254 void *vg, void *status, uint32_t desc) 5255{ 5256 intptr_t j, i = simd_oprsz(desc); 5257 unsigned rot = simd_data(desc); 5258 bool flip = rot & 1; 5259 float64 neg_imag, neg_real; 5260 uint64_t *g = vg; 5261 5262 neg_imag = float64_set_sign(0, (rot & 2) != 0); 5263 neg_real = float64_set_sign(0, rot == 1 || rot == 2); 5264 5265 do { 5266 uint64_t pg = g[(i - 1) >> 6]; 5267 do { 5268 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5269 5270 /* I holds the real index; J holds the imag index. */ 5271 j = i - sizeof(float64); 5272 i -= 2 * sizeof(float64); 5273 5274 nr = *(float64 *)(vn + H1_2(i)); 5275 ni = *(float64 *)(vn + H1_2(j)); 5276 mr = *(float64 *)(vm + H1_2(i)); 5277 mi = *(float64 *)(vm + H1_2(j)); 5278 5279 e2 = (flip ? ni : nr); 5280 e1 = (flip ? mi : mr) ^ neg_real; 5281 e4 = e2; 5282 e3 = (flip ? mr : mi) ^ neg_imag; 5283 5284 if (likely((pg >> (i & 63)) & 1)) { 5285 d = *(float64 *)(va + H1_2(i)); 5286 d = float64_muladd(e2, e1, d, 0, status); 5287 *(float64 *)(vd + H1_2(i)) = d; 5288 } 5289 if (likely((pg >> (j & 63)) & 1)) { 5290 d = *(float64 *)(va + H1_2(j)); 5291 d = float64_muladd(e4, e3, d, 0, status); 5292 *(float64 *)(vd + H1_2(j)) = d; 5293 } 5294 } while (i & 63); 5295 } while (i != 0); 5296} 5297 5298/* 5299 * Load contiguous data, protected by a governing predicate. 5300 */ 5301 5302/* 5303 * Load one element into @vd + @reg_off from @host. 5304 * The controlling predicate is known to be true. 5305 */ 5306typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host); 5307 5308/* 5309 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra). 5310 * The controlling predicate is known to be true. 
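 *
 * @ra is the host return address of the calling helper; the
 * cpu_*_data_ra() accessors use it to unwind correctly if the access
 * faults, so the exception is attributed to the guest instruction.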
5311 */ 5312typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off, 5313 target_ulong vaddr, uintptr_t retaddr); 5314 5315/* 5316 * Generate the above primitives. 5317 */ 5318 5319#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \ 5320static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \ 5321{ \ 5322 TYPEM val = HOST(host); \ 5323 *(TYPEE *)(vd + H(reg_off)) = val; \ 5324} 5325 5326#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \ 5327static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \ 5328{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); } 5329 5330#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \ 5331static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \ 5332 target_ulong addr, uintptr_t ra) \ 5333{ \ 5334 *(TYPEE *)(vd + H(reg_off)) = \ 5335 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \ 5336} 5337 5338#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \ 5339static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \ 5340 target_ulong addr, uintptr_t ra) \ 5341{ \ 5342 TLB(env, useronly_clean_ptr(addr), \ 5343 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \ 5344} 5345 5346#define DO_LD_PRIM_1(NAME, H, TE, TM) \ 5347 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \ 5348 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra) 5349 5350DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t) 5351DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) 5352DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t) 5353DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t) 5354DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t) 5355DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t) 5356DO_LD_PRIM_1(ld1bds, H1_8, uint64_t, int8_t) 5357 5358#define DO_ST_PRIM_1(NAME, H, TE, TM) \ 5359 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \ 5360 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra) 5361 5362DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t) 5363DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t) 5364DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t) 5365DO_ST_PRIM_1(bd, H1_8, uint64_t, uint8_t) 5366 5367#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \ 5368 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \ 5369 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \ 5370 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \ 5371 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra) 5372 5373#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \ 5374 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \ 5375 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \ 5376 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \ 5377 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra) 5378 5379DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw) 5380DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw) 5381DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw) 5382DO_LD_PRIM_2(hdu, H1_8, uint64_t, uint16_t, lduw) 5383DO_LD_PRIM_2(hds, H1_8, uint64_t, int16_t, lduw) 5384 5385DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw) 5386DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw) 5387DO_ST_PRIM_2(hd, H1_8, uint64_t, uint16_t, stw) 5388 5389DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl) 5390DO_LD_PRIM_2(sdu, H1_8, uint64_t, uint32_t, ldl) 5391DO_LD_PRIM_2(sds, H1_8, uint64_t, int32_t, ldl) 5392 5393DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl) 5394DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl) 5395 5396DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq) 5397DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq) 5398 5399#undef DO_LD_TLB 5400#undef DO_ST_TLB 5401#undef DO_LD_HOST 5402#undef DO_LD_PRIM_1 5403#undef DO_ST_PRIM_1 5404#undef DO_LD_PRIM_2 5405#undef 
DO_ST_PRIM_2 5406 5407/* 5408 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5409 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5410 * element >= @reg_off, or @reg_max if there were no active elements at all. 5411 */ 5412static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5413 intptr_t reg_max, int esz) 5414{ 5415 uint64_t pg_mask = pred_esz_masks[esz]; 5416 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5417 5418 /* In normal usage, the first element is active. */ 5419 if (likely(pg & 1)) { 5420 return reg_off; 5421 } 5422 5423 if (pg == 0) { 5424 reg_off &= -64; 5425 do { 5426 reg_off += 64; 5427 if (unlikely(reg_off >= reg_max)) { 5428 /* The entire predicate was false. */ 5429 return reg_max; 5430 } 5431 pg = vg[reg_off >> 6] & pg_mask; 5432 } while (pg == 0); 5433 } 5434 reg_off += ctz64(pg); 5435 5436 /* We should never see an out of range predicate bit set. */ 5437 tcg_debug_assert(reg_off < reg_max); 5438 return reg_off; 5439} 5440 5441/* 5442 * Resolve the guest virtual address to info->host and info->flags. 5443 * If @nofault, return false if the page is invalid, otherwise 5444 * exit via page fault exception. 5445 */ 5446 5447typedef struct { 5448 void *host; 5449 int flags; 5450 MemTxAttrs attrs; 5451} SVEHostPage; 5452 5453static bool sve_probe_page(SVEHostPage *info, bool nofault, 5454 CPUARMState *env, target_ulong addr, 5455 int mem_off, MMUAccessType access_type, 5456 int mmu_idx, uintptr_t retaddr) 5457{ 5458 int flags; 5459 5460 addr += mem_off; 5461 5462 /* 5463 * User-only currently always issues with TBI. See the comment 5464 * above useronly_clean_ptr. Usually we clean this top byte away 5465 * during translation, but we can't do that for e.g. vector + imm 5466 * addressing modes. 5467 * 5468 * We currently always enable TBI for user-only, and do not provide 5469 * a way to turn it off. So clean the pointer unconditionally here, 5470 * rather than look it up here, or pass it down from above. 5471 */ 5472 addr = useronly_clean_ptr(addr); 5473 5474 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault, 5475 &info->host, retaddr); 5476 info->flags = flags; 5477 5478 if (flags & TLB_INVALID_MASK) { 5479 g_assert(nofault); 5480 return false; 5481 } 5482 5483 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5484 info->host -= mem_off; 5485 5486#ifdef CONFIG_USER_ONLY 5487 memset(&info->attrs, 0, sizeof(info->attrs)); 5488#else 5489 /* 5490 * Find the iotlbentry for addr and return the transaction attributes. 5491 * This *must* be present in the TLB because we just found the mapping. 5492 */ 5493 { 5494 uintptr_t index = tlb_index(env, mmu_idx, addr); 5495 5496# ifdef CONFIG_DEBUG_TCG 5497 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr); 5498 target_ulong comparator = (access_type == MMU_DATA_LOAD 5499 ? entry->addr_read 5500 : tlb_addr_write(entry)); 5501 g_assert(tlb_hit(comparator, addr)); 5502# endif 5503 5504 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index]; 5505 info->attrs = iotlbentry->attrs; 5506 } 5507#endif 5508 5509 return true; 5510} 5511 5512 5513/* 5514 * Analyse contiguous data, protected by a governing predicate. 5515 */ 5516 5517typedef enum { 5518 FAULT_NO, 5519 FAULT_FIRST, 5520 FAULT_ALL, 5521} SVEContFault; 5522 5523typedef struct { 5524 /* 5525 * First and last element wholly contained within the two pages. 5526 * mem_off_first[0] and reg_off_first[0] are always set >= 0. 
5527 * reg_off_last[0] may be < 0 if the first element crosses pages. 5528 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1] 5529 * are set >= 0 only if there are complete elements on a second page. 5530 * 5531 * The reg_off_* offsets are relative to the internal vector register. 5532 * The mem_off_first offset is relative to the memory address; the 5533 * two offsets are different when a load operation extends, a store 5534 * operation truncates, or for multi-register operations. 5535 */ 5536 int16_t mem_off_first[2]; 5537 int16_t reg_off_first[2]; 5538 int16_t reg_off_last[2]; 5539 5540 /* 5541 * One element that is misaligned and spans both pages, 5542 * or -1 if there is no such active element. 5543 */ 5544 int16_t mem_off_split; 5545 int16_t reg_off_split; 5546 5547 /* 5548 * The byte offset at which the entire operation crosses a page boundary. 5549 * Set >= 0 if and only if the entire operation spans two pages. 5550 */ 5551 int16_t page_split; 5552 5553 /* TLB data for the two pages. */ 5554 SVEHostPage page[2]; 5555} SVEContLdSt; 5556 5557/* 5558 * Find first active element on each page, and a loose bound for the 5559 * final element on each page. Identify any single element that spans 5560 * the page boundary. Return true if there are any active elements. 5561 */ 5562static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, 5563 uint64_t *vg, intptr_t reg_max, 5564 int esz, int msize) 5565{ 5566 const int esize = 1 << esz; 5567 const uint64_t pg_mask = pred_esz_masks[esz]; 5568 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5569 intptr_t mem_off_last, mem_off_split; 5570 intptr_t page_split, elt_split; 5571 intptr_t i; 5572 5573 /* Set all of the element indices to -1, and the TLB data to 0. */ 5574 memset(info, -1, offsetof(SVEContLdSt, page)); 5575 memset(info->page, 0, sizeof(info->page)); 5576 5577 /* Gross scan over the entire predicate to find bounds. */ 5578 i = 0; 5579 do { 5580 uint64_t pg = vg[i] & pg_mask; 5581 if (pg) { 5582 reg_off_last = i * 64 + 63 - clz64(pg); 5583 if (reg_off_first < 0) { 5584 reg_off_first = i * 64 + ctz64(pg); 5585 } 5586 } 5587 } while (++i * 64 < reg_max); 5588 5589 if (unlikely(reg_off_first < 0)) { 5590 /* No active elements, no pages touched. */ 5591 return false; 5592 } 5593 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5594 5595 info->reg_off_first[0] = reg_off_first; 5596 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5597 mem_off_last = (reg_off_last >> esz) * msize; 5598 5599 page_split = -(addr | TARGET_PAGE_MASK); 5600 if (likely(mem_off_last + msize <= page_split)) { 5601 /* The entire operation fits within a single page. */ 5602 info->reg_off_last[0] = reg_off_last; 5603 return true; 5604 } 5605 5606 info->page_split = page_split; 5607 elt_split = page_split / msize; 5608 reg_off_split = elt_split << esz; 5609 mem_off_split = elt_split * msize; 5610 5611 /* 5612 * This is the last full element on the first page, but it is not 5613 * necessarily active. If there is no full element, i.e. the first 5614 * active element is the one that's split, this value remains -1. 5615 * It is useful as iteration bounds. 5616 */ 5617 if (elt_split != 0) { 5618 info->reg_off_last[0] = reg_off_split - esize; 5619 } 5620 5621 /* Determine if an unaligned element spans the pages. */ 5622 if (page_split % msize != 0) { 5623 /* It is helpful to know if the split element is active. 
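 * For example, with 8-byte elements (msize == 8) and page_split == 20,
 * the elements at memory offsets 0 and 8 sit wholly on the first page,
 * while the element at offset 16 straddles the boundary (20 % 8 != 0);
 * reg_off_split/mem_off_split record that element below only when its
 * governing predicate bit is set.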
*/ 5624 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5625 info->reg_off_split = reg_off_split; 5626 info->mem_off_split = mem_off_split; 5627 5628 if (reg_off_split == reg_off_last) { 5629 /* The page crossing element is last. */ 5630 return true; 5631 } 5632 } 5633 reg_off_split += esize; 5634 mem_off_split += msize; 5635 } 5636 5637 /* 5638 * We do want the first active element on the second page, because 5639 * this may affect the address reported in an exception. 5640 */ 5641 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5642 tcg_debug_assert(reg_off_split <= reg_off_last); 5643 info->reg_off_first[1] = reg_off_split; 5644 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5645 info->reg_off_last[1] = reg_off_last; 5646 return true; 5647} 5648 5649/* 5650 * Resolve the guest virtual addresses to info->page[]. 5651 * Control the generation of page faults with @fault. Return false if 5652 * there is no work to do, which can only happen with @fault == FAULT_NO. 5653 */ 5654static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5655 CPUARMState *env, target_ulong addr, 5656 MMUAccessType access_type, uintptr_t retaddr) 5657{ 5658 int mmu_idx = cpu_mmu_index(env, false); 5659 int mem_off = info->mem_off_first[0]; 5660 bool nofault = fault == FAULT_NO; 5661 bool have_work = true; 5662 5663 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5664 access_type, mmu_idx, retaddr)) { 5665 /* No work to be done. */ 5666 return false; 5667 } 5668 5669 if (likely(info->page_split < 0)) { 5670 /* The entire operation was on the one page. */ 5671 return true; 5672 } 5673 5674 /* 5675 * If the second page is invalid, then we want the fault address to be 5676 * the first byte on that page which is accessed. 5677 */ 5678 if (info->mem_off_split >= 0) { 5679 /* 5680 * There is an element split across the pages. The fault address 5681 * should be the first byte of the second page. 5682 */ 5683 mem_off = info->page_split; 5684 /* 5685 * If the split element is also the first active element 5686 * of the vector, then: For first-fault we should continue 5687 * to generate faults for the second page. For no-fault, 5688 * we have work only if the second page is valid. 5689 */ 5690 if (info->mem_off_first[0] < info->mem_off_split) { 5691 nofault = FAULT_FIRST; 5692 have_work = false; 5693 } 5694 } else { 5695 /* 5696 * There is no element split across the pages. The fault address 5697 * should be the first active element on the second page. 5698 */ 5699 mem_off = info->mem_off_first[1]; 5700 /* 5701 * There must have been one active element on the first page, 5702 * so we're out of first-fault territory. 5703 */ 5704 nofault = fault != FAULT_ALL; 5705 } 5706 5707 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5708 access_type, mmu_idx, retaddr); 5709 return have_work; 5710} 5711 5712static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5713 uint64_t *vg, target_ulong addr, 5714 int esize, int msize, int wp_access, 5715 uintptr_t retaddr) 5716{ 5717#ifndef CONFIG_USER_ONLY 5718 intptr_t mem_off, reg_off, reg_last; 5719 int flags0 = info->page[0].flags; 5720 int flags1 = info->page[1].flags; 5721 5722 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5723 return; 5724 } 5725 5726 /* Indicate that watchpoints are handled. 
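 * Clearing TLB_WATCHPOINT here means the callers' later tests of
 * page[n].flags (e.g. the MMIO check in sve_ldN_r and sve_stN_r) do
 * not fall back to the slow per-element path merely because a
 * watchpoint exists; any watchpoint work has been completed here.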
 */
5727    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5728    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5729
5730    if (flags0 & TLB_WATCHPOINT) {
5731        mem_off = info->mem_off_first[0];
5732        reg_off = info->reg_off_first[0];
5733        reg_last = info->reg_off_last[0];
5734
5735        while (reg_off <= reg_last) {
5736            uint64_t pg = vg[reg_off >> 6];
5737            do {
5738                if ((pg >> (reg_off & 63)) & 1) {
5739                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5740                                         msize, info->page[0].attrs,
5741                                         wp_access, retaddr);
5742                }
5743                reg_off += esize;
5744                mem_off += msize;
5745            } while (reg_off <= reg_last && (reg_off & 63));
5746        }
5747    }
5748
5749    mem_off = info->mem_off_split;
5750    if (mem_off >= 0) {
5751        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5752                             info->page[0].attrs, wp_access, retaddr);
5753    }
5754
5755    mem_off = info->mem_off_first[1];
5756    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5757        reg_off = info->reg_off_first[1];
5758        reg_last = info->reg_off_last[1];
5759
5760        do {
5761            uint64_t pg = vg[reg_off >> 6];
5762            do {
5763                if ((pg >> (reg_off & 63)) & 1) {
5764                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5765                                         msize, info->page[1].attrs,
5766                                         wp_access, retaddr);
5767                }
5768                reg_off += esize;
5769                mem_off += msize;
5770            } while (reg_off & 63);
5771        } while (reg_off <= reg_last);
5772    }
5773#endif
5774}
5775
5776static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5777                                    uint64_t *vg, target_ulong addr, int esize,
5778                                    int msize, uint32_t mtedesc, uintptr_t ra)
5779{
5780    intptr_t mem_off, reg_off, reg_last;
5781
5782    /* Process the page only if MemAttr == Tagged. */
5783    if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5784        mem_off = info->mem_off_first[0];
5785        reg_off = info->reg_off_first[0];
5786        reg_last = info->reg_off_split;
5787        if (reg_last < 0) {
5788            reg_last = info->reg_off_last[0];
5789        }
5790
5791        do {
5792            uint64_t pg = vg[reg_off >> 6];
5793            do {
5794                if ((pg >> (reg_off & 63)) & 1) {
5795                    mte_check(env, mtedesc, addr + mem_off, ra);
5796                }
5797                reg_off += esize;
5798                mem_off += msize;
5799            } while (reg_off <= reg_last && (reg_off & 63));
5800        } while (reg_off <= reg_last);
5801    }
5802
5803    mem_off = info->mem_off_first[1];
5804    if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5805        reg_off = info->reg_off_first[1];
5806        reg_last = info->reg_off_last[1];
5807
5808        do {
5809            uint64_t pg = vg[reg_off >> 6];
5810            do {
5811                if ((pg >> (reg_off & 63)) & 1) {
5812                    mte_check(env, mtedesc, addr + mem_off, ra);
5813                }
5814                reg_off += esize;
5815                mem_off += msize;
5816            } while (reg_off & 63);
5817        } while (reg_off <= reg_last);
5818    }
5819}
5820
5821/*
5822 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5823 */
5824static inline QEMU_ALWAYS_INLINE
5825void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5826               uint32_t desc, const uintptr_t retaddr,
5827               const int esz, const int msz, const int N, uint32_t mtedesc,
5828               sve_ldst1_host_fn *host_fn,
5829               sve_ldst1_tlb_fn *tlb_fn)
5830{
5831    const unsigned rd = simd_data(desc);
5832    const intptr_t reg_max = simd_oprsz(desc);
5833    intptr_t reg_off, reg_last, mem_off;
5834    SVEContLdSt info;
5835    void *host;
5836    int flags, i;
5837
5838    /* Find the active elements. */
5839    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5840        /* The entire predicate was false; no load occurs. */
5841        for (i = 0; i < N; ++i) {
5842            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5843        }
5844        return;
5845    }
5846
5847    /* Probe the page(s).
Exit with exception for any invalid page. */ 5848 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5849 5850 /* Handle watchpoints for all active elements. */ 5851 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5852 BP_MEM_READ, retaddr); 5853 5854 /* 5855 * Handle mte checks for all active elements. 5856 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5857 */ 5858 if (mtedesc) { 5859 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5860 mtedesc, retaddr); 5861 } 5862 5863 flags = info.page[0].flags | info.page[1].flags; 5864 if (unlikely(flags != 0)) { 5865#ifdef CONFIG_USER_ONLY 5866 g_assert_not_reached(); 5867#else 5868 /* 5869 * At least one page includes MMIO. 5870 * Any bus operation can fail with cpu_transaction_failed, 5871 * which for ARM will raise SyncExternal. Perform the load 5872 * into scratch memory to preserve register state until the end. 5873 */ 5874 ARMVectorReg scratch[4] = { }; 5875 5876 mem_off = info.mem_off_first[0]; 5877 reg_off = info.reg_off_first[0]; 5878 reg_last = info.reg_off_last[1]; 5879 if (reg_last < 0) { 5880 reg_last = info.reg_off_split; 5881 if (reg_last < 0) { 5882 reg_last = info.reg_off_last[0]; 5883 } 5884 } 5885 5886 do { 5887 uint64_t pg = vg[reg_off >> 6]; 5888 do { 5889 if ((pg >> (reg_off & 63)) & 1) { 5890 for (i = 0; i < N; ++i) { 5891 tlb_fn(env, &scratch[i], reg_off, 5892 addr + mem_off + (i << msz), retaddr); 5893 } 5894 } 5895 reg_off += 1 << esz; 5896 mem_off += N << msz; 5897 } while (reg_off & 63); 5898 } while (reg_off <= reg_last); 5899 5900 for (i = 0; i < N; ++i) { 5901 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5902 } 5903 return; 5904#endif 5905 } 5906 5907 /* The entire operation is in RAM, on valid pages. */ 5908 5909 for (i = 0; i < N; ++i) { 5910 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5911 } 5912 5913 mem_off = info.mem_off_first[0]; 5914 reg_off = info.reg_off_first[0]; 5915 reg_last = info.reg_off_last[0]; 5916 host = info.page[0].host; 5917 5918 while (reg_off <= reg_last) { 5919 uint64_t pg = vg[reg_off >> 6]; 5920 do { 5921 if ((pg >> (reg_off & 63)) & 1) { 5922 for (i = 0; i < N; ++i) { 5923 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5924 host + mem_off + (i << msz)); 5925 } 5926 } 5927 reg_off += 1 << esz; 5928 mem_off += N << msz; 5929 } while (reg_off <= reg_last && (reg_off & 63)); 5930 } 5931 5932 /* 5933 * Use the slow path to manage the cross-page misalignment. 5934 * But we know this is RAM and cannot trap. 
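 * (Both pages were already probed above with FAULT_ALL, so the
 * per-element tlb_fn call below cannot take a page fault; it is used
 * only because the element's bytes live partly on each page and so
 * cannot be copied through a single host pointer.)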
5935 */ 5936 mem_off = info.mem_off_split; 5937 if (unlikely(mem_off >= 0)) { 5938 reg_off = info.reg_off_split; 5939 for (i = 0; i < N; ++i) { 5940 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5941 addr + mem_off + (i << msz), retaddr); 5942 } 5943 } 5944 5945 mem_off = info.mem_off_first[1]; 5946 if (unlikely(mem_off >= 0)) { 5947 reg_off = info.reg_off_first[1]; 5948 reg_last = info.reg_off_last[1]; 5949 host = info.page[1].host; 5950 5951 do { 5952 uint64_t pg = vg[reg_off >> 6]; 5953 do { 5954 if ((pg >> (reg_off & 63)) & 1) { 5955 for (i = 0; i < N; ++i) { 5956 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5957 host + mem_off + (i << msz)); 5958 } 5959 } 5960 reg_off += 1 << esz; 5961 mem_off += N << msz; 5962 } while (reg_off & 63); 5963 } while (reg_off <= reg_last); 5964 } 5965} 5966 5967static inline QEMU_ALWAYS_INLINE 5968void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 5969 uint32_t desc, const uintptr_t ra, 5970 const int esz, const int msz, const int N, 5971 sve_ldst1_host_fn *host_fn, 5972 sve_ldst1_tlb_fn *tlb_fn) 5973{ 5974 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5975 int bit55 = extract64(addr, 55, 1); 5976 5977 /* Remove mtedesc from the normal sve descriptor. */ 5978 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5979 5980 /* Perform gross MTE suppression early. */ 5981 if (!tbi_check(desc, bit55) || 5982 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 5983 mtedesc = 0; 5984 } 5985 5986 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 5987} 5988 5989#define DO_LD1_1(NAME, ESZ) \ 5990void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 5991 target_ulong addr, uint32_t desc) \ 5992{ \ 5993 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 5994 sve_##NAME##_host, sve_##NAME##_tlb); \ 5995} \ 5996void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 5997 target_ulong addr, uint32_t desc) \ 5998{ \ 5999 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 6000 sve_##NAME##_host, sve_##NAME##_tlb); \ 6001} 6002 6003#define DO_LD1_2(NAME, ESZ, MSZ) \ 6004void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 6005 target_ulong addr, uint32_t desc) \ 6006{ \ 6007 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6008 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6009} \ 6010void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 6011 target_ulong addr, uint32_t desc) \ 6012{ \ 6013 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6014 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6015} \ 6016void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6017 target_ulong addr, uint32_t desc) \ 6018{ \ 6019 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6020 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6021} \ 6022void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6023 target_ulong addr, uint32_t desc) \ 6024{ \ 6025 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6026 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6027} 6028 6029DO_LD1_1(ld1bb, MO_8) 6030DO_LD1_1(ld1bhu, MO_16) 6031DO_LD1_1(ld1bhs, MO_16) 6032DO_LD1_1(ld1bsu, MO_32) 6033DO_LD1_1(ld1bss, MO_32) 6034DO_LD1_1(ld1bdu, MO_64) 6035DO_LD1_1(ld1bds, MO_64) 6036 6037DO_LD1_2(ld1hh, MO_16, MO_16) 6038DO_LD1_2(ld1hsu, MO_32, MO_16) 6039DO_LD1_2(ld1hss, MO_32, MO_16) 6040DO_LD1_2(ld1hdu, MO_64, MO_16) 6041DO_LD1_2(ld1hds, MO_64, MO_16) 6042 6043DO_LD1_2(ld1ss, MO_32, MO_32) 6044DO_LD1_2(ld1sdu, MO_64, MO_32) 
6045DO_LD1_2(ld1sds, MO_64, MO_32) 6046 6047DO_LD1_2(ld1dd, MO_64, MO_64) 6048 6049#undef DO_LD1_1 6050#undef DO_LD1_2 6051 6052#define DO_LDN_1(N) \ 6053void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 6054 target_ulong addr, uint32_t desc) \ 6055{ \ 6056 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 6057 sve_ld1bb_host, sve_ld1bb_tlb); \ 6058} \ 6059void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 6060 target_ulong addr, uint32_t desc) \ 6061{ \ 6062 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 6063 sve_ld1bb_host, sve_ld1bb_tlb); \ 6064} 6065 6066#define DO_LDN_2(N, SUFF, ESZ) \ 6067void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 6068 target_ulong addr, uint32_t desc) \ 6069{ \ 6070 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6071 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6072} \ 6073void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 6074 target_ulong addr, uint32_t desc) \ 6075{ \ 6076 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6077 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6078} \ 6079void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 6080 target_ulong addr, uint32_t desc) \ 6081{ \ 6082 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6083 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6084} \ 6085void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 6086 target_ulong addr, uint32_t desc) \ 6087{ \ 6088 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6089 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6090} 6091 6092DO_LDN_1(2) 6093DO_LDN_1(3) 6094DO_LDN_1(4) 6095 6096DO_LDN_2(2, hh, MO_16) 6097DO_LDN_2(3, hh, MO_16) 6098DO_LDN_2(4, hh, MO_16) 6099 6100DO_LDN_2(2, ss, MO_32) 6101DO_LDN_2(3, ss, MO_32) 6102DO_LDN_2(4, ss, MO_32) 6103 6104DO_LDN_2(2, dd, MO_64) 6105DO_LDN_2(3, dd, MO_64) 6106DO_LDN_2(4, dd, MO_64) 6107 6108#undef DO_LDN_1 6109#undef DO_LDN_2 6110 6111/* 6112 * Load contiguous data, first-fault and no-fault. 6113 * 6114 * For user-only, one could argue that we should hold the mmap_lock during 6115 * the operation so that there is no race between page_check_range and the 6116 * load operation. However, unmapping pages out from under a running thread 6117 * is extraordinarily unlikely. This theoretical race condition also affects 6118 * linux-user/ in its get_user/put_user macros. 6119 * 6120 * TODO: Construct some helpers, written in assembly, that interact with 6121 * handle_cpu_signal to produce memory ops which can properly report errors 6122 * without racing. 6123 */ 6124 6125/* Fault on byte I. All bits in FFR from I are cleared. The vector 6126 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 6127 * option, which leaves subsequent data unchanged. 6128 */ 6129static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 6130{ 6131 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 6132 6133 if (i & 63) { 6134 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 6135 i = ROUND_UP(i, 64); 6136 } 6137 for (; i < oprsz; i += 64) { 6138 ffr[i / 64] = 0; 6139 } 6140} 6141 6142/* 6143 * Common helper for all contiguous no-fault and first-fault loads. 
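 *
 * In outline (a simplified sketch that ignores MTE, watchpoints and the
 * cross-page special cases handled below):
 *
 *     reg_off = first active element;
 *     load it -- trapping normally for FAULT_FIRST, silently giving up
 *         via record_fault for FAULT_NO if the page is not valid;
 *     for each later active element:
 *         if its memory is not plain readable RAM:
 *             record_fault(env, reg_off, reg_max);  // clears FFR from here
 *             return;
 *         load it via the _host primitive;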
6144 */
6145static inline QEMU_ALWAYS_INLINE
6146void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6147                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6148                   const int esz, const int msz, const SVEContFault fault,
6149                   sve_ldst1_host_fn *host_fn,
6150                   sve_ldst1_tlb_fn *tlb_fn)
6151{
6152    const unsigned rd = simd_data(desc);
6153    void *vd = &env->vfp.zregs[rd];
6154    const intptr_t reg_max = simd_oprsz(desc);
6155    intptr_t reg_off, mem_off, reg_last;
6156    SVEContLdSt info;
6157    int flags;
6158    void *host;
6159
6160    /* Find the active elements. */
6161    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6162        /* The entire predicate was false; no load occurs. */
6163        memset(vd, 0, reg_max);
6164        return;
6165    }
6166    reg_off = info.reg_off_first[0];
6167
6168    /* Probe the page(s). */
6169    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6170        /* Fault on first element. */
6171        tcg_debug_assert(fault == FAULT_NO);
6172        memset(vd, 0, reg_max);
6173        goto do_fault;
6174    }
6175
6176    mem_off = info.mem_off_first[0];
6177    flags = info.page[0].flags;
6178
6179    /*
6180     * Disable MTE checking if the Tagged bit is not set. Since TBI must
6181     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6182     */
6183    if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
6184        mtedesc = 0;
6185    }
6186
6187    if (fault == FAULT_FIRST) {
6188        /* Trapping mte check for the first-fault element. */
6189        if (mtedesc) {
6190            mte_check(env, mtedesc, addr + mem_off, retaddr);
6191        }
6192
6193        /*
6194         * Special handling of the first active element,
6195         * if it crosses a page boundary or is MMIO.
6196         */
6197        bool is_split = mem_off == info.mem_off_split;
6198        if (unlikely(flags != 0) || unlikely(is_split)) {
6199            /*
6200             * Use the slow path for cross-page handling.
6201             * Might trap for MMIO or watchpoints.
6202             */
6203            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6204
6205            /* After any fault, zero the other elements. */
6206            swap_memzero(vd, reg_off);
6207            reg_off += 1 << esz;
6208            mem_off += 1 << msz;
6209            swap_memzero(vd + reg_off, reg_max - reg_off);
6210
6211            if (is_split) {
6212                goto second_page;
6213            }
6214        } else {
6215            memset(vd, 0, reg_max);
6216        }
6217    } else {
6218        memset(vd, 0, reg_max);
6219        if (unlikely(mem_off == info.mem_off_split)) {
6220            /* The first active element crosses a page boundary. */
6221            flags |= info.page[1].flags;
6222            if (unlikely(flags & TLB_MMIO)) {
6223                /* Some page is MMIO, see below. */
6224                goto do_fault;
6225            }
6226            if (unlikely(flags & TLB_WATCHPOINT) &&
6227                (cpu_watchpoint_address_matches
6228                 (env_cpu(env), addr + mem_off, 1 << msz)
6229                 & BP_MEM_READ)) {
6230                /* Watchpoint hit, see below. */
6231                goto do_fault;
6232            }
6233            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6234                goto do_fault;
6235            }
6236            /*
6237             * Use the slow path for cross-page handling.
6238             * This is RAM, without a watchpoint, and will not trap.
6239             */
6240            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6241            goto second_page;
6242        }
6243    }
6244
6245    /*
6246     * From this point on, all memory operations are MemSingleNF.
6247     *
6248     * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6249     * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6250     *
6251     * Unfortunately we do not have access to the memory attributes from the
6252     * PTE to tell Device memory from Normal memory. So we make a mostly
6253     * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6254 * This gives the right answer for the common cases of "Normal memory, 6255 * backed by host RAM" and "Device memory, backed by MMIO". 6256 * The architecture allows us to suppress an NF load and return 6257 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6258 * case of "Normal memory, backed by MMIO" is permitted. The case we 6259 * get wrong is "Device memory, backed by host RAM", for which we 6260 * should return (UNKNOWN, FAULT) for but do not. 6261 * 6262 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6263 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6264 * architectural breakpoints the same. 6265 */ 6266 if (unlikely(flags & TLB_MMIO)) { 6267 goto do_fault; 6268 } 6269 6270 reg_last = info.reg_off_last[0]; 6271 host = info.page[0].host; 6272 6273 do { 6274 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6275 do { 6276 if ((pg >> (reg_off & 63)) & 1) { 6277 if (unlikely(flags & TLB_WATCHPOINT) && 6278 (cpu_watchpoint_address_matches 6279 (env_cpu(env), addr + mem_off, 1 << msz) 6280 & BP_MEM_READ)) { 6281 goto do_fault; 6282 } 6283 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6284 goto do_fault; 6285 } 6286 host_fn(vd, reg_off, host + mem_off); 6287 } 6288 reg_off += 1 << esz; 6289 mem_off += 1 << msz; 6290 } while (reg_off <= reg_last && (reg_off & 63)); 6291 } while (reg_off <= reg_last); 6292 6293 /* 6294 * MemSingleNF is allowed to fail for any reason. We have special 6295 * code above to handle the first element crossing a page boundary. 6296 * As an implementation choice, decline to handle a cross-page element 6297 * in any other position. 6298 */ 6299 reg_off = info.reg_off_split; 6300 if (reg_off >= 0) { 6301 goto do_fault; 6302 } 6303 6304 second_page: 6305 reg_off = info.reg_off_first[1]; 6306 if (likely(reg_off < 0)) { 6307 /* No active elements on the second page. All done. */ 6308 return; 6309 } 6310 6311 /* 6312 * MemSingleNF is allowed to fail for any reason. As an implementation 6313 * choice, decline to handle elements on the second page. This should 6314 * be low frequency as the guest walks through memory -- the next 6315 * iteration of the guest's loop should be aligned on the page boundary, 6316 * and then all following iterations will stay aligned. 6317 */ 6318 6319 do_fault: 6320 record_fault(env, reg_off, reg_max); 6321} 6322 6323static inline QEMU_ALWAYS_INLINE 6324void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6325 uint32_t desc, const uintptr_t retaddr, 6326 const int esz, const int msz, const SVEContFault fault, 6327 sve_ldst1_host_fn *host_fn, 6328 sve_ldst1_tlb_fn *tlb_fn) 6329{ 6330 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6331 int bit55 = extract64(addr, 55, 1); 6332 6333 /* Remove mtedesc from the normal sve descriptor. */ 6334 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6335 6336 /* Perform gross MTE suppression early. 
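 * If TBI is not enabled for this half of the address space, or the
 * logical tag of @addr falls in the TCMA always-unchecked range, then
 * no tag check could ever fire for this access, so mtedesc is cleared
 * and all of the per-element MTE checks below are skipped.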
*/ 6337 if (!tbi_check(desc, bit55) || 6338 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6339 mtedesc = 0; 6340 } 6341 6342 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6343 esz, msz, fault, host_fn, tlb_fn); 6344} 6345 6346#define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6347void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6348 target_ulong addr, uint32_t desc) \ 6349{ \ 6350 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6351 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6352} \ 6353void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6354 target_ulong addr, uint32_t desc) \ 6355{ \ 6356 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6357 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6358} \ 6359void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6360 target_ulong addr, uint32_t desc) \ 6361{ \ 6362 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6363 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6364} \ 6365void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6366 target_ulong addr, uint32_t desc) \ 6367{ \ 6368 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6369 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6370} 6371 6372#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6373void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6374 target_ulong addr, uint32_t desc) \ 6375{ \ 6376 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6377 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6378} \ 6379void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6380 target_ulong addr, uint32_t desc) \ 6381{ \ 6382 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6383 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6384} \ 6385void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6386 target_ulong addr, uint32_t desc) \ 6387{ \ 6388 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6389 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6390} \ 6391void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6392 target_ulong addr, uint32_t desc) \ 6393{ \ 6394 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6395 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6396} \ 6397void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6398 target_ulong addr, uint32_t desc) \ 6399{ \ 6400 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6401 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6402} \ 6403void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6404 target_ulong addr, uint32_t desc) \ 6405{ \ 6406 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6407 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6408} \ 6409void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6410 target_ulong addr, uint32_t desc) \ 6411{ \ 6412 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6413 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6414} \ 6415void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6416 target_ulong addr, uint32_t desc) \ 6417{ \ 6418 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6419 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6420} 6421 6422DO_LDFF1_LDNF1_1(bb, MO_8) 6423DO_LDFF1_LDNF1_1(bhu, MO_16) 6424DO_LDFF1_LDNF1_1(bhs, MO_16) 
6425DO_LDFF1_LDNF1_1(bsu, MO_32) 6426DO_LDFF1_LDNF1_1(bss, MO_32) 6427DO_LDFF1_LDNF1_1(bdu, MO_64) 6428DO_LDFF1_LDNF1_1(bds, MO_64) 6429 6430DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6431DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6432DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6433DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6434DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6435 6436DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6437DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6438DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6439 6440DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6441 6442#undef DO_LDFF1_LDNF1_1 6443#undef DO_LDFF1_LDNF1_2 6444 6445/* 6446 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6447 */ 6448 6449static inline QEMU_ALWAYS_INLINE 6450void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6451 uint32_t desc, const uintptr_t retaddr, 6452 const int esz, const int msz, const int N, uint32_t mtedesc, 6453 sve_ldst1_host_fn *host_fn, 6454 sve_ldst1_tlb_fn *tlb_fn) 6455{ 6456 const unsigned rd = simd_data(desc); 6457 const intptr_t reg_max = simd_oprsz(desc); 6458 intptr_t reg_off, reg_last, mem_off; 6459 SVEContLdSt info; 6460 void *host; 6461 int i, flags; 6462 6463 /* Find the active elements. */ 6464 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6465 /* The entire predicate was false; no store occurs. */ 6466 return; 6467 } 6468 6469 /* Probe the page(s). Exit with exception for any invalid page. */ 6470 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6471 6472 /* Handle watchpoints for all active elements. */ 6473 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6474 BP_MEM_WRITE, retaddr); 6475 6476 /* 6477 * Handle mte checks for all active elements. 6478 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6479 */ 6480 if (mtedesc) { 6481 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6482 mtedesc, retaddr); 6483 } 6484 6485 flags = info.page[0].flags | info.page[1].flags; 6486 if (unlikely(flags != 0)) { 6487#ifdef CONFIG_USER_ONLY 6488 g_assert_not_reached(); 6489#else 6490 /* 6491 * At least one page includes MMIO. 6492 * Any bus operation can fail with cpu_transaction_failed, 6493 * which for ARM will raise SyncExternal. We cannot avoid 6494 * this fault and will leave with the store incomplete. 
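 * (Contrast sve_ldN_r above, which loads into scratch registers so that
 * architectural state is untouched if the bus fault fires; a store has
 * no register state to protect, so a partial update of memory is the
 * best that can be done.)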
6495 */ 6496 mem_off = info.mem_off_first[0]; 6497 reg_off = info.reg_off_first[0]; 6498 reg_last = info.reg_off_last[1]; 6499 if (reg_last < 0) { 6500 reg_last = info.reg_off_split; 6501 if (reg_last < 0) { 6502 reg_last = info.reg_off_last[0]; 6503 } 6504 } 6505 6506 do { 6507 uint64_t pg = vg[reg_off >> 6]; 6508 do { 6509 if ((pg >> (reg_off & 63)) & 1) { 6510 for (i = 0; i < N; ++i) { 6511 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6512 addr + mem_off + (i << msz), retaddr); 6513 } 6514 } 6515 reg_off += 1 << esz; 6516 mem_off += N << msz; 6517 } while (reg_off & 63); 6518 } while (reg_off <= reg_last); 6519 return; 6520#endif 6521 } 6522 6523 mem_off = info.mem_off_first[0]; 6524 reg_off = info.reg_off_first[0]; 6525 reg_last = info.reg_off_last[0]; 6526 host = info.page[0].host; 6527 6528 while (reg_off <= reg_last) { 6529 uint64_t pg = vg[reg_off >> 6]; 6530 do { 6531 if ((pg >> (reg_off & 63)) & 1) { 6532 for (i = 0; i < N; ++i) { 6533 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6534 host + mem_off + (i << msz)); 6535 } 6536 } 6537 reg_off += 1 << esz; 6538 mem_off += N << msz; 6539 } while (reg_off <= reg_last && (reg_off & 63)); 6540 } 6541 6542 /* 6543 * Use the slow path to manage the cross-page misalignment. 6544 * But we know this is RAM and cannot trap. 6545 */ 6546 mem_off = info.mem_off_split; 6547 if (unlikely(mem_off >= 0)) { 6548 reg_off = info.reg_off_split; 6549 for (i = 0; i < N; ++i) { 6550 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6551 addr + mem_off + (i << msz), retaddr); 6552 } 6553 } 6554 6555 mem_off = info.mem_off_first[1]; 6556 if (unlikely(mem_off >= 0)) { 6557 reg_off = info.reg_off_first[1]; 6558 reg_last = info.reg_off_last[1]; 6559 host = info.page[1].host; 6560 6561 do { 6562 uint64_t pg = vg[reg_off >> 6]; 6563 do { 6564 if ((pg >> (reg_off & 63)) & 1) { 6565 for (i = 0; i < N; ++i) { 6566 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6567 host + mem_off + (i << msz)); 6568 } 6569 } 6570 reg_off += 1 << esz; 6571 mem_off += N << msz; 6572 } while (reg_off & 63); 6573 } while (reg_off <= reg_last); 6574 } 6575} 6576 6577static inline QEMU_ALWAYS_INLINE 6578void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6579 uint32_t desc, const uintptr_t ra, 6580 const int esz, const int msz, const int N, 6581 sve_ldst1_host_fn *host_fn, 6582 sve_ldst1_tlb_fn *tlb_fn) 6583{ 6584 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6585 int bit55 = extract64(addr, 55, 1); 6586 6587 /* Remove mtedesc from the normal sve descriptor. */ 6588 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6589 6590 /* Perform gross MTE suppression early. 
*/ 6591 if (!tbi_check(desc, bit55) || 6592 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { 6593 mtedesc = 0; 6594 } 6595 6596 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6597} 6598 6599#define DO_STN_1(N, NAME, ESZ) \ 6600void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6601 target_ulong addr, uint32_t desc) \ 6602{ \ 6603 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6604 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6605} \ 6606void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6607 target_ulong addr, uint32_t desc) \ 6608{ \ 6609 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6610 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6611} 6612 6613#define DO_STN_2(N, NAME, ESZ, MSZ) \ 6614void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6615 target_ulong addr, uint32_t desc) \ 6616{ \ 6617 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6618 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6619} \ 6620void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6621 target_ulong addr, uint32_t desc) \ 6622{ \ 6623 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6624 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6625} \ 6626void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6627 target_ulong addr, uint32_t desc) \ 6628{ \ 6629 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6630 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6631} \ 6632void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6633 target_ulong addr, uint32_t desc) \ 6634{ \ 6635 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6636 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6637} 6638 6639DO_STN_1(1, bb, MO_8) 6640DO_STN_1(1, bh, MO_16) 6641DO_STN_1(1, bs, MO_32) 6642DO_STN_1(1, bd, MO_64) 6643DO_STN_1(2, bb, MO_8) 6644DO_STN_1(3, bb, MO_8) 6645DO_STN_1(4, bb, MO_8) 6646 6647DO_STN_2(1, hh, MO_16, MO_16) 6648DO_STN_2(1, hs, MO_32, MO_16) 6649DO_STN_2(1, hd, MO_64, MO_16) 6650DO_STN_2(2, hh, MO_16, MO_16) 6651DO_STN_2(3, hh, MO_16, MO_16) 6652DO_STN_2(4, hh, MO_16, MO_16) 6653 6654DO_STN_2(1, ss, MO_32, MO_32) 6655DO_STN_2(1, sd, MO_64, MO_32) 6656DO_STN_2(2, ss, MO_32, MO_32) 6657DO_STN_2(3, ss, MO_32, MO_32) 6658DO_STN_2(4, ss, MO_32, MO_32) 6659 6660DO_STN_2(1, dd, MO_64, MO_64) 6661DO_STN_2(2, dd, MO_64, MO_64) 6662DO_STN_2(3, dd, MO_64, MO_64) 6663DO_STN_2(4, dd, MO_64, MO_64) 6664 6665#undef DO_STN_1 6666#undef DO_STN_2 6667 6668/* 6669 * Loads with a vector index. 6670 */ 6671 6672/* 6673 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
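 *
 * Five flavours follow: unsigned (off_zsu_s) and signed (off_zss_s)
 * 32-bit offsets taken from .S elements, the same pair taken from the
 * low half of .D elements (off_zsu_d, off_zss_d), and untruncated
 * 64-bit offsets (off_zd_d).  Each gather/scatter helper below forms
 * its element address as, roughly,
 *
 *     target_ulong addr = base + (off_fn(vm, reg_off) << scale);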
6674 */ 6675typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6676 6677static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6678{ 6679 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6680} 6681 6682static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6683{ 6684 return *(int32_t *)(reg + H1_4(reg_ofs)); 6685} 6686 6687static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6688{ 6689 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6690} 6691 6692static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6693{ 6694 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6695} 6696 6697static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6698{ 6699 return *(uint64_t *)(reg + reg_ofs); 6700} 6701 6702static inline QEMU_ALWAYS_INLINE 6703void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6704 target_ulong base, uint32_t desc, uintptr_t retaddr, 6705 uint32_t mtedesc, int esize, int msize, 6706 zreg_off_fn *off_fn, 6707 sve_ldst1_host_fn *host_fn, 6708 sve_ldst1_tlb_fn *tlb_fn) 6709{ 6710 const int mmu_idx = cpu_mmu_index(env, false); 6711 const intptr_t reg_max = simd_oprsz(desc); 6712 const int scale = simd_data(desc); 6713 ARMVectorReg scratch; 6714 intptr_t reg_off; 6715 SVEHostPage info, info2; 6716 6717 memset(&scratch, 0, reg_max); 6718 reg_off = 0; 6719 do { 6720 uint64_t pg = vg[reg_off >> 6]; 6721 do { 6722 if (likely(pg & 1)) { 6723 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6724 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6725 6726 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6727 mmu_idx, retaddr); 6728 6729 if (likely(in_page >= msize)) { 6730 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6731 cpu_check_watchpoint(env_cpu(env), addr, msize, 6732 info.attrs, BP_MEM_READ, retaddr); 6733 } 6734 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) { 6735 mte_check(env, mtedesc, addr, retaddr); 6736 } 6737 host_fn(&scratch, reg_off, info.host); 6738 } else { 6739 /* Element crosses the page boundary. */ 6740 sve_probe_page(&info2, false, env, addr + in_page, 0, 6741 MMU_DATA_LOAD, mmu_idx, retaddr); 6742 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6743 cpu_check_watchpoint(env_cpu(env), addr, 6744 msize, info.attrs, 6745 BP_MEM_READ, retaddr); 6746 } 6747 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) { 6748 mte_check(env, mtedesc, addr, retaddr); 6749 } 6750 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6751 } 6752 } 6753 reg_off += esize; 6754 pg >>= esize; 6755 } while (reg_off & 63); 6756 } while (reg_off < reg_max); 6757 6758 /* Wait until all exceptions have been raised to write back. */ 6759 memcpy(vd, &scratch, reg_max); 6760} 6761 6762static inline QEMU_ALWAYS_INLINE 6763void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6764 target_ulong base, uint32_t desc, uintptr_t retaddr, 6765 int esize, int msize, zreg_off_fn *off_fn, 6766 sve_ldst1_host_fn *host_fn, 6767 sve_ldst1_tlb_fn *tlb_fn) 6768{ 6769 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6770 /* Remove mtedesc from the normal sve descriptor. */ 6771 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6772 6773 /* 6774 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6775 * offset base entirely over the address space hole to change the 6776 * pointer tag, or change the bit55 selector. So we could here 6777 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6778 */ 6779 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6780 esize, msize, off_fn, host_fn, tlb_fn); 6781} 6782 6783#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6784void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6785 void *vm, target_ulong base, uint32_t desc) \ 6786{ \ 6787 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6788 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6789} \ 6790void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6791 void *vm, target_ulong base, uint32_t desc) \ 6792{ \ 6793 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6794 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6795} 6796 6797#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6798void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6799 void *vm, target_ulong base, uint32_t desc) \ 6800{ \ 6801 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6802 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6803} \ 6804void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6805 void *vm, target_ulong base, uint32_t desc) \ 6806{ \ 6807 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6808 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6809} 6810 6811DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6812DO_LD1_ZPZ_S(bsu, zss, MO_8) 6813DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6814DO_LD1_ZPZ_D(bdu, zss, MO_8) 6815DO_LD1_ZPZ_D(bdu, zd, MO_8) 6816 6817DO_LD1_ZPZ_S(bss, zsu, MO_8) 6818DO_LD1_ZPZ_S(bss, zss, MO_8) 6819DO_LD1_ZPZ_D(bds, zsu, MO_8) 6820DO_LD1_ZPZ_D(bds, zss, MO_8) 6821DO_LD1_ZPZ_D(bds, zd, MO_8) 6822 6823DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6824DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6825DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6826DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6827DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6828 6829DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6830DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6831DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6832DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6833DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6834 6835DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6836DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6837DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6838DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6839DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6840 6841DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6842DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6843DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6844DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6845DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6846 6847DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6848DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6849DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6850DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6851DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6852 6853DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6854DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6855DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6856DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6857DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6858 6859DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6860DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6861DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6862 6863DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6864DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6865DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6866 6867DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6868DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6869DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6870 6871DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6872DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6873DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6874 6875#undef DO_LD1_ZPZ_S 6876#undef DO_LD1_ZPZ_D 6877 6878/* First fault loads with a vector index. */ 6879 6880/* 6881 * Common helpers for all gather first-faulting loads. 
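 *
 * The shape of the operation (a simplified sketch) is:
 *
 *     load the first active element, letting any fault -- including a
 *         trapping MTE check -- be taken normally;
 *     for each later active element:
 *         probe its page with nofault set; if the page is invalid or
 *         MMIO, if a watchpoint or MTE probe would fire, or if the
 *         element itself crosses a page boundary, then
 *             record_fault(env, reg_off, reg_max) and stop;
 *         otherwise load it directly from host memory;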
6882 */ 6883 6884static inline QEMU_ALWAYS_INLINE 6885void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6886 target_ulong base, uint32_t desc, uintptr_t retaddr, 6887 uint32_t mtedesc, const int esz, const int msz, 6888 zreg_off_fn *off_fn, 6889 sve_ldst1_host_fn *host_fn, 6890 sve_ldst1_tlb_fn *tlb_fn) 6891{ 6892 const int mmu_idx = cpu_mmu_index(env, false); 6893 const intptr_t reg_max = simd_oprsz(desc); 6894 const int scale = simd_data(desc); 6895 const int esize = 1 << esz; 6896 const int msize = 1 << msz; 6897 intptr_t reg_off; 6898 SVEHostPage info; 6899 target_ulong addr, in_page; 6900 6901 /* Skip to the first true predicate. */ 6902 reg_off = find_next_active(vg, 0, reg_max, esz); 6903 if (unlikely(reg_off >= reg_max)) { 6904 /* The entire predicate was false; no load occurs. */ 6905 memset(vd, 0, reg_max); 6906 return; 6907 } 6908 6909 /* 6910 * Probe the first element, allowing faults. 6911 */ 6912 addr = base + (off_fn(vm, reg_off) << scale); 6913 if (mtedesc) { 6914 mte_check(env, mtedesc, addr, retaddr); 6915 } 6916 tlb_fn(env, vd, reg_off, addr, retaddr); 6917 6918 /* After any fault, zero the other elements. */ 6919 swap_memzero(vd, reg_off); 6920 reg_off += esize; 6921 swap_memzero(vd + reg_off, reg_max - reg_off); 6922 6923 /* 6924 * Probe the remaining elements, not allowing faults. 6925 */ 6926 while (reg_off < reg_max) { 6927 uint64_t pg = vg[reg_off >> 6]; 6928 do { 6929 if (likely((pg >> (reg_off & 63)) & 1)) { 6930 addr = base + (off_fn(vm, reg_off) << scale); 6931 in_page = -(addr | TARGET_PAGE_MASK); 6932 6933 if (unlikely(in_page < msize)) { 6934 /* Stop if the element crosses a page boundary. */ 6935 goto fault; 6936 } 6937 6938 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6939 mmu_idx, retaddr); 6940 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6941 goto fault; 6942 } 6943 if (unlikely(info.flags & TLB_WATCHPOINT) && 6944 (cpu_watchpoint_address_matches 6945 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 6946 goto fault; 6947 } 6948 if (mtedesc && 6949 arm_tlb_mte_tagged(&info.attrs) && 6950 !mte_probe(env, mtedesc, addr)) { 6951 goto fault; 6952 } 6953 6954 host_fn(vd, reg_off, info.host); 6955 } 6956 reg_off += esize; 6957 } while (reg_off & 63); 6958 } 6959 return; 6960 6961 fault: 6962 record_fault(env, reg_off, reg_max); 6963} 6964 6965static inline QEMU_ALWAYS_INLINE 6966void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6967 target_ulong base, uint32_t desc, uintptr_t retaddr, 6968 const int esz, const int msz, 6969 zreg_off_fn *off_fn, 6970 sve_ldst1_host_fn *host_fn, 6971 sve_ldst1_tlb_fn *tlb_fn) 6972{ 6973 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6974 /* Remove mtedesc from the normal sve descriptor. */ 6975 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6976 6977 /* 6978 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6979 * offset base entirely over the address space hole to change the 6980 * pointer tag, or change the bit55 selector. So we could here 6981 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6982 */ 6983 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6984 esz, msz, off_fn, host_fn, tlb_fn); 6985} 6986 6987#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 6988void HELPER(sve_ldff##MEM##_##OFS) \ 6989 (CPUARMState *env, void *vd, void *vg, \ 6990 void *vm, target_ulong base, uint32_t desc) \ 6991{ \ 6992 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 6993 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6994} \ 6995void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6996 (CPUARMState *env, void *vd, void *vg, \ 6997 void *vm, target_ulong base, uint32_t desc) \ 6998{ \ 6999 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 7000 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7001} 7002 7003#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 7004void HELPER(sve_ldff##MEM##_##OFS) \ 7005 (CPUARMState *env, void *vd, void *vg, \ 7006 void *vm, target_ulong base, uint32_t desc) \ 7007{ \ 7008 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 7009 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7010} \ 7011void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7012 (CPUARMState *env, void *vd, void *vg, \ 7013 void *vm, target_ulong base, uint32_t desc) \ 7014{ \ 7015 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 7016 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7017} 7018 7019DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 7020DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 7021DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 7022DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 7023DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 7024 7025DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 7026DO_LDFF1_ZPZ_S(bss, zss, MO_8) 7027DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 7028DO_LDFF1_ZPZ_D(bds, zss, MO_8) 7029DO_LDFF1_ZPZ_D(bds, zd, MO_8) 7030 7031DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 7032DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 7033DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 7034DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 7035DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 7036 7037DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 7038DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 7039DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 7040DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 7041DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 7042 7043DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 7044DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 7045DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 7046DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 7047DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 7048 7049DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 7050DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 7051DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 7052DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 7053DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 7054 7055DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 7056DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 7057DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 7058DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 7059DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 7060 7061DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 7062DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 7063DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 7064DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 7065DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 7066 7067DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 7068DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 7069DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 7070 7071DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 7072DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 7073DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 7074 7075DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 7076DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 7077DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 7078 7079DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 7080DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 7081DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 7082 7083/* Stores with a vector index. 
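 * The scatter-store helper below runs in two passes: the first pass
 * probes every active element, taking any page fault, watchpoint or
 * MTE fault up front and recording a host pointer for elements that
 * are wholly in RAM on one page; the second pass then performs the
 * stores, through the recorded pointer where there is one and through
 * the slow tlb path otherwise.  Memory is therefore not modified at
 * all unless every element has been validated, the one unavoidable
 * exception being SyncExternal raised by an MMIO store itself.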
/* Stores with a vector index. */

static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}

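/*
 * Editorial note (not from the original source): the two-pass shape of
 * sve_st1_z() above is deliberate.  Pass one only probes, so any MMU
 * fault, watchpoint hit or MTE failure is raised before a single byte
 * has been written; a trapping scatter therefore never leaves memory
 * partially updated (the one exception, a synchronous external abort
 * from MMIO, is called out in the in-function comment).  Pass two then
 * performs the stores, using the cached host pointer for the common
 * in-RAM, same-page case and falling back to tlb_fn() otherwise.
 */
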
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}

#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D

void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ m[i] ^ k[i];
    }
}

void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
    }
}

void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
    }
}

/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}

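/*
 * Editorial note (not from the original source): a worked view of the
 * zero-lane trick above, for esz == MO_8.  After the XOR, a byte of
 * cmp0/cmp1 is zero exactly where the corresponding byte of m0/m1
 * equals n.  For a single byte x, (x - 1) & ~x has bit 7 set only for
 * x == 0 (x = 0x00 gives 0xff & 0xff; x = 0x01 gives 0x00 & 0xfe;
 * x = 0x80 gives 0x7f & 0x7f).  Across a 64-bit word the borrow out of
 * a zero byte can also set sign bits in higher bytes, but such false
 * positives only appear when a genuine zero byte exists below them, so
 * the boolean "any match?" answer returned here is still exact.  This
 * is also why do_histseg_cnt() further down needs a more careful
 * sequence: it must count matches, not merely detect one.
 */
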
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH

void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

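/*
 * Editorial note (not from the original source): HISTCNT semantics in
 * brief.  For each active element i, d[i] is the number of active
 * elements j <= i (the element itself included) for which m[j] equals
 * n[i]; inactive elements produce 0.  For illustration, with all
 * elements active and n = m = { 1, 2, 1, 2 }, the result is
 * d = { 1, 1, 2, 2 }.
 */
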
/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}

void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}

void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}

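/*
 * Editorial note (not from the original source): the two FMMLA helpers
 * below work on independent segments (128 bits of float32, 256 bits of
 * float64), treating each segment of Zn, Zm, Za and Zd as a 2x2
 * row-major matrix.  Reading off the code, each destination element is
 *
 *   d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1]
 *
 * i.e. D = A + N * transpose(M), evaluated per segment with softfloat
 * rounding at every multiply and add.
 */
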
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}

#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEW nn = *(TYPEW *)(vn + HW(i)); \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

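/*
 * Editorial note (not from the original source): the "NT"/"LT" pairs
 * here implement the top-half narrowing and widening conversions.
 * FCVTNT (above) narrows each active wide element and writes the result
 * into the top (odd-numbered) half of the corresponding destination
 * element, leaving the bottom half of Zd untouched; FCVTLT (below) is
 * the converse, widening the narrow value held in the top half of each
 * active source element.  Inactive elements leave Zd unchanged in both
 * cases, and both loops walk the vector from the top down.
 */
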
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT