vec_helper.c (93480B)
1/* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20#include "qemu/osdep.h" 21#include "cpu.h" 22#include "exec/helper-proto.h" 23#include "tcg/tcg-gvec-desc.h" 24#include "fpu/softfloat.h" 25#include "qemu/int128.h" 26#include "vec_internal.h" 27 28/* 29 * Data for expanding active predicate bits to bytes, for byte elements. 30 * 31 * for (i = 0; i < 256; ++i) { 32 * unsigned long m = 0; 33 * for (j = 0; j < 8; j++) { 34 * if ((i >> j) & 1) { 35 * m |= 0xfful << (j << 3); 36 * } 37 * } 38 * printf("0x%016lx,\n", m); 39 * } 40 */ 41const uint64_t expand_pred_b_data[256] = { 42 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, 43 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, 44 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, 45 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, 46 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, 47 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, 48 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, 49 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, 50 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, 51 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, 52 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, 53 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, 54 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, 55 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, 56 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, 57 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, 58 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, 59 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, 60 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, 61 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, 62 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, 63 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, 64 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, 65 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, 66 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, 67 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, 68 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, 69 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, 70 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 71 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, 72 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, 73 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, 74 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, 75 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, 76 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, 77 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, 78 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, 79 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, 80 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, 81 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, 82 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, 83 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, 84 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, 85 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, 86 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, 87 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, 88 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, 89 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, 90 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, 91 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, 92 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, 93 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, 94 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, 95 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, 96 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, 97 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, 98 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 99 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, 100 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, 101 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, 102 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, 103 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, 104 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, 105 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, 106 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, 107 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, 108 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, 109 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, 110 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, 111 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, 112 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, 113 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, 114 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, 115 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, 116 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, 117 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, 118 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, 119 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, 120 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, 121 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, 122 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, 123 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, 124 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, 125 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, 126 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, 127 0xffffffffffffffff, 128}; 129 130/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 131int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 132 bool neg, bool round) 133{ 134 /* 135 * Simplify: 136 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 137 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 138 */ 139 int32_t ret = (int32_t)src1 * src2; 140 if (neg) { 141 ret = -ret; 142 } 143 ret += ((int32_t)src3 << 7) + (round << 6); 144 ret >>= 7; 145 146 if (ret != (int8_t)ret) { 147 ret = (ret < 0 ? INT8_MIN : INT8_MAX); 148 } 149 return ret; 150} 151 152void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 153 void *va, uint32_t desc) 154{ 155 intptr_t i, opr_sz = simd_oprsz(desc); 156 int8_t *d = vd, *n = vn, *m = vm, *a = va; 157 158 for (i = 0; i < opr_sz; ++i) { 159 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 160 } 161} 162 163void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 164 void *va, uint32_t desc) 165{ 166 intptr_t i, opr_sz = simd_oprsz(desc); 167 int8_t *d = vd, *n = vn, *m = vm, *a = va; 168 169 for (i = 0; i < opr_sz; ++i) { 170 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 171 } 172} 173 174void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 175{ 176 intptr_t i, opr_sz = simd_oprsz(desc); 177 int8_t *d = vd, *n = vn, *m = vm; 178 179 for (i = 0; i < opr_sz; ++i) { 180 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 181 } 182} 183 184void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 185{ 186 intptr_t i, opr_sz = simd_oprsz(desc); 187 int8_t *d = vd, *n = vn, *m = vm; 188 189 for (i = 0; i < opr_sz; ++i) { 190 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 191 } 192} 193 194/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 195int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 196 bool neg, bool round, uint32_t *sat) 197{ 198 /* Simplify similarly to do_sqrdmlah_b above. */ 199 int32_t ret = (int32_t)src1 * src2; 200 if (neg) { 201 ret = -ret; 202 } 203 ret += ((int32_t)src3 << 15) + (round << 14); 204 ret >>= 15; 205 206 if (ret != (int16_t)ret) { 207 *sat = 1; 208 ret = (ret < 0 ? INT16_MIN : INT16_MAX); 209 } 210 return ret; 211} 212 213uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 214 uint32_t src2, uint32_t src3) 215{ 216 uint32_t *sat = &env->vfp.qc[0]; 217 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 218 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 219 false, true, sat); 220 return deposit32(e1, 16, 16, e2); 221} 222 223void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 224 void *vq, uint32_t desc) 225{ 226 uintptr_t opr_sz = simd_oprsz(desc); 227 int16_t *d = vd; 228 int16_t *n = vn; 229 int16_t *m = vm; 230 uintptr_t i; 231 232 for (i = 0; i < opr_sz / 2; ++i) { 233 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 234 } 235 clear_tail(d, opr_sz, simd_maxsz(desc)); 236} 237 238uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 239 uint32_t src2, uint32_t src3) 240{ 241 uint32_t *sat = &env->vfp.qc[0]; 242 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 243 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 244 true, true, sat); 245 return deposit32(e1, 16, 16, e2); 246} 247 248void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 249 void *vq, uint32_t desc) 250{ 251 uintptr_t opr_sz = simd_oprsz(desc); 252 int16_t *d = vd; 253 int16_t *n = vn; 254 int16_t *m = vm; 255 uintptr_t i; 256 257 for (i = 0; i < opr_sz / 2; ++i) { 258 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 259 } 260 clear_tail(d, opr_sz, simd_maxsz(desc)); 261} 262 263void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 264 void *vq, uint32_t desc) 265{ 266 intptr_t i, opr_sz = simd_oprsz(desc); 267 int16_t *d = vd, *n = vn, *m = vm; 268 269 for (i = 0; i < opr_sz / 2; ++i) { 270 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 271 } 272 clear_tail(d, opr_sz, simd_maxsz(desc)); 273} 274 275void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277{ 278 intptr_t i, opr_sz = simd_oprsz(desc); 279 int16_t *d = vd, *n = vn, *m = vm; 280 281 for (i = 0; i < opr_sz / 2; ++i) { 282 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 283 } 284 clear_tail(d, opr_sz, simd_maxsz(desc)); 285} 286 287void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 288 void *va, uint32_t desc) 289{ 290 intptr_t i, opr_sz = simd_oprsz(desc); 291 int16_t *d = vd, *n = vn, *m = vm, *a = va; 292 uint32_t discard; 293 294 for (i = 0; i < opr_sz / 2; ++i) { 295 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 296 } 297} 298 299void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 300 void *va, uint32_t desc) 301{ 302 intptr_t i, opr_sz = simd_oprsz(desc); 303 int16_t *d = vd, *n = vn, *m = vm, *a = va; 304 uint32_t discard; 305 306 for (i = 0; i < opr_sz / 2; ++i) { 307 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 308 } 309} 310 311void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 312{ 313 intptr_t i, opr_sz = simd_oprsz(desc); 314 int16_t *d = vd, *n = vn, *m = vm; 315 uint32_t discard; 316 317 for (i = 0; i < opr_sz / 2; ++i) { 318 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 319 } 320} 321 322void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 323{ 324 intptr_t i, opr_sz = simd_oprsz(desc); 325 int16_t *d = vd, *n = vn, *m = vm; 326 uint32_t discard; 327 328 for (i = 0; i < opr_sz / 2; ++i) { 329 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 330 } 331} 332 333void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 334{ 335 intptr_t i, j, opr_sz = simd_oprsz(desc); 336 int idx = simd_data(desc); 337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 338 uint32_t discard; 339 340 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 341 int16_t mm = m[i]; 342 for (j = 0; j < 16 / 2; ++j) { 343 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 344 } 345 } 346} 347 348void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 349{ 350 intptr_t i, j, opr_sz = simd_oprsz(desc); 351 int idx = simd_data(desc); 352 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 353 uint32_t discard; 354 355 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 356 int16_t mm = m[i]; 357 for (j = 0; j < 16 / 2; ++j) { 358 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 359 } 360 } 361} 362 363/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 364int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 365 bool neg, bool round, uint32_t *sat) 366{ 367 /* Simplify similarly to do_sqrdmlah_b above. */ 368 int64_t ret = (int64_t)src1 * src2; 369 if (neg) { 370 ret = -ret; 371 } 372 ret += ((int64_t)src3 << 31) + (round << 30); 373 ret >>= 31; 374 375 if (ret != (int32_t)ret) { 376 *sat = 1; 377 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 378 } 379 return ret; 380} 381 382uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 383 int32_t src2, int32_t src3) 384{ 385 uint32_t *sat = &env->vfp.qc[0]; 386 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 387} 388 389void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 390 void *vq, uint32_t desc) 391{ 392 uintptr_t opr_sz = simd_oprsz(desc); 393 int32_t *d = vd; 394 int32_t *n = vn; 395 int32_t *m = vm; 396 uintptr_t i; 397 398 for (i = 0; i < opr_sz / 4; ++i) { 399 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 400 } 401 clear_tail(d, opr_sz, simd_maxsz(desc)); 402} 403 404uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 405 int32_t src2, int32_t src3) 406{ 407 uint32_t *sat = &env->vfp.qc[0]; 408 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 409} 410 411void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 412 void *vq, uint32_t desc) 413{ 414 uintptr_t opr_sz = simd_oprsz(desc); 415 int32_t *d = vd; 416 int32_t *n = vn; 417 int32_t *m = vm; 418 uintptr_t i; 419 420 for (i = 0; i < opr_sz / 4; ++i) { 421 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 422 } 423 clear_tail(d, opr_sz, simd_maxsz(desc)); 424} 425 426void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 427 void *vq, uint32_t desc) 428{ 429 intptr_t i, opr_sz = simd_oprsz(desc); 430 int32_t *d = vd, *n = vn, *m = vm; 431 432 for (i = 0; i < opr_sz / 4; ++i) { 433 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 434 } 435 clear_tail(d, opr_sz, simd_maxsz(desc)); 436} 437 438void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 439 void *vq, uint32_t desc) 440{ 441 intptr_t i, opr_sz = simd_oprsz(desc); 442 int32_t *d = vd, *n = vn, *m = vm; 443 444 for (i = 0; i < opr_sz / 4; ++i) { 445 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 446 } 447 clear_tail(d, opr_sz, simd_maxsz(desc)); 448} 449 450void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 451 void *va, uint32_t desc) 452{ 453 intptr_t i, opr_sz = simd_oprsz(desc); 454 int32_t *d = vd, *n = vn, *m = vm, *a = va; 455 uint32_t discard; 456 457 for (i = 0; i < opr_sz / 4; ++i) { 458 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 459 } 460} 461 462void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 463 void *va, uint32_t desc) 464{ 465 intptr_t i, opr_sz = simd_oprsz(desc); 466 int32_t *d = vd, *n = vn, *m = vm, *a = va; 467 uint32_t discard; 468 469 for (i = 0; i < opr_sz / 4; ++i) { 470 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 471 } 472} 473 474void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 475{ 476 intptr_t i, opr_sz = simd_oprsz(desc); 477 int32_t *d = vd, *n = vn, *m = vm; 478 uint32_t discard; 479 480 for (i = 0; i < opr_sz / 4; ++i) { 481 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 482 } 483} 484 485void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 486{ 487 intptr_t i, opr_sz = simd_oprsz(desc); 488 int32_t *d = vd, *n = vn, *m = vm; 489 uint32_t discard; 490 491 for (i = 0; i < opr_sz / 4; ++i) { 492 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 493 } 494} 495 496void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 497{ 498 intptr_t i, j, opr_sz = simd_oprsz(desc); 499 int idx = simd_data(desc); 500 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 501 uint32_t discard; 502 503 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 504 int32_t mm = m[i]; 505 for (j = 0; j < 16 / 4; ++j) { 506 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 507 } 508 } 509} 510 511void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 512{ 513 intptr_t i, j, opr_sz = simd_oprsz(desc); 514 int idx = simd_data(desc); 515 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 516 uint32_t discard; 517 518 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 519 int32_t mm = m[i]; 520 for (j = 0; j < 16 / 4; ++j) { 521 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 522 } 523 } 524} 525 526/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 527static int64_t do_sat128_d(Int128 r) 528{ 529 int64_t ls = int128_getlo(r); 530 int64_t hs = int128_gethi(r); 531 532 if (unlikely(hs != (ls >> 63))) { 533 return hs < 0 ? INT64_MIN : INT64_MAX; 534 } 535 return ls; 536} 537 538int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 539{ 540 uint64_t l, h; 541 Int128 r, t; 542 543 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ 544 muls64(&l, &h, m, n); 545 r = int128_make128(l, h); 546 if (neg) { 547 r = int128_neg(r); 548 } 549 if (a) { 550 t = int128_exts64(a); 551 t = int128_lshift(t, 63); 552 r = int128_add(r, t); 553 } 554 if (round) { 555 t = int128_exts64(1ll << 62); 556 r = int128_add(r, t); 557 } 558 r = int128_rshift(r, 63); 559 560 return do_sat128_d(r); 561} 562 563void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 564 void *va, uint32_t desc) 565{ 566 intptr_t i, opr_sz = simd_oprsz(desc); 567 int64_t *d = vd, *n = vn, *m = vm, *a = va; 568 569 for (i = 0; i < opr_sz / 8; ++i) { 570 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 571 } 572} 573 574void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 575 void *va, uint32_t desc) 576{ 577 intptr_t i, opr_sz = simd_oprsz(desc); 578 int64_t *d = vd, *n = vn, *m = vm, *a = va; 579 580 for (i = 0; i < opr_sz / 8; ++i) { 581 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 582 } 583} 584 585void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 586{ 587 intptr_t i, opr_sz = simd_oprsz(desc); 588 int64_t *d = vd, *n = vn, *m = vm; 589 590 for (i = 0; i < opr_sz / 8; ++i) { 591 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 592 } 593} 594 595void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 596{ 597 intptr_t i, opr_sz = simd_oprsz(desc); 598 int64_t *d = vd, *n = vn, *m = vm; 599 600 for (i = 0; i < opr_sz / 8; ++i) { 601 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 602 } 603} 604 605void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 606{ 607 intptr_t i, j, opr_sz = simd_oprsz(desc); 608 int idx = simd_data(desc); 609 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 610 611 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 612 int64_t mm = m[i]; 613 for (j = 0; j < 16 / 8; ++j) { 614 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 615 } 616 } 617} 618 619void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 620{ 621 intptr_t i, j, opr_sz = simd_oprsz(desc); 622 int idx = simd_data(desc); 623 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 624 625 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 626 int64_t mm = m[i]; 627 for (j = 0; j < 16 / 8; ++j) { 628 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 629 } 630 } 631} 632 633/* Integer 8 and 16-bit dot-product. 634 * 635 * Note that for the loops herein, host endianness does not matter 636 * with respect to the ordering of data within the quad-width lanes. 637 * All elements are treated equally, no matter where they are. 638 */ 639 640#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 641void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 642{ \ 643 intptr_t i, opr_sz = simd_oprsz(desc); \ 644 TYPED *d = vd, *a = va; \ 645 TYPEN *n = vn; \ 646 TYPEM *m = vm; \ 647 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 648 d[i] = (a[i] + \ 649 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 650 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 651 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 652 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 653 } \ 654 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 655} 656 657DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 658DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 659DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 660DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 661DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 662 663#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 664void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 665{ \ 666 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 667 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 668 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 669 intptr_t index = simd_data(desc); \ 670 TYPED *d = vd, *a = va; \ 671 TYPEN *n = vn; \ 672 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 673 do { \ 674 TYPED m0 = m_indexed[i * 4 + 0]; \ 675 TYPED m1 = m_indexed[i * 4 + 1]; \ 676 TYPED m2 = m_indexed[i * 4 + 2]; \ 677 TYPED m3 = m_indexed[i * 4 + 3]; \ 678 do { \ 679 d[i] = (a[i] + \ 680 n[i * 4 + 0] * m0 + \ 681 n[i * 4 + 1] * m1 + \ 682 n[i * 4 + 2] * m2 + \ 683 n[i * 4 + 3] * m3); \ 684 } while (++i < segend); \ 685 segend = i + 4; \ 686 } while (i < opr_sz_n); \ 687 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 688} 689 690DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 691DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 692DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 693DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 694DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 695DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 696 697void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 698 void *vfpst, uint32_t desc) 699{ 700 uintptr_t opr_sz = simd_oprsz(desc); 701 float16 *d = vd; 702 float16 *n = vn; 703 float16 *m = vm; 704 float_status *fpst = vfpst; 705 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 706 uint32_t neg_imag = neg_real ^ 1; 707 uintptr_t i; 708 709 /* Shift boolean to the sign bit so we can xor to negate. */ 710 neg_real <<= 15; 711 neg_imag <<= 15; 712 713 for (i = 0; i < opr_sz / 2; i += 2) { 714 float16 e0 = n[H2(i)]; 715 float16 e1 = m[H2(i + 1)] ^ neg_imag; 716 float16 e2 = n[H2(i + 1)]; 717 float16 e3 = m[H2(i)] ^ neg_real; 718 719 d[H2(i)] = float16_add(e0, e1, fpst); 720 d[H2(i + 1)] = float16_add(e2, e3, fpst); 721 } 722 clear_tail(d, opr_sz, simd_maxsz(desc)); 723} 724 725void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 726 void *vfpst, uint32_t desc) 727{ 728 uintptr_t opr_sz = simd_oprsz(desc); 729 float32 *d = vd; 730 float32 *n = vn; 731 float32 *m = vm; 732 float_status *fpst = vfpst; 733 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 734 uint32_t neg_imag = neg_real ^ 1; 735 uintptr_t i; 736 737 /* Shift boolean to the sign bit so we can xor to negate. */ 738 neg_real <<= 31; 739 neg_imag <<= 31; 740 741 for (i = 0; i < opr_sz / 4; i += 2) { 742 float32 e0 = n[H4(i)]; 743 float32 e1 = m[H4(i + 1)] ^ neg_imag; 744 float32 e2 = n[H4(i + 1)]; 745 float32 e3 = m[H4(i)] ^ neg_real; 746 747 d[H4(i)] = float32_add(e0, e1, fpst); 748 d[H4(i + 1)] = float32_add(e2, e3, fpst); 749 } 750 clear_tail(d, opr_sz, simd_maxsz(desc)); 751} 752 753void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 754 void *vfpst, uint32_t desc) 755{ 756 uintptr_t opr_sz = simd_oprsz(desc); 757 float64 *d = vd; 758 float64 *n = vn; 759 float64 *m = vm; 760 float_status *fpst = vfpst; 761 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); 762 uint64_t neg_imag = neg_real ^ 1; 763 uintptr_t i; 764 765 /* Shift boolean to the sign bit so we can xor to negate. */ 766 neg_real <<= 63; 767 neg_imag <<= 63; 768 769 for (i = 0; i < opr_sz / 8; i += 2) { 770 float64 e0 = n[i]; 771 float64 e1 = m[i + 1] ^ neg_imag; 772 float64 e2 = n[i + 1]; 773 float64 e3 = m[i] ^ neg_real; 774 775 d[i] = float64_add(e0, e1, fpst); 776 d[i + 1] = float64_add(e2, e3, fpst); 777 } 778 clear_tail(d, opr_sz, simd_maxsz(desc)); 779} 780 781void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 782 void *vfpst, uint32_t desc) 783{ 784 uintptr_t opr_sz = simd_oprsz(desc); 785 float16 *d = vd, *n = vn, *m = vm, *a = va; 786 float_status *fpst = vfpst; 787 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 788 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 789 uint32_t neg_real = flip ^ neg_imag; 790 uintptr_t i; 791 792 /* Shift boolean to the sign bit so we can xor to negate. */ 793 neg_real <<= 15; 794 neg_imag <<= 15; 795 796 for (i = 0; i < opr_sz / 2; i += 2) { 797 float16 e2 = n[H2(i + flip)]; 798 float16 e1 = m[H2(i + flip)] ^ neg_real; 799 float16 e4 = e2; 800 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 801 802 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 803 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 804 } 805 clear_tail(d, opr_sz, simd_maxsz(desc)); 806} 807 808void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 809 void *vfpst, uint32_t desc) 810{ 811 uintptr_t opr_sz = simd_oprsz(desc); 812 float16 *d = vd, *n = vn, *m = vm, *a = va; 813 float_status *fpst = vfpst; 814 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 815 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 816 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 817 uint32_t neg_real = flip ^ neg_imag; 818 intptr_t elements = opr_sz / sizeof(float16); 819 intptr_t eltspersegment = 16 / sizeof(float16); 820 intptr_t i, j; 821 822 /* Shift boolean to the sign bit so we can xor to negate. */ 823 neg_real <<= 15; 824 neg_imag <<= 15; 825 826 for (i = 0; i < elements; i += eltspersegment) { 827 float16 mr = m[H2(i + 2 * index + 0)]; 828 float16 mi = m[H2(i + 2 * index + 1)]; 829 float16 e1 = neg_real ^ (flip ? mi : mr); 830 float16 e3 = neg_imag ^ (flip ? mr : mi); 831 832 for (j = i; j < i + eltspersegment; j += 2) { 833 float16 e2 = n[H2(j + flip)]; 834 float16 e4 = e2; 835 836 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 837 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 838 } 839 } 840 clear_tail(d, opr_sz, simd_maxsz(desc)); 841} 842 843void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 844 void *vfpst, uint32_t desc) 845{ 846 uintptr_t opr_sz = simd_oprsz(desc); 847 float32 *d = vd, *n = vn, *m = vm, *a = va; 848 float_status *fpst = vfpst; 849 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 850 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 851 uint32_t neg_real = flip ^ neg_imag; 852 uintptr_t i; 853 854 /* Shift boolean to the sign bit so we can xor to negate. */ 855 neg_real <<= 31; 856 neg_imag <<= 31; 857 858 for (i = 0; i < opr_sz / 4; i += 2) { 859 float32 e2 = n[H4(i + flip)]; 860 float32 e1 = m[H4(i + flip)] ^ neg_real; 861 float32 e4 = e2; 862 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 863 864 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 865 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 866 } 867 clear_tail(d, opr_sz, simd_maxsz(desc)); 868} 869 870void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 871 void *vfpst, uint32_t desc) 872{ 873 uintptr_t opr_sz = simd_oprsz(desc); 874 float32 *d = vd, *n = vn, *m = vm, *a = va; 875 float_status *fpst = vfpst; 876 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 877 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 878 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 879 uint32_t neg_real = flip ^ neg_imag; 880 intptr_t elements = opr_sz / sizeof(float32); 881 intptr_t eltspersegment = 16 / sizeof(float32); 882 intptr_t i, j; 883 884 /* Shift boolean to the sign bit so we can xor to negate. */ 885 neg_real <<= 31; 886 neg_imag <<= 31; 887 888 for (i = 0; i < elements; i += eltspersegment) { 889 float32 mr = m[H4(i + 2 * index + 0)]; 890 float32 mi = m[H4(i + 2 * index + 1)]; 891 float32 e1 = neg_real ^ (flip ? mi : mr); 892 float32 e3 = neg_imag ^ (flip ? mr : mi); 893 894 for (j = i; j < i + eltspersegment; j += 2) { 895 float32 e2 = n[H4(j + flip)]; 896 float32 e4 = e2; 897 898 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 899 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 900 } 901 } 902 clear_tail(d, opr_sz, simd_maxsz(desc)); 903} 904 905void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 906 void *vfpst, uint32_t desc) 907{ 908 uintptr_t opr_sz = simd_oprsz(desc); 909 float64 *d = vd, *n = vn, *m = vm, *a = va; 910 float_status *fpst = vfpst; 911 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 912 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 913 uint64_t neg_real = flip ^ neg_imag; 914 uintptr_t i; 915 916 /* Shift boolean to the sign bit so we can xor to negate. */ 917 neg_real <<= 63; 918 neg_imag <<= 63; 919 920 for (i = 0; i < opr_sz / 8; i += 2) { 921 float64 e2 = n[i + flip]; 922 float64 e1 = m[i + flip] ^ neg_real; 923 float64 e4 = e2; 924 float64 e3 = m[i + 1 - flip] ^ neg_imag; 925 926 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 927 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 928 } 929 clear_tail(d, opr_sz, simd_maxsz(desc)); 930} 931 932/* 933 * Floating point comparisons producing an integer result (all 1s or all 0s). 934 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 935 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 936 */ 937static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 938{ 939 return -float16_eq_quiet(op1, op2, stat); 940} 941 942static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 943{ 944 return -float32_eq_quiet(op1, op2, stat); 945} 946 947static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 948{ 949 return -float16_le(op2, op1, stat); 950} 951 952static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 953{ 954 return -float32_le(op2, op1, stat); 955} 956 957static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 958{ 959 return -float16_lt(op2, op1, stat); 960} 961 962static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 963{ 964 return -float32_lt(op2, op1, stat); 965} 966 967static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 968{ 969 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 970} 971 972static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 973{ 974 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 975} 976 977static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 978{ 979 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 980} 981 982static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 983{ 984 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 985} 986 987static int16_t vfp_tosszh(float16 x, void *fpstp) 988{ 989 float_status *fpst = fpstp; 990 if (float16_is_any_nan(x)) { 991 float_raise(float_flag_invalid, fpst); 992 return 0; 993 } 994 return float16_to_int16_round_to_zero(x, fpst); 995} 996 997static uint16_t vfp_touszh(float16 x, void *fpstp) 998{ 999 float_status *fpst = fpstp; 1000 if (float16_is_any_nan(x)) { 1001 float_raise(float_flag_invalid, fpst); 1002 return 0; 1003 } 1004 return float16_to_uint16_round_to_zero(x, fpst); 1005} 1006 1007#define DO_2OP(NAME, FUNC, TYPE) \ 1008void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 1009{ \ 1010 intptr_t i, oprsz = simd_oprsz(desc); \ 1011 TYPE *d = vd, *n = vn; \ 1012 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1013 d[i] = FUNC(n[i], stat); \ 1014 } \ 1015 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1016} 1017 1018DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1019DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1020DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1021 1022DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1023DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1024DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) 1025 1026DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1027DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1028 1029DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1030DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1031DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1032DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1033DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1034DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1035DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1036DO_2OP(gvec_touszh, vfp_touszh, float16) 1037 1038#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1039 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1040 { \ 1041 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1042 } 1043 1044#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1045 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1046 { \ 1047 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1048 } 1049 1050#define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1051 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1052 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1053 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1054 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) 1055 1056DO_2OP_CMP0(cgt, cgt, FWD) 1057DO_2OP_CMP0(cge, cge, FWD) 1058DO_2OP_CMP0(ceq, ceq, FWD) 1059DO_2OP_CMP0(clt, cgt, REV) 1060DO_2OP_CMP0(cle, cge, REV) 1061 1062#undef DO_2OP 1063#undef DO_2OP_CMP0 1064 1065/* Floating-point trigonometric starting value. 1066 * See the ARM ARM pseudocode function FPTrigSMul. 1067 */ 1068static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1069{ 1070 float16 result = float16_mul(op1, op1, stat); 1071 if (!float16_is_any_nan(result)) { 1072 result = float16_set_sign(result, op2 & 1); 1073 } 1074 return result; 1075} 1076 1077static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1078{ 1079 float32 result = float32_mul(op1, op1, stat); 1080 if (!float32_is_any_nan(result)) { 1081 result = float32_set_sign(result, op2 & 1); 1082 } 1083 return result; 1084} 1085 1086static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1087{ 1088 float64 result = float64_mul(op1, op1, stat); 1089 if (!float64_is_any_nan(result)) { 1090 result = float64_set_sign(result, op2 & 1); 1091 } 1092 return result; 1093} 1094 1095static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1096{ 1097 return float16_abs(float16_sub(op1, op2, stat)); 1098} 1099 1100static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1101{ 1102 return float32_abs(float32_sub(op1, op2, stat)); 1103} 1104 1105/* 1106 * Reciprocal step. These are the AArch32 version which uses a 1107 * non-fused multiply-and-subtract. 1108 */ 1109static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1110{ 1111 op1 = float16_squash_input_denormal(op1, stat); 1112 op2 = float16_squash_input_denormal(op2, stat); 1113 1114 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1115 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1116 return float16_two; 1117 } 1118 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1119} 1120 1121static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1122{ 1123 op1 = float32_squash_input_denormal(op1, stat); 1124 op2 = float32_squash_input_denormal(op2, stat); 1125 1126 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1127 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1128 return float32_two; 1129 } 1130 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1131} 1132 1133/* Reciprocal square-root step. AArch32 non-fused semantics. */ 1134static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1135{ 1136 op1 = float16_squash_input_denormal(op1, stat); 1137 op2 = float16_squash_input_denormal(op2, stat); 1138 1139 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1140 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1141 return float16_one_point_five; 1142 } 1143 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1144 return float16_div(op1, float16_two, stat); 1145} 1146 1147static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1148{ 1149 op1 = float32_squash_input_denormal(op1, stat); 1150 op2 = float32_squash_input_denormal(op2, stat); 1151 1152 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1153 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1154 return float32_one_point_five; 1155 } 1156 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1157 return float32_div(op1, float32_two, stat); 1158} 1159 1160#define DO_3OP(NAME, FUNC, TYPE) \ 1161void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1162{ \ 1163 intptr_t i, oprsz = simd_oprsz(desc); \ 1164 TYPE *d = vd, *n = vn, *m = vm; \ 1165 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1166 d[i] = FUNC(n[i], m[i], stat); \ 1167 } \ 1168 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1169} 1170 1171DO_3OP(gvec_fadd_h, float16_add, float16) 1172DO_3OP(gvec_fadd_s, float32_add, float32) 1173DO_3OP(gvec_fadd_d, float64_add, float64) 1174 1175DO_3OP(gvec_fsub_h, float16_sub, float16) 1176DO_3OP(gvec_fsub_s, float32_sub, float32) 1177DO_3OP(gvec_fsub_d, float64_sub, float64) 1178 1179DO_3OP(gvec_fmul_h, float16_mul, float16) 1180DO_3OP(gvec_fmul_s, float32_mul, float32) 1181DO_3OP(gvec_fmul_d, float64_mul, float64) 1182 1183DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1184DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1185DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1186 1187DO_3OP(gvec_fabd_h, float16_abd, float16) 1188DO_3OP(gvec_fabd_s, float32_abd, float32) 1189 1190DO_3OP(gvec_fceq_h, float16_ceq, float16) 1191DO_3OP(gvec_fceq_s, float32_ceq, float32) 1192 1193DO_3OP(gvec_fcge_h, float16_cge, float16) 1194DO_3OP(gvec_fcge_s, float32_cge, float32) 1195 1196DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1197DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1198 1199DO_3OP(gvec_facge_h, float16_acge, float16) 1200DO_3OP(gvec_facge_s, float32_acge, float32) 1201 1202DO_3OP(gvec_facgt_h, float16_acgt, float16) 1203DO_3OP(gvec_facgt_s, float32_acgt, float32) 1204 1205DO_3OP(gvec_fmax_h, float16_max, float16) 1206DO_3OP(gvec_fmax_s, float32_max, float32) 1207 1208DO_3OP(gvec_fmin_h, float16_min, float16) 1209DO_3OP(gvec_fmin_s, float32_min, float32) 1210 1211DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1212DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1213 1214DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1215DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1216 1217DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) 1218DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1219 1220DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1221DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1222 1223#ifdef TARGET_AARCH64 1224 1225DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1226DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1227DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1228 1229DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1230DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1231DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1232 1233#endif 1234#undef DO_3OP 1235 1236/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1237static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1238 float_status *stat) 1239{ 1240 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1241} 1242 1243static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1244 float_status *stat) 1245{ 1246 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1247} 1248 1249static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1250 float_status *stat) 1251{ 1252 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1253} 1254 1255static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1256 float_status *stat) 1257{ 1258 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1259} 1260 1261/* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1262static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1263 float_status *stat) 1264{ 1265 return float16_muladd(op1, op2, dest, 0, stat); 1266} 1267 1268static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1269 float_status *stat) 1270{ 1271 return float32_muladd(op1, op2, dest, 0, stat); 1272} 1273 1274static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1275 float_status *stat) 1276{ 1277 return float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1278} 1279 1280static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1281 float_status *stat) 1282{ 1283 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1284} 1285 1286#define DO_MULADD(NAME, FUNC, TYPE) \ 1287void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1288{ \ 1289 intptr_t i, oprsz = simd_oprsz(desc); \ 1290 TYPE *d = vd, *n = vn, *m = vm; \ 1291 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1292 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1293 } \ 1294 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1295} 1296 1297DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1298DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1299 1300DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1301DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1302 1303DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1304DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1305 1306DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1307DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1308 1309/* For the indexed ops, SVE applies the index per 128-bit vector segment. 1310 * For AdvSIMD, there is of course only one such vector segment. 1311 */ 1312 1313#define DO_MUL_IDX(NAME, TYPE, H) \ 1314void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1315{ \ 1316 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1317 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1318 intptr_t idx = simd_data(desc); \ 1319 TYPE *d = vd, *n = vn, *m = vm; \ 1320 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1321 TYPE mm = m[H(i + idx)]; \ 1322 for (j = 0; j < segment; j++) { \ 1323 d[i + j] = n[i + j] * mm; \ 1324 } \ 1325 } \ 1326 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1327} 1328 1329DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1330DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1331DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1332 1333#undef DO_MUL_IDX 1334 1335#define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1336void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1337{ \ 1338 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1339 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1340 intptr_t idx = simd_data(desc); \ 1341 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1342 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1343 TYPE mm = m[H(i + idx)]; \ 1344 for (j = 0; j < segment; j++) { \ 1345 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1346 } \ 1347 } \ 1348 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1349} 1350 1351DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1352DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1353DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1354 1355DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1356DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1357DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1358 1359#undef DO_MLA_IDX 1360 1361#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ 1362void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1363{ \ 1364 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1365 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1366 intptr_t idx = simd_data(desc); \ 1367 TYPE *d = vd, *n = vn, *m = vm; \ 1368 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1369 TYPE mm = m[H(i + idx)]; \ 1370 for (j = 0; j < segment; j++) { \ 1371 d[i + j] = TYPE##_##ADD(d[i + j], \ 1372 TYPE##_mul(n[i + j], mm, stat), stat); \ 1373 } \ 1374 } \ 1375 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1376} 1377 1378#define float16_nop(N, M, S) (M) 1379#define float32_nop(N, M, S) (M) 1380#define float64_nop(N, M, S) (M) 1381 1382DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) 1383DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) 1384DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) 1385 1386/* 1387 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1388 * the fused ops below they assume accumulate both from and into Vd. 1389 */ 1390DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) 1391DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) 1392DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) 1393DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) 1394 1395#undef float16_nop 1396#undef float32_nop 1397#undef float64_nop 1398#undef DO_FMUL_IDX 1399 1400#define DO_FMLA_IDX(NAME, TYPE, H) \ 1401void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1402 void *stat, uint32_t desc) \ 1403{ \ 1404 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1405 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1406 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1407 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1408 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1409 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1410 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1411 TYPE mm = m[H(i + idx)]; \ 1412 for (j = 0; j < segment; j++) { \ 1413 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1414 mm, a[i + j], 0, stat); \ 1415 } \ 1416 } \ 1417 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1418} 1419 1420DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1421DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1422DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1423 1424#undef DO_FMLA_IDX 1425 1426#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1427void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1428{ \ 1429 intptr_t i, oprsz = simd_oprsz(desc); \ 1430 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1431 bool q = false; \ 1432 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1433 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1434 if (dd < MIN) { \ 1435 dd = MIN; \ 1436 q = true; \ 1437 } else if (dd > MAX) { \ 1438 dd = MAX; \ 1439 q = true; \ 1440 } \ 1441 d[i] = dd; \ 1442 } \ 1443 if (q) { \ 1444 uint32_t *qc = vq; \ 1445 qc[0] = 1; \ 1446 } \ 1447 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1448} 1449 1450DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1451DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1452DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1453 1454DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1455DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1456DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1457 1458DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1459DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1460DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1461 1462DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1463DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1464DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1465 1466#undef DO_SAT 1467 1468void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1469 void *vm, uint32_t desc) 1470{ 1471 intptr_t i, oprsz = simd_oprsz(desc); 1472 uint64_t *d = vd, *n = vn, *m = vm; 1473 bool q = false; 1474 1475 for (i = 0; i < oprsz / 8; i++) { 1476 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1477 if (dd < nn) { 1478 dd = UINT64_MAX; 1479 q = true; 1480 } 1481 d[i] = dd; 1482 } 1483 if (q) { 1484 uint32_t *qc = vq; 1485 qc[0] = 1; 1486 } 1487 clear_tail(d, oprsz, simd_maxsz(desc)); 1488} 1489 1490void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1491 void *vm, uint32_t desc) 1492{ 1493 intptr_t i, oprsz = simd_oprsz(desc); 1494 uint64_t *d = vd, *n = vn, *m = vm; 1495 bool q = false; 1496 1497 for (i = 0; i < oprsz / 8; i++) { 1498 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1499 if (nn < mm) { 1500 dd = 0; 1501 q = true; 1502 } 1503 d[i] = dd; 1504 } 1505 if (q) { 1506 uint32_t *qc = vq; 1507 qc[0] = 1; 1508 } 1509 clear_tail(d, oprsz, simd_maxsz(desc)); 1510} 1511 1512void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1513 void *vm, uint32_t desc) 1514{ 1515 intptr_t i, oprsz = simd_oprsz(desc); 1516 int64_t *d = vd, *n = vn, *m = vm; 1517 bool q = false; 1518 1519 for (i = 0; i < oprsz / 8; i++) { 1520 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1521 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1522 dd = (nn >> 63) ^ ~INT64_MIN; 1523 q = true; 1524 } 1525 d[i] = dd; 1526 } 1527 if (q) { 1528 uint32_t *qc = vq; 1529 qc[0] = 1; 1530 } 1531 clear_tail(d, oprsz, simd_maxsz(desc)); 1532} 1533 1534void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1535 void *vm, uint32_t desc) 1536{ 1537 intptr_t i, oprsz = simd_oprsz(desc); 1538 int64_t *d = vd, *n = vn, *m = vm; 1539 bool q = false; 1540 1541 for (i = 0; i < oprsz / 8; i++) { 1542 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1543 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1544 dd = (nn >> 63) ^ ~INT64_MIN; 1545 q = true; 1546 } 1547 d[i] = dd; 1548 } 1549 if (q) { 1550 uint32_t *qc = vq; 1551 qc[0] = 1; 1552 } 1553 clear_tail(d, oprsz, simd_maxsz(desc)); 1554} 1555 1556 1557#define DO_SRA(NAME, TYPE) \ 1558void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1559{ \ 1560 intptr_t i, oprsz = simd_oprsz(desc); \ 1561 int shift = simd_data(desc); \ 1562 TYPE *d = vd, *n = vn; \ 1563 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1564 d[i] += n[i] >> shift; \ 1565 } \ 1566 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1567} 1568 1569DO_SRA(gvec_ssra_b, int8_t) 1570DO_SRA(gvec_ssra_h, int16_t) 1571DO_SRA(gvec_ssra_s, int32_t) 1572DO_SRA(gvec_ssra_d, int64_t) 1573 1574DO_SRA(gvec_usra_b, uint8_t) 1575DO_SRA(gvec_usra_h, uint16_t) 1576DO_SRA(gvec_usra_s, uint32_t) 1577DO_SRA(gvec_usra_d, uint64_t) 1578 1579#undef DO_SRA 1580 1581#define DO_RSHR(NAME, TYPE) \ 1582void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1583{ \ 1584 intptr_t i, oprsz = simd_oprsz(desc); \ 1585 int shift = simd_data(desc); \ 1586 TYPE *d = vd, *n = vn; \ 1587 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1588 TYPE tmp = n[i] >> (shift - 1); \ 1589 d[i] = (tmp >> 1) + (tmp & 1); \ 1590 } \ 1591 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1592} 1593 1594DO_RSHR(gvec_srshr_b, int8_t) 1595DO_RSHR(gvec_srshr_h, int16_t) 1596DO_RSHR(gvec_srshr_s, int32_t) 1597DO_RSHR(gvec_srshr_d, int64_t) 1598 1599DO_RSHR(gvec_urshr_b, uint8_t) 1600DO_RSHR(gvec_urshr_h, uint16_t) 1601DO_RSHR(gvec_urshr_s, uint32_t) 1602DO_RSHR(gvec_urshr_d, uint64_t) 1603 1604#undef DO_RSHR 1605 1606#define DO_RSRA(NAME, TYPE) \ 1607void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1608{ \ 1609 intptr_t i, oprsz = simd_oprsz(desc); \ 1610 int shift = simd_data(desc); \ 1611 TYPE *d = vd, *n = vn; \ 1612 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1613 TYPE tmp = n[i] >> (shift - 1); \ 1614 d[i] += (tmp >> 1) + (tmp & 1); \ 1615 } \ 1616 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1617} 1618 1619DO_RSRA(gvec_srsra_b, int8_t) 1620DO_RSRA(gvec_srsra_h, int16_t) 1621DO_RSRA(gvec_srsra_s, int32_t) 1622DO_RSRA(gvec_srsra_d, int64_t) 1623 1624DO_RSRA(gvec_ursra_b, uint8_t) 1625DO_RSRA(gvec_ursra_h, uint16_t) 1626DO_RSRA(gvec_ursra_s, uint32_t) 1627DO_RSRA(gvec_ursra_d, uint64_t) 1628 1629#undef DO_RSRA 1630 1631#define DO_SRI(NAME, TYPE) \ 1632void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1633{ \ 1634 intptr_t i, oprsz = simd_oprsz(desc); \ 1635 int shift = simd_data(desc); \ 1636 TYPE *d = vd, *n = vn; \ 1637 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1638 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1639 } \ 1640 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1641} 1642 1643DO_SRI(gvec_sri_b, uint8_t) 1644DO_SRI(gvec_sri_h, uint16_t) 1645DO_SRI(gvec_sri_s, uint32_t) 1646DO_SRI(gvec_sri_d, uint64_t) 1647 1648#undef DO_SRI 1649 1650#define DO_SLI(NAME, TYPE) \ 1651void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1652{ \ 1653 intptr_t i, oprsz = simd_oprsz(desc); \ 1654 int shift = simd_data(desc); \ 1655 TYPE *d = vd, *n = vn; \ 1656 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1657 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1658 } \ 1659 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1660} 1661 1662DO_SLI(gvec_sli_b, uint8_t) 1663DO_SLI(gvec_sli_h, uint16_t) 1664DO_SLI(gvec_sli_s, uint32_t) 1665DO_SLI(gvec_sli_d, uint64_t) 1666 1667#undef DO_SLI 1668 1669/* 1670 * Convert float16 to float32, raising no exceptions and 1671 * preserving exceptional values, including SNaN. 1672 * This is effectively an unpack+repack operation. 1673 */ 1674static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 1675{ 1676 const int f16_bias = 15; 1677 const int f32_bias = 127; 1678 uint32_t sign = extract32(f16, 15, 1); 1679 uint32_t exp = extract32(f16, 10, 5); 1680 uint32_t frac = extract32(f16, 0, 10); 1681 1682 if (exp == 0x1f) { 1683 /* Inf or NaN */ 1684 exp = 0xff; 1685 } else if (exp == 0) { 1686 /* Zero or denormal. */ 1687 if (frac != 0) { 1688 if (fz16) { 1689 frac = 0; 1690 } else { 1691 /* 1692 * Denormal; these are all normal float32. 1693 * Shift the fraction so that the msb is at bit 11, 1694 * then remove bit 11 as the implicit bit of the 1695 * normalized float32. Note that we still go through 1696 * the shift for normal numbers below, to put the 1697 * float32 fraction at the right place. 1698 */ 1699 int shift = clz32(frac) - 21; 1700 frac = (frac << shift) & 0x3ff; 1701 exp = f32_bias - f16_bias - shift + 1; 1702 } 1703 } 1704 } else { 1705 /* Normal number; adjust the bias. */ 1706 exp += f32_bias - f16_bias; 1707 } 1708 sign <<= 31; 1709 exp <<= 23; 1710 frac <<= 23 - 10; 1711 1712 return sign | exp | frac; 1713} 1714 1715static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 1716{ 1717 /* 1718 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 1719 * Load the 2nd qword iff is_q & is_2. 1720 * Shift to the 2nd dword iff !is_q & is_2. 1721 * For !is_q & !is_2, the upper bits of the result are garbage. 1722 */ 1723 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 1724} 1725 1726/* 1727 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 1728 * as there is not yet SVE versions that might use blocking. 1729 */ 1730 1731static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 1732 uint32_t desc, bool fz16) 1733{ 1734 intptr_t i, oprsz = simd_oprsz(desc); 1735 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1736 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1737 int is_q = oprsz == 16; 1738 uint64_t n_4, m_4; 1739 1740 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1741 n_4 = load4_f16(vn, is_q, is_2); 1742 m_4 = load4_f16(vm, is_q, is_2); 1743 1744 /* Negate all inputs for FMLSL at once. */ 1745 if (is_s) { 1746 n_4 ^= 0x8000800080008000ull; 1747 } 1748 1749 for (i = 0; i < oprsz / 4; i++) { 1750 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1751 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 1752 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1753 } 1754 clear_tail(d, oprsz, simd_maxsz(desc)); 1755} 1756 1757void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 1758 void *venv, uint32_t desc) 1759{ 1760 CPUARMState *env = venv; 1761 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1762 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1763} 1764 1765void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 1766 void *venv, uint32_t desc) 1767{ 1768 CPUARMState *env = venv; 1769 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 1770 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1771} 1772 1773void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 1774 void *venv, uint32_t desc) 1775{ 1776 intptr_t i, oprsz = simd_oprsz(desc); 1777 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1778 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1779 CPUARMState *env = venv; 1780 float_status *status = &env->vfp.fp_status; 1781 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 1782 1783 for (i = 0; i < oprsz; i += sizeof(float32)) { 1784 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 1785 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 1786 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 1787 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 1788 float32 aa = *(float32 *)(va + H1_4(i)); 1789 1790 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 1791 } 1792} 1793 1794static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 1795 uint32_t desc, bool fz16) 1796{ 1797 intptr_t i, oprsz = simd_oprsz(desc); 1798 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1799 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1800 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 1801 int is_q = oprsz == 16; 1802 uint64_t n_4; 1803 float32 m_1; 1804 1805 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1806 n_4 = load4_f16(vn, is_q, is_2); 1807 1808 /* Negate all inputs for FMLSL at once. */ 1809 if (is_s) { 1810 n_4 ^= 0x8000800080008000ull; 1811 } 1812 1813 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 1814 1815 for (i = 0; i < oprsz / 4; i++) { 1816 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1817 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1818 } 1819 clear_tail(d, oprsz, simd_maxsz(desc)); 1820} 1821 1822void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 1823 void *venv, uint32_t desc) 1824{ 1825 CPUARMState *env = venv; 1826 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1827 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1828} 1829 1830void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 1831 void *venv, uint32_t desc) 1832{ 1833 CPUARMState *env = venv; 1834 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 1835 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1836} 1837 1838void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 1839 void *venv, uint32_t desc) 1840{ 1841 intptr_t i, j, oprsz = simd_oprsz(desc); 1842 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1843 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1844 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 1845 CPUARMState *env = venv; 1846 float_status *status = &env->vfp.fp_status; 1847 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 1848 1849 for (i = 0; i < oprsz; i += 16) { 1850 float16 mm_16 = *(float16 *)(vm + i + idx); 1851 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 1852 1853 for (j = 0; j < 16; j += sizeof(float32)) { 1854 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 1855 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 1856 float32 aa = *(float32 *)(va + H1_4(i + j)); 1857 1858 *(float32 *)(vd + H1_4(i + j)) = 1859 float32_muladd(nn, mm, aa, 0, status); 1860 } 1861 } 1862} 1863 1864void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 1865{ 1866 intptr_t i, opr_sz = simd_oprsz(desc); 1867 int8_t *d = vd, *n = vn, *m = vm; 1868 1869 for (i = 0; i < opr_sz; ++i) { 1870 int8_t mm = m[i]; 1871 int8_t nn = n[i]; 1872 int8_t res = 0; 1873 if (mm >= 0) { 1874 if (mm < 8) { 1875 res = nn << mm; 1876 } 1877 } else { 1878 res = nn >> (mm > -8 ? -mm : 7); 1879 } 1880 d[i] = res; 1881 } 1882 clear_tail(d, opr_sz, simd_maxsz(desc)); 1883} 1884 1885void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 1886{ 1887 intptr_t i, opr_sz = simd_oprsz(desc); 1888 int16_t *d = vd, *n = vn, *m = vm; 1889 1890 for (i = 0; i < opr_sz / 2; ++i) { 1891 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 1892 int16_t nn = n[i]; 1893 int16_t res = 0; 1894 if (mm >= 0) { 1895 if (mm < 16) { 1896 res = nn << mm; 1897 } 1898 } else { 1899 res = nn >> (mm > -16 ? -mm : 15); 1900 } 1901 d[i] = res; 1902 } 1903 clear_tail(d, opr_sz, simd_maxsz(desc)); 1904} 1905 1906void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 1907{ 1908 intptr_t i, opr_sz = simd_oprsz(desc); 1909 uint8_t *d = vd, *n = vn, *m = vm; 1910 1911 for (i = 0; i < opr_sz; ++i) { 1912 int8_t mm = m[i]; 1913 uint8_t nn = n[i]; 1914 uint8_t res = 0; 1915 if (mm >= 0) { 1916 if (mm < 8) { 1917 res = nn << mm; 1918 } 1919 } else { 1920 if (mm > -8) { 1921 res = nn >> -mm; 1922 } 1923 } 1924 d[i] = res; 1925 } 1926 clear_tail(d, opr_sz, simd_maxsz(desc)); 1927} 1928 1929void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 1930{ 1931 intptr_t i, opr_sz = simd_oprsz(desc); 1932 uint16_t *d = vd, *n = vn, *m = vm; 1933 1934 for (i = 0; i < opr_sz / 2; ++i) { 1935 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 1936 uint16_t nn = n[i]; 1937 uint16_t res = 0; 1938 if (mm >= 0) { 1939 if (mm < 16) { 1940 res = nn << mm; 1941 } 1942 } else { 1943 if (mm > -16) { 1944 res = nn >> -mm; 1945 } 1946 } 1947 d[i] = res; 1948 } 1949 clear_tail(d, opr_sz, simd_maxsz(desc)); 1950} 1951 1952/* 1953 * 8x8->8 polynomial multiply. 1954 * 1955 * Polynomial multiplication is like integer multiplication except the 1956 * partial products are XORed, not added. 1957 * 1958 * TODO: expose this as a generic vector operation, as it is a common 1959 * crypto building block. 1960 */ 1961void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 1962{ 1963 intptr_t i, j, opr_sz = simd_oprsz(desc); 1964 uint64_t *d = vd, *n = vn, *m = vm; 1965 1966 for (i = 0; i < opr_sz / 8; ++i) { 1967 uint64_t nn = n[i]; 1968 uint64_t mm = m[i]; 1969 uint64_t rr = 0; 1970 1971 for (j = 0; j < 8; ++j) { 1972 uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; 1973 rr ^= mm & mask; 1974 mm = (mm << 1) & 0xfefefefefefefefeull; 1975 nn >>= 1; 1976 } 1977 d[i] = rr; 1978 } 1979 clear_tail(d, opr_sz, simd_maxsz(desc)); 1980} 1981 1982/* 1983 * 64x64->128 polynomial multiply. 1984 * Because of the lanes are not accessed in strict columns, 1985 * this probably cannot be turned into a generic helper. 1986 */ 1987void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 1988{ 1989 intptr_t i, j, opr_sz = simd_oprsz(desc); 1990 intptr_t hi = simd_data(desc); 1991 uint64_t *d = vd, *n = vn, *m = vm; 1992 1993 for (i = 0; i < opr_sz / 8; i += 2) { 1994 uint64_t nn = n[i + hi]; 1995 uint64_t mm = m[i + hi]; 1996 uint64_t rhi = 0; 1997 uint64_t rlo = 0; 1998 1999 /* Bit 0 can only influence the low 64-bit result. */ 2000 if (nn & 1) { 2001 rlo = mm; 2002 } 2003 2004 for (j = 1; j < 64; ++j) { 2005 uint64_t mask = -((nn >> j) & 1); 2006 rlo ^= (mm << j) & mask; 2007 rhi ^= (mm >> (64 - j)) & mask; 2008 } 2009 d[i] = rlo; 2010 d[i + 1] = rhi; 2011 } 2012 clear_tail(d, opr_sz, simd_maxsz(desc)); 2013} 2014 2015/* 2016 * 8x8->16 polynomial multiply. 2017 * 2018 * The byte inputs are expanded to (or extracted from) half-words. 2019 * Note that neon and sve2 get the inputs from different positions. 2020 * This allows 4 bytes to be processed in parallel with uint64_t. 2021 */ 2022 2023static uint64_t expand_byte_to_half(uint64_t x) 2024{ 2025 return (x & 0x000000ff) 2026 | ((x & 0x0000ff00) << 8) 2027 | ((x & 0x00ff0000) << 16) 2028 | ((x & 0xff000000) << 24); 2029} 2030 2031uint64_t pmull_w(uint64_t op1, uint64_t op2) 2032{ 2033 uint64_t result = 0; 2034 int i; 2035 for (i = 0; i < 16; ++i) { 2036 uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; 2037 result ^= op2 & mask; 2038 op1 >>= 1; 2039 op2 <<= 1; 2040 } 2041 return result; 2042} 2043 2044uint64_t pmull_h(uint64_t op1, uint64_t op2) 2045{ 2046 uint64_t result = 0; 2047 int i; 2048 for (i = 0; i < 8; ++i) { 2049 uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; 2050 result ^= op2 & mask; 2051 op1 >>= 1; 2052 op2 <<= 1; 2053 } 2054 return result; 2055} 2056 2057void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2058{ 2059 int hi = simd_data(desc); 2060 uint64_t *d = vd, *n = vn, *m = vm; 2061 uint64_t nn = n[hi], mm = m[hi]; 2062 2063 d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); 2064 nn >>= 32; 2065 mm >>= 32; 2066 d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); 2067 2068 clear_tail(d, 16, simd_maxsz(desc)); 2069} 2070 2071#ifdef TARGET_AARCH64 2072void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2073{ 2074 int shift = simd_data(desc) * 8; 2075 intptr_t i, opr_sz = simd_oprsz(desc); 2076 uint64_t *d = vd, *n = vn, *m = vm; 2077 2078 for (i = 0; i < opr_sz / 8; ++i) { 2079 uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; 2080 uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; 2081 2082 d[i] = pmull_h(nn, mm); 2083 } 2084} 2085 2086static uint64_t pmull_d(uint64_t op1, uint64_t op2) 2087{ 2088 uint64_t result = 0; 2089 int i; 2090 2091 for (i = 0; i < 32; ++i) { 2092 uint64_t mask = -((op1 >> i) & 1); 2093 result ^= (op2 << i) & mask; 2094 } 2095 return result; 2096} 2097 2098void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2099{ 2100 intptr_t sel = H4(simd_data(desc)); 2101 intptr_t i, opr_sz = simd_oprsz(desc); 2102 uint32_t *n = vn, *m = vm; 2103 uint64_t *d = vd; 2104 2105 for (i = 0; i < opr_sz / 8; ++i) { 2106 d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); 2107 } 2108} 2109#endif 2110 2111#define DO_CMP0(NAME, TYPE, OP) \ 2112void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2113{ \ 2114 intptr_t i, opr_sz = simd_oprsz(desc); \ 2115 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2116 TYPE nn = *(TYPE *)(vn + i); \ 2117 *(TYPE *)(vd + i) = -(nn OP 0); \ 2118 } \ 2119 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2120} 2121 2122DO_CMP0(gvec_ceq0_b, int8_t, ==) 2123DO_CMP0(gvec_clt0_b, int8_t, <) 2124DO_CMP0(gvec_cle0_b, int8_t, <=) 2125DO_CMP0(gvec_cgt0_b, int8_t, >) 2126DO_CMP0(gvec_cge0_b, int8_t, >=) 2127 2128DO_CMP0(gvec_ceq0_h, int16_t, ==) 2129DO_CMP0(gvec_clt0_h, int16_t, <) 2130DO_CMP0(gvec_cle0_h, int16_t, <=) 2131DO_CMP0(gvec_cgt0_h, int16_t, >) 2132DO_CMP0(gvec_cge0_h, int16_t, >=) 2133 2134#undef DO_CMP0 2135 2136#define DO_ABD(NAME, TYPE) \ 2137void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2138{ \ 2139 intptr_t i, opr_sz = simd_oprsz(desc); \ 2140 TYPE *d = vd, *n = vn, *m = vm; \ 2141 \ 2142 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2143 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2144 } \ 2145 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2146} 2147 2148DO_ABD(gvec_sabd_b, int8_t) 2149DO_ABD(gvec_sabd_h, int16_t) 2150DO_ABD(gvec_sabd_s, int32_t) 2151DO_ABD(gvec_sabd_d, int64_t) 2152 2153DO_ABD(gvec_uabd_b, uint8_t) 2154DO_ABD(gvec_uabd_h, uint16_t) 2155DO_ABD(gvec_uabd_s, uint32_t) 2156DO_ABD(gvec_uabd_d, uint64_t) 2157 2158#undef DO_ABD 2159 2160#define DO_ABA(NAME, TYPE) \ 2161void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2162{ \ 2163 intptr_t i, opr_sz = simd_oprsz(desc); \ 2164 TYPE *d = vd, *n = vn, *m = vm; \ 2165 \ 2166 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2167 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2168 } \ 2169 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2170} 2171 2172DO_ABA(gvec_saba_b, int8_t) 2173DO_ABA(gvec_saba_h, int16_t) 2174DO_ABA(gvec_saba_s, int32_t) 2175DO_ABA(gvec_saba_d, int64_t) 2176 2177DO_ABA(gvec_uaba_b, uint8_t) 2178DO_ABA(gvec_uaba_h, uint16_t) 2179DO_ABA(gvec_uaba_s, uint32_t) 2180DO_ABA(gvec_uaba_d, uint64_t) 2181 2182#undef DO_ABA 2183 2184#define DO_NEON_PAIRWISE(NAME, OP) \ 2185 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ 2186 void *stat, uint32_t oprsz) \ 2187 { \ 2188 float_status *fpst = stat; \ 2189 float32 *d = vd; \ 2190 float32 *n = vn; \ 2191 float32 *m = vm; \ 2192 float32 r0, r1; \ 2193 \ 2194 /* Read all inputs before writing outputs in case vm == vd */ \ 2195 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ 2196 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ 2197 \ 2198 d[H4(0)] = r0; \ 2199 d[H4(1)] = r1; \ 2200 } \ 2201 \ 2202 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ 2203 void *stat, uint32_t oprsz) \ 2204 { \ 2205 float_status *fpst = stat; \ 2206 float16 *d = vd; \ 2207 float16 *n = vn; \ 2208 float16 *m = vm; \ 2209 float16 r0, r1, r2, r3; \ 2210 \ 2211 /* Read all inputs before writing outputs in case vm == vd */ \ 2212 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ 2213 r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \ 2214 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ 2215 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ 2216 \ 2217 d[H2(0)] = r0; \ 2218 d[H2(1)] = r1; \ 2219 d[H2(2)] = r2; \ 2220 d[H2(3)] = r3; \ 2221 } 2222 2223DO_NEON_PAIRWISE(neon_padd, add) 2224DO_NEON_PAIRWISE(neon_pmax, max) 2225DO_NEON_PAIRWISE(neon_pmin, min) 2226 2227#undef DO_NEON_PAIRWISE 2228 2229#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2230 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2231 { \ 2232 intptr_t i, oprsz = simd_oprsz(desc); \ 2233 int shift = simd_data(desc); \ 2234 TYPE *d = vd, *n = vn; \ 2235 float_status *fpst = stat; \ 2236 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2237 d[i] = FUNC(n[i], shift, fpst); \ 2238 } \ 2239 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2240 } 2241 2242DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2243DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2244DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2245DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2246DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2247DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2248DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2249DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2250 2251#undef DO_VCVT_FIXED 2252 2253#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2254 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2255 { \ 2256 float_status *fpst = stat; \ 2257 intptr_t i, oprsz = simd_oprsz(desc); \ 2258 uint32_t rmode = simd_data(desc); \ 2259 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2260 TYPE *d = vd, *n = vn; \ 2261 set_float_rounding_mode(rmode, fpst); \ 2262 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2263 d[i] = FUNC(n[i], 0, fpst); \ 2264 } \ 2265 set_float_rounding_mode(prev_rmode, fpst); \ 2266 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2267 } 2268 2269DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2270DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2271DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2272DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2273 2274#undef DO_VCVT_RMODE 2275 2276#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2277 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2278 { \ 2279 float_status *fpst = stat; \ 2280 intptr_t i, oprsz = simd_oprsz(desc); \ 2281 uint32_t rmode = simd_data(desc); \ 2282 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2283 TYPE *d = vd, *n = vn; \ 2284 set_float_rounding_mode(rmode, fpst); \ 2285 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2286 d[i] = FUNC(n[i], fpst); \ 2287 } \ 2288 set_float_rounding_mode(prev_rmode, fpst); \ 2289 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2290 } 2291 2292DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2293DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2294 2295#undef DO_VRINT_RMODE 2296 2297#ifdef TARGET_AARCH64 2298void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2299{ 2300 const uint8_t *indices = vm; 2301 CPUARMState *env = venv; 2302 size_t oprsz = simd_oprsz(desc); 2303 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2304 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2305 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2306 union { 2307 uint8_t b[16]; 2308 uint64_t d[2]; 2309 } result; 2310 2311 /* 2312 * We must construct the final result in a temp, lest the output 2313 * overlaps the input table. For TBL, begin with zero; for TBX, 2314 * begin with the original register contents. Note that we always 2315 * copy 16 bytes here to avoid an extra branch; clearing the high 2316 * bits of the register for oprsz == 8 is handled below. 2317 */ 2318 if (is_tbx) { 2319 memcpy(&result, vd, 16); 2320 } else { 2321 memset(&result, 0, 16); 2322 } 2323 2324 for (size_t i = 0; i < oprsz; ++i) { 2325 uint32_t index = indices[H1(i)]; 2326 2327 if (index < table_len) { 2328 /* 2329 * Convert index (a byte offset into the virtual table 2330 * which is a series of 128-bit vectors concatenated) 2331 * into the correct register element, bearing in mind 2332 * that the table can wrap around from V31 to V0. 2333 */ 2334 const uint8_t *table = (const uint8_t *) 2335 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2336 result.b[H1(i)] = table[H1(index % 16)]; 2337 } 2338 } 2339 2340 memcpy(vd, &result, 16); 2341 clear_tail(vd, oprsz, simd_maxsz(desc)); 2342} 2343#endif 2344 2345/* 2346 * NxN -> N highpart multiply 2347 * 2348 * TODO: expose this as a generic vector operation. 2349 */ 2350 2351void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2352{ 2353 intptr_t i, opr_sz = simd_oprsz(desc); 2354 int8_t *d = vd, *n = vn, *m = vm; 2355 2356 for (i = 0; i < opr_sz; ++i) { 2357 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2358 } 2359 clear_tail(d, opr_sz, simd_maxsz(desc)); 2360} 2361 2362void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2363{ 2364 intptr_t i, opr_sz = simd_oprsz(desc); 2365 int16_t *d = vd, *n = vn, *m = vm; 2366 2367 for (i = 0; i < opr_sz / 2; ++i) { 2368 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2369 } 2370 clear_tail(d, opr_sz, simd_maxsz(desc)); 2371} 2372 2373void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2374{ 2375 intptr_t i, opr_sz = simd_oprsz(desc); 2376 int32_t *d = vd, *n = vn, *m = vm; 2377 2378 for (i = 0; i < opr_sz / 4; ++i) { 2379 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2380 } 2381 clear_tail(d, opr_sz, simd_maxsz(desc)); 2382} 2383 2384void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2385{ 2386 intptr_t i, opr_sz = simd_oprsz(desc); 2387 uint64_t *d = vd, *n = vn, *m = vm; 2388 uint64_t discard; 2389 2390 for (i = 0; i < opr_sz / 8; ++i) { 2391 muls64(&discard, &d[i], n[i], m[i]); 2392 } 2393 clear_tail(d, opr_sz, simd_maxsz(desc)); 2394} 2395 2396void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2397{ 2398 intptr_t i, opr_sz = simd_oprsz(desc); 2399 uint8_t *d = vd, *n = vn, *m = vm; 2400 2401 for (i = 0; i < opr_sz; ++i) { 2402 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2403 } 2404 clear_tail(d, opr_sz, simd_maxsz(desc)); 2405} 2406 2407void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2408{ 2409 intptr_t i, opr_sz = simd_oprsz(desc); 2410 uint16_t *d = vd, *n = vn, *m = vm; 2411 2412 for (i = 0; i < opr_sz / 2; ++i) { 2413 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2414 } 2415 clear_tail(d, opr_sz, simd_maxsz(desc)); 2416} 2417 2418void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2419{ 2420 intptr_t i, opr_sz = simd_oprsz(desc); 2421 uint32_t *d = vd, *n = vn, *m = vm; 2422 2423 for (i = 0; i < opr_sz / 4; ++i) { 2424 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2425 } 2426 clear_tail(d, opr_sz, simd_maxsz(desc)); 2427} 2428 2429void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2430{ 2431 intptr_t i, opr_sz = simd_oprsz(desc); 2432 uint64_t *d = vd, *n = vn, *m = vm; 2433 uint64_t discard; 2434 2435 for (i = 0; i < opr_sz / 8; ++i) { 2436 mulu64(&discard, &d[i], n[i], m[i]); 2437 } 2438 clear_tail(d, opr_sz, simd_maxsz(desc)); 2439} 2440 2441void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2442{ 2443 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2444 int shr = simd_data(desc); 2445 uint64_t *d = vd, *n = vn, *m = vm; 2446 2447 for (i = 0; i < opr_sz; ++i) { 2448 d[i] = ror64(n[i] ^ m[i], shr); 2449 } 2450 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2451} 2452 2453/* 2454 * Integer matrix-multiply accumulate 2455 */ 2456 2457static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2458{ 2459 int8_t *n = vn, *m = vm; 2460 2461 for (intptr_t k = 0; k < 8; ++k) { 2462 sum += n[H1(k)] * m[H1(k)]; 2463 } 2464 return sum; 2465} 2466 2467static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2468{ 2469 uint8_t *n = vn, *m = vm; 2470 2471 for (intptr_t k = 0; k < 8; ++k) { 2472 sum += n[H1(k)] * m[H1(k)]; 2473 } 2474 return sum; 2475} 2476 2477static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2478{ 2479 uint8_t *n = vn; 2480 int8_t *m = vm; 2481 2482 for (intptr_t k = 0; k < 8; ++k) { 2483 sum += n[H1(k)] * m[H1(k)]; 2484 } 2485 return sum; 2486} 2487 2488static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2489 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2490{ 2491 intptr_t seg, opr_sz = simd_oprsz(desc); 2492 2493 for (seg = 0; seg < opr_sz; seg += 16) { 2494 uint32_t *d = vd + seg; 2495 uint32_t *a = va + seg; 2496 uint32_t sum0, sum1, sum2, sum3; 2497 2498 /* 2499 * Process the entire segment at once, writing back the 2500 * results only after we've consumed all of the inputs. 2501 * 2502 * Key to indices by column: 2503 * i j i j 2504 */ 2505 sum0 = a[H4(0 + 0)]; 2506 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2507 sum1 = a[H4(0 + 1)]; 2508 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2509 sum2 = a[H4(2 + 0)]; 2510 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2511 sum3 = a[H4(2 + 1)]; 2512 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2513 2514 d[H4(0)] = sum0; 2515 d[H4(1)] = sum1; 2516 d[H4(2)] = sum2; 2517 d[H4(3)] = sum3; 2518 } 2519 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2520} 2521 2522#define DO_MMLA_B(NAME, INNER) \ 2523 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2524 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2525 2526DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2527DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2528DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2529 2530/* 2531 * BFloat16 Dot Product 2532 */ 2533 2534static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) 2535{ 2536 /* FPCR is ignored for BFDOT and BFMMLA. */ 2537 float_status bf_status = { 2538 .tininess_before_rounding = float_tininess_before_rounding, 2539 .float_rounding_mode = float_round_to_odd_inf, 2540 .flush_to_zero = true, 2541 .flush_inputs_to_zero = true, 2542 .default_nan_mode = true, 2543 }; 2544 float32 t1, t2; 2545 2546 /* 2547 * Extract each BFloat16 from the element pair, and shift 2548 * them such that they become float32. 2549 */ 2550 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); 2551 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); 2552 t1 = float32_add(t1, t2, &bf_status); 2553 t1 = float32_add(sum, t1, &bf_status); 2554 2555 return t1; 2556} 2557 2558void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2559{ 2560 intptr_t i, opr_sz = simd_oprsz(desc); 2561 float32 *d = vd, *a = va; 2562 uint32_t *n = vn, *m = vm; 2563 2564 for (i = 0; i < opr_sz / 4; ++i) { 2565 d[i] = bfdotadd(a[i], n[i], m[i]); 2566 } 2567 clear_tail(d, opr_sz, simd_maxsz(desc)); 2568} 2569 2570void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2571 void *va, uint32_t desc) 2572{ 2573 intptr_t i, j, opr_sz = simd_oprsz(desc); 2574 intptr_t index = simd_data(desc); 2575 intptr_t elements = opr_sz / 4; 2576 intptr_t eltspersegment = MIN(16 / 4, elements); 2577 float32 *d = vd, *a = va; 2578 uint32_t *n = vn, *m = vm; 2579 2580 for (i = 0; i < elements; i += eltspersegment) { 2581 uint32_t m_idx = m[i + H4(index)]; 2582 2583 for (j = i; j < i + eltspersegment; j++) { 2584 d[j] = bfdotadd(a[j], n[j], m_idx); 2585 } 2586 } 2587 clear_tail(d, opr_sz, simd_maxsz(desc)); 2588} 2589 2590void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2591{ 2592 intptr_t s, opr_sz = simd_oprsz(desc); 2593 float32 *d = vd, *a = va; 2594 uint32_t *n = vn, *m = vm; 2595 2596 for (s = 0; s < opr_sz / 4; s += 4) { 2597 float32 sum00, sum01, sum10, sum11; 2598 2599 /* 2600 * Process the entire segment at once, writing back the 2601 * results only after we've consumed all of the inputs. 2602 * 2603 * Key to indicies by column: 2604 * i j i k j k 2605 */ 2606 sum00 = a[s + H4(0 + 0)]; 2607 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); 2608 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); 2609 2610 sum01 = a[s + H4(0 + 1)]; 2611 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); 2612 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); 2613 2614 sum10 = a[s + H4(2 + 0)]; 2615 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); 2616 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); 2617 2618 sum11 = a[s + H4(2 + 1)]; 2619 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); 2620 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); 2621 2622 d[s + H4(0 + 0)] = sum00; 2623 d[s + H4(0 + 1)] = sum01; 2624 d[s + H4(2 + 0)] = sum10; 2625 d[s + H4(2 + 1)] = sum11; 2626 } 2627 clear_tail(d, opr_sz, simd_maxsz(desc)); 2628} 2629 2630void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 2631 void *stat, uint32_t desc) 2632{ 2633 intptr_t i, opr_sz = simd_oprsz(desc); 2634 intptr_t sel = simd_data(desc); 2635 float32 *d = vd, *a = va; 2636 bfloat16 *n = vn, *m = vm; 2637 2638 for (i = 0; i < opr_sz / 4; ++i) { 2639 float32 nn = n[H2(i * 2 + sel)] << 16; 2640 float32 mm = m[H2(i * 2 + sel)] << 16; 2641 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 2642 } 2643 clear_tail(d, opr_sz, simd_maxsz(desc)); 2644} 2645 2646void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 2647 void *va, void *stat, uint32_t desc) 2648{ 2649 intptr_t i, j, opr_sz = simd_oprsz(desc); 2650 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 2651 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 2652 intptr_t elements = opr_sz / 4; 2653 intptr_t eltspersegment = MIN(16 / 4, elements); 2654 float32 *d = vd, *a = va; 2655 bfloat16 *n = vn, *m = vm; 2656 2657 for (i = 0; i < elements; i += eltspersegment) { 2658 float32 m_idx = m[H2(2 * i + index)] << 16; 2659 2660 for (j = i; j < i + eltspersegment; j++) { 2661 float32 n_j = n[H2(2 * j + sel)] << 16; 2662 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 2663 } 2664 } 2665 clear_tail(d, opr_sz, simd_maxsz(desc)); 2666}