neon_helper.c (45008B)
/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include "qemu/osdep.h"

#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Raise the sticky saturation (QC) flag in the vfp status.  */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * Vector element container types: a 32-bit register viewed as 4 x 8-bit,
 * 2 x 16-bit or 1 x 32-bit lanes.  Field order depends on host endianness
 * so that NEON_UNPACK/NEON_PACK below can type-pun through a union.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN to each lane of the unpacked operands.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define a lane-wise binary helper named neon_<name>.  */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state (for SET_QC).  */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}


/* Unsigned saturating add: on overflow set QC and saturate to all-ones.  */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        /* Unsigned wraparound implies overflow.  */
        SET_QC();
        res = ~0;
    }
    return res;
}

uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}

/* Signed saturating add: on overflow set QC and saturate towards the
   sign of src2 (INT<n>_MAX for positive src2, INT<n>_MIN otherwise).  */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    /* Overflow iff operands have the same sign but the result differs.  */
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

/* Unsigned saturating accumulate of signed value
 *
 * Op1/Rn is treated as signed
 * Op2/Rd is treated as unsigned
 *
 * Explicit casting is used to ensure the correct sign extension of
 * inputs. The result is treated as a unsigned value and saturated as such.
 *
 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 */

#define USATACC(bits, shift) \
    do { \
        va = sextract32(a, shift, bits); \
        vb = extract32(b, shift, bits); \
        vr = va + vb; \
        if (vr > UINT##bits##_MAX) { \
            SET_QC(); \
            vr = UINT##bits##_MAX; \
        } else if (vr < 0) { \
            SET_QC(); \
            vr = 0; \
        } \
        r = deposit32(r, shift, bits, vr); \
    } while (0)

uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int16_t va, vb, vr;
    uint32_t r = 0;

    USATACC(8, 0);
    USATACC(8, 8);
    USATACC(8, 16);
    USATACC(8, 24);
    return r;
}

uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int32_t va, vb, vr;
    uint64_t r = 0;

    USATACC(16, 0);
    USATACC(16, 16);
    return r;
}

#undef USATACC

uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* Widen to 64 bits so the sum cannot overflow before saturation.  */
    int64_t va = (int32_t)a;
    int64_t vb = (uint32_t)b;
    int64_t vr = va + vb;
    if (vr > UINT32_MAX) {
        SET_QC();
        vr = UINT32_MAX;
    } else if (vr < 0) {
        SET_QC();
        vr = 0;
    }
    return vr;
}

uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect
     * +ve/-ve saturation
     */
    if (~a & b & ~res & SIGNBIT64) {
        SET_QC();
        res = UINT64_MAX;
    } else if (a & ~b & res & SIGNBIT64) {
        SET_QC();
        res = 0;
    }
    return res;
}

/* Signed saturating accumulate of unsigned value
 *
 * Op1/Rn is treated as unsigned
 * Op2/Rd is treated as signed
 *
 * The result is treated as a signed value and saturated as such
 *
 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 */

#define SSATACC(bits, shift) \
    do { \
        va = extract32(a, shift, bits); \
        vb = sextract32(b, shift, bits); \
        vr = va + vb; \
        if (vr > INT##bits##_MAX) { \
            SET_QC(); \
            vr = INT##bits##_MAX; \
        } else if (vr < INT##bits##_MIN) { \
            SET_QC(); \
            vr = INT##bits##_MIN; \
        } \
        r = deposit32(r, shift, bits, vr); \
    } while (0)

uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int16_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(8, 0);
    SSATACC(8, 8);
    SSATACC(8, 16);
    SSATACC(8, 24);
    return r;
}

uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int32_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(16, 0);
    SSATACC(16, 16);

    return r;
}

#undef SSATACC

uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int64_t res;
    int64_t op1 = (uint32_t)a;
    int64_t op2 = (int32_t)b;
    res = op1 + op2;
    if (res > INT32_MAX) {
        SET_QC();
        res = INT32_MAX;
    } else if (res < INT32_MIN) {
        SET_QC();
        res = INT32_MIN;
    }
    return res;
}

uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect an overflow */
    if (((a & res)
         | (~b & res)
         | (a & ~b)) & SIGNBIT64) {
        SET_QC();
        res = INT64_MAX;
    }
    return res;
}


/* Unsigned saturating subtract: on underflow set QC and clamp to 0.  */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        SET_QC();
        res = 0;
    }
    return res;
}

uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}

/* Signed saturating subtract: on overflow set QC and saturate away
   from the sign of src2.  */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

/* Halving add: (src1 + src2) >> 1, no overflow in the intermediate
   because the per-lane types are narrower than the arithmetic type.  */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    /* Shift each operand first to avoid overflowing the sum; the
       carry from the two low bits is added back explicitly.  */
    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

/* Rounding halving add: (src1 + src2 + 1) >> 1.  */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

/* Halving subtract: (src1 - src2) >> 1.  */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

/* Pairwise min/max.  */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/* Shifts: implemented via the shared do_[s|u|su]qrshl_* helpers from
   vec_internal.h.  The shift count is the signed low byte of src2;
   the bool selects rounding, the final arg is the saturation flag
   pointer (NULL for non-saturating variants).  */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Lane-wise modular add without carry between lanes: the top bit of
   each lane is masked off, added separately, then xor-ed back in.  */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* VTST: all-ones if any common bit set, else zero.  */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
static inline int do_clz8(uint8_t x)
{
    /* Returns 8 for x == 0.  */
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    /* Returns 16 for x == 0.  */
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    /* Parallel popcount of each byte lane.  */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}

/* Saturating doubling multiply returning high half; "round" adds
   the rounding constant before taking the high half.  */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Narrow each 16-bit lane of a 64-bit value to 8 bits (low halves).  */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow by taking the high half of each lane.  */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* As above, but add the rounding constant to each lane first.  */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Narrow signed 16-bit lanes to unsigned 8-bit with saturation.  */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        /* Negative source lane: clamp to 0.  */
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

/* Widen each 8-bit lane to a 16-bit lane of a 64-bit result.  */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Lane-wise modular add of 64-bit values, using the same
   mask-top-bit trick as neon_add_u8/u16.  */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise add-long: sum adjacent 16-bit lanes of each input into
   32-bit lanes, interleaving the results from a (low) and b (high).  */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lane-wise modular subtract without borrow between lanes.  */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

/* Saturating add of two 32-bit lanes held in 64-bit values.  */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Lane-wise two's-complement negate of 16-bit lanes.  */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
/* abs() saturating only at the most-negative value (0x80 -> 0x7f + QC).  */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if
((int32_t)x < 0) { 1448 x = -x; 1449 } 1450 return x; 1451} 1452 1453uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) 1454{ 1455 if (x == SIGNBIT) { 1456 SET_QC(); 1457 x = ~SIGNBIT; 1458 } else { 1459 x = -x; 1460 } 1461 return x; 1462} 1463 1464uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) 1465{ 1466 if (x == SIGNBIT64) { 1467 SET_QC(); 1468 x = ~SIGNBIT64; 1469 } else if ((int64_t)x < 0) { 1470 x = -x; 1471 } 1472 return x; 1473} 1474 1475uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) 1476{ 1477 if (x == SIGNBIT64) { 1478 SET_QC(); 1479 x = ~SIGNBIT64; 1480 } else { 1481 x = -x; 1482 } 1483 return x; 1484} 1485 1486/* NEON Float helpers. */ 1487 1488/* Floating point comparisons produce an integer result. 1489 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1490 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1491 */ 1492uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) 1493{ 1494 float_status *fpst = fpstp; 1495 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); 1496} 1497 1498uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) 1499{ 1500 float_status *fpst = fpstp; 1501 return -float32_le(make_float32(b), make_float32(a), fpst); 1502} 1503 1504uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1505{ 1506 float_status *fpst = fpstp; 1507 return -float32_lt(make_float32(b), make_float32(a), fpst); 1508} 1509 1510uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) 1511{ 1512 float_status *fpst = fpstp; 1513 float32 f0 = float32_abs(make_float32(a)); 1514 float32 f1 = float32_abs(make_float32(b)); 1515 return -float32_le(f1, f0, fpst); 1516} 1517 1518uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1519{ 1520 float_status *fpst = fpstp; 1521 float32 f0 = float32_abs(make_float32(a)); 1522 float32 f1 = float32_abs(make_float32(b)); 1523 return -float32_lt(f1, f0, fpst); 1524} 1525 
1526uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) 1527{ 1528 float_status *fpst = fpstp; 1529 float64 f0 = float64_abs(make_float64(a)); 1530 float64 f1 = float64_abs(make_float64(b)); 1531 return -float64_le(f1, f0, fpst); 1532} 1533 1534uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) 1535{ 1536 float_status *fpst = fpstp; 1537 float64 f0 = float64_abs(make_float64(a)); 1538 float64 f1 = float64_abs(make_float64(b)); 1539 return -float64_lt(f1, f0, fpst); 1540} 1541 1542#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) 1543 1544void HELPER(neon_qunzip8)(void *vd, void *vm) 1545{ 1546 uint64_t *rd = vd, *rm = vm; 1547 uint64_t zd0 = rd[0], zd1 = rd[1]; 1548 uint64_t zm0 = rm[0], zm1 = rm[1]; 1549 1550 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) 1551 | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) 1552 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) 1553 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); 1554 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) 1555 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) 1556 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1557 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); 1558 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) 1559 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) 1560 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) 1561 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); 1562 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) 1563 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) 1564 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) 1565 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1566 1567 rm[0] = m0; 1568 rm[1] = m1; 1569 rd[0] = d0; 1570 rd[1] = d1; 1571} 1572 1573void HELPER(neon_qunzip16)(void *vd, void *vm) 1574{ 1575 uint64_t *rd = vd, *rm = vm; 1576 uint64_t zd0 = rd[0], zd1 = rd[1]; 1577 uint64_t zm0 = rm[0], zm1 = rm[1]; 1578 1579 uint64_t d0 = ELEM(zd0, 0, 16) | 
(ELEM(zd0, 2, 16) << 16) 1580 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); 1581 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) 1582 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); 1583 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) 1584 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); 1585 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) 1586 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1587 1588 rm[0] = m0; 1589 rm[1] = m1; 1590 rd[0] = d0; 1591 rd[1] = d1; 1592} 1593 1594void HELPER(neon_qunzip32)(void *vd, void *vm) 1595{ 1596 uint64_t *rd = vd, *rm = vm; 1597 uint64_t zd0 = rd[0], zd1 = rd[1]; 1598 uint64_t zm0 = rm[0], zm1 = rm[1]; 1599 1600 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); 1601 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1602 uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); 1603 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1604 1605 rm[0] = m0; 1606 rm[1] = m1; 1607 rd[0] = d0; 1608 rd[1] = d1; 1609} 1610 1611void HELPER(neon_unzip8)(void *vd, void *vm) 1612{ 1613 uint64_t *rd = vd, *rm = vm; 1614 uint64_t zd = rd[0], zm = rm[0]; 1615 1616 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) 1617 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) 1618 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1619 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); 1620 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) 1621 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) 1622 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) 1623 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1624 1625 rm[0] = m0; 1626 rd[0] = d0; 1627} 1628 1629void HELPER(neon_unzip16)(void *vd, void *vm) 1630{ 1631 uint64_t *rd = vd, *rm = vm; 1632 uint64_t zd = rd[0], zm = rm[0]; 1633 1634 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) 1635 | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1636 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 
1637 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1638 1639 rm[0] = m0; 1640 rd[0] = d0; 1641} 1642 1643void HELPER(neon_qzip8)(void *vd, void *vm) 1644{ 1645 uint64_t *rd = vd, *rm = vm; 1646 uint64_t zd0 = rd[0], zd1 = rd[1]; 1647 uint64_t zm0 = rm[0], zm1 = rm[1]; 1648 1649 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1650 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1651 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1652 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1653 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1654 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1655 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1656 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1657 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1658 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1659 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1660 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1661 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1662 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1663 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1664 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1665 1666 rm[0] = m0; 1667 rm[1] = m1; 1668 rd[0] = d0; 1669 rd[1] = d1; 1670} 1671 1672void HELPER(neon_qzip16)(void *vd, void *vm) 1673{ 1674 uint64_t *rd = vd, *rm = vm; 1675 uint64_t zd0 = rd[0], zd1 = rd[1]; 1676 uint64_t zm0 = rm[0], zm1 = rm[1]; 1677 1678 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1679 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1680 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1681 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1682 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1683 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1684 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) 1685 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1686 1687 rm[0] = m0; 1688 rm[1] = m1; 
1689 rd[0] = d0; 1690 rd[1] = d1; 1691} 1692 1693void HELPER(neon_qzip32)(void *vd, void *vm) 1694{ 1695 uint64_t *rd = vd, *rm = vm; 1696 uint64_t zd0 = rd[0], zd1 = rd[1]; 1697 uint64_t zm0 = rm[0], zm1 = rm[1]; 1698 1699 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1700 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1701 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1702 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1703 1704 rm[0] = m0; 1705 rm[1] = m1; 1706 rd[0] = d0; 1707 rd[1] = d1; 1708} 1709 1710void HELPER(neon_zip8)(void *vd, void *vm) 1711{ 1712 uint64_t *rd = vd, *rm = vm; 1713 uint64_t zd = rd[0], zm = rm[0]; 1714 1715 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1716 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1717 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1718 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1719 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1720 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1721 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1722 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1723 1724 rm[0] = m0; 1725 rd[0] = d0; 1726} 1727 1728void HELPER(neon_zip16)(void *vd, void *vm) 1729{ 1730 uint64_t *rd = vd, *rm = vm; 1731 uint64_t zd = rd[0], zm = rm[0]; 1732 1733 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1734 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1735 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1736 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1737 1738 rm[0] = m0; 1739 rd[0] = d0; 1740}