cachepc-qemu

Fork of AMDESE/qemu with changes for the cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

helper-a64.c (33771B)


      1/*
      2 *  AArch64 specific helpers
      3 *
      4 *  Copyright (c) 2013 Alexander Graf <agraf@suse.de>
      5 *
      6 * This library is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * This library is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
     18 */
     19
     20#include "qemu/osdep.h"
     21#include "qemu/units.h"
     22#include "cpu.h"
     23#include "exec/gdbstub.h"
     24#include "exec/helper-proto.h"
     25#include "qemu/host-utils.h"
     26#include "qemu/log.h"
     27#include "qemu/main-loop.h"
     28#include "qemu/bitops.h"
     29#include "internals.h"
     30#include "qemu/crc32c.h"
     31#include "exec/exec-all.h"
     32#include "exec/cpu_ldst.h"
     33#include "qemu/int128.h"
     34#include "qemu/atomic128.h"
     35#include "tcg/tcg.h"
     36#include "fpu/softfloat.h"
     37#include <zlib.h> /* For crc32 */
     38
     39/* C2.4.7 Multiply and divide */
     40/* special cases for 0 and LLONG_MIN are mandated by the standard */
     41uint64_t HELPER(udiv64)(uint64_t num, uint64_t den)
     42{
     43    if (den == 0) {
     44        return 0;
     45    }
     46    return num / den;
     47}
     48
     49int64_t HELPER(sdiv64)(int64_t num, int64_t den)
     50{
     51    if (den == 0) {
     52        return 0;
     53    }
     54    if (num == LLONG_MIN && den == -1) {
     55        return LLONG_MIN;
     56    }
     57    return num / den;
     58}
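
The two division helpers above encode the AArch64 UDIV/SDIV edge cases: dividing by zero yields 0 rather than trapping, and INT64_MIN / -1 wraps to INT64_MIN. A quick standalone check of those rules (udiv64_ref/sdiv64_ref are hypothetical stand-ins for the helpers above, not part of this file):

#include <assert.h>
#include <stdint.h>

static uint64_t udiv64_ref(uint64_t num, uint64_t den)
{
    return den == 0 ? 0 : num / den;       /* UDIV: x / 0 == 0 */
}

static int64_t sdiv64_ref(int64_t num, int64_t den)
{
    if (den == 0) {
        return 0;                          /* SDIV: x / 0 == 0 */
    }
    if (num == INT64_MIN && den == -1) {
        return INT64_MIN;                  /* overflow wraps instead of trapping */
    }
    return num / den;
}

int main(void)
{
    assert(udiv64_ref(42, 0) == 0);
    assert(sdiv64_ref(INT64_MIN, -1) == INT64_MIN);
    assert(sdiv64_ref(-7, 2) == -3);       /* C division truncates toward zero */
    return 0;
}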
     59
     60uint64_t HELPER(rbit64)(uint64_t x)
     61{
     62    return revbit64(x);
     63}
     64
     65void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm)
     66{
     67    update_spsel(env, imm);
     68}
     69
     70static void daif_check(CPUARMState *env, uint32_t op,
     71                       uint32_t imm, uintptr_t ra)
     72{
     73    /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set.  */
     74    if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) {
     75        raise_exception_ra(env, EXCP_UDEF,
     76                           syn_aa64_sysregtrap(0, extract32(op, 0, 3),
     77                                               extract32(op, 3, 3), 4,
     78                                               imm, 0x1f, 0),
     79                           exception_target_el(env), ra);
     80    }
     81}
     82
     83void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm)
     84{
     85    daif_check(env, 0x1e, imm, GETPC());
     86    env->daif |= (imm << 6) & PSTATE_DAIF;
     87}
     88
     89void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm)
     90{
     91    daif_check(env, 0x1f, imm, GETPC());
     92    env->daif &= ~((imm << 6) & PSTATE_DAIF);
     93}
     94
     95/* Convert a softfloat float_relation_ (as returned by
     96 * the float*_compare functions) to the correct ARM
     97 * NZCV flag state.
     98 */
     99static inline uint32_t float_rel_to_flags(int res)
    100{
    101    uint64_t flags;
    102    switch (res) {
    103    case float_relation_equal:
    104        flags = PSTATE_Z | PSTATE_C;
    105        break;
    106    case float_relation_less:
    107        flags = PSTATE_N;
    108        break;
    109    case float_relation_greater:
    110        flags = PSTATE_C;
    111        break;
    112    case float_relation_unordered:
    113    default:
    114        flags = PSTATE_C | PSTATE_V;
    115        break;
    116    }
    117    return flags;
    118}
    119
    120uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status)
    121{
    122    return float_rel_to_flags(float16_compare_quiet(x, y, fp_status));
    123}
    124
    125uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status)
    126{
    127    return float_rel_to_flags(float16_compare(x, y, fp_status));
    128}
    129
    130uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status)
    131{
    132    return float_rel_to_flags(float32_compare_quiet(x, y, fp_status));
    133}
    134
    135uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status)
    136{
    137    return float_rel_to_flags(float32_compare(x, y, fp_status));
    138}
    139
    140uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status)
    141{
    142    return float_rel_to_flags(float64_compare_quiet(x, y, fp_status));
    143}
    144
    145uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status)
    146{
    147    return float_rel_to_flags(float64_compare(x, y, fp_status));
    148}
    149
    150float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp)
    151{
    152    float_status *fpst = fpstp;
    153
    154    a = float32_squash_input_denormal(a, fpst);
    155    b = float32_squash_input_denormal(b, fpst);
    156
    157    if ((float32_is_zero(a) && float32_is_infinity(b)) ||
    158        (float32_is_infinity(a) && float32_is_zero(b))) {
    159        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
    160        return make_float32((1U << 30) |
    161                            ((float32_val(a) ^ float32_val(b)) & (1U << 31)));
    162    }
    163    return float32_mul(a, b, fpst);
    164}
    165
    166float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp)
    167{
    168    float_status *fpst = fpstp;
    169
    170    a = float64_squash_input_denormal(a, fpst);
    171    b = float64_squash_input_denormal(b, fpst);
    172
    173    if ((float64_is_zero(a) && float64_is_infinity(b)) ||
    174        (float64_is_infinity(a) && float64_is_zero(b))) {
    175        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
    176        return make_float64((1ULL << 62) |
    177                            ((float64_val(a) ^ float64_val(b)) & (1ULL << 63)));
    178    }
    179    return float64_mul(a, b, fpst);
    180}
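
The 0 * infinity special case in the two FMULX helpers above returns 2.0 with a computed sign, built directly from IEEE 754 bit patterns: 1U << 30 is 0x40000000, i.e. binary32 with exponent field 128 and zero fraction, which is 2.0f, and 1ULL << 62 is the corresponding binary64 encoding of 2.0. A small host-side sketch (assuming IEEE 754 host floats) that confirms the constants:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    float f;
    double d;
    uint32_t fbits = 1U << 30;     /* sign 0, exponent 128, fraction 0 */
    uint64_t dbits = 1ULL << 62;   /* sign 0, exponent 1024, fraction 0 */

    memcpy(&f, &fbits, sizeof(f));
    memcpy(&d, &dbits, sizeof(d));
    assert(f == 2.0f && d == 2.0);
    return 0;
}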
    181
    182/* 64bit/double versions of the neon float compare functions */
    183uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
    184{
    185    float_status *fpst = fpstp;
    186    return -float64_eq_quiet(a, b, fpst);
    187}
    188
    189uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp)
    190{
    191    float_status *fpst = fpstp;
    192    return -float64_le(b, a, fpst);
    193}
    194
    195uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
    196{
    197    float_status *fpst = fpstp;
    198    return -float64_lt(b, a, fpst);
    199}
    200
    201/* Reciprocal step and sqrt step. Note that unlike the A32/T32
    202 * versions, these do a fully fused multiply-add or
    203 * multiply-add-and-halve.
    204 */
    205
    206uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp)
    207{
    208    float_status *fpst = fpstp;
    209
    210    a = float16_squash_input_denormal(a, fpst);
    211    b = float16_squash_input_denormal(b, fpst);
    212
    213    a = float16_chs(a);
    214    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
    215        (float16_is_infinity(b) && float16_is_zero(a))) {
    216        return float16_two;
    217    }
    218    return float16_muladd(a, b, float16_two, 0, fpst);
    219}
    220
    221float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp)
    222{
    223    float_status *fpst = fpstp;
    224
    225    a = float32_squash_input_denormal(a, fpst);
    226    b = float32_squash_input_denormal(b, fpst);
    227
    228    a = float32_chs(a);
    229    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
    230        (float32_is_infinity(b) && float32_is_zero(a))) {
    231        return float32_two;
    232    }
    233    return float32_muladd(a, b, float32_two, 0, fpst);
    234}
    235
    236float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp)
    237{
    238    float_status *fpst = fpstp;
    239
    240    a = float64_squash_input_denormal(a, fpst);
    241    b = float64_squash_input_denormal(b, fpst);
    242
    243    a = float64_chs(a);
    244    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
    245        (float64_is_infinity(b) && float64_is_zero(a))) {
    246        return float64_two;
    247    }
    248    return float64_muladd(a, b, float64_two, 0, fpst);
    249}
    250
    251uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp)
    252{
    253    float_status *fpst = fpstp;
    254
    255    a = float16_squash_input_denormal(a, fpst);
    256    b = float16_squash_input_denormal(b, fpst);
    257
    258    a = float16_chs(a);
    259    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
    260        (float16_is_infinity(b) && float16_is_zero(a))) {
    261        return float16_one_point_five;
    262    }
    263    return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
    264}
    265
    266float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp)
    267{
    268    float_status *fpst = fpstp;
    269
    270    a = float32_squash_input_denormal(a, fpst);
    271    b = float32_squash_input_denormal(b, fpst);
    272
    273    a = float32_chs(a);
    274    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
    275        (float32_is_infinity(b) && float32_is_zero(a))) {
    276        return float32_one_point_five;
    277    }
    278    return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
    279}
    280
    281float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp)
    282{
    283    float_status *fpst = fpstp;
    284
    285    a = float64_squash_input_denormal(a, fpst);
    286    b = float64_squash_input_denormal(b, fpst);
    287
    288    a = float64_chs(a);
    289    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
    290        (float64_is_infinity(b) && float64_is_zero(a))) {
    291        return float64_one_point_five;
    292    }
    293    return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
    294}
    295
    296/* Pairwise long add: add pairs of adjacent elements into
    297 * double-width elements in the result (eg _s8 is an 8x8->16 op)
    298 */
    299uint64_t HELPER(neon_addlp_s8)(uint64_t a)
    300{
    301    uint64_t nsignmask = 0x0080008000800080ULL;
    302    uint64_t wsignmask = 0x8000800080008000ULL;
    303    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
    304    uint64_t tmp1, tmp2;
    305    uint64_t res, signres;
    306
    307    /* Extract odd elements, sign extend each to a 16 bit field */
    308    tmp1 = a & elementmask;
    309    tmp1 ^= nsignmask;
    310    tmp1 |= wsignmask;
    311    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
    312    /* Ditto for the even elements */
    313    tmp2 = (a >> 8) & elementmask;
    314    tmp2 ^= nsignmask;
    315    tmp2 |= wsignmask;
    316    tmp2 = (tmp2 - nsignmask) ^ wsignmask;
    317
    318    /* calculate the result by summing bits 0..14, 16..22, etc,
    319     * and then adjusting the sign bits 15, 23, etc manually.
    320     * This ensures the addition can't overflow the 16 bit field.
    321     */
    322    signres = (tmp1 ^ tmp2) & wsignmask;
    323    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
    324    res ^= signres;
    325
    326    return res;
    327}
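
The helper above sign-extends each byte to 16 bits with a SWAR trick ((x ^ 0x80) - 0x80 done in all four lanes at once, with each lane's top bit pre-set so the subtraction cannot borrow into the next lane) and then adds the two byte columns while keeping bit 15 of each lane out of the carry chain. A plain per-element reference, useful for convincing yourself the bit twiddling is equivalent (neon_addlp_s8_ref is hypothetical, not part of this file):

#include <stdint.h>

static uint64_t neon_addlp_s8_ref(uint64_t a)
{
    uint64_t res = 0;

    for (int i = 0; i < 4; i++) {
        int8_t lo = (int8_t)(a >> (16 * i));        /* even byte of the pair */
        int8_t hi = (int8_t)(a >> (16 * i + 8));    /* odd byte of the pair */
        uint16_t sum = (uint16_t)(lo + hi);         /* 16-bit lane result */

        res |= (uint64_t)sum << (16 * i);
    }
    return res;
}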
    328
    329uint64_t HELPER(neon_addlp_u8)(uint64_t a)
    330{
    331    uint64_t tmp;
    332
    333    tmp = a & 0x00ff00ff00ff00ffULL;
    334    tmp += (a >> 8) & 0x00ff00ff00ff00ffULL;
    335    return tmp;
    336}
    337
    338uint64_t HELPER(neon_addlp_s16)(uint64_t a)
    339{
    340    int32_t reslo, reshi;
    341
    342    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
    343    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);
    344
    345    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
    346}
    347
    348uint64_t HELPER(neon_addlp_u16)(uint64_t a)
    349{
    350    uint64_t tmp;
    351
    352    tmp = a & 0x0000ffff0000ffffULL;
    353    tmp += (a >> 16) & 0x0000ffff0000ffffULL;
    354    return tmp;
    355}
    356
    357/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
    358uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp)
    359{
    360    float_status *fpst = fpstp;
    361    uint16_t val16, sbit;
    362    int16_t exp;
    363
    364    if (float16_is_any_nan(a)) {
    365        float16 nan = a;
    366        if (float16_is_signaling_nan(a, fpst)) {
    367            float_raise(float_flag_invalid, fpst);
    368            if (!fpst->default_nan_mode) {
    369                nan = float16_silence_nan(a, fpst);
    370            }
    371        }
    372        if (fpst->default_nan_mode) {
    373            nan = float16_default_nan(fpst);
    374        }
    375        return nan;
    376    }
    377
    378    a = float16_squash_input_denormal(a, fpst);
    379
    380    val16 = float16_val(a);
    381    sbit = 0x8000 & val16;
    382    exp = extract32(val16, 10, 5);
    383
    384    if (exp == 0) {
    385        return make_float16(deposit32(sbit, 10, 5, 0x1e));
    386    } else {
    387        return make_float16(deposit32(sbit, 10, 5, ~exp));
    388    }
    389}
    390
    391float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
    392{
    393    float_status *fpst = fpstp;
    394    uint32_t val32, sbit;
    395    int32_t exp;
    396
    397    if (float32_is_any_nan(a)) {
    398        float32 nan = a;
    399        if (float32_is_signaling_nan(a, fpst)) {
    400            float_raise(float_flag_invalid, fpst);
    401            if (!fpst->default_nan_mode) {
    402                nan = float32_silence_nan(a, fpst);
    403            }
    404        }
    405        if (fpst->default_nan_mode) {
    406            nan = float32_default_nan(fpst);
    407        }
    408        return nan;
    409    }
    410
    411    a = float32_squash_input_denormal(a, fpst);
    412
    413    val32 = float32_val(a);
    414    sbit = 0x80000000ULL & val32;
    415    exp = extract32(val32, 23, 8);
    416
    417    if (exp == 0) {
    418        return make_float32(sbit | (0xfe << 23));
    419    } else {
    420        return make_float32(sbit | (~exp & 0xff) << 23);
    421    }
    422}
    423
    424float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
    425{
    426    float_status *fpst = fpstp;
    427    uint64_t val64, sbit;
    428    int64_t exp;
    429
    430    if (float64_is_any_nan(a)) {
    431        float64 nan = a;
    432        if (float64_is_signaling_nan(a, fpst)) {
    433            float_raise(float_flag_invalid, fpst);
    434            if (!fpst->default_nan_mode) {
    435                nan = float64_silence_nan(a, fpst);
    436            }
    437        }
    438        if (fpst->default_nan_mode) {
    439            nan = float64_default_nan(fpst);
    440        }
    441        return nan;
    442    }
    443
    444    a = float64_squash_input_denormal(a, fpst);
    445
    446    val64 = float64_val(a);
    447    sbit = 0x8000000000000000ULL & val64;
    448    exp = extract64(float64_val(a), 52, 11);
    449
    450    if (exp == 0) {
    451        return make_float64(sbit | (0x7feULL << 52));
    452    } else {
    453        return make_float64(sbit | (~exp & 0x7ffULL) << 52);
    454    }
    455}
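
In other words, FRECPX keeps only the sign, zeroes the fraction and replaces the exponent field e with its bitwise complement (255 - e for binary32, 2047 - e for binary64), so for exact powers of two the result works out to 2/x: frecpx(1.0) = 2.0, frecpx(2.0) = 1.0, frecpx(4.0) = 0.5. Zeros and denormals take the exp == 0 branch instead and get the largest power-of-two exponent (0x1e, 0xfe and 0x7fe for the three widths).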
    456
    457float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env)
    458{
    459    /* Von Neumann rounding is implemented by using round-to-zero
    460     * and then setting the LSB of the result if Inexact was raised.
    461     */
    462    float32 r;
    463    float_status *fpst = &env->vfp.fp_status;
    464    float_status tstat = *fpst;
    465    int exflags;
    466
    467    set_float_rounding_mode(float_round_to_zero, &tstat);
    468    set_float_exception_flags(0, &tstat);
    469    r = float64_to_float32(a, &tstat);
    470    exflags = get_float_exception_flags(&tstat);
    471    if (exflags & float_flag_inexact) {
    472        r = make_float32(float32_val(r) | 1);
    473    }
    474    exflags |= get_float_exception_flags(fpst);
    475    set_float_exception_flags(exflags, fpst);
    476    return r;
    477}
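
This is "round to odd": truncate toward zero, then force the result's least significant bit to 1 if anything was discarded, which preserves enough information that a later correctly rounded narrowing cannot double-round. A host-side sketch of the same idea using <fenv.h> (illustrative only; the helper above uses QEMU's softfloat, and a strictly conforming program would also need #pragma STDC FENV_ACCESS ON):

#include <fenv.h>
#include <stdint.h>
#include <string.h>

static float narrow_round_to_odd(double a)
{
    float r;
    uint32_t bits;

    feclearexcept(FE_INEXACT);
    fesetround(FE_TOWARDZERO);
    r = (float)a;                          /* truncate toward zero */
    if (fetestexcept(FE_INEXACT)) {
        memcpy(&bits, &r, sizeof(bits));
        bits |= 1;                         /* force the LSB: "round to odd" */
        memcpy(&r, &bits, sizeof(r));
    }
    return r;
}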
    478
    479/* 64-bit versions of the CRC helpers. Note that although the operation
     480 * (and the prototypes of crc32c() and crc32()) mean that only the bottom
    481 * 32 bits of the accumulator and result are used, we pass and return
    482 * uint64_t for convenience of the generated code. Unlike the 32-bit
    483 * instruction set versions, val may genuinely have 64 bits of data in it.
    484 * The upper bytes of val (above the number specified by 'bytes') must have
    485 * been zeroed out by the caller.
    486 */
    487uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes)
    488{
    489    uint8_t buf[8];
    490
    491    stq_le_p(buf, val);
    492
    493    /* zlib crc32 converts the accumulator and output to one's complement.  */
    494    return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff;
    495}
    496
    497uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
    498{
    499    uint8_t buf[8];
    500
    501    stq_le_p(buf, val);
    502
    503    /* Linux crc32c converts the output to one's complement.  */
    504    return crc32c(acc, buf, bytes) ^ 0xffffffff;
    505}
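
Both CRC helpers store the operand little-endian into a byte buffer and hand it to a library routine; zlib's crc32() applies its own pre- and post-inversion, so the raw accumulator is inverted on the way in and the result on the way out, while crc32c() only needs the final inversion. A hypothetical sketch of the zlib-based path for a full 8-byte CRC32X-style step (crc32x_ref is not part of this file):

#include <stdint.h>
#include <zlib.h>

static uint32_t crc32x_ref(uint32_t acc, uint64_t val)
{
    uint8_t buf[8];

    for (int i = 0; i < 8; i++) {
        buf[i] = (uint8_t)(val >> (8 * i));    /* little-endian, as stq_le_p() */
    }
    /* undo zlib's internal pre/post one's-complement of the CRC */
    return (uint32_t)crc32(acc ^ 0xffffffffu, buf, 8) ^ 0xffffffffu;
}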
    506
    507uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr,
    508                                     uint64_t new_lo, uint64_t new_hi)
    509{
    510    Int128 cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
    511    Int128 newv = int128_make128(new_lo, new_hi);
    512    Int128 oldv;
    513    uintptr_t ra = GETPC();
    514    uint64_t o0, o1;
    515    bool success;
    516
    517#ifdef CONFIG_USER_ONLY
    518    /* ??? Enforce alignment.  */
    519    uint64_t *haddr = g2h(env_cpu(env), addr);
    520
    521    set_helper_retaddr(ra);
    522    o0 = ldq_le_p(haddr + 0);
    523    o1 = ldq_le_p(haddr + 1);
    524    oldv = int128_make128(o0, o1);
    525
    526    success = int128_eq(oldv, cmpv);
    527    if (success) {
    528        stq_le_p(haddr + 0, int128_getlo(newv));
    529        stq_le_p(haddr + 1, int128_gethi(newv));
    530    }
    531    clear_helper_retaddr();
    532#else
    533    int mem_idx = cpu_mmu_index(env, false);
    534    MemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
    535    MemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx);
    536
    537    o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra);
    538    o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra);
    539    oldv = int128_make128(o0, o1);
    540
    541    success = int128_eq(oldv, cmpv);
    542    if (success) {
    543        helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra);
    544        helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra);
    545    }
    546#endif
    547
    548    return !success;
    549}
    550
    551uint64_t HELPER(paired_cmpxchg64_le_parallel)(CPUARMState *env, uint64_t addr,
    552                                              uint64_t new_lo, uint64_t new_hi)
    553{
    554    Int128 oldv, cmpv, newv;
    555    uintptr_t ra = GETPC();
    556    bool success;
    557    int mem_idx;
    558    MemOpIdx oi;
    559
    560    assert(HAVE_CMPXCHG128);
    561
    562    mem_idx = cpu_mmu_index(env, false);
    563    oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
    564
    565    cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
    566    newv = int128_make128(new_lo, new_hi);
    567    oldv = cpu_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
    568
    569    success = int128_eq(oldv, cmpv);
    570    return !success;
    571}
    572
    573uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr,
    574                                     uint64_t new_lo, uint64_t new_hi)
    575{
    576    /*
    577     * High and low need to be switched here because this is not actually a
    578     * 128bit store but two doublewords stored consecutively
    579     */
    580    Int128 cmpv = int128_make128(env->exclusive_high, env->exclusive_val);
    581    Int128 newv = int128_make128(new_hi, new_lo);
    582    Int128 oldv;
    583    uintptr_t ra = GETPC();
    584    uint64_t o0, o1;
    585    bool success;
    586
    587#ifdef CONFIG_USER_ONLY
    588    /* ??? Enforce alignment.  */
    589    uint64_t *haddr = g2h(env_cpu(env), addr);
    590
    591    set_helper_retaddr(ra);
    592    o1 = ldq_be_p(haddr + 0);
    593    o0 = ldq_be_p(haddr + 1);
    594    oldv = int128_make128(o0, o1);
    595
    596    success = int128_eq(oldv, cmpv);
    597    if (success) {
    598        stq_be_p(haddr + 0, int128_gethi(newv));
    599        stq_be_p(haddr + 1, int128_getlo(newv));
    600    }
    601    clear_helper_retaddr();
    602#else
    603    int mem_idx = cpu_mmu_index(env, false);
    604    MemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
    605    MemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx);
    606
    607    o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra);
    608    o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra);
    609    oldv = int128_make128(o0, o1);
    610
    611    success = int128_eq(oldv, cmpv);
    612    if (success) {
    613        helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra);
    614        helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra);
    615    }
    616#endif
    617
    618    return !success;
    619}
    620
    621uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
    622                                              uint64_t new_lo, uint64_t new_hi)
    623{
    624    Int128 oldv, cmpv, newv;
    625    uintptr_t ra = GETPC();
    626    bool success;
    627    int mem_idx;
    628    MemOpIdx oi;
    629
    630    assert(HAVE_CMPXCHG128);
    631
    632    mem_idx = cpu_mmu_index(env, false);
    633    oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
    634
    635    /*
    636     * High and low need to be switched here because this is not actually a
    637     * 128bit store but two doublewords stored consecutively
    638     */
    639    cmpv = int128_make128(env->exclusive_high, env->exclusive_val);
    640    newv = int128_make128(new_hi, new_lo);
    641    oldv = cpu_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
    642
    643    success = int128_eq(oldv, cmpv);
    644    return !success;
    645}
    646
    647/* Writes back the old data into Rs.  */
    648void HELPER(casp_le_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
    649                              uint64_t new_lo, uint64_t new_hi)
    650{
    651    Int128 oldv, cmpv, newv;
    652    uintptr_t ra = GETPC();
    653    int mem_idx;
    654    MemOpIdx oi;
    655
    656    assert(HAVE_CMPXCHG128);
    657
    658    mem_idx = cpu_mmu_index(env, false);
    659    oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
    660
    661    cmpv = int128_make128(env->xregs[rs], env->xregs[rs + 1]);
    662    newv = int128_make128(new_lo, new_hi);
    663    oldv = cpu_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
    664
    665    env->xregs[rs] = int128_getlo(oldv);
    666    env->xregs[rs + 1] = int128_gethi(oldv);
    667}
    668
    669void HELPER(casp_be_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
    670                              uint64_t new_hi, uint64_t new_lo)
    671{
    672    Int128 oldv, cmpv, newv;
    673    uintptr_t ra = GETPC();
    674    int mem_idx;
    675    MemOpIdx oi;
    676
    677    assert(HAVE_CMPXCHG128);
    678
    679    mem_idx = cpu_mmu_index(env, false);
    680    oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
    681
    682    cmpv = int128_make128(env->xregs[rs + 1], env->xregs[rs]);
    683    newv = int128_make128(new_lo, new_hi);
    684    oldv = cpu_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
    685
    686    env->xregs[rs + 1] = int128_getlo(oldv);
    687    env->xregs[rs] = int128_gethi(oldv);
    688}
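
In the big-endian variants above the "high" and "low" halves trade places inside the Int128 because the guest access is two consecutive doublewords, not one 128-bit quantity: the doubleword at the lower address is the most significant half on a big-endian CPU and the least significant half on a little-endian one. A minimal sketch of that layout rule (int128_sketch and pair_to_int128 are hypothetical stand-ins for QEMU's Int128 machinery):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } int128_sketch;   /* stand-in for Int128 */

static int128_sketch pair_to_int128(uint64_t at_addr, uint64_t at_addr_plus_8,
                                     int big_endian)
{
    int128_sketch v;

    if (big_endian) {
        v.hi = at_addr;          /* first doubleword is the most significant */
        v.lo = at_addr_plus_8;
    } else {
        v.lo = at_addr;          /* first doubleword is the least significant */
        v.hi = at_addr_plus_8;
    }
    return v;
}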
    689
    690/*
    691 * AdvSIMD half-precision
    692 */
    693
    694#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
    695
    696#define ADVSIMD_HALFOP(name) \
    697uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \
    698{ \
    699    float_status *fpst = fpstp; \
    700    return float16_ ## name(a, b, fpst);    \
    701}
    702
    703ADVSIMD_HALFOP(add)
    704ADVSIMD_HALFOP(sub)
    705ADVSIMD_HALFOP(mul)
    706ADVSIMD_HALFOP(div)
    707ADVSIMD_HALFOP(min)
    708ADVSIMD_HALFOP(max)
    709ADVSIMD_HALFOP(minnum)
    710ADVSIMD_HALFOP(maxnum)
    711
    712#define ADVSIMD_TWOHALFOP(name)                                         \
    713uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \
    714{ \
    715    float16  a1, a2, b1, b2;                        \
    716    uint32_t r1, r2;                                \
    717    float_status *fpst = fpstp;                     \
    718    a1 = extract32(two_a, 0, 16);                   \
    719    a2 = extract32(two_a, 16, 16);                  \
    720    b1 = extract32(two_b, 0, 16);                   \
    721    b2 = extract32(two_b, 16, 16);                  \
    722    r1 = float16_ ## name(a1, b1, fpst);            \
    723    r2 = float16_ ## name(a2, b2, fpst);            \
    724    return deposit32(r1, 16, 16, r2);               \
    725}
    726
    727ADVSIMD_TWOHALFOP(add)
    728ADVSIMD_TWOHALFOP(sub)
    729ADVSIMD_TWOHALFOP(mul)
    730ADVSIMD_TWOHALFOP(div)
    731ADVSIMD_TWOHALFOP(min)
    732ADVSIMD_TWOHALFOP(max)
    733ADVSIMD_TWOHALFOP(minnum)
    734ADVSIMD_TWOHALFOP(maxnum)
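
The *_2h variants above operate on two half-precision lanes packed into a single 32-bit value, lane 0 in bits [15:0] and lane 1 in bits [31:16], and reassemble the results with deposit32(). A hypothetical packing/unpacking sketch using plain shifts (pack_2h/unpack_2h are not part of this file):

#include <stdint.h>

static uint32_t pack_2h(uint16_t lane0, uint16_t lane1)
{
    return (uint32_t)lane0 | ((uint32_t)lane1 << 16);    /* like deposit32() */
}

static uint16_t unpack_2h(uint32_t two, int lane)
{
    return (uint16_t)(two >> (lane ? 16 : 0));            /* like extract32() */
}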
    735
    736/* Data processing - scalar floating-point and advanced SIMD */
    737static float16 float16_mulx(float16 a, float16 b, void *fpstp)
    738{
    739    float_status *fpst = fpstp;
    740
    741    a = float16_squash_input_denormal(a, fpst);
    742    b = float16_squash_input_denormal(b, fpst);
    743
    744    if ((float16_is_zero(a) && float16_is_infinity(b)) ||
    745        (float16_is_infinity(a) && float16_is_zero(b))) {
    746        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
    747        return make_float16((1U << 14) |
    748                            ((float16_val(a) ^ float16_val(b)) & (1U << 15)));
    749    }
    750    return float16_mul(a, b, fpst);
    751}
    752
    753ADVSIMD_HALFOP(mulx)
    754ADVSIMD_TWOHALFOP(mulx)
    755
    756/* fused multiply-accumulate */
    757uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c,
    758                                 void *fpstp)
    759{
    760    float_status *fpst = fpstp;
    761    return float16_muladd(a, b, c, 0, fpst);
    762}
    763
    764uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b,
    765                                  uint32_t two_c, void *fpstp)
    766{
    767    float_status *fpst = fpstp;
    768    float16  a1, a2, b1, b2, c1, c2;
    769    uint32_t r1, r2;
    770    a1 = extract32(two_a, 0, 16);
    771    a2 = extract32(two_a, 16, 16);
    772    b1 = extract32(two_b, 0, 16);
    773    b2 = extract32(two_b, 16, 16);
    774    c1 = extract32(two_c, 0, 16);
    775    c2 = extract32(two_c, 16, 16);
    776    r1 = float16_muladd(a1, b1, c1, 0, fpst);
    777    r2 = float16_muladd(a2, b2, c2, 0, fpst);
    778    return deposit32(r1, 16, 16, r2);
    779}
    780
    781/*
    782 * Floating point comparisons produce an integer result. Softfloat
    783 * routines return float_relation types which we convert to the 0/-1
    784 * Neon requires.
    785 */
    786
    787#define ADVSIMD_CMPRES(test) (test) ? 0xffff : 0
    788
    789uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp)
    790{
    791    float_status *fpst = fpstp;
    792    int compare = float16_compare_quiet(a, b, fpst);
    793    return ADVSIMD_CMPRES(compare == float_relation_equal);
    794}
    795
    796uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp)
    797{
    798    float_status *fpst = fpstp;
    799    int compare = float16_compare(a, b, fpst);
    800    return ADVSIMD_CMPRES(compare == float_relation_greater ||
    801                          compare == float_relation_equal);
    802}
    803
    804uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp)
    805{
    806    float_status *fpst = fpstp;
    807    int compare = float16_compare(a, b, fpst);
    808    return ADVSIMD_CMPRES(compare == float_relation_greater);
    809}
    810
    811uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp)
    812{
    813    float_status *fpst = fpstp;
    814    float16 f0 = float16_abs(a);
    815    float16 f1 = float16_abs(b);
    816    int compare = float16_compare(f0, f1, fpst);
    817    return ADVSIMD_CMPRES(compare == float_relation_greater ||
    818                          compare == float_relation_equal);
    819}
    820
    821uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp)
    822{
    823    float_status *fpst = fpstp;
    824    float16 f0 = float16_abs(a);
    825    float16 f1 = float16_abs(b);
    826    int compare = float16_compare(f0, f1, fpst);
    827    return ADVSIMD_CMPRES(compare == float_relation_greater);
    828}
    829
    830/* round to integral */
    831uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status)
    832{
    833    return float16_round_to_int(x, fp_status);
    834}
    835
    836uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status)
    837{
    838    int old_flags = get_float_exception_flags(fp_status), new_flags;
    839    float16 ret;
    840
    841    ret = float16_round_to_int(x, fp_status);
    842
    843    /* Suppress any inexact exceptions the conversion produced */
    844    if (!(old_flags & float_flag_inexact)) {
    845        new_flags = get_float_exception_flags(fp_status);
    846        set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
    847    }
    848
    849    return ret;
    850}
    851
    852/*
    853 * Half-precision floating point conversion functions
    854 *
    855 * There are a multitude of conversion functions with various
    856 * different rounding modes. This is dealt with by the calling code
    857 * setting the mode appropriately before calling the helper.
    858 */
    859
    860uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp)
    861{
    862    float_status *fpst = fpstp;
    863
    864    /* Invalid if we are passed a NaN */
    865    if (float16_is_any_nan(a)) {
    866        float_raise(float_flag_invalid, fpst);
    867        return 0;
    868    }
    869    return float16_to_int16(a, fpst);
    870}
    871
    872uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp)
    873{
    874    float_status *fpst = fpstp;
    875
    876    /* Invalid if we are passed a NaN */
    877    if (float16_is_any_nan(a)) {
    878        float_raise(float_flag_invalid, fpst);
    879        return 0;
    880    }
    881    return float16_to_uint16(a, fpst);
    882}
    883
    884static int el_from_spsr(uint32_t spsr)
    885{
    886    /* Return the exception level that this SPSR is requesting a return to,
    887     * or -1 if it is invalid (an illegal return)
    888     */
    889    if (spsr & PSTATE_nRW) {
    890        switch (spsr & CPSR_M) {
    891        case ARM_CPU_MODE_USR:
    892            return 0;
    893        case ARM_CPU_MODE_HYP:
    894            return 2;
    895        case ARM_CPU_MODE_FIQ:
    896        case ARM_CPU_MODE_IRQ:
    897        case ARM_CPU_MODE_SVC:
    898        case ARM_CPU_MODE_ABT:
    899        case ARM_CPU_MODE_UND:
    900        case ARM_CPU_MODE_SYS:
    901            return 1;
    902        case ARM_CPU_MODE_MON:
    903            /* Returning to Mon from AArch64 is never possible,
    904             * so this is an illegal return.
    905             */
    906        default:
    907            return -1;
    908        }
    909    } else {
    910        if (extract32(spsr, 1, 1)) {
    911            /* Return with reserved M[1] bit set */
    912            return -1;
    913        }
    914        if (extract32(spsr, 0, 4) == 1) {
    915            /* return to EL0 with M[0] bit set */
    916            return -1;
    917        }
    918        return extract32(spsr, 2, 2);
    919    }
    920}
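
For an AArch64-style SPSR (nRW clear), M[3:2] selects the target EL, M[1] is reserved and must be zero, and M[0] selects the stack pointer, which is why the "EL0 with M[0] set" encoding (M[3:0] == 0b0001) is rejected. A hypothetical reference for just that decoding, with a few example encodings (el_from_aa64_m is not part of this file):

#include <assert.h>
#include <stdint.h>

static int el_from_aa64_m(uint32_t m)      /* m = SPSR.M[3:0], nRW clear */
{
    if (m & 0x2) {
        return -1;                         /* reserved M[1] set */
    }
    if (m == 0x1) {
        return -1;                         /* no "EL0 with SP selected" mode */
    }
    return (m >> 2) & 0x3;                 /* M[3:2] is the target EL */
}

int main(void)
{
    assert(el_from_aa64_m(0x0) == 0);      /* EL0t */
    assert(el_from_aa64_m(0x5) == 1);      /* EL1h */
    assert(el_from_aa64_m(0x9) == 2);      /* EL2h */
    assert(el_from_aa64_m(0xd) == 3);      /* EL3h */
    assert(el_from_aa64_m(0x1) == -1);     /* illegal return */
    return 0;
}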
    921
    922static void cpsr_write_from_spsr_elx(CPUARMState *env,
    923                                     uint32_t val)
    924{
    925    uint32_t mask;
    926
    927    /* Save SPSR_ELx.SS into PSTATE. */
    928    env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS);
    929    val &= ~PSTATE_SS;
    930
    931    /* Move DIT to the correct location for CPSR */
    932    if (val & PSTATE_DIT) {
    933        val &= ~PSTATE_DIT;
    934        val |= CPSR_DIT;
    935    }
    936
    937    mask = aarch32_cpsr_valid_mask(env->features, \
    938        &env_archcpu(env)->isar);
    939    cpsr_write(env, val, mask, CPSRWriteRaw);
    940}
    941
    942void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
    943{
    944    int cur_el = arm_current_el(env);
    945    unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el);
    946    uint32_t spsr = env->banked_spsr[spsr_idx];
    947    int new_el;
    948    bool return_to_aa64 = (spsr & PSTATE_nRW) == 0;
    949
    950    aarch64_save_sp(env, cur_el);
    951
    952    arm_clear_exclusive(env);
    953
    954    /* We must squash the PSTATE.SS bit to zero unless both of the
    955     * following hold:
    956     *  1. debug exceptions are currently disabled
    957     *  2. singlestep will be active in the EL we return to
    958     * We check 1 here and 2 after we've done the pstate/cpsr write() to
    959     * transition to the EL we're going to.
    960     */
    961    if (arm_generate_debug_exceptions(env)) {
    962        spsr &= ~PSTATE_SS;
    963    }
    964
    965    new_el = el_from_spsr(spsr);
    966    if (new_el == -1) {
    967        goto illegal_return;
    968    }
    969    if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) {
    970        /* Disallow return to an EL which is unimplemented or higher
    971         * than the current one.
    972         */
    973        goto illegal_return;
    974    }
    975
    976    if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) {
    977        /* Return to an EL which is configured for a different register width */
    978        goto illegal_return;
    979    }
    980
    981    if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
    982        goto illegal_return;
    983    }
    984
    985    qemu_mutex_lock_iothread();
    986    arm_call_pre_el_change_hook(env_archcpu(env));
    987    qemu_mutex_unlock_iothread();
    988
    989    if (!return_to_aa64) {
    990        env->aarch64 = 0;
    991        /* We do a raw CPSR write because aarch64_sync_64_to_32()
    992         * will sort the register banks out for us, and we've already
    993         * caught all the bad-mode cases in el_from_spsr().
    994         */
    995        cpsr_write_from_spsr_elx(env, spsr);
    996        if (!arm_singlestep_active(env)) {
    997            env->pstate &= ~PSTATE_SS;
    998        }
    999        aarch64_sync_64_to_32(env);
   1000
   1001        if (spsr & CPSR_T) {
   1002            env->regs[15] = new_pc & ~0x1;
   1003        } else {
   1004            env->regs[15] = new_pc & ~0x3;
   1005        }
   1006        helper_rebuild_hflags_a32(env, new_el);
   1007        qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
   1008                      "AArch32 EL%d PC 0x%" PRIx32 "\n",
   1009                      cur_el, new_el, env->regs[15]);
   1010    } else {
   1011        int tbii;
   1012
   1013        env->aarch64 = 1;
   1014        spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar);
   1015        pstate_write(env, spsr);
   1016        if (!arm_singlestep_active(env)) {
   1017            env->pstate &= ~PSTATE_SS;
   1018        }
   1019        aarch64_restore_sp(env, new_el);
   1020        helper_rebuild_hflags_a64(env, new_el);
   1021
   1022        /*
   1023         * Apply TBI to the exception return address.  We had to delay this
   1024         * until after we selected the new EL, so that we could select the
   1025         * correct TBI+TBID bits.  This is made easier by waiting until after
   1026         * the hflags rebuild, since we can pull the composite TBII field
   1027         * from there.
   1028         */
   1029        tbii = EX_TBFLAG_A64(env->hflags, TBII);
   1030        if ((tbii >> extract64(new_pc, 55, 1)) & 1) {
   1031            /* TBI is enabled. */
   1032            int core_mmu_idx = cpu_mmu_index(env, false);
   1033            if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) {
   1034                new_pc = sextract64(new_pc, 0, 56);
   1035            } else {
   1036                new_pc = extract64(new_pc, 0, 56);
   1037            }
   1038        }
   1039        env->pc = new_pc;
   1040
   1041        qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
   1042                      "AArch64 EL%d PC 0x%" PRIx64 "\n",
   1043                      cur_el, new_el, env->pc);
   1044    }
   1045
   1046    /*
   1047     * Note that cur_el can never be 0.  If new_el is 0, then
   1048     * el0_a64 is return_to_aa64, else el0_a64 is ignored.
   1049     */
   1050    aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64);
   1051
   1052    qemu_mutex_lock_iothread();
   1053    arm_call_el_change_hook(env_archcpu(env));
   1054    qemu_mutex_unlock_iothread();
   1055
   1056    return;
   1057
   1058illegal_return:
   1059    /* Illegal return events of various kinds have architecturally
   1060     * mandated behaviour:
   1061     * restore NZCV and DAIF from SPSR_ELx
   1062     * set PSTATE.IL
   1063     * restore PC from ELR_ELx
   1064     * no change to exception level, execution state or stack pointer
   1065     */
   1066    env->pstate |= PSTATE_IL;
   1067    env->pc = new_pc;
   1068    spsr &= PSTATE_NZCV | PSTATE_DAIF;
   1069    spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF);
   1070    pstate_write(env, spsr);
   1071    if (!arm_singlestep_active(env)) {
   1072        env->pstate &= ~PSTATE_SS;
   1073    }
   1074    helper_rebuild_hflags_a64(env, cur_el);
   1075    qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: "
   1076                  "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc);
   1077}
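
The TBI handling near the end strips the top byte of the return address once the new EL is known: regimes with two VA ranges sign-extend from bit 55 (so addresses keep their high-half or low-half prefix), single-range regimes zero-extend. A minimal sketch of that squashing, assuming the usual two's-complement arithmetic right shift (squash_tbi and its flag arguments are hypothetical; the real helper reads them from the rebuilt hflags):

#include <stdint.h>

static uint64_t squash_tbi(uint64_t pc, int tbi_enabled, int two_ranges)
{
    if (!tbi_enabled) {
        return pc;                              /* top byte stays significant */
    }
    if (two_ranges) {
        /* sign-extend from bit 55, like sextract64(pc, 0, 56) */
        return (uint64_t)((int64_t)(pc << 8) >> 8);
    }
    /* single-range regime: zero-extend, like extract64(pc, 0, 56) */
    return pc & 0x00ffffffffffffffULL;
}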
   1078
   1079/*
   1080 * Square Root and Reciprocal square root
   1081 */
   1082
   1083uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp)
   1084{
   1085    float_status *s = fpstp;
   1086
   1087    return float16_sqrt(a, s);
   1088}
   1089
   1090void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
   1091{
   1092    /*
   1093     * Implement DC ZVA, which zeroes a fixed-length block of memory.
   1094     * Note that we do not implement the (architecturally mandated)
   1095     * alignment fault for attempts to use this on Device memory
   1096     * (which matches the usual QEMU behaviour of not implementing either
   1097     * alignment faults or any memory attribute handling).
   1098     */
   1099    int blocklen = 4 << env_archcpu(env)->dcz_blocksize;
   1100    uint64_t vaddr = vaddr_in & ~(blocklen - 1);
   1101    int mmu_idx = cpu_mmu_index(env, false);
   1102    void *mem;
   1103
   1104    /*
   1105     * Trapless lookup.  In addition to actual invalid page, may
   1106     * return NULL for I/O, watchpoints, clean pages, etc.
   1107     */
   1108    mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx);
   1109
   1110#ifndef CONFIG_USER_ONLY
   1111    if (unlikely(!mem)) {
   1112        uintptr_t ra = GETPC();
   1113
   1114        /*
   1115         * Trap if accessing an invalid page.  DC_ZVA requires that we supply
   1116         * the original pointer for an invalid page.  But watchpoints require
   1117         * that we probe the actual space.  So do both.
   1118         */
   1119        (void) probe_write(env, vaddr_in, 1, mmu_idx, ra);
   1120        mem = probe_write(env, vaddr, blocklen, mmu_idx, ra);
   1121
   1122        if (unlikely(!mem)) {
   1123            /*
   1124             * The only remaining reason for mem == NULL is I/O.
   1125             * Just do a series of byte writes as the architecture demands.
   1126             */
   1127            for (int i = 0; i < blocklen; i++) {
   1128                cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra);
   1129            }
   1130            return;
   1131        }
   1132    }
   1133#endif
   1134
   1135    memset(mem, 0, blocklen);
   1136}
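
The block being zeroed is 4 << DCZID_EL0.BS bytes (dcz_blocksize holds the BS field), and the input address is simply rounded down to that boundary before the lookup. A tiny sketch of the arithmetic (dc_zva_base is hypothetical, mirroring the first two lines of the function above):

#include <assert.h>
#include <stdint.h>

static uint64_t dc_zva_base(uint64_t vaddr, unsigned dcz_blocksize)
{
    int blocklen = 4 << dcz_blocksize;          /* DCZID_EL0.BS counts words */
    return vaddr & ~(uint64_t)(blocklen - 1);
}

int main(void)
{
    /* dcz_blocksize == 4 gives the usual 64-byte block */
    assert(dc_zva_base(0x1234567fULL, 4) == 0x12345640ULL);
    return 0;
}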