cachepc-qemu

Fork of AMDESE/qemu with changes for the cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu

kvm.c (159139B)


      1/*
      2 * QEMU KVM support
      3 *
      4 * Copyright (C) 2006-2008 Qumranet Technologies
      5 * Copyright IBM, Corp. 2008
      6 *
      7 * Authors:
      8 *  Anthony Liguori   <aliguori@us.ibm.com>
      9 *
     10 * This work is licensed under the terms of the GNU GPL, version 2 or later.
     11 * See the COPYING file in the top-level directory.
     12 *
     13 */
     14
     15#include "qemu/osdep.h"
     16#include "qapi/qapi-events-run-state.h"
     17#include "qapi/error.h"
     18#include <sys/ioctl.h>
     19#include <sys/utsname.h>
     20
     21#include <linux/kvm.h>
     22#include "standard-headers/asm-x86/kvm_para.h"
     23
     24#include "cpu.h"
     25#include "host-cpu.h"
     26#include "sysemu/sysemu.h"
     27#include "sysemu/hw_accel.h"
     28#include "sysemu/kvm_int.h"
     29#include "sysemu/runstate.h"
     30#include "kvm_i386.h"
     31#include "sev_i386.h"
     32#include "hyperv.h"
     33#include "hyperv-proto.h"
     34
     35#include "exec/gdbstub.h"
     36#include "qemu/host-utils.h"
     37#include "qemu/main-loop.h"
     38#include "qemu/config-file.h"
     39#include "qemu/error-report.h"
     40#include "hw/i386/x86.h"
     41#include "hw/i386/apic.h"
     42#include "hw/i386/apic_internal.h"
     43#include "hw/i386/apic-msidef.h"
     44#include "hw/i386/intel_iommu.h"
     45#include "hw/i386/x86-iommu.h"
     46#include "hw/i386/e820_memory_layout.h"
     47#include "sysemu/sev.h"
     48
     49#include "hw/pci/pci.h"
     50#include "hw/pci/msi.h"
     51#include "hw/pci/msix.h"
     52#include "migration/blocker.h"
     53#include "exec/memattrs.h"
     54#include "trace.h"
     55
     56//#define DEBUG_KVM
     57
     58#ifdef DEBUG_KVM
     59#define DPRINTF(fmt, ...) \
     60    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
     61#else
     62#define DPRINTF(fmt, ...) \
     63    do { } while (0)
     64#endif
     65
     66/* From arch/x86/kvm/lapic.h */
     67#define KVM_APIC_BUS_CYCLE_NS       1
     68#define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
     69
     70#define MSR_KVM_WALL_CLOCK  0x11
     71#define MSR_KVM_SYSTEM_TIME 0x12
     72
     73/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
     74 * 255 kvm_msr_entry structs */
     75#define MSR_BUF_SIZE 4096
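       /* struct kvm_msrs is 8 bytes and each kvm_msr_entry is 16 bytes, so
        * 8 + 255 * 16 = 4088 bytes fit comfortably in the 4096-byte buffer. */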
     76
     77static void kvm_init_msrs(X86CPU *cpu);
     78
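       /* Capabilities the generic KVM code checks at accelerator init time;
        * initialization fails if the host kernel lacks any of them. */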
     79const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
     80    KVM_CAP_INFO(SET_TSS_ADDR),
     81    KVM_CAP_INFO(EXT_CPUID),
     82    KVM_CAP_INFO(MP_STATE),
     83    KVM_CAP_LAST_INFO
     84};
     85
     86static bool has_msr_star;
     87static bool has_msr_hsave_pa;
     88static bool has_msr_tsc_aux;
     89static bool has_msr_tsc_adjust;
     90static bool has_msr_tsc_deadline;
     91static bool has_msr_feature_control;
     92static bool has_msr_misc_enable;
     93static bool has_msr_smbase;
     94static bool has_msr_bndcfgs;
     95static int lm_capable_kernel;
     96static bool has_msr_hv_hypercall;
     97static bool has_msr_hv_crash;
     98static bool has_msr_hv_reset;
     99static bool has_msr_hv_vpindex;
    100static bool hv_vpindex_settable;
    101static bool has_msr_hv_runtime;
    102static bool has_msr_hv_synic;
    103static bool has_msr_hv_stimer;
    104static bool has_msr_hv_frequencies;
    105static bool has_msr_hv_reenlightenment;
    106static bool has_msr_xss;
    107static bool has_msr_umwait;
    108static bool has_msr_spec_ctrl;
    109static bool has_msr_tsx_ctrl;
    110static bool has_msr_virt_ssbd;
    111static bool has_msr_smi_count;
    112static bool has_msr_arch_capabs;
    113static bool has_msr_core_capabs;
    114static bool has_msr_vmx_vmfunc;
    115static bool has_msr_ucode_rev;
    116static bool has_msr_vmx_procbased_ctls2;
    117static bool has_msr_perf_capabs;
    118static bool has_msr_pkrs;
    119
    120static uint32_t has_architectural_pmu_version;
    121static uint32_t num_architectural_pmu_gp_counters;
    122static uint32_t num_architectural_pmu_fixed_counters;
    123
    124static int has_xsave;
    125static int has_xcrs;
    126static int has_pit_state2;
    127static int has_exception_payload;
    128
    129static bool has_msr_mcg_ext_ctl;
    130
    131static struct kvm_cpuid2 *cpuid_cache;
    132static struct kvm_cpuid2 *hv_cpuid_cache;
    133static struct kvm_msr_list *kvm_feature_msrs;
    134
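       /* Throttle for guest bus-lock detections, accounted in 1 s (1e9 ns) slices. */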
    135#define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
    136static RateLimit bus_lock_ratelimit_ctrl;
    137
    138int kvm_has_pit_state2(void)
    139{
    140    return has_pit_state2;
    141}
    142
    143bool kvm_has_smm(void)
    144{
    145    return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
    146}
    147
    148bool kvm_has_adjust_clock_stable(void)
    149{
    150    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
    151
    152    return (ret == KVM_CLOCK_TSC_STABLE);
    153}
    154
    155bool kvm_has_adjust_clock(void)
    156{
    157    return kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
    158}
    159
    160bool kvm_has_exception_payload(void)
    161{
    162    return has_exception_payload;
    163}
    164
    165static bool kvm_x2apic_api_set_flags(uint64_t flags)
    166{
    167    KVMState *s = KVM_STATE(current_accel());
    168
    169    return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
    170}
    171
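       /*
        * Evaluate 'fn' only on the first call and cache it in '_result'; on
        * later calls the 'return' exits the *enclosing* function with the
        * cached value, so the macro is only usable inside a function whose
        * return type matches '_result'.
        */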
    172#define MEMORIZE(fn, _result) \
    173    ({ \
    174        static bool _memorized; \
    175        \
    176        if (_memorized) { \
    177            return _result; \
    178        } \
    179        _memorized = true; \
    180        _result = fn; \
    181    })
    182
    183static bool has_x2apic_api;
    184
    185bool kvm_has_x2apic_api(void)
    186{
    187    return has_x2apic_api;
    188}
    189
    190bool kvm_enable_x2apic(void)
    191{
    192    return MEMORIZE(
    193             kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
    194                                      KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
    195             has_x2apic_api);
    196}
    197
    198bool kvm_hv_vpindex_settable(void)
    199{
    200    return hv_vpindex_settable;
    201}
    202
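       /*
        * Read the guest TSC via KVM_GET_MSRS. While the VM is stopped the
        * value is cached (tsc_valid); the cache is invalidated again when the
        * VM resumes (see cpu_update_state()).
        */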
    203static int kvm_get_tsc(CPUState *cs)
    204{
    205    X86CPU *cpu = X86_CPU(cs);
    206    CPUX86State *env = &cpu->env;
    207    struct {
    208        struct kvm_msrs info;
    209        struct kvm_msr_entry entries[1];
    210    } msr_data = {};
    211    int ret;
    212
    213    if (env->tsc_valid) {
    214        return 0;
    215    }
    216
    217    memset(&msr_data, 0, sizeof(msr_data));
    218    msr_data.info.nmsrs = 1;
    219    msr_data.entries[0].index = MSR_IA32_TSC;
    220    env->tsc_valid = !runstate_is_running();
    221
    222    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
    223    if (ret < 0) {
    224        return ret;
    225    }
    226
    227    assert(ret == 1);
    228    env->tsc = msr_data.entries[0].data;
    229    return 0;
    230}
    231
    232static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
    233{
    234    kvm_get_tsc(cpu);
    235}
    236
    237void kvm_synchronize_all_tsc(void)
    238{
    239    CPUState *cpu;
    240
    241    if (kvm_enabled()) {
    242        CPU_FOREACH(cpu) {
    243            run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
    244        }
    245    }
    246}
    247
    248static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
    249{
    250    struct kvm_cpuid2 *cpuid;
    251    int r, size;
    252
    253    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    254    cpuid = g_malloc0(size);
    255    cpuid->nent = max;
    256    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
    257    if (r == 0 && cpuid->nent >= max) {
    258        r = -E2BIG;
    259    }
    260    if (r < 0) {
    261        if (r == -E2BIG) {
    262            g_free(cpuid);
    263            return NULL;
    264        } else {
    265            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
    266                    strerror(-r));
    267            exit(1);
    268        }
    269    }
    270    return cpuid;
    271}
    272
    273/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
    274 * for all entries.
    275 */
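       /* try_get_cpuid() returns NULL when 'max' entries are not enough (E2BIG,
        * or a completely filled buffer, which is treated the same), so the size
        * is doubled until the whole list fits; the result is cached in
        * cpuid_cache for subsequent calls. */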
    276static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
    277{
    278    struct kvm_cpuid2 *cpuid;
    279    int max = 1;
    280
    281    if (cpuid_cache != NULL) {
    282        return cpuid_cache;
    283    }
    284    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
    285        max *= 2;
    286    }
    287    cpuid_cache = cpuid;
    288    return cpuid;
    289}
    290
    291static bool host_tsx_broken(void)
    292{
    293    int family, model, stepping;
    294    char vendor[CPUID_VENDOR_SZ + 1];
    295
    296    host_cpu_vendor_fms(vendor, &family, &model, &stepping);
    297
    298    /* Check if we are running on a Haswell host known to have broken TSX */
    299    return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
    300           (family == 6) &&
    301           ((model == 63 && stepping < 4) ||
    302            model == 60 || model == 69 || model == 70);
    303}
    304
    305/* Returns the value for a specific register on the cpuid entry
    306 */
    307static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
    308{
    309    uint32_t ret = 0;
    310    switch (reg) {
    311    case R_EAX:
    312        ret = entry->eax;
    313        break;
    314    case R_EBX:
    315        ret = entry->ebx;
    316        break;
    317    case R_ECX:
    318        ret = entry->ecx;
    319        break;
    320    case R_EDX:
    321        ret = entry->edx;
    322        break;
    323    }
    324    return ret;
    325}
    326
    327/* Find matching entry for function/index on kvm_cpuid2 struct
    328 */
    329static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
    330                                                 uint32_t function,
    331                                                 uint32_t index)
    332{
    333    int i;
    334    for (i = 0; i < cpuid->nent; ++i) {
    335        if (cpuid->entries[i].function == function &&
    336            cpuid->entries[i].index == index) {
    337            return &cpuid->entries[i];
    338        }
    339    }
    340    /* not found: */
    341    return NULL;
    342}
    343
    344uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
    345                                      uint32_t index, int reg)
    346{
    347    struct kvm_cpuid2 *cpuid;
    348    uint32_t ret = 0;
    349    uint32_t cpuid_1_edx;
    350
    351    cpuid = get_supported_cpuid(s);
    352
    353    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    354    if (entry) {
    355        ret = cpuid_entry_get_reg(entry, reg);
    356    }
    357
    358    /* Fixups for the data returned by KVM, below */
    359
    360    if (function == 1 && reg == R_EDX) {
    361        /* KVM before 2.6.30 misreports the following features */
    362        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
    363    } else if (function == 1 && reg == R_ECX) {
    364        /* We can set the hypervisor flag, even if KVM does not return it on
    365         * GET_SUPPORTED_CPUID
    366         */
    367        ret |= CPUID_EXT_HYPERVISOR;
    368        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
    369         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
    370         * and the irqchip is in the kernel.
    371         */
    372        if (kvm_irqchip_in_kernel() &&
    373                kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
    374            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
    375        }
    376
    377        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
    378         * without the in-kernel irqchip
    379         */
    380        if (!kvm_irqchip_in_kernel()) {
    381            ret &= ~CPUID_EXT_X2APIC;
    382        }
    383
    384        if (enable_cpu_pm) {
    385            int disable_exits = kvm_check_extension(s,
    386                                                    KVM_CAP_X86_DISABLE_EXITS);
    387
    388            if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
    389                ret |= CPUID_EXT_MONITOR;
    390            }
    391        }
    392    } else if (function == 6 && reg == R_EAX) {
    393        ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
    394    } else if (function == 7 && index == 0 && reg == R_EBX) {
    395        if (host_tsx_broken()) {
    396            ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
    397        }
    398    } else if (function == 7 && index == 0 && reg == R_EDX) {
    399        /*
    400         * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
    401         * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
    402         * returned by KVM_GET_MSR_INDEX_LIST.
    403         */
    404        if (!has_msr_arch_capabs) {
    405            ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
    406        }
    407    } else if (function == 0x80000001 && reg == R_ECX) {
    408        /*
    409         * It's safe to enable TOPOEXT even if it's not returned by
    410         * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
    411         * us to keep CPU models including TOPOEXT runnable on older kernels.
    412         */
    413        ret |= CPUID_EXT3_TOPOEXT;
    414    } else if (function == 0x80000001 && reg == R_EDX) {
    415        /* On Intel, kvm returns cpuid according to the Intel spec,
    416         * so add missing bits according to the AMD spec:
    417         */
    418        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
    419        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
    420    } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
    421        /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
    422         * be enabled without the in-kernel irqchip
    423         */
    424        if (!kvm_irqchip_in_kernel()) {
    425            ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
    426        }
    427        if (kvm_irqchip_is_split()) {
    428            ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
    429        }
    430    } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
    431        ret |= 1U << KVM_HINTS_REALTIME;
    432    }
    433
    434    return ret;
    435}
    436
    437uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
    438{
    439    struct {
    440        struct kvm_msrs info;
    441        struct kvm_msr_entry entries[1];
    442    } msr_data = {};
    443    uint64_t value;
    444    uint32_t ret, can_be_one, must_be_one;
    445
    446    if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
    447        return 0;
    448    }
    449
    450    /* Check if requested MSR is supported feature MSR */
    451    int i;
    452    for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
    453        if (kvm_feature_msrs->indices[i] == index) {
    454            break;
    455        }
    456    if (i == kvm_feature_msrs->nmsrs) {
    457        return 0; /* if the feature MSR is not supported, simply return 0 */
    458    }
    459
    460    msr_data.info.nmsrs = 1;
    461    msr_data.entries[0].index = index;
    462
    463    ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
    464    if (ret != 1) {
    465        error_report("KVM get MSR (index=0x%x) feature failed, %s",
    466            index, strerror(-ret));
    467        exit(1);
    468    }
    469
    470    value = msr_data.entries[0].data;
    471    switch (index) {
    472    case MSR_IA32_VMX_PROCBASED_CTLS2:
    473        if (!has_msr_vmx_procbased_ctls2) {
    474            /* KVM forgot to add these bits for some time, do this ourselves. */
    475            if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
    476                CPUID_XSAVE_XSAVES) {
    477                value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
    478            }
    479            if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
    480                CPUID_EXT_RDRAND) {
    481                value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
    482            }
    483            if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
    484                CPUID_7_0_EBX_INVPCID) {
    485                value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
    486            }
    487            if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
    488                CPUID_7_0_EBX_RDSEED) {
    489                value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
    490            }
    491            if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
    492                CPUID_EXT2_RDTSCP) {
    493                value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
    494            }
    495        }
    496        /* fall through */
    497    case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
    498    case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
    499    case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
    500    case MSR_IA32_VMX_TRUE_EXIT_CTLS:
    501        /*
    502         * Return true for bits that can be one, but do not have to be one.
    503         * The SDM tells us which bits could have a "must be one" setting,
    504         * so we can do the opposite transformation in make_vmx_msr_value.
    505         */
    506        must_be_one = (uint32_t)value;
    507        can_be_one = (uint32_t)(value >> 32);
    508        return can_be_one & ~must_be_one;
    509
    510    default:
    511        return value;
    512    }
    513}
    514
    515static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
    516                                     int *max_banks)
    517{
    518    int r;
    519
    520    r = kvm_check_extension(s, KVM_CAP_MCE);
    521    if (r > 0) {
    522        *max_banks = r;
    523        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
    524    }
    525    return -ENOSYS;
    526}
    527
    528static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
    529{
    530    CPUState *cs = CPU(cpu);
    531    CPUX86State *env = &cpu->env;
    532    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
    533                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
    534    uint64_t mcg_status = MCG_STATUS_MCIP;
    535    int flags = 0;
    536
    537    if (code == BUS_MCEERR_AR) {
    538        status |= MCI_STATUS_AR | 0x134;
    539        mcg_status |= MCG_STATUS_EIPV;
    540    } else {
    541        status |= 0xc0;
    542        mcg_status |= MCG_STATUS_RIPV;
    543    }
    544
    545    flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
    546    /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
    547     * guest kernel back into env->mcg_ext_ctl.
    548     */
    549    cpu_synchronize_state(cs);
    550    if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
    551        mcg_status |= MCG_STATUS_LMCE;
    552        flags = 0;
    553    }
    554
    555    cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
    556                       (MCM_ADDR_PHYS << 6) | 0xc, flags);
    557}
    558
    559static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
    560{
    561    MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
    562
    563    qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
    564                                   &mff);
    565}
    566
    567static void hardware_memory_error(void *host_addr)
    568{
    569    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
    570    error_report("QEMU got Hardware memory error at addr %p", host_addr);
    571    exit(1);
    572}
    573
    574void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
    575{
    576    X86CPU *cpu = X86_CPU(c);
    577    CPUX86State *env = &cpu->env;
    578    ram_addr_t ram_addr;
    579    hwaddr paddr;
    580
    581    /* If we get an action required MCE, it has been injected by KVM
    582     * while the VM was running.  An action optional MCE instead should
    583     * be coming from the main thread, which qemu_init_sigbus identifies
    584     * as the "early kill" thread.
    585     */
    586    assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
    587
    588    if ((env->mcg_cap & MCG_SER_P) && addr) {
    589        ram_addr = qemu_ram_addr_from_host(addr);
    590        if (ram_addr != RAM_ADDR_INVALID &&
    591            kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
    592            kvm_hwpoison_page_add(ram_addr);
    593            kvm_mce_inject(cpu, paddr, code);
    594
    595            /*
    596             * Use different logging severity based on error type.
    597             * If there is additional MCE reporting on the hypervisor, QEMU VA
    598             * could be another source to identify the PA and MCE details.
    599             */
    600            if (code == BUS_MCEERR_AR) {
    601                error_report("Guest MCE Memory Error at QEMU addr %p and "
    602                    "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
    603                    addr, paddr, "BUS_MCEERR_AR");
    604            } else {
    605                 warn_report("Guest MCE Memory Error at QEMU addr %p and "
    606                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
    607                     addr, paddr, "BUS_MCEERR_AO");
    608            }
    609
    610            return;
    611        }
    612
    613        if (code == BUS_MCEERR_AO) {
    614            warn_report("Hardware memory error at addr %p of type %s "
    615                "for memory used by QEMU itself instead of guest system!",
    616                 addr, "BUS_MCEERR_AO");
    617        }
    618    }
    619
    620    if (code == BUS_MCEERR_AR) {
    621        hardware_memory_error(addr);
    622    }
    623
    624    /* Hope we are lucky for AO MCE, just notify an event */
    625    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
    626}
    627
    628static void kvm_reset_exception(CPUX86State *env)
    629{
    630    env->exception_nr = -1;
    631    env->exception_pending = 0;
    632    env->exception_injected = 0;
    633    env->exception_has_payload = false;
    634    env->exception_payload = 0;
    635}
    636
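       /*
        * Queue an exception for delivery to the guest. With KVM exception
        * payload support the exception is left pending and KVM applies the
        * payload itself; otherwise it is marked as already injected and the
        * payload is written into DR6 (#DB) or CR2 (#PF) here.
        */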
    637static void kvm_queue_exception(CPUX86State *env,
    638                                int32_t exception_nr,
    639                                uint8_t exception_has_payload,
    640                                uint64_t exception_payload)
    641{
    642    assert(env->exception_nr == -1);
    643    assert(!env->exception_pending);
    644    assert(!env->exception_injected);
    645    assert(!env->exception_has_payload);
    646
    647    env->exception_nr = exception_nr;
    648
    649    if (has_exception_payload) {
    650        env->exception_pending = 1;
    651
    652        env->exception_has_payload = exception_has_payload;
    653        env->exception_payload = exception_payload;
    654    } else {
    655        env->exception_injected = 1;
    656
    657        if (exception_nr == EXCP01_DB) {
    658            assert(exception_has_payload);
    659            env->dr[6] = exception_payload;
    660        } else if (exception_nr == EXCP0E_PAGE) {
    661            assert(exception_has_payload);
    662            env->cr[2] = exception_payload;
    663        } else {
    664            assert(!exception_has_payload);
    665        }
    666    }
    667}
    668
    669static int kvm_inject_mce_oldstyle(X86CPU *cpu)
    670{
    671    CPUX86State *env = &cpu->env;
    672
    673    if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
    674        unsigned int bank, bank_num = env->mcg_cap & 0xff;
    675        struct kvm_x86_mce mce;
    676
    677        kvm_reset_exception(env);
    678
    679        /*
    680         * There must be at least one bank in use if an MCE is pending.
    681         * Find it and use its values for the event injection.
    682         */
    683        for (bank = 0; bank < bank_num; bank++) {
    684            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
    685                break;
    686            }
    687        }
    688        assert(bank < bank_num);
    689
    690        mce.bank = bank;
    691        mce.status = env->mce_banks[bank * 4 + 1];
    692        mce.mcg_status = env->mcg_status;
    693        mce.addr = env->mce_banks[bank * 4 + 2];
    694        mce.misc = env->mce_banks[bank * 4 + 3];
    695
    696        return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
    697    }
    698    return 0;
    699}
    700
    701static void cpu_update_state(void *opaque, bool running, RunState state)
    702{
    703    CPUX86State *env = opaque;
    704
    705    if (running) {
    706        env->tsc_valid = false;
    707    }
    708}
    709
    710unsigned long kvm_arch_vcpu_id(CPUState *cs)
    711{
    712    X86CPU *cpu = X86_CPU(cs);
    713    return cpu->apic_id;
    714}
    715
    716#ifndef KVM_CPUID_SIGNATURE_NEXT
    717#define KVM_CPUID_SIGNATURE_NEXT                0x40000100
    718#endif
    719
    720static bool hyperv_enabled(X86CPU *cpu)
    721{
    722    return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
    723        ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
    724         cpu->hyperv_features || cpu->hyperv_passthrough);
    725}
    726
    727/*
    728 * Check whether target_freq is within conservative
    729 * ntp correctable bounds (250ppm) of freq
    730 */
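       /* Example: for a host TSC of 2500000 kHz, 250 ppm allows +/-625 kHz,
        * so target frequencies in [2499375, 2500625] kHz are accepted. */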
    731static inline bool freq_within_bounds(int freq, int target_freq)
    732{
    733        int max_freq = freq + (freq * 250 / 1000000);
    734        int min_freq = freq - (freq * 250 / 1000000);
    735
    736        if (target_freq >= min_freq && target_freq <= max_freq) {
    737                return true;
    738        }
    739
    740        return false;
    741}
    742
    743static int kvm_arch_set_tsc_khz(CPUState *cs)
    744{
    745    X86CPU *cpu = X86_CPU(cs);
    746    CPUX86State *env = &cpu->env;
    747    int r, cur_freq;
    748    bool set_ioctl = false;
    749
    750    if (!env->tsc_khz) {
    751        return 0;
    752    }
    753
    754    cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
    755               kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
    756
    757    /*
    758     * If TSC scaling is supported, attempt to set TSC frequency.
    759     */
    760    if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
    761        set_ioctl = true;
    762    }
    763
    764    /*
    765     * If desired TSC frequency is within bounds of NTP correction,
    766     * attempt to set TSC frequency.
    767     */
    768    if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
    769        set_ioctl = true;
    770    }
    771
    772    r = set_ioctl ?
    773        kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
    774        -ENOTSUP;
    775
    776    if (r < 0) {
    777        /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
    778         * TSC frequency doesn't match the one we want.
    779         */
    780        cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
    781                   kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
    782                   -ENOTSUP;
    783        if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
    784            warn_report("TSC frequency mismatch between "
    785                        "VM (%" PRId64 " kHz) and host (%d kHz), "
    786                        "and TSC scaling unavailable",
    787                        env->tsc_khz, cur_freq);
    788            return r;
    789        }
    790    }
    791
    792    return 0;
    793}
    794
    795static bool tsc_is_stable_and_known(CPUX86State *env)
    796{
    797    if (!env->tsc_khz) {
    798        return false;
    799    }
    800    return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
    801        || env->user_tsc_khz;
    802}
    803
    804static struct {
    805    const char *desc;
    806    struct {
    807        uint32_t func;
    808        int reg;
    809        uint32_t bits;
    810    } flags[2];
    811    uint64_t dependencies;
    812} kvm_hyperv_properties[] = {
    813    [HYPERV_FEAT_RELAXED] = {
    814        .desc = "relaxed timing (hv-relaxed)",
    815        .flags = {
    816            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    817             .bits = HV_RELAXED_TIMING_RECOMMENDED}
    818        }
    819    },
    820    [HYPERV_FEAT_VAPIC] = {
    821        .desc = "virtual APIC (hv-vapic)",
    822        .flags = {
    823            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    824             .bits = HV_APIC_ACCESS_AVAILABLE}
    825        }
    826    },
    827    [HYPERV_FEAT_TIME] = {
    828        .desc = "clocksources (hv-time)",
    829        .flags = {
    830            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    831             .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
    832        }
    833    },
    834    [HYPERV_FEAT_CRASH] = {
    835        .desc = "crash MSRs (hv-crash)",
    836        .flags = {
    837            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    838             .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
    839        }
    840    },
    841    [HYPERV_FEAT_RESET] = {
    842        .desc = "reset MSR (hv-reset)",
    843        .flags = {
    844            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    845             .bits = HV_RESET_AVAILABLE}
    846        }
    847    },
    848    [HYPERV_FEAT_VPINDEX] = {
    849        .desc = "VP_INDEX MSR (hv-vpindex)",
    850        .flags = {
    851            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    852             .bits = HV_VP_INDEX_AVAILABLE}
    853        }
    854    },
    855    [HYPERV_FEAT_RUNTIME] = {
    856        .desc = "VP_RUNTIME MSR (hv-runtime)",
    857        .flags = {
    858            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    859             .bits = HV_VP_RUNTIME_AVAILABLE}
    860        }
    861    },
    862    [HYPERV_FEAT_SYNIC] = {
    863        .desc = "synthetic interrupt controller (hv-synic)",
    864        .flags = {
    865            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    866             .bits = HV_SYNIC_AVAILABLE}
    867        }
    868    },
    869    [HYPERV_FEAT_STIMER] = {
    870        .desc = "synthetic timers (hv-stimer)",
    871        .flags = {
    872            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    873             .bits = HV_SYNTIMERS_AVAILABLE}
    874        },
    875        .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
    876    },
    877    [HYPERV_FEAT_FREQUENCIES] = {
    878        .desc = "frequency MSRs (hv-frequencies)",
    879        .flags = {
    880            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    881             .bits = HV_ACCESS_FREQUENCY_MSRS},
    882            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    883             .bits = HV_FREQUENCY_MSRS_AVAILABLE}
    884        }
    885    },
    886    [HYPERV_FEAT_REENLIGHTENMENT] = {
    887        .desc = "reenlightenment MSRs (hv-reenlightenment)",
    888        .flags = {
    889            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    890             .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
    891        }
    892    },
    893    [HYPERV_FEAT_TLBFLUSH] = {
    894        .desc = "paravirtualized TLB flush (hv-tlbflush)",
    895        .flags = {
    896            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    897             .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
    898             HV_EX_PROCESSOR_MASKS_RECOMMENDED}
    899        },
    900        .dependencies = BIT(HYPERV_FEAT_VPINDEX)
    901    },
    902    [HYPERV_FEAT_EVMCS] = {
    903        .desc = "enlightened VMCS (hv-evmcs)",
    904        .flags = {
    905            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    906             .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
    907        },
    908        .dependencies = BIT(HYPERV_FEAT_VAPIC)
    909    },
    910    [HYPERV_FEAT_IPI] = {
    911        .desc = "paravirtualized IPI (hv-ipi)",
    912        .flags = {
    913            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    914             .bits = HV_CLUSTER_IPI_RECOMMENDED |
    915             HV_EX_PROCESSOR_MASKS_RECOMMENDED}
    916        },
    917        .dependencies = BIT(HYPERV_FEAT_VPINDEX)
    918    },
    919    [HYPERV_FEAT_STIMER_DIRECT] = {
    920        .desc = "direct mode synthetic timers (hv-stimer-direct)",
    921        .flags = {
    922            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    923             .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
    924        },
    925        .dependencies = BIT(HYPERV_FEAT_STIMER)
    926    },
    927    [HYPERV_FEAT_AVIC] = {
    928        .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
    929        .flags = {
    930            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    931             .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
    932        }
    933    },
    934};
    935
    936static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
    937                                           bool do_sys_ioctl)
    938{
    939    struct kvm_cpuid2 *cpuid;
    940    int r, size;
    941
    942    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    943    cpuid = g_malloc0(size);
    944    cpuid->nent = max;
    945
    946    if (do_sys_ioctl) {
    947        r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
    948    } else {
    949        r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
    950    }
    951    if (r == 0 && cpuid->nent >= max) {
    952        r = -E2BIG;
    953    }
    954    if (r < 0) {
    955        if (r == -E2BIG) {
    956            g_free(cpuid);
    957            return NULL;
    958        } else {
    959            fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
    960                    strerror(-r));
    961            exit(1);
    962        }
    963    }
    964    return cpuid;
    965}
    966
    967/*
    968 * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
    969 * for all entries.
    970 */
    971static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
    972{
    973    struct kvm_cpuid2 *cpuid;
    974    /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
    975    int max = 10;
    976    int i;
    977    bool do_sys_ioctl;
    978
    979    do_sys_ioctl =
    980        kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;
    981
    982    /*
    983     * A non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
    984     * unsupported; kvm_hyperv_expand_features() checks for that.
    985     */
    986    assert(do_sys_ioctl || cs->kvm_state);
    987
    988    /*
    989     * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
    990     * -E2BIG, however, it doesn't report back the right size. Keep increasing
    991     * it and re-trying until we succeed.
    992     */
    993    while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
    994        max++;
    995    }
    996
    997    /*
    998     * KVM_GET_SUPPORTED_HV_CPUID does not set EVMCS CPUID bit before
    999     * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled but we want to get the
   1000     * information early, just check for the capability and set the bit
   1001     * manually.
   1002     */
   1003    if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
   1004                            KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
   1005        for (i = 0; i < cpuid->nent; i++) {
   1006            if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
   1007                cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
   1008            }
   1009        }
   1010    }
   1011
   1012    return cpuid;
   1013}
   1014
   1015/*
   1016 * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
   1017 * leaves from KVM_CAP_HYPERV* and present MSRs data.
   1018 */
   1019static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
   1020{
   1021    X86CPU *cpu = X86_CPU(cs);
   1022    struct kvm_cpuid2 *cpuid;
   1023    struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
   1024
   1025    /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
   1026    cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
   1027    cpuid->nent = 2;
   1028
   1029    /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
   1030    entry_feat = &cpuid->entries[0];
   1031    entry_feat->function = HV_CPUID_FEATURES;
   1032
   1033    entry_recomm = &cpuid->entries[1];
   1034    entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
   1035    entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
   1036
   1037    if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
   1038        entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
   1039        entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
   1040        entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
   1041        entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
   1042        entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
   1043    }
   1044
   1045    if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
   1046        entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
   1047        entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
   1048    }
   1049
   1050    if (has_msr_hv_frequencies) {
   1051        entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
   1052        entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
   1053    }
   1054
   1055    if (has_msr_hv_crash) {
   1056        entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
   1057    }
   1058
   1059    if (has_msr_hv_reenlightenment) {
   1060        entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
   1061    }
   1062
   1063    if (has_msr_hv_reset) {
   1064        entry_feat->eax |= HV_RESET_AVAILABLE;
   1065    }
   1066
   1067    if (has_msr_hv_vpindex) {
   1068        entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
   1069    }
   1070
   1071    if (has_msr_hv_runtime) {
   1072        entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
   1073    }
   1074
   1075    if (has_msr_hv_synic) {
   1076        unsigned int cap = cpu->hyperv_synic_kvm_only ?
   1077            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
   1078
   1079        if (kvm_check_extension(cs->kvm_state, cap) > 0) {
   1080            entry_feat->eax |= HV_SYNIC_AVAILABLE;
   1081        }
   1082    }
   1083
   1084    if (has_msr_hv_stimer) {
   1085        entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
   1086    }
   1087
   1088    if (kvm_check_extension(cs->kvm_state,
   1089                            KVM_CAP_HYPERV_TLBFLUSH) > 0) {
   1090        entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
   1091        entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
   1092    }
   1093
   1094    if (kvm_check_extension(cs->kvm_state,
   1095                            KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
   1096        entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
   1097    }
   1098
   1099    if (kvm_check_extension(cs->kvm_state,
   1100                            KVM_CAP_HYPERV_SEND_IPI) > 0) {
   1101        entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
   1102        entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
   1103    }
   1104
   1105    return cpuid;
   1106}
   1107
   1108static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
   1109{
   1110    struct kvm_cpuid_entry2 *entry;
   1111    struct kvm_cpuid2 *cpuid;
   1112
   1113    if (hv_cpuid_cache) {
   1114        cpuid = hv_cpuid_cache;
   1115    } else {
   1116        if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
   1117            cpuid = get_supported_hv_cpuid(cs);
   1118        } else {
   1119            /*
   1120             * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
   1121             * before KVM context is created but this is only done when
   1122             * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies
   1123             * KVM_CAP_HYPERV_CPUID.
   1124             */
   1125            assert(cs->kvm_state);
   1126
   1127            cpuid = get_supported_hv_cpuid_legacy(cs);
   1128        }
   1129        hv_cpuid_cache = cpuid;
   1130    }
   1131
   1132    if (!cpuid) {
   1133        return 0;
   1134    }
   1135
   1136    entry = cpuid_find_entry(cpuid, func, 0);
   1137    if (!entry) {
   1138        return 0;
   1139    }
   1140
   1141    return cpuid_entry_get_reg(entry, reg);
   1142}
   1143
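       /*
        * A Hyper-V feature is considered supported only if every CPUID bit
        * listed for it in kvm_hyperv_properties[] is reported by the host
        * (flag entries with .func == 0 are unused and skipped).
        */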
   1144static bool hyperv_feature_supported(CPUState *cs, int feature)
   1145{
   1146    uint32_t func, bits;
   1147    int i, reg;
   1148
   1149    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
   1150
   1151        func = kvm_hyperv_properties[feature].flags[i].func;
   1152        reg = kvm_hyperv_properties[feature].flags[i].reg;
   1153        bits = kvm_hyperv_properties[feature].flags[i].bits;
   1154
   1155        if (!func) {
   1156            continue;
   1157        }
   1158
   1159        if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
   1160            return false;
   1161        }
   1162    }
   1163
   1164    return true;
   1165}
   1166
   1167/* Checks that all feature dependencies are enabled */
   1168static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
   1169{
   1170    uint64_t deps;
   1171    int dep_feat;
   1172
   1173    deps = kvm_hyperv_properties[feature].dependencies;
   1174    while (deps) {
   1175        dep_feat = ctz64(deps);
   1176        if (!(hyperv_feat_enabled(cpu, dep_feat))) {
   1177            error_setg(errp, "Hyper-V %s requires Hyper-V %s",
   1178                       kvm_hyperv_properties[feature].desc,
   1179                       kvm_hyperv_properties[dep_feat].desc);
   1180            return false;
   1181        }
   1182        deps &= ~(1ull << dep_feat);
   1183    }
   1184
   1185    return true;
   1186}
   1187
   1188static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
   1189{
   1190    X86CPU *cpu = X86_CPU(cs);
   1191    uint32_t r = 0;
   1192    int i, j;
   1193
   1194    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
   1195        if (!hyperv_feat_enabled(cpu, i)) {
   1196            continue;
   1197        }
   1198
   1199        for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
   1200            if (kvm_hyperv_properties[i].flags[j].func != func) {
   1201                continue;
   1202            }
   1203            if (kvm_hyperv_properties[i].flags[j].reg != reg) {
   1204                continue;
   1205            }
   1206
   1207            r |= kvm_hyperv_properties[i].flags[j].bits;
   1208        }
   1209    }
   1210
   1211    return r;
   1212}
   1213
   1214/*
   1215 * Expand Hyper-V CPU features. In particular, check that all the requested
   1216 * features are supported by the host and that the configuration is sane
   1217 * (i.e. all the required dependencies are included). Also, this takes care
   1218 * of 'hv_passthrough' mode and fills the environment with all supported
   1219 * Hyper-V features.
   1220 */
   1221bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
   1222{
   1223    CPUState *cs = CPU(cpu);
   1224    Error *local_err = NULL;
   1225    int feat;
   1226
   1227    if (!hyperv_enabled(cpu))
   1228        return true;
   1229
   1230    /*
   1231     * When kvm_hyperv_expand_features is called at CPU feature expansion
   1232     * time per-CPU kvm_state is not available yet so we can only proceed
   1233     * when KVM_CAP_SYS_HYPERV_CPUID is supported.
   1234     */
   1235    if (!cs->kvm_state &&
   1236        !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID))
   1237        return true;
   1238
   1239    if (cpu->hyperv_passthrough) {
   1240        cpu->hyperv_vendor_id[0] =
   1241            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
   1242        cpu->hyperv_vendor_id[1] =
   1243            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
   1244        cpu->hyperv_vendor_id[2] =
   1245            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
   1246        cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
   1247                                       sizeof(cpu->hyperv_vendor_id) + 1);
   1248        memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
   1249               sizeof(cpu->hyperv_vendor_id));
   1250        cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;
   1251
   1252        cpu->hyperv_interface_id[0] =
   1253            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
   1254        cpu->hyperv_interface_id[1] =
   1255            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
   1256        cpu->hyperv_interface_id[2] =
   1257            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
   1258        cpu->hyperv_interface_id[3] =
   1259            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);
   1260
   1261        cpu->hyperv_ver_id_build =
   1262            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
   1263        cpu->hyperv_ver_id_major =
   1264            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
   1265        cpu->hyperv_ver_id_minor =
   1266            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
   1267        cpu->hyperv_ver_id_sp =
   1268            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
   1269        cpu->hyperv_ver_id_sb =
   1270            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
   1271        cpu->hyperv_ver_id_sn =
   1272            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;
   1273
   1274        cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
   1275                                            R_EAX);
   1276        cpu->hyperv_limits[0] =
   1277            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
   1278        cpu->hyperv_limits[1] =
   1279            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
   1280        cpu->hyperv_limits[2] =
   1281            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);
   1282
   1283        cpu->hyperv_spinlock_attempts =
   1284            hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);
   1285
   1286        /*
   1287         * Mark feature as enabled in 'cpu->hyperv_features' as
   1288         * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
   1289         */
   1290        for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
   1291            if (hyperv_feature_supported(cs, feat)) {
   1292                cpu->hyperv_features |= BIT(feat);
   1293            }
   1294        }
   1295    } else {
   1296        /* Check features availability and dependencies */
   1297        for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
   1298            /* If the feature was not requested skip it. */
   1299            if (!hyperv_feat_enabled(cpu, feat)) {
   1300                continue;
   1301            }
   1302
   1303            /* Check if the feature is supported by KVM */
   1304            if (!hyperv_feature_supported(cs, feat)) {
   1305                error_setg(errp, "Hyper-V %s is not supported by kernel",
   1306                           kvm_hyperv_properties[feat].desc);
   1307                return false;
   1308            }
   1309
   1310            /* Check dependencies */
   1311            if (!hv_feature_check_deps(cpu, feat, &local_err)) {
   1312                error_propagate(errp, local_err);
   1313                return false;
   1314            }
   1315        }
   1316    }
   1317
   1318    /* Additional dependencies not covered by kvm_hyperv_properties[] */
   1319    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
   1320        !cpu->hyperv_synic_kvm_only &&
   1321        !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
   1322        error_setg(errp, "Hyper-V %s requires Hyper-V %s",
   1323                   kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
   1324                   kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
   1325        return false;
   1326    }
   1327
   1328    return true;
   1329}
   1330
   1331/*
   1332 * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
   1333 */
   1334static int hyperv_fill_cpuids(CPUState *cs,
   1335                              struct kvm_cpuid_entry2 *cpuid_ent)
   1336{
   1337    X86CPU *cpu = X86_CPU(cs);
   1338    struct kvm_cpuid_entry2 *c;
   1339    uint32_t cpuid_i = 0;
   1340
   1341    c = &cpuid_ent[cpuid_i++];
   1342    c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
   1343    c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
   1344        HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
   1345    c->ebx = cpu->hyperv_vendor_id[0];
   1346    c->ecx = cpu->hyperv_vendor_id[1];
   1347    c->edx = cpu->hyperv_vendor_id[2];
   1348
   1349    c = &cpuid_ent[cpuid_i++];
   1350    c->function = HV_CPUID_INTERFACE;
   1351    c->eax = cpu->hyperv_interface_id[0];
   1352    c->ebx = cpu->hyperv_interface_id[1];
   1353    c->ecx = cpu->hyperv_interface_id[2];
   1354    c->edx = cpu->hyperv_interface_id[3];
   1355
   1356    c = &cpuid_ent[cpuid_i++];
   1357    c->function = HV_CPUID_VERSION;
   1358    c->eax = cpu->hyperv_ver_id_build;
   1359    c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
   1360        cpu->hyperv_ver_id_minor;
   1361    c->ecx = cpu->hyperv_ver_id_sp;
   1362    c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
   1363        (cpu->hyperv_ver_id_sn & 0xffffff);
   1364
   1365    c = &cpuid_ent[cpuid_i++];
   1366    c->function = HV_CPUID_FEATURES;
   1367    c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
   1368    c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
   1369    c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);
   1370
   1371    /* Unconditionally required with any Hyper-V enlightenment */
   1372    c->eax |= HV_HYPERCALL_AVAILABLE;
   1373
   1374    /* SynIC and Vmbus devices require messages/signals hypercalls */
   1375    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
   1376        !cpu->hyperv_synic_kvm_only) {
   1377        c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
   1378    }
   1379
   1380
   1381    /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
   1382    c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
   1383
   1384    c = &cpuid_ent[cpuid_i++];
   1385    c->function = HV_CPUID_ENLIGHTMENT_INFO;
   1386    c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
   1387    c->ebx = cpu->hyperv_spinlock_attempts;
   1388
   1389    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
   1390        !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
   1391        c->eax |= HV_APIC_ACCESS_RECOMMENDED;
   1392    }
   1393
   1394    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
   1395        c->eax |= HV_NO_NONARCH_CORESHARING;
   1396    } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
   1397        c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
   1398            HV_NO_NONARCH_CORESHARING;
   1399    }
   1400
   1401    c = &cpuid_ent[cpuid_i++];
   1402    c->function = HV_CPUID_IMPLEMENT_LIMITS;
   1403    c->eax = cpu->hv_max_vps;
   1404    c->ebx = cpu->hyperv_limits[0];
   1405    c->ecx = cpu->hyperv_limits[1];
   1406    c->edx = cpu->hyperv_limits[2];
   1407
   1408    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
   1409        __u32 function;
   1410
   1411        /* Create zeroed 0x40000006..0x40000009 leaves */
   1412        for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
   1413             function < HV_CPUID_NESTED_FEATURES; function++) {
   1414            c = &cpuid_ent[cpuid_i++];
   1415            c->function = function;
   1416        }
   1417
   1418        c = &cpuid_ent[cpuid_i++];
   1419        c->function = HV_CPUID_NESTED_FEATURES;
   1420        c->eax = cpu->hyperv_nested[0];
   1421    }
   1422
   1423    return cpuid_i;
   1424}
   1425
   1426static Error *hv_passthrough_mig_blocker;
   1427static Error *hv_no_nonarch_cs_mig_blocker;
   1428
   1429/* Checks that the exposed eVMCS version range is supported by KVM */
   1430static bool evmcs_version_supported(uint16_t evmcs_version,
   1431                                    uint16_t supported_evmcs_version)
   1432{
   1433    uint8_t min_version = evmcs_version & 0xff;
   1434    uint8_t max_version = evmcs_version >> 8;
   1435    uint8_t min_supported_version = supported_evmcs_version & 0xff;
   1436    uint8_t max_supported_version = supported_evmcs_version >> 8;
   1437
   1438    return (min_version >= min_supported_version) &&
   1439        (max_version <= max_supported_version);
   1440}
   1441
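       /* Low byte = minimum, high byte = maximum supported eVMCS version, so
        * ((1 << 8) | 1) exposes exactly version 1 to the guest. */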
   1442#define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)
   1443
   1444static int hyperv_init_vcpu(X86CPU *cpu)
   1445{
   1446    CPUState *cs = CPU(cpu);
   1447    Error *local_err = NULL;
   1448    int ret;
   1449
   1450    if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
   1451        error_setg(&hv_passthrough_mig_blocker,
   1452                   "'hv-passthrough' CPU flag prevents migration, use explicit"
   1453                   " set of hv-* flags instead");
   1454        ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err);
   1455        if (ret < 0) {
   1456            error_report_err(local_err);
   1457            return ret;
   1458        }
   1459    }
   1460
   1461    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
   1462        hv_no_nonarch_cs_mig_blocker == NULL) {
   1463        error_setg(&hv_no_nonarch_cs_mig_blocker,
   1464                   "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration,"
   1465                   " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
   1466                   " make sure SMT is disabled and/or that vCPUs are properly"
   1467                   " pinned)");
   1468        ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err);
   1469        if (ret < 0) {
   1470            error_report_err(local_err);
   1471            return ret;
   1472        }
   1473    }
   1474
   1475    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
   1476        /*
   1477         * the kernel doesn't support setting vp_index; assert that its value
   1478         * is in sync
   1479         */
   1480        struct {
   1481            struct kvm_msrs info;
   1482            struct kvm_msr_entry entries[1];
   1483        } msr_data = {
   1484            .info.nmsrs = 1,
   1485            .entries[0].index = HV_X64_MSR_VP_INDEX,
   1486        };
   1487
   1488        ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
   1489        if (ret < 0) {
   1490            return ret;
   1491        }
   1492        assert(ret == 1);
   1493
   1494        if (msr_data.entries[0].data != hyperv_vp_index(CPU(cpu))) {
   1495            error_report("kernel's vp_index != QEMU's vp_index");
   1496            return -ENXIO;
   1497        }
   1498    }
   1499
   1500    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
   1501        uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
   1502            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
   1503        ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
   1504        if (ret < 0) {
   1505            error_report("failed to turn on HyperV SynIC in KVM: %s",
   1506                         strerror(-ret));
   1507            return ret;
   1508        }
   1509
   1510        if (!cpu->hyperv_synic_kvm_only) {
   1511            ret = hyperv_x86_synic_add(cpu);
   1512            if (ret < 0) {
   1513                error_report("failed to create HyperV SynIC: %s",
   1514                             strerror(-ret));
   1515                return ret;
   1516            }
   1517        }
   1518    }
   1519
   1520    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
   1521        uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
   1522        uint16_t supported_evmcs_version;
   1523
   1524        ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
   1525                                  (uintptr_t)&supported_evmcs_version);
   1526
   1527        /*
   1528         * KVM is required to support EVMCS ver.1, as that's what the
   1529         * 'hv-evmcs' option sets. Note: we hardcode the maximum supported
   1530         * eVMCS version to '1' as well, so the 'hv-evmcs' feature stays
   1531         * migratable even when (and if) ver.2 is implemented. A new option
   1532         * (e.g. 'hv-evmcs=2') will then have to be added.
   1533         */
   1534        if (ret < 0) {
   1535            error_report("Hyper-V %s is not supported by kernel",
   1536                         kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
   1537            return ret;
   1538        }
   1539
   1540        if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
   1541            error_report("eVMCS version range [%d..%d] is not supported by "
   1542                         "kernel (supported: [%d..%d])", evmcs_version & 0xff,
   1543                         evmcs_version >> 8, supported_evmcs_version & 0xff,
   1544                         supported_evmcs_version >> 8);
   1545            return -ENOTSUP;
   1546        }
   1547
   1548        cpu->hyperv_nested[0] = evmcs_version;
   1549    }
   1550
   1551    if (cpu->hyperv_enforce_cpuid) {
   1552        ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
   1553        if (ret < 0) {
   1554            error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
   1555                         strerror(-ret));
   1556            return ret;
   1557        }
   1558    }
   1559
   1560    return 0;
   1561}
   1562
   1563static Error *invtsc_mig_blocker;
   1564
   1565#define KVM_MAX_CPUID_ENTRIES  100
   1566
   1567int kvm_arch_init_vcpu(CPUState *cs)
   1568{
   1569    struct {
   1570        struct kvm_cpuid2 cpuid;
   1571        struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
   1572    } cpuid_data;
   1573    /*
   1574     * The kernel defines these structs with padding fields so there
   1575     * should be no extra padding in our cpuid_data struct.
   1576     */
   1577    QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
   1578                      sizeof(struct kvm_cpuid2) +
   1579                      sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
   1580
   1581    X86CPU *cpu = X86_CPU(cs);
   1582    CPUX86State *env = &cpu->env;
   1583    uint32_t limit, i, j, cpuid_i;
   1584    uint32_t unused;
   1585    struct kvm_cpuid_entry2 *c;
   1586    uint32_t signature[3];
   1587    int kvm_base = KVM_CPUID_SIGNATURE;
   1588    int max_nested_state_len;
   1589    int r;
   1590    Error *local_err = NULL;
   1591
   1592    memset(&cpuid_data, 0, sizeof(cpuid_data));
   1593
   1594    cpuid_i = 0;
   1595
   1596    r = kvm_arch_set_tsc_khz(cs);
   1597    if (r < 0) {
   1598        return r;
   1599    }
   1600
   1601    /* The vCPU's TSC frequency is either specified by the user or follows
   1602     * the value used by KVM when the former is not present. In the
   1603     * latter case, we query it from KVM and record it in env->tsc_khz,
   1604     * so that the vCPU's TSC frequency can be migrated later via this field.
   1605     */
   1606    if (!env->tsc_khz) {
   1607        r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
   1608            kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
   1609            -ENOTSUP;
   1610        if (r > 0) {
   1611            env->tsc_khz = r;
   1612        }
   1613    }
   1614
   1615    env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
   1616
   1617    /*
   1618     * kvm_hyperv_expand_features() is called here for the second time in case
   1619     * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
   1620     * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
   1621     * check which Hyper-V enlightenments are supported and which are not, we
   1622     * can still proceed and check/expand Hyper-V enlightenments here so legacy
   1623     * behavior is preserved.
   1624     */
   1625    if (!kvm_hyperv_expand_features(cpu, &local_err)) {
   1626        error_report_err(local_err);
   1627        return -ENOSYS;
   1628    }
   1629
   1630    if (hyperv_enabled(cpu)) {
   1631        r = hyperv_init_vcpu(cpu);
   1632        if (r) {
   1633            return r;
   1634        }
   1635
   1636        cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
   1637        kvm_base = KVM_CPUID_SIGNATURE_NEXT;
   1638        has_msr_hv_hypercall = true;
   1639    }
   1640
   1641    if (cpu->expose_kvm) {
   1642        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
   1643        c = &cpuid_data.entries[cpuid_i++];
   1644        c->function = KVM_CPUID_SIGNATURE | kvm_base;
   1645        c->eax = KVM_CPUID_FEATURES | kvm_base;
   1646        c->ebx = signature[0];
   1647        c->ecx = signature[1];
   1648        c->edx = signature[2];
   1649
   1650        c = &cpuid_data.entries[cpuid_i++];
   1651        c->function = KVM_CPUID_FEATURES | kvm_base;
   1652        c->eax = env->features[FEAT_KVM];
   1653        c->edx = env->features[FEAT_KVM_HINTS];
   1654    }
   1655
   1656    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
   1657
   1658    if (cpu->kvm_pv_enforce_cpuid) {
   1659        r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
   1660        if (r < 0) {
   1661            fprintf(stderr,
   1662                    "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
   1663                    strerror(-r));
   1664            abort();
   1665        }
   1666    }
   1667
   1668    for (i = 0; i <= limit; i++) {
   1669        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1670            fprintf(stderr, "unsupported level value: 0x%x\n", limit);
   1671            abort();
   1672        }
   1673        c = &cpuid_data.entries[cpuid_i++];
   1674
   1675        switch (i) {
   1676        case 2: {
   1677            /* Keep reading function 2 till all the input is received */
   1678            int times;
   1679
   1680            c->function = i;
   1681            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
   1682                       KVM_CPUID_FLAG_STATE_READ_NEXT;
   1683            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1684            times = c->eax & 0xff;
   1685
   1686            for (j = 1; j < times; ++j) {
   1687                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1688                    fprintf(stderr, "cpuid_data is full, no space for "
   1689                            "cpuid(eax:2):eax & 0xff = 0x%x\n", times);
   1690                    abort();
   1691                }
   1692                c = &cpuid_data.entries[cpuid_i++];
   1693                c->function = i;
   1694                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
   1695                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1696            }
   1697            break;
   1698        }
   1699        case 0x1f:
   1700            if (env->nr_dies < 2) {
   1701                break;
   1702            }
   1703            /* fallthrough */
   1704        case 4:
   1705        case 0xb:
   1706        case 0xd:
   1707            for (j = 0; ; j++) {
   1708                if (i == 0xd && j == 64) {
   1709                    break;
   1710                }
   1711
   1712                if (i == 0x1f && j == 64) {
   1713                    break;
   1714                }
   1715
   1716                c->function = i;
   1717                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1718                c->index = j;
   1719                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1720
   1721                if (i == 4 && c->eax == 0) {
   1722                    break;
   1723                }
   1724                if (i == 0xb && !(c->ecx & 0xff00)) {
   1725                    break;
   1726                }
   1727                if (i == 0x1f && !(c->ecx & 0xff00)) {
   1728                    break;
   1729                }
   1730                if (i == 0xd && c->eax == 0) {
   1731                    continue;
   1732                }
   1733                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1734                    fprintf(stderr, "cpuid_data is full, no space for "
   1735                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
   1736                    abort();
   1737                }
   1738                c = &cpuid_data.entries[cpuid_i++];
   1739            }
   1740            break;
   1741        case 0x7:
   1742        case 0x12:
   1743            for (j = 0; ; j++) {
   1744                c->function = i;
   1745                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1746                c->index = j;
   1747                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1748
   1749                if (j > 1 && (c->eax & 0xf) != 1) {
   1750                    break;
   1751                }
   1752
   1753                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1754                    fprintf(stderr, "cpuid_data is full, no space for "
   1755                                "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
   1756                    abort();
   1757                }
   1758                c = &cpuid_data.entries[cpuid_i++];
   1759            }
   1760            break;
   1761        case 0x14: {
   1762            uint32_t times;
   1763
   1764            c->function = i;
   1765            c->index = 0;
   1766            c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1767            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1768            times = c->eax;
   1769
   1770            for (j = 1; j <= times; ++j) {
   1771                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1772                    fprintf(stderr, "cpuid_data is full, no space for "
   1773                                "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
   1774                    abort();
   1775                }
   1776                c = &cpuid_data.entries[cpuid_i++];
   1777                c->function = i;
   1778                c->index = j;
   1779                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1780                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1781            }
   1782            break;
   1783        }
   1784        default:
   1785            c->function = i;
   1786            c->flags = 0;
   1787            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1788            if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
   1789                /*
   1790                 * KVM already returns all zeroes if a CPUID entry is missing,
   1791                 * so we can omit it and avoid hitting KVM's 80-entry limit.
   1792                 */
   1793                cpuid_i--;
   1794            }
   1795            break;
   1796        }
   1797    }
   1798
   1799    if (limit >= 0x0a) {
   1800        uint32_t eax, edx;
   1801
   1802        cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
   1803
   1804        has_architectural_pmu_version = eax & 0xff;
   1805        if (has_architectural_pmu_version > 0) {
   1806            num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
   1807
   1808            /* Shouldn't be more than 32, since that's the number of bits
   1809             * available in EBX to tell us _which_ counters are available.
   1810             * Play it safe.
   1811             */
   1812            if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
   1813                num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
   1814            }
   1815
   1816            if (has_architectural_pmu_version > 1) {
   1817                num_architectural_pmu_fixed_counters = edx & 0x1f;
   1818
   1819                if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
   1820                    num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
   1821                }
   1822            }
   1823        }
   1824    }
   1825
   1826    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
   1827
   1828    for (i = 0x80000000; i <= limit; i++) {
   1829        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1830            fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
   1831            abort();
   1832        }
   1833        c = &cpuid_data.entries[cpuid_i++];
   1834
   1835        switch (i) {
   1836        case 0x8000001d:
   1837            /* Query for all AMD cache information leaves */
   1838            for (j = 0; ; j++) {
   1839                c->function = i;
   1840                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1841                c->index = j;
   1842                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1843
   1844                if (c->eax == 0) {
   1845                    break;
   1846                }
   1847                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1848                    fprintf(stderr, "cpuid_data is full, no space for "
   1849                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
   1850                    abort();
   1851                }
   1852                c = &cpuid_data.entries[cpuid_i++];
   1853            }
   1854            break;
   1855        default:
   1856            c->function = i;
   1857            c->flags = 0;
   1858            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1859            if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
   1860                /*
   1861                 * KVM already returns all zeroes if a CPUID entry is missing,
   1862                 * so we can omit it and avoid hitting KVM's 80-entry limit.
   1863                 */
   1864                cpuid_i--;
   1865            }
   1866            break;
   1867        }
   1868    }
   1869
   1870    /* Call Centaur's CPUID instructions if they are supported. */
   1871    if (env->cpuid_xlevel2 > 0) {
   1872        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
   1873
   1874        for (i = 0xC0000000; i <= limit; i++) {
   1875            if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1876                fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
   1877                abort();
   1878            }
   1879            c = &cpuid_data.entries[cpuid_i++];
   1880
   1881            c->function = i;
   1882            c->flags = 0;
   1883            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1884        }
   1885    }
   1886
   1887    cpuid_data.cpuid.nent = cpuid_i;
   1888
   1889    if (((env->cpuid_version >> 8)&0xF) >= 6
   1890        && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
   1891           (CPUID_MCE | CPUID_MCA)
   1892        && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
   1893        uint64_t mcg_cap, unsupported_caps;
   1894        int banks;
   1895        int ret;
   1896
   1897        ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
   1898        if (ret < 0) {
   1899            fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
   1900            return ret;
   1901        }
   1902
   1903        if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
   1904            error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
   1905                         (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
   1906            return -ENOTSUP;
   1907        }
   1908
   1909        unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
   1910        if (unsupported_caps) {
   1911            if (unsupported_caps & MCG_LMCE_P) {
   1912                error_report("kvm: LMCE not supported");
   1913                return -ENOTSUP;
   1914            }
   1915            warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
   1916                        unsupported_caps);
   1917        }
   1918
   1919        env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
   1920        ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
   1921        if (ret < 0) {
   1922            fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
   1923            return ret;
   1924        }
   1925    }
   1926
   1927    cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
   1928
   1929    c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
   1930    if (c) {
   1931        has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
   1932                                  !!(c->ecx & CPUID_EXT_SMX);
   1933    }
   1934
   1935    c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0);
   1936    if (c && (c->ebx & CPUID_7_0_EBX_SGX)) {
   1937        has_msr_feature_control = true;
   1938    }
   1939
   1940    if (env->mcg_cap & MCG_LMCE_P) {
   1941        has_msr_mcg_ext_ctl = has_msr_feature_control = true;
   1942    }
   1943
   1944    if (!env->user_tsc_khz) {
   1945        if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
   1946            invtsc_mig_blocker == NULL) {
   1947            error_setg(&invtsc_mig_blocker,
   1948                       "State blocked by non-migratable CPU device"
   1949                       " (invtsc flag)");
   1950            r = migrate_add_blocker(invtsc_mig_blocker, &local_err);
   1951            if (r < 0) {
   1952                error_report_err(local_err);
   1953                return r;
   1954            }
   1955        }
   1956    }
   1957
   1958    if (cpu->vmware_cpuid_freq
   1959        /* Guests depend on 0x40000000 to detect this feature, so only expose
   1960         * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
   1961        && cpu->expose_kvm
   1962        && kvm_base == KVM_CPUID_SIGNATURE
   1963        /* TSC clock must be stable and known for this feature. */
   1964        && tsc_is_stable_and_known(env)) {
   1965
   1966        c = &cpuid_data.entries[cpuid_i++];
   1967        c->function = KVM_CPUID_SIGNATURE | 0x10;
   1968        c->eax = env->tsc_khz;
   1969        c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
   1970        c->ecx = c->edx = 0;
   1971
   1972        c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
   1973        c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
   1974    }
   1975
   1976    cpuid_data.cpuid.nent = cpuid_i;
   1977
   1978    cpuid_data.cpuid.padding = 0;
   1979    r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
   1980    if (r) {
   1981        goto fail;
   1982    }
   1983
   1984    if (has_xsave) {
   1985        env->xsave_buf_len = sizeof(struct kvm_xsave);
   1986        env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
   1987        memset(env->xsave_buf, 0, env->xsave_buf_len);
   1988
   1989        /*
   1990         * The allocated storage must be large enough for all of the
   1991         * possible XSAVE state components.
   1992         */
   1993        assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX)
   1994               <= env->xsave_buf_len);
   1995    }
   1996
   1997    max_nested_state_len = kvm_max_nested_state_length();
   1998    if (max_nested_state_len > 0) {
   1999        assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
   2000
   2001        if (cpu_has_vmx(env) || cpu_has_svm(env)) {
   2002            struct kvm_vmx_nested_state_hdr *vmx_hdr;
   2003
   2004            env->nested_state = g_malloc0(max_nested_state_len);
   2005            env->nested_state->size = max_nested_state_len;
   2006
   2007            if (cpu_has_vmx(env)) {
   2008                env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
   2009                vmx_hdr = &env->nested_state->hdr.vmx;
   2010                vmx_hdr->vmxon_pa = -1ull;
   2011                vmx_hdr->vmcs12_pa = -1ull;
   2012            } else {
   2013                env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
   2014            }
   2015        }
   2016    }
   2017
   2018    cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
   2019
   2020    if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
   2021        has_msr_tsc_aux = false;
   2022    }
   2023
   2024    kvm_init_msrs(cpu);
   2025
   2026    return 0;
   2027
   2028 fail:
   2029    migrate_del_blocker(invtsc_mig_blocker);
   2030
   2031    return r;
   2032}
   2033
   2034int kvm_arch_destroy_vcpu(CPUState *cs)
   2035{
   2036    X86CPU *cpu = X86_CPU(cs);
   2037    CPUX86State *env = &cpu->env;
   2038
   2039    if (cpu->kvm_msr_buf) {
   2040        g_free(cpu->kvm_msr_buf);
   2041        cpu->kvm_msr_buf = NULL;
   2042    }
   2043
   2044    if (env->nested_state) {
   2045        g_free(env->nested_state);
   2046        env->nested_state = NULL;
   2047    }
   2048
   2049    qemu_del_vm_change_state_handler(cpu->vmsentry);
   2050
   2051    return 0;
   2052}
   2053
   2054void kvm_arch_reset_vcpu(X86CPU *cpu)
   2055{
   2056    CPUX86State *env = &cpu->env;
   2057
   2058    env->xcr0 = 1;
   2059    if (kvm_irqchip_in_kernel()) {
   2060        env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
   2061                                          KVM_MP_STATE_UNINITIALIZED;
   2062    } else {
   2063        env->mp_state = KVM_MP_STATE_RUNNABLE;
   2064    }
   2065
   2066    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
   2067        int i;
   2068        for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
   2069            env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
   2070        }
   2071
   2072        hyperv_x86_synic_reset(cpu);
   2073    }
   2074    /* enabled by default */
   2075    env->poll_control_msr = 1;
   2076
   2077    sev_es_set_reset_vector(CPU(cpu));
   2078}
   2079
   2080void kvm_arch_do_init_vcpu(X86CPU *cpu)
   2081{
   2082    CPUX86State *env = &cpu->env;
   2083
   2084    /* APs get directly into wait-for-SIPI state.  */
   2085    if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
   2086        env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
   2087    }
   2088}
   2089
   2090static int kvm_get_supported_feature_msrs(KVMState *s)
   2091{
   2092    int ret = 0;
   2093
   2094    if (kvm_feature_msrs != NULL) {
   2095        return 0;
   2096    }
   2097
   2098    if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
   2099        return 0;
   2100    }
   2101
   2102    struct kvm_msr_list msr_list;
   2103
   2104    msr_list.nmsrs = 0;
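           /*
            * A size probe: with nmsrs == 0 this ioctl is expected to fail with
            * -E2BIG and fill in msr_list.nmsrs with the number of feature MSRs
            * the kernel knows about, which is why -E2BIG is not treated as an
            * error just below.
            */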
   2105    ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
   2106    if (ret < 0 && ret != -E2BIG) {
   2107        error_report("Fetch KVM feature MSR list failed: %s",
   2108            strerror(-ret));
   2109        return ret;
   2110    }
   2111
   2112    assert(msr_list.nmsrs > 0);
   2113    kvm_feature_msrs = (struct kvm_msr_list *) \
   2114        g_malloc0(sizeof(msr_list) +
   2115                 msr_list.nmsrs * sizeof(msr_list.indices[0]));
   2116
   2117    kvm_feature_msrs->nmsrs = msr_list.nmsrs;
   2118    ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
   2119
   2120    if (ret < 0) {
   2121        error_report("Fetch KVM feature MSR list failed: %s",
   2122            strerror(-ret));
   2123        g_free(kvm_feature_msrs);
   2124        kvm_feature_msrs = NULL;
   2125        return ret;
   2126    }
   2127
   2128    return 0;
   2129}
   2130
   2131static int kvm_get_supported_msrs(KVMState *s)
   2132{
   2133    int ret = 0;
   2134    struct kvm_msr_list msr_list, *kvm_msr_list;
   2135
   2136    /*
   2137     *  Obtain MSR list from KVM.  These are the MSRs that we must
   2138     *  save/restore.
   2139     */
   2140    msr_list.nmsrs = 0;
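           /*
            * Same size-probe pattern as above: a -E2BIG result here only
            * reports how many MSR indices must be allocated for the second
            * KVM_GET_MSR_INDEX_LIST call below.
            */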
   2141    ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
   2142    if (ret < 0 && ret != -E2BIG) {
   2143        return ret;
   2144    }
   2145    /*
   2146     * Old kernel modules had a bug and could write beyond the provided
   2147     * memory. Allocate a safe minimum of 1K to work around this.
   2148     */
   2149    kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
   2150                                          msr_list.nmsrs *
   2151                                          sizeof(msr_list.indices[0])));
   2152
   2153    kvm_msr_list->nmsrs = msr_list.nmsrs;
   2154    ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
   2155    if (ret >= 0) {
   2156        int i;
   2157
   2158        for (i = 0; i < kvm_msr_list->nmsrs; i++) {
   2159            switch (kvm_msr_list->indices[i]) {
   2160            case MSR_STAR:
   2161                has_msr_star = true;
   2162                break;
   2163            case MSR_VM_HSAVE_PA:
   2164                has_msr_hsave_pa = true;
   2165                break;
   2166            case MSR_TSC_AUX:
   2167                has_msr_tsc_aux = true;
   2168                break;
   2169            case MSR_TSC_ADJUST:
   2170                has_msr_tsc_adjust = true;
   2171                break;
   2172            case MSR_IA32_TSCDEADLINE:
   2173                has_msr_tsc_deadline = true;
   2174                break;
   2175            case MSR_IA32_SMBASE:
   2176                has_msr_smbase = true;
   2177                break;
   2178            case MSR_SMI_COUNT:
   2179                has_msr_smi_count = true;
   2180                break;
   2181            case MSR_IA32_MISC_ENABLE:
   2182                has_msr_misc_enable = true;
   2183                break;
   2184            case MSR_IA32_BNDCFGS:
   2185                has_msr_bndcfgs = true;
   2186                break;
   2187            case MSR_IA32_XSS:
   2188                has_msr_xss = true;
   2189                break;
   2190            case MSR_IA32_UMWAIT_CONTROL:
   2191                has_msr_umwait = true;
   2192                break;
   2193            case HV_X64_MSR_CRASH_CTL:
   2194                has_msr_hv_crash = true;
   2195                break;
   2196            case HV_X64_MSR_RESET:
   2197                has_msr_hv_reset = true;
   2198                break;
   2199            case HV_X64_MSR_VP_INDEX:
   2200                has_msr_hv_vpindex = true;
   2201                break;
   2202            case HV_X64_MSR_VP_RUNTIME:
   2203                has_msr_hv_runtime = true;
   2204                break;
   2205            case HV_X64_MSR_SCONTROL:
   2206                has_msr_hv_synic = true;
   2207                break;
   2208            case HV_X64_MSR_STIMER0_CONFIG:
   2209                has_msr_hv_stimer = true;
   2210                break;
   2211            case HV_X64_MSR_TSC_FREQUENCY:
   2212                has_msr_hv_frequencies = true;
   2213                break;
   2214            case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
   2215                has_msr_hv_reenlightenment = true;
   2216                break;
   2217            case MSR_IA32_SPEC_CTRL:
   2218                has_msr_spec_ctrl = true;
   2219                break;
   2220            case MSR_IA32_TSX_CTRL:
   2221                has_msr_tsx_ctrl = true;
   2222                break;
   2223            case MSR_VIRT_SSBD:
   2224                has_msr_virt_ssbd = true;
   2225                break;
   2226            case MSR_IA32_ARCH_CAPABILITIES:
   2227                has_msr_arch_capabs = true;
   2228                break;
   2229            case MSR_IA32_CORE_CAPABILITY:
   2230                has_msr_core_capabs = true;
   2231                break;
   2232            case MSR_IA32_PERF_CAPABILITIES:
   2233                has_msr_perf_capabs = true;
   2234                break;
   2235            case MSR_IA32_VMX_VMFUNC:
   2236                has_msr_vmx_vmfunc = true;
   2237                break;
   2238            case MSR_IA32_UCODE_REV:
   2239                has_msr_ucode_rev = true;
   2240                break;
   2241            case MSR_IA32_VMX_PROCBASED_CTLS2:
   2242                has_msr_vmx_procbased_ctls2 = true;
   2243                break;
   2244            case MSR_IA32_PKRS:
   2245                has_msr_pkrs = true;
   2246                break;
   2247            }
   2248        }
   2249    }
   2250
   2251    g_free(kvm_msr_list);
   2252
   2253    return ret;
   2254}
   2255
   2256static Notifier smram_machine_done;
   2257static KVMMemoryListener smram_listener;
   2258static AddressSpace smram_address_space;
   2259static MemoryRegion smram_as_root;
   2260static MemoryRegion smram_as_mem;
   2261
   2262static void register_smram_listener(Notifier *n, void *unused)
   2263{
   2264    MemoryRegion *smram =
   2265        (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
   2266
   2267    /* Outer container... */
   2268    memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
   2269    memory_region_set_enabled(&smram_as_root, true);
   2270
   2271    /* ... with two regions inside: normal system memory with low
   2272     * priority, and...
   2273     */
   2274    memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
   2275                             get_system_memory(), 0, ~0ull);
   2276    memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
   2277    memory_region_set_enabled(&smram_as_mem, true);
   2278
   2279    if (smram) {
   2280        /* ... SMRAM with higher priority */
   2281        memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
   2282        memory_region_set_enabled(smram, true);
   2283    }
   2284
   2285    address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
   2286    kvm_memory_listener_register(kvm_state, &smram_listener,
   2287                                 &smram_address_space, 1, "kvm-smram");
   2288}
   2289
   2290int kvm_arch_init(MachineState *ms, KVMState *s)
   2291{
   2292    uint64_t identity_base = 0xfffbc000;
   2293    uint64_t shadow_mem;
   2294    int ret;
   2295    struct utsname utsname;
   2296    Error *local_err = NULL;
   2297
   2298    /*
   2299     * Initialize SEV context, if required
   2300     *
   2301     * If no memory encryption is requested (ms->cgs == NULL) this is
   2302     * a no-op.
   2303     *
   2304     * It's also a no-op if a non-SEV confidential guest support
   2305     * mechanism is selected.  SEV is the only mechanism available to
   2306     * select on x86 at present, so this doesn't arise, but if new
   2307     * mechanisms are supported in future (e.g. TDX), they'll need
   2308     * their own initialization either here or elsewhere.
   2309     */
   2310    ret = sev_kvm_init(ms->cgs, &local_err);
   2311    if (ret < 0) {
   2312        error_report_err(local_err);
   2313        return ret;
   2314    }
   2315
   2316    if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
   2317        error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM");
   2318        return -ENOTSUP;
   2319    }
   2320
   2321    has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
   2322    has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
   2323    has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
   2324
   2325    hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
   2326
   2327    has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
   2328    if (has_exception_payload) {
   2329        ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
   2330        if (ret < 0) {
   2331            error_report("kvm: Failed to enable exception payload cap: %s",
   2332                         strerror(-ret));
   2333            return ret;
   2334        }
   2335    }
   2336
   2337    ret = kvm_get_supported_msrs(s);
   2338    if (ret < 0) {
   2339        return ret;
   2340    }
   2341
   2342    kvm_get_supported_feature_msrs(s);
   2343
   2344    uname(&utsname);
   2345    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
   2346
   2347    /*
   2348     * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
   2349     * In order to use vm86 mode, an EPT identity map and a TSS are needed.
   2350     * Since these must be part of guest physical memory, we need to allocate
   2351     * them, both by setting their start addresses in the kernel and by
   2352     * creating a corresponding e820 entry. We need 4 pages before the BIOS.
   2353     *
   2354     * Older KVM versions may not support setting the identity map base. In
   2355     * that case we need to stick with the default, i.e. a 256K maximum BIOS
   2356     * size.
   2357     */
   2358    if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
   2359        /* Allows up to 16M BIOSes. */
   2360        identity_base = 0xfeffc000;
   2361
   2362        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
   2363        if (ret < 0) {
   2364            return ret;
   2365        }
   2366    }
   2367
   2368    /* Set TSS base one page after EPT identity map. */
   2369    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
   2370    if (ret < 0) {
   2371        return ret;
   2372    }
   2373
   2374    /* Tell fw_cfg to notify the BIOS to reserve the range. */
   2375    ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
   2376    if (ret < 0) {
   2377        fprintf(stderr, "e820_add_entry() table is full\n");
   2378        return ret;
   2379    }
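
           /*
            * Resulting layout of the reserved range (the KVM API defines the
            * TSS region set via KVM_SET_TSS_ADDR as three pages long, which is
            * presumably why four pages are reserved in total):
            *
            *   identity_base + 0x0000 : EPT identity map page
            *   identity_base + 0x1000 : TSS (three pages)
            *   identity_base + 0x4000 : end of the e820-reserved range
            */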
   2380
   2381    shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
   2382    if (shadow_mem != -1) {
   2383        shadow_mem /= 4096;
   2384        ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
   2385        if (ret < 0) {
   2386            return ret;
   2387        }
   2388    }
   2389
   2390    if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
   2391        object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
   2392        x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
   2393        smram_machine_done.notify = register_smram_listener;
   2394        qemu_add_machine_init_done_notifier(&smram_machine_done);
   2395    }
   2396
   2397    if (enable_cpu_pm) {
   2398        int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
   2399        int ret;
   2400
   2401/* Workaround for a kernel header with a typo. TODO: fix header and drop. */
   2402#if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
   2403#define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
   2404#endif
   2405        if (disable_exits) {
   2406            disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
   2407                              KVM_X86_DISABLE_EXITS_HLT |
   2408                              KVM_X86_DISABLE_EXITS_PAUSE |
   2409                              KVM_X86_DISABLE_EXITS_CSTATE);
   2410        }
   2411
   2412        ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
   2413                                disable_exits);
   2414        if (ret < 0) {
   2415            error_report("kvm: guest stopping CPU not supported: %s",
   2416                         strerror(-ret));
   2417        }
   2418    }
   2419
   2420    if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
   2421        X86MachineState *x86ms = X86_MACHINE(ms);
   2422
   2423        if (x86ms->bus_lock_ratelimit > 0) {
   2424            ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
   2425            if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
   2426                error_report("kvm: bus lock detection unsupported");
   2427                return -ENOTSUP;
   2428            }
   2429            ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
   2430                                    KVM_BUS_LOCK_DETECTION_EXIT);
   2431            if (ret < 0) {
   2432                error_report("kvm: Failed to enable bus lock detection cap: %s",
   2433                             strerror(-ret));
   2434                return ret;
   2435            }
   2436            ratelimit_init(&bus_lock_ratelimit_ctrl);
   2437            ratelimit_set_speed(&bus_lock_ratelimit_ctrl,
   2438                                x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME);
   2439        }
   2440    }
   2441
   2442    return 0;
   2443}
   2444
   2445static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
   2446{
   2447    lhs->selector = rhs->selector;
   2448    lhs->base = rhs->base;
   2449    lhs->limit = rhs->limit;
   2450    lhs->type = 3;
   2451    lhs->present = 1;
   2452    lhs->dpl = 3;
   2453    lhs->db = 0;
   2454    lhs->s = 1;
   2455    lhs->l = 0;
   2456    lhs->g = 0;
   2457    lhs->avl = 0;
   2458    lhs->unusable = 0;
   2459}
   2460
   2461static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
   2462{
   2463    unsigned flags = rhs->flags;
   2464    lhs->selector = rhs->selector;
   2465    lhs->base = rhs->base;
   2466    lhs->limit = rhs->limit;
   2467    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
   2468    lhs->present = (flags & DESC_P_MASK) != 0;
   2469    lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
   2470    lhs->db = (flags >> DESC_B_SHIFT) & 1;
   2471    lhs->s = (flags & DESC_S_MASK) != 0;
   2472    lhs->l = (flags >> DESC_L_SHIFT) & 1;
   2473    lhs->g = (flags & DESC_G_MASK) != 0;
   2474    lhs->avl = (flags & DESC_AVL_MASK) != 0;
   2475    lhs->unusable = !lhs->present;
   2476    lhs->padding = 0;
   2477}
   2478
   2479static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
   2480{
   2481    lhs->selector = rhs->selector;
   2482    lhs->base = rhs->base;
   2483    lhs->limit = rhs->limit;
   2484    lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
   2485                 ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
   2486                 (rhs->dpl << DESC_DPL_SHIFT) |
   2487                 (rhs->db << DESC_B_SHIFT) |
   2488                 (rhs->s * DESC_S_MASK) |
   2489                 (rhs->l << DESC_L_SHIFT) |
   2490                 (rhs->g * DESC_G_MASK) |
   2491                 (rhs->avl * DESC_AVL_MASK);
   2492}
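
       /*
        * The DESC_* shifts and masks used by set_seg()/get_seg() follow the
        * layout of the upper dword of an x86 segment descriptor (type at bits
        * 8..11, S at 12, DPL at 13..14, P at 15, AVL at 20, L at 21, D/B at
        * 22, G at 23), so the two helpers are nearly inverses of each other;
        * the only asymmetry is the present/unusable coupling handled
        * explicitly above.
        */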
   2493
   2494static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
   2495{
   2496    if (set) {
   2497        *kvm_reg = *qemu_reg;
   2498    } else {
   2499        *qemu_reg = *kvm_reg;
   2500    }
   2501}
   2502
   2503static int kvm_getput_regs(X86CPU *cpu, int set)
   2504{
   2505    CPUX86State *env = &cpu->env;
   2506    struct kvm_regs regs;
   2507    int ret = 0;
   2508
   2509    if (!set) {
   2510        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
   2511        if (ret < 0) {
   2512            return ret;
   2513        }
   2514    }
   2515
   2516    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
   2517    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
   2518    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
   2519    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
   2520    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
   2521    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
   2522    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
   2523    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
   2524#ifdef TARGET_X86_64
   2525    kvm_getput_reg(&regs.r8, &env->regs[8], set);
   2526    kvm_getput_reg(&regs.r9, &env->regs[9], set);
   2527    kvm_getput_reg(&regs.r10, &env->regs[10], set);
   2528    kvm_getput_reg(&regs.r11, &env->regs[11], set);
   2529    kvm_getput_reg(&regs.r12, &env->regs[12], set);
   2530    kvm_getput_reg(&regs.r13, &env->regs[13], set);
   2531    kvm_getput_reg(&regs.r14, &env->regs[14], set);
   2532    kvm_getput_reg(&regs.r15, &env->regs[15], set);
   2533#endif
   2534
   2535    kvm_getput_reg(&regs.rflags, &env->eflags, set);
   2536    kvm_getput_reg(&regs.rip, &env->eip, set);
   2537
   2538    if (set) {
   2539        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
   2540    }
   2541
   2542    return ret;
   2543}
   2544
   2545static int kvm_put_fpu(X86CPU *cpu)
   2546{
   2547    CPUX86State *env = &cpu->env;
   2548    struct kvm_fpu fpu;
   2549    int i;
   2550
   2551    memset(&fpu, 0, sizeof fpu);
   2552    fpu.fsw = env->fpus & ~(7 << 11);
   2553    fpu.fsw |= (env->fpstt & 7) << 11;
   2554    fpu.fcw = env->fpuc;
   2555    fpu.last_opcode = env->fpop;
   2556    fpu.last_ip = env->fpip;
   2557    fpu.last_dp = env->fpdp;
   2558    for (i = 0; i < 8; ++i) {
   2559        fpu.ftwx |= (!env->fptags[i]) << i;
   2560    }
   2561    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
   2562    for (i = 0; i < CPU_NB_REGS; i++) {
   2563        stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0));
   2564        stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1));
   2565    }
   2566    fpu.mxcsr = env->mxcsr;
   2567
   2568    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
   2569}
   2570
   2571static int kvm_put_xsave(X86CPU *cpu)
   2572{
   2573    CPUX86State *env = &cpu->env;
   2574    void *xsave = env->xsave_buf;
   2575
   2576    if (!has_xsave) {
   2577        return kvm_put_fpu(cpu);
   2578    }
   2579    x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
   2580
   2581    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
   2582}
   2583
   2584static int kvm_put_xcrs(X86CPU *cpu)
   2585{
   2586    CPUX86State *env = &cpu->env;
   2587    struct kvm_xcrs xcrs = {};
   2588
   2589    if (!has_xcrs) {
   2590        return 0;
   2591    }
   2592
   2593    xcrs.nr_xcrs = 1;
   2594    xcrs.flags = 0;
   2595    xcrs.xcrs[0].xcr = 0;
   2596    xcrs.xcrs[0].value = env->xcr0;
   2597    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
   2598}
   2599
   2600static int kvm_put_sregs(X86CPU *cpu)
   2601{
   2602    CPUX86State *env = &cpu->env;
   2603    struct kvm_sregs sregs;
   2604
   2605    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
   2606    if (env->interrupt_injected >= 0) {
   2607        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
   2608                (uint64_t)1 << (env->interrupt_injected % 64);
   2609    }
   2610
   2611    if ((env->eflags & VM_MASK)) {
   2612        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
   2613        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
   2614        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
   2615        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
   2616        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
   2617        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
   2618    } else {
   2619        set_seg(&sregs.cs, &env->segs[R_CS]);
   2620        set_seg(&sregs.ds, &env->segs[R_DS]);
   2621        set_seg(&sregs.es, &env->segs[R_ES]);
   2622        set_seg(&sregs.fs, &env->segs[R_FS]);
   2623        set_seg(&sregs.gs, &env->segs[R_GS]);
   2624        set_seg(&sregs.ss, &env->segs[R_SS]);
   2625    }
   2626
   2627    set_seg(&sregs.tr, &env->tr);
   2628    set_seg(&sregs.ldt, &env->ldt);
   2629
   2630    sregs.idt.limit = env->idt.limit;
   2631    sregs.idt.base = env->idt.base;
   2632    memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
   2633    sregs.gdt.limit = env->gdt.limit;
   2634    sregs.gdt.base = env->gdt.base;
   2635    memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
   2636
   2637    sregs.cr0 = env->cr[0];
   2638    sregs.cr2 = env->cr[2];
   2639    sregs.cr3 = env->cr[3];
   2640    sregs.cr4 = env->cr[4];
   2641
   2642    sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
   2643    sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
   2644
   2645    sregs.efer = env->efer;
   2646
   2647    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
   2648}
   2649
   2650static void kvm_msr_buf_reset(X86CPU *cpu)
   2651{
   2652    memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
   2653}
   2654
   2655static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
   2656{
   2657    struct kvm_msrs *msrs = cpu->kvm_msr_buf;
   2658    void *limit = ((void *)msrs) + MSR_BUF_SIZE;
   2659    struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
   2660
   2661    assert((void *)(entry + 1) <= limit);
   2662
   2663    entry->index = index;
   2664    entry->reserved = 0;
   2665    entry->data = value;
   2666    msrs->nmsrs++;
   2667}
   2668
   2669static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
   2670{
   2671    kvm_msr_buf_reset(cpu);
   2672    kvm_msr_entry_add(cpu, index, value);
   2673
   2674    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
   2675}
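
       /*
        * Note on the return value convention relied upon below: on success
        * KVM_SET_MSRS returns the number of MSR entries it processed rather
        * than 0, which is why single-MSR callers such as kvm_put_apicbase()
        * assert that the result is 1 and why kvm_buf_set_msrs() compares the
        * result against nmsrs to locate the first entry that failed.
        */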
   2676
   2677void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
   2678{
   2679    int ret;
   2680
   2681    ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
   2682    assert(ret == 1);
   2683}
   2684
   2685static int kvm_put_tscdeadline_msr(X86CPU *cpu)
   2686{
   2687    CPUX86State *env = &cpu->env;
   2688    int ret;
   2689
   2690    if (!has_msr_tsc_deadline) {
   2691        return 0;
   2692    }
   2693
   2694    ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
   2695    if (ret < 0) {
   2696        return ret;
   2697    }
   2698
   2699    assert(ret == 1);
   2700    return 0;
   2701}
   2702
   2703/*
   2704 * Provide a separate write service for the feature control MSR in order to
   2705 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
   2706 * before writing any other state because forcibly leaving nested mode
   2707 * invalidates the VCPU state.
   2708 */
   2709static int kvm_put_msr_feature_control(X86CPU *cpu)
   2710{
   2711    int ret;
   2712
   2713    if (!has_msr_feature_control) {
   2714        return 0;
   2715    }
   2716
   2717    ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
   2718                          cpu->env.msr_ia32_feature_control);
   2719    if (ret < 0) {
   2720        return ret;
   2721    }
   2722
   2723    assert(ret == 1);
   2724    return 0;
   2725}
   2726
   2727static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
   2728{
   2729    uint32_t default1, can_be_one, can_be_zero;
   2730    uint32_t must_be_one;
   2731
   2732    switch (index) {
   2733    case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
   2734        default1 = 0x00000016;
   2735        break;
   2736    case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
   2737        default1 = 0x0401e172;
   2738        break;
   2739    case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
   2740        default1 = 0x000011ff;
   2741        break;
   2742    case MSR_IA32_VMX_TRUE_EXIT_CTLS:
   2743        default1 = 0x00036dff;
   2744        break;
   2745    case MSR_IA32_VMX_PROCBASED_CTLS2:
   2746        default1 = 0;
   2747        break;
   2748    default:
   2749        abort();
   2750    }
   2751
   2752    /* If a feature bit is set, the control can be either set or clear.
   2753     * Otherwise the control is fixed to the value given by default1 (0 or 1).
   2754     */
   2755    can_be_one = features | default1;
   2756    can_be_zero = features | ~default1;
   2757    must_be_one = ~can_be_zero;
   2758
   2759    /*
   2760     * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one).
   2761     * Bit 32:63 -> 1 if the control bit can be one.
   2762     */
   2763    return must_be_one | (((uint64_t)can_be_one) << 32);
   2764}
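
       /*
        * Worked example with hypothetical inputs: for
        * MSR_IA32_VMX_TRUE_PINBASED_CTLS, default1 is 0x00000016 (bits 1, 2
        * and 4).  With features == 0x00000028 (bits 3 and 5), this gives
        * can_be_one = 0x3e, can_be_zero = 0xffffffe9 and must_be_one = 0x16,
        * so the function returns 0x0000003e00000016: bits 1, 2 and 4 are
        * reported as fixed to one, while bits 1..5 are allowed to be one.
        */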
   2765
   2766static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
   2767{
   2768    uint64_t kvm_vmx_basic =
   2769        kvm_arch_get_supported_msr_feature(kvm_state,
   2770                                           MSR_IA32_VMX_BASIC);
   2771
   2772    if (!kvm_vmx_basic) {
   2773        /* If the kernel doesn't support VMX feature (kvm_intel.nested=0),
   2774         * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
   2775         */
   2776        return;
   2777    }
   2778
   2779    uint64_t kvm_vmx_misc =
   2780        kvm_arch_get_supported_msr_feature(kvm_state,
   2781                                           MSR_IA32_VMX_MISC);
   2782    uint64_t kvm_vmx_ept_vpid =
   2783        kvm_arch_get_supported_msr_feature(kvm_state,
   2784                                           MSR_IA32_VMX_EPT_VPID_CAP);
   2785
   2786    /*
   2787     * If the guest is 64-bit, a value of 1 is allowed for the host address
   2788     * space size vmexit control.
   2789     */
   2790    uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
   2791        ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
   2792
   2793    /*
   2794     * Bits 0-30, 32-44 and 50-53 come from the host.  KVM should
   2795     * not change them for backwards compatibility.
   2796     */
   2797    uint64_t fixed_vmx_basic = kvm_vmx_basic &
   2798        (MSR_VMX_BASIC_VMCS_REVISION_MASK |
   2799         MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
   2800         MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
   2801
   2802    /*
   2803     * Same for bits 0-4 and 25-27.  Bits 16-24 (CR3 target count) can
   2804     * change in the future but are always zero for now, clear them to be
   2805     * future proof.  Bits 32-63 in theory could change, though KVM does
   2806     * not support dual-monitor treatment and probably never will; mask
   2807     * them out as well.
   2808     */
   2809    uint64_t fixed_vmx_misc = kvm_vmx_misc &
   2810        (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
   2811         MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
   2812
   2813    /*
   2814     * EPT memory types should not change either, so we do not bother
   2815     * adding features for them.
   2816     */
   2817    uint64_t fixed_vmx_ept_mask =
   2818            (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
   2819             MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
   2820    uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
   2821
   2822    kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
   2823                      make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
   2824                                         f[FEAT_VMX_PROCBASED_CTLS]));
   2825    kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
   2826                      make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
   2827                                         f[FEAT_VMX_PINBASED_CTLS]));
   2828    kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
   2829                      make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
   2830                                         f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
   2831    kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
   2832                      make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
   2833                                         f[FEAT_VMX_ENTRY_CTLS]));
   2834    kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
   2835                      make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
   2836                                         f[FEAT_VMX_SECONDARY_CTLS]));
   2837    kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
   2838                      f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
   2839    kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
   2840                      f[FEAT_VMX_BASIC] | fixed_vmx_basic);
   2841    kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
   2842                      f[FEAT_VMX_MISC] | fixed_vmx_misc);
   2843    if (has_msr_vmx_vmfunc) {
   2844        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
   2845    }
   2846
   2847    /*
   2848     * Just to be safe, write these with constant values.  The CRn_FIXED1
   2849     * MSRs are generated by KVM based on the vCPU's CPUID.
   2850     */
   2851    kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
   2852                      CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
   2853    kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
   2854                      CR4_VMXE_MASK);
   2855
   2856    if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
   2857        /* TSC multiplier (0x2032).  */
   2858        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
   2859    } else {
   2860        /* Preemption timer (0x482E).  */
   2861        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E);
   2862    }
   2863}
   2864
   2865static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
   2866{
   2867    uint64_t kvm_perf_cap =
   2868        kvm_arch_get_supported_msr_feature(kvm_state,
   2869                                           MSR_IA32_PERF_CAPABILITIES);
   2870
   2871    if (kvm_perf_cap) {
   2872        kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
   2873                        kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
   2874    }
   2875}
   2876
   2877static int kvm_buf_set_msrs(X86CPU *cpu)
   2878{
   2879    int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
   2880    if (ret < 0) {
   2881        return ret;
   2882    }
   2883
   2884    if (ret < cpu->kvm_msr_buf->nmsrs) {
   2885        struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
   2886        error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
   2887                     (uint32_t)e->index, (uint64_t)e->data);
   2888    }
   2889
   2890    assert(ret == cpu->kvm_msr_buf->nmsrs);
   2891    return 0;
   2892}
   2893
   2894static void kvm_init_msrs(X86CPU *cpu)
   2895{
   2896    CPUX86State *env = &cpu->env;
   2897
   2898    kvm_msr_buf_reset(cpu);
   2899    if (has_msr_arch_capabs) {
   2900        kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
   2901                          env->features[FEAT_ARCH_CAPABILITIES]);
   2902    }
   2903
   2904    if (has_msr_core_capabs) {
   2905        kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
   2906                          env->features[FEAT_CORE_CAPABILITY]);
   2907    }
   2908
   2909    if (has_msr_perf_capabs && cpu->enable_pmu) {
   2910        kvm_msr_entry_add_perf(cpu, env->features);
   2911    }
   2912
   2913    if (has_msr_ucode_rev) {
   2914        kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
   2915    }
   2916
   2917    /*
   2918     * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
   2919     * all kernels with MSR features should have them.
   2920     */
   2921    if (kvm_feature_msrs && cpu_has_vmx(env)) {
   2922        kvm_msr_entry_add_vmx(cpu, env->features);
   2923    }
   2924
   2925    assert(kvm_buf_set_msrs(cpu) == 0);
   2926}
   2927
   2928static int kvm_put_msrs(X86CPU *cpu, int level)
   2929{
   2930    CPUX86State *env = &cpu->env;
   2931    int i;
   2932
   2933    kvm_msr_buf_reset(cpu);
   2934
   2935    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
   2936    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
   2937    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
   2938    kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
   2939    if (has_msr_star) {
   2940        kvm_msr_entry_add(cpu, MSR_STAR, env->star);
   2941    }
   2942    if (has_msr_hsave_pa) {
   2943        kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
   2944    }
   2945    if (has_msr_tsc_aux) {
   2946        kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
   2947    }
   2948    if (has_msr_tsc_adjust) {
   2949        kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
   2950    }
   2951    if (has_msr_misc_enable) {
   2952        kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
   2953                          env->msr_ia32_misc_enable);
   2954    }
   2955    if (has_msr_smbase) {
   2956        kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
   2957    }
   2958    if (has_msr_smi_count) {
   2959        kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
   2960    }
   2961    if (has_msr_pkrs) {
   2962        kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
   2963    }
   2964    if (has_msr_bndcfgs) {
   2965        kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
   2966    }
   2967    if (has_msr_xss) {
   2968        kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
   2969    }
   2970    if (has_msr_umwait) {
   2971        kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
   2972    }
   2973    if (has_msr_spec_ctrl) {
   2974        kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
   2975    }
   2976    if (has_msr_tsx_ctrl) {
   2977        kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
   2978    }
   2979    if (has_msr_virt_ssbd) {
   2980        kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
   2981    }
   2982
   2983#ifdef TARGET_X86_64
   2984    if (lm_capable_kernel) {
   2985        kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
   2986        kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
   2987        kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
   2988        kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
   2989    }
   2990#endif
   2991
   2992    /*
   2993     * The following MSRs have side effects on the guest or are too heavy
   2994     * for normal writeback. Limit them to reset or full state updates.
   2995     */
   2996    if (level >= KVM_PUT_RESET_STATE) {
   2997        kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
   2998        kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
   2999        kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
   3000        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
   3001            kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
   3002        }
   3003        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
   3004            kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
   3005        }
   3006        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
   3007            kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
   3008        }
   3009        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
   3010            kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
   3011        }
   3012
   3013        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
   3014            kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
   3015        }
   3016
   3017        if (has_architectural_pmu_version > 0) {
   3018            if (has_architectural_pmu_version > 1) {
   3019                /* Stop the counter.  */
   3020                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
   3021                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
   3022            }
   3023
   3024            /* Set the counter values.  */
   3025            for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
   3026                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
   3027                                  env->msr_fixed_counters[i]);
   3028            }
   3029            for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
   3030                kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
   3031                                  env->msr_gp_counters[i]);
   3032                kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
   3033                                  env->msr_gp_evtsel[i]);
   3034            }
   3035            if (has_architectural_pmu_version > 1) {
   3036                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
   3037                                  env->msr_global_status);
   3038                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
   3039                                  env->msr_global_ovf_ctrl);
   3040
   3041                /* Now start the PMU.  */
   3042                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
   3043                                  env->msr_fixed_ctr_ctrl);
   3044                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
   3045                                  env->msr_global_ctrl);
   3046            }
   3047        }
   3048        /*
   3049         * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
   3050         * only sync them to KVM on the first cpu.
   3051         */
   3052        if (current_cpu == first_cpu) {
   3053            if (has_msr_hv_hypercall) {
   3054                kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
   3055                                  env->msr_hv_guest_os_id);
   3056                kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
   3057                                  env->msr_hv_hypercall);
   3058            }
   3059            if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
   3060                kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
   3061                                  env->msr_hv_tsc);
   3062            }
   3063            if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
   3064                kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
   3065                                  env->msr_hv_reenlightenment_control);
   3066                kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
   3067                                  env->msr_hv_tsc_emulation_control);
   3068                kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
   3069                                  env->msr_hv_tsc_emulation_status);
   3070            }
   3071        }
   3072        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
   3073            kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
   3074                              env->msr_hv_vapic);
   3075        }
   3076        if (has_msr_hv_crash) {
   3077            int j;
   3078
   3079            for (j = 0; j < HV_CRASH_PARAMS; j++) {
   3080                kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
   3081                                  env->msr_hv_crash_params[j]);
                   }
   3082
   3083            kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
   3084        }
   3085        if (has_msr_hv_runtime) {
   3086            kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
   3087        }
   3088        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
   3089            && hv_vpindex_settable) {
   3090            kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
   3091                              hyperv_vp_index(CPU(cpu)));
   3092        }
   3093        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
   3094            int j;
   3095
   3096            kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
   3097
   3098            kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
   3099                              env->msr_hv_synic_control);
   3100            kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
   3101                              env->msr_hv_synic_evt_page);
   3102            kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
   3103                              env->msr_hv_synic_msg_page);
   3104
   3105            for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
   3106                kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
   3107                                  env->msr_hv_synic_sint[j]);
   3108            }
   3109        }
   3110        if (has_msr_hv_stimer) {
   3111            int j;
   3112
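                   /*
                    * The synthetic timer MSRs are interleaved: STIMER<n>_CONFIG
                    * and STIMER<n>_COUNT alternate in the MSR space, so timer
                    * j's registers sit at STIMER0_CONFIG + 2 * j and
                    * STIMER0_COUNT + 2 * j respectively.
                    */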
   3113            for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
   3114                kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
   3115                                env->msr_hv_stimer_config[j]);
   3116            }
   3117
   3118            for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
   3119                kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
   3120                                env->msr_hv_stimer_count[j]);
   3121            }
   3122        }
   3123        if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
   3124            uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
   3125
   3126            kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
   3127            kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
   3128            kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
   3129            kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
   3130            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
   3131            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
   3132            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
   3133            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
   3134            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
   3135            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
   3136            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
   3137            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
   3138            for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
   3139                /* The CPU GPs if we write to a bit above the physical limit of
   3140                 * the host CPU (and KVM emulates that)
   3141                 */
   3142                uint64_t mask = env->mtrr_var[i].mask;
   3143                mask &= phys_mask;
   3144
   3145                kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
   3146                                  env->mtrr_var[i].base);
   3147                kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
   3148            }
   3149        }
   3150        if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
   3151            int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
   3152                                                    0x14, 1, R_EAX) & 0x7;
   3153
   3154            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
   3155                            env->msr_rtit_ctrl);
   3156            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
   3157                            env->msr_rtit_status);
   3158            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
   3159                            env->msr_rtit_output_base);
   3160            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
   3161                            env->msr_rtit_output_mask);
   3162            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
   3163                            env->msr_rtit_cr3_match);
   3164            for (i = 0; i < addr_num; i++) {
   3165                kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
   3166                            env->msr_rtit_addrs[i]);
   3167            }
   3168        }
   3169
   3170        if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
   3171            kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0,
   3172                              env->msr_ia32_sgxlepubkeyhash[0]);
   3173            kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1,
   3174                              env->msr_ia32_sgxlepubkeyhash[1]);
   3175            kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2,
   3176                              env->msr_ia32_sgxlepubkeyhash[2]);
   3177            kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3,
   3178                              env->msr_ia32_sgxlepubkeyhash[3]);
   3179        }
   3180
   3181        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
   3182         *       kvm_put_msr_feature_control. */
   3183    }
   3184
   3185    if (env->mcg_cap) {
   3186        int i;
   3187
   3188        kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
   3189        kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
   3190        if (has_msr_mcg_ext_ctl) {
   3191            kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
   3192        }
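               /*
                * MCG_CAP[7:0] is the MCE bank count; each bank contributes four
                * MSRs (CTL, STATUS, ADDR, MISC) starting at MSR_MC0_CTL, hence
                * the "* 4" below.
                */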
   3193        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
   3194            kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
   3195        }
   3196    }
   3197
   3198    return kvm_buf_set_msrs(cpu);
   3199}
   3200
   3201
   3202static int kvm_get_fpu(X86CPU *cpu)
   3203{
   3204    CPUX86State *env = &cpu->env;
   3205    struct kvm_fpu fpu;
   3206    int i, ret;
   3207
   3208    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
   3209    if (ret < 0) {
   3210        return ret;
   3211    }
   3212
   3213    env->fpstt = (fpu.fsw >> 11) & 7;
   3214    env->fpus = fpu.fsw;
   3215    env->fpuc = fpu.fcw;
   3216    env->fpop = fpu.last_opcode;
   3217    env->fpip = fpu.last_ip;
   3218    env->fpdp = fpu.last_dp;
   3219    for (i = 0; i < 8; ++i) {
   3220        env->fptags[i] = !((fpu.ftwx >> i) & 1);
   3221    }
   3222    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
   3223    for (i = 0; i < CPU_NB_REGS; i++) {
   3224        env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]);
   3225        env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]);
   3226    }
   3227    env->mxcsr = fpu.mxcsr;
   3228
   3229    return 0;
   3230}
   3231
   3232static int kvm_get_xsave(X86CPU *cpu)
   3233{
   3234    CPUX86State *env = &cpu->env;
   3235    void *xsave = env->xsave_buf;
   3236    int ret;
   3237
   3238    if (!has_xsave) {
   3239        return kvm_get_fpu(cpu);
   3240    }
   3241
   3242    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
   3243    if (ret < 0) {
   3244        return ret;
   3245    }
   3246    x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len);
   3247
   3248    return 0;
   3249}
   3250
   3251static int kvm_get_xcrs(X86CPU *cpu)
   3252{
   3253    CPUX86State *env = &cpu->env;
   3254    int i, ret;
   3255    struct kvm_xcrs xcrs;
   3256
   3257    if (!has_xcrs) {
   3258        return 0;
   3259    }
   3260
   3261    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
   3262    if (ret < 0) {
   3263        return ret;
   3264    }
   3265
   3266    for (i = 0; i < xcrs.nr_xcrs; i++) {
   3267        /* Only support xcr0 now */
   3268        if (xcrs.xcrs[i].xcr == 0) {
   3269            env->xcr0 = xcrs.xcrs[i].value;
   3270            break;
   3271        }
   3272    }
   3273    return 0;
   3274}
   3275
   3276static int kvm_get_sregs(X86CPU *cpu)
   3277{
   3278    CPUX86State *env = &cpu->env;
   3279    struct kvm_sregs sregs;
   3280    int bit, i, ret;
   3281
   3282    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
   3283    if (ret < 0) {
   3284        return ret;
   3285    }
   3286
   3287    /* There can only be one pending IRQ set in the bitmap at a time, so try
   3288       to find it and save its number instead (-1 for none). */
   3289    env->interrupt_injected = -1;
   3290    for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
   3291        if (sregs.interrupt_bitmap[i]) {
   3292            bit = ctz64(sregs.interrupt_bitmap[i]);
   3293            env->interrupt_injected = i * 64 + bit;
   3294            break;
   3295        }
   3296    }
   3297
   3298    get_seg(&env->segs[R_CS], &sregs.cs);
   3299    get_seg(&env->segs[R_DS], &sregs.ds);
   3300    get_seg(&env->segs[R_ES], &sregs.es);
   3301    get_seg(&env->segs[R_FS], &sregs.fs);
   3302    get_seg(&env->segs[R_GS], &sregs.gs);
   3303    get_seg(&env->segs[R_SS], &sregs.ss);
   3304
   3305    get_seg(&env->tr, &sregs.tr);
   3306    get_seg(&env->ldt, &sregs.ldt);
   3307
   3308    env->idt.limit = sregs.idt.limit;
   3309    env->idt.base = sregs.idt.base;
   3310    env->gdt.limit = sregs.gdt.limit;
   3311    env->gdt.base = sregs.gdt.base;
   3312
   3313    env->cr[0] = sregs.cr0;
   3314    env->cr[2] = sregs.cr2;
   3315    env->cr[3] = sregs.cr3;
   3316    env->cr[4] = sregs.cr4;
   3317
   3318    env->efer = sregs.efer;
   3319
   3320    /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
   3321    x86_update_hflags(env);
   3322
   3323    return 0;
   3324}
   3325
   3326static int kvm_get_msrs(X86CPU *cpu)
   3327{
   3328    CPUX86State *env = &cpu->env;
   3329    struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
   3330    int ret, i;
   3331    uint64_t mtrr_top_bits;
   3332
   3333    kvm_msr_buf_reset(cpu);
   3334
   3335    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
   3336    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
   3337    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
   3338    kvm_msr_entry_add(cpu, MSR_PAT, 0);
   3339    if (has_msr_star) {
   3340        kvm_msr_entry_add(cpu, MSR_STAR, 0);
   3341    }
   3342    if (has_msr_hsave_pa) {
   3343        kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
   3344    }
   3345    if (has_msr_tsc_aux) {
   3346        kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
   3347    }
   3348    if (has_msr_tsc_adjust) {
   3349        kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
   3350    }
   3351    if (has_msr_tsc_deadline) {
   3352        kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
   3353    }
   3354    if (has_msr_misc_enable) {
   3355        kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
   3356    }
   3357    if (has_msr_smbase) {
   3358        kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
   3359    }
   3360    if (has_msr_smi_count) {
   3361        kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
   3362    }
   3363    if (has_msr_feature_control) {
   3364        kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
   3365    }
   3366    if (has_msr_pkrs) {
   3367        kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
   3368    }
   3369    if (has_msr_bndcfgs) {
   3370        kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
   3371    }
   3372    if (has_msr_xss) {
   3373        kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
   3374    }
   3375    if (has_msr_umwait) {
   3376        kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
   3377    }
   3378    if (has_msr_spec_ctrl) {
   3379        kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
   3380    }
   3381    if (has_msr_tsx_ctrl) {
   3382        kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
   3383    }
   3384    if (has_msr_virt_ssbd) {
   3385        kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
   3386    }
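           /*
            * While the VM is running, the TSC is re-read on every
            * synchronization; once it stops, tsc_valid latches so later syncs
            * reuse the cached value, presumably so a paused guest's saved TSC
            * stays stable.
            */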
   3387    if (!env->tsc_valid) {
   3388        kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
   3389        env->tsc_valid = !runstate_is_running();
   3390    }
   3391
   3392#ifdef TARGET_X86_64
   3393    if (lm_capable_kernel) {
   3394        kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
   3395        kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
   3396        kvm_msr_entry_add(cpu, MSR_FMASK, 0);
   3397        kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
   3398    }
   3399#endif
   3400    kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
   3401    kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
   3402    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
   3403        kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
   3404    }
   3405    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
   3406        kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
   3407    }
   3408    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
   3409        kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
   3410    }
   3411    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
   3412        kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
   3413    }
   3414    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
   3415        kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
   3416    }
   3417    if (has_architectural_pmu_version > 0) {
   3418        if (has_architectural_pmu_version > 1) {
   3419            kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
   3420            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
   3421            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
   3422            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
   3423        }
   3424        for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
   3425            kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
   3426        }
   3427        for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
   3428            kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
   3429            kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
   3430        }
   3431    }
   3432
   3433    if (env->mcg_cap) {
   3434        kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
   3435        kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
   3436        if (has_msr_mcg_ext_ctl) {
   3437            kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
   3438        }
   3439        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
   3440            kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
   3441        }
   3442    }
   3443
   3444    if (has_msr_hv_hypercall) {
   3445        kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
   3446        kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
   3447    }
   3448    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
   3449        kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
   3450    }
   3451    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
   3452        kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
   3453    }
   3454    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
   3455        kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
   3456        kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
   3457        kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
   3458    }
   3459    if (has_msr_hv_crash) {
   3460        int j;
   3461
   3462        for (j = 0; j < HV_CRASH_PARAMS; j++) {
   3463            kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
   3464        }
   3465    }
   3466    if (has_msr_hv_runtime) {
   3467        kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
   3468    }
   3469    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
   3470        uint32_t msr;
   3471
   3472        kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
   3473        kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
   3474        kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
   3475        for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
   3476            kvm_msr_entry_add(cpu, msr, 0);
   3477        }
   3478    }
   3479    if (has_msr_hv_stimer) {
   3480        uint32_t msr;
   3481
   3482        for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
   3483             msr++) {
   3484            kvm_msr_entry_add(cpu, msr, 0);
   3485        }
   3486    }
   3487    if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
   3488        kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
   3489        kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
   3490        kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
   3491        kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
   3492        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
   3493        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
   3494        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
   3495        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
   3496        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
   3497        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
   3498        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
   3499        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
   3500        for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
   3501            kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
   3502            kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
   3503        }
   3504    }
   3505
   3506    if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
   3507        int addr_num =
   3508            kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
   3509
   3510        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
   3511        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
   3512        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
   3513        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
   3514        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
   3515        for (i = 0; i < addr_num; i++) {
   3516            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
   3517        }
   3518    }
   3519
   3520    if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
   3521        kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0);
   3522        kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0);
   3523        kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0);
   3524        kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
   3525    }
   3526
   3527    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
   3528    if (ret < 0) {
   3529        return ret;
   3530    }
   3531
   3532    if (ret < cpu->kvm_msr_buf->nmsrs) {
   3533        struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
   3534        error_report("error: failed to get MSR 0x%" PRIx32,
   3535                     (uint32_t)e->index);
   3536    }
   3537
   3538    assert(ret == cpu->kvm_msr_buf->nmsrs);
   3539    /*
   3540     * MTRR masks: Each mask consists of 5 parts
   3541     * a  10..0: must be zero
   3542     * b  11   : valid bit
   3543     * c n-1.12: actual mask bits
   3544     * d  51..n: reserved must be zero
   3545     * e  63.52: reserved must be zero
   3546     *
   3547     * 'n' is the number of physical bits supported by the CPU and is
   3548     * apparently always <= 52.  We know our 'n' but don't know what
   3549     * the destination's 'n' is; it might be smaller, in which case
   3550     * it masks (c) on loading. It might be larger, in which case
   3551     * we fill 'd' so that d..c is consistent irrespective of the 'n'
   3552     * we're migrating to.
   3553     */
   3554
   3555    if (cpu->fill_mtrr_mask) {
   3556        QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
   3557        assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
   3558        mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
   3559    } else {
   3560        mtrr_top_bits = 0;
   3561    }
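           /*
            * Illustrative example (not part of the original source): with
            * cpu->phys_bits == 40, MAKE_64BIT_MASK(40, 12) sets bits 51..40,
            * i.e. mtrr_top_bits == 0x000fff0000000000ULL; OR-ing that into
            * each variable-MTRR mask below keeps region (d) filled whatever
            * the destination's physical address width turns out to be.
            */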
   3562
   3563    for (i = 0; i < ret; i++) {
   3564        uint32_t index = msrs[i].index;
   3565        switch (index) {
   3566        case MSR_IA32_SYSENTER_CS:
   3567            env->sysenter_cs = msrs[i].data;
   3568            break;
   3569        case MSR_IA32_SYSENTER_ESP:
   3570            env->sysenter_esp = msrs[i].data;
   3571            break;
   3572        case MSR_IA32_SYSENTER_EIP:
   3573            env->sysenter_eip = msrs[i].data;
   3574            break;
   3575        case MSR_PAT:
   3576            env->pat = msrs[i].data;
   3577            break;
   3578        case MSR_STAR:
   3579            env->star = msrs[i].data;
   3580            break;
   3581#ifdef TARGET_X86_64
   3582        case MSR_CSTAR:
   3583            env->cstar = msrs[i].data;
   3584            break;
   3585        case MSR_KERNELGSBASE:
   3586            env->kernelgsbase = msrs[i].data;
   3587            break;
   3588        case MSR_FMASK:
   3589            env->fmask = msrs[i].data;
   3590            break;
   3591        case MSR_LSTAR:
   3592            env->lstar = msrs[i].data;
   3593            break;
   3594#endif
   3595        case MSR_IA32_TSC:
   3596            env->tsc = msrs[i].data;
   3597            break;
   3598        case MSR_TSC_AUX:
   3599            env->tsc_aux = msrs[i].data;
   3600            break;
   3601        case MSR_TSC_ADJUST:
   3602            env->tsc_adjust = msrs[i].data;
   3603            break;
   3604        case MSR_IA32_TSCDEADLINE:
   3605            env->tsc_deadline = msrs[i].data;
   3606            break;
   3607        case MSR_VM_HSAVE_PA:
   3608            env->vm_hsave = msrs[i].data;
   3609            break;
   3610        case MSR_KVM_SYSTEM_TIME:
   3611            env->system_time_msr = msrs[i].data;
   3612            break;
   3613        case MSR_KVM_WALL_CLOCK:
   3614            env->wall_clock_msr = msrs[i].data;
   3615            break;
   3616        case MSR_MCG_STATUS:
   3617            env->mcg_status = msrs[i].data;
   3618            break;
   3619        case MSR_MCG_CTL:
   3620            env->mcg_ctl = msrs[i].data;
   3621            break;
   3622        case MSR_MCG_EXT_CTL:
   3623            env->mcg_ext_ctl = msrs[i].data;
   3624            break;
   3625        case MSR_IA32_MISC_ENABLE:
   3626            env->msr_ia32_misc_enable = msrs[i].data;
   3627            break;
   3628        case MSR_IA32_SMBASE:
   3629            env->smbase = msrs[i].data;
   3630            break;
   3631        case MSR_SMI_COUNT:
   3632            env->msr_smi_count = msrs[i].data;
   3633            break;
   3634        case MSR_IA32_FEATURE_CONTROL:
   3635            env->msr_ia32_feature_control = msrs[i].data;
   3636            break;
   3637        case MSR_IA32_BNDCFGS:
   3638            env->msr_bndcfgs = msrs[i].data;
   3639            break;
   3640        case MSR_IA32_XSS:
   3641            env->xss = msrs[i].data;
   3642            break;
   3643        case MSR_IA32_UMWAIT_CONTROL:
   3644            env->umwait = msrs[i].data;
   3645            break;
   3646        case MSR_IA32_PKRS:
   3647            env->pkrs = msrs[i].data;
   3648            break;
   3649        default:
   3650            if (msrs[i].index >= MSR_MC0_CTL &&
   3651                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
   3652                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
   3653            }
   3654            break;
   3655        case MSR_KVM_ASYNC_PF_EN:
   3656            env->async_pf_en_msr = msrs[i].data;
   3657            break;
   3658        case MSR_KVM_ASYNC_PF_INT:
   3659            env->async_pf_int_msr = msrs[i].data;
   3660            break;
   3661        case MSR_KVM_PV_EOI_EN:
   3662            env->pv_eoi_en_msr = msrs[i].data;
   3663            break;
   3664        case MSR_KVM_STEAL_TIME:
   3665            env->steal_time_msr = msrs[i].data;
   3666            break;
   3667        case MSR_KVM_POLL_CONTROL: {
   3668            env->poll_control_msr = msrs[i].data;
   3669            break;
   3670        }
   3671        case MSR_CORE_PERF_FIXED_CTR_CTRL:
   3672            env->msr_fixed_ctr_ctrl = msrs[i].data;
   3673            break;
   3674        case MSR_CORE_PERF_GLOBAL_CTRL:
   3675            env->msr_global_ctrl = msrs[i].data;
   3676            break;
   3677        case MSR_CORE_PERF_GLOBAL_STATUS:
   3678            env->msr_global_status = msrs[i].data;
   3679            break;
   3680        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
   3681            env->msr_global_ovf_ctrl = msrs[i].data;
   3682            break;
   3683        case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
   3684            env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
   3685            break;
   3686        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
   3687            env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
   3688            break;
   3689        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
   3690            env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
   3691            break;
   3692        case HV_X64_MSR_HYPERCALL:
   3693            env->msr_hv_hypercall = msrs[i].data;
   3694            break;
   3695        case HV_X64_MSR_GUEST_OS_ID:
   3696            env->msr_hv_guest_os_id = msrs[i].data;
   3697            break;
   3698        case HV_X64_MSR_APIC_ASSIST_PAGE:
   3699            env->msr_hv_vapic = msrs[i].data;
   3700            break;
   3701        case HV_X64_MSR_REFERENCE_TSC:
   3702            env->msr_hv_tsc = msrs[i].data;
   3703            break;
   3704        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
   3705            env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
   3706            break;
   3707        case HV_X64_MSR_VP_RUNTIME:
   3708            env->msr_hv_runtime = msrs[i].data;
   3709            break;
   3710        case HV_X64_MSR_SCONTROL:
   3711            env->msr_hv_synic_control = msrs[i].data;
   3712            break;
   3713        case HV_X64_MSR_SIEFP:
   3714            env->msr_hv_synic_evt_page = msrs[i].data;
   3715            break;
   3716        case HV_X64_MSR_SIMP:
   3717            env->msr_hv_synic_msg_page = msrs[i].data;
   3718            break;
   3719        case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
   3720            env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
   3721            break;
   3722        case HV_X64_MSR_STIMER0_CONFIG:
   3723        case HV_X64_MSR_STIMER1_CONFIG:
   3724        case HV_X64_MSR_STIMER2_CONFIG:
   3725        case HV_X64_MSR_STIMER3_CONFIG:
   3726            env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
   3727                                msrs[i].data;
   3728            break;
   3729        case HV_X64_MSR_STIMER0_COUNT:
   3730        case HV_X64_MSR_STIMER1_COUNT:
   3731        case HV_X64_MSR_STIMER2_COUNT:
   3732        case HV_X64_MSR_STIMER3_COUNT:
   3733            env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
   3734                                msrs[i].data;
   3735            break;
   3736        case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
   3737            env->msr_hv_reenlightenment_control = msrs[i].data;
   3738            break;
   3739        case HV_X64_MSR_TSC_EMULATION_CONTROL:
   3740            env->msr_hv_tsc_emulation_control = msrs[i].data;
   3741            break;
   3742        case HV_X64_MSR_TSC_EMULATION_STATUS:
   3743            env->msr_hv_tsc_emulation_status = msrs[i].data;
   3744            break;
   3745        case MSR_MTRRdefType:
   3746            env->mtrr_deftype = msrs[i].data;
   3747            break;
   3748        case MSR_MTRRfix64K_00000:
   3749            env->mtrr_fixed[0] = msrs[i].data;
   3750            break;
   3751        case MSR_MTRRfix16K_80000:
   3752            env->mtrr_fixed[1] = msrs[i].data;
   3753            break;
   3754        case MSR_MTRRfix16K_A0000:
   3755            env->mtrr_fixed[2] = msrs[i].data;
   3756            break;
   3757        case MSR_MTRRfix4K_C0000:
   3758            env->mtrr_fixed[3] = msrs[i].data;
   3759            break;
   3760        case MSR_MTRRfix4K_C8000:
   3761            env->mtrr_fixed[4] = msrs[i].data;
   3762            break;
   3763        case MSR_MTRRfix4K_D0000:
   3764            env->mtrr_fixed[5] = msrs[i].data;
   3765            break;
   3766        case MSR_MTRRfix4K_D8000:
   3767            env->mtrr_fixed[6] = msrs[i].data;
   3768            break;
   3769        case MSR_MTRRfix4K_E0000:
   3770            env->mtrr_fixed[7] = msrs[i].data;
   3771            break;
   3772        case MSR_MTRRfix4K_E8000:
   3773            env->mtrr_fixed[8] = msrs[i].data;
   3774            break;
   3775        case MSR_MTRRfix4K_F0000:
   3776            env->mtrr_fixed[9] = msrs[i].data;
   3777            break;
   3778        case MSR_MTRRfix4K_F8000:
   3779            env->mtrr_fixed[10] = msrs[i].data;
   3780            break;
   3781        case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
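                   /*
                    * Variable-range MTRRs are base/mask pairs: physBase(i)
                    * sits at an even MSR offset and physMask(i) at the odd
                    * offset just above it, so bit 0 of the index tells the
                    * two apart.
                    */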
   3782            if (index & 1) {
   3783                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
   3784                                                               mtrr_top_bits;
   3785            } else {
   3786                env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
   3787            }
   3788            break;
   3789        case MSR_IA32_SPEC_CTRL:
   3790            env->spec_ctrl = msrs[i].data;
   3791            break;
   3792        case MSR_IA32_TSX_CTRL:
   3793            env->tsx_ctrl = msrs[i].data;
   3794            break;
   3795        case MSR_VIRT_SSBD:
   3796            env->virt_ssbd = msrs[i].data;
   3797            break;
   3798        case MSR_IA32_RTIT_CTL:
   3799            env->msr_rtit_ctrl = msrs[i].data;
   3800            break;
   3801        case MSR_IA32_RTIT_STATUS:
   3802            env->msr_rtit_status = msrs[i].data;
   3803            break;
   3804        case MSR_IA32_RTIT_OUTPUT_BASE:
   3805            env->msr_rtit_output_base = msrs[i].data;
   3806            break;
   3807        case MSR_IA32_RTIT_OUTPUT_MASK:
   3808            env->msr_rtit_output_mask = msrs[i].data;
   3809            break;
   3810        case MSR_IA32_RTIT_CR3_MATCH:
   3811            env->msr_rtit_cr3_match = msrs[i].data;
   3812            break;
   3813        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
   3814            env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
   3815            break;
   3816        case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
   3817            env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
   3818                           msrs[i].data;
   3819            break;
   3820        }
   3821    }
   3822
   3823    return 0;
   3824}
   3825
   3826static int kvm_put_mp_state(X86CPU *cpu)
   3827{
   3828    struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
   3829
   3830    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
   3831}
   3832
   3833static int kvm_get_mp_state(X86CPU *cpu)
   3834{
   3835    CPUState *cs = CPU(cpu);
   3836    CPUX86State *env = &cpu->env;
   3837    struct kvm_mp_state mp_state;
   3838    int ret;
   3839
   3840    ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
   3841    if (ret < 0) {
   3842        return ret;
   3843    }
   3844    env->mp_state = mp_state.mp_state;
   3845    if (kvm_irqchip_in_kernel()) {
   3846        cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
   3847    }
   3848    return 0;
   3849}
   3850
   3851static int kvm_get_apic(X86CPU *cpu)
   3852{
   3853    DeviceState *apic = cpu->apic_state;
   3854    struct kvm_lapic_state kapic;
   3855    int ret;
   3856
   3857    if (apic && kvm_irqchip_in_kernel()) {
   3858        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
   3859        if (ret < 0) {
   3860            return ret;
   3861        }
   3862
   3863        kvm_get_apic_state(apic, &kapic);
   3864    }
   3865    return 0;
   3866}
   3867
   3868static int kvm_put_vcpu_events(X86CPU *cpu, int level)
   3869{
   3870    CPUState *cs = CPU(cpu);
   3871    CPUX86State *env = &cpu->env;
   3872    struct kvm_vcpu_events events = {};
   3873
   3874    if (!kvm_has_vcpu_events()) {
   3875        return 0;
   3876    }
   3877
   3878    events.flags = 0;
   3879
   3880    if (has_exception_payload) {
   3881        events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
   3882        events.exception.pending = env->exception_pending;
   3883        events.exception_has_payload = env->exception_has_payload;
   3884        events.exception_payload = env->exception_payload;
   3885    }
   3886    events.exception.nr = env->exception_nr;
   3887    events.exception.injected = env->exception_injected;
   3888    events.exception.has_error_code = env->has_error_code;
   3889    events.exception.error_code = env->error_code;
   3890
   3891    events.interrupt.injected = (env->interrupt_injected >= 0);
   3892    events.interrupt.nr = env->interrupt_injected;
   3893    events.interrupt.soft = env->soft_interrupt;
   3894
   3895    events.nmi.injected = env->nmi_injected;
   3896    events.nmi.pending = env->nmi_pending;
   3897    events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
   3898
   3899    events.sipi_vector = env->sipi_vector;
   3900
   3901    if (has_msr_smbase) {
   3902        events.smi.smm = !!(env->hflags & HF_SMM_MASK);
   3903        events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
   3904        if (kvm_irqchip_in_kernel()) {
   3905            /* As soon as these are moved to the kernel, remove them
   3906             * from cs->interrupt_request.
   3907             */
   3908            events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
   3909            events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
   3910            cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
   3911        } else {
   3912            /* Keep these in cs->interrupt_request.  */
   3913            events.smi.pending = 0;
   3914            events.smi.latched_init = 0;
   3915        }
   3916        /* Stop SMI delivery on old machine types to avoid a reboot
   3917         * on an incoming migration of an old VM.
   3918         */
   3919        if (!cpu->kvm_no_smi_migration) {
   3920            events.flags |= KVM_VCPUEVENT_VALID_SMM;
   3921        }
   3922    }
   3923
   3924    if (level >= KVM_PUT_RESET_STATE) {
   3925        events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
   3926        if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
   3927            events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
   3928        }
   3929    }
   3930
   3931    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
   3932}
   3933
   3934static int kvm_get_vcpu_events(X86CPU *cpu)
   3935{
   3936    CPUX86State *env = &cpu->env;
   3937    struct kvm_vcpu_events events;
   3938    int ret;
   3939
   3940    if (!kvm_has_vcpu_events()) {
   3941        return 0;
   3942    }
   3943
   3944    memset(&events, 0, sizeof(events));
   3945    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
   3946    if (ret < 0) {
   3947        return ret;
   3948    }
   3949
   3950    if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
   3951        env->exception_pending = events.exception.pending;
   3952        env->exception_has_payload = events.exception_has_payload;
   3953        env->exception_payload = events.exception_payload;
   3954    } else {
   3955        env->exception_pending = 0;
   3956        env->exception_has_payload = false;
   3957    }
   3958    env->exception_injected = events.exception.injected;
   3959    env->exception_nr =
   3960        (env->exception_pending || env->exception_injected) ?
   3961        events.exception.nr : -1;
   3962    env->has_error_code = events.exception.has_error_code;
   3963    env->error_code = events.exception.error_code;
   3964
   3965    env->interrupt_injected =
   3966        events.interrupt.injected ? events.interrupt.nr : -1;
   3967    env->soft_interrupt = events.interrupt.soft;
   3968
   3969    env->nmi_injected = events.nmi.injected;
   3970    env->nmi_pending = events.nmi.pending;
   3971    if (events.nmi.masked) {
   3972        env->hflags2 |= HF2_NMI_MASK;
   3973    } else {
   3974        env->hflags2 &= ~HF2_NMI_MASK;
   3975    }
   3976
   3977    if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
   3978        if (events.smi.smm) {
   3979            env->hflags |= HF_SMM_MASK;
   3980        } else {
   3981            env->hflags &= ~HF_SMM_MASK;
   3982        }
   3983        if (events.smi.pending) {
   3984            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
   3985        } else {
   3986            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
   3987        }
   3988        if (events.smi.smm_inside_nmi) {
   3989            env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
   3990        } else {
   3991            env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
   3992        }
   3993        if (events.smi.latched_init) {
   3994            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
   3995        } else {
   3996            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
   3997        }
   3998    }
   3999
   4000    env->sipi_vector = events.sipi_vector;
   4001
   4002    return 0;
   4003}
   4004
   4005static int kvm_guest_debug_workarounds(X86CPU *cpu)
   4006{
   4007    CPUState *cs = CPU(cpu);
   4008    CPUX86State *env = &cpu->env;
   4009    int ret = 0;
   4010    unsigned long reinject_trap = 0;
   4011
   4012    if (!kvm_has_vcpu_events()) {
   4013        if (env->exception_nr == EXCP01_DB) {
   4014            reinject_trap = KVM_GUESTDBG_INJECT_DB;
   4015        } else if (env->exception_injected == EXCP03_INT3) {
   4016            reinject_trap = KVM_GUESTDBG_INJECT_BP;
   4017        }
   4018        kvm_reset_exception(env);
   4019    }
   4020
   4021    /*
   4022     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
   4023     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
   4024     * by updating the debug state once again if single-stepping is on.
   4025     * Another reason to call kvm_update_guest_debug here is a pending debug
   4026     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
   4027     * reinject them via SET_GUEST_DEBUG.
   4028     */
   4029    if (reinject_trap ||
   4030        (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
   4031        ret = kvm_update_guest_debug(cs, reinject_trap);
   4032    }
   4033    return ret;
   4034}
   4035
   4036static int kvm_put_debugregs(X86CPU *cpu)
   4037{
   4038    CPUX86State *env = &cpu->env;
   4039    struct kvm_debugregs dbgregs;
   4040    int i;
   4041
   4042    if (!kvm_has_debugregs()) {
   4043        return 0;
   4044    }
   4045
   4046    memset(&dbgregs, 0, sizeof(dbgregs));
   4047    for (i = 0; i < 4; i++) {
   4048        dbgregs.db[i] = env->dr[i];
   4049    }
   4050    dbgregs.dr6 = env->dr[6];
   4051    dbgregs.dr7 = env->dr[7];
   4052    dbgregs.flags = 0;
   4053
   4054    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
   4055}
   4056
   4057static int kvm_get_debugregs(X86CPU *cpu)
   4058{
   4059    CPUX86State *env = &cpu->env;
   4060    struct kvm_debugregs dbgregs;
   4061    int i, ret;
   4062
   4063    if (!kvm_has_debugregs()) {
   4064        return 0;
   4065    }
   4066
   4067    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
   4068    if (ret < 0) {
   4069        return ret;
   4070    }
   4071    for (i = 0; i < 4; i++) {
   4072        env->dr[i] = dbgregs.db[i];
   4073    }
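           /*
            * DR4 and DR5 are architectural aliases of DR6 and DR7 (when
            * CR4.DE is clear), so keep the aliased slots in sync.
            */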
   4074    env->dr[4] = env->dr[6] = dbgregs.dr6;
   4075    env->dr[5] = env->dr[7] = dbgregs.dr7;
   4076
   4077    return 0;
   4078}
   4079
   4080static int kvm_put_nested_state(X86CPU *cpu)
   4081{
   4082    CPUX86State *env = &cpu->env;
   4083    int max_nested_state_len = kvm_max_nested_state_length();
   4084
   4085    if (!env->nested_state) {
   4086        return 0;
   4087    }
   4088
   4089    /*
   4090     * Copy flags that are affected by reset from env->hflags and env->hflags2.
   4091     */
   4092    if (env->hflags & HF_GUEST_MASK) {
   4093        env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
   4094    } else {
   4095        env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
   4096    }
   4097
   4098    /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
   4099    if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
   4100        env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
   4101    } else {
   4102        env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
   4103    }
   4104
   4105    assert(env->nested_state->size <= max_nested_state_len);
   4106    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
   4107}
   4108
   4109static int kvm_get_nested_state(X86CPU *cpu)
   4110{
   4111    CPUX86State *env = &cpu->env;
   4112    int max_nested_state_len = kvm_max_nested_state_length();
   4113    int ret;
   4114
   4115    if (!env->nested_state) {
   4116        return 0;
   4117    }
   4118
   4119    /*
   4120     * It is possible that migration restored a smaller size into
   4121     * nested_state->hdr.size than what our kernel supports.
   4122     * We preserve the migration origin's nested_state->hdr.size for the
   4123     * call to KVM_SET_NESTED_STATE, but want our next call to
   4124     * KVM_GET_NESTED_STATE to use the maximum size our kernel supports.
   4125     */
   4126    env->nested_state->size = max_nested_state_len;
   4127
   4128    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
   4129    if (ret < 0) {
   4130        return ret;
   4131    }
   4132
   4133    /*
   4134     * Copy flags that are affected by reset to env->hflags and env->hflags2.
   4135     */
   4136    if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
   4137        env->hflags |= HF_GUEST_MASK;
   4138    } else {
   4139        env->hflags &= ~HF_GUEST_MASK;
   4140    }
   4141
   4142    /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
   4143    if (cpu_has_svm(env)) {
   4144        if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
   4145            env->hflags2 |= HF2_GIF_MASK;
   4146        } else {
   4147            env->hflags2 &= ~HF2_GIF_MASK;
   4148        }
   4149    }
   4150
   4151    return ret;
   4152}
   4153
   4154int kvm_arch_put_registers(CPUState *cpu, int level)
   4155{
   4156    X86CPU *x86_cpu = X86_CPU(cpu);
   4157    int ret;
   4158
   4159    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
   4160
   4161    /* must be before kvm_put_nested_state so that EFER.SVME is set */
   4162    ret = kvm_put_sregs(x86_cpu);
   4163    if (ret < 0) {
   4164        return ret;
   4165    }
   4166
   4167    if (level >= KVM_PUT_RESET_STATE) {
   4168        ret = kvm_put_nested_state(x86_cpu);
   4169        if (ret < 0) {
   4170            return ret;
   4171        }
   4172
   4173        ret = kvm_put_msr_feature_control(x86_cpu);
   4174        if (ret < 0) {
   4175            return ret;
   4176        }
   4177    }
   4178
   4179    if (level == KVM_PUT_FULL_STATE) {
   4180        /* We don't check for kvm_arch_set_tsc_khz() errors here,
   4181         * because TSC frequency mismatch shouldn't abort migration,
   4182         * unless the user explicitly asked for a more strict TSC
   4183         * setting (e.g. using an explicit "tsc-freq" option).
   4184         */
   4185        kvm_arch_set_tsc_khz(cpu);
   4186    }
   4187
   4188    ret = kvm_getput_regs(x86_cpu, 1);
   4189    if (ret < 0) {
   4190        return ret;
   4191    }
   4192    ret = kvm_put_xsave(x86_cpu);
   4193    if (ret < 0) {
   4194        return ret;
   4195    }
   4196    ret = kvm_put_xcrs(x86_cpu);
   4197    if (ret < 0) {
   4198        return ret;
   4199    }
   4200    /* must be before kvm_put_msrs */
   4201    ret = kvm_inject_mce_oldstyle(x86_cpu);
   4202    if (ret < 0) {
   4203        return ret;
   4204    }
   4205    ret = kvm_put_msrs(x86_cpu, level);
   4206    if (ret < 0) {
   4207        return ret;
   4208    }
   4209    ret = kvm_put_vcpu_events(x86_cpu, level);
   4210    if (ret < 0) {
   4211        return ret;
   4212    }
   4213    if (level >= KVM_PUT_RESET_STATE) {
   4214        ret = kvm_put_mp_state(x86_cpu);
   4215        if (ret < 0) {
   4216            return ret;
   4217        }
   4218    }
   4219
   4220    ret = kvm_put_tscdeadline_msr(x86_cpu);
   4221    if (ret < 0) {
   4222        return ret;
   4223    }
   4224    ret = kvm_put_debugregs(x86_cpu);
   4225    if (ret < 0) {
   4226        return ret;
   4227    }
   4228    /* must be last */
   4229    ret = kvm_guest_debug_workarounds(x86_cpu);
   4230    if (ret < 0) {
   4231        return ret;
   4232    }
   4233    return 0;
   4234}
   4235
   4236int kvm_arch_get_registers(CPUState *cs)
   4237{
   4238    X86CPU *cpu = X86_CPU(cs);
   4239    int ret;
   4240
   4241    assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
   4242
   4243    ret = kvm_get_vcpu_events(cpu);
   4244    if (ret < 0) {
   4245        goto out;
   4246    }
   4247    /*
   4248     * KVM_GET_MPSTATE can modify CS and RIP, call it before
   4249     * KVM_GET_REGS and KVM_GET_SREGS.
   4250     */
   4251    ret = kvm_get_mp_state(cpu);
   4252    if (ret < 0) {
   4253        goto out;
   4254    }
   4255    ret = kvm_getput_regs(cpu, 0);
   4256    if (ret < 0) {
   4257        goto out;
   4258    }
   4259    ret = kvm_get_xsave(cpu);
   4260    if (ret < 0) {
   4261        goto out;
   4262    }
   4263    ret = kvm_get_xcrs(cpu);
   4264    if (ret < 0) {
   4265        goto out;
   4266    }
   4267    ret = kvm_get_sregs(cpu);
   4268    if (ret < 0) {
   4269        goto out;
   4270    }
   4271    ret = kvm_get_msrs(cpu);
   4272    if (ret < 0) {
   4273        goto out;
   4274    }
   4275    ret = kvm_get_apic(cpu);
   4276    if (ret < 0) {
   4277        goto out;
   4278    }
   4279    ret = kvm_get_debugregs(cpu);
   4280    if (ret < 0) {
   4281        goto out;
   4282    }
   4283    ret = kvm_get_nested_state(cpu);
   4284    if (ret < 0) {
   4285        goto out;
   4286    }
   4287    ret = 0;
   4288 out:
   4289    cpu_sync_bndcs_hflags(&cpu->env);
   4290    return ret;
   4291}
   4292
   4293void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
   4294{
   4295    X86CPU *x86_cpu = X86_CPU(cpu);
   4296    CPUX86State *env = &x86_cpu->env;
   4297    int ret;
   4298
   4299    /* Inject NMI */
   4300    if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
   4301        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
   4302            qemu_mutex_lock_iothread();
   4303            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
   4304            qemu_mutex_unlock_iothread();
   4305            DPRINTF("injected NMI\n");
   4306            ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
   4307            if (ret < 0) {
   4308                fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
   4309                        strerror(-ret));
   4310            }
   4311        }
   4312        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
   4313            qemu_mutex_lock_iothread();
   4314            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
   4315            qemu_mutex_unlock_iothread();
   4316            DPRINTF("injected SMI\n");
   4317            ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
   4318            if (ret < 0) {
   4319                fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
   4320                        strerror(-ret));
   4321            }
   4322        }
   4323    }
   4324
   4325    if (!kvm_pic_in_kernel()) {
   4326        qemu_mutex_lock_iothread();
   4327    }
   4328
   4329    /* Force the VCPU out of its inner loop to process any INIT requests
   4330     * or (for userspace APIC, but it is cheap to combine the checks here)
   4331     * pending TPR access reports.
   4332     */
   4333    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
   4334        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
   4335            !(env->hflags & HF_SMM_MASK)) {
   4336            cpu->exit_request = 1;
   4337        }
   4338        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
   4339            cpu->exit_request = 1;
   4340        }
   4341    }
   4342
   4343    if (!kvm_pic_in_kernel()) {
   4344        /* Try to inject an interrupt if the guest can accept it */
   4345        if (run->ready_for_interrupt_injection &&
   4346            (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
   4347            (env->eflags & IF_MASK)) {
   4348            int irq;
   4349
   4350            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
   4351            irq = cpu_get_pic_interrupt(env);
   4352            if (irq >= 0) {
   4353                struct kvm_interrupt intr;
   4354
   4355                intr.irq = irq;
   4356                DPRINTF("injected interrupt %d\n", irq);
   4357                ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
   4358                if (ret < 0) {
   4359                    fprintf(stderr,
   4360                            "KVM: injection failed, interrupt lost (%s)\n",
   4361                            strerror(-ret));
   4362                }
   4363            }
   4364        }
   4365
    4366        /* If we have an interrupt pending but the guest is not yet ready to
    4367         * receive it, request an interrupt window exit.  This will cause a
    4368         * return to userspace as soon as the guest is ready to receive
    4369         * interrupts. */
   4370        if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
   4371            run->request_interrupt_window = 1;
   4372        } else {
   4373            run->request_interrupt_window = 0;
   4374        }
   4375
   4376        DPRINTF("setting tpr\n");
   4377        run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
   4378
   4379        qemu_mutex_unlock_iothread();
   4380    }
   4381}
   4382
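        /* Throttle the vCPU after a bus-lock exit by sleeping for the delay
         * computed from the configured rate limit. */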
   4383static void kvm_rate_limit_on_bus_lock(void)
   4384{
   4385    uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1);
   4386
   4387    if (delay_ns) {
   4388        g_usleep(delay_ns / SCALE_US);
   4389    }
   4390}
   4391
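        /*
         * Runs right after KVM_RUN returns: mirror the SMM and IF state that
         * the kernel reported back into env, rate-limit bus-lock exits, and
         * propagate CR8/APIC base to the (possibly userspace) APIC.
         */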
   4392MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
   4393{
   4394    X86CPU *x86_cpu = X86_CPU(cpu);
   4395    CPUX86State *env = &x86_cpu->env;
   4396
   4397    if (run->flags & KVM_RUN_X86_SMM) {
   4398        env->hflags |= HF_SMM_MASK;
   4399    } else {
   4400        env->hflags &= ~HF_SMM_MASK;
   4401    }
   4402    if (run->if_flag) {
   4403        env->eflags |= IF_MASK;
   4404    } else {
   4405        env->eflags &= ~IF_MASK;
   4406    }
   4407    if (run->flags & KVM_RUN_X86_BUS_LOCK) {
   4408        kvm_rate_limit_on_bus_lock();
   4409    }
   4410
   4411    /* We need to protect the apic state against concurrent accesses from
   4412     * different threads in case the userspace irqchip is used. */
   4413    if (!kvm_irqchip_in_kernel()) {
   4414        qemu_mutex_lock_iothread();
   4415    }
   4416    cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
   4417    cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
   4418    if (!kvm_irqchip_in_kernel()) {
   4419        qemu_mutex_unlock_iothread();
   4420    }
   4421    return cpu_get_mem_attrs(env);
   4422}
   4423
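        /*
         * Handle interrupt_request bits that must be processed outside of
         * KVM_RUN: MCE and INIT are handled always; POLL, HARD, NMI, SIPI and
         * TPR only when the irqchip is emulated in userspace.  Returns
         * cs->halted; a non-zero value keeps the vCPU halted instead of
         * entering KVM_RUN.
         */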
   4424int kvm_arch_process_async_events(CPUState *cs)
   4425{
   4426    X86CPU *cpu = X86_CPU(cs);
   4427    CPUX86State *env = &cpu->env;
   4428
   4429    if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
   4430        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
   4431        assert(env->mcg_cap);
   4432
   4433        cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
   4434
   4435        kvm_cpu_synchronize_state(cs);
   4436
   4437        if (env->exception_nr == EXCP08_DBLE) {
   4438            /* this means triple fault */
   4439            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
   4440            cs->exit_request = 1;
   4441            return 0;
   4442        }
   4443        kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
   4444        env->has_error_code = 0;
   4445
   4446        cs->halted = 0;
   4447        if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
   4448            env->mp_state = KVM_MP_STATE_RUNNABLE;
   4449        }
   4450    }
   4451
   4452    if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
   4453        !(env->hflags & HF_SMM_MASK)) {
   4454        kvm_cpu_synchronize_state(cs);
   4455        do_cpu_init(cpu);
   4456    }
   4457
   4458    if (kvm_irqchip_in_kernel()) {
   4459        return 0;
   4460    }
   4461
   4462    if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
   4463        cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
   4464        apic_poll_irq(cpu->apic_state);
   4465    }
   4466    if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
   4467         (env->eflags & IF_MASK)) ||
   4468        (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
   4469        cs->halted = 0;
   4470    }
   4471    if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
   4472        kvm_cpu_synchronize_state(cs);
   4473        do_cpu_sipi(cpu);
   4474    }
   4475    if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
   4476        cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
   4477        kvm_cpu_synchronize_state(cs);
   4478        apic_handle_tpr_access_report(cpu->apic_state, env->eip,
   4479                                      env->tpr_access_type);
   4480    }
   4481
   4482    return cs->halted;
   4483}
   4484
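        /* KVM_EXIT_HLT handler: keep the vCPU halted unless an unmasked
         * external interrupt or an NMI is pending. */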
   4485static int kvm_handle_halt(X86CPU *cpu)
   4486{
   4487    CPUState *cs = CPU(cpu);
   4488    CPUX86State *env = &cpu->env;
   4489
   4490    if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
   4491          (env->eflags & IF_MASK)) &&
   4492        !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
   4493        cs->halted = 1;
   4494        return EXCP_HLT;
   4495    }
   4496
   4497    return 0;
   4498}
   4499
   4500static int kvm_handle_tpr_access(X86CPU *cpu)
   4501{
   4502    CPUState *cs = CPU(cpu);
   4503    struct kvm_run *run = cs->kvm_run;
   4504
   4505    apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
   4506                                  run->tpr_access.is_write ? TPR_ACCESS_WRITE
   4507                                                           : TPR_ACCESS_READ);
   4508    return 1;
   4509}
   4510
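        /*
         * Software breakpoints for the gdbstub: save the original byte at
         * bp->pc and patch in an int3 (0xcc).  Removal only restores the
         * saved byte if the int3 is still in place.
         */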
   4511int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
   4512{
   4513    static const uint8_t int3 = 0xcc;
   4514
   4515    if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
   4516        cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
   4517        return -EINVAL;
   4518    }
   4519    return 0;
   4520}
   4521
   4522int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
   4523{
   4524    uint8_t int3;
   4525
   4526    if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) {
   4527        return -EINVAL;
   4528    }
   4529    if (int3 != 0xcc) {
   4530        return 0;
   4531    }
   4532    if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
   4533        return -EINVAL;
   4534    }
   4535    return 0;
   4536}
   4537
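        /* x86 provides four hardware debug address registers (DR0-DR3). */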
   4538static struct {
   4539    target_ulong addr;
   4540    int len;
   4541    int type;
   4542} hw_breakpoint[4];
   4543
   4544static int nb_hw_breakpoint;
   4545
   4546static int find_hw_breakpoint(target_ulong addr, int len, int type)
   4547{
   4548    int n;
   4549
   4550    for (n = 0; n < nb_hw_breakpoint; n++) {
   4551        if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
   4552            (hw_breakpoint[n].len == len || len == -1)) {
   4553            return n;
   4554        }
   4555    }
   4556    return -1;
   4557}
   4558
   4559int kvm_arch_insert_hw_breakpoint(target_ulong addr,
   4560                                  target_ulong len, int type)
   4561{
   4562    switch (type) {
   4563    case GDB_BREAKPOINT_HW:
   4564        len = 1;
   4565        break;
   4566    case GDB_WATCHPOINT_WRITE:
   4567    case GDB_WATCHPOINT_ACCESS:
   4568        switch (len) {
   4569        case 1:
   4570            break;
   4571        case 2:
   4572        case 4:
   4573        case 8:
   4574            if (addr & (len - 1)) {
   4575                return -EINVAL;
   4576            }
   4577            break;
   4578        default:
   4579            return -EINVAL;
   4580        }
   4581        break;
   4582    default:
   4583        return -ENOSYS;
   4584    }
   4585
   4586    if (nb_hw_breakpoint == 4) {
   4587        return -ENOBUFS;
   4588    }
   4589    if (find_hw_breakpoint(addr, len, type) >= 0) {
   4590        return -EEXIST;
   4591    }
   4592    hw_breakpoint[nb_hw_breakpoint].addr = addr;
   4593    hw_breakpoint[nb_hw_breakpoint].len = len;
   4594    hw_breakpoint[nb_hw_breakpoint].type = type;
   4595    nb_hw_breakpoint++;
   4596
   4597    return 0;
   4598}
   4599
   4600int kvm_arch_remove_hw_breakpoint(target_ulong addr,
   4601                                  target_ulong len, int type)
   4602{
   4603    int n;
   4604
   4605    n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
   4606    if (n < 0) {
   4607        return -ENOENT;
   4608    }
   4609    nb_hw_breakpoint--;
   4610    hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
   4611
   4612    return 0;
   4613}
   4614
   4615void kvm_arch_remove_all_hw_breakpoints(void)
   4616{
   4617    nb_hw_breakpoint = 0;
   4618}
   4619
   4620static CPUWatchpoint hw_watchpoint;
   4621
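        /*
         * KVM_EXIT_DEBUG handler: decode DR6/DR7 to decide whether the debug
         * exception was caused by the gdbstub (single-step, hardware
         * breakpoint/watchpoint or software breakpoint) and should be
         * reported to the debugger, or must be re-injected into the guest.
         */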
   4622static int kvm_handle_debug(X86CPU *cpu,
   4623                            struct kvm_debug_exit_arch *arch_info)
   4624{
   4625    CPUState *cs = CPU(cpu);
   4626    CPUX86State *env = &cpu->env;
   4627    int ret = 0;
   4628    int n;
   4629
   4630    if (arch_info->exception == EXCP01_DB) {
   4631        if (arch_info->dr6 & DR6_BS) {
   4632            if (cs->singlestep_enabled) {
   4633                ret = EXCP_DEBUG;
   4634            }
   4635        } else {
   4636            for (n = 0; n < 4; n++) {
   4637                if (arch_info->dr6 & (1 << n)) {
   4638                    switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
   4639                    case 0x0:
   4640                        ret = EXCP_DEBUG;
   4641                        break;
   4642                    case 0x1:
   4643                        ret = EXCP_DEBUG;
   4644                        cs->watchpoint_hit = &hw_watchpoint;
   4645                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
   4646                        hw_watchpoint.flags = BP_MEM_WRITE;
   4647                        break;
   4648                    case 0x3:
   4649                        ret = EXCP_DEBUG;
   4650                        cs->watchpoint_hit = &hw_watchpoint;
   4651                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
   4652                        hw_watchpoint.flags = BP_MEM_ACCESS;
   4653                        break;
   4654                    }
   4655                }
   4656            }
   4657        }
   4658    } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
   4659        ret = EXCP_DEBUG;
   4660    }
   4661    if (ret == 0) {
   4662        cpu_synchronize_state(cs);
   4663        assert(env->exception_nr == -1);
   4664
   4665        /* pass to guest */
   4666        kvm_queue_exception(env, arch_info->exception,
   4667                            arch_info->exception == EXCP01_DB,
   4668                            arch_info->dr6);
   4669        env->has_error_code = 0;
   4670    }
   4671
   4672    return ret;
   4673}
   4674
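        /*
         * Fill in the KVM_SET_GUEST_DEBUG control state: enable software
         * breakpoint trapping if any are active and encode each hardware
         * breakpoint's address, type and length into the DR7 image handed
         * to the kernel.
         */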
   4675void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
   4676{
   4677    const uint8_t type_code[] = {
   4678        [GDB_BREAKPOINT_HW] = 0x0,
   4679        [GDB_WATCHPOINT_WRITE] = 0x1,
   4680        [GDB_WATCHPOINT_ACCESS] = 0x3
   4681    };
   4682    const uint8_t len_code[] = {
   4683        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
   4684    };
   4685    int n;
   4686
   4687    if (kvm_sw_breakpoints_active(cpu)) {
   4688        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
   4689    }
   4690    if (nb_hw_breakpoint > 0) {
   4691        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
   4692        dbg->arch.debugreg[7] = 0x0600;
   4693        for (n = 0; n < nb_hw_breakpoint; n++) {
   4694            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
   4695            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
   4696                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
   4697                ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
   4698        }
   4699    }
   4700}
   4701
   4702static bool has_sgx_provisioning;
   4703
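        /*
         * Grant the guest access to the SGX PROVISIONKEY attribute by handing
         * KVM an fd for /dev/sgx_provision through KVM_CAP_SGX_ATTRIBUTE.
         */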
   4704static bool __kvm_enable_sgx_provisioning(KVMState *s)
   4705{
   4706    int fd, ret;
   4707
   4708    if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
   4709        return false;
   4710    }
   4711
   4712    fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
   4713    if (fd < 0) {
   4714        return false;
   4715    }
   4716
   4717    ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
   4718    if (ret) {
   4719        error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
   4720        exit(1);
   4721    }
   4722    close(fd);
   4723    return true;
   4724}
   4725
   4726bool kvm_enable_sgx_provisioning(KVMState *s)
   4727{
   4728    return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
   4729}
   4730
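        /* Check the host's CPUID leaf 1 ECX for the VMX (Intel VT-x) feature bit. */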
   4731static bool host_supports_vmx(void)
   4732{
   4733    uint32_t ecx, unused;
   4734
   4735    host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
   4736    return ecx & CPUID_EXT_VMX;
   4737}
   4738
   4739#define VMX_INVALID_GUEST_STATE 0x80000021
   4740
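        /*
         * Arch-specific dispatch for KVM_RUN exit reasons that the generic
         * accel code does not handle itself.
         */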
   4741int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
   4742{
   4743    X86CPU *cpu = X86_CPU(cs);
   4744    uint64_t code;
   4745    int ret;
   4746
   4747    switch (run->exit_reason) {
   4748    case KVM_EXIT_HLT:
   4749        DPRINTF("handle_hlt\n");
   4750        qemu_mutex_lock_iothread();
   4751        ret = kvm_handle_halt(cpu);
   4752        qemu_mutex_unlock_iothread();
   4753        break;
   4754    case KVM_EXIT_SET_TPR:
   4755        ret = 0;
   4756        break;
   4757    case KVM_EXIT_TPR_ACCESS:
   4758        qemu_mutex_lock_iothread();
   4759        ret = kvm_handle_tpr_access(cpu);
   4760        qemu_mutex_unlock_iothread();
   4761        break;
   4762    case KVM_EXIT_FAIL_ENTRY:
   4763        code = run->fail_entry.hardware_entry_failure_reason;
   4764        fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
   4765                code);
   4766        if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
   4767            fprintf(stderr,
   4768                    "\nIf you're running a guest on an Intel machine without "
   4769                        "unrestricted mode\n"
    4770                    "support, the failure is most likely due to the guest "
    4771                        "entering an invalid\n"
    4772                    "state for Intel VT. For example, the guest may be running "
    4773                        "in big real mode\n"
    4774                    "which is not supported on older Intel processors."
   4775                        "\n\n");
   4776        }
   4777        ret = -1;
   4778        break;
   4779    case KVM_EXIT_EXCEPTION:
   4780        fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
   4781                run->ex.exception, run->ex.error_code);
   4782        ret = -1;
   4783        break;
   4784    case KVM_EXIT_DEBUG:
   4785        DPRINTF("kvm_exit_debug\n");
   4786        qemu_mutex_lock_iothread();
   4787        ret = kvm_handle_debug(cpu, &run->debug.arch);
   4788        qemu_mutex_unlock_iothread();
   4789        break;
   4790    case KVM_EXIT_HYPERV:
   4791        ret = kvm_hv_handle_exit(cpu, &run->hyperv);
   4792        break;
   4793    case KVM_EXIT_IOAPIC_EOI:
   4794        ioapic_eoi_broadcast(run->eoi.vector);
   4795        ret = 0;
   4796        break;
   4797    case KVM_EXIT_X86_BUS_LOCK:
   4798        /* already handled in kvm_arch_post_run */
   4799        ret = 0;
   4800        break;
   4801    default:
   4802        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
   4803        ret = -1;
   4804        break;
   4805    }
   4806
   4807    return ret;
   4808}
   4809
   4810bool kvm_arch_stop_on_emulation_error(CPUState *cs)
   4811{
   4812    X86CPU *cpu = X86_CPU(cs);
   4813    CPUX86State *env = &cpu->env;
   4814
   4815    kvm_cpu_synchronize_state(cs);
   4816    return !(env->cr[0] & CR0_PE_MASK) ||
    4817           ((env->segs[R_CS].selector & 3) != 3);
   4818}
   4819
   4820void kvm_arch_init_irq_routing(KVMState *s)
   4821{
   4822    /* We know at this point that we're using the in-kernel
   4823     * irqchip, so we can use irqfds, and on x86 we know
   4824     * we can use msi via irqfd and GSI routing.
   4825     */
   4826    kvm_msi_via_irqfd_allowed = true;
   4827    kvm_gsi_routing_allowed = true;
   4828
   4829    if (kvm_irqchip_is_split()) {
   4830        int i;
   4831
   4832        /* If the ioapic is in QEMU and the lapics are in KVM, reserve
   4833           MSI routes for signaling interrupts to the local apics. */
   4834        for (i = 0; i < IOAPIC_NUM_PINS; i++) {
   4835            if (kvm_irqchip_add_msi_route(s, 0, NULL) < 0) {
   4836                error_report("Could not enable split IRQ mode.");
   4837                exit(1);
   4838            }
   4839        }
   4840    }
   4841}
   4842
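        /*
         * If a split irqchip was requested, enable KVM_CAP_SPLIT_IRQCHIP
         * (LAPIC in the kernel, IOAPIC/PIC in QEMU).  Returning 1 tells the
         * generic code that the irqchip is already set up; returning 0 lets
         * it create the full in-kernel irqchip instead.
         */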
   4843int kvm_arch_irqchip_create(KVMState *s)
   4844{
   4845    int ret;
   4846    if (kvm_kernel_irqchip_split()) {
   4847        ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
   4848        if (ret) {
   4849            error_report("Could not enable split irqchip mode: %s",
   4850                         strerror(-ret));
   4851            exit(1);
   4852        } else {
   4853            DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
   4854            kvm_split_irqchip = true;
   4855            return 1;
   4856        }
   4857    } else {
   4858        return 0;
   4859    }
   4860}
   4861
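        /*
         * With KVM_FEATURE_MSI_EXT_DEST_ID the guest may place an extended
         * destination ID in bits 11-5 of the MSI address; move those bits up
         * into the high 32 bits of the address in the layout KVM expects.
         */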
   4862uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
   4863{
   4864    CPUX86State *env;
   4865    uint64_t ext_id;
   4866
   4867    if (!first_cpu) {
   4868        return address;
   4869    }
   4870    env = &X86_CPU(first_cpu)->env;
   4871    if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
   4872        return address;
   4873    }
   4874
   4875    /*
   4876     * If the remappable format bit is set, or the upper bits are
   4877     * already set in address_hi, or the low extended bits aren't
   4878     * there anyway, do nothing.
   4879     */
   4880    ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
   4881    if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
   4882        return address;
   4883    }
   4884
   4885    address &= ~ext_id;
   4886    address |= ext_id << 35;
   4887    return address;
   4888}
   4889
   4890int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
   4891                             uint64_t address, uint32_t data, PCIDevice *dev)
   4892{
   4893    X86IOMMUState *iommu = x86_iommu_get_default();
   4894
   4895    if (iommu) {
   4896        X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
   4897
   4898        if (class->int_remap) {
   4899            int ret;
   4900            MSIMessage src, dst;
   4901
   4902            src.address = route->u.msi.address_hi;
   4903            src.address <<= VTD_MSI_ADDR_HI_SHIFT;
   4904            src.address |= route->u.msi.address_lo;
   4905            src.data = route->u.msi.data;
   4906
    4907            ret = class->int_remap(iommu, &src, &dst, dev ?
    4908                                   pci_requester_id(dev) :
    4909                                   X86_IOMMU_SID_INVALID);
   4910            if (ret) {
   4911                trace_kvm_x86_fixup_msi_error(route->gsi);
   4912                return 1;
   4913            }
   4914
   4915            /*
    4916             * Handle an untranslated compatibility format interrupt with
    4917             * extended destination ID in the low bits 11-5. */
   4918            dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);
   4919
   4920            route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
   4921            route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
   4922            route->u.msi.data = dst.data;
   4923            return 0;
   4924        }
   4925    }
   4926
   4927    address = kvm_swizzle_msi_ext_dest_id(address);
   4928    route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
   4929    route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
   4930    return 0;
   4931}
   4932
   4933typedef struct MSIRouteEntry MSIRouteEntry;
   4934
   4935struct MSIRouteEntry {
   4936    PCIDevice *dev;             /* Device pointer */
   4937    int vector;                 /* MSI/MSIX vector index */
   4938    int virq;                   /* Virtual IRQ index */
   4939    QLIST_ENTRY(MSIRouteEntry) list;
   4940};
   4941
   4942/* List of used GSI routes */
    4943static QLIST_HEAD(, MSIRouteEntry) msi_route_list =
   4944    QLIST_HEAD_INITIALIZER(msi_route_list);
   4945
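        /*
         * IEC invalidation callback, registered from
         * kvm_arch_add_msi_route_post(): re-read the MSI/MSI-X message of
         * every tracked PCI device and refresh its kernel IRQ route.
         */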
   4946static void kvm_update_msi_routes_all(void *private, bool global,
   4947                                      uint32_t index, uint32_t mask)
   4948{
   4949    int cnt = 0, vector;
   4950    MSIRouteEntry *entry;
   4951    MSIMessage msg;
   4952    PCIDevice *dev;
   4953
   4954    /* TODO: explicit route update */
   4955    QLIST_FOREACH(entry, &msi_route_list, list) {
   4956        cnt++;
   4957        vector = entry->vector;
   4958        dev = entry->dev;
   4959        if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
   4960            msg = msix_get_message(dev, vector);
   4961        } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
   4962            msg = msi_get_message(dev, vector);
   4963        } else {
   4964            /*
   4965             * Either MSI/MSIX is disabled for the device, or the
   4966             * specific message was masked out.  Skip this one.
   4967             */
   4968            continue;
   4969        }
   4970        kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
   4971    }
   4972    kvm_irqchip_commit_routes(kvm_state);
   4973    trace_kvm_x86_update_msi_routes(cnt);
   4974}
   4975
   4976int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
   4977                                int vector, PCIDevice *dev)
   4978{
   4979    static bool notify_list_inited = false;
   4980    MSIRouteEntry *entry;
   4981
   4982    if (!dev) {
    4983        /* These are (possibly) IOAPIC routes only used for the split
    4984         * kernel irqchip mode, while we only keep track of routes that
    4985         * belong to PCI devices. */
   4986        return 0;
   4987    }
   4988
   4989    entry = g_new0(MSIRouteEntry, 1);
   4990    entry->dev = dev;
   4991    entry->vector = vector;
   4992    entry->virq = route->gsi;
   4993    QLIST_INSERT_HEAD(&msi_route_list, entry, list);
   4994
   4995    trace_kvm_x86_add_msi_route(route->gsi);
   4996
   4997    if (!notify_list_inited) {
    4998        /* The first time we add a route, register ourselves on the
    4999         * IOMMU's IEC notify list if needed. */
   5000        X86IOMMUState *iommu = x86_iommu_get_default();
   5001        if (iommu) {
   5002            x86_iommu_iec_register_notifier(iommu,
   5003                                            kvm_update_msi_routes_all,
   5004                                            NULL);
   5005        }
   5006        notify_list_inited = true;
   5007    }
   5008    return 0;
   5009}
   5010
   5011int kvm_arch_release_virq_post(int virq)
   5012{
   5013    MSIRouteEntry *entry, *next;
   5014    QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
   5015        if (entry->virq == virq) {
   5016            trace_kvm_x86_remove_msi_route(virq);
   5017            QLIST_REMOVE(entry, list);
   5018            g_free(entry);
   5019            break;
   5020        }
   5021    }
   5022    return 0;
   5023}
   5024
   5025int kvm_arch_msi_data_to_gsi(uint32_t data)
   5026{
   5027    abort();
   5028}
   5029
   5030bool kvm_has_waitpkg(void)
   5031{
   5032    return has_msr_umwait;
   5033}
   5034
   5035bool kvm_arch_cpu_check_are_resettable(void)
   5036{
   5037    return !sev_es_enabled();
   5038}