cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

x86.c (352891B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Kernel-based Virtual Machine driver for Linux
      4 *
      5 * derived from drivers/kvm/kvm_main.c
      6 *
      7 * Copyright (C) 2006 Qumranet, Inc.
      8 * Copyright (C) 2008 Qumranet, Inc.
      9 * Copyright IBM Corporation, 2008
     10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
     11 *
     12 * Authors:
     13 *   Avi Kivity   <avi@qumranet.com>
     14 *   Yaniv Kamay  <yaniv@qumranet.com>
     15 *   Amit Shah    <amit.shah@qumranet.com>
     16 *   Ben-Ami Yassour <benami@il.ibm.com>
     17 */
     18
     19#include <linux/kvm_host.h>
     20#include "irq.h"
     21#include "ioapic.h"
     22#include "mmu.h"
     23#include "i8254.h"
     24#include "tss.h"
     25#include "kvm_cache_regs.h"
     26#include "kvm_emulate.h"
     27#include "x86.h"
     28#include "cpuid.h"
     29#include "pmu.h"
     30#include "hyperv.h"
     31#include "lapic.h"
     32#include "xen.h"
     33
     34#include <linux/clocksource.h>
     35#include <linux/interrupt.h>
     36#include <linux/kvm.h>
     37#include <linux/fs.h>
     38#include <linux/vmalloc.h>
     39#include <linux/export.h>
     40#include <linux/moduleparam.h>
     41#include <linux/mman.h>
     42#include <linux/highmem.h>
     43#include <linux/iommu.h>
     44#include <linux/intel-iommu.h>
     45#include <linux/cpufreq.h>
     46#include <linux/user-return-notifier.h>
     47#include <linux/srcu.h>
     48#include <linux/slab.h>
     49#include <linux/perf_event.h>
     50#include <linux/uaccess.h>
     51#include <linux/hash.h>
     52#include <linux/pci.h>
     53#include <linux/timekeeper_internal.h>
     54#include <linux/pvclock_gtod.h>
     55#include <linux/kvm_irqfd.h>
     56#include <linux/irqbypass.h>
     57#include <linux/sched/stat.h>
     58#include <linux/sched/isolation.h>
     59#include <linux/mem_encrypt.h>
     60#include <linux/entry-kvm.h>
     61#include <linux/suspend.h>
     62
     63#include <trace/events/kvm.h>
     64
     65#include <asm/debugreg.h>
     66#include <asm/msr.h>
     67#include <asm/desc.h>
     68#include <asm/mce.h>
     69#include <asm/pkru.h>
     70#include <linux/kernel_stat.h>
     71#include <asm/fpu/api.h>
     72#include <asm/fpu/xcr.h>
     73#include <asm/fpu/xstate.h>
     74#include <asm/pvclock.h>
     75#include <asm/div64.h>
     76#include <asm/irq_remapping.h>
     77#include <asm/mshyperv.h>
     78#include <asm/hypervisor.h>
     79#include <asm/tlbflush.h>
     80#include <asm/intel_pt.h>
     81#include <asm/emulate_prefix.h>
     82#include <asm/sgx.h>
     83#include <clocksource/hyperv_timer.h>
     84
     85#include "cachepc/cachepc.h"
     86#include "cachepc/event.h"
     87#include "cachepc/track.h"
     88
     89#define CREATE_TRACE_POINTS
     90#include "trace.h"
     91
     92#define MAX_IO_MSRS 256
     93#define KVM_MAX_MCE_BANKS 32
     94u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
     95EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
     96
     97#define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
     98
     99#define emul_to_vcpu(ctxt) \
    100	((struct kvm_vcpu *)(ctxt)->vcpu)
    101
    102/* EFER defaults:
     103 * - enable syscall by default because it's emulated by KVM
     104 * - enable LME and LMA by default on 64-bit KVM
    105 */
    106#ifdef CONFIG_X86_64
    107static
    108u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
    109#else
    110static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
    111#endif
    112
    113static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
    114
    115#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
    116
    117#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
    118
    119#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
    120                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
    121
    122static void update_cr8_intercept(struct kvm_vcpu *vcpu);
    123static void process_nmi(struct kvm_vcpu *vcpu);
    124static void process_smi(struct kvm_vcpu *vcpu);
    125static void enter_smm(struct kvm_vcpu *vcpu);
    126static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
    127static void store_regs(struct kvm_vcpu *vcpu);
    128static int sync_regs(struct kvm_vcpu *vcpu);
    129static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
    130
    131static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
    132static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
    133
    134struct kvm_x86_ops kvm_x86_ops __read_mostly;
    135
    136#define KVM_X86_OP(func)					     \
    137	DEFINE_STATIC_CALL_NULL(kvm_x86_##func,			     \
    138				*(((struct kvm_x86_ops *)0)->func));
    139#define KVM_X86_OP_OPTIONAL KVM_X86_OP
    140#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
    141#include <asm/kvm-x86-ops.h>
    142EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
    143EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
    144
    145static bool __read_mostly ignore_msrs = 0;
    146module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
    147
    148bool __read_mostly report_ignored_msrs = true;
    149module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
    150EXPORT_SYMBOL_GPL(report_ignored_msrs);
    151
    152unsigned int min_timer_period_us = 200;
    153module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
    154
    155static bool __read_mostly kvmclock_periodic_sync = true;
    156module_param(kvmclock_periodic_sync, bool, S_IRUGO);
    157
    158bool __read_mostly kvm_has_tsc_control;
    159EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
    160u32  __read_mostly kvm_max_guest_tsc_khz;
    161EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
    162u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
    163EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
    164u64  __read_mostly kvm_max_tsc_scaling_ratio;
    165EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
    166u64 __read_mostly kvm_default_tsc_scaling_ratio;
    167EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
    168bool __read_mostly kvm_has_bus_lock_exit;
    169EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
    170
    171/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
    172static u32 __read_mostly tsc_tolerance_ppm = 250;
    173module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
    174
    175/*
    176 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
    177 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
    178 * advancement entirely.  Any other value is used as-is and disables adaptive
    179 * tuning, i.e. allows privileged userspace to set an exact advancement time.
    180 */
    181static int __read_mostly lapic_timer_advance_ns = -1;
    182module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
    183
    184static bool __read_mostly vector_hashing = true;
    185module_param(vector_hashing, bool, S_IRUGO);
    186
    187bool __read_mostly enable_vmware_backdoor = false;
    188module_param(enable_vmware_backdoor, bool, S_IRUGO);
    189EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
    190
    191static bool __read_mostly force_emulation_prefix = false;
    192module_param(force_emulation_prefix, bool, S_IRUGO);
    193
    194int __read_mostly pi_inject_timer = -1;
    195module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
    196
    197/* Enable/disable PMU virtualization */
    198bool __read_mostly enable_pmu = true;
    199EXPORT_SYMBOL_GPL(enable_pmu);
    200module_param(enable_pmu, bool, 0444);
    201
    202bool __read_mostly eager_page_split = true;
    203module_param(eager_page_split, bool, 0644);
    204
    205/*
    206 * Restoring the host value for MSRs that are only consumed when running in
    207 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
    208 * returns to userspace, i.e. the kernel can run with the guest's value.
    209 */
    210#define KVM_MAX_NR_USER_RETURN_MSRS 16
    211
    212struct kvm_user_return_msrs {
    213	struct user_return_notifier urn;
    214	bool registered;
    215	struct kvm_user_return_msr_values {
    216		u64 host;
    217		u64 curr;
    218	} values[KVM_MAX_NR_USER_RETURN_MSRS];
    219};
    220
    221u32 __read_mostly kvm_nr_uret_msrs;
    222EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
    223static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
    224static struct kvm_user_return_msrs __percpu *user_return_msrs;
    225
    226#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
    227				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
    228				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
    229				| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
    230
    231u64 __read_mostly host_efer;
    232EXPORT_SYMBOL_GPL(host_efer);
    233
    234bool __read_mostly allow_smaller_maxphyaddr = 0;
    235EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
    236
    237bool __read_mostly enable_apicv = true;
    238EXPORT_SYMBOL_GPL(enable_apicv);
    239
    240u64 __read_mostly host_xss;
    241EXPORT_SYMBOL_GPL(host_xss);
    242u64 __read_mostly supported_xss;
    243EXPORT_SYMBOL_GPL(supported_xss);
    244
    245const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
    246	KVM_GENERIC_VM_STATS(),
    247	STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
    248	STATS_DESC_COUNTER(VM, mmu_pte_write),
    249	STATS_DESC_COUNTER(VM, mmu_pde_zapped),
    250	STATS_DESC_COUNTER(VM, mmu_flooded),
    251	STATS_DESC_COUNTER(VM, mmu_recycled),
    252	STATS_DESC_COUNTER(VM, mmu_cache_miss),
    253	STATS_DESC_ICOUNTER(VM, mmu_unsync),
    254	STATS_DESC_ICOUNTER(VM, pages_4k),
    255	STATS_DESC_ICOUNTER(VM, pages_2m),
    256	STATS_DESC_ICOUNTER(VM, pages_1g),
    257	STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
    258	STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
    259	STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
    260};
    261
    262const struct kvm_stats_header kvm_vm_stats_header = {
    263	.name_size = KVM_STATS_NAME_SIZE,
    264	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
    265	.id_offset = sizeof(struct kvm_stats_header),
    266	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
    267	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
    268		       sizeof(kvm_vm_stats_desc),
    269};
    270
    271const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
    272	KVM_GENERIC_VCPU_STATS(),
    273	STATS_DESC_COUNTER(VCPU, pf_taken),
    274	STATS_DESC_COUNTER(VCPU, pf_fixed),
    275	STATS_DESC_COUNTER(VCPU, pf_emulate),
    276	STATS_DESC_COUNTER(VCPU, pf_spurious),
    277	STATS_DESC_COUNTER(VCPU, pf_fast),
    278	STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
    279	STATS_DESC_COUNTER(VCPU, pf_guest),
    280	STATS_DESC_COUNTER(VCPU, tlb_flush),
    281	STATS_DESC_COUNTER(VCPU, invlpg),
    282	STATS_DESC_COUNTER(VCPU, exits),
    283	STATS_DESC_COUNTER(VCPU, io_exits),
    284	STATS_DESC_COUNTER(VCPU, mmio_exits),
    285	STATS_DESC_COUNTER(VCPU, signal_exits),
    286	STATS_DESC_COUNTER(VCPU, irq_window_exits),
    287	STATS_DESC_COUNTER(VCPU, nmi_window_exits),
    288	STATS_DESC_COUNTER(VCPU, l1d_flush),
    289	STATS_DESC_COUNTER(VCPU, halt_exits),
    290	STATS_DESC_COUNTER(VCPU, request_irq_exits),
    291	STATS_DESC_COUNTER(VCPU, irq_exits),
    292	STATS_DESC_COUNTER(VCPU, host_state_reload),
    293	STATS_DESC_COUNTER(VCPU, fpu_reload),
    294	STATS_DESC_COUNTER(VCPU, insn_emulation),
    295	STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
    296	STATS_DESC_COUNTER(VCPU, hypercalls),
    297	STATS_DESC_COUNTER(VCPU, irq_injections),
    298	STATS_DESC_COUNTER(VCPU, nmi_injections),
    299	STATS_DESC_COUNTER(VCPU, req_event),
    300	STATS_DESC_COUNTER(VCPU, nested_run),
    301	STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
    302	STATS_DESC_COUNTER(VCPU, directed_yield_successful),
    303	STATS_DESC_COUNTER(VCPU, preemption_reported),
    304	STATS_DESC_COUNTER(VCPU, preemption_other),
    305	STATS_DESC_ICOUNTER(VCPU, guest_mode)
    306};
    307
    308const struct kvm_stats_header kvm_vcpu_stats_header = {
    309	.name_size = KVM_STATS_NAME_SIZE,
    310	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
    311	.id_offset = sizeof(struct kvm_stats_header),
    312	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
    313	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
    314		       sizeof(kvm_vcpu_stats_desc),
    315};
    316
    317u64 __read_mostly host_xcr0;
    318u64 __read_mostly supported_xcr0;
    319EXPORT_SYMBOL_GPL(supported_xcr0);
    320
    321static struct kmem_cache *x86_emulator_cache;
    322
    323/*
     324 * Called when the previous get/set of an MSR hit an invalid MSR.
     325 * Return true if we want to ignore/silence this failed MSR access.
    326 */
    327static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
    328{
    329	const char *op = write ? "wrmsr" : "rdmsr";
    330
    331	if (ignore_msrs) {
    332		if (report_ignored_msrs)
    333			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
    334				      op, msr, data);
    335		/* Mask the error */
    336		return true;
    337	} else {
    338		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
    339				      op, msr, data);
    340		return false;
    341	}
    342}
    343
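        /* Cache for x86_emulate_ctxt objects; usercopy is limited to the region starting at 'src'. */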
    344static struct kmem_cache *kvm_alloc_emulator_cache(void)
    345{
    346	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
    347	unsigned int size = sizeof(struct x86_emulate_ctxt);
    348
    349	return kmem_cache_create_usercopy("x86_emulator", size,
    350					  __alignof__(struct x86_emulate_ctxt),
    351					  SLAB_ACCOUNT, useroffset,
    352					  size - useroffset, NULL);
    353}
    354
    355static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
    356
    357static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
    358{
    359	int i;
    360	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
    361		vcpu->arch.apf.gfns[i] = ~0;
    362}
    363
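        /* user-return notifier callback: unregister and restore the host's values for any user-return MSRs still holding guest values. */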
    364static void kvm_on_user_return(struct user_return_notifier *urn)
    365{
    366	unsigned slot;
    367	struct kvm_user_return_msrs *msrs
    368		= container_of(urn, struct kvm_user_return_msrs, urn);
    369	struct kvm_user_return_msr_values *values;
    370	unsigned long flags;
    371
    372	/*
    373	 * Disabling irqs at this point since the following code could be
    374	 * interrupted and executed through kvm_arch_hardware_disable()
    375	 */
    376	local_irq_save(flags);
    377	if (msrs->registered) {
    378		msrs->registered = false;
    379		user_return_notifier_unregister(urn);
    380	}
    381	local_irq_restore(flags);
    382	for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
    383		values = &msrs->values[slot];
    384		if (values->host != values->curr) {
    385			wrmsrl(kvm_uret_msrs_list[slot], values->host);
    386			values->curr = values->host;
    387		}
    388	}
    389}
    390
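        /* Probe that the MSR can be read and safely written back with its current value. */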
    391static int kvm_probe_user_return_msr(u32 msr)
    392{
    393	u64 val;
    394	int ret;
    395
    396	preempt_disable();
    397	ret = rdmsrl_safe(msr, &val);
    398	if (ret)
    399		goto out;
    400	ret = wrmsrl_safe(msr, val);
    401out:
    402	preempt_enable();
    403	return ret;
    404}
    405
    406int kvm_add_user_return_msr(u32 msr)
    407{
    408	BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
    409
    410	if (kvm_probe_user_return_msr(msr))
    411		return -1;
    412
    413	kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
    414	return kvm_nr_uret_msrs++;
    415}
    416EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
    417
    418int kvm_find_user_return_msr(u32 msr)
    419{
    420	int i;
    421
    422	for (i = 0; i < kvm_nr_uret_msrs; ++i) {
    423		if (kvm_uret_msrs_list[i] == msr)
    424			return i;
    425	}
    426	return -1;
    427}
    428EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
    429
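        /* Snapshot this CPU's current (host) values of all registered user-return MSRs. */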
    430static void kvm_user_return_msr_cpu_online(void)
    431{
    432	unsigned int cpu = smp_processor_id();
    433	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
    434	u64 value;
    435	int i;
    436
    437	for (i = 0; i < kvm_nr_uret_msrs; ++i) {
    438		rdmsrl_safe(kvm_uret_msrs_list[i], &value);
    439		msrs->values[i].host = value;
    440		msrs->values[i].curr = value;
    441	}
    442}
    443
    444int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
    445{
    446	unsigned int cpu = smp_processor_id();
    447	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
    448	int err;
    449
    450	value = (value & mask) | (msrs->values[slot].host & ~mask);
    451	if (value == msrs->values[slot].curr)
    452		return 0;
    453	err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
    454	if (err)
    455		return 1;
    456
    457	msrs->values[slot].curr = value;
    458	if (!msrs->registered) {
    459		msrs->urn.on_user_return = kvm_on_user_return;
    460		user_return_notifier_register(&msrs->urn);
    461		msrs->registered = true;
    462	}
    463	return 0;
    464}
    465EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
    466
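        /* Restore host MSR values on this CPU immediately if the notifier is still registered. */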
    467static void drop_user_return_notifiers(void)
    468{
    469	unsigned int cpu = smp_processor_id();
    470	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
    471
    472	if (msrs->registered)
    473		kvm_on_user_return(&msrs->urn);
    474}
    475
    476u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
    477{
    478	return vcpu->arch.apic_base;
    479}
    480EXPORT_SYMBOL_GPL(kvm_get_apic_base);
    481
    482enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
    483{
    484	return kvm_apic_mode(kvm_get_apic_base(vcpu));
    485}
    486EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
    487
    488int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
    489{
    490	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
    491	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
    492	u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
    493		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
    494
    495	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
    496		return 1;
    497	if (!msr_info->host_initiated) {
    498		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
    499			return 1;
    500		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
    501			return 1;
    502	}
    503
    504	kvm_lapic_set_base(vcpu, msr_info->data);
    505	kvm_recalculate_apic_map(vcpu->kvm);
    506	return 0;
    507}
    508EXPORT_SYMBOL_GPL(kvm_set_apic_base);
    509
    510/*
    511 * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
    512 *
    513 * Hardware virtualization extension instructions may fault if a reboot turns
    514 * off virtualization while processes are running.  Usually after catching the
    515 * fault we just panic; during reboot instead the instruction is ignored.
    516 */
    517noinstr void kvm_spurious_fault(void)
    518{
    519	/* Fault while not rebooting.  We want the trace. */
    520	BUG_ON(!kvm_rebooting);
    521}
    522EXPORT_SYMBOL_GPL(kvm_spurious_fault);
    523
    524#define EXCPT_BENIGN		0
    525#define EXCPT_CONTRIBUTORY	1
    526#define EXCPT_PF		2
    527
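        /* Classify a vector as benign, contributory, or page fault for double-fault detection (SDM Table 5-5). */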
    528static int exception_class(int vector)
    529{
    530	switch (vector) {
    531	case PF_VECTOR:
    532		return EXCPT_PF;
    533	case DE_VECTOR:
    534	case TS_VECTOR:
    535	case NP_VECTOR:
    536	case SS_VECTOR:
    537	case GP_VECTOR:
    538		return EXCPT_CONTRIBUTORY;
    539	default:
    540		break;
    541	}
    542	return EXCPT_BENIGN;
    543}
    544
    545#define EXCPT_FAULT		0
    546#define EXCPT_TRAP		1
    547#define EXCPT_ABORT		2
    548#define EXCPT_INTERRUPT		3
    549
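        /* Map a vector to its delivery type: fault, trap, or abort. */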
    550static int exception_type(int vector)
    551{
    552	unsigned int mask;
    553
    554	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
    555		return EXCPT_INTERRUPT;
    556
    557	mask = 1 << vector;
    558
    559	/* #DB is trap, as instruction watchpoints are handled elsewhere */
    560	if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
    561		return EXCPT_TRAP;
    562
    563	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
    564		return EXCPT_ABORT;
    565
    566	/* Reserved exceptions will result in fault */
    567	return EXCPT_FAULT;
    568}
    569
    570void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
    571{
    572	unsigned nr = vcpu->arch.exception.nr;
    573	bool has_payload = vcpu->arch.exception.has_payload;
    574	unsigned long payload = vcpu->arch.exception.payload;
    575
    576	if (!has_payload)
    577		return;
    578
    579	switch (nr) {
    580	case DB_VECTOR:
    581		/*
    582		 * "Certain debug exceptions may clear bit 0-3.  The
    583		 * remaining contents of the DR6 register are never
    584		 * cleared by the processor".
    585		 */
    586		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
    587		/*
    588		 * In order to reflect the #DB exception payload in guest
    589		 * dr6, three components need to be considered: active low
    590		 * bit, FIXED_1 bits and active high bits (e.g. DR6_BD,
    591		 * DR6_BS and DR6_BT)
    592		 * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
    593		 * In the target guest dr6:
    594		 * FIXED_1 bits should always be set.
    595		 * Active low bits should be cleared if 1-setting in payload.
    596		 * Active high bits should be set if 1-setting in payload.
    597		 *
    598		 * Note, the payload is compatible with the pending debug
    599		 * exceptions/exit qualification under VMX, that active_low bits
    600		 * are active high in payload.
    601		 * So they need to be flipped for DR6.
    602		 */
    603		vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
    604		vcpu->arch.dr6 |= payload;
    605		vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
    606
    607		/*
    608		 * The #DB payload is defined as compatible with the 'pending
    609		 * debug exceptions' field under VMX, not DR6. While bit 12 is
    610		 * defined in the 'pending debug exceptions' field (enabled
    611		 * breakpoint), it is reserved and must be zero in DR6.
    612		 */
    613		vcpu->arch.dr6 &= ~BIT(12);
    614		break;
    615	case PF_VECTOR:
    616		vcpu->arch.cr2 = payload;
    617		break;
    618	}
    619
    620	vcpu->arch.exception.has_payload = false;
    621	vcpu->arch.exception.payload = 0;
    622}
    623EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
    624
    625static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
    626		unsigned nr, bool has_error, u32 error_code,
    627	        bool has_payload, unsigned long payload, bool reinject)
    628{
    629	u32 prev_nr;
    630	int class1, class2;
    631
    632	kvm_make_request(KVM_REQ_EVENT, vcpu);
    633
    634	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
    635	queue:
    636		if (reinject) {
    637			/*
    638			 * On vmentry, vcpu->arch.exception.pending is only
    639			 * true if an event injection was blocked by
    640			 * nested_run_pending.  In that case, however,
    641			 * vcpu_enter_guest requests an immediate exit,
    642			 * and the guest shouldn't proceed far enough to
    643			 * need reinjection.
    644			 */
    645			WARN_ON_ONCE(vcpu->arch.exception.pending);
    646			vcpu->arch.exception.injected = true;
    647			if (WARN_ON_ONCE(has_payload)) {
    648				/*
    649				 * A reinjected event has already
    650				 * delivered its payload.
    651				 */
    652				has_payload = false;
    653				payload = 0;
    654			}
    655		} else {
    656			vcpu->arch.exception.pending = true;
    657			vcpu->arch.exception.injected = false;
    658		}
    659		vcpu->arch.exception.has_error_code = has_error;
    660		vcpu->arch.exception.nr = nr;
    661		vcpu->arch.exception.error_code = error_code;
    662		vcpu->arch.exception.has_payload = has_payload;
    663		vcpu->arch.exception.payload = payload;
    664		if (!is_guest_mode(vcpu))
    665			kvm_deliver_exception_payload(vcpu);
    666		return;
    667	}
    668
     669	/* A previous exception is pending or injected; check for double/triple fault escalation. */
    670	prev_nr = vcpu->arch.exception.nr;
    671	if (prev_nr == DF_VECTOR) {
    672		/* triple fault -> shutdown */
    673		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
    674		return;
    675	}
    676	class1 = exception_class(prev_nr);
    677	class2 = exception_class(nr);
    678	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
    679		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
    680		/*
    681		 * Generate double fault per SDM Table 5-5.  Set
    682		 * exception.pending = true so that the double fault
    683		 * can trigger a nested vmexit.
    684		 */
    685		vcpu->arch.exception.pending = true;
    686		vcpu->arch.exception.injected = false;
    687		vcpu->arch.exception.has_error_code = true;
    688		vcpu->arch.exception.nr = DF_VECTOR;
    689		vcpu->arch.exception.error_code = 0;
    690		vcpu->arch.exception.has_payload = false;
    691		vcpu->arch.exception.payload = 0;
    692	} else
    693		/* replace previous exception with a new one in a hope
     694		/* Replace the previous exception with the new one in the hope
     695		   that re-executing the instruction will regenerate the lost
     696		   exception. */
    697}
    698
    699void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
    700{
    701	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
    702}
    703EXPORT_SYMBOL_GPL(kvm_queue_exception);
    704
    705void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
    706{
    707	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
    708}
    709EXPORT_SYMBOL_GPL(kvm_requeue_exception);
    710
    711void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
    712			   unsigned long payload)
    713{
    714	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
    715}
    716EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
    717
    718static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
    719				    u32 error_code, unsigned long payload)
    720{
    721	kvm_multiple_exception(vcpu, nr, true, error_code,
    722			       true, payload, false);
    723}
    724
    725int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
    726{
    727	if (err)
    728		kvm_inject_gp(vcpu, 0);
    729	else
    730		return kvm_skip_emulated_instruction(vcpu);
    731
    732	return 1;
    733}
    734EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
    735
    736static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
    737{
    738	if (err) {
    739		kvm_inject_gp(vcpu, 0);
    740		return 1;
    741	}
    742
    743	return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
    744				       EMULTYPE_COMPLETE_USER_EXIT);
    745}
    746
    747void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
    748{
    749	++vcpu->stat.pf_guest;
    750	vcpu->arch.exception.nested_apf =
    751		is_guest_mode(vcpu) && fault->async_page_fault;
    752	if (vcpu->arch.exception.nested_apf) {
    753		vcpu->arch.apf.nested_apf_token = fault->address;
    754		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
    755	} else {
    756		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
    757					fault->address);
    758	}
    759}
    760EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
    761
    762/* Returns true if the page fault was immediately morphed into a VM-Exit. */
    763bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
    764				    struct x86_exception *fault)
    765{
    766	struct kvm_mmu *fault_mmu;
    767	WARN_ON_ONCE(fault->vector != PF_VECTOR);
    768
    769	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
    770					       vcpu->arch.walk_mmu;
    771
    772	/*
    773	 * Invalidate the TLB entry for the faulting address, if it exists,
    774	 * else the access will fault indefinitely (and to emulate hardware).
    775	 */
    776	if ((fault->error_code & PFERR_PRESENT_MASK) &&
    777	    !(fault->error_code & PFERR_RSVD_MASK))
    778		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
    779				       fault_mmu->root.hpa);
    780
    781	/*
    782	 * A workaround for KVM's bad exception handling.  If KVM injected an
    783	 * exception into L2, and L2 encountered a #PF while vectoring the
    784	 * injected exception, manually check to see if L1 wants to intercept
    785	 * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
    786	 * In all other cases, defer the check to nested_ops->check_events(),
    787	 * which will correctly handle priority (this does not).  Note, other
    788	 * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
    789	 * most problematic, e.g. when L0 and L1 are both intercepting #PF for
    790	 * shadow paging.
    791	 *
    792	 * TODO: Rewrite exception handling to track injected and pending
    793	 *       (VM-Exit) exceptions separately.
    794	 */
    795	if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
    796	    kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
    797		return true;
    798
    799	fault_mmu->inject_page_fault(vcpu, fault);
    800	return false;
    801}
    802EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
    803
    804void kvm_inject_nmi(struct kvm_vcpu *vcpu)
    805{
    806	atomic_inc(&vcpu->arch.nmi_queued);
    807	kvm_make_request(KVM_REQ_NMI, vcpu);
    808}
    809EXPORT_SYMBOL_GPL(kvm_inject_nmi);
    810
    811void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
    812{
    813	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
    814}
    815EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
    816
    817void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
    818{
    819	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
    820}
    821EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
    822
    823/*
    824 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
    825 * a #GP and return false.
    826 */
    827bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
    828{
    829	if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
    830		return true;
    831	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
    832	return false;
    833}
    834EXPORT_SYMBOL_GPL(kvm_require_cpl);
    835
    836bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
    837{
    838	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
    839		return true;
    840
    841	kvm_queue_exception(vcpu, UD_VECTOR);
    842	return false;
    843}
    844EXPORT_SYMBOL_GPL(kvm_require_dr);
    845
    846static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
    847{
    848	return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
    849}
    850
    851/*
    852 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
    853 */
    854int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
    855{
    856	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
    857	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
    858	gpa_t real_gpa;
    859	int i;
    860	int ret;
    861	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
    862
    863	/*
    864	 * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
    865	 * to an L1 GPA.
    866	 */
    867	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
    868				     PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
    869	if (real_gpa == UNMAPPED_GVA)
    870		return 0;
    871
    872	/* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
    873	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
    874				       cr3 & GENMASK(11, 5), sizeof(pdpte));
    875	if (ret < 0)
    876		return 0;
    877
    878	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
    879		if ((pdpte[i] & PT_PRESENT_MASK) &&
    880		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
    881			return 0;
    882		}
    883	}
    884
    885	/*
    886	 * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
    887	 * Shadow page roots need to be reconstructed instead.
    888	 */
    889	if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
    890		kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
    891
    892	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
    893	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
    894	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
    895	vcpu->arch.pdptrs_from_userspace = false;
    896
    897	return 1;
    898}
    899EXPORT_SYMBOL_GPL(load_pdptrs);
    900
    901void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
    902{
    903	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
    904		kvm_clear_async_pf_completion_queue(vcpu);
    905		kvm_async_pf_hash_reset(vcpu);
    906
    907		/*
    908		 * Clearing CR0.PG is defined to flush the TLB from the guest's
    909		 * perspective.
    910		 */
    911		if (!(cr0 & X86_CR0_PG))
    912			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
    913	}
    914
    915	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
    916		kvm_mmu_reset_context(vcpu);
    917
    918	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
    919	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
    920	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
    921		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
    922}
    923EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
    924
    925int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
    926{
    927	unsigned long old_cr0 = kvm_read_cr0(vcpu);
    928
    929	cr0 |= X86_CR0_ET;
    930
    931#ifdef CONFIG_X86_64
    932	if (cr0 & 0xffffffff00000000UL)
    933		return 1;
    934#endif
    935
    936	cr0 &= ~CR0_RESERVED_BITS;
    937
    938	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
    939		return 1;
    940
    941	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
    942		return 1;
    943
    944#ifdef CONFIG_X86_64
    945	if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
    946	    (cr0 & X86_CR0_PG)) {
    947		int cs_db, cs_l;
    948
    949		if (!is_pae(vcpu))
    950			return 1;
    951		static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
    952		if (cs_l)
    953			return 1;
    954	}
    955#endif
    956	if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
    957	    is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
    958	    !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
    959		return 1;
    960
    961	if (!(cr0 & X86_CR0_PG) &&
    962	    (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
    963		return 1;
    964
    965	static_call(kvm_x86_set_cr0)(vcpu, cr0);
    966
    967	kvm_post_set_cr0(vcpu, old_cr0, cr0);
    968
    969	return 0;
    970}
    971EXPORT_SYMBOL_GPL(kvm_set_cr0);
    972
    973void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
    974{
    975	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
    976}
    977EXPORT_SYMBOL_GPL(kvm_lmsw);
    978
    979void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
    980{
    981	if (vcpu->arch.guest_state_protected)
    982		return;
    983
    984	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
    985
    986		if (vcpu->arch.xcr0 != host_xcr0)
    987			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
    988
    989		if (vcpu->arch.xsaves_enabled &&
    990		    vcpu->arch.ia32_xss != host_xss)
    991			wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
    992	}
    993
    994#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
    995	if (static_cpu_has(X86_FEATURE_PKU) &&
    996	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
    997	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
    998	     kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
    999		write_pkru(vcpu->arch.pkru);
   1000#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
   1001}
   1002EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
   1003
   1004void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
   1005{
   1006	if (vcpu->arch.guest_state_protected)
   1007		return;
   1008
   1009#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
   1010	if (static_cpu_has(X86_FEATURE_PKU) &&
   1011	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
   1012	     kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
   1013		vcpu->arch.pkru = rdpkru();
   1014		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
   1015			write_pkru(vcpu->arch.host_pkru);
   1016	}
   1017#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
   1018
   1019	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
   1020
   1021		if (vcpu->arch.xcr0 != host_xcr0)
   1022			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
   1023
   1024		if (vcpu->arch.xsaves_enabled &&
   1025		    vcpu->arch.ia32_xss != host_xss)
   1026			wrmsrl(MSR_IA32_XSS, host_xss);
   1027	}
   1028
   1029}
   1030EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
   1031
   1032static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
   1033{
   1034	return vcpu->arch.guest_fpu.fpstate->user_xfeatures;
   1035}
   1036
   1037#ifdef CONFIG_X86_64
   1038static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
   1039{
   1040	return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC;
   1041}
   1042#endif
   1043
   1044static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
   1045{
   1046	u64 xcr0 = xcr;
   1047	u64 old_xcr0 = vcpu->arch.xcr0;
   1048	u64 valid_bits;
   1049
   1050	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
   1051	if (index != XCR_XFEATURE_ENABLED_MASK)
   1052		return 1;
   1053	if (!(xcr0 & XFEATURE_MASK_FP))
   1054		return 1;
   1055	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
   1056		return 1;
   1057
   1058	/*
   1059	 * Do not allow the guest to set bits that we do not support
   1060	 * saving.  However, xcr0 bit 0 is always set, even if the
   1061	 * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
   1062	 */
   1063	valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP;
   1064	if (xcr0 & ~valid_bits)
   1065		return 1;
   1066
   1067	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
   1068	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
   1069		return 1;
   1070
   1071	if (xcr0 & XFEATURE_MASK_AVX512) {
   1072		if (!(xcr0 & XFEATURE_MASK_YMM))
   1073			return 1;
   1074		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
   1075			return 1;
   1076	}
   1077
   1078	if ((xcr0 & XFEATURE_MASK_XTILE) &&
   1079	    ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
   1080		return 1;
   1081
   1082	vcpu->arch.xcr0 = xcr0;
   1083
   1084	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
   1085		kvm_update_cpuid_runtime(vcpu);
   1086	return 0;
   1087}
   1088
   1089int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
   1090{
   1091	if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
   1092	    __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
   1093		kvm_inject_gp(vcpu, 0);
   1094		return 1;
   1095	}
   1096
   1097	return kvm_skip_emulated_instruction(vcpu);
   1098}
   1099EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
   1100
   1101bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
   1102{
   1103	if (cr4 & cr4_reserved_bits)
   1104		return false;
   1105
   1106	if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
   1107		return false;
   1108
   1109	return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
   1110}
   1111EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
   1112
   1113void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
   1114{
   1115	if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
   1116		kvm_mmu_reset_context(vcpu);
   1117
   1118	/*
   1119	 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
   1120	 * according to the SDM; however, stale prev_roots could be reused
   1121	 * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
   1122	 * free them all.  This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
   1123	 * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
   1124	 * so fall through.
   1125	 */
   1126	if (!tdp_enabled &&
   1127	    (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
   1128		kvm_mmu_unload(vcpu);
   1129
   1130	/*
   1131	 * The TLB has to be flushed for all PCIDs if any of the following
   1132	 * (architecturally required) changes happen:
   1133	 * - CR4.PCIDE is changed from 1 to 0
   1134	 * - CR4.PGE is toggled
   1135	 *
   1136	 * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
   1137	 */
   1138	if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
   1139	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
   1140		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
   1141
   1142	/*
   1143	 * The TLB has to be flushed for the current PCID if any of the
   1144	 * following (architecturally required) changes happen:
   1145	 * - CR4.SMEP is changed from 0 to 1
   1146	 * - CR4.PAE is toggled
   1147	 */
   1148	else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
   1149		 ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
   1150		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
   1151
   1152}
   1153EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
   1154
   1155int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
   1156{
   1157	unsigned long old_cr4 = kvm_read_cr4(vcpu);
   1158
   1159	if (!kvm_is_valid_cr4(vcpu, cr4))
   1160		return 1;
   1161
   1162	if (is_long_mode(vcpu)) {
   1163		if (!(cr4 & X86_CR4_PAE))
   1164			return 1;
   1165		if ((cr4 ^ old_cr4) & X86_CR4_LA57)
   1166			return 1;
   1167	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
   1168		   && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
   1169		   && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
   1170		return 1;
   1171
   1172	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
   1173		if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
   1174			return 1;
   1175
    1176		/* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
   1177		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
   1178			return 1;
   1179	}
   1180
   1181	static_call(kvm_x86_set_cr4)(vcpu, cr4);
   1182
   1183	kvm_post_set_cr4(vcpu, old_cr4, cr4);
   1184
   1185	return 0;
   1186}
   1187EXPORT_SYMBOL_GPL(kvm_set_cr4);
   1188
   1189static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
   1190{
   1191	struct kvm_mmu *mmu = vcpu->arch.mmu;
   1192	unsigned long roots_to_free = 0;
   1193	int i;
   1194
   1195	/*
   1196	 * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
   1197	 * this is reachable when running EPT=1 and unrestricted_guest=0,  and
   1198	 * also via the emulator.  KVM's TDP page tables are not in the scope of
   1199	 * the invalidation, but the guest's TLB entries need to be flushed as
   1200	 * the CPU may have cached entries in its TLB for the target PCID.
   1201	 */
   1202	if (unlikely(tdp_enabled)) {
   1203		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
   1204		return;
   1205	}
   1206
   1207	/*
   1208	 * If neither the current CR3 nor any of the prev_roots use the given
   1209	 * PCID, then nothing needs to be done here because a resync will
   1210	 * happen anyway before switching to any other CR3.
   1211	 */
   1212	if (kvm_get_active_pcid(vcpu) == pcid) {
   1213		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
   1214		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
   1215	}
   1216
   1217	/*
   1218	 * If PCID is disabled, there is no need to free prev_roots even if the
   1219	 * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
   1220	 * with PCIDE=0.
   1221	 */
   1222	if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
   1223		return;
   1224
   1225	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
   1226		if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
   1227			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
   1228
   1229	kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
   1230}
   1231
   1232int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
   1233{
   1234	bool skip_tlb_flush = false;
   1235	unsigned long pcid = 0;
   1236#ifdef CONFIG_X86_64
   1237	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
   1238
   1239	if (pcid_enabled) {
   1240		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
   1241		cr3 &= ~X86_CR3_PCID_NOFLUSH;
   1242		pcid = cr3 & X86_CR3_PCID_MASK;
   1243	}
   1244#endif
   1245
   1246	/* PDPTRs are always reloaded for PAE paging. */
   1247	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
   1248		goto handle_tlb_flush;
   1249
   1250	/*
   1251	 * Do not condition the GPA check on long mode, this helper is used to
   1252	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
   1253	 * the current vCPU mode is accurate.
   1254	 */
   1255	if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
   1256		return 1;
   1257
   1258	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
   1259		return 1;
   1260
   1261	if (cr3 != kvm_read_cr3(vcpu))
   1262		kvm_mmu_new_pgd(vcpu, cr3);
   1263
   1264	vcpu->arch.cr3 = cr3;
   1265	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
   1266	/* Do not call post_set_cr3, we do not get here for confidential guests.  */
   1267
   1268handle_tlb_flush:
   1269	/*
   1270	 * A load of CR3 that flushes the TLB flushes only the current PCID,
   1271	 * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
   1272	 * moot point in the end because _disabling_ PCID will flush all PCIDs,
   1273	 * and it's impossible to use a non-zero PCID when PCID is disabled,
   1274	 * i.e. only PCID=0 can be relevant.
   1275	 */
   1276	if (!skip_tlb_flush)
   1277		kvm_invalidate_pcid(vcpu, pcid);
   1278
   1279	return 0;
   1280}
   1281EXPORT_SYMBOL_GPL(kvm_set_cr3);
   1282
   1283int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
   1284{
   1285	if (cr8 & CR8_RESERVED_BITS)
   1286		return 1;
   1287	if (lapic_in_kernel(vcpu))
   1288		kvm_lapic_set_tpr(vcpu, cr8);
   1289	else
   1290		vcpu->arch.cr8 = cr8;
   1291	return 0;
   1292}
   1293EXPORT_SYMBOL_GPL(kvm_set_cr8);
   1294
   1295unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
   1296{
   1297	if (lapic_in_kernel(vcpu))
   1298		return kvm_lapic_get_cr8(vcpu);
   1299	else
   1300		return vcpu->arch.cr8;
   1301}
   1302EXPORT_SYMBOL_GPL(kvm_get_cr8);
   1303
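        /* Copy the guest's DR0-DR3 into the effective debug registers unless userspace hardware breakpoints are active. */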
   1304static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
   1305{
   1306	int i;
   1307
   1308	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
   1309		for (i = 0; i < KVM_NR_DB_REGS; i++)
   1310			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
   1311	}
   1312}
   1313
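        /* Load DR7 from either the userspace debug state or the guest, and refresh the BP-enabled flag. */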
   1314void kvm_update_dr7(struct kvm_vcpu *vcpu)
   1315{
   1316	unsigned long dr7;
   1317
   1318	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
   1319		dr7 = vcpu->arch.guest_debug_dr7;
   1320	else
   1321		dr7 = vcpu->arch.dr7;
   1322	static_call(kvm_x86_set_dr7)(vcpu, dr7);
   1323	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
   1324	if (dr7 & DR7_BP_EN_MASK)
   1325		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
   1326}
   1327EXPORT_SYMBOL_GPL(kvm_update_dr7);
   1328
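        /* DR6 bits that are fixed to 1 for this guest: the RTM and bus-lock bits read as 1 unless the corresponding feature is exposed. */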
   1329static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
   1330{
   1331	u64 fixed = DR6_FIXED_1;
   1332
   1333	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
   1334		fixed |= DR6_RTM;
   1335
   1336	if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
   1337		fixed |= DR6_BUS_LOCK;
   1338	return fixed;
   1339}
   1340
   1341int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
   1342{
   1343	size_t size = ARRAY_SIZE(vcpu->arch.db);
   1344
   1345	switch (dr) {
   1346	case 0 ... 3:
   1347		vcpu->arch.db[array_index_nospec(dr, size)] = val;
   1348		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
   1349			vcpu->arch.eff_db[dr] = val;
   1350		break;
   1351	case 4:
   1352	case 6:
   1353		if (!kvm_dr6_valid(val))
   1354			return 1; /* #GP */
   1355		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
   1356		break;
   1357	case 5:
   1358	default: /* 7 */
   1359		if (!kvm_dr7_valid(val))
   1360			return 1; /* #GP */
   1361		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
   1362		kvm_update_dr7(vcpu);
   1363		break;
   1364	}
   1365
   1366	return 0;
   1367}
   1368EXPORT_SYMBOL_GPL(kvm_set_dr);
   1369
   1370void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
   1371{
   1372	size_t size = ARRAY_SIZE(vcpu->arch.db);
   1373
   1374	switch (dr) {
   1375	case 0 ... 3:
   1376		*val = vcpu->arch.db[array_index_nospec(dr, size)];
   1377		break;
   1378	case 4:
   1379	case 6:
   1380		*val = vcpu->arch.dr6;
   1381		break;
   1382	case 5:
   1383	default: /* 7 */
   1384		*val = vcpu->arch.dr7;
   1385		break;
   1386	}
   1387}
   1388EXPORT_SYMBOL_GPL(kvm_get_dr);
   1389
   1390int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
   1391{
   1392	u32 ecx = kvm_rcx_read(vcpu);
   1393	u64 data;
   1394
   1395	if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
   1396		kvm_inject_gp(vcpu, 0);
   1397		return 1;
   1398	}
   1399
   1400	kvm_rax_write(vcpu, (u32)data);
   1401	kvm_rdx_write(vcpu, data >> 32);
   1402	return kvm_skip_emulated_instruction(vcpu);
   1403}
   1404EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
   1405
   1406/*
   1407 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
   1408 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
   1409 *
    1410 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
   1411 * extract the supported MSRs from the related const lists.
   1412 * msrs_to_save is selected from the msrs_to_save_all to reflect the
   1413 * capabilities of the host cpu. This capabilities test skips MSRs that are
   1414 * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
   1415 * may depend on host virtualization features rather than host cpu features.
   1416 */
   1417
   1418static const u32 msrs_to_save_all[] = {
   1419	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
   1420	MSR_STAR,
   1421#ifdef CONFIG_X86_64
   1422	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
   1423#endif
   1424	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
   1425	MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
   1426	MSR_IA32_SPEC_CTRL,
   1427	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
   1428	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
   1429	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
   1430	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
   1431	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
   1432	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
   1433	MSR_IA32_UMWAIT_CONTROL,
   1434
   1435	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
   1436	MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
   1437	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
   1438	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
   1439	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
   1440	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
   1441	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
   1442	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
   1443	MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
   1444	MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
   1445	MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
   1446	MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
   1447	MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
   1448	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
   1449	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
   1450	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
   1451	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
   1452	MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
   1453	MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
   1454	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
   1455	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
   1456	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
   1457
   1458	MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
   1459	MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
   1460	MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
   1461	MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
   1462	MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
   1463	MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
   1464	MSR_IA32_XFD, MSR_IA32_XFD_ERR,
   1465};
   1466
   1467static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
   1468static unsigned num_msrs_to_save;
   1469
   1470static const u32 emulated_msrs_all[] = {
   1471	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
   1472	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
   1473	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
   1474	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
   1475	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
   1476	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
   1477	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
   1478	HV_X64_MSR_RESET,
   1479	HV_X64_MSR_VP_INDEX,
   1480	HV_X64_MSR_VP_RUNTIME,
   1481	HV_X64_MSR_SCONTROL,
   1482	HV_X64_MSR_STIMER0_CONFIG,
   1483	HV_X64_MSR_VP_ASSIST_PAGE,
   1484	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
   1485	HV_X64_MSR_TSC_EMULATION_STATUS,
   1486	HV_X64_MSR_SYNDBG_OPTIONS,
   1487	HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
   1488	HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
   1489	HV_X64_MSR_SYNDBG_PENDING_BUFFER,
   1490
   1491	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
   1492	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
   1493
   1494	MSR_IA32_TSC_ADJUST,
   1495	MSR_IA32_TSC_DEADLINE,
   1496	MSR_IA32_ARCH_CAPABILITIES,
   1497	MSR_IA32_PERF_CAPABILITIES,
   1498	MSR_IA32_MISC_ENABLE,
   1499	MSR_IA32_MCG_STATUS,
   1500	MSR_IA32_MCG_CTL,
   1501	MSR_IA32_MCG_EXT_CTL,
   1502	MSR_IA32_SMBASE,
   1503	MSR_SMI_COUNT,
   1504	MSR_PLATFORM_INFO,
   1505	MSR_MISC_FEATURES_ENABLES,
   1506	MSR_AMD64_VIRT_SPEC_CTRL,
   1507	MSR_AMD64_TSC_RATIO,
   1508	MSR_IA32_POWER_CTL,
   1509	MSR_IA32_UCODE_REV,
   1510
   1511	/*
   1512	 * The following list leaves out MSRs whose values are determined
   1513	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
   1514	 * We always support the "true" VMX control MSRs, even if the host
   1515	 * processor does not, so I am putting these registers here rather
   1516	 * than in msrs_to_save_all.
   1517	 */
   1518	MSR_IA32_VMX_BASIC,
   1519	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
   1520	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
   1521	MSR_IA32_VMX_TRUE_EXIT_CTLS,
   1522	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
   1523	MSR_IA32_VMX_MISC,
   1524	MSR_IA32_VMX_CR0_FIXED0,
   1525	MSR_IA32_VMX_CR4_FIXED0,
   1526	MSR_IA32_VMX_VMCS_ENUM,
   1527	MSR_IA32_VMX_PROCBASED_CTLS2,
   1528	MSR_IA32_VMX_EPT_VPID_CAP,
   1529	MSR_IA32_VMX_VMFUNC,
   1530
   1531	MSR_K7_HWCR,
   1532	MSR_KVM_POLL_CONTROL,
   1533};
   1534
   1535static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
   1536static unsigned num_emulated_msrs;
   1537
   1538/*
   1539 * List of msr numbers which are used to expose MSR-based features that
   1540 * can be used by a hypervisor to validate requested CPU features.
   1541 */
   1542static const u32 msr_based_features_all[] = {
   1543	MSR_IA32_VMX_BASIC,
   1544	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
   1545	MSR_IA32_VMX_PINBASED_CTLS,
   1546	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
   1547	MSR_IA32_VMX_PROCBASED_CTLS,
   1548	MSR_IA32_VMX_TRUE_EXIT_CTLS,
   1549	MSR_IA32_VMX_EXIT_CTLS,
   1550	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
   1551	MSR_IA32_VMX_ENTRY_CTLS,
   1552	MSR_IA32_VMX_MISC,
   1553	MSR_IA32_VMX_CR0_FIXED0,
   1554	MSR_IA32_VMX_CR0_FIXED1,
   1555	MSR_IA32_VMX_CR4_FIXED0,
   1556	MSR_IA32_VMX_CR4_FIXED1,
   1557	MSR_IA32_VMX_VMCS_ENUM,
   1558	MSR_IA32_VMX_PROCBASED_CTLS2,
   1559	MSR_IA32_VMX_EPT_VPID_CAP,
   1560	MSR_IA32_VMX_VMFUNC,
   1561
   1562	MSR_F10H_DECFG,
   1563	MSR_IA32_UCODE_REV,
   1564	MSR_IA32_ARCH_CAPABILITIES,
   1565	MSR_IA32_PERF_CAPABILITIES,
   1566};
   1567
   1568static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
   1569static unsigned int num_msr_based_features;
   1570
   1571static u64 kvm_get_arch_capabilities(void)
   1572{
   1573	u64 data = 0;
   1574
   1575	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
   1576		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
   1577
   1578	/*
   1579	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
    1580	 * the nested hypervisor runs with NX huge pages.  If it is not
    1581	 * enabled, L1 is vulnerable anyway to ITLB_MULTIHIT exploits from
    1582	 * other L1 guests, so it need not worry about its own (L2) guests.
   1583	 */
   1584	data |= ARCH_CAP_PSCHANGE_MC_NO;
   1585
   1586	/*
   1587	 * If we're doing cache flushes (either "always" or "cond")
   1588	 * we will do one whenever the guest does a vmlaunch/vmresume.
   1589	 * If an outer hypervisor is doing the cache flush for us
   1590	 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
   1591	 * capability to the guest too, and if EPT is disabled we're not
   1592	 * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
   1593	 * require a nested hypervisor to do a flush of its own.
   1594	 */
   1595	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
   1596		data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
   1597
   1598	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
   1599		data |= ARCH_CAP_RDCL_NO;
   1600	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
   1601		data |= ARCH_CAP_SSB_NO;
   1602	if (!boot_cpu_has_bug(X86_BUG_MDS))
   1603		data |= ARCH_CAP_MDS_NO;
   1604
   1605	if (!boot_cpu_has(X86_FEATURE_RTM)) {
   1606		/*
   1607		 * If RTM=0 because the kernel has disabled TSX, the host might
   1608		 * have TAA_NO or TSX_CTRL.  Clear TAA_NO (the guest sees RTM=0
   1609		 * and therefore knows that there cannot be TAA) but keep
   1610		 * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
   1611		 * and we want to allow migrating those guests to tsx=off hosts.
   1612		 */
   1613		data &= ~ARCH_CAP_TAA_NO;
   1614	} else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
   1615		data |= ARCH_CAP_TAA_NO;
   1616	} else {
   1617		/*
   1618		 * Nothing to do here; we emulate TSX_CTRL if present on the
   1619		 * host so the guest can choose between disabling TSX or
   1620		 * using VERW to clear CPU buffers.
   1621		 */
   1622	}
   1623
   1624	/* Guests don't need to know "Fill buffer clear control" exists */
   1625	data &= ~ARCH_CAP_FB_CLEAR_CTRL;
   1626
   1627	return data;
   1628}
   1629
   1630static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
   1631{
   1632	switch (msr->index) {
   1633	case MSR_IA32_ARCH_CAPABILITIES:
   1634		msr->data = kvm_get_arch_capabilities();
   1635		break;
   1636	case MSR_IA32_UCODE_REV:
   1637		rdmsrl_safe(msr->index, &msr->data);
   1638		break;
   1639	default:
   1640		return static_call(kvm_x86_get_msr_feature)(msr);
   1641	}
   1642	return 0;
   1643}
   1644
   1645static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
   1646{
   1647	struct kvm_msr_entry msr;
   1648	int r;
   1649
   1650	msr.index = index;
   1651	r = kvm_get_msr_feature(&msr);
   1652
   1653	if (r == KVM_MSR_RET_INVALID) {
   1654		/* Unconditionally clear the output for simplicity */
   1655		*data = 0;
   1656		if (kvm_msr_ignored_check(index, 0, false))
   1657			r = 0;
   1658	}
   1659
   1660	if (r)
   1661		return r;
   1662
   1663	*data = msr.data;
   1664
   1665	return 0;
   1666}
   1667
   1668static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
   1669{
   1670	if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
   1671		return false;
   1672
   1673	if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
   1674		return false;
   1675
   1676	if (efer & (EFER_LME | EFER_LMA) &&
   1677	    !guest_cpuid_has(vcpu, X86_FEATURE_LM))
   1678		return false;
   1679
   1680	if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
   1681		return false;
   1682
   1683	return true;
   1684
   1685}
   1686bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
   1687{
   1688	if (efer & efer_reserved_bits)
   1689		return false;
   1690
   1691	return __kvm_valid_efer(vcpu, efer);
   1692}
   1693EXPORT_SYMBOL_GPL(kvm_valid_efer);
   1694
   1695static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
   1696{
   1697	u64 old_efer = vcpu->arch.efer;
   1698	u64 efer = msr_info->data;
   1699	int r;
   1700
   1701	if (efer & efer_reserved_bits)
   1702		return 1;
   1703
   1704	if (!msr_info->host_initiated) {
   1705		if (!__kvm_valid_efer(vcpu, efer))
   1706			return 1;
   1707
   1708		if (is_paging(vcpu) &&
   1709		    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
   1710			return 1;
   1711	}
   1712
   1713	efer &= ~EFER_LMA;
   1714	efer |= vcpu->arch.efer & EFER_LMA;
   1715
   1716	r = static_call(kvm_x86_set_efer)(vcpu, efer);
   1717	if (r) {
   1718		WARN_ON(r > 0);
   1719		return r;
   1720	}
   1721
   1722	if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
   1723		kvm_mmu_reset_context(vcpu);
   1724
   1725	return 0;
   1726}
   1727
   1728void kvm_enable_efer_bits(u64 mask)
   1729{
    1730	efer_reserved_bits &= ~mask;
   1731}
   1732EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
   1733
   1734bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
   1735{
   1736	struct kvm_x86_msr_filter *msr_filter;
   1737	struct msr_bitmap_range *ranges;
   1738	struct kvm *kvm = vcpu->kvm;
   1739	bool allowed;
   1740	int idx;
   1741	u32 i;
   1742
   1743	/* x2APIC MSRs do not support filtering. */
   1744	if (index >= 0x800 && index <= 0x8ff)
   1745		return true;
   1746
   1747	idx = srcu_read_lock(&kvm->srcu);
   1748
   1749	msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
   1750	if (!msr_filter) {
   1751		allowed = true;
   1752		goto out;
   1753	}
   1754
   1755	allowed = msr_filter->default_allow;
   1756	ranges = msr_filter->ranges;
   1757
   1758	for (i = 0; i < msr_filter->count; i++) {
   1759		u32 start = ranges[i].base;
   1760		u32 end = start + ranges[i].nmsrs;
   1761		u32 flags = ranges[i].flags;
   1762		unsigned long *bitmap = ranges[i].bitmap;
   1763
   1764		if ((index >= start) && (index < end) && (flags & type)) {
   1765			allowed = !!test_bit(index - start, bitmap);
   1766			break;
   1767		}
   1768	}
   1769
   1770out:
   1771	srcu_read_unlock(&kvm->srcu, idx);
   1772
   1773	return allowed;
   1774}
   1775EXPORT_SYMBOL_GPL(kvm_msr_allowed);
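        /*
         * For illustration: with a deny-by-default filter installed via
         * KVM_X86_SET_MSR_FILTER that contains a single KVM_MSR_FILTER_READ
         * range { .base = 0x10, .nmsrs = 1 } whose bitmap has bit 0 set, a
         * guest RDMSR of MSR 0x10 is allowed by the loop above, while every
         * other filtered access falls through to default_allow (false in
         * that configuration).
         */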
   1776
   1777/*
   1778 * Write @data into the MSR specified by @index.  Select MSR specific fault
   1779 * checks are bypassed if @host_initiated is %true.
   1780 * Returns 0 on success, non-0 otherwise.
   1781 * Assumes vcpu_load() was already called.
   1782 */
   1783static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
   1784			 bool host_initiated)
   1785{
   1786	struct msr_data msr;
   1787
   1788	switch (index) {
   1789	case MSR_FS_BASE:
   1790	case MSR_GS_BASE:
   1791	case MSR_KERNEL_GS_BASE:
   1792	case MSR_CSTAR:
   1793	case MSR_LSTAR:
   1794		if (is_noncanonical_address(data, vcpu))
   1795			return 1;
   1796		break;
   1797	case MSR_IA32_SYSENTER_EIP:
   1798	case MSR_IA32_SYSENTER_ESP:
   1799		/*
   1800		 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
   1801		 * non-canonical address is written on Intel but not on
   1802		 * AMD (which ignores the top 32-bits, because it does
   1803		 * not implement 64-bit SYSENTER).
   1804		 *
   1805		 * 64-bit code should hence be able to write a non-canonical
   1806		 * value on AMD.  Making the address canonical ensures that
   1807		 * vmentry does not fail on Intel after writing a non-canonical
   1808		 * value, and that something deterministic happens if the guest
   1809		 * invokes 64-bit SYSENTER.
   1810		 */
   1811		data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
   1812		break;
   1813	case MSR_TSC_AUX:
   1814		if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
   1815			return 1;
   1816
   1817		if (!host_initiated &&
   1818		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
   1819		    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
   1820			return 1;
   1821
   1822		/*
   1823		 * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
   1824		 * incomplete and conflicting architectural behavior.  Current
   1825		 * AMD CPUs completely ignore bits 63:32, i.e. they aren't
   1826		 * reserved and always read as zeros.  Enforce Intel's reserved
   1827		 * bits check if and only if the guest CPU is Intel, and clear
   1828		 * the bits in all other cases.  This ensures cross-vendor
   1829		 * migration will provide consistent behavior for the guest.
   1830		 */
   1831		if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
   1832			return 1;
   1833
   1834		data = (u32)data;
   1835		break;
   1836	}
   1837
   1838	msr.data = data;
   1839	msr.index = index;
   1840	msr.host_initiated = host_initiated;
   1841
   1842	return static_call(kvm_x86_set_msr)(vcpu, &msr);
   1843}
   1844
   1845static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
   1846				     u32 index, u64 data, bool host_initiated)
   1847{
   1848	int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
   1849
   1850	if (ret == KVM_MSR_RET_INVALID)
   1851		if (kvm_msr_ignored_check(index, data, true))
   1852			ret = 0;
   1853
   1854	return ret;
   1855}
   1856
   1857/*
   1858 * Read the MSR specified by @index into @data.  Select MSR specific fault
   1859 * checks are bypassed if @host_initiated is %true.
   1860 * Returns 0 on success, non-0 otherwise.
   1861 * Assumes vcpu_load() was already called.
   1862 */
   1863int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
   1864		  bool host_initiated)
   1865{
   1866	struct msr_data msr;
   1867	int ret;
   1868
   1869	switch (index) {
   1870	case MSR_TSC_AUX:
   1871		if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
   1872			return 1;
   1873
   1874		if (!host_initiated &&
   1875		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
   1876		    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
   1877			return 1;
   1878		break;
   1879	}
   1880
   1881	msr.index = index;
   1882	msr.host_initiated = host_initiated;
   1883
   1884	ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
   1885	if (!ret)
   1886		*data = msr.data;
   1887	return ret;
   1888}
   1889
   1890static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
   1891				     u32 index, u64 *data, bool host_initiated)
   1892{
   1893	int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
   1894
   1895	if (ret == KVM_MSR_RET_INVALID) {
   1896		/* Unconditionally clear *data for simplicity */
   1897		*data = 0;
   1898		if (kvm_msr_ignored_check(index, 0, false))
   1899			ret = 0;
   1900	}
   1901
   1902	return ret;
   1903}
   1904
   1905static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
   1906{
   1907	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
   1908		return KVM_MSR_RET_FILTERED;
   1909	return kvm_get_msr_ignored_check(vcpu, index, data, false);
   1910}
   1911
   1912static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
   1913{
   1914	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
   1915		return KVM_MSR_RET_FILTERED;
   1916	return kvm_set_msr_ignored_check(vcpu, index, data, false);
   1917}
   1918
   1919int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
   1920{
   1921	return kvm_get_msr_ignored_check(vcpu, index, data, false);
   1922}
   1923EXPORT_SYMBOL_GPL(kvm_get_msr);
   1924
   1925int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
   1926{
   1927	return kvm_set_msr_ignored_check(vcpu, index, data, false);
   1928}
   1929EXPORT_SYMBOL_GPL(kvm_set_msr);
   1930
   1931static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
   1932{
   1933	if (!vcpu->run->msr.error) {
   1934		kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
   1935		kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
   1936	}
   1937}
   1938
   1939static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
   1940{
   1941	return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
   1942}
   1943
   1944static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
   1945{
   1946	complete_userspace_rdmsr(vcpu);
   1947	return complete_emulated_msr_access(vcpu);
   1948}
   1949
   1950static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
   1951{
   1952	return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
   1953}
   1954
   1955static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
   1956{
   1957	complete_userspace_rdmsr(vcpu);
   1958	return complete_fast_msr_access(vcpu);
   1959}
   1960
   1961static u64 kvm_msr_reason(int r)
   1962{
   1963	switch (r) {
   1964	case KVM_MSR_RET_INVALID:
   1965		return KVM_MSR_EXIT_REASON_UNKNOWN;
   1966	case KVM_MSR_RET_FILTERED:
   1967		return KVM_MSR_EXIT_REASON_FILTER;
   1968	default:
   1969		return KVM_MSR_EXIT_REASON_INVAL;
   1970	}
   1971}
   1972
   1973static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
   1974			      u32 exit_reason, u64 data,
   1975			      int (*completion)(struct kvm_vcpu *vcpu),
   1976			      int r)
   1977{
   1978	u64 msr_reason = kvm_msr_reason(r);
   1979
   1980	/* Check if the user wanted to know about this MSR fault */
   1981	if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
   1982		return 0;
   1983
   1984	vcpu->run->exit_reason = exit_reason;
   1985	vcpu->run->msr.error = 0;
   1986	memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
   1987	vcpu->run->msr.reason = msr_reason;
   1988	vcpu->run->msr.index = index;
   1989	vcpu->run->msr.data = data;
   1990	vcpu->arch.complete_userspace_io = completion;
   1991
   1992	return 1;
   1993}
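        /*
         * In other words, if userspace opted in via KVM_CAP_X86_USER_SPACE_MSR
         * for this reason, the vCPU exits with run->exit_reason set to
         * KVM_EXIT_X86_RDMSR or KVM_EXIT_X86_WRMSR; userspace fills in
         * run->msr.error (and run->msr.data for reads) and re-enters the vCPU,
         * at which point the callback stored in
         * vcpu->arch.complete_userspace_io finishes the instruction.
         */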
   1994
   1995int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
   1996{
   1997	u32 ecx = kvm_rcx_read(vcpu);
   1998	u64 data;
   1999	int r;
   2000
   2001	r = kvm_get_msr_with_filter(vcpu, ecx, &data);
   2002
   2003	if (!r) {
   2004		trace_kvm_msr_read(ecx, data);
   2005
   2006		kvm_rax_write(vcpu, data & -1u);
   2007		kvm_rdx_write(vcpu, (data >> 32) & -1u);
   2008	} else {
   2009		/* MSR read failed? See if we should ask user space */
   2010		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
   2011				       complete_fast_rdmsr, r))
   2012			return 0;
   2013		trace_kvm_msr_read_ex(ecx);
   2014	}
   2015
   2016	return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
   2017}
   2018EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
   2019
   2020int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
   2021{
   2022	u32 ecx = kvm_rcx_read(vcpu);
   2023	u64 data = kvm_read_edx_eax(vcpu);
   2024	int r;
   2025
   2026	r = kvm_set_msr_with_filter(vcpu, ecx, data);
   2027
   2028	if (!r) {
   2029		trace_kvm_msr_write(ecx, data);
   2030	} else {
   2031		/* MSR write failed? See if we should ask user space */
   2032		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
   2033				       complete_fast_msr_access, r))
   2034			return 0;
   2035		/* Signal all other negative errors to userspace */
   2036		if (r < 0)
   2037			return r;
   2038		trace_kvm_msr_write_ex(ecx, data);
   2039	}
   2040
   2041	return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
   2042}
   2043EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
   2044
   2045int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
   2046{
   2047	return kvm_skip_emulated_instruction(vcpu);
   2048}
   2049EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
   2050
   2051int kvm_emulate_invd(struct kvm_vcpu *vcpu)
   2052{
   2053	/* Treat an INVD instruction as a NOP and just skip it. */
   2054	return kvm_emulate_as_nop(vcpu);
   2055}
   2056EXPORT_SYMBOL_GPL(kvm_emulate_invd);
   2057
   2058int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
   2059{
   2060	pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
   2061	return kvm_emulate_as_nop(vcpu);
   2062}
   2063EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
   2064
   2065int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
   2066{
   2067	kvm_queue_exception(vcpu, UD_VECTOR);
   2068	return 1;
   2069}
   2070EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
   2071
   2072int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
   2073{
   2074	pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
   2075	return kvm_emulate_as_nop(vcpu);
   2076}
   2077EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
   2078
   2079static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
   2080{
   2081	xfer_to_guest_mode_prepare();
   2082	return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
   2083		xfer_to_guest_mode_work_pending();
   2084}
   2085
   2086/*
    2087 * The fast path for frequent and performance-sensitive wrmsr emulation,
    2088 * i.e. the sending of IPIs.  Sending the IPI early in the VM-Exit flow
    2089 * reduces the latency of virtual IPIs by avoiding the expensive bits of
    2090 * transitioning from guest to host, e.g. reacquiring KVM's SRCU lock,
    2091 * unlike the other cases, which must be handled with host interrupts enabled.
   2092 */
   2093static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
   2094{
   2095	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
   2096		return 1;
   2097
   2098	if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
   2099	    ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
   2100	    ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
   2101	    ((u32)(data >> 32) != X2APIC_BROADCAST))
   2102		return kvm_x2apic_icr_write(vcpu->arch.apic, data);
   2103
   2104	return 1;
   2105}
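        /*
         * Example of an ICR value that takes this fast path: a fixed-vector,
         * physical-destination, no-shorthand IPI, i.e. an x2APIC ICR write of
         * ((u64)dest_apic_id << 32) | vector.  APIC_DM_FIXED, APIC_DEST_PHYSICAL
         * and APIC_DEST_NOSHORT are all-zero encodings, so every check above
         * passes as long as dest_apic_id is not the broadcast ID.
         */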
   2106
   2107static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
   2108{
   2109	if (!kvm_can_use_hv_timer(vcpu))
   2110		return 1;
   2111
   2112	kvm_set_lapic_tscdeadline_msr(vcpu, data);
   2113	return 0;
   2114}
   2115
   2116fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
   2117{
   2118	u32 msr = kvm_rcx_read(vcpu);
   2119	u64 data;
   2120	fastpath_t ret = EXIT_FASTPATH_NONE;
   2121
   2122	switch (msr) {
   2123	case APIC_BASE_MSR + (APIC_ICR >> 4):
   2124		data = kvm_read_edx_eax(vcpu);
   2125		if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
   2126			kvm_skip_emulated_instruction(vcpu);
   2127			ret = EXIT_FASTPATH_EXIT_HANDLED;
   2128		}
   2129		break;
   2130	case MSR_IA32_TSC_DEADLINE:
   2131		data = kvm_read_edx_eax(vcpu);
   2132		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
   2133			kvm_skip_emulated_instruction(vcpu);
   2134			ret = EXIT_FASTPATH_REENTER_GUEST;
   2135		}
   2136		break;
   2137	default:
   2138		break;
   2139	}
   2140
   2141	if (ret != EXIT_FASTPATH_NONE)
   2142		trace_kvm_msr_write(msr, data);
   2143
   2144	return ret;
   2145}
   2146EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
   2147
   2148/*
   2149 * Adapt set_msr() to msr_io()'s calling convention
   2150 */
   2151static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
   2152{
   2153	return kvm_get_msr_ignored_check(vcpu, index, data, true);
   2154}
   2155
   2156static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
   2157{
   2158	return kvm_set_msr_ignored_check(vcpu, index, *data, true);
   2159}
   2160
   2161#ifdef CONFIG_X86_64
   2162struct pvclock_clock {
   2163	int vclock_mode;
   2164	u64 cycle_last;
   2165	u64 mask;
   2166	u32 mult;
   2167	u32 shift;
   2168	u64 base_cycles;
   2169	u64 offset;
   2170};
   2171
   2172struct pvclock_gtod_data {
   2173	seqcount_t	seq;
   2174
   2175	struct pvclock_clock clock; /* extract of a clocksource struct */
   2176	struct pvclock_clock raw_clock; /* extract of a clocksource struct */
   2177
   2178	ktime_t		offs_boot;
   2179	u64		wall_time_sec;
   2180};
   2181
   2182static struct pvclock_gtod_data pvclock_gtod_data;
   2183
   2184static void update_pvclock_gtod(struct timekeeper *tk)
   2185{
   2186	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
   2187
   2188	write_seqcount_begin(&vdata->seq);
   2189
   2190	/* copy pvclock gtod data */
   2191	vdata->clock.vclock_mode	= tk->tkr_mono.clock->vdso_clock_mode;
   2192	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
   2193	vdata->clock.mask		= tk->tkr_mono.mask;
   2194	vdata->clock.mult		= tk->tkr_mono.mult;
   2195	vdata->clock.shift		= tk->tkr_mono.shift;
   2196	vdata->clock.base_cycles	= tk->tkr_mono.xtime_nsec;
   2197	vdata->clock.offset		= tk->tkr_mono.base;
   2198
   2199	vdata->raw_clock.vclock_mode	= tk->tkr_raw.clock->vdso_clock_mode;
   2200	vdata->raw_clock.cycle_last	= tk->tkr_raw.cycle_last;
   2201	vdata->raw_clock.mask		= tk->tkr_raw.mask;
   2202	vdata->raw_clock.mult		= tk->tkr_raw.mult;
   2203	vdata->raw_clock.shift		= tk->tkr_raw.shift;
   2204	vdata->raw_clock.base_cycles	= tk->tkr_raw.xtime_nsec;
   2205	vdata->raw_clock.offset		= tk->tkr_raw.base;
   2206
   2207	vdata->wall_time_sec            = tk->xtime_sec;
   2208
   2209	vdata->offs_boot		= tk->offs_boot;
   2210
   2211	write_seqcount_end(&vdata->seq);
   2212}
   2213
   2214static s64 get_kvmclock_base_ns(void)
   2215{
   2216	/* Count up from boot time, but with the frequency of the raw clock.  */
   2217	return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
   2218}
   2219#else
   2220static s64 get_kvmclock_base_ns(void)
   2221{
   2222	/* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
   2223	return ktime_get_boottime_ns();
   2224}
   2225#endif
   2226
   2227static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
   2228{
   2229	int version;
   2230	int r;
   2231	struct pvclock_wall_clock wc;
   2232	u32 wc_sec_hi;
   2233	u64 wall_nsec;
   2234
   2235	if (!wall_clock)
   2236		return;
   2237
   2238	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
   2239	if (r)
   2240		return;
   2241
   2242	if (version & 1)
   2243		++version;  /* first time write, random junk */
   2244
   2245	++version;
   2246
   2247	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
   2248		return;
   2249
   2250	/*
   2251	 * The guest calculates current wall clock time by adding
   2252	 * system time (updated by kvm_guest_time_update below) to the
   2253	 * wall clock specified here.  We do the reverse here.
   2254	 */
   2255	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
   2256
   2257	wc.nsec = do_div(wall_nsec, 1000000000);
   2258	wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
   2259	wc.version = version;
   2260
   2261	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
   2262
   2263	if (sec_hi_ofs) {
   2264		wc_sec_hi = wall_nsec >> 32;
   2265		kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
   2266				&wc_sec_hi, sizeof(wc_sec_hi));
   2267	}
   2268
   2269	version++;
   2270	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
   2271}
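        /*
         * Sketch of the guest side of this handshake: the guest reconstructs
         * wall clock time roughly as
         *	wall_ns = wc.sec * NSEC_PER_SEC + wc.nsec + kvmclock_ns
         * where kvmclock_ns is read from the per-vCPU pvclock area that
         * kvm_guest_time_update() below keeps up to date.
         */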
   2272
   2273static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
   2274				  bool old_msr, bool host_initiated)
   2275{
   2276	struct kvm_arch *ka = &vcpu->kvm->arch;
   2277
   2278	if (vcpu->vcpu_id == 0 && !host_initiated) {
   2279		if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
   2280			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
   2281
   2282		ka->boot_vcpu_runs_old_kvmclock = old_msr;
   2283	}
   2284
   2285	vcpu->arch.time = system_time;
   2286	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
   2287
    2288	/* Map the pvclock page only if the enable bit (bit 0) is set. */
   2289	if (system_time & 1) {
   2290		kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
   2291					  KVM_HOST_USES_PFN, system_time & ~1ULL,
   2292					  sizeof(struct pvclock_vcpu_time_info));
   2293	} else {
   2294		kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
   2295	}
   2296
   2297	return;
   2298}
   2299
   2300static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
   2301{
   2302	do_shl32_div32(dividend, divisor);
   2303	return dividend;
   2304}
   2305
   2306static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
   2307			       s8 *pshift, u32 *pmultiplier)
   2308{
   2309	uint64_t scaled64;
   2310	int32_t  shift = 0;
   2311	uint64_t tps64;
   2312	uint32_t tps32;
   2313
   2314	tps64 = base_hz;
   2315	scaled64 = scaled_hz;
   2316	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
   2317		tps64 >>= 1;
   2318		shift--;
   2319	}
   2320
   2321	tps32 = (uint32_t)tps64;
   2322	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
   2323		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
   2324			scaled64 >>= 1;
   2325		else
   2326			tps32 <<= 1;
   2327		shift++;
   2328	}
   2329
   2330	*pshift = shift;
   2331	*pmultiplier = div_frac(scaled64, tps32);
   2332}
   2333
   2334#ifdef CONFIG_X86_64
   2335static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
   2336#endif
   2337
   2338static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
   2339static unsigned long max_tsc_khz;
   2340
   2341static u32 adjust_tsc_khz(u32 khz, s32 ppm)
   2342{
   2343	u64 v = (u64)khz * (1000000 + ppm);
   2344	do_div(v, 1000000);
   2345	return v;
   2346}
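        /*
         * E.g. adjust_tsc_khz(3000000, 250) = 3000000 * 1000250 / 1000000
         * = 3000750 kHz, i.e. a 3 GHz rate nudged up by 250 ppm.
         * kvm_set_tsc_khz() below uses this to build the [thresh_lo, thresh_hi]
         * tolerance window around the host TSC rate.
         */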
   2347
   2348static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
   2349
   2350static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
   2351{
   2352	u64 ratio;
   2353
   2354	/* Guest TSC same frequency as host TSC? */
   2355	if (!scale) {
   2356		kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
   2357		return 0;
   2358	}
   2359
   2360	/* TSC scaling supported? */
   2361	if (!kvm_has_tsc_control) {
   2362		if (user_tsc_khz > tsc_khz) {
   2363			vcpu->arch.tsc_catchup = 1;
   2364			vcpu->arch.tsc_always_catchup = 1;
   2365			return 0;
   2366		} else {
   2367			pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
   2368			return -1;
   2369		}
   2370	}
   2371
   2372	/* TSC scaling required  - calculate ratio */
   2373	ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
   2374				user_tsc_khz, tsc_khz);
   2375
   2376	if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
   2377		pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
   2378			            user_tsc_khz);
   2379		return -1;
   2380	}
   2381
   2382	kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
   2383	return 0;
   2384}
   2385
   2386static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
   2387{
   2388	u32 thresh_lo, thresh_hi;
   2389	int use_scaling = 0;
   2390
   2391	/* tsc_khz can be zero if TSC calibration fails */
   2392	if (user_tsc_khz == 0) {
   2393		/* set tsc_scaling_ratio to a safe value */
   2394		kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
   2395		return -1;
   2396	}
   2397
    2398	/* Compute a scale to convert nanoseconds to TSC cycles */
   2399	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
   2400			   &vcpu->arch.virtual_tsc_shift,
   2401			   &vcpu->arch.virtual_tsc_mult);
   2402	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
   2403
   2404	/*
   2405	 * Compute the variation in TSC rate which is acceptable
   2406	 * within the range of tolerance and decide if the
    2407	 * rate being applied is within those bounds of the hardware
   2408	 * rate.  If so, no scaling or compensation need be done.
   2409	 */
   2410	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
   2411	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
   2412	if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
   2413		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
   2414		use_scaling = 1;
   2415	}
   2416	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
   2417}
   2418
   2419static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
   2420{
   2421	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
   2422				      vcpu->arch.virtual_tsc_mult,
   2423				      vcpu->arch.virtual_tsc_shift);
   2424	tsc += vcpu->arch.this_tsc_write;
   2425	return tsc;
   2426}
   2427
   2428#ifdef CONFIG_X86_64
   2429static inline int gtod_is_based_on_tsc(int mode)
   2430{
   2431	return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
   2432}
   2433#endif
   2434
   2435static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
   2436{
   2437#ifdef CONFIG_X86_64
   2438	bool vcpus_matched;
   2439	struct kvm_arch *ka = &vcpu->kvm->arch;
   2440	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
   2441
   2442	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
   2443			 atomic_read(&vcpu->kvm->online_vcpus));
   2444
   2445	/*
    2446	 * Once the masterclock is enabled, always perform the request in
    2447	 * order to update it.
    2448	 *
    2449	 * In order to enable the masterclock, the host clocksource must be TSC
    2450	 * and the vcpus need to have matched TSCs.  When that happens, perform
    2451	 * the request to enable the masterclock.
   2452	 */
   2453	if (ka->use_master_clock ||
   2454	    (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
   2455		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
   2456
   2457	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
   2458			    atomic_read(&vcpu->kvm->online_vcpus),
   2459		            ka->use_master_clock, gtod->clock.vclock_mode);
   2460#endif
   2461}
   2462
   2463/*
   2464 * Multiply tsc by a fixed point number represented by ratio.
   2465 *
   2466 * The most significant 64-N bits (mult) of ratio represent the
   2467 * integral part of the fixed point number; the remaining N bits
   2468 * (frac) represent the fractional part, ie. ratio represents a fixed
   2469 * point number (mult + frac * 2^(-N)).
   2470 *
   2471 * N equals to kvm_tsc_scaling_ratio_frac_bits.
   2472 */
   2473static inline u64 __scale_tsc(u64 ratio, u64 tsc)
   2474{
   2475	return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
   2476}
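        /*
         * Example: with N = 48 fractional bits, a ratio of 0x0001800000000000
         * encodes 1.5 (1 + 2^47 / 2^48), so __scale_tsc() returns tsc * 3 / 2
         * and the guest TSC advances 1.5 times as fast as the host TSC.  The
         * number of fractional bits is vendor specific and is reported by the
         * backend via kvm_tsc_scaling_ratio_frac_bits.
         */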
   2477
   2478u64 kvm_scale_tsc(u64 tsc, u64 ratio)
   2479{
   2480	u64 _tsc = tsc;
   2481
   2482	if (ratio != kvm_default_tsc_scaling_ratio)
   2483		_tsc = __scale_tsc(ratio, tsc);
   2484
   2485	return _tsc;
   2486}
   2487EXPORT_SYMBOL_GPL(kvm_scale_tsc);
   2488
   2489static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
   2490{
   2491	u64 tsc;
   2492
   2493	tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
   2494
   2495	return target_tsc - tsc;
   2496}
   2497
   2498u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
   2499{
   2500	return vcpu->arch.l1_tsc_offset +
   2501		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
   2502}
   2503EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
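        /*
         * Note how the two helpers above relate: once the offset returned by
         * kvm_compute_l1_tsc_offset(vcpu, target_tsc) has been installed via
         * kvm_vcpu_write_tsc_offset(), kvm_read_l1_tsc(vcpu, rdtsc()) returns
         * approximately target_tsc, since both apply L1's scaling ratio to the
         * host TSC before adding or subtracting the offset.
         */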
   2504
   2505u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
   2506{
   2507	u64 nested_offset;
   2508
   2509	if (l2_multiplier == kvm_default_tsc_scaling_ratio)
   2510		nested_offset = l1_offset;
   2511	else
   2512		nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
   2513						kvm_tsc_scaling_ratio_frac_bits);
   2514
   2515	nested_offset += l2_offset;
   2516	return nested_offset;
   2517}
   2518EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
   2519
   2520u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
   2521{
   2522	if (l2_multiplier != kvm_default_tsc_scaling_ratio)
   2523		return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
   2524				       kvm_tsc_scaling_ratio_frac_bits);
   2525
   2526	return l1_multiplier;
   2527}
   2528EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
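        /*
         * The algebra behind the two helpers above, with N fractional bits:
         * L1 sees host_tsc * r1 / 2^N + o1 and L2 sees l1_tsc * r2 / 2^N + o2,
         * which expands to
         *	host_tsc * (r1 * r2 / 2^N) / 2^N + (o1 * r2 / 2^N + o2)
         * i.e. the combined multiplier is r1 * r2 >> N and the combined
         * offset is (o1 * r2 >> N) + o2, matching the computations above.
         */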
   2529
   2530static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
   2531{
   2532	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
   2533				   vcpu->arch.l1_tsc_offset,
   2534				   l1_offset);
   2535
   2536	vcpu->arch.l1_tsc_offset = l1_offset;
   2537
   2538	/*
   2539	 * If we are here because L1 chose not to trap WRMSR to TSC then
   2540	 * according to the spec this should set L1's TSC (as opposed to
   2541	 * setting L1's offset for L2).
   2542	 */
   2543	if (is_guest_mode(vcpu))
   2544		vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
   2545			l1_offset,
   2546			static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
   2547			static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
   2548	else
   2549		vcpu->arch.tsc_offset = l1_offset;
   2550
   2551	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
   2552}
   2553
   2554static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
   2555{
   2556	vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
   2557
   2558	/* Userspace is changing the multiplier while L2 is active */
   2559	if (is_guest_mode(vcpu))
   2560		vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
   2561			l1_multiplier,
   2562			static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
   2563	else
   2564		vcpu->arch.tsc_scaling_ratio = l1_multiplier;
   2565
   2566	if (kvm_has_tsc_control)
   2567		static_call(kvm_x86_write_tsc_multiplier)(
   2568			vcpu, vcpu->arch.tsc_scaling_ratio);
   2569}
   2570
   2571static inline bool kvm_check_tsc_unstable(void)
   2572{
   2573#ifdef CONFIG_X86_64
   2574	/*
    2575	 * TSC is marked unstable when we're running on Hyper-V,
    2576	 * but the 'TSC page' clocksource is good.
   2577	 */
   2578	if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
   2579		return false;
   2580#endif
   2581	return check_tsc_unstable();
   2582}
   2583
   2584/*
   2585 * Infers attempts to synchronize the guest's tsc from host writes. Sets the
   2586 * offset for the vcpu and tracks the TSC matching generation that the vcpu
   2587 * participates in.
   2588 */
   2589static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
   2590				  u64 ns, bool matched)
   2591{
   2592	struct kvm *kvm = vcpu->kvm;
   2593
   2594	lockdep_assert_held(&kvm->arch.tsc_write_lock);
   2595
   2596	/*
    2597	 * We also track the most recent recorded KHZ, write and time to
   2598	 * allow the matching interval to be extended at each write.
   2599	 */
   2600	kvm->arch.last_tsc_nsec = ns;
   2601	kvm->arch.last_tsc_write = tsc;
   2602	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
   2603	kvm->arch.last_tsc_offset = offset;
   2604
   2605	vcpu->arch.last_guest_tsc = tsc;
   2606
   2607	kvm_vcpu_write_tsc_offset(vcpu, offset);
   2608
   2609	if (!matched) {
   2610		/*
   2611		 * We split periods of matched TSC writes into generations.
   2612		 * For each generation, we track the original measured
   2613		 * nanosecond time, offset, and write, so if TSCs are in
   2614		 * sync, we can match exact offset, and if not, we can match
   2615		 * exact software computation in compute_guest_tsc()
   2616		 *
   2617		 * These values are tracked in kvm->arch.cur_xxx variables.
   2618		 */
   2619		kvm->arch.cur_tsc_generation++;
   2620		kvm->arch.cur_tsc_nsec = ns;
   2621		kvm->arch.cur_tsc_write = tsc;
   2622		kvm->arch.cur_tsc_offset = offset;
   2623		kvm->arch.nr_vcpus_matched_tsc = 0;
   2624	} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
   2625		kvm->arch.nr_vcpus_matched_tsc++;
   2626	}
   2627
   2628	/* Keep track of which generation this VCPU has synchronized to */
   2629	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
   2630	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
   2631	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
   2632
   2633	kvm_track_tsc_matching(vcpu);
   2634}
   2635
   2636static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
   2637{
   2638	struct kvm *kvm = vcpu->kvm;
   2639	u64 offset, ns, elapsed;
   2640	unsigned long flags;
   2641	bool matched = false;
   2642	bool synchronizing = false;
   2643
   2644	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
   2645	offset = kvm_compute_l1_tsc_offset(vcpu, data);
   2646	ns = get_kvmclock_base_ns();
   2647	elapsed = ns - kvm->arch.last_tsc_nsec;
   2648
   2649	if (vcpu->arch.virtual_tsc_khz) {
   2650		if (data == 0) {
   2651			/*
   2652			 * detection of vcpu initialization -- need to sync
   2653			 * with other vCPUs. This particularly helps to keep
   2654			 * kvm_clock stable after CPU hotplug
   2655			 */
   2656			synchronizing = true;
   2657		} else {
   2658			u64 tsc_exp = kvm->arch.last_tsc_write +
   2659						nsec_to_cycles(vcpu, elapsed);
   2660			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
   2661			/*
   2662			 * Special case: TSC write with a small delta (1 second)
   2663			 * of virtual cycle time against real time is
   2664			 * interpreted as an attempt to synchronize the CPU.
   2665			 */
   2666			synchronizing = data < tsc_exp + tsc_hz &&
   2667					data + tsc_hz > tsc_exp;
   2668		}
   2669	}
   2670
   2671	/*
   2672	 * For a reliable TSC, we can match TSC offsets, and for an unstable
   2673	 * TSC, we add elapsed time in this computation.  We could let the
   2674	 * compensation code attempt to catch up if we fall behind, but
   2675	 * it's better to try to match offsets from the beginning.
    2676	 */
   2677	if (synchronizing &&
   2678	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
   2679		if (!kvm_check_tsc_unstable()) {
   2680			offset = kvm->arch.cur_tsc_offset;
   2681		} else {
   2682			u64 delta = nsec_to_cycles(vcpu, elapsed);
   2683			data += delta;
   2684			offset = kvm_compute_l1_tsc_offset(vcpu, data);
   2685		}
   2686		matched = true;
   2687	}
   2688
   2689	__kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
   2690	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
   2691}
   2692
   2693static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
   2694					   s64 adjustment)
   2695{
   2696	u64 tsc_offset = vcpu->arch.l1_tsc_offset;
   2697	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
   2698}
   2699
   2700static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
   2701{
   2702	if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
   2703		WARN_ON(adjustment < 0);
   2704	adjustment = kvm_scale_tsc((u64) adjustment,
   2705				   vcpu->arch.l1_tsc_scaling_ratio);
   2706	adjust_tsc_offset_guest(vcpu, adjustment);
   2707}
   2708
   2709#ifdef CONFIG_X86_64
   2710
   2711static u64 read_tsc(void)
   2712{
   2713	u64 ret = (u64)rdtsc_ordered();
   2714	u64 last = pvclock_gtod_data.clock.cycle_last;
   2715
   2716	if (likely(ret >= last))
   2717		return ret;
   2718
   2719	/*
   2720	 * GCC likes to generate cmov here, but this branch is extremely
   2721	 * predictable (it's just a function of time and the likely is
   2722	 * very likely) and there's a data dependence, so force GCC
   2723	 * to generate a branch instead.  I don't barrier() because
   2724	 * we don't actually need a barrier, and if this function
   2725	 * ever gets inlined it will generate worse code.
   2726	 */
   2727	asm volatile ("");
   2728	return last;
   2729}
   2730
   2731static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
   2732			  int *mode)
   2733{
   2734	long v;
   2735	u64 tsc_pg_val;
   2736
   2737	switch (clock->vclock_mode) {
   2738	case VDSO_CLOCKMODE_HVCLOCK:
   2739		tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
   2740						  tsc_timestamp);
   2741		if (tsc_pg_val != U64_MAX) {
   2742			/* TSC page valid */
   2743			*mode = VDSO_CLOCKMODE_HVCLOCK;
   2744			v = (tsc_pg_val - clock->cycle_last) &
   2745				clock->mask;
   2746		} else {
   2747			/* TSC page invalid */
   2748			*mode = VDSO_CLOCKMODE_NONE;
   2749		}
   2750		break;
   2751	case VDSO_CLOCKMODE_TSC:
   2752		*mode = VDSO_CLOCKMODE_TSC;
   2753		*tsc_timestamp = read_tsc();
   2754		v = (*tsc_timestamp - clock->cycle_last) &
   2755			clock->mask;
   2756		break;
   2757	default:
   2758		*mode = VDSO_CLOCKMODE_NONE;
   2759	}
   2760
   2761	if (*mode == VDSO_CLOCKMODE_NONE)
   2762		*tsc_timestamp = v = 0;
   2763
   2764	return v * clock->mult;
   2765}
   2766
   2767static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
   2768{
   2769	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
   2770	unsigned long seq;
   2771	int mode;
   2772	u64 ns;
   2773
   2774	do {
   2775		seq = read_seqcount_begin(&gtod->seq);
   2776		ns = gtod->raw_clock.base_cycles;
   2777		ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
   2778		ns >>= gtod->raw_clock.shift;
   2779		ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
   2780	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
   2781	*t = ns;
   2782
   2783	return mode;
   2784}
   2785
   2786static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
   2787{
   2788	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
   2789	unsigned long seq;
   2790	int mode;
   2791	u64 ns;
   2792
   2793	do {
   2794		seq = read_seqcount_begin(&gtod->seq);
   2795		ts->tv_sec = gtod->wall_time_sec;
   2796		ns = gtod->clock.base_cycles;
   2797		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
   2798		ns >>= gtod->clock.shift;
   2799	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
   2800
   2801	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
   2802	ts->tv_nsec = ns;
   2803
   2804	return mode;
   2805}
   2806
   2807/* returns true if host is using TSC based clocksource */
   2808static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
   2809{
   2810	/* checked again under seqlock below */
   2811	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
   2812		return false;
   2813
   2814	return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
   2815						      tsc_timestamp));
   2816}
   2817
   2818/* returns true if host is using TSC based clocksource */
   2819static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
   2820					   u64 *tsc_timestamp)
   2821{
   2822	/* checked again under seqlock below */
   2823	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
   2824		return false;
   2825
   2826	return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
   2827}
   2828#endif
   2829
   2830/*
   2831 *
    2832 * Assuming a stable TSC across physical CPUs, and a stable TSC
   2833 * across virtual CPUs, the following condition is possible.
   2834 * Each numbered line represents an event visible to both
   2835 * CPUs at the next numbered event.
   2836 *
   2837 * "timespecX" represents host monotonic time. "tscX" represents
   2838 * RDTSC value.
   2839 *
   2840 * 		VCPU0 on CPU0		|	VCPU1 on CPU1
   2841 *
   2842 * 1.  read timespec0,tsc0
   2843 * 2.					| timespec1 = timespec0 + N
   2844 * 					| tsc1 = tsc0 + M
   2845 * 3. transition to guest		| transition to guest
   2846 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
   2847 * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
   2848 * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
   2849 *
   2850 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
   2851 *
   2852 * 	- ret0 < ret1
   2853 *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
   2854 *		...
   2855 *	- 0 < N - M => M < N
   2856 *
   2857 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
   2858 * always the case (the difference between two distinct xtime instances
    2859 * might be smaller than the difference between corresponding TSC reads,
    2860 * when updating guest vcpus' pvclock areas).
   2861 *
   2862 * To avoid that problem, do not allow visibility of distinct
   2863 * system_timestamp/tsc_timestamp values simultaneously: use a master
   2864 * copy of host monotonic time values. Update that master copy
   2865 * in lockstep.
   2866 *
   2867 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
   2868 *
   2869 */
   2870
   2871static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
   2872{
   2873#ifdef CONFIG_X86_64
   2874	struct kvm_arch *ka = &kvm->arch;
   2875	int vclock_mode;
   2876	bool host_tsc_clocksource, vcpus_matched;
   2877
   2878	lockdep_assert_held(&kvm->arch.tsc_write_lock);
   2879	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
   2880			atomic_read(&kvm->online_vcpus));
   2881
   2882	/*
    2883	 * If the host uses the TSC clocksource, then pass the TSC through
    2884	 * to the guest as stable.
   2885	 */
   2886	host_tsc_clocksource = kvm_get_time_and_clockread(
   2887					&ka->master_kernel_ns,
   2888					&ka->master_cycle_now);
   2889
   2890	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
   2891				&& !ka->backwards_tsc_observed
   2892				&& !ka->boot_vcpu_runs_old_kvmclock;
   2893
   2894	if (ka->use_master_clock)
   2895		atomic_set(&kvm_guest_has_master_clock, 1);
   2896
   2897	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
   2898	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
   2899					vcpus_matched);
   2900#endif
   2901}
   2902
   2903static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
   2904{
   2905	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
   2906}
   2907
   2908static void __kvm_start_pvclock_update(struct kvm *kvm)
   2909{
   2910	raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
   2911	write_seqcount_begin(&kvm->arch.pvclock_sc);
   2912}
   2913
   2914static void kvm_start_pvclock_update(struct kvm *kvm)
   2915{
   2916	kvm_make_mclock_inprogress_request(kvm);
   2917
   2918	/* no guest entries from this point */
   2919	__kvm_start_pvclock_update(kvm);
   2920}
   2921
   2922static void kvm_end_pvclock_update(struct kvm *kvm)
   2923{
   2924	struct kvm_arch *ka = &kvm->arch;
   2925	struct kvm_vcpu *vcpu;
   2926	unsigned long i;
   2927
   2928	write_seqcount_end(&ka->pvclock_sc);
   2929	raw_spin_unlock_irq(&ka->tsc_write_lock);
   2930	kvm_for_each_vcpu(i, vcpu, kvm)
   2931		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
   2932
   2933	/* guest entries allowed */
   2934	kvm_for_each_vcpu(i, vcpu, kvm)
   2935		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
   2936}
   2937
   2938static void kvm_update_masterclock(struct kvm *kvm)
   2939{
   2940	kvm_hv_request_tsc_page_update(kvm);
   2941	kvm_start_pvclock_update(kvm);
   2942	pvclock_update_vm_gtod_copy(kvm);
   2943	kvm_end_pvclock_update(kvm);
   2944}
   2945
   2946/* Called within read_seqcount_begin/retry for kvm->pvclock_sc.  */
   2947static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
   2948{
   2949	struct kvm_arch *ka = &kvm->arch;
   2950	struct pvclock_vcpu_time_info hv_clock;
   2951
   2952	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
   2953	get_cpu();
   2954
   2955	data->flags = 0;
   2956	if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
   2957#ifdef CONFIG_X86_64
   2958		struct timespec64 ts;
   2959
   2960		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
   2961			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
   2962			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
   2963		} else
   2964#endif
   2965		data->host_tsc = rdtsc();
   2966
   2967		data->flags |= KVM_CLOCK_TSC_STABLE;
   2968		hv_clock.tsc_timestamp = ka->master_cycle_now;
   2969		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
   2970		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
   2971				   &hv_clock.tsc_shift,
   2972				   &hv_clock.tsc_to_system_mul);
   2973		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
   2974	} else {
   2975		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
   2976	}
   2977
   2978	put_cpu();
   2979}
   2980
   2981static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
   2982{
   2983	struct kvm_arch *ka = &kvm->arch;
   2984	unsigned seq;
   2985
   2986	do {
   2987		seq = read_seqcount_begin(&ka->pvclock_sc);
   2988		__get_kvmclock(kvm, data);
   2989	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
   2990}
   2991
   2992u64 get_kvmclock_ns(struct kvm *kvm)
   2993{
   2994	struct kvm_clock_data data;
   2995
   2996	get_kvmclock(kvm, &data);
   2997	return data.clock;
   2998}
   2999
   3000static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
   3001				    struct gfn_to_pfn_cache *gpc,
   3002				    unsigned int offset)
   3003{
   3004	struct kvm_vcpu_arch *vcpu = &v->arch;
   3005	struct pvclock_vcpu_time_info *guest_hv_clock;
   3006	unsigned long flags;
   3007
   3008	read_lock_irqsave(&gpc->lock, flags);
   3009	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
   3010					   offset + sizeof(*guest_hv_clock))) {
   3011		read_unlock_irqrestore(&gpc->lock, flags);
   3012
   3013		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
   3014						 offset + sizeof(*guest_hv_clock)))
   3015			return;
   3016
   3017		read_lock_irqsave(&gpc->lock, flags);
   3018	}
   3019
   3020	guest_hv_clock = (void *)(gpc->khva + offset);
   3021
   3022	/*
   3023	 * This VCPU is paused, but it's legal for a guest to read another
   3024	 * VCPU's kvmclock, so we really have to follow the specification where
   3025	 * it says that version is odd if data is being modified, and even after
   3026	 * it is consistent.
   3027	 */
   3028
   3029	guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
   3030	smp_wmb();
   3031
   3032	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
   3033	vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
   3034
   3035	if (vcpu->pvclock_set_guest_stopped_request) {
   3036		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
   3037		vcpu->pvclock_set_guest_stopped_request = false;
   3038	}
   3039
   3040	memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
   3041	smp_wmb();
   3042
   3043	guest_hv_clock->version = ++vcpu->hv_clock.version;
   3044
   3045	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
   3046	read_unlock_irqrestore(&gpc->lock, flags);
   3047
   3048	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
   3049}
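        /*
         * The guest-side counterpart of the version handling above is the
         * usual seqcount-style read loop, roughly:
         *
         *	do {
         *		version = pvti->version;
         *		rmb();
         *		... read the pvti fields ...
         *		rmb();
         *	} while ((version & 1) || version != pvti->version);
         *
         * which is why the writer publishes an odd version before touching
         * the data and an even one afterwards.
         */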
   3050
   3051static int kvm_guest_time_update(struct kvm_vcpu *v)
   3052{
   3053	unsigned long flags, tgt_tsc_khz;
   3054	unsigned seq;
   3055	struct kvm_vcpu_arch *vcpu = &v->arch;
   3056	struct kvm_arch *ka = &v->kvm->arch;
   3057	s64 kernel_ns;
   3058	u64 tsc_timestamp, host_tsc;
   3059	u8 pvclock_flags;
   3060	bool use_master_clock;
   3061
   3062	kernel_ns = 0;
   3063	host_tsc = 0;
   3064
   3065	/*
    3066	 * If the host uses the TSC clocksource, then pass the TSC through
    3067	 * to the guest as stable.
   3068	 */
   3069	do {
   3070		seq = read_seqcount_begin(&ka->pvclock_sc);
   3071		use_master_clock = ka->use_master_clock;
   3072		if (use_master_clock) {
   3073			host_tsc = ka->master_cycle_now;
   3074			kernel_ns = ka->master_kernel_ns;
   3075		}
   3076	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
   3077
   3078	/* Keep irq disabled to prevent changes to the clock */
   3079	local_irq_save(flags);
   3080	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
   3081	if (unlikely(tgt_tsc_khz == 0)) {
   3082		local_irq_restore(flags);
   3083		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
   3084		return 1;
   3085	}
   3086	if (!use_master_clock) {
   3087		host_tsc = rdtsc();
   3088		kernel_ns = get_kvmclock_base_ns();
   3089	}
   3090
   3091	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
   3092
   3093	/*
   3094	 * We may have to catch up the TSC to match elapsed wall clock
   3095	 * time for two reasons, even if kvmclock is used.
   3096	 *   1) CPU could have been running below the maximum TSC rate
   3097	 *   2) Broken TSC compensation resets the base at each VCPU
   3098	 *      entry to avoid unknown leaps of TSC even when running
   3099	 *      again on the same CPU.  This may cause apparent elapsed
   3100	 *      time to disappear, and the guest to stand still or run
   3101	 *	very slowly.
   3102	 */
   3103	if (vcpu->tsc_catchup) {
   3104		u64 tsc = compute_guest_tsc(v, kernel_ns);
   3105		if (tsc > tsc_timestamp) {
   3106			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
   3107			tsc_timestamp = tsc;
   3108		}
   3109	}
   3110
   3111	local_irq_restore(flags);
   3112
   3113	/* With all the info we got, fill in the values */
   3114
   3115	if (kvm_has_tsc_control)
   3116		tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
   3117					    v->arch.l1_tsc_scaling_ratio);
   3118
   3119	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
   3120		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
   3121				   &vcpu->hv_clock.tsc_shift,
   3122				   &vcpu->hv_clock.tsc_to_system_mul);
   3123		vcpu->hw_tsc_khz = tgt_tsc_khz;
   3124	}
   3125
   3126	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
   3127	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
   3128	vcpu->last_guest_tsc = tsc_timestamp;
   3129
    3130	/* If the host uses the TSC clocksource, then it is stable */
   3131	pvclock_flags = 0;
   3132	if (use_master_clock)
   3133		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
   3134
   3135	vcpu->hv_clock.flags = pvclock_flags;
   3136
   3137	if (vcpu->pv_time.active)
   3138		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
   3139	if (vcpu->xen.vcpu_info_cache.active)
   3140		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
   3141					offsetof(struct compat_vcpu_info, time));
   3142	if (vcpu->xen.vcpu_time_info_cache.active)
   3143		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
   3144	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
   3145	return 0;
   3146}
   3147
   3148/*
   3149 * kvmclock updates which are isolated to a given vcpu, such as
   3150 * vcpu->cpu migration, should not allow system_timestamp from
   3151 * the rest of the vcpus to remain static. Otherwise ntp frequency
   3152 * correction applies to one vcpu's system_timestamp but not
   3153 * the others.
   3154 *
   3155 * So in those cases, request a kvmclock update for all vcpus.
   3156 * We need to rate-limit these requests though, as they can
   3157 * considerably slow guests that have a large number of vcpus.
   3158 * The time for a remote vcpu to update its kvmclock is bound
   3159 * by the delay we use to rate-limit the updates.
   3160 */
   3161
   3162#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
   3163
   3164static void kvmclock_update_fn(struct work_struct *work)
   3165{
   3166	unsigned long i;
   3167	struct delayed_work *dwork = to_delayed_work(work);
   3168	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
   3169					   kvmclock_update_work);
   3170	struct kvm *kvm = container_of(ka, struct kvm, arch);
   3171	struct kvm_vcpu *vcpu;
   3172
   3173	kvm_for_each_vcpu(i, vcpu, kvm) {
   3174		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
   3175		kvm_vcpu_kick(vcpu);
   3176	}
   3177}
   3178
   3179static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
   3180{
   3181	struct kvm *kvm = v->kvm;
   3182
   3183	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
   3184	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
   3185					KVMCLOCK_UPDATE_DELAY);
   3186}
   3187
   3188#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
   3189
   3190static void kvmclock_sync_fn(struct work_struct *work)
   3191{
   3192	struct delayed_work *dwork = to_delayed_work(work);
   3193	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
   3194					   kvmclock_sync_work);
   3195	struct kvm *kvm = container_of(ka, struct kvm, arch);
   3196
   3197	if (!kvmclock_periodic_sync)
   3198		return;
   3199
   3200	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
   3201	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
   3202					KVMCLOCK_SYNC_PERIOD);
   3203}
   3204
   3205/*
   3206 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
   3207 */
   3208static bool can_set_mci_status(struct kvm_vcpu *vcpu)
   3209{
   3210	/* McStatusWrEn enabled? */
   3211	if (guest_cpuid_is_amd_or_hygon(vcpu))
   3212		return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
   3213
   3214	return false;
   3215}
   3216
   3217static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
   3218{
   3219	u64 mcg_cap = vcpu->arch.mcg_cap;
   3220	unsigned bank_num = mcg_cap & 0xff;
   3221	u32 msr = msr_info->index;
   3222	u64 data = msr_info->data;
   3223
   3224	switch (msr) {
   3225	case MSR_IA32_MCG_STATUS:
   3226		vcpu->arch.mcg_status = data;
   3227		break;
   3228	case MSR_IA32_MCG_CTL:
   3229		if (!(mcg_cap & MCG_CTL_P) &&
   3230		    (data || !msr_info->host_initiated))
   3231			return 1;
   3232		if (data != 0 && data != ~(u64)0)
   3233			return 1;
   3234		vcpu->arch.mcg_ctl = data;
   3235		break;
   3236	default:
   3237		if (msr >= MSR_IA32_MC0_CTL &&
   3238		    msr < MSR_IA32_MCx_CTL(bank_num)) {
   3239			u32 offset = array_index_nospec(
   3240				msr - MSR_IA32_MC0_CTL,
   3241				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
   3242
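       			/*
       			 * Each MCE bank occupies four consecutive MSRs (CTL,
       			 * STATUS, ADDR, MISC), so offset & 0x3 selects which
       			 * register within the bank is being written.
       			 */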
   3243			/* Only 0 or all 1s can be written to IA32_MCi_CTL.
   3244			 * Some Linux kernels, though, clear bit 10 in bank 4
   3245			 * to work around a BIOS/GART TBL issue on AMD K8s;
   3246			 * ignore this to avoid an uncaught #GP in the guest.
   3247			 */
   3248			if ((offset & 0x3) == 0 &&
   3249			    data != 0 && (data | (1 << 10)) != ~(u64)0)
   3250				return -1;
   3251
   3252			/* MCi_STATUS */
   3253			if (!msr_info->host_initiated &&
   3254			    (offset & 0x3) == 1 && data != 0) {
   3255				if (!can_set_mci_status(vcpu))
   3256					return -1;
   3257			}
   3258
   3259			vcpu->arch.mce_banks[offset] = data;
   3260			break;
   3261		}
   3262		return 1;
   3263	}
   3264	return 0;
   3265}
   3266
   3267static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
   3268{
   3269	u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
   3270
   3271	return (vcpu->arch.apf.msr_en_val & mask) == mask;
   3272}
   3273
   3274static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
   3275{
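       	/*
       	 * The low 6 bits of MSR_KVM_ASYNC_PF_EN carry the enable and
       	 * delivery-mode flags (bits 4:5 reserved); the remaining bits
       	 * form the 64-byte aligned GPA of the shared async-PF data.
       	 */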
   3276	gpa_t gpa = data & ~0x3f;
   3277
   3278	/* Bits 4:5 are reserved and should be zero. */
   3279	if (data & 0x30)
   3280		return 1;
   3281
   3282	if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
   3283	    (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
   3284		return 1;
   3285
   3286	if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
   3287	    (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
   3288		return 1;
   3289
   3290	if (!lapic_in_kernel(vcpu))
   3291		return data ? 1 : 0;
   3292
   3293	vcpu->arch.apf.msr_en_val = data;
   3294
   3295	if (!kvm_pv_async_pf_enabled(vcpu)) {
   3296		kvm_clear_async_pf_completion_queue(vcpu);
   3297		kvm_async_pf_hash_reset(vcpu);
   3298		return 0;
   3299	}
   3300
   3301	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
   3302					sizeof(u64)))
   3303		return 1;
   3304
   3305	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
   3306	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
   3307
   3308	kvm_async_pf_wakeup_all(vcpu);
   3309
   3310	return 0;
   3311}
   3312
   3313static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
   3314{
   3315	/* Bits 8-63 are reserved */
   3316	if (data >> 8)
   3317		return 1;
   3318
   3319	if (!lapic_in_kernel(vcpu))
   3320		return 1;
   3321
   3322	vcpu->arch.apf.msr_int_val = data;
   3323
   3324	vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
   3325
   3326	return 0;
   3327}
   3328
   3329static void kvmclock_reset(struct kvm_vcpu *vcpu)
   3330{
   3331	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
   3332	vcpu->arch.time = 0;
   3333}
   3334
   3335static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
   3336{
   3337	++vcpu->stat.tlb_flush;
   3338	static_call(kvm_x86_flush_tlb_all)(vcpu);
   3339}
   3340
   3341static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
   3342{
   3343	++vcpu->stat.tlb_flush;
   3344
   3345	if (!tdp_enabled) {
   3346		/*
   3347		 * A TLB flush on behalf of the guest is equivalent to
   3348		 * INVPCID(all), toggling CR4.PGE, etc., which requires
   3349		 * a forced sync of the shadow page tables.  Ensure all the
   3350		 * roots are synced and the guest TLB in hardware is clean.
   3351		 */
   3352		kvm_mmu_sync_roots(vcpu);
   3353		kvm_mmu_sync_prev_roots(vcpu);
   3354	}
   3355
   3356	static_call(kvm_x86_flush_tlb_guest)(vcpu);
   3357}
   3358
   3359
   3360static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
   3361{
   3362	++vcpu->stat.tlb_flush;
   3363	static_call(kvm_x86_flush_tlb_current)(vcpu);
   3364}
   3365
   3366/*
   3367 * Service "local" TLB flush requests, which are specific to the current MMU
   3368 * context.  In addition to the generic event handling in vcpu_enter_guest(),
   3369 * TLB flushes that are targeted at an MMU context also need to be serviced
   3370 * prior to nested VM-Enter/VM-Exit.
   3371 */
   3372void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
   3373{
   3374	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
   3375		kvm_vcpu_flush_tlb_current(vcpu);
   3376
   3377	if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
   3378		kvm_vcpu_flush_tlb_guest(vcpu);
   3379}
   3380EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
   3381
   3382static void record_steal_time(struct kvm_vcpu *vcpu)
   3383{
   3384	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
   3385	struct kvm_steal_time __user *st;
   3386	struct kvm_memslots *slots;
   3387	u64 steal;
   3388	u32 version;
   3389
   3390	if (kvm_xen_msr_enabled(vcpu->kvm)) {
   3391		kvm_xen_runstate_set_running(vcpu);
   3392		return;
   3393	}
   3394
   3395	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
   3396		return;
   3397
   3398	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
   3399		return;
   3400
   3401	slots = kvm_memslots(vcpu->kvm);
   3402
   3403	if (unlikely(slots->generation != ghc->generation ||
   3404		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
   3405		gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
   3406
   3407		/* We rely on the fact that it fits in a single page. */
   3408		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
   3409
   3410		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
   3411		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
   3412			return;
   3413	}
   3414
   3415	st = (struct kvm_steal_time __user *)ghc->hva;
   3416	/*
   3417	 * Doing a TLB flush here, on the guest's behalf, can avoid
   3418	 * expensive IPIs.
   3419	 */
   3420	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
   3421		u8 st_preempted = 0;
   3422		int err = -EFAULT;
   3423
   3424		if (!user_access_begin(st, sizeof(*st)))
   3425			return;
   3426
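       		/*
       		 * Atomically fetch and clear st->preempted with xchg so a
       		 * KVM_VCPU_FLUSH_TLB request set by the guest is not lost;
       		 * the extable entry turns a fault on the user mapping into
       		 * a nonzero err instead of an oops.
       		 */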
   3427		asm volatile("1: xchgb %0, %2\n"
   3428			     "xor %1, %1\n"
   3429			     "2:\n"
   3430			     _ASM_EXTABLE_UA(1b, 2b)
   3431			     : "+q" (st_preempted),
   3432			       "+&r" (err),
   3433			       "+m" (st->preempted));
   3434		if (err)
   3435			goto out;
   3436
   3437		user_access_end();
   3438
   3439		vcpu->arch.st.preempted = 0;
   3440
   3441		trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
   3442				       st_preempted & KVM_VCPU_FLUSH_TLB);
   3443		if (st_preempted & KVM_VCPU_FLUSH_TLB)
   3444			kvm_vcpu_flush_tlb_guest(vcpu);
   3445
   3446		if (!user_access_begin(st, sizeof(*st)))
   3447			goto dirty;
   3448	} else {
   3449		if (!user_access_begin(st, sizeof(*st)))
   3450			return;
   3451
   3452		unsafe_put_user(0, &st->preempted, out);
   3453		vcpu->arch.st.preempted = 0;
   3454	}
   3455
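       	/*
       	 * st->version works like a seqcount: it is made odd while the
       	 * record is being updated and even again afterwards, so the
       	 * guest can detect and retry torn reads.
       	 */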
   3456	unsafe_get_user(version, &st->version, out);
   3457	if (version & 1)
   3458		version += 1;  /* first time write, random junk */
   3459
   3460	version += 1;
   3461	unsafe_put_user(version, &st->version, out);
   3462
   3463	smp_wmb();
   3464
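       	/*
       	 * Account the time this task spent waiting on a runqueue
       	 * (run_delay) since the last update as steal time.
       	 */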
   3465	unsafe_get_user(steal, &st->steal, out);
   3466	steal += current->sched_info.run_delay -
   3467		vcpu->arch.st.last_steal;
   3468	vcpu->arch.st.last_steal = current->sched_info.run_delay;
   3469	unsafe_put_user(steal, &st->steal, out);
   3470
   3471	version += 1;
   3472	unsafe_put_user(version, &st->version, out);
   3473
   3474 out:
   3475	user_access_end();
   3476 dirty:
   3477	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
   3478}
   3479
   3480int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
   3481{
   3482	bool pr = false;
   3483	u32 msr = msr_info->index;
   3484	u64 data = msr_info->data;
   3485
   3486	if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
   3487		return kvm_xen_write_hypercall_page(vcpu, data);
   3488
   3489	switch (msr) {
   3490	case MSR_AMD64_NB_CFG:
   3491	case MSR_IA32_UCODE_WRITE:
   3492	case MSR_VM_HSAVE_PA:
   3493	case MSR_AMD64_PATCH_LOADER:
   3494	case MSR_AMD64_BU_CFG2:
   3495	case MSR_AMD64_DC_CFG:
   3496	case MSR_F15H_EX_CFG:
   3497		break;
   3498
   3499	case MSR_IA32_UCODE_REV:
   3500		if (msr_info->host_initiated)
   3501			vcpu->arch.microcode_version = data;
   3502		break;
   3503	case MSR_IA32_ARCH_CAPABILITIES:
   3504		if (!msr_info->host_initiated)
   3505			return 1;
   3506		vcpu->arch.arch_capabilities = data;
   3507		break;
   3508	case MSR_IA32_PERF_CAPABILITIES: {
   3509		struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
   3510
   3511		if (!msr_info->host_initiated)
   3512			return 1;
   3513		if (kvm_get_msr_feature(&msr_ent))
   3514			return 1;
   3515		if (data & ~msr_ent.data)
   3516			return 1;
   3517
   3518		vcpu->arch.perf_capabilities = data;
   3519
   3520		return 0;
   3521		}
   3522	case MSR_EFER:
   3523		return set_efer(vcpu, msr_info);
   3524	case MSR_K7_HWCR:
   3525		data &= ~(u64)0x40;	/* ignore flush filter disable */
   3526		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
   3527		data &= ~(u64)0x8;	/* ignore TLB cache disable */
   3528
   3529		/* Handle McStatusWrEn */
   3530		if (data == BIT_ULL(18)) {
   3531			vcpu->arch.msr_hwcr = data;
   3532		} else if (data != 0) {
   3533			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
   3534				    data);
   3535			return 1;
   3536		}
   3537		break;
   3538	case MSR_FAM10H_MMIO_CONF_BASE:
   3539		if (data != 0) {
   3540			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
   3541				    "0x%llx\n", data);
   3542			return 1;
   3543		}
   3544		break;
   3545	case 0x200 ... 0x2ff:
   3546		return kvm_mtrr_set_msr(vcpu, msr, data);
   3547	case MSR_IA32_APICBASE:
   3548		return kvm_set_apic_base(vcpu, msr_info);
   3549	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
   3550		return kvm_x2apic_msr_write(vcpu, msr, data);
   3551	case MSR_IA32_TSC_DEADLINE:
   3552		kvm_set_lapic_tscdeadline_msr(vcpu, data);
   3553		break;
   3554	case MSR_IA32_TSC_ADJUST:
   3555		if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
   3556			if (!msr_info->host_initiated) {
   3557				s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
   3558				adjust_tsc_offset_guest(vcpu, adj);
   3559				/* Before returning to the guest, tsc_timestamp must be adjusted
   3560				 * as well; otherwise the guest's per-CPU pvclock time could jump.
   3561				 */
   3562				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
   3563			}
   3564			vcpu->arch.ia32_tsc_adjust_msr = data;
   3565		}
   3566		break;
   3567	case MSR_IA32_MISC_ENABLE:
   3568		if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
   3569		    ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
   3570			if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
   3571				return 1;
   3572			vcpu->arch.ia32_misc_enable_msr = data;
   3573			kvm_update_cpuid_runtime(vcpu);
   3574		} else {
   3575			vcpu->arch.ia32_misc_enable_msr = data;
   3576		}
   3577		break;
   3578	case MSR_IA32_SMBASE:
   3579		if (!msr_info->host_initiated)
   3580			return 1;
   3581		vcpu->arch.smbase = data;
   3582		break;
   3583	case MSR_IA32_POWER_CTL:
   3584		vcpu->arch.msr_ia32_power_ctl = data;
   3585		break;
   3586	case MSR_IA32_TSC:
   3587		if (msr_info->host_initiated) {
   3588			kvm_synchronize_tsc(vcpu, data);
   3589		} else {
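       			/*
       			 * A guest WRMSR to IA32_TSC is converted into a TSC
       			 * offset adjustment so that the guest-visible TSC reads
       			 * the written value from now on, and IA32_TSC_ADJUST is
       			 * updated to match.
       			 */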
   3590			u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
   3591			adjust_tsc_offset_guest(vcpu, adj);
   3592			vcpu->arch.ia32_tsc_adjust_msr += adj;
   3593		}
   3594		break;
   3595	case MSR_IA32_XSS:
   3596		if (!msr_info->host_initiated &&
   3597		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
   3598			return 1;
   3599		/*
   3600		 * KVM supports exposing PT to the guest, but does not support
   3601		 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
   3602		 * XSAVES/XRSTORS to save/restore PT MSRs.
   3603		 */
   3604		if (data & ~supported_xss)
   3605			return 1;
   3606		vcpu->arch.ia32_xss = data;
   3607		kvm_update_cpuid_runtime(vcpu);
   3608		break;
   3609	case MSR_SMI_COUNT:
   3610		if (!msr_info->host_initiated)
   3611			return 1;
   3612		vcpu->arch.smi_count = data;
   3613		break;
   3614	case MSR_KVM_WALL_CLOCK_NEW:
   3615		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
   3616			return 1;
   3617
   3618		vcpu->kvm->arch.wall_clock = data;
   3619		kvm_write_wall_clock(vcpu->kvm, data, 0);
   3620		break;
   3621	case MSR_KVM_WALL_CLOCK:
   3622		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
   3623			return 1;
   3624
   3625		vcpu->kvm->arch.wall_clock = data;
   3626		kvm_write_wall_clock(vcpu->kvm, data, 0);
   3627		break;
   3628	case MSR_KVM_SYSTEM_TIME_NEW:
   3629		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
   3630			return 1;
   3631
   3632		kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
   3633		break;
   3634	case MSR_KVM_SYSTEM_TIME:
   3635		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
   3636			return 1;
   3637
   3638		kvm_write_system_time(vcpu, data, true,  msr_info->host_initiated);
   3639		break;
   3640	case MSR_KVM_ASYNC_PF_EN:
   3641		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
   3642			return 1;
   3643
   3644		if (kvm_pv_enable_async_pf(vcpu, data))
   3645			return 1;
   3646		break;
   3647	case MSR_KVM_ASYNC_PF_INT:
   3648		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
   3649			return 1;
   3650
   3651		if (kvm_pv_enable_async_pf_int(vcpu, data))
   3652			return 1;
   3653		break;
   3654	case MSR_KVM_ASYNC_PF_ACK:
   3655		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
   3656			return 1;
   3657		if (data & 0x1) {
   3658			vcpu->arch.apf.pageready_pending = false;
   3659			kvm_check_async_pf_completion(vcpu);
   3660		}
   3661		break;
   3662	case MSR_KVM_STEAL_TIME:
   3663		if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
   3664			return 1;
   3665
   3666		if (unlikely(!sched_info_on()))
   3667			return 1;
   3668
   3669		if (data & KVM_STEAL_RESERVED_MASK)
   3670			return 1;
   3671
   3672		vcpu->arch.st.msr_val = data;
   3673
   3674		if (!(data & KVM_MSR_ENABLED))
   3675			break;
   3676
   3677		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
   3678
   3679		break;
   3680	case MSR_KVM_PV_EOI_EN:
   3681		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
   3682			return 1;
   3683
   3684		if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
   3685			return 1;
   3686		break;
   3687
   3688	case MSR_KVM_POLL_CONTROL:
   3689		if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
   3690			return 1;
   3691
   3692		/* only enable bit supported */
   3693		if (data & (-1ULL << 1))
   3694			return 1;
   3695
   3696		vcpu->arch.msr_kvm_poll_control = data;
   3697		break;
   3698
   3699	case MSR_IA32_MCG_CTL:
   3700	case MSR_IA32_MCG_STATUS:
   3701	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
   3702		return set_msr_mce(vcpu, msr_info);
   3703
   3704	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
   3705	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
   3706		pr = true;
   3707		fallthrough;
   3708	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
   3709	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
   3710		if (kvm_pmu_is_valid_msr(vcpu, msr))
   3711			return kvm_pmu_set_msr(vcpu, msr_info);
   3712
   3713		if (pr || data != 0)
   3714			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
   3715				    "0x%x data 0x%llx\n", msr, data);
   3716		break;
   3717	case MSR_K7_CLK_CTL:
   3718		/*
   3719		 * Ignore all writes to this no-longer-documented MSR.
   3720		 * Writes only matter on old K7 processors, which all
   3721		 * pre-date SVM, but they are a workaround AMD recommends
   3722		 * for those chips. Since the affected processor models
   3723		 * can be specified on the guest's command line, the
   3724		 * workaround writes must simply be ignored.
   3725		 */
   3726		break;
   3727	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
   3728	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
   3729	case HV_X64_MSR_SYNDBG_OPTIONS:
   3730	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
   3731	case HV_X64_MSR_CRASH_CTL:
   3732	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
   3733	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
   3734	case HV_X64_MSR_TSC_EMULATION_CONTROL:
   3735	case HV_X64_MSR_TSC_EMULATION_STATUS:
   3736		return kvm_hv_set_msr_common(vcpu, msr, data,
   3737					     msr_info->host_initiated);
   3738	case MSR_IA32_BBL_CR_CTL3:
   3739		/* Drop writes to this legacy MSR -- see rdmsr
   3740		 * counterpart for further detail.
   3741		 */
   3742		if (report_ignored_msrs)
   3743			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
   3744				msr, data);
   3745		break;
   3746	case MSR_AMD64_OSVW_ID_LENGTH:
   3747		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
   3748			return 1;
   3749		vcpu->arch.osvw.length = data;
   3750		break;
   3751	case MSR_AMD64_OSVW_STATUS:
   3752		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
   3753			return 1;
   3754		vcpu->arch.osvw.status = data;
   3755		break;
   3756	case MSR_PLATFORM_INFO:
   3757		if (!msr_info->host_initiated ||
   3758		    (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
   3759		     cpuid_fault_enabled(vcpu)))
   3760			return 1;
   3761		vcpu->arch.msr_platform_info = data;
   3762		break;
   3763	case MSR_MISC_FEATURES_ENABLES:
   3764		if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
   3765		    (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
   3766		     !supports_cpuid_fault(vcpu)))
   3767			return 1;
   3768		vcpu->arch.msr_misc_features_enables = data;
   3769		break;
   3770#ifdef CONFIG_X86_64
   3771	case MSR_IA32_XFD:
   3772		if (!msr_info->host_initiated &&
   3773		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
   3774			return 1;
   3775
   3776		if (data & ~kvm_guest_supported_xfd(vcpu))
   3777			return 1;
   3778
   3779		fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
   3780		break;
   3781	case MSR_IA32_XFD_ERR:
   3782		if (!msr_info->host_initiated &&
   3783		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
   3784			return 1;
   3785
   3786		if (data & ~kvm_guest_supported_xfd(vcpu))
   3787			return 1;
   3788
   3789		vcpu->arch.guest_fpu.xfd_err = data;
   3790		break;
   3791#endif
   3792	default:
   3793		if (kvm_pmu_is_valid_msr(vcpu, msr))
   3794			return kvm_pmu_set_msr(vcpu, msr_info);
   3795		return KVM_MSR_RET_INVALID;
   3796	}
   3797	return 0;
   3798}
   3799EXPORT_SYMBOL_GPL(kvm_set_msr_common);
   3800
   3801static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
   3802{
   3803	u64 data;
   3804	u64 mcg_cap = vcpu->arch.mcg_cap;
   3805	unsigned bank_num = mcg_cap & 0xff;
   3806
   3807	switch (msr) {
   3808	case MSR_IA32_P5_MC_ADDR:
   3809	case MSR_IA32_P5_MC_TYPE:
   3810		data = 0;
   3811		break;
   3812	case MSR_IA32_MCG_CAP:
   3813		data = vcpu->arch.mcg_cap;
   3814		break;
   3815	case MSR_IA32_MCG_CTL:
   3816		if (!(mcg_cap & MCG_CTL_P) && !host)
   3817			return 1;
   3818		data = vcpu->arch.mcg_ctl;
   3819		break;
   3820	case MSR_IA32_MCG_STATUS:
   3821		data = vcpu->arch.mcg_status;
   3822		break;
   3823	default:
   3824		if (msr >= MSR_IA32_MC0_CTL &&
   3825		    msr < MSR_IA32_MCx_CTL(bank_num)) {
   3826			u32 offset = array_index_nospec(
   3827				msr - MSR_IA32_MC0_CTL,
   3828				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
   3829
   3830			data = vcpu->arch.mce_banks[offset];
   3831			break;
   3832		}
   3833		return 1;
   3834	}
   3835	*pdata = data;
   3836	return 0;
   3837}
   3838
   3839int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
   3840{
   3841	switch (msr_info->index) {
   3842	case MSR_IA32_PLATFORM_ID:
   3843	case MSR_IA32_EBL_CR_POWERON:
   3844	case MSR_IA32_LASTBRANCHFROMIP:
   3845	case MSR_IA32_LASTBRANCHTOIP:
   3846	case MSR_IA32_LASTINTFROMIP:
   3847	case MSR_IA32_LASTINTTOIP:
   3848	case MSR_AMD64_SYSCFG:
   3849	case MSR_K8_TSEG_ADDR:
   3850	case MSR_K8_TSEG_MASK:
   3851	case MSR_VM_HSAVE_PA:
   3852	case MSR_K8_INT_PENDING_MSG:
   3853	case MSR_AMD64_NB_CFG:
   3854	case MSR_FAM10H_MMIO_CONF_BASE:
   3855	case MSR_AMD64_BU_CFG2:
   3856	case MSR_IA32_PERF_CTL:
   3857	case MSR_AMD64_DC_CFG:
   3858	case MSR_F15H_EX_CFG:
   3859	/*
   3860	 * Intel Sandy Bridge CPUs must support the RAPL (running average power
   3861	 * limit) MSRs. Just return 0, as we do not want to expose the host
   3862	 * data here. Do not conditionalize this on CPUID, as KVM does not do
   3863	 * so for existing CPU-specific MSRs.
   3864	 */
   3865	case MSR_RAPL_POWER_UNIT:
   3866	case MSR_PP0_ENERGY_STATUS:	/* Power plane 0 (core) */
   3867	case MSR_PP1_ENERGY_STATUS:	/* Power plane 1 (graphics uncore) */
   3868	case MSR_PKG_ENERGY_STATUS:	/* Total package */
   3869	case MSR_DRAM_ENERGY_STATUS:	/* DRAM controller */
   3870		msr_info->data = 0;
   3871		break;
   3872	case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
   3873		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
   3874			return kvm_pmu_get_msr(vcpu, msr_info);
   3875		if (!msr_info->host_initiated)
   3876			return 1;
   3877		msr_info->data = 0;
   3878		break;
   3879	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
   3880	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
   3881	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
   3882	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
   3883		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
   3884			return kvm_pmu_get_msr(vcpu, msr_info);
   3885		msr_info->data = 0;
   3886		break;
   3887	case MSR_IA32_UCODE_REV:
   3888		msr_info->data = vcpu->arch.microcode_version;
   3889		break;
   3890	case MSR_IA32_ARCH_CAPABILITIES:
   3891		if (!msr_info->host_initiated &&
   3892		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
   3893			return 1;
   3894		msr_info->data = vcpu->arch.arch_capabilities;
   3895		break;
   3896	case MSR_IA32_PERF_CAPABILITIES:
   3897		if (!msr_info->host_initiated &&
   3898		    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
   3899			return 1;
   3900		msr_info->data = vcpu->arch.perf_capabilities;
   3901		break;
   3902	case MSR_IA32_POWER_CTL:
   3903		msr_info->data = vcpu->arch.msr_ia32_power_ctl;
   3904		break;
   3905	case MSR_IA32_TSC: {
   3906		/*
   3907		 * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
   3908		 * even when not intercepted. AMD manual doesn't explicitly
   3909		 * state this but appears to behave the same.
   3910		 *
   3911		 * On userspace reads and writes, however, we unconditionally
   3912		 * return L1's TSC value to ensure backwards-compatible
   3913		 * behavior for migration.
   3914		 */
   3915		u64 offset, ratio;
   3916
   3917		if (msr_info->host_initiated) {
   3918			offset = vcpu->arch.l1_tsc_offset;
   3919			ratio = vcpu->arch.l1_tsc_scaling_ratio;
   3920		} else {
   3921			offset = vcpu->arch.tsc_offset;
   3922			ratio = vcpu->arch.tsc_scaling_ratio;
   3923		}
   3924
   3925		msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
   3926		break;
   3927	}
   3928	case MSR_MTRRcap:
   3929	case 0x200 ... 0x2ff:
   3930		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
   3931	case 0xcd: /* fsb frequency */
   3932		msr_info->data = 3;
   3933		break;
   3934		/*
   3935		 * MSR_EBC_FREQUENCY_ID
   3936		 * Conservative value valid for even the basic CPU models.
   3937		 * Models 0 and 1: 000 in bits 23:21 indicates a bus speed
   3938		 * of 100MHz; model 2: 000 in bits 18:16 indicates 100MHz;
   3939		 * models 3 and 4: 266MHz.
   3940		 * Set the Core Clock Frequency to System Bus Frequency
   3941		 * Ratio (bits 31:24) to 1 even though it is only valid for
   3942		 * CPU models > 2, since guests may otherwise end up
   3943		 * dividing or multiplying by zero.
   3944		 */
   3945	case MSR_EBC_FREQUENCY_ID:
   3946		msr_info->data = 1 << 24;
   3947		break;
   3948	case MSR_IA32_APICBASE:
   3949		msr_info->data = kvm_get_apic_base(vcpu);
   3950		break;
   3951	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
   3952		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
   3953	case MSR_IA32_TSC_DEADLINE:
   3954		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
   3955		break;
   3956	case MSR_IA32_TSC_ADJUST:
   3957		msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
   3958		break;
   3959	case MSR_IA32_MISC_ENABLE:
   3960		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
   3961		break;
   3962	case MSR_IA32_SMBASE:
   3963		if (!msr_info->host_initiated)
   3964			return 1;
   3965		msr_info->data = vcpu->arch.smbase;
   3966		break;
   3967	case MSR_SMI_COUNT:
   3968		msr_info->data = vcpu->arch.smi_count;
   3969		break;
   3970	case MSR_IA32_PERF_STATUS:
   3971		/* TSC increment by tick */
   3972		msr_info->data = 1000ULL;
   3973		/* CPU multiplier */
   3974		msr_info->data |= (((uint64_t)4ULL) << 40);
   3975		break;
   3976	case MSR_EFER:
   3977		msr_info->data = vcpu->arch.efer;
   3978		break;
   3979	case MSR_KVM_WALL_CLOCK:
   3980		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
   3981			return 1;
   3982
   3983		msr_info->data = vcpu->kvm->arch.wall_clock;
   3984		break;
   3985	case MSR_KVM_WALL_CLOCK_NEW:
   3986		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
   3987			return 1;
   3988
   3989		msr_info->data = vcpu->kvm->arch.wall_clock;
   3990		break;
   3991	case MSR_KVM_SYSTEM_TIME:
   3992		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
   3993			return 1;
   3994
   3995		msr_info->data = vcpu->arch.time;
   3996		break;
   3997	case MSR_KVM_SYSTEM_TIME_NEW:
   3998		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
   3999			return 1;
   4000
   4001		msr_info->data = vcpu->arch.time;
   4002		break;
   4003	case MSR_KVM_ASYNC_PF_EN:
   4004		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
   4005			return 1;
   4006
   4007		msr_info->data = vcpu->arch.apf.msr_en_val;
   4008		break;
   4009	case MSR_KVM_ASYNC_PF_INT:
   4010		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
   4011			return 1;
   4012
   4013		msr_info->data = vcpu->arch.apf.msr_int_val;
   4014		break;
   4015	case MSR_KVM_ASYNC_PF_ACK:
   4016		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
   4017			return 1;
   4018
   4019		msr_info->data = 0;
   4020		break;
   4021	case MSR_KVM_STEAL_TIME:
   4022		if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
   4023			return 1;
   4024
   4025		msr_info->data = vcpu->arch.st.msr_val;
   4026		break;
   4027	case MSR_KVM_PV_EOI_EN:
   4028		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
   4029			return 1;
   4030
   4031		msr_info->data = vcpu->arch.pv_eoi.msr_val;
   4032		break;
   4033	case MSR_KVM_POLL_CONTROL:
   4034		if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
   4035			return 1;
   4036
   4037		msr_info->data = vcpu->arch.msr_kvm_poll_control;
   4038		break;
   4039	case MSR_IA32_P5_MC_ADDR:
   4040	case MSR_IA32_P5_MC_TYPE:
   4041	case MSR_IA32_MCG_CAP:
   4042	case MSR_IA32_MCG_CTL:
   4043	case MSR_IA32_MCG_STATUS:
   4044	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
   4045		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
   4046				   msr_info->host_initiated);
   4047	case MSR_IA32_XSS:
   4048		if (!msr_info->host_initiated &&
   4049		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
   4050			return 1;
   4051		msr_info->data = vcpu->arch.ia32_xss;
   4052		break;
   4053	case MSR_K7_CLK_CTL:
   4054		/*
   4055		 * Provide the expected ramp-up count for K7. All other
   4056		 * bits are set to zero, indicating minimum divisors for
   4057		 * every field.
   4058		 *
   4059		 * This prevents guest kernels on AMD host with CPU
   4060		 * type 6, model 8 and higher from exploding due to
   4061		 * the rdmsr failing.
   4062		 */
   4063		msr_info->data = 0x20000000;
   4064		break;
   4065	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
   4066	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
   4067	case HV_X64_MSR_SYNDBG_OPTIONS:
   4068	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
   4069	case HV_X64_MSR_CRASH_CTL:
   4070	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
   4071	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
   4072	case HV_X64_MSR_TSC_EMULATION_CONTROL:
   4073	case HV_X64_MSR_TSC_EMULATION_STATUS:
   4074		return kvm_hv_get_msr_common(vcpu,
   4075					     msr_info->index, &msr_info->data,
   4076					     msr_info->host_initiated);
   4077	case MSR_IA32_BBL_CR_CTL3:
   4078		/* This legacy MSR exists but isn't fully documented in current
   4079		 * silicon.  It is, however, accessed by winxp in very narrow
   4080		 * scenarios where it sets bit #19, itself documented as
   4081		 * a "reserved" bit.  Make a best-effort attempt to return
   4082		 * coherent read data in case the guest interprets the rest
   4083		 * of the register:
   4084		 *
   4085		 * L2 cache control register 3: 64GB range, 256KB size,
   4086		 * enabled, latency 0x1, configured
   4087		 */
   4088		msr_info->data = 0xbe702111;
   4089		break;
   4090	case MSR_AMD64_OSVW_ID_LENGTH:
   4091		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
   4092			return 1;
   4093		msr_info->data = vcpu->arch.osvw.length;
   4094		break;
   4095	case MSR_AMD64_OSVW_STATUS:
   4096		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
   4097			return 1;
   4098		msr_info->data = vcpu->arch.osvw.status;
   4099		break;
   4100	case MSR_PLATFORM_INFO:
   4101		if (!msr_info->host_initiated &&
   4102		    !vcpu->kvm->arch.guest_can_read_msr_platform_info)
   4103			return 1;
   4104		msr_info->data = vcpu->arch.msr_platform_info;
   4105		break;
   4106	case MSR_MISC_FEATURES_ENABLES:
   4107		msr_info->data = vcpu->arch.msr_misc_features_enables;
   4108		break;
   4109	case MSR_K7_HWCR:
   4110		msr_info->data = vcpu->arch.msr_hwcr;
   4111		break;
   4112#ifdef CONFIG_X86_64
   4113	case MSR_IA32_XFD:
   4114		if (!msr_info->host_initiated &&
   4115		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
   4116			return 1;
   4117
   4118		msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
   4119		break;
   4120	case MSR_IA32_XFD_ERR:
   4121		if (!msr_info->host_initiated &&
   4122		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
   4123			return 1;
   4124
   4125		msr_info->data = vcpu->arch.guest_fpu.xfd_err;
   4126		break;
   4127#endif
   4128	default:
   4129		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
   4130			return kvm_pmu_get_msr(vcpu, msr_info);
   4131		return KVM_MSR_RET_INVALID;
   4132	}
   4133	return 0;
   4134}
   4135EXPORT_SYMBOL_GPL(kvm_get_msr_common);
   4136
   4137/*
   4138 * Read or write a bunch of msrs. All parameters are kernel addresses.
   4139 *
   4140 * @return number of msrs set successfully.
   4141 */
   4142static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
   4143		    struct kvm_msr_entry *entries,
   4144		    int (*do_msr)(struct kvm_vcpu *vcpu,
   4145				  unsigned index, u64 *data))
   4146{
   4147	int i;
   4148
   4149	for (i = 0; i < msrs->nmsrs; ++i)
   4150		if (do_msr(vcpu, entries[i].index, &entries[i].data))
   4151			break;
   4152
   4153	return i;
   4154}
   4155
   4156/*
   4157 * Read or write a bunch of msrs. Parameters are user addresses.
   4158 *
   4159 * @return number of msrs set successfully.
   4160 */
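       /*
        * Example (hypothetical userspace sketch, not part of this file):
        * reading a single MSR with KVM_GET_MSRS on a vCPU fd ends up here
        * with do_msr == do_get_msr:
        *
        *	struct { struct kvm_msrs hdr; struct kvm_msr_entry e[1]; } m = {
        *		.hdr.nmsrs  = 1,
        *		.e[0].index = MSR_IA32_TSC,
        *	};
        *	ioctl(vcpu_fd, KVM_GET_MSRS, &m);	/\* returns 1 on success *\/
        */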
   4161static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
   4162		  int (*do_msr)(struct kvm_vcpu *vcpu,
   4163				unsigned index, u64 *data),
   4164		  int writeback)
   4165{
   4166	struct kvm_msrs msrs;
   4167	struct kvm_msr_entry *entries;
   4168	int r, n;
   4169	unsigned size;
   4170
   4171	r = -EFAULT;
   4172	if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
   4173		goto out;
   4174
   4175	r = -E2BIG;
   4176	if (msrs.nmsrs >= MAX_IO_MSRS)
   4177		goto out;
   4178
   4179	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
   4180	entries = memdup_user(user_msrs->entries, size);
   4181	if (IS_ERR(entries)) {
   4182		r = PTR_ERR(entries);
   4183		goto out;
   4184	}
   4185
   4186	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
   4187	if (r < 0)
   4188		goto out_free;
   4189
   4190	r = -EFAULT;
   4191	if (writeback && copy_to_user(user_msrs->entries, entries, size))
   4192		goto out_free;
   4193
   4194	r = n;
   4195
   4196out_free:
   4197	kfree(entries);
   4198out:
   4199	return r;
   4200}
   4201
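       /*
        * Guest MWAIT can only be allowed when the host's MONITOR/MWAIT is
        * usable (no MONITOR erratum) and the APIC timer keeps running in
        * deep C-states (ARAT), so a guest MWAIT cannot park the CPU in a
        * state the host timer code cannot cope with.
        */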
   4202static inline bool kvm_can_mwait_in_guest(void)
   4203{
   4204	return boot_cpu_has(X86_FEATURE_MWAIT) &&
   4205		!boot_cpu_has_bug(X86_BUG_MONITOR) &&
   4206		boot_cpu_has(X86_FEATURE_ARAT);
   4207}
   4208
   4209static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
   4210					    struct kvm_cpuid2 __user *cpuid_arg)
   4211{
   4212	struct kvm_cpuid2 cpuid;
   4213	int r;
   4214
   4215	r = -EFAULT;
   4216	if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
   4217		return r;
   4218
   4219	r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
   4220	if (r)
   4221		return r;
   4222
   4223	r = -EFAULT;
   4224	if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
   4225		return r;
   4226
   4227	return 0;
   4228}
   4229
   4230int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
   4231{
   4232	int r = 0;
   4233
   4234	switch (ext) {
   4235	case KVM_CAP_IRQCHIP:
   4236	case KVM_CAP_HLT:
   4237	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
   4238	case KVM_CAP_SET_TSS_ADDR:
   4239	case KVM_CAP_EXT_CPUID:
   4240	case KVM_CAP_EXT_EMUL_CPUID:
   4241	case KVM_CAP_CLOCKSOURCE:
   4242	case KVM_CAP_PIT:
   4243	case KVM_CAP_NOP_IO_DELAY:
   4244	case KVM_CAP_MP_STATE:
   4245	case KVM_CAP_SYNC_MMU:
   4246	case KVM_CAP_USER_NMI:
   4247	case KVM_CAP_REINJECT_CONTROL:
   4248	case KVM_CAP_IRQ_INJECT_STATUS:
   4249	case KVM_CAP_IOEVENTFD:
   4250	case KVM_CAP_IOEVENTFD_NO_LENGTH:
   4251	case KVM_CAP_PIT2:
   4252	case KVM_CAP_PIT_STATE2:
   4253	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
   4254	case KVM_CAP_VCPU_EVENTS:
   4255	case KVM_CAP_HYPERV:
   4256	case KVM_CAP_HYPERV_VAPIC:
   4257	case KVM_CAP_HYPERV_SPIN:
   4258	case KVM_CAP_HYPERV_SYNIC:
   4259	case KVM_CAP_HYPERV_SYNIC2:
   4260	case KVM_CAP_HYPERV_VP_INDEX:
   4261	case KVM_CAP_HYPERV_EVENTFD:
   4262	case KVM_CAP_HYPERV_TLBFLUSH:
   4263	case KVM_CAP_HYPERV_SEND_IPI:
   4264	case KVM_CAP_HYPERV_CPUID:
   4265	case KVM_CAP_HYPERV_ENFORCE_CPUID:
   4266	case KVM_CAP_SYS_HYPERV_CPUID:
   4267	case KVM_CAP_PCI_SEGMENT:
   4268	case KVM_CAP_DEBUGREGS:
   4269	case KVM_CAP_X86_ROBUST_SINGLESTEP:
   4270	case KVM_CAP_XSAVE:
   4271	case KVM_CAP_ASYNC_PF:
   4272	case KVM_CAP_ASYNC_PF_INT:
   4273	case KVM_CAP_GET_TSC_KHZ:
   4274	case KVM_CAP_KVMCLOCK_CTRL:
   4275	case KVM_CAP_READONLY_MEM:
   4276	case KVM_CAP_HYPERV_TIME:
   4277	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
   4278	case KVM_CAP_TSC_DEADLINE_TIMER:
   4279	case KVM_CAP_DISABLE_QUIRKS:
   4280	case KVM_CAP_SET_BOOT_CPU_ID:
   4281	case KVM_CAP_SPLIT_IRQCHIP:
   4282	case KVM_CAP_IMMEDIATE_EXIT:
   4283	case KVM_CAP_PMU_EVENT_FILTER:
   4284	case KVM_CAP_GET_MSR_FEATURES:
   4285	case KVM_CAP_MSR_PLATFORM_INFO:
   4286	case KVM_CAP_EXCEPTION_PAYLOAD:
   4287	case KVM_CAP_SET_GUEST_DEBUG:
   4288	case KVM_CAP_LAST_CPU:
   4289	case KVM_CAP_X86_USER_SPACE_MSR:
   4290	case KVM_CAP_X86_MSR_FILTER:
   4291	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
   4292#ifdef CONFIG_X86_SGX_KVM
   4293	case KVM_CAP_SGX_ATTRIBUTE:
   4294#endif
   4295	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
   4296	case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
   4297	case KVM_CAP_SREGS2:
   4298	case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
   4299	case KVM_CAP_VCPU_ATTRIBUTES:
   4300	case KVM_CAP_SYS_ATTRIBUTES:
   4301	case KVM_CAP_VAPIC:
   4302	case KVM_CAP_ENABLE_CAP:
   4303		r = 1;
   4304		break;
   4305	case KVM_CAP_EXIT_HYPERCALL:
   4306		r = KVM_EXIT_HYPERCALL_VALID_MASK;
   4307		break;
   4308	case KVM_CAP_SET_GUEST_DEBUG2:
   4309		return KVM_GUESTDBG_VALID_MASK;
   4310#ifdef CONFIG_KVM_XEN
   4311	case KVM_CAP_XEN_HVM:
   4312		r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
   4313		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
   4314		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
   4315		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
   4316		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
   4317		if (sched_info_on())
   4318			r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
   4319		break;
   4320#endif
   4321	case KVM_CAP_SYNC_REGS:
   4322		r = KVM_SYNC_X86_VALID_FIELDS;
   4323		break;
   4324	case KVM_CAP_ADJUST_CLOCK:
   4325		r = KVM_CLOCK_VALID_FLAGS;
   4326		break;
   4327	case KVM_CAP_X86_DISABLE_EXITS:
   4328		r |=  KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
   4329		      KVM_X86_DISABLE_EXITS_CSTATE;
   4330		if (kvm_can_mwait_in_guest())
   4331			r |= KVM_X86_DISABLE_EXITS_MWAIT;
   4332		break;
   4333	case KVM_CAP_X86_SMM:
   4334		/* SMBASE is usually relocated above 1M on modern chipsets,
   4335		 * and SMM handlers might indeed rely on 4G segment limits,
   4336		 * so do not report SMM to be available if real mode is
   4337		 * emulated via vm86 mode.  Still, do not go to great lengths
   4338		 * to avoid userspace's usage of the feature, because it is a
   4339		 * fringe case that is not enabled except via specific settings
   4340		 * of the module parameters.
   4341		 */
   4342		r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
   4343		break;
   4344	case KVM_CAP_NR_VCPUS:
   4345		r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
   4346		break;
   4347	case KVM_CAP_MAX_VCPUS:
   4348		r = KVM_MAX_VCPUS;
   4349		break;
   4350	case KVM_CAP_MAX_VCPU_ID:
   4351		r = KVM_MAX_VCPU_IDS;
   4352		break;
   4353	case KVM_CAP_PV_MMU:	/* obsolete */
   4354		r = 0;
   4355		break;
   4356	case KVM_CAP_MCE:
   4357		r = KVM_MAX_MCE_BANKS;
   4358		break;
   4359	case KVM_CAP_XCRS:
   4360		r = boot_cpu_has(X86_FEATURE_XSAVE);
   4361		break;
   4362	case KVM_CAP_TSC_CONTROL:
   4363	case KVM_CAP_VM_TSC_CONTROL:
   4364		r = kvm_has_tsc_control;
   4365		break;
   4366	case KVM_CAP_X2APIC_API:
   4367		r = KVM_X2APIC_API_VALID_FLAGS;
   4368		break;
   4369	case KVM_CAP_NESTED_STATE:
   4370		r = kvm_x86_ops.nested_ops->get_state ?
   4371			kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
   4372		break;
   4373	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
   4374		r = kvm_x86_ops.enable_direct_tlbflush != NULL;
   4375		break;
   4376	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
   4377		r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
   4378		break;
   4379	case KVM_CAP_SMALLER_MAXPHYADDR:
   4380		r = (int) allow_smaller_maxphyaddr;
   4381		break;
   4382	case KVM_CAP_STEAL_TIME:
   4383		r = sched_info_on();
   4384		break;
   4385	case KVM_CAP_X86_BUS_LOCK_EXIT:
   4386		if (kvm_has_bus_lock_exit)
   4387			r = KVM_BUS_LOCK_DETECTION_OFF |
   4388			    KVM_BUS_LOCK_DETECTION_EXIT;
   4389		else
   4390			r = 0;
   4391		break;
   4392	case KVM_CAP_XSAVE2: {
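       		/*
       		 * Report the buffer size userspace must pass to
       		 * KVM_GET_XSAVE2, derived from the xstate features the
       		 * guest is permitted to use, but never smaller than the
       		 * legacy struct kvm_xsave.
       		 */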
   4393		u64 guest_perm = xstate_get_guest_group_perm();
   4394
   4395		r = xstate_required_size(supported_xcr0 & guest_perm, false);
   4396		if (r < sizeof(struct kvm_xsave))
   4397			r = sizeof(struct kvm_xsave);
   4398		break;
   4399	}
   4400	case KVM_CAP_PMU_CAPABILITY:
   4401		r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
   4402		break;
   4403	case KVM_CAP_DISABLE_QUIRKS2:
   4404		r = KVM_X86_VALID_QUIRKS;
   4405		break;
   4406	default:
   4407		break;
   4408	}
   4409	return r;
   4410}
   4411
   4412static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
   4413{
   4414	void __user *uaddr = (void __user *)(unsigned long)attr->addr;
   4415
   4416	if ((u64)(unsigned long)uaddr != attr->addr)
   4417		return ERR_PTR_USR(-EFAULT);
   4418	return uaddr;
   4419}
   4420
   4421static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
   4422{
   4423	u64 __user *uaddr = kvm_get_attr_addr(attr);
   4424
   4425	if (attr->group)
   4426		return -ENXIO;
   4427
   4428	if (IS_ERR(uaddr))
   4429		return PTR_ERR(uaddr);
   4430
   4431	switch (attr->attr) {
   4432	case KVM_X86_XCOMP_GUEST_SUPP:
   4433		if (put_user(supported_xcr0, uaddr))
   4434			return -EFAULT;
   4435		return 0;
   4436	default:
   4437		return -ENXIO;
   4438		break;
   4439	}
   4440}
   4441
   4442static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
   4443{
   4444	if (attr->group)
   4445		return -ENXIO;
   4446
   4447	switch (attr->attr) {
   4448	case KVM_X86_XCOMP_GUEST_SUPP:
   4449		return 0;
   4450	default:
   4451		return -ENXIO;
   4452	}
   4453}
   4454
   4455long kvm_arch_dev_ioctl(struct file *filp,
   4456			unsigned int ioctl, unsigned long arg)
   4457{
   4458	void __user *argp = (void __user *)arg;
   4459	long r;
   4460
   4461	switch (ioctl) {
   4462	case KVM_GET_MSR_INDEX_LIST: {
   4463		struct kvm_msr_list __user *user_msr_list = argp;
   4464		struct kvm_msr_list msr_list;
   4465		unsigned n;
   4466
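       		/*
       		 * Userspace typically calls this twice: the first call
       		 * learns the required count (nmsrs is written back and
       		 * -E2BIG returned if the buffer was too small), the
       		 * second call retrieves the actual indices.
       		 */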
   4467		r = -EFAULT;
   4468		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
   4469			goto out;
   4470		n = msr_list.nmsrs;
   4471		msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
   4472		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
   4473			goto out;
   4474		r = -E2BIG;
   4475		if (n < msr_list.nmsrs)
   4476			goto out;
   4477		r = -EFAULT;
   4478		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
   4479				 num_msrs_to_save * sizeof(u32)))
   4480			goto out;
   4481		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
   4482				 &emulated_msrs,
   4483				 num_emulated_msrs * sizeof(u32)))
   4484			goto out;
   4485		r = 0;
   4486		break;
   4487	}
   4488	case KVM_GET_SUPPORTED_CPUID:
   4489	case KVM_GET_EMULATED_CPUID: {
   4490		struct kvm_cpuid2 __user *cpuid_arg = argp;
   4491		struct kvm_cpuid2 cpuid;
   4492
   4493		r = -EFAULT;
   4494		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
   4495			goto out;
   4496
   4497		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
   4498					    ioctl);
   4499		if (r)
   4500			goto out;
   4501
   4502		r = -EFAULT;
   4503		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
   4504			goto out;
   4505		r = 0;
   4506		break;
   4507	}
   4508	case KVM_X86_GET_MCE_CAP_SUPPORTED:
   4509		r = -EFAULT;
   4510		if (copy_to_user(argp, &kvm_mce_cap_supported,
   4511				 sizeof(kvm_mce_cap_supported)))
   4512			goto out;
   4513		r = 0;
   4514		break;
   4515	case KVM_GET_MSR_FEATURE_INDEX_LIST: {
   4516		struct kvm_msr_list __user *user_msr_list = argp;
   4517		struct kvm_msr_list msr_list;
   4518		unsigned int n;
   4519
   4520		r = -EFAULT;
   4521		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
   4522			goto out;
   4523		n = msr_list.nmsrs;
   4524		msr_list.nmsrs = num_msr_based_features;
   4525		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
   4526			goto out;
   4527		r = -E2BIG;
   4528		if (n < msr_list.nmsrs)
   4529			goto out;
   4530		r = -EFAULT;
   4531		if (copy_to_user(user_msr_list->indices, &msr_based_features,
   4532				 num_msr_based_features * sizeof(u32)))
   4533			goto out;
   4534		r = 0;
   4535		break;
   4536	}
   4537	case KVM_GET_MSRS:
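       		/*
       		 * On the system (/dev/kvm) fd, KVM_GET_MSRS reports the
       		 * values of feature MSRs rather than any vCPU state.
       		 */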
   4538		r = msr_io(NULL, argp, do_get_msr_feature, 1);
   4539		break;
   4540	case KVM_GET_SUPPORTED_HV_CPUID:
   4541		r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
   4542		break;
   4543	case KVM_GET_DEVICE_ATTR: {
   4544		struct kvm_device_attr attr;
   4545		r = -EFAULT;
   4546		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
   4547			break;
   4548		r = kvm_x86_dev_get_attr(&attr);
   4549		break;
   4550	}
   4551	case KVM_HAS_DEVICE_ATTR: {
   4552		struct kvm_device_attr attr;
   4553		r = -EFAULT;
   4554		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
   4555			break;
   4556		r = kvm_x86_dev_has_attr(&attr);
   4557		break;
   4558	}
   4559	default:
   4560		r = -EINVAL;
   4561		break;
   4562	}
   4563out:
   4564	return r;
   4565}
   4566
   4567static void wbinvd_ipi(void *garbage)
   4568{
   4569	wbinvd();
   4570}
   4571
   4572static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
   4573{
   4574	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
   4575}
   4576
   4577void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
   4578{
   4579	/* Address WBINVD may be executed by guest */
   4580	if (need_emulate_wbinvd(vcpu)) {
   4581		if (static_call(kvm_x86_has_wbinvd_exit)())
   4582			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
   4583		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
   4584			smp_call_function_single(vcpu->cpu,
   4585					wbinvd_ipi, NULL, 1);
   4586	}
   4587
   4588	static_call(kvm_x86_vcpu_load)(vcpu, cpu);
   4589
   4590	/* Save host pkru register if supported */
   4591	vcpu->arch.host_pkru = read_pkru();
   4592
   4593	/* Apply any externally detected TSC adjustments (due to suspend) */
   4594	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
   4595		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
   4596		vcpu->arch.tsc_offset_adjustment = 0;
   4597		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
   4598	}
   4599
   4600	if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
   4601		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
   4602				rdtsc() - vcpu->arch.last_host_tsc;
   4603		if (tsc_delta < 0)
   4604			mark_tsc_unstable("KVM discovered backwards TSC");
   4605
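       		/*
       		 * On a host with an unstable TSC, recompute the offset so
       		 * the guest TSC resumes from the last value it observed
       		 * rather than jumping, and let catch-up mode absorb the
       		 * difference over time.
       		 */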
   4606		if (kvm_check_tsc_unstable()) {
   4607			u64 offset = kvm_compute_l1_tsc_offset(vcpu,
   4608						vcpu->arch.last_guest_tsc);
   4609			kvm_vcpu_write_tsc_offset(vcpu, offset);
   4610			vcpu->arch.tsc_catchup = 1;
   4611		}
   4612
   4613		if (kvm_lapic_hv_timer_in_use(vcpu))
   4614			kvm_lapic_restart_hv_timer(vcpu);
   4615
   4616		/*
   4617		 * On a host with synchronized TSC, there is no need to update
   4618		 * kvmclock on vcpu->cpu migration
   4619		 */
   4620		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
   4621			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
   4622		if (vcpu->cpu != cpu)
   4623			kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
   4624		vcpu->cpu = cpu;
   4625	}
   4626
   4627	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
   4628}
   4629
   4630static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
   4631{
   4632	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
   4633	struct kvm_steal_time __user *st;
   4634	struct kvm_memslots *slots;
   4635	static const u8 preempted = KVM_VCPU_PREEMPTED;
   4636
   4637	/*
   4638	 * The vCPU can be marked preempted if and only if the VM-Exit was on
   4639	 * an instruction boundary and will not trigger guest emulation of any
   4640	 * kind (see vcpu_run).  Vendor specific code controls (conservatively)
   4641	 * when this is true, for example allowing the vCPU to be marked
   4642	 * preempted if and only if the VM-Exit was due to a host interrupt.
   4643	 */
   4644	if (!vcpu->arch.at_instruction_boundary) {
   4645		vcpu->stat.preemption_other++;
   4646		return;
   4647	}
   4648
   4649	vcpu->stat.preemption_reported++;
   4650	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
   4651		return;
   4652
   4653	if (vcpu->arch.st.preempted)
   4654		return;
   4655
   4656	/* This happens on process exit */
   4657	if (unlikely(current->mm != vcpu->kvm->mm))
   4658		return;
   4659
   4660	slots = kvm_memslots(vcpu->kvm);
   4661
   4662	if (unlikely(slots->generation != ghc->generation ||
   4663		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
   4664		return;
   4665
   4666	st = (struct kvm_steal_time __user *)ghc->hva;
   4667	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
   4668
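       	/*
       	 * This runs with preemption disabled (sched-out / vcpu_put), so
       	 * use a nofault copy that must not page in the user mapping.
       	 */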
   4669	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
   4670		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
   4671
   4672	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
   4673}
   4674
   4675void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
   4676{
   4677	int idx;
   4678
   4679	if (vcpu->preempted) {
   4680		if (!vcpu->arch.guest_state_protected)
   4681			vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
   4682
   4683		/*
   4684		 * Take the srcu lock as memslots will be accessed to check the gfn
   4685		 * cache generation against the memslots generation.
   4686		 */
   4687		idx = srcu_read_lock(&vcpu->kvm->srcu);
   4688		if (kvm_xen_msr_enabled(vcpu->kvm))
   4689			kvm_xen_runstate_set_preempted(vcpu);
   4690		else
   4691			kvm_steal_time_set_preempted(vcpu);
   4692		srcu_read_unlock(&vcpu->kvm->srcu, idx);
   4693	}
   4694
   4695	static_call(kvm_x86_vcpu_put)(vcpu);
   4696	vcpu->arch.last_host_tsc = rdtsc();
   4697}
   4698
   4699static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
   4700				    struct kvm_lapic_state *s)
   4701{
   4702	static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
   4703
   4704	return kvm_apic_get_state(vcpu, s);
   4705}
   4706
   4707static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
   4708				    struct kvm_lapic_state *s)
   4709{
   4710	int r;
   4711
   4712	r = kvm_apic_set_state(vcpu, s);
   4713	if (r)
   4714		return r;
   4715	update_cr8_intercept(vcpu);
   4716
   4717	return 0;
   4718}
   4719
   4720static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
   4721{
   4722	/*
   4723	 * We can accept userspace's request for interrupt injection
   4724	 * as long as we have a place to store the interrupt number.
   4725	 * The actual injection will happen when the CPU is able to
   4726	 * deliver the interrupt.
   4727	 */
   4728	if (kvm_cpu_has_extint(vcpu))
   4729		return false;
   4730
   4731	/* Acknowledging ExtINT does not happen if LINT0 is masked.  */
   4732	return (!lapic_in_kernel(vcpu) ||
   4733		kvm_apic_accept_pic_intr(vcpu));
   4734}
   4735
   4736static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
   4737{
   4738	/*
   4739	 * Do not cause an interrupt window exit if an exception
   4740	 * is pending or an event needs reinjection; userspace
   4741	 * might want to inject the interrupt manually using KVM_SET_REGS
   4742	 * or KVM_SET_SREGS.  For that to work, we must be at an
   4743	 * instruction boundary and with no events half-injected.
   4744	 */
   4745	return (kvm_arch_interrupt_allowed(vcpu) &&
   4746		kvm_cpu_accept_dm_intr(vcpu) &&
   4747		!kvm_event_needs_reinjection(vcpu) &&
   4748		!vcpu->arch.exception.pending);
   4749}
   4750
   4751static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
   4752				    struct kvm_interrupt *irq)
   4753{
   4754	if (irq->irq >= KVM_NR_INTERRUPTS)
   4755		return -EINVAL;
   4756
   4757	if (!irqchip_in_kernel(vcpu->kvm)) {
   4758		kvm_queue_interrupt(vcpu, irq->irq, false);
   4759		kvm_make_request(KVM_REQ_EVENT, vcpu);
   4760		return 0;
   4761	}
   4762
   4763	/*
   4764	 * With in-kernel LAPIC, we only use this to inject EXTINT, so
   4765	 * fail for in-kernel 8259.
   4766	 */
   4767	if (pic_in_kernel(vcpu->kvm))
   4768		return -ENXIO;
   4769
   4770	if (vcpu->arch.pending_external_vector != -1)
   4771		return -EEXIST;
   4772
   4773	vcpu->arch.pending_external_vector = irq->irq;
   4774	kvm_make_request(KVM_REQ_EVENT, vcpu);
   4775	return 0;
   4776}
   4777
   4778static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
   4779{
   4780	kvm_inject_nmi(vcpu);
   4781
   4782	return 0;
   4783}
   4784
   4785static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
   4786{
   4787	kvm_make_request(KVM_REQ_SMI, vcpu);
   4788
   4789	return 0;
   4790}
   4791
   4792static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
   4793					   struct kvm_tpr_access_ctl *tac)
   4794{
   4795	if (tac->flags)
   4796		return -EINVAL;
   4797	vcpu->arch.tpr_access_reporting = !!tac->enabled;
   4798	return 0;
   4799}
   4800
   4801static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
   4802					u64 mcg_cap)
   4803{
   4804	int r;
   4805	unsigned bank_num = mcg_cap & 0xff, bank;
   4806
   4807	r = -EINVAL;
   4808	if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
   4809		goto out;
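       	/*
       	 * Beyond the feature bits KVM supports, the bank count
       	 * (bits 7:0) and the extended-register count (bits 23:16)
       	 * may be set freely by userspace.
       	 */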
   4810	if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
   4811		goto out;
   4812	r = 0;
   4813	vcpu->arch.mcg_cap = mcg_cap;
   4814	/* Init IA32_MCG_CTL to all 1s */
   4815	if (mcg_cap & MCG_CTL_P)
   4816		vcpu->arch.mcg_ctl = ~(u64)0;
   4817	/* Init IA32_MCi_CTL to all 1s */
   4818	for (bank = 0; bank < bank_num; bank++)
   4819		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
   4820
   4821	static_call(kvm_x86_setup_mce)(vcpu);
   4822out:
   4823	return r;
   4824}
   4825
   4826static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
   4827				      struct kvm_x86_mce *mce)
   4828{
   4829	u64 mcg_cap = vcpu->arch.mcg_cap;
   4830	unsigned bank_num = mcg_cap & 0xff;
   4831	u64 *banks = vcpu->arch.mce_banks;
   4832
   4833	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
   4834		return -EINVAL;
   4835	/*
   4836	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
   4837	 * reporting is disabled
   4838	 */
   4839	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
   4840	    vcpu->arch.mcg_ctl != ~(u64)0)
   4841		return 0;
   4842	banks += 4 * mce->bank;
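       	/* Each bank stores CTL, STATUS, ADDR and MISC at banks[0..3]. */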
   4843	/*
   4844	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
   4845	 * reporting is disabled for the bank
   4846	 */
   4847	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
   4848		return 0;
   4849	if (mce->status & MCI_STATUS_UC) {
   4850		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
   4851		    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
   4852			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
   4853			return 0;
   4854		}
   4855		if (banks[1] & MCI_STATUS_VAL)
   4856			mce->status |= MCI_STATUS_OVER;
   4857		banks[2] = mce->addr;
   4858		banks[3] = mce->misc;
   4859		vcpu->arch.mcg_status = mce->mcg_status;
   4860		banks[1] = mce->status;
   4861		kvm_queue_exception(vcpu, MC_VECTOR);
   4862	} else if (!(banks[1] & MCI_STATUS_VAL)
   4863		   || !(banks[1] & MCI_STATUS_UC)) {
   4864		if (banks[1] & MCI_STATUS_VAL)
   4865			mce->status |= MCI_STATUS_OVER;
   4866		banks[2] = mce->addr;
   4867		banks[3] = mce->misc;
   4868		banks[1] = mce->status;
   4869	} else
   4870		banks[1] |= MCI_STATUS_OVER;
   4871	return 0;
   4872}
   4873
   4874static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
   4875					       struct kvm_vcpu_events *events)
   4876{
   4877	process_nmi(vcpu);
   4878
   4879	if (kvm_check_request(KVM_REQ_SMI, vcpu))
   4880		process_smi(vcpu);
   4881
   4882	/*
   4883	 * In guest mode, payload delivery should be deferred,
   4884	 * so that the L1 hypervisor can intercept #PF before
   4885	 * CR2 is modified (or intercept #DB before DR6 is
   4886	 * modified under nVMX). Unless the per-VM capability
   4887	 * KVM_CAP_EXCEPTION_PAYLOAD is set, we must not defer an exception
   4888	 * payload past a KVM_GET_VCPU_EVENTS. Since the payload is deferred
   4889	 * opportunistically, deliver it now if the capability hasn't been
   4890	 * requested before processing this
   4891	 * KVM_GET_VCPU_EVENTS.
   4892	 */
   4893	if (!vcpu->kvm->arch.exception_payload_enabled &&
   4894	    vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
   4895		kvm_deliver_exception_payload(vcpu);
   4896
   4897	/*
   4898	 * The API doesn't provide the instruction length for software
   4899	 * exceptions, so don't report them. As long as the guest RIP
   4900	 * isn't advanced, we should expect to encounter the exception
   4901	 * again.
   4902	 */
   4903	if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
   4904		events->exception.injected = 0;
   4905		events->exception.pending = 0;
   4906	} else {
   4907		events->exception.injected = vcpu->arch.exception.injected;
   4908		events->exception.pending = vcpu->arch.exception.pending;
   4909		/*
   4910		 * For ABI compatibility, deliberately conflate
   4911		 * pending and injected exceptions when
   4912		 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
   4913		 */
   4914		if (!vcpu->kvm->arch.exception_payload_enabled)
   4915			events->exception.injected |=
   4916				vcpu->arch.exception.pending;
   4917	}
   4918	events->exception.nr = vcpu->arch.exception.nr;
   4919	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
   4920	events->exception.error_code = vcpu->arch.exception.error_code;
   4921	events->exception_has_payload = vcpu->arch.exception.has_payload;
   4922	events->exception_payload = vcpu->arch.exception.payload;
   4923
   4924	events->interrupt.injected =
   4925		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
   4926	events->interrupt.nr = vcpu->arch.interrupt.nr;
   4927	events->interrupt.soft = 0;
   4928	events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
   4929
   4930	events->nmi.injected = vcpu->arch.nmi_injected;
   4931	events->nmi.pending = vcpu->arch.nmi_pending != 0;
   4932	events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
   4933	events->nmi.pad = 0;
   4934
   4935	events->sipi_vector = 0; /* never valid when reporting to user space */
   4936
   4937	events->smi.smm = is_smm(vcpu);
   4938	events->smi.pending = vcpu->arch.smi_pending;
   4939	events->smi.smm_inside_nmi =
   4940		!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
   4941	events->smi.latched_init = kvm_lapic_latched_init(vcpu);
   4942
   4943	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
   4944			 | KVM_VCPUEVENT_VALID_SHADOW
   4945			 | KVM_VCPUEVENT_VALID_SMM);
   4946	if (vcpu->kvm->arch.exception_payload_enabled)
   4947		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
   4948
   4949	memset(&events->reserved, 0, sizeof(events->reserved));
   4950}
   4951
   4952static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
   4953
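        /*
         * Apply @events for KVM_SET_VCPU_EVENTS.  Unknown flag bits, exception
         * vectors above 31 (or NMI_VECTOR), and SMM state that conflicts with
         * a latched INIT are rejected.  Optional pieces (NMI pending, SIPI
         * vector, interrupt shadow, SMM state, exception payload) are only
         * applied when the corresponding KVM_VCPUEVENT_VALID_* flag is set.
         */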
   4954static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
   4955					      struct kvm_vcpu_events *events)
   4956{
   4957	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
   4958			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
   4959			      | KVM_VCPUEVENT_VALID_SHADOW
   4960			      | KVM_VCPUEVENT_VALID_SMM
   4961			      | KVM_VCPUEVENT_VALID_PAYLOAD))
   4962		return -EINVAL;
   4963
   4964	if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
   4965		if (!vcpu->kvm->arch.exception_payload_enabled)
   4966			return -EINVAL;
   4967		if (events->exception.pending)
   4968			events->exception.injected = 0;
   4969		else
   4970			events->exception_has_payload = 0;
   4971	} else {
   4972		events->exception.pending = 0;
   4973		events->exception_has_payload = 0;
   4974	}
   4975
   4976	if ((events->exception.injected || events->exception.pending) &&
   4977	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
   4978		return -EINVAL;
   4979
   4980	/* INITs are latched while in SMM */
   4981	if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
   4982	    (events->smi.smm || events->smi.pending) &&
   4983	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
   4984		return -EINVAL;
   4985
   4986	process_nmi(vcpu);
   4987	vcpu->arch.exception.injected = events->exception.injected;
   4988	vcpu->arch.exception.pending = events->exception.pending;
   4989	vcpu->arch.exception.nr = events->exception.nr;
   4990	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
   4991	vcpu->arch.exception.error_code = events->exception.error_code;
   4992	vcpu->arch.exception.has_payload = events->exception_has_payload;
   4993	vcpu->arch.exception.payload = events->exception_payload;
   4994
   4995	vcpu->arch.interrupt.injected = events->interrupt.injected;
   4996	vcpu->arch.interrupt.nr = events->interrupt.nr;
   4997	vcpu->arch.interrupt.soft = events->interrupt.soft;
   4998	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
   4999		static_call(kvm_x86_set_interrupt_shadow)(vcpu,
   5000						events->interrupt.shadow);
   5001
   5002	vcpu->arch.nmi_injected = events->nmi.injected;
   5003	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
   5004		vcpu->arch.nmi_pending = events->nmi.pending;
   5005	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
   5006
   5007	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
   5008	    lapic_in_kernel(vcpu))
   5009		vcpu->arch.apic->sipi_vector = events->sipi_vector;
   5010
   5011	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
   5012		if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
   5013			kvm_x86_ops.nested_ops->leave_nested(vcpu);
   5014			kvm_smm_changed(vcpu, events->smi.smm);
   5015		}
   5016
   5017		vcpu->arch.smi_pending = events->smi.pending;
   5018
   5019		if (events->smi.smm) {
   5020			if (events->smi.smm_inside_nmi)
   5021				vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
   5022			else
   5023				vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
   5024		}
   5025
   5026		if (lapic_in_kernel(vcpu)) {
   5027			if (events->smi.latched_init)
   5028				set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
   5029			else
   5030				clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
   5031		}
   5032	}
   5033
   5034	kvm_make_request(KVM_REQ_EVENT, vcpu);
   5035
   5036	return 0;
   5037}
   5038
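        /*
         * KVM_GET_DEBUGREGS / KVM_SET_DEBUGREGS helpers: copy DR0-DR3, DR6 and
         * DR7 between the vCPU and the userspace kvm_debugregs structure.  The
         * setter validates DR6/DR7 and rejects any use of the flags field.
         */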
   5039static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
   5040					     struct kvm_debugregs *dbgregs)
   5041{
   5042	unsigned long val;
   5043
   5044	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
   5045	kvm_get_dr(vcpu, 6, &val);
   5046	dbgregs->dr6 = val;
   5047	dbgregs->dr7 = vcpu->arch.dr7;
   5048	dbgregs->flags = 0;
   5049	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
   5050}
   5051
   5052static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
   5053					    struct kvm_debugregs *dbgregs)
   5054{
   5055	if (dbgregs->flags)
   5056		return -EINVAL;
   5057
   5058	if (!kvm_dr6_valid(dbgregs->dr6))
   5059		return -EINVAL;
   5060	if (!kvm_dr7_valid(dbgregs->dr7))
   5061		return -EINVAL;
   5062
   5063	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
   5064	kvm_update_dr0123(vcpu);
   5065	vcpu->arch.dr6 = dbgregs->dr6;
   5066	vcpu->arch.dr7 = dbgregs->dr7;
   5067	kvm_update_dr7(vcpu);
   5068
   5069	return 0;
   5070}
   5071
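        /*
         * KVM_GET_XSAVE, KVM_GET_XSAVE2 and KVM_SET_XSAVE helpers: copy the
         * guest FPU/XSAVE area to or from the userspace uABI buffer.  When the
         * guest FPU state is confidential (e.g. SEV-ES guests, whose register
         * state lives in the encrypted VMSA), the copy is silently skipped.
         */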
   5072static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
   5073					 struct kvm_xsave *guest_xsave)
   5074{
   5075	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
   5076		return;
   5077
   5078	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
   5079				       guest_xsave->region,
   5080				       sizeof(guest_xsave->region),
   5081				       vcpu->arch.pkru);
   5082}
   5083
   5084static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
   5085					  u8 *state, unsigned int size)
   5086{
   5087	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
   5088		return;
   5089
   5090	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
   5091				       state, size, vcpu->arch.pkru);
   5092}
   5093
   5094static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
   5095					struct kvm_xsave *guest_xsave)
   5096{
   5097	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
   5098		return 0;
   5099
   5100	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
   5101					      guest_xsave->region,
   5102					      supported_xcr0, &vcpu->arch.pkru);
   5103}
   5104
   5105static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
   5106					struct kvm_xcrs *guest_xcrs)
   5107{
   5108	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
   5109		guest_xcrs->nr_xcrs = 0;
   5110		return;
   5111	}
   5112
   5113	guest_xcrs->nr_xcrs = 1;
   5114	guest_xcrs->flags = 0;
   5115	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
   5116	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
   5117}
   5118
   5119static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
   5120				       struct kvm_xcrs *guest_xcrs)
   5121{
   5122	int i, r = 0;
   5123
   5124	if (!boot_cpu_has(X86_FEATURE_XSAVE))
   5125		return -EINVAL;
   5126
   5127	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
   5128		return -EINVAL;
   5129
   5130	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
   5131		/* Only support XCR0 currently */
   5132		if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
   5133			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
   5134				guest_xcrs->xcrs[i].value);
   5135			break;
   5136		}
   5137	if (r)
   5138		r = -EINVAL;
   5139	return r;
   5140}
   5141
    5142/*
    5143 * kvm_set_guest_paused() indicates to the guest kernel that it has been
    5144 * stopped by the hypervisor.  This function is called from the host only.
    5145 * -EINVAL is returned when the host attempts to set the flag for a guest
    5146 * that does not support pv clocks.
    5147 */
   5148static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
   5149{
   5150	if (!vcpu->arch.pv_time.active)
   5151		return -EINVAL;
   5152	vcpu->arch.pvclock_set_guest_stopped_request = true;
   5153	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
   5154	return 0;
   5155}
   5156
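        /*
         * Handlers for the KVM_VCPU_TSC_CTRL device-attribute group, reached
         * via the KVM_{HAS,GET,SET}_DEVICE_ATTR vCPU ioctls below.  The only
         * attribute is KVM_VCPU_TSC_OFFSET, which reads or writes the vCPU's
         * L1 TSC offset; writes go through __kvm_synchronize_tsc() so that
         * offsets written for several vCPUs in close succession are matched.
         */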
   5157static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
   5158				 struct kvm_device_attr *attr)
   5159{
   5160	int r;
   5161
   5162	switch (attr->attr) {
   5163	case KVM_VCPU_TSC_OFFSET:
   5164		r = 0;
   5165		break;
   5166	default:
   5167		r = -ENXIO;
   5168	}
   5169
   5170	return r;
   5171}
   5172
   5173static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
   5174				 struct kvm_device_attr *attr)
   5175{
   5176	u64 __user *uaddr = kvm_get_attr_addr(attr);
   5177	int r;
   5178
   5179	if (IS_ERR(uaddr))
   5180		return PTR_ERR(uaddr);
   5181
   5182	switch (attr->attr) {
   5183	case KVM_VCPU_TSC_OFFSET:
   5184		r = -EFAULT;
   5185		if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
   5186			break;
   5187		r = 0;
   5188		break;
   5189	default:
   5190		r = -ENXIO;
   5191	}
   5192
   5193	return r;
   5194}
   5195
   5196static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
   5197				 struct kvm_device_attr *attr)
   5198{
   5199	u64 __user *uaddr = kvm_get_attr_addr(attr);
   5200	struct kvm *kvm = vcpu->kvm;
   5201	int r;
   5202
   5203	if (IS_ERR(uaddr))
   5204		return PTR_ERR(uaddr);
   5205
   5206	switch (attr->attr) {
   5207	case KVM_VCPU_TSC_OFFSET: {
   5208		u64 offset, tsc, ns;
   5209		unsigned long flags;
   5210		bool matched;
   5211
   5212		r = -EFAULT;
   5213		if (get_user(offset, uaddr))
   5214			break;
   5215
   5216		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
   5217
   5218		matched = (vcpu->arch.virtual_tsc_khz &&
   5219			   kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
   5220			   kvm->arch.last_tsc_offset == offset);
   5221
   5222		tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
   5223		ns = get_kvmclock_base_ns();
   5224
   5225		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
   5226		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
   5227
   5228		r = 0;
   5229		break;
   5230	}
   5231	default:
   5232		r = -ENXIO;
   5233	}
   5234
   5235	return r;
   5236}
   5237
   5238static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
   5239				      unsigned int ioctl,
   5240				      void __user *argp)
   5241{
   5242	struct kvm_device_attr attr;
   5243	int r;
   5244
   5245	if (copy_from_user(&attr, argp, sizeof(attr)))
   5246		return -EFAULT;
   5247
   5248	if (attr.group != KVM_VCPU_TSC_CTRL)
   5249		return -ENXIO;
   5250
   5251	switch (ioctl) {
   5252	case KVM_HAS_DEVICE_ATTR:
   5253		r = kvm_arch_tsc_has_attr(vcpu, &attr);
   5254		break;
   5255	case KVM_GET_DEVICE_ATTR:
   5256		r = kvm_arch_tsc_get_attr(vcpu, &attr);
   5257		break;
   5258	case KVM_SET_DEVICE_ATTR:
   5259		r = kvm_arch_tsc_set_attr(vcpu, &attr);
   5260		break;
   5261	}
   5262
   5263	return r;
   5264}
   5265
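        /*
         * Enable a per-vCPU capability (KVM_ENABLE_CAP on a vCPU fd);
         * cap->flags must be zero.  Covers the Hyper-V SynIC, enlightened
         * VMCS, direct TLB flush and enforced CPUID capabilities, as well as
         * enforcement of the KVM paravirt CPUID features.
         */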
   5266static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
   5267				     struct kvm_enable_cap *cap)
   5268{
   5269	int r;
   5270	uint16_t vmcs_version;
   5271	void __user *user_ptr;
   5272
   5273	if (cap->flags)
   5274		return -EINVAL;
   5275
   5276	switch (cap->cap) {
   5277	case KVM_CAP_HYPERV_SYNIC2:
   5278		if (cap->args[0])
   5279			return -EINVAL;
   5280		fallthrough;
   5281
   5282	case KVM_CAP_HYPERV_SYNIC:
   5283		if (!irqchip_in_kernel(vcpu->kvm))
   5284			return -EINVAL;
   5285		return kvm_hv_activate_synic(vcpu, cap->cap ==
   5286					     KVM_CAP_HYPERV_SYNIC2);
   5287	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
   5288		if (!kvm_x86_ops.nested_ops->enable_evmcs)
   5289			return -ENOTTY;
   5290		r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
   5291		if (!r) {
   5292			user_ptr = (void __user *)(uintptr_t)cap->args[0];
   5293			if (copy_to_user(user_ptr, &vmcs_version,
   5294					 sizeof(vmcs_version)))
   5295				r = -EFAULT;
   5296		}
   5297		return r;
   5298	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
   5299		if (!kvm_x86_ops.enable_direct_tlbflush)
   5300			return -ENOTTY;
   5301
   5302		return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
   5303
   5304	case KVM_CAP_HYPERV_ENFORCE_CPUID:
   5305		return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
   5306
   5307	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
   5308		vcpu->arch.pv_cpuid.enforce = cap->args[0];
   5309		if (vcpu->arch.pv_cpuid.enforce)
   5310			kvm_update_pv_runtime(vcpu);
   5311
   5312		return 0;
   5313	default:
   5314		return -EINVAL;
   5315	}
   5316}
   5317
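        /*
         * x86 dispatcher for vCPU ioctls not handled by the generic
         * kvm_vcpu_ioctl().  The vCPU is loaded for the duration of the call.
         *
         * Illustrative userspace sketch (not part of this file; assumes
         * vcpu_fd was obtained via KVM_CREATE_VCPU and <linux/kvm.h>,
         * <sys/ioctl.h> and <err.h> are included):
         *
         *	struct kvm_vcpu_events ev;
         *
         *	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &ev) < 0)
         *		err(1, "KVM_GET_VCPU_EVENTS");
         *	ev.flags = KVM_VCPUEVENT_VALID_NMI_PENDING;
         *	ev.nmi.pending = 1;
         *	if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &ev) < 0)
         *		err(1, "KVM_SET_VCPU_EVENTS");
         */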
   5318long kvm_arch_vcpu_ioctl(struct file *filp,
   5319			 unsigned int ioctl, unsigned long arg)
   5320{
   5321	struct kvm_vcpu *vcpu = filp->private_data;
   5322	void __user *argp = (void __user *)arg;
   5323	int r;
   5324	union {
   5325		struct kvm_sregs2 *sregs2;
   5326		struct kvm_lapic_state *lapic;
   5327		struct kvm_xsave *xsave;
   5328		struct kvm_xcrs *xcrs;
   5329		void *buffer;
   5330	} u;
   5331
   5332	vcpu_load(vcpu);
   5333
   5334	u.buffer = NULL;
   5335	switch (ioctl) {
   5336	case KVM_GET_LAPIC: {
   5337		r = -EINVAL;
   5338		if (!lapic_in_kernel(vcpu))
   5339			goto out;
   5340		u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
   5341				GFP_KERNEL_ACCOUNT);
   5342
   5343		r = -ENOMEM;
   5344		if (!u.lapic)
   5345			goto out;
   5346		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
   5347		if (r)
   5348			goto out;
   5349		r = -EFAULT;
   5350		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
   5351			goto out;
   5352		r = 0;
   5353		break;
   5354	}
   5355	case KVM_SET_LAPIC: {
   5356		r = -EINVAL;
   5357		if (!lapic_in_kernel(vcpu))
   5358			goto out;
   5359		u.lapic = memdup_user(argp, sizeof(*u.lapic));
   5360		if (IS_ERR(u.lapic)) {
   5361			r = PTR_ERR(u.lapic);
   5362			goto out_nofree;
   5363		}
   5364
   5365		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
   5366		break;
   5367	}
   5368	case KVM_INTERRUPT: {
   5369		struct kvm_interrupt irq;
   5370
   5371		r = -EFAULT;
   5372		if (copy_from_user(&irq, argp, sizeof(irq)))
   5373			goto out;
   5374		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
   5375		break;
   5376	}
   5377	case KVM_NMI: {
   5378		r = kvm_vcpu_ioctl_nmi(vcpu);
   5379		break;
   5380	}
   5381	case KVM_SMI: {
   5382		r = kvm_vcpu_ioctl_smi(vcpu);
   5383		break;
   5384	}
   5385	case KVM_SET_CPUID: {
   5386		struct kvm_cpuid __user *cpuid_arg = argp;
   5387		struct kvm_cpuid cpuid;
   5388
   5389		r = -EFAULT;
   5390		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
   5391			goto out;
   5392		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
   5393		break;
   5394	}
   5395	case KVM_SET_CPUID2: {
   5396		struct kvm_cpuid2 __user *cpuid_arg = argp;
   5397		struct kvm_cpuid2 cpuid;
   5398
   5399		r = -EFAULT;
   5400		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
   5401			goto out;
   5402		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
   5403					      cpuid_arg->entries);
   5404		break;
   5405	}
   5406	case KVM_GET_CPUID2: {
   5407		struct kvm_cpuid2 __user *cpuid_arg = argp;
   5408		struct kvm_cpuid2 cpuid;
   5409
   5410		r = -EFAULT;
   5411		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
   5412			goto out;
   5413		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
   5414					      cpuid_arg->entries);
   5415		if (r)
   5416			goto out;
   5417		r = -EFAULT;
   5418		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
   5419			goto out;
   5420		r = 0;
   5421		break;
   5422	}
   5423	case KVM_GET_MSRS: {
   5424		int idx = srcu_read_lock(&vcpu->kvm->srcu);
   5425		r = msr_io(vcpu, argp, do_get_msr, 1);
   5426		srcu_read_unlock(&vcpu->kvm->srcu, idx);
   5427		break;
   5428	}
   5429	case KVM_SET_MSRS: {
   5430		int idx = srcu_read_lock(&vcpu->kvm->srcu);
   5431		r = msr_io(vcpu, argp, do_set_msr, 0);
   5432		srcu_read_unlock(&vcpu->kvm->srcu, idx);
   5433		break;
   5434	}
   5435	case KVM_TPR_ACCESS_REPORTING: {
   5436		struct kvm_tpr_access_ctl tac;
   5437
   5438		r = -EFAULT;
   5439		if (copy_from_user(&tac, argp, sizeof(tac)))
   5440			goto out;
   5441		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
   5442		if (r)
   5443			goto out;
   5444		r = -EFAULT;
   5445		if (copy_to_user(argp, &tac, sizeof(tac)))
   5446			goto out;
   5447		r = 0;
   5448		break;
    5449	}
   5450	case KVM_SET_VAPIC_ADDR: {
   5451		struct kvm_vapic_addr va;
   5452		int idx;
   5453
   5454		r = -EINVAL;
   5455		if (!lapic_in_kernel(vcpu))
   5456			goto out;
   5457		r = -EFAULT;
   5458		if (copy_from_user(&va, argp, sizeof(va)))
   5459			goto out;
   5460		idx = srcu_read_lock(&vcpu->kvm->srcu);
   5461		r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
   5462		srcu_read_unlock(&vcpu->kvm->srcu, idx);
   5463		break;
   5464	}
   5465	case KVM_X86_SETUP_MCE: {
   5466		u64 mcg_cap;
   5467
   5468		r = -EFAULT;
   5469		if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
   5470			goto out;
   5471		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
   5472		break;
   5473	}
   5474	case KVM_X86_SET_MCE: {
   5475		struct kvm_x86_mce mce;
   5476
   5477		r = -EFAULT;
   5478		if (copy_from_user(&mce, argp, sizeof(mce)))
   5479			goto out;
   5480		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
   5481		break;
   5482	}
   5483	case KVM_GET_VCPU_EVENTS: {
   5484		struct kvm_vcpu_events events;
   5485
   5486		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
   5487
   5488		r = -EFAULT;
   5489		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
   5490			break;
   5491		r = 0;
   5492		break;
   5493	}
   5494	case KVM_SET_VCPU_EVENTS: {
   5495		struct kvm_vcpu_events events;
   5496
   5497		r = -EFAULT;
   5498		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
   5499			break;
   5500
   5501		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
   5502		break;
   5503	}
   5504	case KVM_GET_DEBUGREGS: {
   5505		struct kvm_debugregs dbgregs;
   5506
   5507		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
   5508
   5509		r = -EFAULT;
   5510		if (copy_to_user(argp, &dbgregs,
   5511				 sizeof(struct kvm_debugregs)))
   5512			break;
   5513		r = 0;
   5514		break;
   5515	}
   5516	case KVM_SET_DEBUGREGS: {
   5517		struct kvm_debugregs dbgregs;
   5518
   5519		r = -EFAULT;
   5520		if (copy_from_user(&dbgregs, argp,
   5521				   sizeof(struct kvm_debugregs)))
   5522			break;
   5523
   5524		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
   5525		break;
   5526	}
   5527	case KVM_GET_XSAVE: {
   5528		r = -EINVAL;
   5529		if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
   5530			break;
   5531
   5532		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
   5533		r = -ENOMEM;
   5534		if (!u.xsave)
   5535			break;
   5536
   5537		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
   5538
   5539		r = -EFAULT;
   5540		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
   5541			break;
   5542		r = 0;
   5543		break;
   5544	}
   5545	case KVM_SET_XSAVE: {
   5546		int size = vcpu->arch.guest_fpu.uabi_size;
   5547
   5548		u.xsave = memdup_user(argp, size);
   5549		if (IS_ERR(u.xsave)) {
   5550			r = PTR_ERR(u.xsave);
   5551			goto out_nofree;
   5552		}
   5553
   5554		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
   5555		break;
   5556	}
   5557
   5558	case KVM_GET_XSAVE2: {
   5559		int size = vcpu->arch.guest_fpu.uabi_size;
   5560
   5561		u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
   5562		r = -ENOMEM;
   5563		if (!u.xsave)
   5564			break;
   5565
   5566		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
   5567
   5568		r = -EFAULT;
   5569		if (copy_to_user(argp, u.xsave, size))
   5570			break;
   5571
   5572		r = 0;
   5573		break;
   5574	}
   5575
   5576	case KVM_GET_XCRS: {
   5577		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
   5578		r = -ENOMEM;
   5579		if (!u.xcrs)
   5580			break;
   5581
   5582		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
   5583
   5584		r = -EFAULT;
   5585		if (copy_to_user(argp, u.xcrs,
   5586				 sizeof(struct kvm_xcrs)))
   5587			break;
   5588		r = 0;
   5589		break;
   5590	}
   5591	case KVM_SET_XCRS: {
   5592		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
   5593		if (IS_ERR(u.xcrs)) {
   5594			r = PTR_ERR(u.xcrs);
   5595			goto out_nofree;
   5596		}
   5597
   5598		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
   5599		break;
   5600	}
   5601	case KVM_SET_TSC_KHZ: {
   5602		u32 user_tsc_khz;
   5603
   5604		r = -EINVAL;
   5605		user_tsc_khz = (u32)arg;
   5606
   5607		if (kvm_has_tsc_control &&
   5608		    user_tsc_khz >= kvm_max_guest_tsc_khz)
   5609			goto out;
   5610
   5611		if (user_tsc_khz == 0)
   5612			user_tsc_khz = tsc_khz;
   5613
   5614		if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
   5615			r = 0;
   5616
   5617		goto out;
   5618	}
   5619	case KVM_GET_TSC_KHZ: {
   5620		r = vcpu->arch.virtual_tsc_khz;
   5621		goto out;
   5622	}
   5623	case KVM_KVMCLOCK_CTRL: {
   5624		r = kvm_set_guest_paused(vcpu);
   5625		goto out;
   5626	}
   5627	case KVM_ENABLE_CAP: {
   5628		struct kvm_enable_cap cap;
   5629
   5630		r = -EFAULT;
   5631		if (copy_from_user(&cap, argp, sizeof(cap)))
   5632			goto out;
   5633		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
   5634		break;
   5635	}
   5636	case KVM_GET_NESTED_STATE: {
   5637		struct kvm_nested_state __user *user_kvm_nested_state = argp;
   5638		u32 user_data_size;
   5639
   5640		r = -EINVAL;
   5641		if (!kvm_x86_ops.nested_ops->get_state)
   5642			break;
   5643
   5644		BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
   5645		r = -EFAULT;
   5646		if (get_user(user_data_size, &user_kvm_nested_state->size))
   5647			break;
   5648
   5649		r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
   5650						     user_data_size);
   5651		if (r < 0)
   5652			break;
   5653
   5654		if (r > user_data_size) {
   5655			if (put_user(r, &user_kvm_nested_state->size))
   5656				r = -EFAULT;
   5657			else
   5658				r = -E2BIG;
   5659			break;
   5660		}
   5661
   5662		r = 0;
   5663		break;
   5664	}
   5665	case KVM_SET_NESTED_STATE: {
   5666		struct kvm_nested_state __user *user_kvm_nested_state = argp;
   5667		struct kvm_nested_state kvm_state;
   5668		int idx;
   5669
   5670		r = -EINVAL;
   5671		if (!kvm_x86_ops.nested_ops->set_state)
   5672			break;
   5673
   5674		r = -EFAULT;
   5675		if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
   5676			break;
   5677
   5678		r = -EINVAL;
   5679		if (kvm_state.size < sizeof(kvm_state))
   5680			break;
   5681
   5682		if (kvm_state.flags &
   5683		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
   5684		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
   5685		      | KVM_STATE_NESTED_GIF_SET))
   5686			break;
   5687
   5688		/* nested_run_pending implies guest_mode.  */
   5689		if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
   5690		    && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
   5691			break;
   5692
   5693		idx = srcu_read_lock(&vcpu->kvm->srcu);
   5694		r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
   5695		srcu_read_unlock(&vcpu->kvm->srcu, idx);
   5696		break;
   5697	}
   5698	case KVM_GET_SUPPORTED_HV_CPUID:
   5699		r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
   5700		break;
   5701#ifdef CONFIG_KVM_XEN
   5702	case KVM_XEN_VCPU_GET_ATTR: {
   5703		struct kvm_xen_vcpu_attr xva;
   5704
   5705		r = -EFAULT;
   5706		if (copy_from_user(&xva, argp, sizeof(xva)))
   5707			goto out;
   5708		r = kvm_xen_vcpu_get_attr(vcpu, &xva);
   5709		if (!r && copy_to_user(argp, &xva, sizeof(xva)))
   5710			r = -EFAULT;
   5711		break;
   5712	}
   5713	case KVM_XEN_VCPU_SET_ATTR: {
   5714		struct kvm_xen_vcpu_attr xva;
   5715
   5716		r = -EFAULT;
   5717		if (copy_from_user(&xva, argp, sizeof(xva)))
   5718			goto out;
   5719		r = kvm_xen_vcpu_set_attr(vcpu, &xva);
   5720		break;
   5721	}
   5722#endif
   5723	case KVM_GET_SREGS2: {
   5724		u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
   5725		r = -ENOMEM;
   5726		if (!u.sregs2)
   5727			goto out;
   5728		__get_sregs2(vcpu, u.sregs2);
   5729		r = -EFAULT;
   5730		if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
   5731			goto out;
   5732		r = 0;
   5733		break;
   5734	}
   5735	case KVM_SET_SREGS2: {
   5736		u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
   5737		if (IS_ERR(u.sregs2)) {
   5738			r = PTR_ERR(u.sregs2);
   5739			u.sregs2 = NULL;
   5740			goto out;
   5741		}
   5742		r = __set_sregs2(vcpu, u.sregs2);
   5743		break;
   5744	}
   5745	case KVM_HAS_DEVICE_ATTR:
   5746	case KVM_GET_DEVICE_ATTR:
   5747	case KVM_SET_DEVICE_ATTR:
   5748		r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
   5749		break;
   5750	default:
   5751		r = -EINVAL;
   5752	}
   5753out:
   5754	kfree(u.buffer);
   5755out_nofree:
   5756	vcpu_put(vcpu);
   5757	return r;
   5758}
   5759
   5760vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
   5761{
   5762	return VM_FAULT_SIGBUS;
   5763}
   5764
   5765static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
   5766{
   5767	int ret;
   5768
   5769	if (addr > (unsigned int)(-3 * PAGE_SIZE))
   5770		return -EINVAL;
   5771	ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
   5772	return ret;
   5773}
   5774
   5775static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
   5776					      u64 ident_addr)
   5777{
   5778	return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
   5779}
   5780
   5781static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
   5782					 unsigned long kvm_nr_mmu_pages)
   5783{
   5784	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
   5785		return -EINVAL;
   5786
   5787	mutex_lock(&kvm->slots_lock);
   5788
   5789	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
   5790	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
   5791
   5792	mutex_unlock(&kvm->slots_lock);
   5793	return 0;
   5794}
   5795
   5796static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
   5797{
   5798	return kvm->arch.n_max_mmu_pages;
   5799}
   5800
   5801static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
   5802{
   5803	struct kvm_pic *pic = kvm->arch.vpic;
   5804	int r;
   5805
   5806	r = 0;
   5807	switch (chip->chip_id) {
   5808	case KVM_IRQCHIP_PIC_MASTER:
   5809		memcpy(&chip->chip.pic, &pic->pics[0],
   5810			sizeof(struct kvm_pic_state));
   5811		break;
   5812	case KVM_IRQCHIP_PIC_SLAVE:
   5813		memcpy(&chip->chip.pic, &pic->pics[1],
   5814			sizeof(struct kvm_pic_state));
   5815		break;
   5816	case KVM_IRQCHIP_IOAPIC:
   5817		kvm_get_ioapic(kvm, &chip->chip.ioapic);
   5818		break;
   5819	default:
   5820		r = -EINVAL;
   5821		break;
   5822	}
   5823	return r;
   5824}
   5825
   5826static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
   5827{
   5828	struct kvm_pic *pic = kvm->arch.vpic;
   5829	int r;
   5830
   5831	r = 0;
   5832	switch (chip->chip_id) {
   5833	case KVM_IRQCHIP_PIC_MASTER:
   5834		spin_lock(&pic->lock);
   5835		memcpy(&pic->pics[0], &chip->chip.pic,
   5836			sizeof(struct kvm_pic_state));
   5837		spin_unlock(&pic->lock);
   5838		break;
   5839	case KVM_IRQCHIP_PIC_SLAVE:
   5840		spin_lock(&pic->lock);
   5841		memcpy(&pic->pics[1], &chip->chip.pic,
   5842			sizeof(struct kvm_pic_state));
   5843		spin_unlock(&pic->lock);
   5844		break;
   5845	case KVM_IRQCHIP_IOAPIC:
   5846		kvm_set_ioapic(kvm, &chip->chip.ioapic);
   5847		break;
   5848	default:
   5849		r = -EINVAL;
   5850		break;
   5851	}
   5852	kvm_pic_update_irq(pic);
   5853	return r;
   5854}
   5855
   5856static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
   5857{
   5858	struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
   5859
   5860	BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
   5861
   5862	mutex_lock(&kps->lock);
   5863	memcpy(ps, &kps->channels, sizeof(*ps));
   5864	mutex_unlock(&kps->lock);
   5865	return 0;
   5866}
   5867
   5868static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
   5869{
   5870	int i;
   5871	struct kvm_pit *pit = kvm->arch.vpit;
   5872
   5873	mutex_lock(&pit->pit_state.lock);
   5874	memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
   5875	for (i = 0; i < 3; i++)
   5876		kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
   5877	mutex_unlock(&pit->pit_state.lock);
   5878	return 0;
   5879}
   5880
   5881static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
   5882{
   5883	mutex_lock(&kvm->arch.vpit->pit_state.lock);
   5884	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
   5885		sizeof(ps->channels));
   5886	ps->flags = kvm->arch.vpit->pit_state.flags;
   5887	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
   5888	memset(&ps->reserved, 0, sizeof(ps->reserved));
   5889	return 0;
   5890}
   5891
   5892static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
   5893{
   5894	int start = 0;
   5895	int i;
   5896	u32 prev_legacy, cur_legacy;
   5897	struct kvm_pit *pit = kvm->arch.vpit;
   5898
   5899	mutex_lock(&pit->pit_state.lock);
   5900	prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
   5901	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
   5902	if (!prev_legacy && cur_legacy)
   5903		start = 1;
   5904	memcpy(&pit->pit_state.channels, &ps->channels,
   5905	       sizeof(pit->pit_state.channels));
   5906	pit->pit_state.flags = ps->flags;
   5907	for (i = 0; i < 3; i++)
   5908		kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
   5909				   start && i == 0);
   5910	mutex_unlock(&pit->pit_state.lock);
   5911	return 0;
   5912}
   5913
   5914static int kvm_vm_ioctl_reinject(struct kvm *kvm,
   5915				 struct kvm_reinject_control *control)
   5916{
   5917	struct kvm_pit *pit = kvm->arch.vpit;
   5918
   5919	/* pit->pit_state.lock was overloaded to prevent userspace from getting
   5920	 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
   5921	 * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
   5922	 */
   5923	mutex_lock(&pit->pit_state.lock);
   5924	kvm_pit_set_reinject(pit, control->pit_reinject);
   5925	mutex_unlock(&pit->pit_state.lock);
   5926
   5927	return 0;
   5928}
   5929
   5930void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
   5931{
   5932
   5933	/*
    5934	 * Flush all CPUs' dirty log buffers to the dirty_bitmap.  Called
   5935	 * before reporting dirty_bitmap to userspace.  KVM flushes the buffers
   5936	 * on all VM-Exits, thus we only need to kick running vCPUs to force a
   5937	 * VM-Exit.
   5938	 */
   5939	struct kvm_vcpu *vcpu;
   5940	unsigned long i;
   5941
   5942	kvm_for_each_vcpu(i, vcpu, kvm)
   5943		kvm_vcpu_kick(vcpu);
   5944}
   5945
   5946int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
   5947			bool line_status)
   5948{
   5949	if (!irqchip_in_kernel(kvm))
   5950		return -ENXIO;
   5951
   5952	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
   5953					irq_event->irq, irq_event->level,
   5954					line_status);
   5955	return 0;
   5956}
   5957
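        /*
         * Enable a VM-wide capability (KVM_ENABLE_CAP on a VM fd); cap->flags
         * must be zero.  Covers e.g. the split irqchip, x2APIC API tweaks,
         * disabled intercepts, userspace MSR handling, bus-lock exits, SGX
         * provisioning and encryption-context sharing.
         *
         * Illustrative userspace sketch (not part of this file):
         *
         *	struct kvm_enable_cap cap = {
         *		.cap = KVM_CAP_EXCEPTION_PAYLOAD,
         *		.args[0] = 1,
         *	};
         *
         *	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
         *		err(1, "KVM_ENABLE_CAP");
         */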
   5958int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
   5959			    struct kvm_enable_cap *cap)
   5960{
   5961	int r;
   5962
   5963	if (cap->flags)
   5964		return -EINVAL;
   5965
   5966	switch (cap->cap) {
   5967	case KVM_CAP_DISABLE_QUIRKS2:
   5968		r = -EINVAL;
   5969		if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
   5970			break;
   5971		fallthrough;
   5972	case KVM_CAP_DISABLE_QUIRKS:
   5973		kvm->arch.disabled_quirks = cap->args[0];
   5974		r = 0;
   5975		break;
   5976	case KVM_CAP_SPLIT_IRQCHIP: {
   5977		mutex_lock(&kvm->lock);
   5978		r = -EINVAL;
   5979		if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
   5980			goto split_irqchip_unlock;
   5981		r = -EEXIST;
   5982		if (irqchip_in_kernel(kvm))
   5983			goto split_irqchip_unlock;
   5984		if (kvm->created_vcpus)
   5985			goto split_irqchip_unlock;
   5986		r = kvm_setup_empty_irq_routing(kvm);
   5987		if (r)
   5988			goto split_irqchip_unlock;
   5989		/* Pairs with irqchip_in_kernel. */
   5990		smp_wmb();
   5991		kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
   5992		kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
   5993		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
   5994		r = 0;
   5995split_irqchip_unlock:
   5996		mutex_unlock(&kvm->lock);
   5997		break;
   5998	}
   5999	case KVM_CAP_X2APIC_API:
   6000		r = -EINVAL;
   6001		if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
   6002			break;
   6003
   6004		if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
   6005			kvm->arch.x2apic_format = true;
   6006		if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
   6007			kvm->arch.x2apic_broadcast_quirk_disabled = true;
   6008
   6009		r = 0;
   6010		break;
   6011	case KVM_CAP_X86_DISABLE_EXITS:
   6012		r = -EINVAL;
   6013		if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
   6014			break;
   6015
   6016		if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
   6017			kvm_can_mwait_in_guest())
   6018			kvm->arch.mwait_in_guest = true;
   6019		if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
   6020			kvm->arch.hlt_in_guest = true;
   6021		if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
   6022			kvm->arch.pause_in_guest = true;
   6023		if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
   6024			kvm->arch.cstate_in_guest = true;
   6025		r = 0;
   6026		break;
   6027	case KVM_CAP_MSR_PLATFORM_INFO:
   6028		kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
   6029		r = 0;
   6030		break;
   6031	case KVM_CAP_EXCEPTION_PAYLOAD:
   6032		kvm->arch.exception_payload_enabled = cap->args[0];
   6033		r = 0;
   6034		break;
   6035	case KVM_CAP_X86_USER_SPACE_MSR:
   6036		kvm->arch.user_space_msr_mask = cap->args[0];
   6037		r = 0;
   6038		break;
   6039	case KVM_CAP_X86_BUS_LOCK_EXIT:
   6040		r = -EINVAL;
   6041		if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
   6042			break;
   6043
   6044		if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
   6045		    (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
   6046			break;
   6047
   6048		if (kvm_has_bus_lock_exit &&
   6049		    cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
   6050			kvm->arch.bus_lock_detection_enabled = true;
   6051		r = 0;
   6052		break;
   6053#ifdef CONFIG_X86_SGX_KVM
   6054	case KVM_CAP_SGX_ATTRIBUTE: {
   6055		unsigned long allowed_attributes = 0;
   6056
   6057		r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
   6058		if (r)
   6059			break;
   6060
   6061		/* KVM only supports the PROVISIONKEY privileged attribute. */
   6062		if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
   6063		    !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
   6064			kvm->arch.sgx_provisioning_allowed = true;
   6065		else
   6066			r = -EINVAL;
   6067		break;
   6068	}
   6069#endif
   6070	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
   6071		r = -EINVAL;
   6072		if (!kvm_x86_ops.vm_copy_enc_context_from)
   6073			break;
   6074
   6075		r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
   6076		break;
   6077	case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
   6078		r = -EINVAL;
   6079		if (!kvm_x86_ops.vm_move_enc_context_from)
   6080			break;
   6081
   6082		r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
   6083		break;
   6084	case KVM_CAP_EXIT_HYPERCALL:
   6085		if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
   6086			r = -EINVAL;
   6087			break;
   6088		}
   6089		kvm->arch.hypercall_exit_enabled = cap->args[0];
   6090		r = 0;
   6091		break;
   6092	case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
   6093		r = -EINVAL;
   6094		if (cap->args[0] & ~1)
   6095			break;
   6096		kvm->arch.exit_on_emulation_error = cap->args[0];
   6097		r = 0;
   6098		break;
   6099	case KVM_CAP_PMU_CAPABILITY:
   6100		r = -EINVAL;
   6101		if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
   6102			break;
   6103
   6104		mutex_lock(&kvm->lock);
   6105		if (!kvm->created_vcpus) {
   6106			kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
   6107			r = 0;
   6108		}
   6109		mutex_unlock(&kvm->lock);
   6110		break;
   6111	default:
   6112		r = -EINVAL;
   6113		break;
   6114	}
   6115	return r;
   6116}
   6117
   6118static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
   6119{
   6120	struct kvm_x86_msr_filter *msr_filter;
   6121
   6122	msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
   6123	if (!msr_filter)
   6124		return NULL;
   6125
   6126	msr_filter->default_allow = default_allow;
   6127	return msr_filter;
   6128}
   6129
   6130static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
   6131{
   6132	u32 i;
   6133
   6134	if (!msr_filter)
   6135		return;
   6136
   6137	for (i = 0; i < msr_filter->count; i++)
   6138		kfree(msr_filter->ranges[i].bitmap);
   6139
   6140	kfree(msr_filter);
   6141}
   6142
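        /*
         * Append one userspace-supplied MSR filter range to @msr_filter: the
         * range must select at least one of the READ/WRITE flags and carry a
         * non-empty bitmap of bounded size, which is copied in from userspace.
         */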
   6143static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
   6144			      struct kvm_msr_filter_range *user_range)
   6145{
   6146	unsigned long *bitmap = NULL;
   6147	size_t bitmap_size;
   6148
   6149	if (!user_range->nmsrs)
   6150		return 0;
   6151
   6152	if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
   6153		return -EINVAL;
   6154
   6155	if (!user_range->flags)
   6156		return -EINVAL;
   6157
   6158	bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
   6159	if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
   6160		return -EINVAL;
   6161
   6162	bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
   6163	if (IS_ERR(bitmap))
   6164		return PTR_ERR(bitmap);
   6165
   6166	msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
   6167		.flags = user_range->flags,
   6168		.base = user_range->base,
   6169		.nmsrs = user_range->nmsrs,
   6170		.bitmap = bitmap,
   6171	};
   6172
   6173	msr_filter->count++;
   6174	return 0;
   6175}
   6176
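        /*
         * KVM_X86_SET_MSR_FILTER: build a new filter from the userspace
         * ranges, publish it under kvm->lock, wait for an SRCU grace period
         * before freeing the old filter, and post KVM_REQ_MSR_FILTER_CHANGED
         * so every vCPU picks up the new filter.
         */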
   6177static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
   6178{
   6179	struct kvm_msr_filter __user *user_msr_filter = argp;
   6180	struct kvm_x86_msr_filter *new_filter, *old_filter;
   6181	struct kvm_msr_filter filter;
   6182	bool default_allow;
   6183	bool empty = true;
   6184	int r = 0;
   6185	u32 i;
   6186
   6187	if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
   6188		return -EFAULT;
   6189
   6190	for (i = 0; i < ARRAY_SIZE(filter.ranges); i++)
   6191		empty &= !filter.ranges[i].nmsrs;
   6192
   6193	default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
   6194	if (empty && !default_allow)
   6195		return -EINVAL;
   6196
   6197	new_filter = kvm_alloc_msr_filter(default_allow);
   6198	if (!new_filter)
   6199		return -ENOMEM;
   6200
   6201	for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
   6202		r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
   6203		if (r) {
   6204			kvm_free_msr_filter(new_filter);
   6205			return r;
   6206		}
   6207	}
   6208
   6209	mutex_lock(&kvm->lock);
   6210
   6211	/* The per-VM filter is protected by kvm->lock... */
   6212	old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
   6213
   6214	rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
   6215	synchronize_srcu(&kvm->srcu);
   6216
   6217	kvm_free_msr_filter(old_filter);
   6218
   6219	kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
   6220	mutex_unlock(&kvm->lock);
   6221
   6222	return 0;
   6223}
   6224
   6225#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
   6226static int kvm_arch_suspend_notifier(struct kvm *kvm)
   6227{
   6228	struct kvm_vcpu *vcpu;
   6229	unsigned long i;
   6230	int ret = 0;
   6231
   6232	mutex_lock(&kvm->lock);
   6233	kvm_for_each_vcpu(i, vcpu, kvm) {
   6234		if (!vcpu->arch.pv_time.active)
   6235			continue;
   6236
   6237		ret = kvm_set_guest_paused(vcpu);
   6238		if (ret) {
   6239			kvm_err("Failed to pause guest VCPU%d: %d\n",
   6240				vcpu->vcpu_id, ret);
   6241			break;
   6242		}
   6243	}
   6244	mutex_unlock(&kvm->lock);
   6245
   6246	return ret ? NOTIFY_BAD : NOTIFY_DONE;
   6247}
   6248
   6249int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
   6250{
   6251	switch (state) {
   6252	case PM_HIBERNATION_PREPARE:
   6253	case PM_SUSPEND_PREPARE:
   6254		return kvm_arch_suspend_notifier(kvm);
   6255	}
   6256
   6257	return NOTIFY_DONE;
   6258}
   6259#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
   6260
   6261static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
   6262{
   6263	struct kvm_clock_data data = { 0 };
   6264
   6265	get_kvmclock(kvm, &data);
   6266	if (copy_to_user(argp, &data, sizeof(data)))
   6267		return -EFAULT;
   6268
   6269	return 0;
   6270}
   6271
   6272static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
   6273{
   6274	struct kvm_arch *ka = &kvm->arch;
   6275	struct kvm_clock_data data;
   6276	u64 now_raw_ns;
   6277
   6278	if (copy_from_user(&data, argp, sizeof(data)))
   6279		return -EFAULT;
   6280
   6281	/*
   6282	 * Only KVM_CLOCK_REALTIME is used, but allow passing the
   6283	 * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
   6284	 */
   6285	if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
   6286		return -EINVAL;
   6287
   6288	kvm_hv_request_tsc_page_update(kvm);
   6289	kvm_start_pvclock_update(kvm);
   6290	pvclock_update_vm_gtod_copy(kvm);
   6291
    6292	/*
    6293	 * This pairs with kvm_guest_time_update(): when the masterclock is
    6294	 * in use, we use master_kernel_ns + kvmclock_offset to set the
    6295	 * unsigned 'system_time'.  If we used get_kvmclock_ns() (which is
    6296	 * slightly ahead) here, we would risk 'system_time' underflowing
    6297	 * when 'data.clock' is very small.
    6298	 */
   6299	if (data.flags & KVM_CLOCK_REALTIME) {
   6300		u64 now_real_ns = ktime_get_real_ns();
   6301
   6302		/*
   6303		 * Avoid stepping the kvmclock backwards.
   6304		 */
   6305		if (now_real_ns > data.realtime)
   6306			data.clock += now_real_ns - data.realtime;
   6307	}
   6308
   6309	if (ka->use_master_clock)
   6310		now_raw_ns = ka->master_kernel_ns;
   6311	else
   6312		now_raw_ns = get_kvmclock_base_ns();
   6313	ka->kvmclock_offset = data.clock - now_raw_ns;
   6314	kvm_end_pvclock_update(kvm);
   6315	return 0;
   6316}
   6317
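        /*
         * x86 dispatcher for VM ioctls not handled by the generic
         * kvm_vm_ioctl().
         *
         * Illustrative userspace sketch (not part of this file; assumes vm_fd
         * was obtained via KVM_CREATE_VM):
         *
         *	struct kvm_pit_config pit = { .flags = 0 };
         *
         *	if (ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0) < 0)
         *		err(1, "KVM_CREATE_IRQCHIP");
         *	if (ioctl(vm_fd, KVM_CREATE_PIT2, &pit) < 0)
         *		err(1, "KVM_CREATE_PIT2");
         */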
   6318long kvm_arch_vm_ioctl(struct file *filp,
   6319		       unsigned int ioctl, unsigned long arg)
   6320{
   6321	struct kvm *kvm = filp->private_data;
   6322	void __user *argp = (void __user *)arg;
   6323	int r = -ENOTTY;
    6324	/*
    6325	 * This union makes it completely explicit to gcc-3.x
    6326	 * that these variables' stack usage should be
    6327	 * combined, not added together.
    6328	 */
   6329	union {
   6330		struct kvm_pit_state ps;
   6331		struct kvm_pit_state2 ps2;
   6332		struct kvm_pit_config pit_config;
   6333	} u;
   6334
   6335	switch (ioctl) {
   6336	case KVM_SET_TSS_ADDR:
   6337		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
   6338		break;
   6339	case KVM_SET_IDENTITY_MAP_ADDR: {
   6340		u64 ident_addr;
   6341
   6342		mutex_lock(&kvm->lock);
   6343		r = -EINVAL;
   6344		if (kvm->created_vcpus)
   6345			goto set_identity_unlock;
   6346		r = -EFAULT;
   6347		if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
   6348			goto set_identity_unlock;
   6349		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
   6350set_identity_unlock:
   6351		mutex_unlock(&kvm->lock);
   6352		break;
   6353	}
   6354	case KVM_SET_NR_MMU_PAGES:
   6355		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
   6356		break;
   6357	case KVM_GET_NR_MMU_PAGES:
   6358		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
   6359		break;
   6360	case KVM_CREATE_IRQCHIP: {
   6361		mutex_lock(&kvm->lock);
   6362
   6363		r = -EEXIST;
   6364		if (irqchip_in_kernel(kvm))
   6365			goto create_irqchip_unlock;
   6366
   6367		r = -EINVAL;
   6368		if (kvm->created_vcpus)
   6369			goto create_irqchip_unlock;
   6370
   6371		r = kvm_pic_init(kvm);
   6372		if (r)
   6373			goto create_irqchip_unlock;
   6374
   6375		r = kvm_ioapic_init(kvm);
   6376		if (r) {
   6377			kvm_pic_destroy(kvm);
   6378			goto create_irqchip_unlock;
   6379		}
   6380
   6381		r = kvm_setup_default_irq_routing(kvm);
   6382		if (r) {
   6383			kvm_ioapic_destroy(kvm);
   6384			kvm_pic_destroy(kvm);
   6385			goto create_irqchip_unlock;
   6386		}
   6387		/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
   6388		smp_wmb();
   6389		kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
   6390		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
   6391	create_irqchip_unlock:
   6392		mutex_unlock(&kvm->lock);
   6393		break;
   6394	}
   6395	case KVM_CREATE_PIT:
   6396		u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
   6397		goto create_pit;
   6398	case KVM_CREATE_PIT2:
   6399		r = -EFAULT;
   6400		if (copy_from_user(&u.pit_config, argp,
   6401				   sizeof(struct kvm_pit_config)))
   6402			goto out;
   6403	create_pit:
   6404		mutex_lock(&kvm->lock);
   6405		r = -EEXIST;
   6406		if (kvm->arch.vpit)
   6407			goto create_pit_unlock;
   6408		r = -ENOMEM;
   6409		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
   6410		if (kvm->arch.vpit)
   6411			r = 0;
   6412	create_pit_unlock:
   6413		mutex_unlock(&kvm->lock);
   6414		break;
   6415	case KVM_GET_IRQCHIP: {
   6416		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
   6417		struct kvm_irqchip *chip;
   6418
   6419		chip = memdup_user(argp, sizeof(*chip));
   6420		if (IS_ERR(chip)) {
   6421			r = PTR_ERR(chip);
   6422			goto out;
   6423		}
   6424
   6425		r = -ENXIO;
   6426		if (!irqchip_kernel(kvm))
   6427			goto get_irqchip_out;
   6428		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
   6429		if (r)
   6430			goto get_irqchip_out;
   6431		r = -EFAULT;
   6432		if (copy_to_user(argp, chip, sizeof(*chip)))
   6433			goto get_irqchip_out;
   6434		r = 0;
   6435	get_irqchip_out:
   6436		kfree(chip);
   6437		break;
   6438	}
   6439	case KVM_SET_IRQCHIP: {
   6440		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
   6441		struct kvm_irqchip *chip;
   6442
   6443		chip = memdup_user(argp, sizeof(*chip));
   6444		if (IS_ERR(chip)) {
   6445			r = PTR_ERR(chip);
   6446			goto out;
   6447		}
   6448
   6449		r = -ENXIO;
   6450		if (!irqchip_kernel(kvm))
   6451			goto set_irqchip_out;
   6452		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
   6453	set_irqchip_out:
   6454		kfree(chip);
   6455		break;
   6456	}
   6457	case KVM_GET_PIT: {
   6458		r = -EFAULT;
   6459		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
   6460			goto out;
   6461		r = -ENXIO;
   6462		if (!kvm->arch.vpit)
   6463			goto out;
   6464		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
   6465		if (r)
   6466			goto out;
   6467		r = -EFAULT;
   6468		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
   6469			goto out;
   6470		r = 0;
   6471		break;
   6472	}
   6473	case KVM_SET_PIT: {
   6474		r = -EFAULT;
   6475		if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
   6476			goto out;
   6477		mutex_lock(&kvm->lock);
   6478		r = -ENXIO;
   6479		if (!kvm->arch.vpit)
   6480			goto set_pit_out;
   6481		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
   6482set_pit_out:
   6483		mutex_unlock(&kvm->lock);
   6484		break;
   6485	}
   6486	case KVM_GET_PIT2: {
   6487		r = -ENXIO;
   6488		if (!kvm->arch.vpit)
   6489			goto out;
   6490		r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
   6491		if (r)
   6492			goto out;
   6493		r = -EFAULT;
   6494		if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
   6495			goto out;
   6496		r = 0;
   6497		break;
   6498	}
   6499	case KVM_SET_PIT2: {
   6500		r = -EFAULT;
   6501		if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
   6502			goto out;
   6503		mutex_lock(&kvm->lock);
   6504		r = -ENXIO;
   6505		if (!kvm->arch.vpit)
   6506			goto set_pit2_out;
   6507		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
   6508set_pit2_out:
   6509		mutex_unlock(&kvm->lock);
   6510		break;
   6511	}
   6512	case KVM_REINJECT_CONTROL: {
   6513		struct kvm_reinject_control control;
    6514		r = -EFAULT;
   6515		if (copy_from_user(&control, argp, sizeof(control)))
   6516			goto out;
   6517		r = -ENXIO;
   6518		if (!kvm->arch.vpit)
   6519			goto out;
   6520		r = kvm_vm_ioctl_reinject(kvm, &control);
   6521		break;
   6522	}
   6523	case KVM_SET_BOOT_CPU_ID:
   6524		r = 0;
   6525		mutex_lock(&kvm->lock);
   6526		if (kvm->created_vcpus)
   6527			r = -EBUSY;
   6528		else
   6529			kvm->arch.bsp_vcpu_id = arg;
   6530		mutex_unlock(&kvm->lock);
   6531		break;
   6532#ifdef CONFIG_KVM_XEN
   6533	case KVM_XEN_HVM_CONFIG: {
   6534		struct kvm_xen_hvm_config xhc;
   6535		r = -EFAULT;
   6536		if (copy_from_user(&xhc, argp, sizeof(xhc)))
   6537			goto out;
   6538		r = kvm_xen_hvm_config(kvm, &xhc);
   6539		break;
   6540	}
   6541	case KVM_XEN_HVM_GET_ATTR: {
   6542		struct kvm_xen_hvm_attr xha;
   6543
   6544		r = -EFAULT;
   6545		if (copy_from_user(&xha, argp, sizeof(xha)))
   6546			goto out;
   6547		r = kvm_xen_hvm_get_attr(kvm, &xha);
   6548		if (!r && copy_to_user(argp, &xha, sizeof(xha)))
   6549			r = -EFAULT;
   6550		break;
   6551	}
   6552	case KVM_XEN_HVM_SET_ATTR: {
   6553		struct kvm_xen_hvm_attr xha;
   6554
   6555		r = -EFAULT;
   6556		if (copy_from_user(&xha, argp, sizeof(xha)))
   6557			goto out;
   6558		r = kvm_xen_hvm_set_attr(kvm, &xha);
   6559		break;
   6560	}
   6561	case KVM_XEN_HVM_EVTCHN_SEND: {
   6562		struct kvm_irq_routing_xen_evtchn uxe;
   6563
   6564		r = -EFAULT;
   6565		if (copy_from_user(&uxe, argp, sizeof(uxe)))
   6566			goto out;
   6567		r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
   6568		break;
   6569	}
   6570#endif
   6571	case KVM_SET_CLOCK:
   6572		r = kvm_vm_ioctl_set_clock(kvm, argp);
   6573		break;
   6574	case KVM_GET_CLOCK:
   6575		r = kvm_vm_ioctl_get_clock(kvm, argp);
   6576		break;
   6577	case KVM_SET_TSC_KHZ: {
   6578		u32 user_tsc_khz;
   6579
   6580		r = -EINVAL;
   6581		user_tsc_khz = (u32)arg;
   6582
   6583		if (kvm_has_tsc_control &&
   6584		    user_tsc_khz >= kvm_max_guest_tsc_khz)
   6585			goto out;
   6586
   6587		if (user_tsc_khz == 0)
   6588			user_tsc_khz = tsc_khz;
   6589
   6590		WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
   6591		r = 0;
   6592
   6593		goto out;
   6594	}
   6595	case KVM_GET_TSC_KHZ: {
   6596		r = READ_ONCE(kvm->arch.default_tsc_khz);
   6597		goto out;
   6598	}
   6599	case KVM_MEMORY_ENCRYPT_OP: {
   6600		r = -ENOTTY;
   6601		if (!kvm_x86_ops.mem_enc_ioctl)
   6602			goto out;
   6603
   6604		r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
   6605		break;
   6606	}
   6607	case KVM_MEMORY_ENCRYPT_REG_REGION: {
   6608		struct kvm_enc_region region;
   6609
   6610		r = -EFAULT;
   6611		if (copy_from_user(&region, argp, sizeof(region)))
   6612			goto out;
   6613
   6614		r = -ENOTTY;
   6615		if (!kvm_x86_ops.mem_enc_register_region)
   6616			goto out;
   6617
   6618		r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
   6619		break;
   6620	}
   6621	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
   6622		struct kvm_enc_region region;
   6623
   6624		r = -EFAULT;
   6625		if (copy_from_user(&region, argp, sizeof(region)))
   6626			goto out;
   6627
   6628		r = -ENOTTY;
   6629		if (!kvm_x86_ops.mem_enc_unregister_region)
   6630			goto out;
   6631
   6632		r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
   6633		break;
   6634	}
   6635	case KVM_HYPERV_EVENTFD: {
   6636		struct kvm_hyperv_eventfd hvevfd;
   6637
   6638		r = -EFAULT;
   6639		if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
   6640			goto out;
   6641		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
   6642		break;
   6643	}
   6644	case KVM_SET_PMU_EVENT_FILTER:
   6645		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
   6646		break;
   6647	case KVM_X86_SET_MSR_FILTER:
   6648		r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
   6649		break;
   6650	default:
   6651		r = -ENOTTY;
   6652	}
   6653out:
   6654	return r;
   6655}
   6656
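        /*
         * Build the MSR lists reported via KVM_GET_MSR_INDEX_LIST and
         * KVM_GET_MSR_FEATURE_INDEX_LIST: a candidate from msrs_to_save_all is
         * kept only if it is readable on the host (rdmsr_safe()) and backed by
         * a feature exposed to guests; emulated MSRs are filtered through
         * kvm_x86_has_emulated_msr() and feature MSRs through
         * kvm_get_msr_feature().
         */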
   6657static void kvm_init_msr_list(void)
   6658{
   6659	struct x86_pmu_capability x86_pmu;
   6660	u32 dummy[2];
   6661	unsigned i;
   6662
   6663	BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
   6664			 "Please update the fixed PMCs in msrs_to_saved_all[]");
   6665
   6666	perf_get_x86_pmu_capability(&x86_pmu);
   6667
   6668	num_msrs_to_save = 0;
   6669	num_emulated_msrs = 0;
   6670	num_msr_based_features = 0;
   6671
   6672	for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
   6673		if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
   6674			continue;
   6675
   6676		/*
   6677		 * Even MSRs that are valid in the host may not be exposed
   6678		 * to the guests in some cases.
   6679		 */
   6680		switch (msrs_to_save_all[i]) {
   6681		case MSR_IA32_BNDCFGS:
   6682			if (!kvm_mpx_supported())
   6683				continue;
   6684			break;
   6685		case MSR_TSC_AUX:
   6686			if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
   6687			    !kvm_cpu_cap_has(X86_FEATURE_RDPID))
   6688				continue;
   6689			break;
   6690		case MSR_IA32_UMWAIT_CONTROL:
   6691			if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
   6692				continue;
   6693			break;
   6694		case MSR_IA32_RTIT_CTL:
   6695		case MSR_IA32_RTIT_STATUS:
   6696			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
   6697				continue;
   6698			break;
   6699		case MSR_IA32_RTIT_CR3_MATCH:
   6700			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
   6701			    !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
   6702				continue;
   6703			break;
   6704		case MSR_IA32_RTIT_OUTPUT_BASE:
   6705		case MSR_IA32_RTIT_OUTPUT_MASK:
   6706			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
   6707				(!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
   6708				 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
   6709				continue;
   6710			break;
   6711		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
   6712			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
   6713				msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
   6714				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
   6715				continue;
   6716			break;
   6717		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
   6718			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
   6719			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
   6720				continue;
   6721			break;
   6722		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
   6723			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
   6724			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
   6725				continue;
   6726			break;
   6727		case MSR_IA32_XFD:
   6728		case MSR_IA32_XFD_ERR:
   6729			if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
   6730				continue;
   6731			break;
   6732		default:
   6733			break;
   6734		}
   6735
   6736		msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
   6737	}
   6738
   6739	for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
   6740		if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
   6741			continue;
   6742
   6743		emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
   6744	}
   6745
   6746	for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
   6747		struct kvm_msr_entry msr;
   6748
   6749		msr.index = msr_based_features_all[i];
   6750		if (kvm_get_msr_feature(&msr))
   6751			continue;
   6752
   6753		msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
   6754	}
   6755}
   6756
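        /*
         * vcpu_mmio_write()/vcpu_mmio_read() forward an MMIO access first to
         * the in-kernel local APIC and then to the KVM_MMIO_BUS, in chunks of
         * at most 8 bytes.  The return value is the number of bytes handled
         * in the kernel; the remainder has to be completed in userspace.
         */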
   6757static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
   6758			   const void *v)
   6759{
   6760	int handled = 0;
   6761	int n;
   6762
   6763	do {
   6764		n = min(len, 8);
   6765		if (!(lapic_in_kernel(vcpu) &&
   6766		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
   6767		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
   6768			break;
   6769		handled += n;
   6770		addr += n;
   6771		len -= n;
   6772		v += n;
   6773	} while (len);
   6774
   6775	return handled;
   6776}
   6777
   6778static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
   6779{
   6780	int handled = 0;
   6781	int n;
   6782
   6783	do {
   6784		n = min(len, 8);
   6785		if (!(lapic_in_kernel(vcpu) &&
   6786		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
   6787					 addr, n, v))
   6788		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
   6789			break;
   6790		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
   6791		handled += n;
   6792		addr += n;
   6793		len -= n;
   6794		v += n;
   6795	} while (len);
   6796
   6797	return handled;
   6798}
   6799
   6800static void kvm_set_segment(struct kvm_vcpu *vcpu,
   6801			struct kvm_segment *var, int seg)
   6802{
   6803	static_call(kvm_x86_set_segment)(vcpu, var, seg);
   6804}
   6805
   6806void kvm_get_segment(struct kvm_vcpu *vcpu,
   6807		     struct kvm_segment *var, int seg)
   6808{
   6809	static_call(kvm_x86_get_segment)(vcpu, var, seg);
   6810}
   6811
   6812gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
   6813			   struct x86_exception *exception)
   6814{
   6815	struct kvm_mmu *mmu = vcpu->arch.mmu;
   6816	gpa_t t_gpa;
   6817
   6818	BUG_ON(!mmu_is_nested(vcpu));
   6819
   6820	/* NPT walks are always user-walks */
   6821	access |= PFERR_USER_MASK;
   6822	t_gpa  = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
   6823
   6824	return t_gpa;
   6825}
   6826
    6827gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
   6828			      struct x86_exception *exception)
   6829{
   6830	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   6831
   6832	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
   6833	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
   6834}
    6835EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
   6836
    6837gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
   6838				struct x86_exception *exception)
   6839{
   6840	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   6841
   6842	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
   6843	access |= PFERR_FETCH_MASK;
   6844	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
   6845}
   6846
   6847gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
   6848			       struct x86_exception *exception)
   6849{
   6850	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   6851
   6852	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
   6853	access |= PFERR_WRITE_MASK;
   6854	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
   6855}
   6856EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
   6857
    6858/* used to access any guest's mapped memory without checking CPL */
   6859gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
   6860				struct x86_exception *exception)
   6861{
   6862	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   6863
   6864	return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
   6865}
   6866
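        /*
         * Copy @bytes of guest memory starting at guest virtual address @addr
         * into @val, translating one page at a time with the current walk MMU.
         * Returns X86EMUL_CONTINUE on success, X86EMUL_PROPAGATE_FAULT if a
         * translation fails (with the fault details in @exception), or
         * X86EMUL_IO_NEEDED if the backing guest page cannot be read.
         */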
   6867static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
   6868				      struct kvm_vcpu *vcpu, u64 access,
   6869				      struct x86_exception *exception)
   6870{
   6871	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   6872	void *data = val;
   6873	int r = X86EMUL_CONTINUE;
   6874
   6875	while (bytes) {
   6876		gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
   6877		unsigned offset = addr & (PAGE_SIZE-1);
   6878		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
   6879		int ret;
   6880
   6881		if (gpa == UNMAPPED_GVA)
   6882			return X86EMUL_PROPAGATE_FAULT;
   6883		ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
   6884					       offset, toread);
   6885		if (ret < 0) {
   6886			r = X86EMUL_IO_NEEDED;
   6887			goto out;
   6888		}
   6889
   6890		bytes -= toread;
   6891		data += toread;
   6892		addr += toread;
   6893	}
   6894out:
   6895	return r;
   6896}
   6897
   6898/* used for instruction fetching */
   6899static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
   6900				gva_t addr, void *val, unsigned int bytes,
   6901				struct x86_exception *exception)
   6902{
   6903	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   6904	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   6905	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
   6906	unsigned offset;
   6907	int ret;
   6908
   6909	/* Inline kvm_read_guest_virt_helper for speed.  */
   6910	gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
   6911				    exception);
   6912	if (unlikely(gpa == UNMAPPED_GVA))
   6913		return X86EMUL_PROPAGATE_FAULT;
   6914
   6915	offset = addr & (PAGE_SIZE-1);
   6916	if (WARN_ON(offset + bytes > PAGE_SIZE))
   6917		bytes = (unsigned)PAGE_SIZE - offset;
   6918	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
   6919				       offset, bytes);
   6920	if (unlikely(ret < 0))
   6921		return X86EMUL_IO_NEEDED;
   6922
   6923	return X86EMUL_CONTINUE;
   6924}
   6925
   6926int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
   6927			       gva_t addr, void *val, unsigned int bytes,
   6928			       struct x86_exception *exception)
   6929{
   6930	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
   6931
   6932	/*
   6933	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
   6934	 * is returned, but our callers are not ready for that and they blindly
   6935	 * call kvm_inject_page_fault.  Ensure that they at least do not leak
   6936	 * uninitialized kernel stack memory into cr2 and error code.
   6937	 */
   6938	memset(exception, 0, sizeof(*exception));
   6939	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
   6940					  exception);
   6941}
    6942EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
   6943
   6944static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
   6945			     gva_t addr, void *val, unsigned int bytes,
   6946			     struct x86_exception *exception, bool system)
   6947{
   6948	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   6949	u64 access = 0;
   6950
   6951	if (system)
   6952		access |= PFERR_IMPLICIT_ACCESS;
   6953	else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
   6954		access |= PFERR_USER_MASK;
   6955
   6956	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
   6957}
   6958
   6959static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
   6960		unsigned long addr, void *val, unsigned int bytes)
   6961{
   6962	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   6963	int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
   6964
   6965	return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
   6966}
   6967
   6968static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
   6969				      struct kvm_vcpu *vcpu, u64 access,
   6970				      struct x86_exception *exception)
   6971{
   6972	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   6973	void *data = val;
   6974	int r = X86EMUL_CONTINUE;
   6975
   6976	while (bytes) {
   6977		gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
   6978		unsigned offset = addr & (PAGE_SIZE-1);
   6979		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
   6980		int ret;
   6981
   6982		if (gpa == UNMAPPED_GVA)
   6983			return X86EMUL_PROPAGATE_FAULT;
   6984		ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
   6985		if (ret < 0) {
   6986			r = X86EMUL_IO_NEEDED;
   6987			goto out;
   6988		}
   6989
   6990		bytes -= towrite;
   6991		data += towrite;
   6992		addr += towrite;
   6993	}
   6994out:
   6995	return r;
   6996}
   6997
   6998static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
   6999			      unsigned int bytes, struct x86_exception *exception,
   7000			      bool system)
   7001{
   7002	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7003	u64 access = PFERR_WRITE_MASK;
   7004
   7005	if (system)
   7006		access |= PFERR_IMPLICIT_ACCESS;
   7007	else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
   7008		access |= PFERR_USER_MASK;
   7009
   7010	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
   7011					   access, exception);
   7012}
   7013
   7014int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
   7015				unsigned int bytes, struct x86_exception *exception)
   7016{
   7017	/* kvm_write_guest_virt_system can pull in tons of pages. */
   7018	vcpu->arch.l1tf_flush_l1d = true;
   7019
   7020	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
   7021					   PFERR_WRITE_MASK, exception);
   7022}
   7023EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
   7024
   7025static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
   7026				void *insn, int insn_len)
   7027{
   7028	return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
   7029							    insn, insn_len);
   7030}
   7031
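        /*
         * Handle a #UD that KVM intercepted.  If the forced emulation prefix
         * (ud2; .ascii "kvm") is enabled and present at the guest's RIP, skip
         * the prefix and emulate the instruction that follows; otherwise
         * simply try to emulate the faulting instruction.
         */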
   7032int handle_ud(struct kvm_vcpu *vcpu)
   7033{
   7034	static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
   7035	int emul_type = EMULTYPE_TRAP_UD;
   7036	char sig[5]; /* ud2; .ascii "kvm" */
   7037	struct x86_exception e;
   7038
   7039	if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
   7040		return 1;
   7041
   7042	if (force_emulation_prefix &&
   7043	    kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
   7044				sig, sizeof(sig), &e) == 0 &&
   7045	    memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
   7046		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
   7047		emul_type = EMULTYPE_TRAP_UD_FORCED;
   7048	}
   7049
   7050	return kvm_emulate_instruction(vcpu, emul_type);
   7051}
   7052EXPORT_SYMBOL_GPL(handle_ud);
   7053
   7054static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
   7055			    gpa_t gpa, bool write)
   7056{
   7057	/* For APIC access vmexit */
   7058	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
   7059		return 1;
   7060
   7061	if (vcpu_match_mmio_gpa(vcpu, gpa)) {
   7062		trace_vcpu_match_mmio(gva, gpa, write, true);
   7063		return 1;
   7064	}
   7065
   7066	return 0;
   7067}
   7068
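        /*
         * Translate a guest virtual address for an MMIO-style access.
         * Returns 1 if the resulting GPA needs MMIO emulation, 0 if it is
         * backed by ordinary guest memory, and -1 if the translation itself
         * faults.
         */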
   7069static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
   7070				gpa_t *gpa, struct x86_exception *exception,
   7071				bool write)
   7072{
   7073	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   7074	u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
   7075		| (write ? PFERR_WRITE_MASK : 0);
   7076
    7077	/*
    7078	 * Currently PKRU is only applied to EPT-enabled guests, so there
    7079	 * is no pkey in the EPT page table for an L1 guest or in the EPT
    7080	 * shadow page table for an L2 guest.
    7081	 */
   7082	if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
   7083	    !permission_fault(vcpu, vcpu->arch.walk_mmu,
   7084			      vcpu->arch.mmio_access, 0, access))) {
   7085		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
   7086					(gva & (PAGE_SIZE - 1));
   7087		trace_vcpu_match_mmio(gva, *gpa, write, false);
   7088		return 1;
   7089	}
   7090
   7091	*gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
   7092
   7093	if (*gpa == UNMAPPED_GVA)
   7094		return -1;
   7095
   7096	return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
   7097}
   7098
   7099int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
   7100			const void *val, int bytes)
   7101{
   7102	int ret;
   7103
   7104	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
   7105	if (ret < 0)
   7106		return 0;
   7107	kvm_page_track_write(vcpu, gpa, val, bytes);
   7108	return 1;
   7109}
   7110
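        /*
         * read_write_emulator_ops abstracts the differences between emulated
         * reads and writes so that emulator_read_write() can share the
         * page-crossing and MMIO-fragment handling for both directions.
         */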
   7111struct read_write_emulator_ops {
   7112	int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
   7113				  int bytes);
   7114	int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
   7115				  void *val, int bytes);
   7116	int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
   7117			       int bytes, void *val);
   7118	int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
   7119				    void *val, int bytes);
   7120	bool write;
   7121};
   7122
   7123static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
   7124{
   7125	if (vcpu->mmio_read_completed) {
   7126		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
   7127			       vcpu->mmio_fragments[0].gpa, val);
   7128		vcpu->mmio_read_completed = 0;
   7129		return 1;
   7130	}
   7131
   7132	return 0;
   7133}
   7134
   7135static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
   7136			void *val, int bytes)
   7137{
   7138	return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
   7139}
   7140
   7141static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
   7142			 void *val, int bytes)
   7143{
   7144	return emulator_write_phys(vcpu, gpa, val, bytes);
   7145}
   7146
   7147static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
   7148{
   7149	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
   7150	return vcpu_mmio_write(vcpu, gpa, bytes, val);
   7151}
   7152
   7153static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
   7154			  void *val, int bytes)
   7155{
   7156	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
   7157	return X86EMUL_IO_NEEDED;
   7158}
   7159
   7160static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
   7161			   void *val, int bytes)
   7162{
   7163	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
   7164
   7165	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
   7166	return X86EMUL_CONTINUE;
   7167}
   7168
   7169static const struct read_write_emulator_ops read_emultor = {
   7170	.read_write_prepare = read_prepare,
   7171	.read_write_emulate = read_emulate,
   7172	.read_write_mmio = vcpu_mmio_read,
   7173	.read_write_exit_mmio = read_exit_mmio,
   7174};
   7175
   7176static const struct read_write_emulator_ops write_emultor = {
   7177	.read_write_emulate = write_emulate,
   7178	.read_write_mmio = write_mmio,
   7179	.read_write_exit_mmio = write_exit_mmio,
   7180	.write = true,
   7181};
   7182
   7183static int emulator_read_write_onepage(unsigned long addr, void *val,
   7184				       unsigned int bytes,
   7185				       struct x86_exception *exception,
   7186				       struct kvm_vcpu *vcpu,
   7187				       const struct read_write_emulator_ops *ops)
   7188{
   7189	gpa_t gpa;
   7190	int handled, ret;
   7191	bool write = ops->write;
   7192	struct kvm_mmio_fragment *frag;
   7193	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
   7194
    7195	/*
    7196	 * If the exit was due to an NPF we may already have a GPA.
    7197	 * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
    7198	 * Note, this cannot be used on string operations since a string
    7199	 * operation using rep only carries the initial GPA from the NPF
    7200	 * that occurred.
    7201	 */
   7202	if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
   7203	    (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
   7204		gpa = ctxt->gpa_val;
   7205		ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
   7206	} else {
   7207		ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
   7208		if (ret < 0)
   7209			return X86EMUL_PROPAGATE_FAULT;
   7210	}
   7211
   7212	if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
   7213		return X86EMUL_CONTINUE;
   7214
   7215	/*
   7216	 * Is this MMIO handled locally?
   7217	 */
   7218	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
   7219	if (handled == bytes)
   7220		return X86EMUL_CONTINUE;
   7221
   7222	gpa += handled;
   7223	bytes -= handled;
   7224	val += handled;
   7225
   7226	WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
   7227	frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
   7228	frag->gpa = gpa;
   7229	frag->data = val;
   7230	frag->len = bytes;
   7231	return X86EMUL_CONTINUE;
   7232}
   7233
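        /*
         * Emulate a guest memory access, splitting it at a page boundary if
         * needed.  Any portion that is not claimed by an in-kernel device is
         * recorded in vcpu->mmio_fragments and completed by userspace via a
         * KVM_EXIT_MMIO exit.
         */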
   7234static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
   7235			unsigned long addr,
   7236			void *val, unsigned int bytes,
   7237			struct x86_exception *exception,
   7238			const struct read_write_emulator_ops *ops)
   7239{
   7240	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7241	gpa_t gpa;
   7242	int rc;
   7243
   7244	if (ops->read_write_prepare &&
   7245		  ops->read_write_prepare(vcpu, val, bytes))
   7246		return X86EMUL_CONTINUE;
   7247
   7248	vcpu->mmio_nr_fragments = 0;
   7249
   7250	/* Crossing a page boundary? */
   7251	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
   7252		int now;
   7253
   7254		now = -addr & ~PAGE_MASK;
   7255		rc = emulator_read_write_onepage(addr, val, now, exception,
   7256						 vcpu, ops);
   7257
   7258		if (rc != X86EMUL_CONTINUE)
   7259			return rc;
   7260		addr += now;
   7261		if (ctxt->mode != X86EMUL_MODE_PROT64)
   7262			addr = (u32)addr;
   7263		val += now;
   7264		bytes -= now;
   7265	}
   7266
   7267	rc = emulator_read_write_onepage(addr, val, bytes, exception,
   7268					 vcpu, ops);
   7269	if (rc != X86EMUL_CONTINUE)
   7270		return rc;
   7271
   7272	if (!vcpu->mmio_nr_fragments)
   7273		return rc;
   7274
   7275	gpa = vcpu->mmio_fragments[0].gpa;
   7276
   7277	vcpu->mmio_needed = 1;
   7278	vcpu->mmio_cur_fragment = 0;
   7279
   7280	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
   7281	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
   7282	vcpu->run->exit_reason = KVM_EXIT_MMIO;
   7283	vcpu->run->mmio.phys_addr = gpa;
   7284
   7285	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
   7286}
   7287
   7288static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
   7289				  unsigned long addr,
   7290				  void *val,
   7291				  unsigned int bytes,
   7292				  struct x86_exception *exception)
   7293{
   7294	return emulator_read_write(ctxt, addr, val, bytes,
   7295				   exception, &read_emultor);
   7296}
   7297
   7298static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
   7299			    unsigned long addr,
   7300			    const void *val,
   7301			    unsigned int bytes,
   7302			    struct x86_exception *exception)
   7303{
   7304	return emulator_read_write(ctxt, addr, (void *)val, bytes,
   7305				   exception, &write_emultor);
   7306}
   7307
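        /*
         * Emulate the guest's atomic compare-and-exchange directly on the
         * host userspace mapping of the guest page so that the update is
         * atomic with respect to other vCPUs.  When that is not possible
         * (unsupported size, untranslatable or APIC address, or an access
         * that crosses the allowed boundary), fall back to emulating the
         * exchange as a plain write.
         */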
   7308#define emulator_try_cmpxchg_user(t, ptr, old, new) \
   7309	(__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
   7310
   7311static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
   7312				     unsigned long addr,
   7313				     const void *old,
   7314				     const void *new,
   7315				     unsigned int bytes,
   7316				     struct x86_exception *exception)
   7317{
   7318	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7319	u64 page_line_mask;
   7320	unsigned long hva;
   7321	gpa_t gpa;
   7322	int r;
   7323
    7324	/* the guest's cmpxchg8b has to be emulated atomically */
   7325	if (bytes > 8 || (bytes & (bytes - 1)))
   7326		goto emul_write;
   7327
   7328	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
   7329
   7330	if (gpa == UNMAPPED_GVA ||
   7331	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
   7332		goto emul_write;
   7333
   7334	/*
   7335	 * Emulate the atomic as a straight write to avoid #AC if SLD is
   7336	 * enabled in the host and the access splits a cache line.
   7337	 */
   7338	if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
   7339		page_line_mask = ~(cache_line_size() - 1);
   7340	else
   7341		page_line_mask = PAGE_MASK;
   7342
   7343	if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
   7344		goto emul_write;
   7345
   7346	hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
   7347	if (kvm_is_error_hva(hva))
   7348		goto emul_write;
   7349
   7350	hva += offset_in_page(gpa);
   7351
   7352	switch (bytes) {
   7353	case 1:
   7354		r = emulator_try_cmpxchg_user(u8, hva, old, new);
   7355		break;
   7356	case 2:
   7357		r = emulator_try_cmpxchg_user(u16, hva, old, new);
   7358		break;
   7359	case 4:
   7360		r = emulator_try_cmpxchg_user(u32, hva, old, new);
   7361		break;
   7362	case 8:
   7363		r = emulator_try_cmpxchg_user(u64, hva, old, new);
   7364		break;
   7365	default:
   7366		BUG();
   7367	}
   7368
   7369	if (r < 0)
   7370		return X86EMUL_UNHANDLEABLE;
   7371	if (r)
   7372		return X86EMUL_CMPXCHG_FAILED;
   7373
   7374	kvm_page_track_write(vcpu, gpa, new, bytes);
   7375
   7376	return X86EMUL_CONTINUE;
   7377
   7378emul_write:
   7379	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
   7380
   7381	return emulator_write_emulated(ctxt, addr, new, bytes, exception);
   7382}
   7383
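        /*
         * Feed each element of the pending PIO operation to the in-kernel
         * KVM_PIO_BUS.  Returns 0 if every element was handled in the kernel,
         * or a non-zero value as soon as an element is not claimed by an
         * in-kernel device.
         */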
   7384static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
   7385{
   7386	int r = 0, i;
   7387
   7388	for (i = 0; i < vcpu->arch.pio.count; i++) {
   7389		if (vcpu->arch.pio.in)
   7390			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
   7391					    vcpu->arch.pio.size, pd);
   7392		else
   7393			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
   7394					     vcpu->arch.pio.port, vcpu->arch.pio.size,
   7395					     pd);
   7396		if (r)
   7397			break;
   7398		pd += vcpu->arch.pio.size;
   7399	}
   7400	return r;
   7401}
   7402
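        /*
         * Record the in-flight PIO operation and try to complete it on the
         * in-kernel PIO bus.  Returns 1 if the operation was fully handled in
         * the kernel; otherwise fills in vcpu->run for a KVM_EXIT_IO exit and
         * returns 0 so that userspace can complete it.
         */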
   7403static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
   7404			       unsigned short port,
   7405			       unsigned int count, bool in)
   7406{
   7407	vcpu->arch.pio.port = port;
   7408	vcpu->arch.pio.in = in;
   7409	vcpu->arch.pio.count  = count;
   7410	vcpu->arch.pio.size = size;
   7411
   7412	if (!kernel_pio(vcpu, vcpu->arch.pio_data))
   7413		return 1;
   7414
   7415	vcpu->run->exit_reason = KVM_EXIT_IO;
   7416	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
   7417	vcpu->run->io.size = size;
   7418	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
   7419	vcpu->run->io.count = count;
   7420	vcpu->run->io.port = port;
   7421
   7422	return 0;
   7423}
   7424
   7425static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size,
   7426			     unsigned short port, unsigned int count)
   7427{
   7428	WARN_ON(vcpu->arch.pio.count);
   7429	memset(vcpu->arch.pio_data, 0, size * count);
   7430	return emulator_pio_in_out(vcpu, size, port, count, true);
   7431}
   7432
   7433static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
   7434{
   7435	int size = vcpu->arch.pio.size;
   7436	unsigned count = vcpu->arch.pio.count;
   7437	memcpy(val, vcpu->arch.pio_data, size * count);
   7438	trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
   7439	vcpu->arch.pio.count = 0;
   7440}
   7441
   7442static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
   7443			   unsigned short port, void *val, unsigned int count)
   7444{
   7445	if (vcpu->arch.pio.count) {
   7446		/*
   7447		 * Complete a previous iteration that required userspace I/O.
   7448		 * Note, @count isn't guaranteed to match pio.count as userspace
   7449		 * can modify ECX before rerunning the vCPU.  Ignore any such
   7450		 * shenanigans as KVM doesn't support modifying the rep count,
   7451		 * and the emulator ensures @count doesn't overflow the buffer.
   7452		 */
   7453	} else {
   7454		int r = __emulator_pio_in(vcpu, size, port, count);
   7455		if (!r)
   7456			return r;
   7457
   7458		/* Results already available, fall through.  */
   7459	}
   7460
   7461	complete_emulator_pio_in(vcpu, val);
   7462	return 1;
   7463}
   7464
   7465static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
   7466				    int size, unsigned short port, void *val,
   7467				    unsigned int count)
   7468{
   7469	return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
   7470
   7471}
   7472
   7473static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
   7474			    unsigned short port, const void *val,
   7475			    unsigned int count)
   7476{
   7477	int ret;
   7478
   7479	memcpy(vcpu->arch.pio_data, val, size * count);
   7480	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
   7481	ret = emulator_pio_in_out(vcpu, size, port, count, false);
   7482	if (ret)
    7483		vcpu->arch.pio.count = 0;
    7484
    7485	return ret;
   7486}
   7487
   7488static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
   7489				     int size, unsigned short port,
   7490				     const void *val, unsigned int count)
   7491{
   7492	return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
   7493}
   7494
   7495static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
   7496{
   7497	return static_call(kvm_x86_get_segment_base)(vcpu, seg);
   7498}
   7499
   7500static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
   7501{
   7502	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
   7503}
   7504
   7505static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
   7506{
   7507	if (!need_emulate_wbinvd(vcpu))
   7508		return X86EMUL_CONTINUE;
   7509
   7510	if (static_call(kvm_x86_has_wbinvd_exit)()) {
   7511		int cpu = get_cpu();
   7512
   7513		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
   7514		on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
   7515				wbinvd_ipi, NULL, 1);
   7516		put_cpu();
   7517		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
   7518	} else
   7519		wbinvd();
   7520	return X86EMUL_CONTINUE;
   7521}
   7522
   7523int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
   7524{
   7525	kvm_emulate_wbinvd_noskip(vcpu);
   7526	return kvm_skip_emulated_instruction(vcpu);
   7527}
   7528EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
   7529
   7530
   7531
   7532static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
   7533{
   7534	kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
   7535}
   7536
   7537static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
   7538			    unsigned long *dest)
   7539{
   7540	kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
   7541}
   7542
   7543static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
   7544			   unsigned long value)
   7545{
   7546
   7547	return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
   7548}
   7549
   7550static u64 mk_cr_64(u64 curr_cr, u32 new_val)
   7551{
   7552	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
   7553}
   7554
   7555static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
   7556{
   7557	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7558	unsigned long value;
   7559
   7560	switch (cr) {
   7561	case 0:
   7562		value = kvm_read_cr0(vcpu);
   7563		break;
   7564	case 2:
   7565		value = vcpu->arch.cr2;
   7566		break;
   7567	case 3:
   7568		value = kvm_read_cr3(vcpu);
   7569		break;
   7570	case 4:
   7571		value = kvm_read_cr4(vcpu);
   7572		break;
   7573	case 8:
   7574		value = kvm_get_cr8(vcpu);
   7575		break;
   7576	default:
   7577		kvm_err("%s: unexpected cr %u\n", __func__, cr);
   7578		return 0;
   7579	}
   7580
   7581	return value;
   7582}
   7583
   7584static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
   7585{
   7586	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7587	int res = 0;
   7588
   7589	switch (cr) {
   7590	case 0:
   7591		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
   7592		break;
   7593	case 2:
   7594		vcpu->arch.cr2 = val;
   7595		break;
   7596	case 3:
   7597		res = kvm_set_cr3(vcpu, val);
   7598		break;
   7599	case 4:
   7600		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
   7601		break;
   7602	case 8:
   7603		res = kvm_set_cr8(vcpu, val);
   7604		break;
   7605	default:
   7606		kvm_err("%s: unexpected cr %u\n", __func__, cr);
   7607		res = -1;
   7608	}
   7609
   7610	return res;
   7611}
   7612
   7613static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
   7614{
   7615	return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
   7616}
   7617
   7618static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
   7619{
   7620	static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
   7621}
   7622
   7623static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
   7624{
   7625	static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
   7626}
   7627
   7628static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
   7629{
   7630	static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
   7631}
   7632
   7633static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
   7634{
   7635	static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
   7636}
   7637
   7638static unsigned long emulator_get_cached_segment_base(
   7639	struct x86_emulate_ctxt *ctxt, int seg)
   7640{
   7641	return get_segment_base(emul_to_vcpu(ctxt), seg);
   7642}
   7643
   7644static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
   7645				 struct desc_struct *desc, u32 *base3,
   7646				 int seg)
   7647{
   7648	struct kvm_segment var;
   7649
   7650	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
   7651	*selector = var.selector;
   7652
   7653	if (var.unusable) {
   7654		memset(desc, 0, sizeof(*desc));
   7655		if (base3)
   7656			*base3 = 0;
   7657		return false;
   7658	}
   7659
   7660	if (var.g)
   7661		var.limit >>= 12;
   7662	set_desc_limit(desc, var.limit);
   7663	set_desc_base(desc, (unsigned long)var.base);
   7664#ifdef CONFIG_X86_64
   7665	if (base3)
   7666		*base3 = var.base >> 32;
   7667#endif
   7668	desc->type = var.type;
   7669	desc->s = var.s;
   7670	desc->dpl = var.dpl;
   7671	desc->p = var.present;
   7672	desc->avl = var.avl;
   7673	desc->l = var.l;
   7674	desc->d = var.db;
   7675	desc->g = var.g;
   7676
   7677	return true;
   7678}
   7679
   7680static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
   7681				 struct desc_struct *desc, u32 base3,
   7682				 int seg)
   7683{
   7684	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7685	struct kvm_segment var;
   7686
   7687	var.selector = selector;
   7688	var.base = get_desc_base(desc);
   7689#ifdef CONFIG_X86_64
   7690	var.base |= ((u64)base3) << 32;
   7691#endif
   7692	var.limit = get_desc_limit(desc);
   7693	if (desc->g)
   7694		var.limit = (var.limit << 12) | 0xfff;
   7695	var.type = desc->type;
   7696	var.dpl = desc->dpl;
   7697	var.db = desc->d;
   7698	var.s = desc->s;
   7699	var.l = desc->l;
   7700	var.g = desc->g;
   7701	var.avl = desc->avl;
   7702	var.present = desc->p;
   7703	var.unusable = !var.present;
   7704	var.padding = 0;
   7705
   7706	kvm_set_segment(vcpu, &var, seg);
   7707	return;
   7708}
   7709
   7710static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
   7711					u32 msr_index, u64 *pdata)
   7712{
   7713	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7714	int r;
   7715
   7716	r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
   7717
   7718	if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
   7719				    complete_emulated_rdmsr, r)) {
   7720		/* Bounce to user space */
   7721		return X86EMUL_IO_NEEDED;
   7722	}
   7723
   7724	return r;
   7725}
   7726
   7727static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
   7728					u32 msr_index, u64 data)
   7729{
   7730	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7731	int r;
   7732
   7733	r = kvm_set_msr_with_filter(vcpu, msr_index, data);
   7734
   7735	if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
   7736				    complete_emulated_msr_access, r)) {
   7737		/* Bounce to user space */
   7738		return X86EMUL_IO_NEEDED;
   7739	}
   7740
   7741	return r;
   7742}
   7743
   7744static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
   7745			    u32 msr_index, u64 *pdata)
   7746{
   7747	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
   7748}
   7749
   7750static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
   7751			    u32 msr_index, u64 data)
   7752{
   7753	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
   7754}
   7755
   7756static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
   7757{
   7758	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7759
   7760	return vcpu->arch.smbase;
   7761}
   7762
   7763static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
   7764{
   7765	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7766
   7767	vcpu->arch.smbase = smbase;
   7768}
   7769
   7770static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
   7771			      u32 pmc)
   7772{
   7773	if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
   7774		return 0;
   7775	return -EINVAL;
   7776}
   7777
   7778static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
   7779			     u32 pmc, u64 *pdata)
   7780{
   7781	return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
   7782}
   7783
   7784static void emulator_halt(struct x86_emulate_ctxt *ctxt)
   7785{
   7786	emul_to_vcpu(ctxt)->arch.halt_request = 1;
   7787}
   7788
   7789static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
   7790			      struct x86_instruction_info *info,
   7791			      enum x86_intercept_stage stage)
   7792{
   7793	return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
   7794					    &ctxt->exception);
   7795}
   7796
   7797static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
   7798			      u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
   7799			      bool exact_only)
   7800{
   7801	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
   7802}
   7803
   7804static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
   7805{
   7806	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
   7807}
   7808
   7809static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
   7810{
   7811	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
   7812}
   7813
   7814static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
   7815{
   7816	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
   7817}
   7818
   7819static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
   7820{
   7821	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
   7822}
   7823
   7824static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
   7825{
   7826	return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
   7827}
   7828
   7829static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
   7830{
   7831	kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
   7832}
   7833
   7834static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
   7835{
   7836	static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
   7837}
   7838
   7839static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
   7840{
   7841	return emul_to_vcpu(ctxt)->arch.hflags;
   7842}
   7843
   7844static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
   7845{
   7846	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   7847
   7848	kvm_smm_changed(vcpu, false);
   7849}
   7850
   7851static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
   7852				  const char *smstate)
   7853{
   7854	return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
   7855}
   7856
   7857static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
   7858{
   7859	kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
   7860}
   7861
   7862static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
   7863{
   7864	return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
   7865}
   7866
   7867static const struct x86_emulate_ops emulate_ops = {
   7868	.read_gpr            = emulator_read_gpr,
   7869	.write_gpr           = emulator_write_gpr,
   7870	.read_std            = emulator_read_std,
   7871	.write_std           = emulator_write_std,
   7872	.read_phys           = kvm_read_guest_phys_system,
   7873	.fetch               = kvm_fetch_guest_virt,
   7874	.read_emulated       = emulator_read_emulated,
   7875	.write_emulated      = emulator_write_emulated,
   7876	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
   7877	.invlpg              = emulator_invlpg,
   7878	.pio_in_emulated     = emulator_pio_in_emulated,
   7879	.pio_out_emulated    = emulator_pio_out_emulated,
   7880	.get_segment         = emulator_get_segment,
   7881	.set_segment         = emulator_set_segment,
   7882	.get_cached_segment_base = emulator_get_cached_segment_base,
   7883	.get_gdt             = emulator_get_gdt,
   7884	.get_idt	     = emulator_get_idt,
   7885	.set_gdt             = emulator_set_gdt,
   7886	.set_idt	     = emulator_set_idt,
   7887	.get_cr              = emulator_get_cr,
   7888	.set_cr              = emulator_set_cr,
   7889	.cpl                 = emulator_get_cpl,
   7890	.get_dr              = emulator_get_dr,
   7891	.set_dr              = emulator_set_dr,
   7892	.get_smbase          = emulator_get_smbase,
   7893	.set_smbase          = emulator_set_smbase,
   7894	.set_msr_with_filter = emulator_set_msr_with_filter,
   7895	.get_msr_with_filter = emulator_get_msr_with_filter,
   7896	.set_msr             = emulator_set_msr,
   7897	.get_msr             = emulator_get_msr,
   7898	.check_pmc	     = emulator_check_pmc,
   7899	.read_pmc            = emulator_read_pmc,
   7900	.halt                = emulator_halt,
   7901	.wbinvd              = emulator_wbinvd,
   7902	.fix_hypercall       = emulator_fix_hypercall,
   7903	.intercept           = emulator_intercept,
   7904	.get_cpuid           = emulator_get_cpuid,
   7905	.guest_has_long_mode = emulator_guest_has_long_mode,
   7906	.guest_has_movbe     = emulator_guest_has_movbe,
   7907	.guest_has_fxsr      = emulator_guest_has_fxsr,
   7908	.guest_has_rdpid     = emulator_guest_has_rdpid,
   7909	.set_nmi_mask        = emulator_set_nmi_mask,
   7910	.get_hflags          = emulator_get_hflags,
   7911	.exiting_smm         = emulator_exiting_smm,
   7912	.leave_smm           = emulator_leave_smm,
   7913	.triple_fault        = emulator_triple_fault,
   7914	.set_xcr             = emulator_set_xcr,
   7915};
   7916
   7917static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
   7918{
   7919	u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
    7920	/*
    7921	 * An sti; sti sequence only disables interrupts for the first
    7922	 * instruction. So, if the last instruction, be it emulated or
    7923	 * not, left the system with the INT_STI flag enabled, it
    7924	 * means that the last instruction was an sti. We should not
    7925	 * leave the flag on in this case. The same goes for mov ss.
    7926	 */
   7927	if (int_shadow & mask)
   7928		mask = 0;
   7929	if (unlikely(int_shadow || mask)) {
   7930		static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
   7931		if (!mask)
   7932			kvm_make_request(KVM_REQ_EVENT, vcpu);
   7933	}
   7934}
   7935
   7936static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
   7937{
   7938	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
   7939	if (ctxt->exception.vector == PF_VECTOR)
   7940		return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
   7941
   7942	if (ctxt->exception.error_code_valid)
   7943		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
   7944				      ctxt->exception.error_code);
   7945	else
   7946		kvm_queue_exception(vcpu, ctxt->exception.vector);
   7947	return false;
   7948}
   7949
   7950static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
   7951{
   7952	struct x86_emulate_ctxt *ctxt;
   7953
   7954	ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
   7955	if (!ctxt) {
   7956		pr_err("kvm: failed to allocate vcpu's emulator\n");
   7957		return NULL;
   7958	}
   7959
   7960	ctxt->vcpu = vcpu;
   7961	ctxt->ops = &emulate_ops;
   7962	vcpu->arch.emulate_ctxt = ctxt;
   7963
   7964	return ctxt;
   7965}
   7966
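        /*
         * (Re)initialize the emulation context from the current vCPU state:
         * RIP, RFLAGS and the execution mode derived from CR0.PE, EFLAGS.VM
         * and the CS.L/CS.DB bits.
         */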
   7967static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
   7968{
   7969	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
   7970	int cs_db, cs_l;
   7971
   7972	static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
   7973
   7974	ctxt->gpa_available = false;
   7975	ctxt->eflags = kvm_get_rflags(vcpu);
   7976	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
   7977
   7978	ctxt->eip = kvm_rip_read(vcpu);
   7979	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
   7980		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
   7981		     (cs_l && is_long_mode(vcpu))	? X86EMUL_MODE_PROT64 :
   7982		     cs_db				? X86EMUL_MODE_PROT32 :
   7983							  X86EMUL_MODE_PROT16;
   7984	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
   7985	BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
   7986	BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
   7987
   7988	ctxt->interruptibility = 0;
   7989	ctxt->have_exception = false;
   7990	ctxt->exception.vector = -1;
   7991	ctxt->perm_ok = false;
   7992
   7993	init_decode_cache(ctxt);
   7994	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
   7995}
   7996
   7997void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
   7998{
   7999	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
   8000	int ret;
   8001
   8002	init_emulate_ctxt(vcpu);
   8003
   8004	ctxt->op_bytes = 2;
   8005	ctxt->ad_bytes = 2;
   8006	ctxt->_eip = ctxt->eip + inc_eip;
   8007	ret = emulate_int_real(ctxt, irq);
   8008
   8009	if (ret != X86EMUL_CONTINUE) {
   8010		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
   8011	} else {
   8012		ctxt->eip = ctxt->_eip;
   8013		kvm_rip_write(vcpu, ctxt->eip);
   8014		kvm_set_rflags(vcpu, ctxt->eflags);
   8015	}
   8016}
   8017EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
   8018
   8019static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
   8020					   u8 ndata, u8 *insn_bytes, u8 insn_size)
   8021{
   8022	struct kvm_run *run = vcpu->run;
   8023	u64 info[5];
   8024	u8 info_start;
   8025
   8026	/*
   8027	 * Zero the whole array used to retrieve the exit info, as casting to
   8028	 * u32 for select entries will leave some chunks uninitialized.
   8029	 */
   8030	memset(&info, 0, sizeof(info));
   8031
   8032	static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
   8033					   &info[2], (u32 *)&info[3],
   8034					   (u32 *)&info[4]);
   8035
   8036	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   8037	run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
   8038
   8039	/*
   8040	 * There's currently space for 13 entries, but 5 are used for the exit
   8041	 * reason and info.  Restrict to 4 to reduce the maintenance burden
   8042	 * when expanding kvm_run.emulation_failure in the future.
   8043	 */
   8044	if (WARN_ON_ONCE(ndata > 4))
   8045		ndata = 4;
   8046
   8047	/* Always include the flags as a 'data' entry. */
   8048	info_start = 1;
   8049	run->emulation_failure.flags = 0;
   8050
   8051	if (insn_size) {
   8052		BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
   8053			      sizeof(run->emulation_failure.insn_bytes) != 16));
   8054		info_start += 2;
   8055		run->emulation_failure.flags |=
   8056			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
   8057		run->emulation_failure.insn_size = insn_size;
   8058		memset(run->emulation_failure.insn_bytes, 0x90,
   8059		       sizeof(run->emulation_failure.insn_bytes));
   8060		memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
   8061	}
   8062
   8063	memcpy(&run->internal.data[info_start], info, sizeof(info));
   8064	memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
   8065	       ndata * sizeof(data[0]));
   8066
   8067	run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
   8068}
   8069
   8070static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
   8071{
   8072	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
   8073
   8074	prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
   8075				       ctxt->fetch.end - ctxt->fetch.data);
   8076}
   8077
   8078void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
   8079					  u8 ndata)
   8080{
   8081	prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
   8082}
   8083EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
   8084
   8085void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
   8086{
   8087	__kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
   8088}
   8089EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
   8090
   8091static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
   8092{
   8093	struct kvm *kvm = vcpu->kvm;
   8094
   8095	++vcpu->stat.insn_emulation_fail;
   8096	trace_kvm_emulate_insn_failed(vcpu);
   8097
   8098	if (emulation_type & EMULTYPE_VMWARE_GP) {
   8099		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
   8100		return 1;
   8101	}
   8102
   8103	if (kvm->arch.exit_on_emulation_error ||
   8104	    (emulation_type & EMULTYPE_SKIP)) {
   8105		prepare_emulation_ctxt_failure_exit(vcpu);
   8106		return 0;
   8107	}
   8108
   8109	kvm_queue_exception(vcpu, UD_VECTOR);
   8110
   8111	if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
   8112		prepare_emulation_ctxt_failure_exit(vcpu);
   8113		return 0;
   8114	}
   8115
   8116	return 1;
   8117}
   8118
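        /*
         * Decide whether a failed emulation that was triggered by a guest
         * page fault can be resolved by unprotecting the shadowed guest page
         * and letting the CPU re-execute the instruction natively, instead of
         * reporting an emulation error to userspace.
         */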
   8119static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
   8120				  bool write_fault_to_shadow_pgtable,
   8121				  int emulation_type)
   8122{
   8123	gpa_t gpa = cr2_or_gpa;
   8124	kvm_pfn_t pfn;
   8125
   8126	if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
   8127		return false;
   8128
   8129	if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
   8130	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
   8131		return false;
   8132
   8133	if (!vcpu->arch.mmu->root_role.direct) {
   8134		/*
   8135		 * Write permission should be allowed since only
    8136		 * write access needs to be emulated.
   8137		 */
   8138		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
   8139
   8140		/*
    8141		 * If the mapping is invalid in the guest, let the CPU retry
    8142		 * it to generate a fault.
   8143		 */
   8144		if (gpa == UNMAPPED_GVA)
   8145			return true;
   8146	}
   8147
   8148	/*
   8149	 * Do not retry the unhandleable instruction if it faults on the
    8150	 * readonly host memory, otherwise it will go into an infinite loop:
   8151	 * retry instruction -> write #PF -> emulation fail -> retry
   8152	 * instruction -> ...
   8153	 */
   8154	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
   8155
   8156	/*
    8157	 * If the instruction failed on the error pfn, it cannot be fixed;
    8158	 * report the error to userspace.
   8159	 */
   8160	if (is_error_noslot_pfn(pfn))
   8161		return false;
   8162
   8163	kvm_release_pfn_clean(pfn);
   8164
   8165	/* The instructions are well-emulated on direct mmu. */
   8166	if (vcpu->arch.mmu->root_role.direct) {
   8167		unsigned int indirect_shadow_pages;
   8168
   8169		write_lock(&vcpu->kvm->mmu_lock);
   8170		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
   8171		write_unlock(&vcpu->kvm->mmu_lock);
   8172
   8173		if (indirect_shadow_pages)
   8174			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
   8175
   8176		return true;
   8177	}
   8178
   8179	/*
    8180	 * If emulation failed due to an access to a shadowed page
    8181	 * table, try to unshadow the page and re-enter the guest to
    8182	 * let the CPU execute the instruction.
   8183	 */
   8184	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
   8185
   8186	/*
    8187	 * If the access faults on its own page table, it cannot
    8188	 * be fixed by unprotecting the shadow page and it should
    8189	 * be reported to userspace.
   8190	 */
   8191	return !write_fault_to_shadow_pgtable;
   8192}
   8193
   8194static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
   8195			      gpa_t cr2_or_gpa,  int emulation_type)
   8196{
   8197	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   8198	unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
   8199
   8200	last_retry_eip = vcpu->arch.last_retry_eip;
   8201	last_retry_addr = vcpu->arch.last_retry_addr;
   8202
    8203	/*
    8204	 * If the emulation was caused by a #PF and the faulting instruction
    8205	 * does not write page tables, the VM-exit was caused by a
    8206	 * write-protected shadow page; we can zap the shadow page and retry
    8207	 * the instruction directly.
    8208	 *
    8209	 * Note: if the guest uses a non-page-table modifying instruction
    8210	 * on the PDE that points to the instruction, then we will unmap
    8211	 * the instruction and end up in an infinite loop. So, we cache the
    8212	 * last retried eip and the last fault address; if we see the same
    8213	 * eip and address again, we can break out of the potential infinite
    8214	 * loop.
    8215	 */
   8216	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
   8217
   8218	if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
   8219		return false;
   8220
   8221	if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
   8222	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
   8223		return false;
   8224
   8225	if (x86_page_table_writing_insn(ctxt))
   8226		return false;
   8227
   8228	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
   8229		return false;
   8230
   8231	vcpu->arch.last_retry_eip = ctxt->eip;
   8232	vcpu->arch.last_retry_addr = cr2_or_gpa;
   8233
   8234	if (!vcpu->arch.mmu->root_role.direct)
   8235		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
   8236
   8237	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
   8238
   8239	return true;
   8240}
   8241
   8242static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
   8243static int complete_emulated_pio(struct kvm_vcpu *vcpu);
   8244
   8245static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
   8246{
   8247	trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
   8248
   8249	if (entering_smm) {
   8250		vcpu->arch.hflags |= HF_SMM_MASK;
   8251	} else {
   8252		vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
   8253
   8254		/* Process a latched INIT or SMI, if any.  */
   8255		kvm_make_request(KVM_REQ_EVENT, vcpu);
   8256
   8257		/*
   8258		 * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
   8259		 * on SMM exit we still need to reload them from
   8260		 * guest memory
   8261		 */
   8262		vcpu->arch.pdptrs_from_userspace = false;
   8263	}
   8264
   8265	kvm_mmu_reset_context(vcpu);
   8266}
   8267
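        /*
         * Check @addr against the four hardware breakpoints described by @dr7
         * and @db: each breakpoint has two enable bits (L/G) in the low byte
         * of DR7 and a four-bit R/W+LEN field starting at bit 16.  Returns a
         * DR6-style bitmask with bit i set for every enabled breakpoint of
         * the requested @type whose address matches.
         */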
   8268static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
   8269				unsigned long *db)
   8270{
   8271	u32 dr6 = 0;
   8272	int i;
   8273	u32 enable, rwlen;
   8274
   8275	enable = dr7;
   8276	rwlen = dr7 >> 16;
   8277	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
   8278		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
   8279			dr6 |= (1 << i);
   8280	return dr6;
   8281}
   8282
   8283static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
   8284{
   8285	struct kvm_run *kvm_run = vcpu->run;
   8286
   8287	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
   8288		kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
   8289		kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
   8290		kvm_run->debug.arch.exception = DB_VECTOR;
   8291		kvm_run->exit_reason = KVM_EXIT_DEBUG;
   8292		return 0;
   8293	}
   8294	kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
   8295	return 1;
   8296}
   8297
   8298int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
   8299{
   8300	unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
   8301	int r;
   8302
   8303	r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
   8304	if (unlikely(!r))
   8305		return 0;
   8306
   8307	kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
   8308
   8309	/*
   8310	 * rflags is the old, "raw" value of the flags.  The new value has
   8311	 * not been saved yet.
   8312	 *
   8313	 * This is correct even for TF set by the guest, because "the
   8314	 * processor will not generate this exception after the instruction
   8315	 * that sets the TF flag".
   8316	 */
   8317	if (unlikely(rflags & X86_EFLAGS_TF))
   8318		r = kvm_vcpu_do_singlestep(vcpu);
   8319	return r;
   8320}
   8321EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
   8322
   8323static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
   8324{
   8325	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
   8326	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
   8327		struct kvm_run *kvm_run = vcpu->run;
   8328		unsigned long eip = kvm_get_linear_rip(vcpu);
   8329		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
   8330					   vcpu->arch.guest_debug_dr7,
   8331					   vcpu->arch.eff_db);
   8332
   8333		if (dr6 != 0) {
   8334			kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
   8335			kvm_run->debug.arch.pc = eip;
   8336			kvm_run->debug.arch.exception = DB_VECTOR;
   8337			kvm_run->exit_reason = KVM_EXIT_DEBUG;
   8338			*r = 0;
   8339			return true;
   8340		}
   8341	}
   8342
   8343	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
   8344	    !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
   8345		unsigned long eip = kvm_get_linear_rip(vcpu);
   8346		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
   8347					   vcpu->arch.dr7,
   8348					   vcpu->arch.db);
   8349
   8350		if (dr6 != 0) {
   8351			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
   8352			*r = 1;
   8353			return true;
   8354		}
   8355	}
   8356
   8357	return false;
   8358}
   8359
   8360static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
   8361{
   8362	switch (ctxt->opcode_len) {
   8363	case 1:
   8364		switch (ctxt->b) {
   8365		case 0xe4:	/* IN */
   8366		case 0xe5:
   8367		case 0xec:
   8368		case 0xed:
   8369		case 0xe6:	/* OUT */
   8370		case 0xe7:
   8371		case 0xee:
   8372		case 0xef:
   8373		case 0x6c:	/* INS */
   8374		case 0x6d:
   8375		case 0x6e:	/* OUTS */
   8376		case 0x6f:
   8377			return true;
   8378		}
   8379		break;
   8380	case 2:
   8381		switch (ctxt->b) {
   8382		case 0x33:	/* RDPMC */
   8383			return true;
   8384		}
   8385		break;
   8386	}
   8387
   8388	return false;
   8389}
   8390
   8391/*
   8392 * Decode an instruction for emulation.  The caller is responsible for handling
   8393 * code breakpoints.  Note, manually detecting code breakpoints is unnecessary
   8394 * (and wrong) when emulating on an intercepted fault-like exception[*], as
   8395 * code breakpoints have higher priority and thus have already been done by
   8396 * hardware.
   8397 *
   8398 * [*] Except #MC, which is higher priority, but KVM should never emulate in
   8399 *     response to a machine check.
   8400 */
   8401int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
   8402				    void *insn, int insn_len)
   8403{
   8404	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
   8405	int r;
   8406
   8407	init_emulate_ctxt(vcpu);
   8408
   8409	r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
   8410
   8411	trace_kvm_emulate_insn_start(vcpu);
   8412	++vcpu->stat.insn_emulation;
   8413
   8414	return r;
   8415}
   8416EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
   8417
   8418int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
   8419			    int emulation_type, void *insn, int insn_len)
   8420{
   8421	int r;
   8422	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
   8423	bool writeback = true;
   8424	bool write_fault_to_spt;
   8425
   8426	if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
   8427		return 1;
   8428
   8429	vcpu->arch.l1tf_flush_l1d = true;
   8430
   8431	/*
   8432	 * Clear write_fault_to_shadow_pgtable here to ensure it is
   8433	 * never reused.
   8434	 */
   8435	write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
   8436	vcpu->arch.write_fault_to_shadow_pgtable = false;
   8437
   8438	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
   8439		kvm_clear_exception_queue(vcpu);
   8440
   8441		/*
   8442		 * Return immediately if RIP hits a code breakpoint, such #DBs
   8443		 * are fault-like and are higher priority than any faults on
   8444		 * the code fetch itself.
   8445		 */
   8446		if (!(emulation_type & EMULTYPE_SKIP) &&
   8447		    kvm_vcpu_check_code_breakpoint(vcpu, &r))
   8448			return r;
   8449
   8450		r = x86_decode_emulated_instruction(vcpu, emulation_type,
   8451						    insn, insn_len);
   8452		if (r != EMULATION_OK)  {
   8453			if ((emulation_type & EMULTYPE_TRAP_UD) ||
   8454			    (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
   8455				kvm_queue_exception(vcpu, UD_VECTOR);
   8456				return 1;
   8457			}
   8458			if (reexecute_instruction(vcpu, cr2_or_gpa,
   8459						  write_fault_to_spt,
   8460						  emulation_type))
   8461				return 1;
   8462			if (ctxt->have_exception) {
   8463				/*
    8464				 * #UD should result in just EMULATION_FAILED, and a
    8465				 * trap-like exception should not be encountered during decode.
   8466				 */
   8467				WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
   8468					     exception_type(ctxt->exception.vector) == EXCPT_TRAP);
   8469				inject_emulated_exception(vcpu);
   8470				return 1;
   8471			}
   8472			return handle_emulation_failure(vcpu, emulation_type);
   8473		}
   8474	}
   8475
   8476	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
   8477	    !is_vmware_backdoor_opcode(ctxt)) {
   8478		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
   8479		return 1;
   8480	}
   8481
   8482	/*
   8483	 * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
   8484	 * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
   8485	 * The caller is responsible for updating interruptibility state and
   8486	 * injecting single-step #DBs.
   8487	 */
   8488	if (emulation_type & EMULTYPE_SKIP) {
   8489		if (ctxt->mode != X86EMUL_MODE_PROT64)
   8490			ctxt->eip = (u32)ctxt->_eip;
   8491		else
   8492			ctxt->eip = ctxt->_eip;
   8493
   8494		if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
   8495			r = 1;
   8496			goto writeback;
   8497		}
   8498
   8499		kvm_rip_write(vcpu, ctxt->eip);
   8500		if (ctxt->eflags & X86_EFLAGS_RF)
   8501			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
   8502		return 1;
   8503	}
   8504
   8505	if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
   8506		return 1;
   8507
    8508	/* This is needed for the vmware backdoor interface to work since
    8509	 * it changes register values during IO operations. */
   8510	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
   8511		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
   8512		emulator_invalidate_register_cache(ctxt);
   8513	}
   8514
   8515restart:
   8516	if (emulation_type & EMULTYPE_PF) {
   8517		/* Save the faulting GPA (cr2) in the address field */
   8518		ctxt->exception.address = cr2_or_gpa;
   8519
   8520		/* With shadow page tables, cr2 contains a GVA or nGPA. */
   8521		if (vcpu->arch.mmu->root_role.direct) {
   8522			ctxt->gpa_available = true;
   8523			ctxt->gpa_val = cr2_or_gpa;
   8524		}
   8525	} else {
   8526		/* Sanitize the address out of an abundance of paranoia. */
   8527		ctxt->exception.address = 0;
   8528	}
   8529
   8530	r = x86_emulate_insn(ctxt);
   8531
   8532	if (r == EMULATION_INTERCEPTED)
   8533		return 1;
   8534
   8535	if (r == EMULATION_FAILED) {
   8536		if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
   8537					emulation_type))
   8538			return 1;
   8539
   8540		return handle_emulation_failure(vcpu, emulation_type);
   8541	}
   8542
   8543	if (ctxt->have_exception) {
   8544		r = 1;
   8545		if (inject_emulated_exception(vcpu))
   8546			return r;
   8547	} else if (vcpu->arch.pio.count) {
   8548		if (!vcpu->arch.pio.in) {
   8549			/* FIXME: return into emulator if single-stepping.  */
   8550			vcpu->arch.pio.count = 0;
   8551		} else {
   8552			writeback = false;
   8553			vcpu->arch.complete_userspace_io = complete_emulated_pio;
   8554		}
   8555		r = 0;
   8556	} else if (vcpu->mmio_needed) {
   8557		++vcpu->stat.mmio_exits;
   8558
   8559		if (!vcpu->mmio_is_write)
   8560			writeback = false;
   8561		r = 0;
   8562		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
   8563	} else if (vcpu->arch.complete_userspace_io) {
   8564		writeback = false;
   8565		r = 0;
   8566	} else if (r == EMULATION_RESTART)
   8567		goto restart;
   8568	else
   8569		r = 1;
   8570
   8571writeback:
   8572	if (writeback) {
   8573		unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
   8574		toggle_interruptibility(vcpu, ctxt->interruptibility);
   8575		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
   8576		if (!ctxt->have_exception ||
   8577		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
   8578			kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
   8579			if (ctxt->is_branch)
   8580				kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
   8581			kvm_rip_write(vcpu, ctxt->eip);
   8582			if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
   8583				r = kvm_vcpu_do_singlestep(vcpu);
   8584			static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
   8585			__kvm_set_rflags(vcpu, ctxt->eflags);
   8586		}
   8587
   8588		/*
   8589		 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
   8590		 * do nothing, and it will be requested again as soon as
   8591		 * the shadow expires.  But we still need to check here,
   8592		 * because POPF has no interrupt shadow.
   8593		 */
   8594		if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
   8595			kvm_make_request(KVM_REQ_EVENT, vcpu);
   8596	} else
   8597		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
   8598
   8599	return r;
   8600}
   8601
   8602int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
   8603{
   8604	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
   8605}
   8606EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
   8607
   8608int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
   8609					void *insn, int insn_len)
   8610{
   8611	return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
   8612}
   8613EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
   8614
   8615static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
   8616{
   8617	vcpu->arch.pio.count = 0;
   8618	return 1;
   8619}
   8620
   8621static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
   8622{
   8623	vcpu->arch.pio.count = 0;
   8624
   8625	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
   8626		return 1;
   8627
   8628	return kvm_skip_emulated_instruction(vcpu);
   8629}
   8630
   8631static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
   8632			    unsigned short port)
   8633{
   8634	unsigned long val = kvm_rax_read(vcpu);
   8635	int ret = emulator_pio_out(vcpu, size, port, &val, 1);
   8636
   8637	if (ret)
   8638		return ret;
   8639
   8640	/*
    8641	 * Work around userspace that relies on the old KVM behavior of %rip being
   8642	 * incremented prior to exiting to userspace to handle "OUT 0x7e".
   8643	 */
   8644	if (port == 0x7e &&
   8645	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
   8646		vcpu->arch.complete_userspace_io =
   8647			complete_fast_pio_out_port_0x7e;
   8648		kvm_skip_emulated_instruction(vcpu);
   8649	} else {
   8650		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
   8651		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
   8652	}
   8653	return 0;
   8654}
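
/*
 * Illustrative userspace-side sketch (editor's addition, not part of the
 * original source): the quirk checked above stays enabled unless userspace
 * turns it off via KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS) on the VM fd,
 * roughly as below.  VM setup and error handling are omitted, and the
 * variable names are made up for illustration.
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_DISABLE_QUIRKS,
 *		.args[0] = KVM_X86_QUIRK_OUT_7E_INC_RIP,
 *	};
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */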
   8655
   8656static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
   8657{
   8658	unsigned long val;
   8659
   8660	/* We should only ever be called with arch.pio.count equal to 1 */
   8661	BUG_ON(vcpu->arch.pio.count != 1);
   8662
   8663	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
   8664		vcpu->arch.pio.count = 0;
   8665		return 1;
   8666	}
   8667
   8668	/* For size less than 4 we merge, else we zero extend */
   8669	val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
   8670
   8671	/*
    8672	 * Since vcpu->arch.pio.count == 1, let emulator_pio_in() perform
    8673	 * the copy and tracing.
   8674	 */
   8675	emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
   8676	kvm_rax_write(vcpu, val);
   8677
   8678	return kvm_skip_emulated_instruction(vcpu);
   8679}
   8680
   8681static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
   8682			   unsigned short port)
   8683{
   8684	unsigned long val;
   8685	int ret;
   8686
   8687	/* For size less than 4 we merge, else we zero extend */
   8688	val = (size < 4) ? kvm_rax_read(vcpu) : 0;
   8689
   8690	ret = emulator_pio_in(vcpu, size, port, &val, 1);
   8691	if (ret) {
   8692		kvm_rax_write(vcpu, val);
   8693		return ret;
   8694	}
   8695
   8696	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
   8697	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
   8698
   8699	return 0;
   8700}
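
/*
 * Worked example (editor's addition, not part of the original source) of
 * the merge/zero-extend rule used above: with RAX = 0xffffffffffff0000, a
 * 2-byte "in ax, dx" that reads 0x1234 must leave RAX = 0xffffffffffff1234
 * (upper bytes preserved), while a 4-byte "in eax, dx" must leave
 * RAX = 0x0000000000001234, since 32-bit results zero-extend on x86.
 * Pre-loading val with RAX only for size < 4 implements exactly that.
 */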
   8701
   8702int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
   8703{
   8704	int ret;
   8705
   8706	if (in)
   8707		ret = kvm_fast_pio_in(vcpu, size, port);
   8708	else
   8709		ret = kvm_fast_pio_out(vcpu, size, port);
   8710	return ret && kvm_skip_emulated_instruction(vcpu);
   8711}
   8712EXPORT_SYMBOL_GPL(kvm_fast_pio);
   8713
   8714static int kvmclock_cpu_down_prep(unsigned int cpu)
   8715{
   8716	__this_cpu_write(cpu_tsc_khz, 0);
   8717	return 0;
   8718}
   8719
   8720static void tsc_khz_changed(void *data)
   8721{
   8722	struct cpufreq_freqs *freq = data;
   8723	unsigned long khz = 0;
   8724
   8725	if (data)
   8726		khz = freq->new;
   8727	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
   8728		khz = cpufreq_quick_get(raw_smp_processor_id());
   8729	if (!khz)
   8730		khz = tsc_khz;
   8731	__this_cpu_write(cpu_tsc_khz, khz);
   8732}
   8733
   8734#ifdef CONFIG_X86_64
   8735static void kvm_hyperv_tsc_notifier(void)
   8736{
   8737	struct kvm *kvm;
   8738	int cpu;
   8739
   8740	mutex_lock(&kvm_lock);
   8741	list_for_each_entry(kvm, &vm_list, vm_list)
   8742		kvm_make_mclock_inprogress_request(kvm);
   8743
   8744	/* no guest entries from this point */
   8745	hyperv_stop_tsc_emulation();
   8746
   8747	/* TSC frequency always matches when on Hyper-V */
   8748	for_each_present_cpu(cpu)
   8749		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
   8750	kvm_max_guest_tsc_khz = tsc_khz;
   8751
   8752	list_for_each_entry(kvm, &vm_list, vm_list) {
   8753		__kvm_start_pvclock_update(kvm);
   8754		pvclock_update_vm_gtod_copy(kvm);
   8755		kvm_end_pvclock_update(kvm);
   8756	}
   8757
   8758	mutex_unlock(&kvm_lock);
   8759}
   8760#endif
   8761
   8762static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
   8763{
   8764	struct kvm *kvm;
   8765	struct kvm_vcpu *vcpu;
   8766	int send_ipi = 0;
   8767	unsigned long i;
   8768
   8769	/*
   8770	 * We allow guests to temporarily run on slowing clocks,
   8771	 * provided we notify them after, or to run on accelerating
   8772	 * clocks, provided we notify them before.  Thus time never
   8773	 * goes backwards.
   8774	 *
   8775	 * However, we have a problem.  We can't atomically update
   8776	 * the frequency of a given CPU from this function; it is
   8777	 * merely a notifier, which can be called from any CPU.
   8778	 * Changing the TSC frequency at arbitrary points in time
   8779	 * requires a recomputation of local variables related to
   8780	 * the TSC for each VCPU.  We must flag these local variables
   8781	 * to be updated and be sure the update takes place with the
   8782	 * new frequency before any guests proceed.
   8783	 *
   8784	 * Unfortunately, the combination of hotplug CPU and frequency
   8785	 * change creates an intractable locking scenario; the order
   8786	 * of when these callouts happen is undefined with respect to
   8787	 * CPU hotplug, and they can race with each other.  As such,
   8788	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
   8789	 * undefined; you can actually have a CPU frequency change take
   8790	 * place in between the computation of X and the setting of the
   8791	 * variable.  To protect against this problem, all updates of
   8792	 * the per_cpu tsc_khz variable are done in an interrupt
   8793	 * protected IPI, and all callers wishing to update the value
   8794	 * must wait for a synchronous IPI to complete (which is trivial
   8795	 * if the caller is on the CPU already).  This establishes the
   8796	 * necessary total order on variable updates.
   8797	 *
   8798	 * Note that because a guest time update may take place
   8799	 * anytime after the setting of the VCPU's request bit, the
   8800	 * correct TSC value must be set before the request.  However,
   8801	 * to ensure the update actually makes it to any guest which
   8802	 * starts running in hardware virtualization between the set
   8803	 * and the acquisition of the spinlock, we must also ping the
   8804	 * CPU after setting the request bit.
   8805	 *
   8806	 */
   8807
   8808	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
   8809
   8810	mutex_lock(&kvm_lock);
   8811	list_for_each_entry(kvm, &vm_list, vm_list) {
   8812		kvm_for_each_vcpu(i, vcpu, kvm) {
   8813			if (vcpu->cpu != cpu)
   8814				continue;
   8815			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
   8816			if (vcpu->cpu != raw_smp_processor_id())
   8817				send_ipi = 1;
   8818		}
   8819	}
   8820	mutex_unlock(&kvm_lock);
   8821
   8822	if (freq->old < freq->new && send_ipi) {
   8823		/*
    8824		 * We upscale the frequency.  Must make sure the guest
    8825		 * doesn't see old kvmclock values while running with
    8826		 * the new frequency, otherwise we risk the guest seeing
   8827		 * time go backwards.
   8828		 *
   8829		 * In case we update the frequency for another cpu
   8830		 * (which might be in guest context) send an interrupt
   8831		 * to kick the cpu out of guest context.  Next time
   8832		 * guest context is entered kvmclock will be updated,
   8833		 * so the guest will not see stale values.
   8834		 */
   8835		smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
   8836	}
   8837}
   8838
   8839static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
   8840				     void *data)
   8841{
   8842	struct cpufreq_freqs *freq = data;
   8843	int cpu;
   8844
   8845	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
   8846		return 0;
   8847	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
   8848		return 0;
   8849
   8850	for_each_cpu(cpu, freq->policy->cpus)
   8851		__kvmclock_cpufreq_notifier(freq, cpu);
   8852
   8853	return 0;
   8854}
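
/*
 * Editor's note (not part of the original source): the two early returns
 * above implement the "time never goes backwards" rule described in
 * __kvmclock_cpufreq_notifier().  For example, a 2.0 GHz -> 1.5 GHz
 * transition is ignored at CPUFREQ_PRECHANGE and handled at
 * CPUFREQ_POSTCHANGE (notify after slowing down), while a 1.5 GHz ->
 * 2.0 GHz transition is handled at CPUFREQ_PRECHANGE and ignored at
 * CPUFREQ_POSTCHANGE (notify before speeding up).
 */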
   8855
   8856static struct notifier_block kvmclock_cpufreq_notifier_block = {
   8857	.notifier_call  = kvmclock_cpufreq_notifier
   8858};
   8859
   8860static int kvmclock_cpu_online(unsigned int cpu)
   8861{
   8862	tsc_khz_changed(NULL);
   8863	return 0;
   8864}
   8865
   8866static void kvm_timer_init(void)
   8867{
   8868	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
   8869		max_tsc_khz = tsc_khz;
   8870
   8871		if (IS_ENABLED(CONFIG_CPU_FREQ)) {
   8872			struct cpufreq_policy *policy;
   8873			int cpu;
   8874
   8875			cpu = get_cpu();
   8876			policy = cpufreq_cpu_get(cpu);
   8877			if (policy) {
   8878				if (policy->cpuinfo.max_freq)
   8879					max_tsc_khz = policy->cpuinfo.max_freq;
   8880				cpufreq_cpu_put(policy);
   8881			}
   8882			put_cpu();
   8883		}
   8884		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
   8885					  CPUFREQ_TRANSITION_NOTIFIER);
   8886	}
   8887
   8888	cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
   8889			  kvmclock_cpu_online, kvmclock_cpu_down_prep);
   8890}
   8891
   8892#ifdef CONFIG_X86_64
   8893static void pvclock_gtod_update_fn(struct work_struct *work)
   8894{
   8895	struct kvm *kvm;
   8896	struct kvm_vcpu *vcpu;
   8897	unsigned long i;
   8898
   8899	mutex_lock(&kvm_lock);
   8900	list_for_each_entry(kvm, &vm_list, vm_list)
   8901		kvm_for_each_vcpu(i, vcpu, kvm)
   8902			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
   8903	atomic_set(&kvm_guest_has_master_clock, 0);
   8904	mutex_unlock(&kvm_lock);
   8905}
   8906
   8907static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
   8908
   8909/*
   8910 * Indirection to move queue_work() out of the tk_core.seq write held
   8911 * region to prevent possible deadlocks against time accessors which
   8912 * are invoked with work related locks held.
   8913 */
   8914static void pvclock_irq_work_fn(struct irq_work *w)
   8915{
   8916	queue_work(system_long_wq, &pvclock_gtod_work);
   8917}
   8918
   8919static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
   8920
   8921/*
   8922 * Notification about pvclock gtod data update.
   8923 */
   8924static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
   8925			       void *priv)
   8926{
   8927	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
   8928	struct timekeeper *tk = priv;
   8929
   8930	update_pvclock_gtod(tk);
   8931
   8932	/*
    8933	 * Disable the master clock if the host does not trust, or does not
    8934	 * use, a TSC-based clocksource.  Delegate queue_work() to irq_work as
    8935	 * this is invoked with tk_core.seq write held.
   8936	 */
   8937	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
   8938	    atomic_read(&kvm_guest_has_master_clock) != 0)
   8939		irq_work_queue(&pvclock_irq_work);
   8940	return 0;
   8941}
   8942
   8943static struct notifier_block pvclock_gtod_notifier = {
   8944	.notifier_call = pvclock_gtod_notify,
   8945};
   8946#endif
   8947
   8948int kvm_arch_init(void *opaque)
   8949{
   8950	struct kvm_x86_init_ops *ops = opaque;
   8951	int r;
   8952
   8953	if (kvm_x86_ops.hardware_enable) {
   8954		pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
   8955		r = -EEXIST;
   8956		goto out;
   8957	}
   8958
   8959	if (!ops->cpu_has_kvm_support()) {
   8960		pr_err_ratelimited("kvm: no hardware support for '%s'\n",
   8961				   ops->runtime_ops->name);
   8962		r = -EOPNOTSUPP;
   8963		goto out;
   8964	}
   8965	if (ops->disabled_by_bios()) {
   8966		pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
   8967				   ops->runtime_ops->name);
   8968		r = -EOPNOTSUPP;
   8969		goto out;
   8970	}
   8971
   8972	/*
   8973	 * KVM explicitly assumes that the guest has an FPU and
   8974	 * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
   8975	 * vCPU's FPU state as a fxregs_state struct.
   8976	 */
   8977	if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
   8978		printk(KERN_ERR "kvm: inadequate fpu\n");
   8979		r = -EOPNOTSUPP;
   8980		goto out;
   8981	}
   8982
   8983	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
   8984		pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
   8985		r = -EOPNOTSUPP;
   8986		goto out;
   8987	}
   8988
   8989	r = -ENOMEM;
   8990
   8991	x86_emulator_cache = kvm_alloc_emulator_cache();
   8992	if (!x86_emulator_cache) {
   8993		pr_err("kvm: failed to allocate cache for x86 emulator\n");
   8994		goto out;
   8995	}
   8996
   8997	user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
   8998	if (!user_return_msrs) {
   8999		printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
   9000		goto out_free_x86_emulator_cache;
   9001	}
   9002	kvm_nr_uret_msrs = 0;
   9003
   9004	r = kvm_mmu_vendor_module_init();
   9005	if (r)
   9006		goto out_free_percpu;
   9007
   9008	kvm_timer_init();
   9009
   9010	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
   9011		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
   9012		supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
   9013	}
   9014
   9015	if (pi_inject_timer == -1)
   9016		pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
   9017#ifdef CONFIG_X86_64
   9018	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
   9019
   9020	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
   9021		set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
   9022#endif
   9023
   9024	return 0;
   9025
   9026out_free_percpu:
   9027	free_percpu(user_return_msrs);
   9028out_free_x86_emulator_cache:
   9029	kmem_cache_destroy(x86_emulator_cache);
   9030out:
   9031	return r;
   9032}
   9033
   9034void kvm_arch_exit(void)
   9035{
   9036#ifdef CONFIG_X86_64
   9037	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
   9038		clear_hv_tscchange_cb();
   9039#endif
   9040	kvm_lapic_exit();
   9041
   9042	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
   9043		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
   9044					    CPUFREQ_TRANSITION_NOTIFIER);
   9045	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
   9046#ifdef CONFIG_X86_64
   9047	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
   9048	irq_work_sync(&pvclock_irq_work);
   9049	cancel_work_sync(&pvclock_gtod_work);
   9050#endif
   9051	kvm_x86_ops.hardware_enable = NULL;
   9052	kvm_mmu_vendor_module_exit();
   9053	free_percpu(user_return_msrs);
   9054	kmem_cache_destroy(x86_emulator_cache);
   9055#ifdef CONFIG_KVM_XEN
   9056	static_key_deferred_flush(&kvm_xen_enabled);
   9057	WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
   9058#endif
   9059}
   9060
   9061static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
   9062{
   9063	/*
   9064	 * The vCPU has halted, e.g. executed HLT.  Update the run state if the
    9065	 * local APIC is in-kernel; the run loop will detect the non-runnable
   9066	 * state and halt the vCPU.  Exit to userspace if the local APIC is
   9067	 * managed by userspace, in which case userspace is responsible for
   9068	 * handling wake events.
   9069	 */
   9070	++vcpu->stat.halt_exits;
   9071	if (lapic_in_kernel(vcpu)) {
   9072		vcpu->arch.mp_state = state;
   9073		return 1;
   9074	} else {
   9075		vcpu->run->exit_reason = reason;
   9076		return 0;
   9077	}
   9078}
   9079
   9080int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
   9081{
   9082	return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
   9083}
   9084EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
   9085
   9086int kvm_emulate_halt(struct kvm_vcpu *vcpu)
   9087{
   9088	int ret = kvm_skip_emulated_instruction(vcpu);
   9089	/*
   9090	 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
   9091	 * KVM_EXIT_DEBUG here.
   9092	 */
   9093	return kvm_emulate_halt_noskip(vcpu) && ret;
   9094}
   9095EXPORT_SYMBOL_GPL(kvm_emulate_halt);
   9096
   9097int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
   9098{
   9099	int ret = kvm_skip_emulated_instruction(vcpu);
   9100
   9101	return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
   9102					KVM_EXIT_AP_RESET_HOLD) && ret;
   9103}
   9104EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
   9105
   9106#ifdef CONFIG_X86_64
   9107static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
   9108			        unsigned long clock_type)
   9109{
   9110	struct kvm_clock_pairing clock_pairing;
   9111	struct timespec64 ts;
   9112	u64 cycle;
   9113	int ret;
   9114
   9115	if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
   9116		return -KVM_EOPNOTSUPP;
   9117
   9118	/*
    9119	 * When the TSC is in permanent catchup mode, guests won't be able to
    9120	 * use the pvclock_read_retry loop to get a consistent view of pvclock.
   9121	 */
   9122	if (vcpu->arch.tsc_always_catchup)
   9123		return -KVM_EOPNOTSUPP;
   9124
   9125	if (!kvm_get_walltime_and_clockread(&ts, &cycle))
   9126		return -KVM_EOPNOTSUPP;
   9127
   9128	clock_pairing.sec = ts.tv_sec;
   9129	clock_pairing.nsec = ts.tv_nsec;
   9130	clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
   9131	clock_pairing.flags = 0;
   9132	memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
   9133
   9134	ret = 0;
   9135	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
   9136			    sizeof(struct kvm_clock_pairing)))
   9137		ret = -KVM_EFAULT;
   9138
   9139	return ret;
   9140}
   9141#endif
   9142
   9143/*
   9144 * kvm_pv_kick_cpu_op:  Kick a vcpu.
   9145 *
   9146 * @apicid - apicid of vcpu to be kicked.
   9147 */
   9148static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
   9149{
   9150	struct kvm_lapic_irq lapic_irq;
   9151
   9152	lapic_irq.shorthand = APIC_DEST_NOSHORT;
   9153	lapic_irq.dest_mode = APIC_DEST_PHYSICAL;
   9154	lapic_irq.level = 0;
   9155	lapic_irq.dest_id = apicid;
   9156	lapic_irq.msi_redir_hint = false;
   9157
   9158	lapic_irq.delivery_mode = APIC_DM_REMRD;
   9159	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
   9160}
   9161
   9162bool kvm_apicv_activated(struct kvm *kvm)
   9163{
   9164	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
   9165}
   9166EXPORT_SYMBOL_GPL(kvm_apicv_activated);
   9167
   9168bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
   9169{
   9170	ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
   9171	ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
   9172
   9173	return (vm_reasons | vcpu_reasons) == 0;
   9174}
   9175EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
   9176
   9177static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
   9178				       enum kvm_apicv_inhibit reason, bool set)
   9179{
   9180	if (set)
   9181		__set_bit(reason, inhibits);
   9182	else
   9183		__clear_bit(reason, inhibits);
   9184
   9185	trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
   9186}
   9187
   9188static void kvm_apicv_init(struct kvm *kvm)
   9189{
   9190	unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
   9191
   9192	init_rwsem(&kvm->arch.apicv_update_lock);
   9193
   9194	set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
   9195
   9196	if (!enable_apicv)
   9197		set_or_clear_apicv_inhibit(inhibits,
   9198					   APICV_INHIBIT_REASON_DISABLE, true);
   9199}
   9200
   9201static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
   9202{
   9203	struct kvm_vcpu *target = NULL;
   9204	struct kvm_apic_map *map;
   9205
   9206	vcpu->stat.directed_yield_attempted++;
   9207
   9208	if (single_task_running())
   9209		goto no_yield;
   9210
   9211	rcu_read_lock();
   9212	map = rcu_dereference(vcpu->kvm->arch.apic_map);
   9213
   9214	if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
   9215		target = map->phys_map[dest_id]->vcpu;
   9216
   9217	rcu_read_unlock();
   9218
   9219	if (!target || !READ_ONCE(target->ready))
   9220		goto no_yield;
   9221
   9222	/* Ignore requests to yield to self */
   9223	if (vcpu == target)
   9224		goto no_yield;
   9225
   9226	if (kvm_vcpu_yield_to(target) <= 0)
   9227		goto no_yield;
   9228
   9229	vcpu->stat.directed_yield_successful++;
   9230
   9231no_yield:
   9232	return;
   9233}
   9234
   9235static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
   9236{
   9237	u64 ret = vcpu->run->hypercall.ret;
   9238
   9239	if (!is_64_bit_mode(vcpu))
   9240		ret = (u32)ret;
   9241	kvm_rax_write(vcpu, ret);
   9242	++vcpu->stat.hypercalls;
   9243	return kvm_skip_emulated_instruction(vcpu);
   9244}
   9245
   9246int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
   9247{
   9248	unsigned long nr, a0, a1, a2, a3, ret;
   9249	int op_64_bit;
   9250
   9251	if (kvm_xen_hypercall_enabled(vcpu->kvm))
   9252		return kvm_xen_hypercall(vcpu);
   9253
   9254	if (kvm_hv_hypercall_enabled(vcpu))
   9255		return kvm_hv_hypercall(vcpu);
   9256
   9257	nr = kvm_rax_read(vcpu);
   9258	a0 = kvm_rbx_read(vcpu);
   9259	a1 = kvm_rcx_read(vcpu);
   9260	a2 = kvm_rdx_read(vcpu);
   9261	a3 = kvm_rsi_read(vcpu);
   9262
   9263	trace_kvm_hypercall(nr, a0, a1, a2, a3);
   9264
   9265	op_64_bit = is_64_bit_hypercall(vcpu);
   9266	if (!op_64_bit) {
   9267		nr &= 0xFFFFFFFF;
   9268		a0 &= 0xFFFFFFFF;
   9269		a1 &= 0xFFFFFFFF;
   9270		a2 &= 0xFFFFFFFF;
   9271		a3 &= 0xFFFFFFFF;
   9272	}
   9273
   9274	// if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
   9275	// 	ret = -KVM_EPERM;
   9276	// 	goto out;
   9277	// }
   9278
   9279	ret = -KVM_ENOSYS;
   9280
   9281	switch (nr) {
   9282	case KVM_HC_VAPIC_POLL_IRQ:
   9283		ret = 0;
   9284		break;
   9285	case KVM_HC_KICK_CPU:
   9286		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
   9287			break;
   9288
   9289		kvm_pv_kick_cpu_op(vcpu->kvm, a1);
   9290		kvm_sched_yield(vcpu, a1);
   9291		ret = 0;
   9292		break;
   9293#ifdef CONFIG_X86_64
   9294	case KVM_HC_CLOCK_PAIRING:
   9295		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
   9296		break;
   9297#endif
   9298	case KVM_HC_SEND_IPI:
   9299		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
   9300			break;
   9301
   9302		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
   9303		break;
   9304	case KVM_HC_SCHED_YIELD:
   9305		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
   9306			break;
   9307
   9308		kvm_sched_yield(vcpu, a0);
   9309		ret = 0;
   9310		break;
   9311	case KVM_HC_MAP_GPA_RANGE: {
   9312		u64 gpa = a0, npages = a1, attrs = a2;
   9313
   9314		ret = -KVM_ENOSYS;
   9315		if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
   9316			break;
   9317
   9318		if (!PAGE_ALIGNED(gpa) || !npages ||
   9319		    gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
   9320			ret = -KVM_EINVAL;
   9321			break;
   9322		}
   9323
   9324		vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
   9325		vcpu->run->hypercall.nr       = KVM_HC_MAP_GPA_RANGE;
   9326		vcpu->run->hypercall.args[0]  = gpa;
   9327		vcpu->run->hypercall.args[1]  = npages;
   9328		vcpu->run->hypercall.args[2]  = attrs;
   9329		vcpu->run->hypercall.longmode = op_64_bit;
   9330		vcpu->arch.complete_userspace_io = complete_hypercall_exit;
   9331		return 0;
   9332	}
   9333	case KVM_HC_CPC_VMMCALL_SIGNAL:
   9334		CPC_DBG("SIGNAL VMMCALL %lu:%lu\n", a0, a1);
   9335		cpc_send_guest_event(a0, a1);
   9336		ret = 0;
   9337		break;
   9338	case KVM_HC_CPC_VMMCALL_EXIT:
   9339		CPC_DBG("EXIT VMMCALL %lu:%lu\n", a0, a1);
   9340		vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
   9341		vcpu->run->hypercall.nr       = KVM_HC_CPC_VMMCALL_EXIT;
   9342		vcpu->run->hypercall.args[0]  = a0;
   9343		vcpu->run->hypercall.args[1]  = a1;
   9344		vcpu->run->hypercall.args[2]  = 0;
   9345		vcpu->run->hypercall.longmode = op_64_bit;
   9346		vcpu->arch.complete_userspace_io = complete_hypercall_exit;
   9347		return 0;
   9348	default:
   9349		ret = -KVM_ENOSYS;
   9350		break;
   9351	}
   9352//out:
   9353	if (!op_64_bit)
   9354		ret = (u32)ret;
   9355	kvm_rax_write(vcpu, ret);
   9356
   9357	++vcpu->stat.hypercalls;
   9358	return kvm_skip_emulated_instruction(vcpu);
   9359}
   9360EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
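
/*
 * Illustrative guest-side sketch (editor's addition, not part of the
 * original source): how a guest could issue the CachePC "signal" hypercall
 * handled above.  kvm_emulate_hypercall() reads the hypercall number from
 * RAX and arguments a0..a3 from RBX, RCX, RDX and RSI, and writes the
 * return value back to RAX; "vmmcall" assumes an AMD guest.  The helper
 * name is made up and nothing in this file uses it.
 */
static inline unsigned long __maybe_unused
cpc_guest_signal_example(unsigned long id, unsigned long aux)
{
	unsigned long ret;

	/* nr in RAX, a0 in RBX, a1 in RCX; the result comes back in RAX. */
	asm volatile("vmmcall"
		     : "=a" (ret)
		     : "a" ((unsigned long)KVM_HC_CPC_VMMCALL_SIGNAL),
		       "b" (id), "c" (aux)
		     : "memory");
	return ret;
}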
   9361
   9362static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
   9363{
   9364	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
   9365	char instruction[3];
   9366	unsigned long rip = kvm_rip_read(vcpu);
   9367
   9368	/*
   9369	 * If the quirk is disabled, synthesize a #UD and let the guest pick up
   9370	 * the pieces.
   9371	 */
   9372	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
   9373		ctxt->exception.error_code_valid = false;
   9374		ctxt->exception.vector = UD_VECTOR;
   9375		ctxt->have_exception = true;
   9376		return X86EMUL_PROPAGATE_FAULT;
   9377	}
   9378
   9379	static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
   9380
   9381	return emulator_write_emulated(ctxt, rip, instruction, 3,
   9382		&ctxt->exception);
   9383}
   9384
   9385static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
   9386{
   9387	return vcpu->run->request_interrupt_window &&
   9388		likely(!pic_in_kernel(vcpu->kvm));
   9389}
   9390
   9391/* Called within kvm->srcu read side.  */
   9392static void post_kvm_run_save(struct kvm_vcpu *vcpu)
   9393{
   9394	struct kvm_run *kvm_run = vcpu->run;
   9395
   9396	kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
   9397	kvm_run->cr8 = kvm_get_cr8(vcpu);
   9398	kvm_run->apic_base = kvm_get_apic_base(vcpu);
   9399
   9400	kvm_run->ready_for_interrupt_injection =
   9401		pic_in_kernel(vcpu->kvm) ||
   9402		kvm_vcpu_ready_for_interrupt_injection(vcpu);
   9403
   9404	if (is_smm(vcpu))
   9405		kvm_run->flags |= KVM_RUN_X86_SMM;
   9406}
   9407
   9408static void update_cr8_intercept(struct kvm_vcpu *vcpu)
   9409{
   9410	int max_irr, tpr;
   9411
   9412	if (!kvm_x86_ops.update_cr8_intercept)
   9413		return;
   9414
   9415	if (!lapic_in_kernel(vcpu))
   9416		return;
   9417
   9418	if (vcpu->arch.apicv_active)
   9419		return;
   9420
   9421	if (!vcpu->arch.apic->vapic_addr)
   9422		max_irr = kvm_lapic_find_highest_irr(vcpu);
   9423	else
   9424		max_irr = -1;
   9425
   9426	if (max_irr != -1)
   9427		max_irr >>= 4;
   9428
   9429	tpr = kvm_lapic_get_cr8(vcpu);
   9430
   9431	static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
   9432}
   9433
   9434
   9435int kvm_check_nested_events(struct kvm_vcpu *vcpu)
   9436{
   9437	if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
   9438		kvm_x86_ops.nested_ops->triple_fault(vcpu);
   9439		return 1;
   9440	}
   9441
   9442	return kvm_x86_ops.nested_ops->check_events(vcpu);
   9443}
   9444
   9445static void kvm_inject_exception(struct kvm_vcpu *vcpu)
   9446{
   9447	if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
   9448		vcpu->arch.exception.error_code = false;
   9449	static_call(kvm_x86_queue_exception)(vcpu);
   9450}
   9451
   9452static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
   9453{
   9454	int r;
   9455	bool can_inject = true;
   9456
   9457	/* try to reinject previous events if any */
   9458
   9459	if (vcpu->arch.exception.injected) {
   9460		kvm_inject_exception(vcpu);
   9461		can_inject = false;
   9462	}
   9463	/*
   9464	 * Do not inject an NMI or interrupt if there is a pending
   9465	 * exception.  Exceptions and interrupts are recognized at
   9466	 * instruction boundaries, i.e. the start of an instruction.
   9467	 * Trap-like exceptions, e.g. #DB, have higher priority than
   9468	 * NMIs and interrupts, i.e. traps are recognized before an
   9469	 * NMI/interrupt that's pending on the same instruction.
   9470	 * Fault-like exceptions, e.g. #GP and #PF, are the lowest
   9471	 * priority, but are only generated (pended) during instruction
   9472	 * execution, i.e. a pending fault-like exception means the
   9473	 * fault occurred on the *previous* instruction and must be
   9474	 * serviced prior to recognizing any new events in order to
   9475	 * fully complete the previous instruction.
   9476	 */
   9477	else if (!vcpu->arch.exception.pending) {
   9478		if (vcpu->arch.nmi_injected) {
   9479			static_call(kvm_x86_inject_nmi)(vcpu);
   9480			can_inject = false;
   9481		} else if (vcpu->arch.interrupt.injected) {
   9482			static_call(kvm_x86_inject_irq)(vcpu);
   9483			can_inject = false;
   9484		}
   9485	}
   9486
   9487	WARN_ON_ONCE(vcpu->arch.exception.injected &&
   9488		     vcpu->arch.exception.pending);
   9489
   9490	/*
   9491	 * Call check_nested_events() even if we reinjected a previous event
    9492	 * in order for the caller to determine if it should require immediate-exit
   9493	 * from L2 to L1 due to pending L1 events which require exit
   9494	 * from L2 to L1.
   9495	 */
   9496	if (is_guest_mode(vcpu)) {
   9497		r = kvm_check_nested_events(vcpu);
   9498		if (r < 0)
   9499			goto out;
   9500	}
   9501
   9502	/* try to inject new event if pending */
   9503	if (vcpu->arch.exception.pending) {
   9504		trace_kvm_inj_exception(vcpu->arch.exception.nr,
   9505					vcpu->arch.exception.has_error_code,
   9506					vcpu->arch.exception.error_code);
   9507
   9508		vcpu->arch.exception.pending = false;
   9509		vcpu->arch.exception.injected = true;
   9510
   9511		if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
   9512			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
   9513					     X86_EFLAGS_RF);
   9514
   9515		if (vcpu->arch.exception.nr == DB_VECTOR) {
   9516			kvm_deliver_exception_payload(vcpu);
   9517			if (vcpu->arch.dr7 & DR7_GD) {
   9518				vcpu->arch.dr7 &= ~DR7_GD;
   9519				kvm_update_dr7(vcpu);
   9520			}
   9521		}
   9522
   9523		kvm_inject_exception(vcpu);
   9524		can_inject = false;
   9525	}
   9526
   9527	/* Don't inject interrupts if the user asked to avoid doing so */
   9528	if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
   9529		return 0;
   9530
   9531	/*
   9532	 * Finally, inject interrupt events.  If an event cannot be injected
   9533	 * due to architectural conditions (e.g. IF=0) a window-open exit
   9534	 * will re-request KVM_REQ_EVENT.  Sometimes however an event is pending
   9535	 * and can architecturally be injected, but we cannot do it right now:
   9536	 * an interrupt could have arrived just now and we have to inject it
    9537	 * as a vmexit, or there could already be an event in the queue, which is
   9538	 * indicated by can_inject.  In that case we request an immediate exit
   9539	 * in order to make progress and get back here for another iteration.
   9540	 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
   9541	 */
   9542	if (vcpu->arch.smi_pending) {
   9543		r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
   9544		if (r < 0)
   9545			goto out;
   9546		if (r) {
   9547			vcpu->arch.smi_pending = false;
   9548			++vcpu->arch.smi_count;
   9549			enter_smm(vcpu);
   9550			can_inject = false;
   9551		} else
   9552			static_call(kvm_x86_enable_smi_window)(vcpu);
   9553	}
   9554
   9555	if (vcpu->arch.nmi_pending) {
   9556		r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
   9557		if (r < 0)
   9558			goto out;
   9559		if (r) {
   9560			--vcpu->arch.nmi_pending;
   9561			vcpu->arch.nmi_injected = true;
   9562			static_call(kvm_x86_inject_nmi)(vcpu);
   9563			can_inject = false;
   9564			WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
   9565		}
   9566		if (vcpu->arch.nmi_pending)
   9567			static_call(kvm_x86_enable_nmi_window)(vcpu);
   9568	}
   9569
   9570	if (kvm_cpu_has_injectable_intr(vcpu)) {
   9571		r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
   9572		if (r < 0)
   9573			goto out;
   9574		if (r) {
   9575			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
   9576			static_call(kvm_x86_inject_irq)(vcpu);
   9577			WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
   9578		}
   9579		if (kvm_cpu_has_injectable_intr(vcpu))
   9580			static_call(kvm_x86_enable_irq_window)(vcpu);
   9581	}
   9582
   9583	if (is_guest_mode(vcpu) &&
   9584	    kvm_x86_ops.nested_ops->hv_timer_pending &&
   9585	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
   9586		*req_immediate_exit = true;
   9587
   9588	WARN_ON(vcpu->arch.exception.pending);
   9589	return 0;
   9590
   9591out:
   9592	if (r == -EBUSY) {
   9593		*req_immediate_exit = true;
   9594		r = 0;
   9595	}
   9596	return r;
   9597}
   9598
   9599static void process_nmi(struct kvm_vcpu *vcpu)
   9600{
   9601	unsigned limit = 2;
   9602
   9603	/*
   9604	 * x86 is limited to one NMI running, and one NMI pending after it.
   9605	 * If an NMI is already in progress, limit further NMIs to just one.
   9606	 * Otherwise, allow two (and we'll inject the first one immediately).
   9607	 */
   9608	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
   9609		limit = 1;
   9610
   9611	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
   9612	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
   9613	kvm_make_request(KVM_REQ_EVENT, vcpu);
   9614}
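
/*
 * Editor's note (not part of the original source): with the cap above,
 * e.g. three NMIs queued via nmi_queued while none is in service collapse
 * to nmi_pending == 2 (one injected immediately, one left pending), while
 * NMIs queued during an in-progress NMI collapse to a single pending NMI.
 */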
   9615
   9616static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
   9617{
   9618	u32 flags = 0;
   9619	flags |= seg->g       << 23;
   9620	flags |= seg->db      << 22;
   9621	flags |= seg->l       << 21;
   9622	flags |= seg->avl     << 20;
   9623	flags |= seg->present << 15;
   9624	flags |= seg->dpl     << 13;
   9625	flags |= seg->s       << 12;
   9626	flags |= seg->type    << 8;
   9627	return flags;
   9628}
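
/*
 * Worked example (editor's addition, not part of the original source): a
 * flat 32-bit code segment with type = 0xb, s = 1, dpl = 0, present = 1,
 * avl = 0, l = 0, db = 1 and g = 1 packs to 0x00c09b00, i.e. the 0x9b
 * access byte lands in bits 8-15 and the 0xc granularity/size nibble in
 * bits 20-23 of the SMRAM segment-flags word.
 */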
   9629
   9630static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
   9631{
   9632	struct kvm_segment seg;
   9633	int offset;
   9634
   9635	kvm_get_segment(vcpu, &seg, n);
   9636	put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
   9637
   9638	if (n < 3)
   9639		offset = 0x7f84 + n * 12;
   9640	else
   9641		offset = 0x7f2c + (n - 3) * 12;
   9642
   9643	put_smstate(u32, buf, offset + 8, seg.base);
   9644	put_smstate(u32, buf, offset + 4, seg.limit);
   9645	put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
   9646}
   9647
   9648#ifdef CONFIG_X86_64
   9649static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
   9650{
   9651	struct kvm_segment seg;
   9652	int offset;
   9653	u16 flags;
   9654
   9655	kvm_get_segment(vcpu, &seg, n);
   9656	offset = 0x7e00 + n * 16;
   9657
   9658	flags = enter_smm_get_segment_flags(&seg) >> 8;
   9659	put_smstate(u16, buf, offset, seg.selector);
   9660	put_smstate(u16, buf, offset + 2, flags);
   9661	put_smstate(u32, buf, offset + 4, seg.limit);
   9662	put_smstate(u64, buf, offset + 8, seg.base);
   9663}
   9664#endif
   9665
   9666static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
   9667{
   9668	struct desc_ptr dt;
   9669	struct kvm_segment seg;
   9670	unsigned long val;
   9671	int i;
   9672
   9673	put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
   9674	put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
   9675	put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
   9676	put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
   9677
   9678	for (i = 0; i < 8; i++)
   9679		put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i));
   9680
   9681	kvm_get_dr(vcpu, 6, &val);
   9682	put_smstate(u32, buf, 0x7fcc, (u32)val);
   9683	kvm_get_dr(vcpu, 7, &val);
   9684	put_smstate(u32, buf, 0x7fc8, (u32)val);
   9685
   9686	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
   9687	put_smstate(u32, buf, 0x7fc4, seg.selector);
   9688	put_smstate(u32, buf, 0x7f64, seg.base);
   9689	put_smstate(u32, buf, 0x7f60, seg.limit);
   9690	put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
   9691
   9692	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
   9693	put_smstate(u32, buf, 0x7fc0, seg.selector);
   9694	put_smstate(u32, buf, 0x7f80, seg.base);
   9695	put_smstate(u32, buf, 0x7f7c, seg.limit);
   9696	put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
   9697
   9698	static_call(kvm_x86_get_gdt)(vcpu, &dt);
   9699	put_smstate(u32, buf, 0x7f74, dt.address);
   9700	put_smstate(u32, buf, 0x7f70, dt.size);
   9701
   9702	static_call(kvm_x86_get_idt)(vcpu, &dt);
   9703	put_smstate(u32, buf, 0x7f58, dt.address);
   9704	put_smstate(u32, buf, 0x7f54, dt.size);
   9705
   9706	for (i = 0; i < 6; i++)
   9707		enter_smm_save_seg_32(vcpu, buf, i);
   9708
   9709	put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
   9710
   9711	/* revision id */
   9712	put_smstate(u32, buf, 0x7efc, 0x00020000);
   9713	put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
   9714}
   9715
   9716#ifdef CONFIG_X86_64
   9717static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
   9718{
   9719	struct desc_ptr dt;
   9720	struct kvm_segment seg;
   9721	unsigned long val;
   9722	int i;
   9723
   9724	for (i = 0; i < 16; i++)
   9725		put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i));
   9726
   9727	put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
   9728	put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
   9729
   9730	kvm_get_dr(vcpu, 6, &val);
   9731	put_smstate(u64, buf, 0x7f68, val);
   9732	kvm_get_dr(vcpu, 7, &val);
   9733	put_smstate(u64, buf, 0x7f60, val);
   9734
   9735	put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
   9736	put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
   9737	put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
   9738
   9739	put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
   9740
   9741	/* revision id */
   9742	put_smstate(u32, buf, 0x7efc, 0x00020064);
   9743
   9744	put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
   9745
   9746	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
   9747	put_smstate(u16, buf, 0x7e90, seg.selector);
   9748	put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
   9749	put_smstate(u32, buf, 0x7e94, seg.limit);
   9750	put_smstate(u64, buf, 0x7e98, seg.base);
   9751
   9752	static_call(kvm_x86_get_idt)(vcpu, &dt);
   9753	put_smstate(u32, buf, 0x7e84, dt.size);
   9754	put_smstate(u64, buf, 0x7e88, dt.address);
   9755
   9756	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
   9757	put_smstate(u16, buf, 0x7e70, seg.selector);
   9758	put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
   9759	put_smstate(u32, buf, 0x7e74, seg.limit);
   9760	put_smstate(u64, buf, 0x7e78, seg.base);
   9761
   9762	static_call(kvm_x86_get_gdt)(vcpu, &dt);
   9763	put_smstate(u32, buf, 0x7e64, dt.size);
   9764	put_smstate(u64, buf, 0x7e68, dt.address);
   9765
   9766	for (i = 0; i < 6; i++)
   9767		enter_smm_save_seg_64(vcpu, buf, i);
   9768}
   9769#endif
   9770
   9771static void enter_smm(struct kvm_vcpu *vcpu)
   9772{
   9773	struct kvm_segment cs, ds;
   9774	struct desc_ptr dt;
   9775	unsigned long cr0;
   9776	char buf[512];
   9777
   9778	memset(buf, 0, 512);
   9779#ifdef CONFIG_X86_64
   9780	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
   9781		enter_smm_save_state_64(vcpu, buf);
   9782	else
   9783#endif
   9784		enter_smm_save_state_32(vcpu, buf);
   9785
   9786	/*
    9787	 * Give the vendor's .enter_smm() callback a chance to make
    9788	 * ISA-specific changes to the vCPU state (e.g. leave guest mode)
    9789	 * after we've saved the state into the SMM state-save area.
   9790	 */
   9791	static_call(kvm_x86_enter_smm)(vcpu, buf);
   9792
   9793	kvm_smm_changed(vcpu, true);
   9794	kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
   9795
   9796	if (static_call(kvm_x86_get_nmi_mask)(vcpu))
   9797		vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
   9798	else
   9799		static_call(kvm_x86_set_nmi_mask)(vcpu, true);
   9800
   9801	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
   9802	kvm_rip_write(vcpu, 0x8000);
   9803
   9804	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
   9805	static_call(kvm_x86_set_cr0)(vcpu, cr0);
   9806	vcpu->arch.cr0 = cr0;
   9807
   9808	static_call(kvm_x86_set_cr4)(vcpu, 0);
   9809
   9810	/* Undocumented: IDT limit is set to zero on entry to SMM.  */
   9811	dt.address = dt.size = 0;
   9812	static_call(kvm_x86_set_idt)(vcpu, &dt);
   9813
   9814	kvm_set_dr(vcpu, 7, DR7_FIXED_1);
   9815
   9816	cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
   9817	cs.base = vcpu->arch.smbase;
   9818
   9819	ds.selector = 0;
   9820	ds.base = 0;
   9821
   9822	cs.limit    = ds.limit = 0xffffffff;
   9823	cs.type     = ds.type = 0x3;
   9824	cs.dpl      = ds.dpl = 0;
   9825	cs.db       = ds.db = 0;
   9826	cs.s        = ds.s = 1;
   9827	cs.l        = ds.l = 0;
   9828	cs.g        = ds.g = 1;
   9829	cs.avl      = ds.avl = 0;
   9830	cs.present  = ds.present = 1;
   9831	cs.unusable = ds.unusable = 0;
   9832	cs.padding  = ds.padding = 0;
   9833
   9834	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
   9835	kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
   9836	kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
   9837	kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
   9838	kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
   9839	kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
   9840
   9841#ifdef CONFIG_X86_64
   9842	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
   9843		static_call(kvm_x86_set_efer)(vcpu, 0);
   9844#endif
   9845
   9846	kvm_update_cpuid_runtime(vcpu);
   9847	kvm_mmu_reset_context(vcpu);
   9848}
   9849
   9850static void process_smi(struct kvm_vcpu *vcpu)
   9851{
   9852	vcpu->arch.smi_pending = true;
   9853	kvm_make_request(KVM_REQ_EVENT, vcpu);
   9854}
   9855
   9856void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
   9857				       unsigned long *vcpu_bitmap)
   9858{
   9859	kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
   9860}
   9861
   9862void kvm_make_scan_ioapic_request(struct kvm *kvm)
   9863{
   9864	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
   9865}
   9866
   9867void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
   9868{
   9869	bool activate;
   9870
   9871	if (!lapic_in_kernel(vcpu))
   9872		return;
   9873
   9874	down_read(&vcpu->kvm->arch.apicv_update_lock);
   9875	preempt_disable();
   9876
   9877	activate = kvm_vcpu_apicv_activated(vcpu);
   9878
   9879	if (vcpu->arch.apicv_active == activate)
   9880		goto out;
   9881
   9882	vcpu->arch.apicv_active = activate;
   9883	kvm_apic_update_apicv(vcpu);
   9884	static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
   9885
   9886	/*
   9887	 * When APICv gets disabled, we may still have injected interrupts
   9888	 * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
   9889	 * still active when the interrupt got accepted. Make sure
   9890	 * inject_pending_event() is called to check for that.
   9891	 */
   9892	if (!vcpu->arch.apicv_active)
   9893		kvm_make_request(KVM_REQ_EVENT, vcpu);
   9894
   9895out:
   9896	preempt_enable();
   9897	up_read(&vcpu->kvm->arch.apicv_update_lock);
   9898}
   9899EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
   9900
   9901void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
   9902				      enum kvm_apicv_inhibit reason, bool set)
   9903{
   9904	unsigned long old, new;
   9905
   9906	lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
   9907
   9908	if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
   9909		return;
   9910
   9911	old = new = kvm->arch.apicv_inhibit_reasons;
   9912
   9913	set_or_clear_apicv_inhibit(&new, reason, set);
   9914
   9915	if (!!old != !!new) {
   9916		/*
   9917		 * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
   9918		 * false positives in the sanity check WARN in svm_vcpu_run().
   9919		 * This task will wait for all vCPUs to ack the kick IRQ before
   9920		 * updating apicv_inhibit_reasons, and all other vCPUs will
   9921		 * block on acquiring apicv_update_lock so that vCPUs can't
   9922		 * redo svm_vcpu_run() without seeing the new inhibit state.
   9923		 *
   9924		 * Note, holding apicv_update_lock and taking it in the read
   9925		 * side (handling the request) also prevents other vCPUs from
   9926		 * servicing the request with a stale apicv_inhibit_reasons.
   9927		 */
   9928		kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
   9929		kvm->arch.apicv_inhibit_reasons = new;
   9930		if (new) {
   9931			unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
   9932			kvm_zap_gfn_range(kvm, gfn, gfn+1);
   9933		}
   9934	} else {
   9935		kvm->arch.apicv_inhibit_reasons = new;
   9936	}
   9937}
   9938
   9939void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
   9940				    enum kvm_apicv_inhibit reason, bool set)
   9941{
   9942	if (!enable_apicv)
   9943		return;
   9944
   9945	down_write(&kvm->arch.apicv_update_lock);
   9946	__kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
   9947	up_write(&kvm->arch.apicv_update_lock);
   9948}
   9949EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
   9950
   9951static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
   9952{
   9953	if (!kvm_apic_present(vcpu))
   9954		return;
   9955
   9956	bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
   9957
   9958	if (irqchip_split(vcpu->kvm))
   9959		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
   9960	else {
   9961		static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
   9962		if (ioapic_in_kernel(vcpu->kvm))
   9963			kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
   9964	}
   9965
   9966	if (is_guest_mode(vcpu))
   9967		vcpu->arch.load_eoi_exitmap_pending = true;
   9968	else
   9969		kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
   9970}
   9971
   9972static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
   9973{
   9974	u64 eoi_exit_bitmap[4];
   9975
   9976	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
   9977		return;
   9978
   9979	if (to_hv_vcpu(vcpu)) {
   9980		bitmap_or((ulong *)eoi_exit_bitmap,
   9981			  vcpu->arch.ioapic_handled_vectors,
   9982			  to_hv_synic(vcpu)->vec_bitmap, 256);
   9983		static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
   9984		return;
   9985	}
   9986
   9987	static_call_cond(kvm_x86_load_eoi_exitmap)(
   9988		vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
   9989}
   9990
   9991void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
   9992					    unsigned long start, unsigned long end)
   9993{
   9994	unsigned long apic_address;
   9995
   9996	/*
    9997	 * The physical address of the APIC access page is stored in the VMCS.
   9998	 * Update it when it becomes invalid.
   9999	 */
  10000	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
  10001	if (start <= apic_address && apic_address < end)
  10002		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
  10003}
  10004
  10005static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
  10006{
  10007	if (!lapic_in_kernel(vcpu))
  10008		return;
  10009
  10010	static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
  10011}
  10012
  10013void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
  10014{
  10015	smp_send_reschedule(vcpu->cpu);
  10016}
  10017EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
  10018
  10019/*
  10020 * Called within kvm->srcu read side.
  10021 * Returns 1 to let vcpu_run() continue the guest execution loop without
  10022 * exiting to the userspace.  Otherwise, the value will be returned to the
   10023 * exiting to userspace.  Otherwise, the value will be returned to the
  10024 */
  10025static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
  10026{
  10027	int r;
  10028	bool req_int_win =
  10029		dm_request_for_irq_injection(vcpu) &&
  10030		kvm_cpu_accept_dm_intr(vcpu);
  10031	fastpath_t exit_fastpath;
  10032
  10033	bool req_immediate_exit = false;
  10034
  10035	/* Forbid vmenter if vcpu dirty ring is soft-full */
  10036	if (unlikely(vcpu->kvm->dirty_ring_size &&
  10037		     kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
  10038		vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
  10039		trace_kvm_dirty_ring_exit(vcpu);
  10040		r = 0;
  10041		goto out;
  10042	}
  10043
  10044	if (kvm_request_pending(vcpu)) {
  10045		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
  10046			r = -EIO;
  10047			goto out;
  10048		}
  10049		if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
  10050			if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
  10051				r = 0;
  10052				goto out;
  10053			}
  10054		}
  10055		if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
  10056			kvm_mmu_free_obsolete_roots(vcpu);
  10057		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
  10058			__kvm_migrate_timers(vcpu);
  10059		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
  10060			kvm_update_masterclock(vcpu->kvm);
  10061		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
  10062			kvm_gen_kvmclock_update(vcpu);
  10063		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
  10064			r = kvm_guest_time_update(vcpu);
  10065			if (unlikely(r))
  10066				goto out;
  10067		}
  10068		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
  10069			kvm_mmu_sync_roots(vcpu);
  10070		if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
  10071			kvm_mmu_load_pgd(vcpu);
  10072		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
  10073			kvm_vcpu_flush_tlb_all(vcpu);
  10074
  10075			/* Flushing all ASIDs flushes the current ASID... */
  10076			kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
  10077		}
  10078		kvm_service_local_tlb_flush_requests(vcpu);
  10079
  10080		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
  10081			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
  10082			r = 0;
  10083			goto out;
  10084		}
  10085		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
  10086			if (is_guest_mode(vcpu)) {
  10087				kvm_x86_ops.nested_ops->triple_fault(vcpu);
  10088			} else {
  10089				vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
  10090				vcpu->mmio_needed = 0;
  10091				r = 0;
  10092				goto out;
  10093			}
  10094		}
  10095		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
  10096			/* Page is swapped out. Do synthetic halt */
  10097			vcpu->arch.apf.halted = true;
  10098			r = 1;
  10099			goto out;
  10100		}
  10101		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
  10102			record_steal_time(vcpu);
  10103		if (kvm_check_request(KVM_REQ_SMI, vcpu))
  10104			process_smi(vcpu);
  10105		if (kvm_check_request(KVM_REQ_NMI, vcpu))
  10106			process_nmi(vcpu);
  10107		if (kvm_check_request(KVM_REQ_PMU, vcpu))
  10108			kvm_pmu_handle_event(vcpu);
  10109		if (kvm_check_request(KVM_REQ_PMI, vcpu))
  10110			kvm_pmu_deliver_pmi(vcpu);
  10111		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
  10112			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
  10113			if (test_bit(vcpu->arch.pending_ioapic_eoi,
  10114				     vcpu->arch.ioapic_handled_vectors)) {
  10115				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
  10116				vcpu->run->eoi.vector =
  10117						vcpu->arch.pending_ioapic_eoi;
  10118				r = 0;
  10119				goto out;
  10120			}
  10121		}
  10122		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
  10123			vcpu_scan_ioapic(vcpu);
  10124		if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
  10125			vcpu_load_eoi_exitmap(vcpu);
  10126		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
  10127			kvm_vcpu_reload_apic_access_page(vcpu);
  10128		if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
  10129			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
  10130			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
  10131			vcpu->run->system_event.ndata = 0;
  10132			r = 0;
  10133			goto out;
  10134		}
  10135		if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
  10136			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
  10137			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
  10138			vcpu->run->system_event.ndata = 0;
  10139			r = 0;
  10140			goto out;
  10141		}
  10142		if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
  10143			struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
  10144
  10145			vcpu->run->exit_reason = KVM_EXIT_HYPERV;
  10146			vcpu->run->hyperv = hv_vcpu->exit;
  10147			r = 0;
  10148			goto out;
  10149		}
  10150
  10151		/*
  10152		 * KVM_REQ_HV_STIMER has to be processed after
  10153		 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
  10154		 * depend on the guest clock being up-to-date
  10155		 */
  10156		if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
  10157			kvm_hv_process_stimers(vcpu);
  10158		if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
  10159			kvm_vcpu_update_apicv(vcpu);
  10160		if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
  10161			kvm_check_async_pf_completion(vcpu);
  10162		if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
  10163			static_call(kvm_x86_msr_filter_changed)(vcpu);
  10164
  10165		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
  10166			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
  10167
  10168		if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) {
  10169			kvm_vcpu_reset(vcpu, true);
  10170			if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
  10171				goto out;
  10172		}
  10173	}
  10174
  10175	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
  10176	    kvm_xen_has_interrupt(vcpu)) {
  10177		++vcpu->stat.req_event;
  10178		r = kvm_apic_accept_events(vcpu);
  10179		if (r < 0) {
  10180			r = 0;
  10181			goto out;
  10182		}
  10183		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
  10184			r = 1;
  10185			goto out;
  10186		}
  10187
  10188		r = inject_pending_event(vcpu, &req_immediate_exit);
  10189		if (r < 0) {
  10190			r = 0;
  10191			goto out;
  10192		}
  10193		if (req_int_win)
  10194			static_call(kvm_x86_enable_irq_window)(vcpu);
  10195
  10196		if (kvm_lapic_enabled(vcpu)) {
  10197			update_cr8_intercept(vcpu);
  10198			kvm_lapic_sync_to_vapic(vcpu);
  10199		}
  10200	}
  10201
  10202	r = kvm_mmu_reload(vcpu);
  10203	if (unlikely(r)) {
  10204		goto cancel_injection;
  10205	}
  10206
  10207	preempt_disable();
  10208
  10209	static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
  10210
  10211	/*
  10212	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
  10213	 * IPIs are then delayed after guest entry, which ensures that they
  10214	 * result in virtual interrupt delivery.
  10215	 */
  10216	local_irq_disable();
  10217
  10218	/* Store vcpu->apicv_active before vcpu->mode.  */
  10219	smp_store_release(&vcpu->mode, IN_GUEST_MODE);
  10220
  10221	kvm_vcpu_srcu_read_unlock(vcpu);
  10222
  10223	/*
  10224	 * 1) We should set ->mode before checking ->requests.  Please see
  10225	 * the comment in kvm_vcpu_exiting_guest_mode().
  10226	 *
  10227	 * 2) For APICv, we should set ->mode before checking PID.ON. This
  10228	 * pairs with the memory barrier implicit in pi_test_and_set_on
  10229	 * (see vmx_deliver_posted_interrupt).
  10230	 *
  10231	 * 3) This also orders the write to mode from any reads to the page
  10232	 * tables done while the VCPU is running.  Please see the comment
  10233	 * in kvm_flush_remote_tlbs.
  10234	 */
  10235	smp_mb__after_srcu_read_unlock();
  10236
  10237	/*
  10238	 * Process pending posted interrupts to handle the case where the
  10239	 * notification IRQ arrived in the host, or was never sent (because the
  10240	 * target vCPU wasn't running).  Do this regardless of the vCPU's APICv
  10241	 * status, KVM doesn't update assigned devices when APICv is inhibited,
  10242	 * i.e. they can post interrupts even if APICv is temporarily disabled.
  10243	 */
  10244	if (kvm_lapic_enabled(vcpu))
  10245		static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
  10246
  10247	if (kvm_vcpu_exit_request(vcpu)) {
  10248		vcpu->mode = OUTSIDE_GUEST_MODE;
  10249		smp_wmb();
  10250		local_irq_enable();
  10251		preempt_enable();
  10252		kvm_vcpu_srcu_read_lock(vcpu);
  10253		r = 1;
  10254		goto cancel_injection;
  10255	}
  10256
  10257	if (req_immediate_exit) {
  10258		kvm_make_request(KVM_REQ_EVENT, vcpu);
  10259		static_call(kvm_x86_request_immediate_exit)(vcpu);
  10260	}
  10261
  10262	fpregs_assert_state_consistent();
  10263	if (test_thread_flag(TIF_NEED_FPU_LOAD))
  10264		switch_fpu_return();
  10265
  10266	if (vcpu->arch.guest_fpu.xfd_err)
  10267		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
  10268
  10269	if (unlikely(vcpu->arch.switch_db_regs)) {
  10270		set_debugreg(0, 7);
  10271		set_debugreg(vcpu->arch.eff_db[0], 0);
  10272		set_debugreg(vcpu->arch.eff_db[1], 1);
  10273		set_debugreg(vcpu->arch.eff_db[2], 2);
  10274		set_debugreg(vcpu->arch.eff_db[3], 3);
  10275	} else if (unlikely(hw_breakpoint_active())) {
  10276		set_debugreg(0, 7);
  10277	}
  10278
  10279	guest_timing_enter_irqoff();
  10280
  10281	for (;;) {
  10282		/*
  10283		 * Assert that vCPU vs. VM APICv state is consistent.  An APICv
  10284		 * update must kick and wait for all vCPUs before toggling the
  10285		 * per-VM state, and responding vCPUs must wait for the update
  10286		 * to complete before servicing KVM_REQ_APICV_UPDATE.
  10287		 */
  10288		WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
  10289
  10290		exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
  10291		if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
  10292			break;
  10293
  10294		if (kvm_lapic_enabled(vcpu))
  10295			static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
  10296
  10297		if (unlikely(kvm_vcpu_exit_request(vcpu))) {
  10298			exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
  10299			break;
  10300		}
  10301	}
  10302
  10303	/*
  10304	 * Do this here before restoring debug registers on the host.  And
  10305	 * since we do this before handling the vmexit, a DR access vmexit
  10306	 * can (a) read the correct value of the debug registers, (b) set
  10307	 * KVM_DEBUGREG_WONT_EXIT again.
  10308	 */
  10309	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
  10310		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
  10311		static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
  10312		kvm_update_dr0123(vcpu);
  10313		kvm_update_dr7(vcpu);
  10314	}
  10315
  10316	/*
  10317	 * If the guest has used debug registers, at least dr7
  10318	 * will be disabled while returning to the host.
  10319	 * If we don't have active breakpoints in the host, we don't
  10320	 * care about the messed up debug address registers. But if
  10321	 * we have some of them active, restore the old state.
  10322	 */
  10323	if (hw_breakpoint_active())
  10324		hw_breakpoint_restore();
  10325
  10326	vcpu->arch.last_vmentry_cpu = vcpu->cpu;
  10327	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
  10328
  10329	vcpu->mode = OUTSIDE_GUEST_MODE;
  10330	smp_wmb();
  10331
  10332	/*
  10333	 * Sync xfd before calling handle_exit_irqoff() which may
  10334	 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
  10335	 * in #NM irqoff handler).
  10336	 */
  10337	if (vcpu->arch.xfd_no_write_intercept)
  10338		fpu_sync_guest_vmexit_xfd_state();
  10339
  10340	static_call(kvm_x86_handle_exit_irqoff)(vcpu);
  10341
  10342	if (vcpu->arch.guest_fpu.xfd_err)
  10343		wrmsrl(MSR_IA32_XFD_ERR, 0);
  10344
  10345	/*
  10346	 * Consume any pending interrupts, including the possible source of
  10347	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
  10348	 * An instruction is required after local_irq_enable() to fully unblock
  10349	 * interrupts on processors that implement an interrupt shadow; the
  10350	 * stat.exits increment will do nicely.
  10351	 */
  10352	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
  10353	local_irq_enable();
  10354	++vcpu->stat.exits;
  10355	local_irq_disable();
  10356	kvm_after_interrupt(vcpu);
  10357
  10358	/*
  10359	 * Wait until after servicing IRQs to account guest time so that any
  10360	 * ticks that occurred while running the guest are properly accounted
  10361	 * to the guest.  Waiting until IRQs are enabled degrades the accuracy
  10362	 * of accounting via context tracking, but the loss of accuracy is
  10363	 * acceptable for all known use cases.
  10364	 */
  10365	guest_timing_exit_irqoff();
  10366
  10367	local_irq_enable();
  10368	preempt_enable();
  10369
  10370	kvm_vcpu_srcu_read_lock(vcpu);
  10371
  10372	/*
  10373	 * Profile KVM exit RIPs:
  10374	 */
  10375	if (unlikely(prof_on == KVM_PROFILING)) {
  10376		unsigned long rip = kvm_rip_read(vcpu);
  10377		profile_hit(KVM_PROFILING, (void *)rip);
  10378	}
  10379
  10380	if (unlikely(vcpu->arch.tsc_always_catchup))
  10381		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
  10382
  10383	if (vcpu->arch.apic_attention)
  10384		kvm_lapic_sync_from_vapic(vcpu);
  10385
  10386	r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
  10387	return r;
  10388
  10389cancel_injection:
  10390	if (req_immediate_exit)
  10391		kvm_make_request(KVM_REQ_EVENT, vcpu);
  10392	static_call(kvm_x86_cancel_injection)(vcpu);
  10393	if (unlikely(vcpu->arch.apic_attention))
  10394		kvm_lapic_sync_from_vapic(vcpu);
  10395out:
  10396	return r;
  10397}
  10398
  10399/* Called within kvm->srcu read side.  */
  10400static inline int vcpu_block(struct kvm_vcpu *vcpu)
  10401{
  10402	bool hv_timer;
  10403
  10404	if (!kvm_arch_vcpu_runnable(vcpu)) {
  10405		/*
  10406		 * Switch to the software timer before halt-polling/blocking as
  10407		 * the guest's timer may be a break event for the vCPU, and the
  10408		 * hypervisor timer runs only when the CPU is in guest mode.
  10409		 * Switch before halt-polling so that KVM recognizes an expired
  10410		 * timer before blocking.
  10411		 */
  10412		hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
  10413		if (hv_timer)
  10414			kvm_lapic_switch_to_sw_timer(vcpu);
  10415
  10416		kvm_vcpu_srcu_read_unlock(vcpu);
  10417		if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
  10418			kvm_vcpu_halt(vcpu);
  10419		else
  10420			kvm_vcpu_block(vcpu);
  10421		kvm_vcpu_srcu_read_lock(vcpu);
  10422
  10423		if (hv_timer)
  10424			kvm_lapic_switch_to_hv_timer(vcpu);
  10425
  10426		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
  10427			return 1;
  10428	}
  10429
  10430	if (kvm_apic_accept_events(vcpu) < 0)
  10431		return 0;
  10432	switch(vcpu->arch.mp_state) {
  10433	case KVM_MP_STATE_HALTED:
  10434	case KVM_MP_STATE_AP_RESET_HOLD:
  10435		vcpu->arch.pv.pv_unhalted = false;
  10436		vcpu->arch.mp_state =
  10437			KVM_MP_STATE_RUNNABLE;
  10438		fallthrough;
  10439	case KVM_MP_STATE_RUNNABLE:
  10440		vcpu->arch.apf.halted = false;
  10441		break;
  10442	case KVM_MP_STATE_INIT_RECEIVED:
  10443		break;
  10444	default:
  10445		return -EINTR;
  10446	}
  10447	return 1;
  10448}
  10449
  10450static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
  10451{
  10452	if (is_guest_mode(vcpu))
  10453		kvm_check_nested_events(vcpu);
  10454
  10455	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
  10456		!vcpu->arch.apf.halted);
  10457}
  10458
  10459/* Called within kvm->srcu read side.  */
  10460static int vcpu_run(struct kvm_vcpu *vcpu)
  10461{
  10462	int r;
  10463
  10464	vcpu->arch.l1tf_flush_l1d = true;
  10465
  10466	for (;;) {
  10467		/*
  10468		 * If another guest vCPU requests a PV TLB flush in the middle
  10469		 * of instruction emulation, the rest of the emulation could
  10470		 * use a stale page translation. Assume that any code after
  10471		 * this point can start executing an instruction.
  10472		 */
  10473		vcpu->arch.at_instruction_boundary = false;
  10474		if (kvm_vcpu_running(vcpu)) {
  10475			r = vcpu_enter_guest(vcpu);
  10476		} else {
  10477			r = vcpu_block(vcpu);
  10478		}
  10479
  10480		if (r <= 0)
  10481			break;
  10482
  10483		kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
  10484		if (kvm_xen_has_pending_events(vcpu))
  10485			kvm_xen_inject_pending_events(vcpu);
  10486
  10487		if (kvm_cpu_has_pending_timer(vcpu))
  10488			kvm_inject_pending_timer_irqs(vcpu);
  10489
  10490		if (dm_request_for_irq_injection(vcpu) &&
  10491			kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
  10492			r = 0;
  10493			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
  10494			++vcpu->stat.request_irq_exits;
  10495			break;
  10496		}
  10497
  10498		if (__xfer_to_guest_mode_work_pending()) {
  10499			kvm_vcpu_srcu_read_unlock(vcpu);
  10500			r = xfer_to_guest_mode_handle_work(vcpu);
  10501			kvm_vcpu_srcu_read_lock(vcpu);
  10502			if (r)
  10503				return r;
  10504		}
  10505	}
  10506
  10507	return r;
  10508}
  10509
  10510static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
  10511{
  10512	return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
  10513}
  10514
  10515static int complete_emulated_pio(struct kvm_vcpu *vcpu)
  10516{
  10517	BUG_ON(!vcpu->arch.pio.count);
  10518
  10519	return complete_emulated_io(vcpu);
  10520}
  10521
  10522/*
  10523 * Implements the following, as a state machine:
  10524 *
  10525 * read:
  10526 *   for each fragment
  10527 *     for each mmio piece in the fragment
  10528 *       write gpa, len
  10529 *       exit
  10530 *       copy data
  10531 *   execute insn
  10532 *
  10533 * write:
  10534 *   for each fragment
  10535 *     for each mmio piece in the fragment
  10536 *       write gpa, len
  10537 *       copy data
  10538 *       exit
  10539 */
  10540static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
  10541{
  10542	struct kvm_run *run = vcpu->run;
  10543	struct kvm_mmio_fragment *frag;
  10544	unsigned len;
  10545
  10546	BUG_ON(!vcpu->mmio_needed);
  10547
  10548	/* Complete previous fragment */
  10549	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
  10550	len = min(8u, frag->len);
  10551	if (!vcpu->mmio_is_write)
  10552		memcpy(frag->data, run->mmio.data, len);
  10553
  10554	if (frag->len <= 8) {
  10555		/* Switch to the next fragment. */
  10556		frag++;
  10557		vcpu->mmio_cur_fragment++;
  10558	} else {
  10559		/* Go forward to the next mmio piece. */
  10560		frag->data += len;
  10561		frag->gpa += len;
  10562		frag->len -= len;
  10563	}
  10564
  10565	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
  10566		vcpu->mmio_needed = 0;
  10567
  10568		/* FIXME: return into emulator if single-stepping.  */
  10569		if (vcpu->mmio_is_write)
  10570			return 1;
  10571		vcpu->mmio_read_completed = 1;
  10572		return complete_emulated_io(vcpu);
  10573	}
  10574
  10575	run->exit_reason = KVM_EXIT_MMIO;
  10576	run->mmio.phys_addr = frag->gpa;
  10577	if (vcpu->mmio_is_write)
  10578		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
  10579	run->mmio.len = min(8u, frag->len);
  10580	run->mmio.is_write = vcpu->mmio_is_write;
  10581	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
  10582	return 0;
  10583}
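/*
 * Illustrative sketch (not part of this file): how a VMM might service the
 * KVM_EXIT_MMIO exits produced by the state machine above.  "run" is the
 * vCPU's mmap'ed struct kvm_run, "vcpu_fd" its file descriptor;
 * mmio_read()/mmio_write() are hypothetical device-model helpers and error
 * handling is omitted.
 *
 *	if (run->exit_reason == KVM_EXIT_MMIO) {
 *		if (run->mmio.is_write)
 *			mmio_write(run->mmio.phys_addr, run->mmio.data,
 *				   run->mmio.len);
 *		else
 *			mmio_read(run->mmio.phys_addr, run->mmio.data,
 *				  run->mmio.len);
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *	}
 *
 * Re-entering KVM_RUN lands in complete_emulated_mmio() above, which copies
 * run->mmio.data back into the fragment (for reads) and either advances to
 * the next fragment or resumes the emulated instruction.
 */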
  10584
  10585/* Swap (qemu) user FPU context for the guest FPU context. */
  10586static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
  10587{
  10588	/* Exclude PKRU, it's restored separately immediately after VM-Exit. */
  10589	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
  10590	trace_kvm_fpu(1);
  10591}
  10592
  10593/* When vcpu_run ends, restore user space FPU context. */
  10594static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
  10595{
  10596	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
  10597	++vcpu->stat.fpu_reload;
  10598	trace_kvm_fpu(0);
  10599}
  10600
  10601int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
  10602{
  10603	struct kvm_run *kvm_run = vcpu->run;
  10604	int r;
  10605
  10606	vcpu_load(vcpu);
  10607	kvm_sigset_activate(vcpu);
  10608	kvm_run->flags = 0;
  10609	kvm_load_guest_fpu(vcpu);
  10610
  10611	kvm_vcpu_srcu_read_lock(vcpu);
  10612	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
  10613		if (kvm_run->immediate_exit) {
  10614			r = -EINTR;
  10615			goto out;
  10616		}
  10617		/*
  10618		 * It should be impossible for the hypervisor timer to be in
  10619		 * use before KVM has ever run the vCPU.
  10620		 */
  10621		WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
  10622
  10623		kvm_vcpu_srcu_read_unlock(vcpu);
  10624		kvm_vcpu_block(vcpu);
  10625		kvm_vcpu_srcu_read_lock(vcpu);
  10626
  10627		if (kvm_apic_accept_events(vcpu) < 0) {
  10628			r = 0;
  10629			goto out;
  10630		}
  10631		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
  10632		r = -EAGAIN;
  10633		if (signal_pending(current)) {
  10634			r = -EINTR;
  10635			kvm_run->exit_reason = KVM_EXIT_INTR;
  10636			++vcpu->stat.signal_exits;
  10637		}
  10638		goto out;
  10639	}
  10640
  10641	if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
  10642	    (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
  10643		r = -EINVAL;
  10644		goto out;
  10645	}
  10646
  10647	if (kvm_run->kvm_dirty_regs) {
  10648		r = sync_regs(vcpu);
  10649		if (r != 0)
  10650			goto out;
  10651	}
  10652
  10653	/* re-sync apic's tpr */
  10654	if (!lapic_in_kernel(vcpu)) {
  10655		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
  10656			r = -EINVAL;
  10657			goto out;
  10658		}
  10659	}
  10660
  10661	if (unlikely(vcpu->arch.complete_userspace_io)) {
  10662		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
  10663		vcpu->arch.complete_userspace_io = NULL;
  10664		r = cui(vcpu);
  10665		if (r <= 0)
  10666			goto out;
  10667	} else
  10668		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
  10669
  10670	if (kvm_run->immediate_exit) {
  10671		r = -EINTR;
  10672		goto out;
  10673	}
  10674
  10675	r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
  10676	if (r <= 0)
  10677		goto out;
  10678
  10679	r = vcpu_run(vcpu);
  10680
  10681out:
  10682	kvm_put_guest_fpu(vcpu);
  10683	if (kvm_run->kvm_valid_regs)
  10684		store_regs(vcpu);
  10685	post_kvm_run_save(vcpu);
  10686	kvm_vcpu_srcu_read_unlock(vcpu);
  10687
  10688	kvm_sigset_deactivate(vcpu);
  10689	vcpu_put(vcpu);
  10690	return r;
  10691}
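/*
 * Illustrative sketch (not part of this file): a minimal userspace loop
 * driving the KVM_RUN ioctl serviced above.  It assumes vcpu_fd came from
 * KVM_CREATE_VCPU, mmap_size from KVM_GET_VCPU_MMAP_SIZE, and handle_exit()
 * is a hypothetical VMM dispatch routine; error handling is elided.
 *
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		if (run->exit_reason == KVM_EXIT_HLT)
 *			break;
 *		handle_exit(run);
 *	}
 */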
  10692
  10693static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
  10694{
  10695	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
  10696		/*
  10697		 * We are here if userspace calls get_regs() in the middle of
  10698		 * instruction emulation. Register state needs to be copied
  10699		 * back from the emulation context to the vcpu. Userspace shouldn't
  10700		 * usually do that, but some badly designed PV devices (vmware
  10701		 * backdoor interface) need this to work.
  10702		 */
  10703		emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
  10704		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
  10705	}
  10706	regs->rax = kvm_rax_read(vcpu);
  10707	regs->rbx = kvm_rbx_read(vcpu);
  10708	regs->rcx = kvm_rcx_read(vcpu);
  10709	regs->rdx = kvm_rdx_read(vcpu);
  10710	regs->rsi = kvm_rsi_read(vcpu);
  10711	regs->rdi = kvm_rdi_read(vcpu);
  10712	regs->rsp = kvm_rsp_read(vcpu);
  10713	regs->rbp = kvm_rbp_read(vcpu);
  10714#ifdef CONFIG_X86_64
  10715	regs->r8 = kvm_r8_read(vcpu);
  10716	regs->r9 = kvm_r9_read(vcpu);
  10717	regs->r10 = kvm_r10_read(vcpu);
  10718	regs->r11 = kvm_r11_read(vcpu);
  10719	regs->r12 = kvm_r12_read(vcpu);
  10720	regs->r13 = kvm_r13_read(vcpu);
  10721	regs->r14 = kvm_r14_read(vcpu);
  10722	regs->r15 = kvm_r15_read(vcpu);
  10723#endif
  10724
  10725	regs->rip = kvm_rip_read(vcpu);
  10726	regs->rflags = kvm_get_rflags(vcpu);
  10727}
  10728
  10729int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
  10730{
  10731	vcpu_load(vcpu);
  10732	__get_regs(vcpu, regs);
  10733	vcpu_put(vcpu);
  10734	return 0;
  10735}
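/*
 * Illustrative sketch (not part of this file): reading the GPRs gathered by
 * __get_regs() from userspace.  Assumes vcpu_fd is a vCPU fd; error handling
 * is omitted.  KVM_SET_REGS feeds __set_regs() below in the same way.
 *
 *	struct kvm_regs regs;
 *
 *	ioctl(vcpu_fd, KVM_GET_REGS, &regs);
 *	printf("rip=0x%llx rax=0x%llx rflags=0x%llx\n",
 *	       regs.rip, regs.rax, regs.rflags);
 */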
  10736
  10737static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
  10738{
  10739	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
  10740	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
  10741
  10742	kvm_rax_write(vcpu, regs->rax);
  10743	kvm_rbx_write(vcpu, regs->rbx);
  10744	kvm_rcx_write(vcpu, regs->rcx);
  10745	kvm_rdx_write(vcpu, regs->rdx);
  10746	kvm_rsi_write(vcpu, regs->rsi);
  10747	kvm_rdi_write(vcpu, regs->rdi);
  10748	kvm_rsp_write(vcpu, regs->rsp);
  10749	kvm_rbp_write(vcpu, regs->rbp);
  10750#ifdef CONFIG_X86_64
  10751	kvm_r8_write(vcpu, regs->r8);
  10752	kvm_r9_write(vcpu, regs->r9);
  10753	kvm_r10_write(vcpu, regs->r10);
  10754	kvm_r11_write(vcpu, regs->r11);
  10755	kvm_r12_write(vcpu, regs->r12);
  10756	kvm_r13_write(vcpu, regs->r13);
  10757	kvm_r14_write(vcpu, regs->r14);
  10758	kvm_r15_write(vcpu, regs->r15);
  10759#endif
  10760
  10761	kvm_rip_write(vcpu, regs->rip);
  10762	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
  10763
  10764	vcpu->arch.exception.pending = false;
  10765
  10766	kvm_make_request(KVM_REQ_EVENT, vcpu);
  10767}
  10768
  10769int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
  10770{
  10771	vcpu_load(vcpu);
  10772	__set_regs(vcpu, regs);
  10773	vcpu_put(vcpu);
  10774	return 0;
  10775}
  10776
  10777static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
  10778{
  10779	struct desc_ptr dt;
  10780
  10781	if (vcpu->arch.guest_state_protected)
  10782		goto skip_protected_regs;
  10783
  10784	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
  10785	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
  10786	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
  10787	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
  10788	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
  10789	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
  10790
  10791	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
  10792	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
  10793
  10794	static_call(kvm_x86_get_idt)(vcpu, &dt);
  10795	sregs->idt.limit = dt.size;
  10796	sregs->idt.base = dt.address;
  10797	static_call(kvm_x86_get_gdt)(vcpu, &dt);
  10798	sregs->gdt.limit = dt.size;
  10799	sregs->gdt.base = dt.address;
  10800
  10801	sregs->cr2 = vcpu->arch.cr2;
  10802	sregs->cr3 = kvm_read_cr3(vcpu);
  10803
  10804skip_protected_regs:
  10805	sregs->cr0 = kvm_read_cr0(vcpu);
  10806	sregs->cr4 = kvm_read_cr4(vcpu);
  10807	sregs->cr8 = kvm_get_cr8(vcpu);
  10808	sregs->efer = vcpu->arch.efer;
  10809	sregs->apic_base = kvm_get_apic_base(vcpu);
  10810}
  10811
  10812static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
  10813{
  10814	__get_sregs_common(vcpu, sregs);
  10815
  10816	if (vcpu->arch.guest_state_protected)
  10817		return;
  10818
  10819	if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
  10820		set_bit(vcpu->arch.interrupt.nr,
  10821			(unsigned long *)sregs->interrupt_bitmap);
  10822}
  10823
  10824static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
  10825{
  10826	int i;
  10827
  10828	__get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
  10829
  10830	if (vcpu->arch.guest_state_protected)
  10831		return;
  10832
  10833	if (is_pae_paging(vcpu)) {
  10834		for (i = 0 ; i < 4 ; i++)
  10835			sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
  10836		sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
  10837	}
  10838}
  10839
  10840int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
  10841				  struct kvm_sregs *sregs)
  10842{
  10843	vcpu_load(vcpu);
  10844	__get_sregs(vcpu, sregs);
  10845	vcpu_put(vcpu);
  10846	return 0;
  10847}
  10848
  10849int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
  10850				    struct kvm_mp_state *mp_state)
  10851{
  10852	int r;
  10853
  10854	vcpu_load(vcpu);
  10855	if (kvm_mpx_supported())
  10856		kvm_load_guest_fpu(vcpu);
  10857
  10858	r = kvm_apic_accept_events(vcpu);
  10859	if (r < 0)
  10860		goto out;
  10861	r = 0;
  10862
  10863	if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
  10864	     vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
  10865	    vcpu->arch.pv.pv_unhalted)
  10866		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
  10867	else
  10868		mp_state->mp_state = vcpu->arch.mp_state;
  10869
  10870out:
  10871	if (kvm_mpx_supported())
  10872		kvm_put_guest_fpu(vcpu);
  10873	vcpu_put(vcpu);
  10874	return r;
  10875}
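/*
 * Illustrative sketch (not part of this file): querying the run state
 * reported above from userspace.  Assumes vcpu_fd is a vCPU fd; error
 * handling is omitted.
 *
 *	struct kvm_mp_state mp;
 *
 *	ioctl(vcpu_fd, KVM_GET_MP_STATE, &mp);
 *	printf("mp_state=%u\n", mp.mp_state);	(e.g. KVM_MP_STATE_RUNNABLE)
 */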
  10876
  10877int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
  10878				    struct kvm_mp_state *mp_state)
  10879{
  10880	int ret = -EINVAL;
  10881
  10882	vcpu_load(vcpu);
  10883
  10884	if (!lapic_in_kernel(vcpu) &&
  10885	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
  10886		goto out;
  10887
  10888	/*
  10889	 * KVM_MP_STATE_INIT_RECEIVED means the processor is in
  10890	 * INIT state; latched init should be reported using
  10891	 * KVM_SET_VCPU_EVENTS, so reject it here.
  10892	 */
  10893	if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
  10894	    (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
  10895	     mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
  10896		goto out;
  10897
  10898	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
  10899		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
  10900		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
  10901	} else
  10902		vcpu->arch.mp_state = mp_state->mp_state;
  10903	kvm_make_request(KVM_REQ_EVENT, vcpu);
  10904
  10905	ret = 0;
  10906out:
  10907	vcpu_put(vcpu);
  10908	return ret;
  10909}
  10910
  10911int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
  10912		    int reason, bool has_error_code, u32 error_code)
  10913{
  10914	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  10915	int ret;
  10916
  10917	init_emulate_ctxt(vcpu);
  10918
  10919	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
  10920				   has_error_code, error_code);
  10921	if (ret) {
  10922		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
  10923		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
  10924		vcpu->run->internal.ndata = 0;
  10925		return 0;
  10926	}
  10927
  10928	kvm_rip_write(vcpu, ctxt->eip);
  10929	kvm_set_rflags(vcpu, ctxt->eflags);
  10930	return 1;
  10931}
  10932EXPORT_SYMBOL_GPL(kvm_task_switch);
  10933
  10934static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
  10935{
  10936	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
  10937		/*
  10938		 * When EFER.LME and CR0.PG are set, the processor is in
  10939		 * 64-bit mode (though maybe in a 32-bit code segment).
  10940		 * CR4.PAE and EFER.LMA must be set.
  10941		 */
  10942		if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
  10943			return false;
  10944		if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
  10945			return false;
  10946	} else {
  10947		/*
  10948		 * Not in 64-bit mode: EFER.LMA is clear and the code
  10949		 * segment cannot be 64-bit.
  10950		 */
  10951		if (sregs->efer & EFER_LMA || sregs->cs.l)
  10952			return false;
  10953	}
  10954
  10955	return kvm_is_valid_cr4(vcpu, sregs->cr4);
  10956}
  10957
  10958static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
  10959		int *mmu_reset_needed, bool update_pdptrs)
  10960{
  10961	struct msr_data apic_base_msr;
  10962	int idx;
  10963	struct desc_ptr dt;
  10964
  10965	if (!kvm_is_valid_sregs(vcpu, sregs))
  10966		return -EINVAL;
  10967
  10968	apic_base_msr.data = sregs->apic_base;
  10969	apic_base_msr.host_initiated = true;
  10970	if (kvm_set_apic_base(vcpu, &apic_base_msr))
  10971		return -EINVAL;
  10972
  10973	if (vcpu->arch.guest_state_protected)
  10974		return 0;
  10975
  10976	dt.size = sregs->idt.limit;
  10977	dt.address = sregs->idt.base;
  10978	static_call(kvm_x86_set_idt)(vcpu, &dt);
  10979	dt.size = sregs->gdt.limit;
  10980	dt.address = sregs->gdt.base;
  10981	static_call(kvm_x86_set_gdt)(vcpu, &dt);
  10982
  10983	vcpu->arch.cr2 = sregs->cr2;
  10984	*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
  10985	vcpu->arch.cr3 = sregs->cr3;
  10986	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
  10987	static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
  10988
  10989	kvm_set_cr8(vcpu, sregs->cr8);
  10990
  10991	*mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
  10992	static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
  10993
  10994	*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
  10995	static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
  10996	vcpu->arch.cr0 = sregs->cr0;
  10997
  10998	*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
  10999	static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
  11000
  11001	if (update_pdptrs) {
  11002		idx = srcu_read_lock(&vcpu->kvm->srcu);
  11003		if (is_pae_paging(vcpu)) {
  11004			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
  11005			*mmu_reset_needed = 1;
  11006		}
  11007		srcu_read_unlock(&vcpu->kvm->srcu, idx);
  11008	}
  11009
  11010	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
  11011	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
  11012	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
  11013	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
  11014	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
  11015	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
  11016
  11017	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
  11018	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
  11019
  11020	update_cr8_intercept(vcpu);
  11021
  11022	/* Older userspace won't unhalt the vcpu on reset. */
  11023	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
  11024	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
  11025	    !is_protmode(vcpu))
  11026		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  11027
  11028	return 0;
  11029}
  11030
  11031static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
  11032{
  11033	int pending_vec, max_bits;
  11034	int mmu_reset_needed = 0;
  11035	int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
  11036
  11037	if (ret)
  11038		return ret;
  11039
  11040	if (mmu_reset_needed)
  11041		kvm_mmu_reset_context(vcpu);
  11042
  11043	max_bits = KVM_NR_INTERRUPTS;
  11044	pending_vec = find_first_bit(
  11045		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
  11046
  11047	if (pending_vec < max_bits) {
  11048		kvm_queue_interrupt(vcpu, pending_vec, false);
  11049		pr_debug("Set back pending irq %d\n", pending_vec);
  11050		kvm_make_request(KVM_REQ_EVENT, vcpu);
  11051	}
  11052	return 0;
  11053}
  11054
  11055static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
  11056{
  11057	int mmu_reset_needed = 0;
  11058	bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
  11059	bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
  11060		!(sregs2->efer & EFER_LMA);
  11061	int i, ret;
  11062
  11063	if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
  11064		return -EINVAL;
  11065
  11066	if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
  11067		return -EINVAL;
  11068
  11069	ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
  11070				 &mmu_reset_needed, !valid_pdptrs);
  11071	if (ret)
  11072		return ret;
  11073
  11074	if (valid_pdptrs) {
  11075		for (i = 0; i < 4 ; i++)
  11076			kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
  11077
  11078		kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
  11079		mmu_reset_needed = 1;
  11080		vcpu->arch.pdptrs_from_userspace = true;
  11081	}
  11082	if (mmu_reset_needed)
  11083		kvm_mmu_reset_context(vcpu);
  11084	return 0;
  11085}
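/*
 * Illustrative sketch (not part of this file): how userspace might hand the
 * PDPTEs to KVM through the SREGS2 interface consumed by __set_sregs2()
 * above.  Assumes vcpu_fd is a vCPU fd, the guest is in PAE paging mode, and
 * saved_pdptrs is a hypothetical array of four __u64 values; error handling
 * is omitted.
 *
 *	struct kvm_sregs2 sregs2;
 *
 *	ioctl(vcpu_fd, KVM_GET_SREGS2, &sregs2);
 *	memcpy(sregs2.pdptrs, saved_pdptrs, sizeof(sregs2.pdptrs));
 *	sregs2.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
 *	ioctl(vcpu_fd, KVM_SET_SREGS2, &sregs2);
 */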
  11086
  11087int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
  11088				  struct kvm_sregs *sregs)
  11089{
  11090	int ret;
  11091
  11092	vcpu_load(vcpu);
  11093	ret = __set_sregs(vcpu, sregs);
  11094	vcpu_put(vcpu);
  11095	return ret;
  11096}
  11097
  11098static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
  11099{
  11100	bool set = false;
  11101	struct kvm_vcpu *vcpu;
  11102	unsigned long i;
  11103
  11104	if (!enable_apicv)
  11105		return;
  11106
  11107	down_write(&kvm->arch.apicv_update_lock);
  11108
  11109	kvm_for_each_vcpu(i, vcpu, kvm) {
  11110		if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
  11111			set = true;
  11112			break;
  11113		}
  11114	}
  11115	__kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
  11116	up_write(&kvm->arch.apicv_update_lock);
  11117}
  11118
  11119int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
  11120					struct kvm_guest_debug *dbg)
  11121{
  11122	unsigned long rflags;
  11123	int i, r;
  11124
  11125	if (vcpu->arch.guest_state_protected)
  11126		return -EINVAL;
  11127
  11128	vcpu_load(vcpu);
  11129
  11130	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
  11131		r = -EBUSY;
  11132		if (vcpu->arch.exception.pending)
  11133			goto out;
  11134		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
  11135			kvm_queue_exception(vcpu, DB_VECTOR);
  11136		else
  11137			kvm_queue_exception(vcpu, BP_VECTOR);
  11138	}
  11139
  11140	/*
  11141	 * Read rflags as long as potentially injected trace flags are still
  11142	 * filtered out.
  11143	 */
  11144	rflags = kvm_get_rflags(vcpu);
  11145
  11146	vcpu->guest_debug = dbg->control;
  11147	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
  11148		vcpu->guest_debug = 0;
  11149
  11150	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
  11151		for (i = 0; i < KVM_NR_DB_REGS; ++i)
  11152			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
  11153		vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
  11154	} else {
  11155		for (i = 0; i < KVM_NR_DB_REGS; i++)
  11156			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
  11157	}
  11158	kvm_update_dr7(vcpu);
  11159
  11160	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
  11161		vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
  11162
  11163	/*
  11164	 * Trigger an rflags update that will inject or remove the trace
  11165	 * flags.
  11166	 */
  11167	kvm_set_rflags(vcpu, rflags);
  11168
  11169	static_call(kvm_x86_update_exception_bitmap)(vcpu);
  11170
  11171	kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
  11172
  11173	r = 0;
  11174
  11175out:
  11176	vcpu_put(vcpu);
  11177	return r;
  11178}
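/*
 * Illustrative sketch (not part of this file): enabling single-step from
 * userspace via the ioctl handled above.  Assumes vcpu_fd is a vCPU fd;
 * error handling is omitted.
 *
 *	struct kvm_guest_debug dbg = { 0 };
 *
 *	dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
 *	ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
 *
 * Each subsequent KVM_RUN then exits with KVM_EXIT_DEBUG after one guest
 * instruction.
 */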
  11179
  11180/*
  11181 * Translate a guest virtual address to a guest physical address.
  11182 */
  11183int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
  11184				    struct kvm_translation *tr)
  11185{
  11186	unsigned long vaddr = tr->linear_address;
  11187	gpa_t gpa;
  11188	int idx;
  11189
  11190	vcpu_load(vcpu);
  11191
  11192	idx = srcu_read_lock(&vcpu->kvm->srcu);
  11193	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
  11194	srcu_read_unlock(&vcpu->kvm->srcu, idx);
  11195	tr->physical_address = gpa;
  11196	tr->valid = gpa != UNMAPPED_GVA;
  11197	tr->writeable = 1;
  11198	tr->usermode = 0;
  11199
  11200	vcpu_put(vcpu);
  11201	return 0;
  11202}
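/*
 * Illustrative sketch (not part of this file): performing the GVA->GPA
 * lookup above from userspace.  Assumes vcpu_fd is a vCPU fd and gva is a
 * hypothetical guest virtual address; error handling is omitted.
 *
 *	struct kvm_translation tr = { .linear_address = gva };
 *
 *	ioctl(vcpu_fd, KVM_TRANSLATE, &tr);
 *	if (tr.valid)
 *		printf("gpa=0x%llx\n", tr.physical_address);
 */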
  11203
  11204int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
  11205{
  11206	struct fxregs_state *fxsave;
  11207
  11208	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
  11209		return 0;
  11210
  11211	vcpu_load(vcpu);
  11212
  11213	fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
  11214	memcpy(fpu->fpr, fxsave->st_space, 128);
  11215	fpu->fcw = fxsave->cwd;
  11216	fpu->fsw = fxsave->swd;
  11217	fpu->ftwx = fxsave->twd;
  11218	fpu->last_opcode = fxsave->fop;
  11219	fpu->last_ip = fxsave->rip;
  11220	fpu->last_dp = fxsave->rdp;
  11221	memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
  11222
  11223	vcpu_put(vcpu);
  11224	return 0;
  11225}
  11226
  11227int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
  11228{
  11229	struct fxregs_state *fxsave;
  11230
  11231	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
  11232		return 0;
  11233
  11234	vcpu_load(vcpu);
  11235
  11236	fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
  11237
  11238	memcpy(fxsave->st_space, fpu->fpr, 128);
  11239	fxsave->cwd = fpu->fcw;
  11240	fxsave->swd = fpu->fsw;
  11241	fxsave->twd = fpu->ftwx;
  11242	fxsave->fop = fpu->last_opcode;
  11243	fxsave->rip = fpu->last_ip;
  11244	fxsave->rdp = fpu->last_dp;
  11245	memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
  11246
  11247	vcpu_put(vcpu);
  11248	return 0;
  11249}
  11250
  11251static void store_regs(struct kvm_vcpu *vcpu)
  11252{
  11253	BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
  11254
  11255	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
  11256		__get_regs(vcpu, &vcpu->run->s.regs.regs);
  11257
  11258	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
  11259		__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
  11260
  11261	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
  11262		kvm_vcpu_ioctl_x86_get_vcpu_events(
  11263				vcpu, &vcpu->run->s.regs.events);
  11264}
  11265
  11266static int sync_regs(struct kvm_vcpu *vcpu)
  11267{
  11268	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
  11269		__set_regs(vcpu, &vcpu->run->s.regs.regs);
  11270		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
  11271	}
  11272	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
  11273		if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
  11274			return -EINVAL;
  11275		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
  11276	}
  11277	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
  11278		if (kvm_vcpu_ioctl_x86_set_vcpu_events(
  11279				vcpu, &vcpu->run->s.regs.events))
  11280			return -EINVAL;
  11281		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
  11282	}
  11283
  11284	return 0;
  11285}
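/*
 * Illustrative sketch (not part of this file): using the synced-register
 * blocks served by store_regs()/sync_regs() above instead of separate
 * KVM_GET_REGS/KVM_SET_REGS round trips.  Assumes "run" is the vCPU's
 * mmap'ed struct kvm_run and KVM_CAP_SYNC_REGS is available; the rip
 * adjustment is an arbitrary example.
 *
 *	run->kvm_valid_regs = KVM_SYNC_X86_REGS;	(KVM fills s.regs.regs)
 *	ioctl(vcpu_fd, KVM_RUN, 0);
 *	run->s.regs.regs.rip += 2;
 *	run->kvm_dirty_regs = KVM_SYNC_X86_REGS;	(sync_regs() applies it)
 *	ioctl(vcpu_fd, KVM_RUN, 0);
 */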
  11286
  11287int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
  11288{
  11289	if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
  11290		pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
  11291			     "guest TSC will not be reliable\n");
  11292
  11293	return 0;
  11294}
  11295
  11296int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
  11297{
  11298	struct page *page;
  11299	int r;
  11300
  11301	vcpu->arch.last_vmentry_cpu = -1;
  11302	vcpu->arch.regs_avail = ~0;
  11303	vcpu->arch.regs_dirty = ~0;
  11304
  11305	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
  11306		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  11307	else
  11308		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
  11309
  11310	r = kvm_mmu_create(vcpu);
  11311	if (r < 0)
  11312		return r;
  11313
  11314	if (irqchip_in_kernel(vcpu->kvm)) {
  11315		r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
  11316		if (r < 0)
  11317			goto fail_mmu_destroy;
  11318
  11319		/*
  11320		 * Defer evaluating inhibits until the vCPU is first run, as
  11321		 * this vCPU will not get notified of any changes until this
  11322		 * vCPU is visible to other vCPUs (marked online and added to
  11323		 * the set of vCPUs).  Opportunistically mark APICv active as
  11324		 * VMX in particular is highly unlikely to have inhibits.
  11325		 * Ignore the current per-VM APICv state so that vCPU creation
  11326		 * is guaranteed to run with a deterministic value; the request
  11327		 * will ensure the vCPU gets the correct state before VM-Entry.
  11328		 */
  11329		if (enable_apicv) {
  11330			vcpu->arch.apicv_active = true;
  11331			kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
  11332		}
  11333	} else
  11334		static_branch_inc(&kvm_has_noapic_vcpu);
  11335
  11336	r = -ENOMEM;
  11337
  11338	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
  11339	if (!page)
  11340		goto fail_free_lapic;
  11341	vcpu->arch.pio_data = page_address(page);
  11342
  11343	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
  11344				       GFP_KERNEL_ACCOUNT);
  11345	if (!vcpu->arch.mce_banks)
  11346		goto fail_free_pio_data;
  11347	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
  11348
  11349	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
  11350				GFP_KERNEL_ACCOUNT))
  11351		goto fail_free_mce_banks;
  11352
  11353	if (!alloc_emulate_ctxt(vcpu))
  11354		goto free_wbinvd_dirty_mask;
  11355
  11356	if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
  11357		pr_err("kvm: failed to allocate vcpu's fpu\n");
  11358		goto free_emulate_ctxt;
  11359	}
  11360
  11361	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
  11362	vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
  11363
  11364	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
  11365
  11366	kvm_async_pf_hash_reset(vcpu);
  11367	kvm_pmu_init(vcpu);
  11368
  11369	vcpu->arch.pending_external_vector = -1;
  11370	vcpu->arch.preempted_in_kernel = false;
  11371
  11372#if IS_ENABLED(CONFIG_HYPERV)
  11373	vcpu->arch.hv_root_tdp = INVALID_PAGE;
  11374#endif
  11375
  11376	r = static_call(kvm_x86_vcpu_create)(vcpu);
  11377	if (r)
  11378		goto free_guest_fpu;
  11379
  11380	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
  11381	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
  11382	kvm_xen_init_vcpu(vcpu);
  11383	kvm_vcpu_mtrr_init(vcpu);
  11384	vcpu_load(vcpu);
  11385	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
  11386	kvm_vcpu_reset(vcpu, false);
  11387	kvm_init_mmu(vcpu);
  11388	vcpu_put(vcpu);
  11389	return 0;
  11390
  11391free_guest_fpu:
  11392	fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
  11393free_emulate_ctxt:
  11394	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
  11395free_wbinvd_dirty_mask:
  11396	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
  11397fail_free_mce_banks:
  11398	kfree(vcpu->arch.mce_banks);
  11399fail_free_pio_data:
  11400	free_page((unsigned long)vcpu->arch.pio_data);
  11401fail_free_lapic:
  11402	kvm_free_lapic(vcpu);
  11403fail_mmu_destroy:
  11404	kvm_mmu_destroy(vcpu);
  11405	return r;
  11406}
  11407
  11408void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
  11409{
  11410	struct kvm *kvm = vcpu->kvm;
  11411
  11412	if (mutex_lock_killable(&vcpu->mutex))
  11413		return;
  11414	vcpu_load(vcpu);
  11415	kvm_synchronize_tsc(vcpu, 0);
  11416	vcpu_put(vcpu);
  11417
  11418	/* poll control enabled by default */
  11419	vcpu->arch.msr_kvm_poll_control = 1;
  11420
  11421	mutex_unlock(&vcpu->mutex);
  11422
  11423	if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
  11424		schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
  11425						KVMCLOCK_SYNC_PERIOD);
  11426}
  11427
  11428void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
  11429{
  11430	int idx;
  11431
  11432	kvmclock_reset(vcpu);
  11433
  11434	static_call(kvm_x86_vcpu_free)(vcpu);
  11435
  11436	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
  11437	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
  11438	fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
  11439
  11440	kvm_xen_destroy_vcpu(vcpu);
  11441	kvm_hv_vcpu_uninit(vcpu);
  11442	kvm_pmu_destroy(vcpu);
  11443	kfree(vcpu->arch.mce_banks);
  11444	kvm_free_lapic(vcpu);
  11445	idx = srcu_read_lock(&vcpu->kvm->srcu);
  11446	kvm_mmu_destroy(vcpu);
  11447	srcu_read_unlock(&vcpu->kvm->srcu, idx);
  11448	free_page((unsigned long)vcpu->arch.pio_data);
  11449	kvfree(vcpu->arch.cpuid_entries);
  11450	if (!lapic_in_kernel(vcpu))
  11451		static_branch_dec(&kvm_has_noapic_vcpu);
  11452}
  11453
  11454void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  11455{
  11456	struct kvm_cpuid_entry2 *cpuid_0x1;
  11457	unsigned long old_cr0 = kvm_read_cr0(vcpu);
  11458	unsigned long new_cr0;
  11459
  11460	/*
  11461	 * Several of the "set" flows, e.g. ->set_cr0(), read other registers
  11462	 * to handle side effects.  RESET emulation hits those flows and relies
  11463	 * on emulated/virtualized registers, including those that are loaded
  11464	 * into hardware, to be zeroed at vCPU creation.  Use CRs as a sentinel
  11465	 * to detect improper or missing initialization.
  11466	 */
  11467	WARN_ON_ONCE(!init_event &&
  11468		     (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
  11469
  11470	kvm_lapic_reset(vcpu, init_event);
  11471
  11472	vcpu->arch.hflags = 0;
  11473
  11474	vcpu->arch.smi_pending = 0;
  11475	vcpu->arch.smi_count = 0;
  11476	atomic_set(&vcpu->arch.nmi_queued, 0);
  11477	vcpu->arch.nmi_pending = 0;
  11478	vcpu->arch.nmi_injected = false;
  11479	kvm_clear_interrupt_queue(vcpu);
  11480	kvm_clear_exception_queue(vcpu);
  11481
  11482	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
  11483	kvm_update_dr0123(vcpu);
  11484	vcpu->arch.dr6 = DR6_ACTIVE_LOW;
  11485	vcpu->arch.dr7 = DR7_FIXED_1;
  11486	kvm_update_dr7(vcpu);
  11487
  11488	vcpu->arch.cr2 = 0;
  11489
  11490	kvm_make_request(KVM_REQ_EVENT, vcpu);
  11491	vcpu->arch.apf.msr_en_val = 0;
  11492	vcpu->arch.apf.msr_int_val = 0;
  11493	vcpu->arch.st.msr_val = 0;
  11494
  11495	kvmclock_reset(vcpu);
  11496
  11497	kvm_clear_async_pf_completion_queue(vcpu);
  11498	kvm_async_pf_hash_reset(vcpu);
  11499	vcpu->arch.apf.halted = false;
  11500
  11501	if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
  11502		struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
  11503
  11504		/*
  11505		 * Avoid having the INIT path from kvm_apic_has_events() be called
  11506		 * with the FPU loaded, as that does not let userspace fix the state.
  11507		 */
  11508		if (init_event)
  11509			kvm_put_guest_fpu(vcpu);
  11510
  11511		fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
  11512		fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
  11513
  11514		if (init_event)
  11515			kvm_load_guest_fpu(vcpu);
  11516	}
  11517
  11518	if (!init_event) {
  11519		kvm_pmu_reset(vcpu);
  11520		vcpu->arch.smbase = 0x30000;
  11521
  11522		vcpu->arch.msr_misc_features_enables = 0;
  11523
  11524		__kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
  11525		__kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
  11526	}
  11527
  11528	/* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
  11529	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
  11530	kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
  11531
  11532	/*
  11533	 * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
  11534	 * if no CPUID match is found.  Note, it's impossible to get a match at
  11535	 * RESET since KVM emulates RESET before exposing the vCPU to userspace,
  11536	 * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
  11537	 * on RESET.  But, go through the motions in case that's ever remedied.
  11538	 */
  11539	cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
  11540	kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
  11541
  11542	static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
  11543
  11544	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
  11545	kvm_rip_write(vcpu, 0xfff0);
  11546
  11547	vcpu->arch.cr3 = 0;
  11548	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
  11549
  11550	/*
  11551	 * CR0.CD/NW are set on RESET, preserved on INIT.  Note, some versions
  11552	 * of Intel's SDM list CD/NW as being set on INIT, but they contradict
  11553	 * (or qualify) that with a footnote stating that CD/NW are preserved.
  11554	 */
  11555	new_cr0 = X86_CR0_ET;
  11556	if (init_event)
  11557		new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
  11558	else
  11559		new_cr0 |= X86_CR0_NW | X86_CR0_CD;
  11560
  11561	static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
  11562	static_call(kvm_x86_set_cr4)(vcpu, 0);
  11563	static_call(kvm_x86_set_efer)(vcpu, 0);
  11564	static_call(kvm_x86_update_exception_bitmap)(vcpu);
  11565
  11566	/*
  11567	 * On the standard CR0/CR4/EFER modification paths, there are several
  11568	 * complex conditions determining whether the MMU has to be reset and/or
  11569	 * which PCIDs have to be flushed.  However, CR0.WP and the paging-related
  11570	 * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
  11571	 * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
  11572	 * CR0 will be '0' prior to RESET).  So we only need to check CR0.PG here.
  11573	 */
  11574	if (old_cr0 & X86_CR0_PG) {
  11575		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
  11576		kvm_mmu_reset_context(vcpu);
  11577	}
  11578
  11579	/*
  11580	 * Intel's SDM states that all TLB entries are flushed on INIT.  AMD's
  11581	 * APM states the TLBs are untouched by INIT, but it also states that
  11582	 * the TLBs are flushed on "External initialization of the processor."
  11583	 * Flush the guest TLB regardless of vendor, there is no meaningful
  11584	 * benefit in relying on the guest to flush the TLB immediately after
  11585	 * INIT.  A spurious TLB flush is benign and likely negligible from a
  11586	 * performance perspective.
  11587	 */
  11588	if (init_event)
  11589		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
  11590}
  11591EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
  11592
  11593void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
  11594{
  11595	struct kvm_segment cs;
  11596
  11597	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
  11598	cs.selector = vector << 8;
  11599	cs.base = vector << 12;
  11600	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
  11601	kvm_rip_write(vcpu, 0);
  11602}
  11603EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
  11604
  11605int kvm_arch_hardware_enable(void)
  11606{
  11607	struct kvm *kvm;
  11608	struct kvm_vcpu *vcpu;
  11609	unsigned long i;
  11610	int ret;
  11611	u64 local_tsc;
  11612	u64 max_tsc = 0;
  11613	bool stable, backwards_tsc = false;
  11614
  11615	kvm_user_return_msr_cpu_online();
  11616	ret = static_call(kvm_x86_hardware_enable)();
  11617	if (ret != 0)
  11618		return ret;
  11619
  11620	local_tsc = rdtsc();
  11621	stable = !kvm_check_tsc_unstable();
  11622	list_for_each_entry(kvm, &vm_list, vm_list) {
  11623		kvm_for_each_vcpu(i, vcpu, kvm) {
  11624			if (!stable && vcpu->cpu == smp_processor_id())
  11625				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
  11626			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
  11627				backwards_tsc = true;
  11628				if (vcpu->arch.last_host_tsc > max_tsc)
  11629					max_tsc = vcpu->arch.last_host_tsc;
  11630			}
  11631		}
  11632	}
  11633
  11634	/*
  11635	 * Sometimes, even reliable TSCs go backwards.  This happens on
  11636	 * platforms that reset TSC during suspend or hibernate actions, but
  11637	 * maintain synchronization.  We must compensate.  Fortunately, we can
  11638	 * detect that condition here, which happens early in CPU bringup,
  11639	 * before any KVM threads can be running.  Unfortunately, we can't
  11640	 * bring the TSCs fully up to date with real time, as we aren't yet far
  11641	 * enough into CPU bringup that we know how much real time has actually
  11642	 * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
  11643	 * variables that haven't been updated yet.
  11644	 *
  11645	 * So we simply find the maximum observed TSC above, then record the
  11646	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
  11647	 * the adjustment will be applied.  Note that we accumulate
  11648	 * adjustments, in case multiple suspend cycles happen before some VCPU
  11649	 * gets a chance to run again.  In the event that no KVM threads get a
  11650	 * chance to run, we will miss the entire elapsed period, as we'll have
  11651	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
  11652	 * loose cycle time.  This isn't too big a deal, since the loss will be
  11653	 * lose cycle time.  This isn't too big a deal, since the loss will be
  11654	 * unlikely). It is possible that a second hibernate recovery happens
  11655	 * much faster than a first, causing the observed TSC here to be
  11656	 * smaller; this would require additional padding adjustment, which is
  11657	 * why we set last_host_tsc to the local tsc observed here.
  11658	 *
  11659	 * N.B. - this code below runs only on platforms with reliable TSC,
  11660	 * as that is the only way backwards_tsc is set above.  Also note
  11661	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
  11662	 * have the same delta_cyc adjustment applied if backwards_tsc
  11663	 * is detected.  Note further, this adjustment is only done once,
  11664	 * as we reset last_host_tsc on all VCPUs to stop this from being
  11665	 * called multiple times (one for each physical CPU bringup).
  11666	 *
  11667	 * Platforms with unreliable TSCs don't have to deal with this, they
  11668	 * will be compensated by the logic in vcpu_load, which sets the TSC to
  11669	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
  11670	 * guarantee that they stay in perfect synchronization.
  11671	 */
  11672	if (backwards_tsc) {
  11673		u64 delta_cyc = max_tsc - local_tsc;
  11674		list_for_each_entry(kvm, &vm_list, vm_list) {
  11675			kvm->arch.backwards_tsc_observed = true;
  11676			kvm_for_each_vcpu(i, vcpu, kvm) {
  11677				vcpu->arch.tsc_offset_adjustment += delta_cyc;
  11678				vcpu->arch.last_host_tsc = local_tsc;
  11679				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
  11680			}
  11681
  11682			/*
  11683			 * We have to disable TSC offset matching; if you were
  11684			 * booting a VM while issuing an S4 host suspend,
  11685			 * you may have some problems.  Solving this issue is
  11686			 * left as an exercise to the reader.
  11687			 */
  11688			kvm->arch.last_tsc_nsec = 0;
  11689			kvm->arch.last_tsc_write = 0;
  11690		}
  11691
  11692	}
  11693	return 0;
  11694}
  11695
  11696void kvm_arch_hardware_disable(void)
  11697{
  11698	static_call(kvm_x86_hardware_disable)();
  11699	drop_user_return_notifiers();
  11700}
  11701
  11702static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
  11703{
  11704	memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
  11705
  11706#define __KVM_X86_OP(func) \
  11707	static_call_update(kvm_x86_##func, kvm_x86_ops.func);
  11708#define KVM_X86_OP(func) \
  11709	WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
  11710#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
  11711#define KVM_X86_OP_OPTIONAL_RET0(func) \
  11712	static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
  11713					   (void *)__static_call_return0);
  11714#include <asm/kvm-x86-ops.h>
  11715#undef __KVM_X86_OP
  11716
  11717	kvm_pmu_ops_update(ops->pmu_ops);
  11718}
  11719
  11720int kvm_arch_hardware_setup(void *opaque)
  11721{
  11722	struct kvm_x86_init_ops *ops = opaque;
  11723	int r;
  11724
  11725	rdmsrl_safe(MSR_EFER, &host_efer);
  11726
  11727	if (boot_cpu_has(X86_FEATURE_XSAVES))
  11728		rdmsrl(MSR_IA32_XSS, host_xss);
  11729
  11730	r = ops->hardware_setup();
  11731	if (r != 0)
  11732		return r;
  11733
  11734	kvm_ops_update(ops);
  11735
  11736	kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
  11737
  11738	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
  11739		supported_xss = 0;
  11740
  11741#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
  11742	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
  11743#undef __kvm_cpu_cap_has
  11744
  11745	if (kvm_has_tsc_control) {
  11746		/*
  11747		 * Make sure the user can only configure tsc_khz values that
  11748		 * fit into a signed integer.
  11749		 * A min value is not calculated because it will always
  11750		 * be 1 on all machines.
  11751		 */
  11752		u64 max = min(0x7fffffffULL,
  11753			      __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
  11754		kvm_max_guest_tsc_khz = max;
  11755	}
  11756	kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
  11757	kvm_init_msr_list();
  11758	return 0;
  11759}
  11760
  11761void kvm_arch_hardware_unsetup(void)
  11762{
  11763	kvm_unregister_perf_callbacks();
  11764
  11765	static_call(kvm_x86_hardware_unsetup)();
  11766}
  11767
  11768int kvm_arch_check_processor_compat(void *opaque)
  11769{
  11770	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
  11771	struct kvm_x86_init_ops *ops = opaque;
  11772
  11773	WARN_ON(!irqs_disabled());
  11774
  11775	if (__cr4_reserved_bits(cpu_has, c) !=
  11776	    __cr4_reserved_bits(cpu_has, &boot_cpu_data))
  11777		return -EIO;
  11778
  11779	return ops->check_processor_compatibility();
  11780}
  11781
  11782bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
  11783{
  11784	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
  11785}
  11786EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
  11787
  11788bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
  11789{
  11790	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
  11791}
  11792
  11793__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
  11794EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
  11795
  11796void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
  11797{
  11798	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
  11799
  11800	vcpu->arch.l1tf_flush_l1d = true;
  11801	if (pmu->version && unlikely(pmu->event_count)) {
  11802		pmu->need_cleanup = true;
  11803		kvm_make_request(KVM_REQ_PMU, vcpu);
  11804	}
  11805	static_call(kvm_x86_sched_in)(vcpu, cpu);
  11806}
  11807
  11808void kvm_arch_free_vm(struct kvm *kvm)
  11809{
  11810	kfree(to_kvm_hv(kvm)->hv_pa_pg);
  11811	__kvm_arch_free_vm(kvm);
  11812}
  11813
  11815int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  11816{
  11817	int ret;
  11818	unsigned long flags;
  11819
  11820	if (type)
  11821		return -EINVAL;
  11822
  11823	ret = kvm_page_track_init(kvm);
  11824	if (ret)
  11825		goto out;
  11826
  11827	ret = kvm_mmu_init_vm(kvm);
  11828	if (ret)
  11829		goto out_page_track;
  11830
  11831	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
  11832	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
  11833	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
  11834
  11835	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
  11836	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
  11837	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
  11838	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
  11839		&kvm->arch.irq_sources_bitmap);
  11840
  11841	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
  11842	mutex_init(&kvm->arch.apic_map_lock);
  11843	seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
  11844	kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
  11845
  11846	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
  11847	pvclock_update_vm_gtod_copy(kvm);
  11848	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
  11849
  11850	kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
  11851	kvm->arch.guest_can_read_msr_platform_info = true;
  11852	kvm->arch.enable_pmu = enable_pmu;
  11853
  11854#if IS_ENABLED(CONFIG_HYPERV)
  11855	spin_lock_init(&kvm->arch.hv_root_tdp_lock);
  11856	kvm->arch.hv_root_tdp = INVALID_PAGE;
  11857#endif
  11858
  11859	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
  11860	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
  11861
  11862	kvm_apicv_init(kvm);
  11863	kvm_hv_init_vm(kvm);
  11864	kvm_xen_init_vm(kvm);
  11865
  11866	return static_call(kvm_x86_vm_init)(kvm);
  11867
  11868out_page_track:
  11869	kvm_page_track_cleanup(kvm);
  11870out:
  11871	return ret;
  11872}
  11873
  11874int kvm_arch_post_init_vm(struct kvm *kvm)
  11875{
  11876	return kvm_mmu_post_init_vm(kvm);
  11877}
  11878
  11879static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
  11880{
  11881	vcpu_load(vcpu);
  11882	kvm_mmu_unload(vcpu);
  11883	vcpu_put(vcpu);
  11884}
  11885
  11886static void kvm_unload_vcpu_mmus(struct kvm *kvm)
  11887{
  11888	unsigned long i;
  11889	struct kvm_vcpu *vcpu;
  11890
  11891	kvm_for_each_vcpu(i, vcpu, kvm) {
  11892		kvm_clear_async_pf_completion_queue(vcpu);
  11893		kvm_unload_vcpu_mmu(vcpu);
  11894	}
  11895}
  11896
  11897void kvm_arch_sync_events(struct kvm *kvm)
  11898{
  11899	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
  11900	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
  11901	kvm_free_pit(kvm);
  11902}
  11903
  11904/**
  11905 * __x86_set_memory_region: Setup KVM internal memory slot
  11906 *
  11907 * @kvm: the kvm pointer to the VM.
  11908 * @id: the slot ID to setup.
  11909 * @gpa: the GPA to install the slot (unused when @size == 0).
  11910 * @size: the size of the slot. Set to zero to uninstall a slot.
  11911 *
   11912 * This function helps to set up a KVM internal memory slot.  Specify
   11913 * @size > 0 to install a new slot, or @size == 0 to uninstall an
   11914 * existing slot.  The return code can be one of the following:
  11915 *
  11916 *   HVA:           on success (uninstall will return a bogus HVA)
  11917 *   -errno:        on error
  11918 *
  11919 * The caller should always use IS_ERR() to check the return value
  11920 * before use.  Note, the KVM internal memory slots are guaranteed to
  11921 * remain valid and unchanged until the VM is destroyed, i.e., the
  11922 * GPA->HVA translation will not change.  However, the HVA is a user
   11923 * address, i.e. its accessibility is not guaranteed, and it must be
  11924 * accessed via __copy_{to,from}_user().
  11925 */
  11926void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
  11927				      u32 size)
  11928{
  11929	int i, r;
  11930	unsigned long hva, old_npages;
  11931	struct kvm_memslots *slots = kvm_memslots(kvm);
  11932	struct kvm_memory_slot *slot;
  11933
  11934	/* Called with kvm->slots_lock held.  */
  11935	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
  11936		return ERR_PTR_USR(-EINVAL);
  11937
  11938	slot = id_to_memslot(slots, id);
  11939	if (size) {
  11940		if (slot && slot->npages)
  11941			return ERR_PTR_USR(-EEXIST);
  11942
  11943		/*
  11944		 * MAP_SHARED to prevent internal slot pages from being moved
  11945		 * by fork()/COW.
  11946		 */
  11947		hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
  11948			      MAP_SHARED | MAP_ANONYMOUS, 0);
  11949		if (IS_ERR((void *)hva))
  11950			return (void __user *)hva;
  11951	} else {
  11952		if (!slot || !slot->npages)
  11953			return NULL;
  11954
  11955		old_npages = slot->npages;
  11956		hva = slot->userspace_addr;
  11957	}
  11958
  11959	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
  11960		struct kvm_userspace_memory_region m;
  11961
  11962		m.slot = id | (i << 16);
  11963		m.flags = 0;
  11964		m.guest_phys_addr = gpa;
  11965		m.userspace_addr = hva;
  11966		m.memory_size = size;
  11967		r = __kvm_set_memory_region(kvm, &m);
  11968		if (r < 0)
  11969			return ERR_PTR_USR(r);
  11970	}
  11971
  11972	if (!size)
  11973		vm_munmap(hva, old_npages * PAGE_SIZE);
  11974
  11975	return (void __user *)hva;
  11976}
  11977EXPORT_SYMBOL_GPL(__x86_set_memory_region);
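
        /*
         * Hedged usage sketch (hypothetical caller; tss_base_gpa is a
         * placeholder, not an identifier from this file): a vendor module
         * installing an internal slot is expected to hold kvm->slots_lock
         * and to check the returned HVA with IS_ERR(), e.g.
         *
         *	void __user *hva;
         *
         *	mutex_lock(&kvm->slots_lock);
         *	hva = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT,
         *				      tss_base_gpa, 3 * PAGE_SIZE);
         *	mutex_unlock(&kvm->slots_lock);
         *	if (IS_ERR(hva))
         *		return PTR_ERR((void __force *)hva);
         *
         * Calling it again with size == 0 uninstalls the slot; the HVA
         * returned in that case is bogus and must not be dereferenced.
         */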
  11978
  11979void kvm_arch_pre_destroy_vm(struct kvm *kvm)
  11980{
  11981	kvm_mmu_pre_destroy_vm(kvm);
  11982}
  11983
  11984void kvm_arch_destroy_vm(struct kvm *kvm)
  11985{
  11986	if (current->mm == kvm->mm) {
  11987		/*
  11988		 * Free memory regions allocated on behalf of userspace,
  11989		 * unless the memory map has changed due to process exit
  11990		 * or fd copying.
  11991		 */
  11992		mutex_lock(&kvm->slots_lock);
  11993		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
  11994					0, 0);
  11995		__x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
  11996					0, 0);
  11997		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
  11998		mutex_unlock(&kvm->slots_lock);
  11999	}
  12000	kvm_unload_vcpu_mmus(kvm);
  12001	static_call_cond(kvm_x86_vm_destroy)(kvm);
  12002	kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
  12003	kvm_pic_destroy(kvm);
  12004	kvm_ioapic_destroy(kvm);
  12005	kvm_destroy_vcpus(kvm);
  12006	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
  12007	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
  12008	kvm_mmu_uninit_vm(kvm);
  12009	kvm_page_track_cleanup(kvm);
  12010	kvm_xen_destroy_vm(kvm);
  12011	kvm_hv_destroy_vm(kvm);
  12012}
  12013
  12014static void memslot_rmap_free(struct kvm_memory_slot *slot)
  12015{
  12016	int i;
  12017
  12018	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
  12019		kvfree(slot->arch.rmap[i]);
  12020		slot->arch.rmap[i] = NULL;
  12021	}
  12022}
  12023
  12024void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
  12025{
  12026	int i;
  12027
  12028	memslot_rmap_free(slot);
  12029
  12030	for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
  12031		kvfree(slot->arch.lpage_info[i - 1]);
  12032		slot->arch.lpage_info[i - 1] = NULL;
  12033	}
  12034
  12035	kvm_page_track_free_memslot(slot);
  12036}
  12037
  12038int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
  12039{
  12040	const int sz = sizeof(*slot->arch.rmap[0]);
  12041	int i;
  12042
  12043	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
  12044		int level = i + 1;
  12045		int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
  12046
  12047		if (slot->arch.rmap[i])
  12048			continue;
  12049
  12050		slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
  12051		if (!slot->arch.rmap[i]) {
  12052			memslot_rmap_free(slot);
  12053			return -ENOMEM;
  12054		}
  12055	}
  12056
  12057	return 0;
  12058}
  12059
  12060static int kvm_alloc_memslot_metadata(struct kvm *kvm,
  12061				      struct kvm_memory_slot *slot)
  12062{
  12063	unsigned long npages = slot->npages;
  12064	int i, r;
  12065
  12066	/*
  12067	 * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
  12068	 * old arrays will be freed by __kvm_set_memory_region() if installing
  12069	 * the new memslot is successful.
  12070	 */
  12071	memset(&slot->arch, 0, sizeof(slot->arch));
  12072
  12073	if (kvm_memslots_have_rmaps(kvm)) {
  12074		r = memslot_rmap_alloc(slot, npages);
  12075		if (r)
  12076			return r;
  12077	}
  12078
  12079	for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
  12080		struct kvm_lpage_info *linfo;
  12081		unsigned long ugfn;
  12082		int lpages;
  12083		int level = i + 1;
  12084
  12085		lpages = __kvm_mmu_slot_lpages(slot, npages, level);
  12086
  12087		linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
  12088		if (!linfo)
  12089			goto out_free;
  12090
  12091		slot->arch.lpage_info[i - 1] = linfo;
  12092
  12093		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
  12094			linfo[0].disallow_lpage = 1;
  12095		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
  12096			linfo[lpages - 1].disallow_lpage = 1;
  12097		ugfn = slot->userspace_addr >> PAGE_SHIFT;
  12098		/*
  12099		 * If the gfn and userspace address are not aligned wrt each
  12100		 * other, disable large page support for this slot.
  12101		 */
  12102		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
  12103			unsigned long j;
  12104
  12105			for (j = 0; j < lpages; ++j)
  12106				linfo[j].disallow_lpage = 1;
  12107		}
  12108	}
  12109
  12110	if (kvm_page_track_create_memslot(kvm, slot, npages))
  12111		goto out_free;
  12112
  12113	return 0;
  12114
  12115out_free:
  12116	memslot_rmap_free(slot);
  12117
  12118	for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
  12119		kvfree(slot->arch.lpage_info[i - 1]);
  12120		slot->arch.lpage_info[i - 1] = NULL;
  12121	}
  12122	return -ENOMEM;
  12123}
  12124
  12125void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
  12126{
  12127	struct kvm_vcpu *vcpu;
  12128	unsigned long i;
  12129
  12130	/*
  12131	 * memslots->generation has been incremented.
  12132	 * mmio generation may have reached its maximum value.
  12133	 */
  12134	kvm_mmu_invalidate_mmio_sptes(kvm, gen);
  12135
  12136	/* Force re-initialization of steal_time cache */
  12137	kvm_for_each_vcpu(i, vcpu, kvm)
  12138		kvm_vcpu_kick(vcpu);
  12139}
  12140
  12141int kvm_arch_prepare_memory_region(struct kvm *kvm,
  12142				   const struct kvm_memory_slot *old,
  12143				   struct kvm_memory_slot *new,
  12144				   enum kvm_mr_change change)
  12145{
  12146	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
  12147		if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
  12148			return -EINVAL;
  12149
  12150		return kvm_alloc_memslot_metadata(kvm, new);
  12151	}
  12152
  12153	if (change == KVM_MR_FLAGS_ONLY)
  12154		memcpy(&new->arch, &old->arch, sizeof(old->arch));
  12155	else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
  12156		return -EIO;
  12157
  12158	return 0;
  12159}
  12160
  12161
  12163{
  12164	struct kvm_arch *ka = &kvm->arch;
  12165
  12166	if (!kvm_x86_ops.cpu_dirty_log_size)
  12167		return;
  12168
  12169	if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
  12170	    (!enable && --ka->cpu_dirty_logging_count == 0))
  12171		kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
  12172
  12173	WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
  12174}
  12175
  12176static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
  12177				     struct kvm_memory_slot *old,
  12178				     const struct kvm_memory_slot *new,
  12179				     enum kvm_mr_change change)
  12180{
  12181	u32 old_flags = old ? old->flags : 0;
  12182	u32 new_flags = new ? new->flags : 0;
  12183	bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
  12184
  12185	/*
  12186	 * Update CPU dirty logging if dirty logging is being toggled.  This
  12187	 * applies to all operations.
  12188	 */
  12189	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
  12190		kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
  12191
  12192	/*
  12193	 * Nothing more to do for RO slots (which can't be dirtied and can't be
  12194	 * made writable) or CREATE/MOVE/DELETE of a slot.
  12195	 *
  12196	 * For a memslot with dirty logging disabled:
  12197	 * CREATE:      No dirty mappings will already exist.
  12198	 * MOVE/DELETE: The old mappings will already have been cleaned up by
  12199	 *		kvm_arch_flush_shadow_memslot()
  12200	 *
  12201	 * For a memslot with dirty logging enabled:
  12202	 * CREATE:      No shadow pages exist, thus nothing to write-protect
  12203	 *		and no dirty bits to clear.
  12204	 * MOVE/DELETE: The old mappings will already have been cleaned up by
  12205	 *		kvm_arch_flush_shadow_memslot().
  12206	 */
  12207	if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
  12208		return;
  12209
  12210	/*
  12211	 * READONLY and non-flags changes were filtered out above, and the only
  12212	 * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
  12213	 * logging isn't being toggled on or off.
  12214	 */
  12215	if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
  12216		return;
  12217
  12218	if (!log_dirty_pages) {
  12219		/*
  12220		 * Dirty logging tracks sptes in 4k granularity, meaning that
  12221		 * large sptes have to be split.  If live migration succeeds,
  12222		 * the guest in the source machine will be destroyed and large
  12223		 * sptes will be created in the destination.  However, if the
  12224		 * guest continues to run in the source machine (for example if
  12225		 * live migration fails), small sptes will remain around and
  12226		 * cause bad performance.
  12227		 *
  12228		 * Scan sptes if dirty logging has been stopped, dropping those
  12229		 * which can be collapsed into a single large-page spte.  Later
  12230		 * page faults will create the large-page sptes.
  12231		 */
  12232		kvm_mmu_zap_collapsible_sptes(kvm, new);
  12233	} else {
  12234		/*
   12235		 * Initially-all-set does not require write-protecting any page,
   12236		 * because all pages are assumed to be dirty.
  12237		 */
  12238		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
  12239			return;
  12240
  12241		if (READ_ONCE(eager_page_split))
  12242			kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
  12243
  12244		if (kvm_x86_ops.cpu_dirty_log_size) {
  12245			kvm_mmu_slot_leaf_clear_dirty(kvm, new);
  12246			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
  12247		} else {
  12248			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
  12249		}
  12250	}
  12251}
  12252
  12253void kvm_arch_commit_memory_region(struct kvm *kvm,
  12254				struct kvm_memory_slot *old,
  12255				const struct kvm_memory_slot *new,
  12256				enum kvm_mr_change change)
  12257{
  12258	if (!kvm->arch.n_requested_mmu_pages &&
  12259	    (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
  12260		unsigned long nr_mmu_pages;
  12261
  12262		nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
  12263		nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
  12264		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
  12265	}
  12266
  12267	kvm_mmu_slot_apply_flags(kvm, old, new, change);
  12268
  12269	/* Free the arrays associated with the old memslot. */
  12270	if (change == KVM_MR_MOVE)
  12271		kvm_arch_free_memslot(kvm, old);
  12272}
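
        /*
         * Worked example (the constant values are assumptions taken from
         * their definitions elsewhere in the tree): with
         * KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO == 50 and
         * KVM_MIN_ALLOC_MMU_PAGES == 64, a VM whose memslots cover 4 GiB
         * (1048576 pages) gets a default limit of 1048576 / 50 = 20971
         * shadow MMU pages, while a very small VM is floored at 64 pages.
         */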
  12273
  12274void kvm_arch_flush_shadow_all(struct kvm *kvm)
  12275{
  12276	kvm_mmu_zap_all(kvm);
  12277}
  12278
  12279void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
  12280				   struct kvm_memory_slot *slot)
  12281{
  12282	kvm_page_track_flush_slot(kvm, slot);
  12283}
  12284
  12285static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
  12286{
  12287	return (is_guest_mode(vcpu) &&
  12288		static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
  12289}
  12290
  12291static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
  12292{
  12293	if (!list_empty_careful(&vcpu->async_pf.done))
  12294		return true;
  12295
  12296	if (kvm_apic_has_events(vcpu) ||
  12297	    kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
  12298		return true;
  12299
  12300	if (vcpu->arch.pv.pv_unhalted)
  12301		return true;
  12302
  12303	if (vcpu->arch.exception.pending)
  12304		return true;
  12305
  12306	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
  12307	    (vcpu->arch.nmi_pending &&
  12308	     static_call(kvm_x86_nmi_allowed)(vcpu, false)))
  12309		return true;
  12310
  12311	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
  12312	    (vcpu->arch.smi_pending &&
  12313	     static_call(kvm_x86_smi_allowed)(vcpu, false)))
  12314		return true;
  12315
  12316	if (kvm_arch_interrupt_allowed(vcpu) &&
  12317	    (kvm_cpu_has_interrupt(vcpu) ||
  12318	    kvm_guest_apic_has_interrupt(vcpu)))
  12319		return true;
  12320
  12321	if (kvm_hv_has_stimer_pending(vcpu))
  12322		return true;
  12323
  12324	if (is_guest_mode(vcpu) &&
  12325	    kvm_x86_ops.nested_ops->hv_timer_pending &&
  12326	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
  12327		return true;
  12328
  12329	if (kvm_xen_has_pending_events(vcpu))
  12330		return true;
  12331
  12332	if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
  12333		return true;
  12334
  12335	return false;
  12336}
  12337
  12338int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
  12339{
  12340	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
  12341}
  12342
  12343bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
  12344{
  12345	if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
  12346		return true;
  12347
  12348	return false;
  12349}
  12350
  12351bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
  12352{
  12353	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
  12354		return true;
  12355
  12356	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
  12357		kvm_test_request(KVM_REQ_SMI, vcpu) ||
  12358		 kvm_test_request(KVM_REQ_EVENT, vcpu))
  12359		return true;
  12360
  12361	return kvm_arch_dy_has_pending_interrupt(vcpu);
  12362}
  12363
  12364bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
  12365{
  12366	if (vcpu->arch.guest_state_protected)
  12367		return true;
  12368
  12369	return vcpu->arch.preempted_in_kernel;
  12370}
  12371
  12372unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
  12373{
  12374	return kvm_rip_read(vcpu);
  12375}
  12376
  12377int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
  12378{
  12379	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
  12380}
  12381
  12382int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
  12383{
  12384	return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
  12385}
  12386
  12387unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
  12388{
  12389	/* Can't read the RIP when guest state is protected, just return 0 */
  12390	if (vcpu->arch.guest_state_protected)
  12391		return 0;
  12392
  12393	if (is_64_bit_mode(vcpu))
  12394		return kvm_rip_read(vcpu);
  12395	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
  12396		     kvm_rip_read(vcpu));
  12397}
  12398EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
  12399
  12400bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
  12401{
  12402	return kvm_get_linear_rip(vcpu) == linear_rip;
  12403}
  12404EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
  12405
  12406unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
  12407{
  12408	unsigned long rflags;
  12409
  12410	rflags = static_call(kvm_x86_get_rflags)(vcpu);
  12411	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
  12412		rflags &= ~X86_EFLAGS_TF;
  12413	return rflags;
  12414}
  12415EXPORT_SYMBOL_GPL(kvm_get_rflags);
  12416
  12417static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
  12418{
  12419	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
  12420	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
  12421		rflags |= X86_EFLAGS_TF;
  12422	static_call(kvm_x86_set_rflags)(vcpu, rflags);
  12423}
  12424
  12425void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
  12426{
  12427	__kvm_set_rflags(vcpu, rflags);
  12428	kvm_make_request(KVM_REQ_EVENT, vcpu);
  12429}
  12430EXPORT_SYMBOL_GPL(kvm_set_rflags);
  12431
  12432static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
  12433{
  12434	BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
  12435
  12436	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
  12437}
  12438
  12439static inline u32 kvm_async_pf_next_probe(u32 key)
  12440{
  12441	return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
  12442}
  12443
  12444static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
  12445{
  12446	u32 key = kvm_async_pf_hash_fn(gfn);
  12447
  12448	while (vcpu->arch.apf.gfns[key] != ~0)
  12449		key = kvm_async_pf_next_probe(key);
  12450
  12451	vcpu->arch.apf.gfns[key] = gfn;
  12452}
  12453
  12454static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
  12455{
  12456	int i;
  12457	u32 key = kvm_async_pf_hash_fn(gfn);
  12458
  12459	for (i = 0; i < ASYNC_PF_PER_VCPU &&
  12460		     (vcpu->arch.apf.gfns[key] != gfn &&
  12461		      vcpu->arch.apf.gfns[key] != ~0); i++)
  12462		key = kvm_async_pf_next_probe(key);
  12463
  12464	return key;
  12465}
  12466
  12467bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
  12468{
  12469	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
  12470}
  12471
  12472static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
  12473{
  12474	u32 i, j, k;
  12475
  12476	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
  12477
  12478	if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
  12479		return;
  12480
  12481	while (true) {
  12482		vcpu->arch.apf.gfns[i] = ~0;
  12483		do {
  12484			j = kvm_async_pf_next_probe(j);
  12485			if (vcpu->arch.apf.gfns[j] == ~0)
  12486				return;
  12487			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
  12488			/*
  12489			 * k lies cyclically in ]i,j]
  12490			 * |    i.k.j |
  12491			 * |....j i.k.| or  |.k..j i...|
  12492			 */
  12493		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
  12494		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
  12495		i = j;
  12496	}
  12497}
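
        /*
         * Worked example (hypothetical gfns A and B, both hashing to slot 2):
         * insertion places A in slot 2 and B, probing forward, in slot 3.
         * Deleting A leaves a hole at i = 2; the scan reaches j = 3, where
         * B's home slot k = 2 does not lie cyclically in ]i, j] = {3}, so B
         * is moved back into slot 2 and the hole shifts to slot 3.  The next
         * probe hits an empty slot and the walk stops, leaving B reachable
         * again from its home slot via kvm_async_pf_gfn_slot().
         */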
  12498
  12499static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
  12500{
  12501	u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
  12502
  12503	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
  12504				      sizeof(reason));
  12505}
  12506
  12507static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
  12508{
  12509	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
  12510
  12511	return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
  12512					     &token, offset, sizeof(token));
  12513}
  12514
  12515static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
  12516{
  12517	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
  12518	u32 val;
  12519
  12520	if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
  12521					 &val, offset, sizeof(val)))
  12522		return false;
  12523
  12524	return !val;
  12525}
  12526
  12527static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
  12528{
  12530	if (!kvm_pv_async_pf_enabled(vcpu))
  12531		return false;
  12532
  12533	if (vcpu->arch.apf.send_user_only &&
  12534	    static_call(kvm_x86_get_cpl)(vcpu) == 0)
  12535		return false;
  12536
  12537	if (is_guest_mode(vcpu)) {
  12538		/*
  12539		 * L1 needs to opt into the special #PF vmexits that are
  12540		 * used to deliver async page faults.
  12541		 */
  12542		return vcpu->arch.apf.delivery_as_pf_vmexit;
  12543	} else {
  12544		/*
  12545		 * Play it safe in case the guest temporarily disables paging.
  12546		 * The real mode IDT in particular is unlikely to have a #PF
   12547		 * exception handler set up.
  12548		 */
  12549		return is_paging(vcpu);
  12550	}
  12551}
  12552
  12553bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
  12554{
  12555	if (unlikely(!lapic_in_kernel(vcpu) ||
  12556		     kvm_event_needs_reinjection(vcpu) ||
  12557		     vcpu->arch.exception.pending))
  12558		return false;
  12559
  12560	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
  12561		return false;
  12562
  12563	/*
  12564	 * If interrupts are off we cannot even use an artificial
  12565	 * halt state.
  12566	 */
  12567	return kvm_arch_interrupt_allowed(vcpu);
  12568}
  12569
  12570bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
  12571				     struct kvm_async_pf *work)
  12572{
  12573	struct x86_exception fault;
  12574
  12575	trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
  12576	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
  12577
  12578	if (kvm_can_deliver_async_pf(vcpu) &&
  12579	    !apf_put_user_notpresent(vcpu)) {
  12580		fault.vector = PF_VECTOR;
  12581		fault.error_code_valid = true;
  12582		fault.error_code = 0;
  12583		fault.nested_page_fault = false;
  12584		fault.address = work->arch.token;
  12585		fault.async_page_fault = true;
  12586		kvm_inject_page_fault(vcpu, &fault);
  12587		return true;
  12588	} else {
  12589		/*
  12590		 * It is not possible to deliver a paravirtualized asynchronous
  12591		 * page fault, but putting the guest in an artificial halt state
  12592		 * can be beneficial nevertheless: if an interrupt arrives, we
   12593		 * can deliver it promptly and perhaps the guest will schedule
   12594		 * another process.  When the instruction that triggered the page
  12595		 * fault is retried, hopefully the page will be ready in the host.
  12596		 */
  12597		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
  12598		return false;
  12599	}
  12600}
  12601
  12602void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
  12603				 struct kvm_async_pf *work)
  12604{
  12605	struct kvm_lapic_irq irq = {
  12606		.delivery_mode = APIC_DM_FIXED,
  12607		.vector = vcpu->arch.apf.vec
  12608	};
  12609
  12610	if (work->wakeup_all)
  12611		work->arch.token = ~0; /* broadcast wakeup */
  12612	else
  12613		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
  12614	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
  12615
  12616	if ((work->wakeup_all || work->notpresent_injected) &&
  12617	    kvm_pv_async_pf_enabled(vcpu) &&
  12618	    !apf_put_user_ready(vcpu, work->arch.token)) {
  12619		vcpu->arch.apf.pageready_pending = true;
  12620		kvm_apic_set_irq(vcpu, &irq, NULL);
  12621	}
  12622
  12623	vcpu->arch.apf.halted = false;
  12624	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  12625}
  12626
  12627void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
  12628{
  12629	kvm_make_request(KVM_REQ_APF_READY, vcpu);
  12630	if (!vcpu->arch.apf.pageready_pending)
  12631		kvm_vcpu_kick(vcpu);
  12632}
  12633
  12634bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
  12635{
  12636	if (!kvm_pv_async_pf_enabled(vcpu))
  12637		return true;
  12638	else
  12639		return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
  12640}
  12641
  12642void kvm_arch_start_assignment(struct kvm *kvm)
  12643{
  12644	if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
  12645		static_call_cond(kvm_x86_pi_start_assignment)(kvm);
  12646}
  12647EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
  12648
  12649void kvm_arch_end_assignment(struct kvm *kvm)
  12650{
  12651	atomic_dec(&kvm->arch.assigned_device_count);
  12652}
  12653EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
  12654
  12655bool kvm_arch_has_assigned_device(struct kvm *kvm)
  12656{
  12657	return atomic_read(&kvm->arch.assigned_device_count);
  12658}
  12659EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
  12660
  12661void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
  12662{
  12663	atomic_inc(&kvm->arch.noncoherent_dma_count);
  12664}
  12665EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
  12666
  12667void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
  12668{
  12669	atomic_dec(&kvm->arch.noncoherent_dma_count);
  12670}
  12671EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
  12672
  12673bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
  12674{
  12675	return atomic_read(&kvm->arch.noncoherent_dma_count);
  12676}
  12677EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
  12678
  12679bool kvm_arch_has_irq_bypass(void)
  12680{
  12681	return true;
  12682}
  12683
  12684int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
  12685				      struct irq_bypass_producer *prod)
  12686{
  12687	struct kvm_kernel_irqfd *irqfd =
  12688		container_of(cons, struct kvm_kernel_irqfd, consumer);
  12689	int ret;
  12690
  12691	irqfd->producer = prod;
  12692	kvm_arch_start_assignment(irqfd->kvm);
  12693	ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
  12694					 prod->irq, irqfd->gsi, 1);
  12695
  12696	if (ret)
  12697		kvm_arch_end_assignment(irqfd->kvm);
  12698
  12699	return ret;
  12700}
  12701
  12702void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
  12703				      struct irq_bypass_producer *prod)
  12704{
  12705	int ret;
  12706	struct kvm_kernel_irqfd *irqfd =
  12707		container_of(cons, struct kvm_kernel_irqfd, consumer);
  12708
  12709	WARN_ON(irqfd->producer != prod);
  12710	irqfd->producer = NULL;
  12711
  12712	/*
   12713	 * When the producer of a consumer is unregistered, we change back
   12714	 * to remapped mode, so we can re-use the current implementation
   12715	 * when the irq is masked/disabled or the consumer side (KVM in
   12716	 * this case) doesn't want to receive the interrupts.
   12717	 */
  12718	ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
  12719	if (ret)
  12720		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
  12721		       " fails: %d\n", irqfd->consumer.token, ret);
  12722
  12723	kvm_arch_end_assignment(irqfd->kvm);
  12724}
  12725
  12726int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
  12727				   uint32_t guest_irq, bool set)
  12728{
  12729	return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
  12730}
  12731
  12732bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
  12733				  struct kvm_kernel_irq_routing_entry *new)
  12734{
  12735	if (new->type != KVM_IRQ_ROUTING_MSI)
  12736		return true;
  12737
  12738	return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
  12739}
  12740
  12741bool kvm_vector_hashing_enabled(void)
  12742{
  12743	return vector_hashing;
  12744}
  12745
  12746bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
  12747{
  12748	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
  12749}
  12750EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
  12751
  12753int kvm_spec_ctrl_test_value(u64 value)
  12754{
  12755	/*
   12756	 * Test that setting IA32_SPEC_CTRL to the given value
   12757	 * is allowed by the host processor.
  12758	 */
  12759
  12760	u64 saved_value;
  12761	unsigned long flags;
  12762	int ret = 0;
  12763
  12764	local_irq_save(flags);
  12765
  12766	if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
  12767		ret = 1;
  12768	else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
  12769		ret = 1;
  12770	else
  12771		wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
  12772
  12773	local_irq_restore(flags);
  12774
  12775	return ret;
  12776}
  12777EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
  12778
  12779void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
  12780{
  12781	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
  12782	struct x86_exception fault;
  12783	u64 access = error_code &
  12784		(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
  12785
  12786	if (!(error_code & PFERR_PRESENT_MASK) ||
  12787	    mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) {
  12788		/*
  12789		 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
  12790		 * tables probably do not match the TLB.  Just proceed
  12791		 * with the error code that the processor gave.
  12792		 */
  12793		fault.vector = PF_VECTOR;
  12794		fault.error_code_valid = true;
  12795		fault.error_code = error_code;
  12796		fault.nested_page_fault = false;
  12797		fault.address = gva;
  12798	}
  12799	vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
  12800}
  12801EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
  12802
  12803/*
  12804 * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
  12805 * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
  12806 * indicates whether exit to userspace is needed.
  12807 */
  12808int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
  12809			      struct x86_exception *e)
  12810{
  12811	if (r == X86EMUL_PROPAGATE_FAULT) {
  12812		kvm_inject_emulated_page_fault(vcpu, e);
  12813		return 1;
  12814	}
  12815
  12816	/*
  12817	 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
   12818	 * while handling a VMX instruction, KVM could have handled the request
   12819	 * correctly by exiting to userspace and performing I/O, but there
   12820	 * doesn't seem to be a real use-case behind such requests; just return
  12821	 * KVM_EXIT_INTERNAL_ERROR for now.
  12822	 */
  12823	kvm_prepare_emulation_failure_exit(vcpu);
  12824
  12825	return 0;
  12826}
  12827EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
  12828
  12829int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
  12830{
  12831	bool pcid_enabled;
  12832	struct x86_exception e;
  12833	struct {
  12834		u64 pcid;
  12835		u64 gla;
  12836	} operand;
  12837	int r;
  12838
  12839	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
  12840	if (r != X86EMUL_CONTINUE)
  12841		return kvm_handle_memory_failure(vcpu, r, &e);
  12842
  12843	if (operand.pcid >> 12 != 0) {
  12844		kvm_inject_gp(vcpu, 0);
  12845		return 1;
  12846	}
  12847
  12848	pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
  12849
  12850	switch (type) {
  12851	case INVPCID_TYPE_INDIV_ADDR:
  12852		if ((!pcid_enabled && (operand.pcid != 0)) ||
  12853		    is_noncanonical_address(operand.gla, vcpu)) {
  12854			kvm_inject_gp(vcpu, 0);
  12855			return 1;
  12856		}
  12857		kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
  12858		return kvm_skip_emulated_instruction(vcpu);
  12859
  12860	case INVPCID_TYPE_SINGLE_CTXT:
  12861		if (!pcid_enabled && (operand.pcid != 0)) {
  12862			kvm_inject_gp(vcpu, 0);
  12863			return 1;
  12864		}
  12865
  12866		kvm_invalidate_pcid(vcpu, operand.pcid);
  12867		return kvm_skip_emulated_instruction(vcpu);
  12868
  12869	case INVPCID_TYPE_ALL_NON_GLOBAL:
  12870		/*
  12871		 * Currently, KVM doesn't mark global entries in the shadow
  12872		 * page tables, so a non-global flush just degenerates to a
  12873		 * global flush. If needed, we could optimize this later by
  12874		 * keeping track of global entries in shadow page tables.
  12875		 */
  12876
  12877		fallthrough;
  12878	case INVPCID_TYPE_ALL_INCL_GLOBAL:
  12879		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
  12880		return kvm_skip_emulated_instruction(vcpu);
  12881
  12882	default:
  12883		kvm_inject_gp(vcpu, 0);
  12884		return 1;
  12885	}
  12886}
  12887EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
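
        /*
         * For reference, the 128-bit INVPCID descriptor that
         * kvm_handle_invpcid() reads into 'operand' above is laid out as
         * (a summary of the instruction's definition, not new behavior):
         *
         *	bits   11:0	PCID
         *	bits  63:12	reserved, must be zero (else #GP)
         *	bits 127:64	linear address
         *
         * which is why the code rejects operand.pcid >> 12 != 0 and passes
         * operand.gla only to the single-address invalidation case.
         */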
  12888
  12889static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
  12890{
  12891	struct kvm_run *run = vcpu->run;
  12892	struct kvm_mmio_fragment *frag;
  12893	unsigned int len;
  12894
  12895	BUG_ON(!vcpu->mmio_needed);
  12896
  12897	/* Complete previous fragment */
  12898	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
  12899	len = min(8u, frag->len);
  12900	if (!vcpu->mmio_is_write)
  12901		memcpy(frag->data, run->mmio.data, len);
  12902
  12903	if (frag->len <= 8) {
  12904		/* Switch to the next fragment. */
  12905		frag++;
  12906		vcpu->mmio_cur_fragment++;
  12907	} else {
  12908		/* Go forward to the next mmio piece. */
  12909		frag->data += len;
  12910		frag->gpa += len;
  12911		frag->len -= len;
  12912	}
  12913
  12914	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
  12915		vcpu->mmio_needed = 0;
  12916
   12917		/* VMG change, at this point, we're always done */
   12918		/* RIP has already been advanced */
  12919		return 1;
  12920	}
  12921
   12922	/* More MMIO is needed */
  12923	run->mmio.phys_addr = frag->gpa;
  12924	run->mmio.len = min(8u, frag->len);
  12925	run->mmio.is_write = vcpu->mmio_is_write;
  12926	if (run->mmio.is_write)
  12927		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
  12928	run->exit_reason = KVM_EXIT_MMIO;
  12929
  12930	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
  12931
  12932	return 0;
  12933}
  12934
  12935int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
  12936			  void *data)
  12937{
  12938	int handled;
  12939	struct kvm_mmio_fragment *frag;
  12940
  12941	if (!data)
  12942		return -EINVAL;
  12943
  12944	handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
  12945	if (handled == bytes)
  12946		return 1;
  12947
  12948	bytes -= handled;
  12949	gpa += handled;
  12950	data += handled;
  12951
   12952	/* TODO: Check if we need to increment the number of frags */
  12953	frag = vcpu->mmio_fragments;
  12954	vcpu->mmio_nr_fragments = 1;
  12955	frag->len = bytes;
  12956	frag->gpa = gpa;
  12957	frag->data = data;
  12958
  12959	vcpu->mmio_needed = 1;
  12960	vcpu->mmio_cur_fragment = 0;
  12961
  12962	vcpu->run->mmio.phys_addr = gpa;
  12963	vcpu->run->mmio.len = min(8u, frag->len);
  12964	vcpu->run->mmio.is_write = 1;
  12965	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
  12966	vcpu->run->exit_reason = KVM_EXIT_MMIO;
  12967
  12968	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
  12969
  12970	return 0;
  12971}
  12972EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
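
        /*
         * Worked example (a walkthrough of the code above, not new logic):
         * a 16-byte emulated MMIO write that is not claimed in-kernel is
         * exposed to userspace as two 8-byte KVM_EXIT_MMIO round trips.
         * The first exit carries bytes 0-7; on re-entry,
         * complete_sev_es_emulated_mmio() advances frag->data and frag->gpa
         * by 8, shrinks frag->len to 8 and queues the second exit for bytes
         * 8-15.  Completing that exit consumes the final fragment, so the
         * handler returns 1 and the vCPU resumes.
         */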
  12973
  12974int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
  12975			 void *data)
  12976{
  12977	int handled;
  12978	struct kvm_mmio_fragment *frag;
  12979
  12980	if (!data)
  12981		return -EINVAL;
  12982
  12983	handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
  12984	if (handled == bytes)
  12985		return 1;
  12986
  12987	bytes -= handled;
  12988	gpa += handled;
  12989	data += handled;
  12990
   12991	/* TODO: Check if we need to increment the number of frags */
  12992	frag = vcpu->mmio_fragments;
  12993	vcpu->mmio_nr_fragments = 1;
  12994	frag->len = bytes;
  12995	frag->gpa = gpa;
  12996	frag->data = data;
  12997
  12998	vcpu->mmio_needed = 1;
  12999	vcpu->mmio_cur_fragment = 0;
  13000
  13001	vcpu->run->mmio.phys_addr = gpa;
  13002	vcpu->run->mmio.len = min(8u, frag->len);
  13003	vcpu->run->mmio.is_write = 0;
  13004	vcpu->run->exit_reason = KVM_EXIT_MMIO;
  13005
  13006	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
  13007
  13008	return 0;
  13009}
  13010EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
  13011
  13012static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
  13013			   unsigned int port);
  13014
  13015static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
  13016{
  13017	int size = vcpu->arch.pio.size;
  13018	int port = vcpu->arch.pio.port;
  13019
  13020	vcpu->arch.pio.count = 0;
  13021	if (vcpu->arch.sev_pio_count)
  13022		return kvm_sev_es_outs(vcpu, size, port);
  13023	return 1;
  13024}
  13025
  13026static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
  13027			   unsigned int port)
  13028{
  13029	for (;;) {
  13030		unsigned int count =
  13031			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
  13032		int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
  13033
  13034		/* memcpy done already by emulator_pio_out.  */
  13035		vcpu->arch.sev_pio_count -= count;
  13036		vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
  13037		if (!ret)
  13038			break;
  13039
  13040		/* Emulation done by the kernel.  */
  13041		if (!vcpu->arch.sev_pio_count)
  13042			return 1;
  13043	}
  13044
  13045	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
  13046	return 0;
  13047}
  13048
  13049static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
  13050			  unsigned int port);
  13051
  13052static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
  13053{
  13054	unsigned count = vcpu->arch.pio.count;
  13055	complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
  13056	vcpu->arch.sev_pio_count -= count;
  13057	vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
  13058}
  13059
  13060static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
  13061{
  13062	int size = vcpu->arch.pio.size;
  13063	int port = vcpu->arch.pio.port;
  13064
  13065	advance_sev_es_emulated_ins(vcpu);
  13066	if (vcpu->arch.sev_pio_count)
  13067		return kvm_sev_es_ins(vcpu, size, port);
  13068	return 1;
  13069}
  13070
  13071static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
  13072			  unsigned int port)
  13073{
  13074	for (;;) {
  13075		unsigned int count =
  13076			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
  13077		if (!__emulator_pio_in(vcpu, size, port, count))
  13078			break;
  13079
  13080		/* Emulation done by the kernel.  */
  13081		advance_sev_es_emulated_ins(vcpu);
  13082		if (!vcpu->arch.sev_pio_count)
  13083			return 1;
  13084	}
  13085
  13086	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
  13087	return 0;
  13088}
  13089
  13090int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
  13091			 unsigned int port, void *data,  unsigned int count,
  13092			 int in)
  13093{
  13094	vcpu->arch.sev_pio_data = data;
  13095	vcpu->arch.sev_pio_count = count;
  13096	return in ? kvm_sev_es_ins(vcpu, size, port)
  13097		  : kvm_sev_es_outs(vcpu, size, port);
  13098}
  13099EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
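
        /*
         * Hedged usage sketch (hypothetical caller; scratch_buf and
         * scratch_len are placeholders, not identifiers from this tree):
         * the SEV-ES #VMGEXIT IOIO handler is expected to pass the guest's
         * shared scratch buffer and the element count, and the helpers
         * above then loop in chunks of at most PAGE_SIZE bytes, re-entering
         * through vcpu->arch.complete_userspace_io until sev_pio_count is
         * drained:
         *
         *	return kvm_sev_es_string_io(vcpu, size, port, scratch_buf,
         *				    scratch_len / size, in);
         */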
  13100
  13101EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
  13102EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
  13103EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
  13104EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
  13105EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
  13106EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
  13107EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
  13108EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
  13109EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
  13110EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
  13111EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
  13112EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
  13113EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
  13114EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
  13115EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
  13116EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
  13117EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
  13118EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
  13119EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
  13120EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
  13121EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
  13122EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
  13123EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
  13124EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
  13125EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
  13126EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
  13127EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
  13128EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
  13129EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_snp_psc);
  13130EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_sev_es_unmap_ghcb);
  13131
  13132static int __init kvm_x86_init(void)
  13133{
  13134	kvm_mmu_x86_module_init();
  13135	return 0;
  13136}
  13137module_init(kvm_x86_init);
  13138
  13139static void __exit kvm_x86_exit(void)
  13140{
  13141	/*
  13142	 * If module_init() is implemented, module_exit() must also be
  13143	 * implemented to allow module unload.
  13144	 */
  13145}
  13146module_exit(kvm_x86_exit);