cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

vmx.c (240665B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Kernel-based Virtual Machine driver for Linux
      4 *
      5 * This module enables machines with Intel VT-x extensions to run virtual
      6 * machines without emulation or binary translation.
      7 *
      8 * Copyright (C) 2006 Qumranet, Inc.
      9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
     10 *
     11 * Authors:
     12 *   Avi Kivity   <avi@qumranet.com>
     13 *   Yaniv Kamay  <yaniv@qumranet.com>
     14 */
     15
     16#include <linux/highmem.h>
     17#include <linux/hrtimer.h>
     18#include <linux/kernel.h>
     19#include <linux/kvm_host.h>
     20#include <linux/module.h>
     21#include <linux/moduleparam.h>
     22#include <linux/mod_devicetable.h>
     23#include <linux/mm.h>
     24#include <linux/objtool.h>
     25#include <linux/sched.h>
     26#include <linux/sched/smt.h>
     27#include <linux/slab.h>
     28#include <linux/tboot.h>
     29#include <linux/trace_events.h>
     30#include <linux/entry-kvm.h>
     31
     32#include <asm/apic.h>
     33#include <asm/asm.h>
     34#include <asm/cpu.h>
     35#include <asm/cpu_device_id.h>
     36#include <asm/debugreg.h>
     37#include <asm/desc.h>
     38#include <asm/fpu/api.h>
     39#include <asm/fpu/xstate.h>
     40#include <asm/idtentry.h>
     41#include <asm/io.h>
     42#include <asm/irq_remapping.h>
     43#include <asm/kexec.h>
     44#include <asm/perf_event.h>
     45#include <asm/mmu_context.h>
     46#include <asm/mshyperv.h>
     47#include <asm/mwait.h>
     48#include <asm/spec-ctrl.h>
     49#include <asm/virtext.h>
     50#include <asm/vmx.h>
     51
     52#include "capabilities.h"
     53#include "cpuid.h"
     54#include "evmcs.h"
     55#include "hyperv.h"
     56#include "kvm_onhyperv.h"
     57#include "irq.h"
     58#include "kvm_cache_regs.h"
     59#include "lapic.h"
     60#include "mmu.h"
     61#include "nested.h"
     62#include "pmu.h"
     63#include "sgx.h"
     64#include "trace.h"
     65#include "vmcs.h"
     66#include "vmcs12.h"
     67#include "vmx.h"
     68#include "x86.h"
     69
     70MODULE_AUTHOR("Qumranet");
     71MODULE_LICENSE("GPL");
     72
     73#ifdef MODULE
     74static const struct x86_cpu_id vmx_cpu_id[] = {
     75	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
     76	{}
     77};
     78MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
     79#endif
     80
     81bool __read_mostly enable_vpid = 1;
     82module_param_named(vpid, enable_vpid, bool, 0444);
     83
     84static bool __read_mostly enable_vnmi = 1;
     85module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
     86
     87bool __read_mostly flexpriority_enabled = 1;
     88module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
     89
     90bool __read_mostly enable_ept = 1;
     91module_param_named(ept, enable_ept, bool, S_IRUGO);
     92
     93bool __read_mostly enable_unrestricted_guest = 1;
     94module_param_named(unrestricted_guest,
     95			enable_unrestricted_guest, bool, S_IRUGO);
     96
     97bool __read_mostly enable_ept_ad_bits = 1;
     98module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
     99
    100static bool __read_mostly emulate_invalid_guest_state = true;
    101module_param(emulate_invalid_guest_state, bool, S_IRUGO);
    102
    103static bool __read_mostly fasteoi = 1;
    104module_param(fasteoi, bool, S_IRUGO);
    105
    106module_param(enable_apicv, bool, S_IRUGO);
    107
    108/*
    109 * If nested=1, nested virtualization is supported, i.e., guests may use
     110 * VMX and be a hypervisor for their own guests. If nested=0, guests may not
    111 * use VMX instructions.
    112 */
    113static bool __read_mostly nested = 1;
    114module_param(nested, bool, S_IRUGO);
    115
    116bool __read_mostly enable_pml = 1;
    117module_param_named(pml, enable_pml, bool, S_IRUGO);
    118
    119static bool __read_mostly dump_invalid_vmcs = 0;
    120module_param(dump_invalid_vmcs, bool, 0644);
    121
    122#define MSR_BITMAP_MODE_X2APIC		1
    123#define MSR_BITMAP_MODE_X2APIC_APICV	2
    124
    125#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
    126
    127/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
    128static int __read_mostly cpu_preemption_timer_multi;
    129static bool __read_mostly enable_preemption_timer = 1;
    130#ifdef CONFIG_X86_64
    131module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
    132#endif
    133
    134extern bool __read_mostly allow_smaller_maxphyaddr;
    135module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
    136
    137#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
    138#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
    139#define KVM_VM_CR0_ALWAYS_ON				\
    140	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
    141
    142#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
    143#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
    144#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
    145
    146#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
    147
    148#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
    149	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
    150	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
    151	RTIT_STATUS_BYTECNT))
    152
    153/*
    154 * List of MSRs that can be directly passed to the guest.
     155 * In addition to these, x2APIC and PT MSRs are handled specially.
    156 */
    157static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
    158	MSR_IA32_SPEC_CTRL,
    159	MSR_IA32_PRED_CMD,
    160	MSR_IA32_TSC,
    161#ifdef CONFIG_X86_64
    162	MSR_FS_BASE,
    163	MSR_GS_BASE,
    164	MSR_KERNEL_GS_BASE,
    165	MSR_IA32_XFD,
    166	MSR_IA32_XFD_ERR,
    167#endif
    168	MSR_IA32_SYSENTER_CS,
    169	MSR_IA32_SYSENTER_ESP,
    170	MSR_IA32_SYSENTER_EIP,
    171	MSR_CORE_C1_RES,
    172	MSR_CORE_C3_RESIDENCY,
    173	MSR_CORE_C6_RESIDENCY,
    174	MSR_CORE_C7_RESIDENCY,
    175};
    176
    177/*
     178 * These two parameters are used to configure the controls for Pause-Loop Exiting:
     179 * ple_gap:    upper bound on the amount of time between two successive
     180 *             executions of PAUSE in a loop. Also indicates whether PLE is enabled.
     181 *             According to tests, this time is usually smaller than 128 cycles.
     182 * ple_window: upper bound on the amount of time a guest is allowed to execute
     183 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
     184 *             less than 2^12 cycles.
     185 * Time is measured based on a counter that runs at the same rate as the TSC,
     186 * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
    187 */
    188static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
    189module_param(ple_gap, uint, 0444);
    190
    191static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
    192module_param(ple_window, uint, 0444);
    193
     194/* The default doubles the per-vcpu window on every PLE exit. */
    195static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
    196module_param(ple_window_grow, uint, 0444);
    197
     198/* The default resets the per-vcpu window to ple_window on every PLE exit. */
    199static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
    200module_param(ple_window_shrink, uint, 0444);
    201
    202/* Default is to compute the maximum so we can never overflow. */
    203static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
    204module_param(ple_window_max, uint, 0444);
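
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  The five ple_*
 * parameters above feed KVM's dynamic resizing of the PLE window.  The block
 * below (kept under #if 0 so it cannot affect a build) is a minimal userspace
 * model of that grow/shrink idea; the helper names and the exact clamping
 * rules are simplifying assumptions, not code copied from KVM.
 */
#if 0
#include <stdio.h>

static unsigned int sketch_grow(unsigned int val, unsigned int grow_factor,
				unsigned int max)
{
	unsigned long long ret = val;

	if (grow_factor)
		ret *= grow_factor;	/* e.g. double the window on a PLE exit */
	return ret > max ? max : (unsigned int)ret;
}

static unsigned int sketch_shrink(unsigned int val, unsigned int shrink_div,
				  unsigned int base)
{
	if (!shrink_div)
		return base;		/* 0 means "reset to ple_window" */
	val /= shrink_div;
	return val < base ? base : val;
}

int main(void)
{
	unsigned int window = 4096;	/* a plausible starting ple_window */

	window = sketch_grow(window, 2, 1U << 30);
	printf("grown window:  %u\n", window);
	window = sketch_shrink(window, 0, 4096);
	printf("shrunk window: %u\n", window);
	return 0;
}
#endif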
    205
     206/* Default is SYSTEM mode (0); set to 1 for host-guest mode. */
    207int __read_mostly pt_mode = PT_MODE_SYSTEM;
    208module_param(pt_mode, int, S_IRUGO);
    209
    210static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
    211static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
    212static DEFINE_MUTEX(vmx_l1d_flush_mutex);
    213
    214/* Storage for pre module init parameter parsing */
    215static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
    216
    217static const struct {
    218	const char *option;
    219	bool for_parse;
    220} vmentry_l1d_param[] = {
    221	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
    222	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
    223	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
    224	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
    225	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
    226	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
    227};
    228
    229#define L1D_CACHE_ORDER 4
    230static void *vmx_l1d_flush_pages;
    231
    232/* Control for disabling CPU Fill buffer clear */
    233static bool __read_mostly vmx_fb_clear_ctrl_available;
    234
    235static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
    236{
    237	struct page *page;
    238	unsigned int i;
    239
    240	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
    241		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
    242		return 0;
    243	}
    244
    245	if (!enable_ept) {
    246		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
    247		return 0;
    248	}
    249
    250	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
    251		u64 msr;
    252
    253		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
    254		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
    255			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
    256			return 0;
    257		}
    258	}
    259
     260	/* If set to auto, use the default L1TF mitigation method */
    261	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
    262		switch (l1tf_mitigation) {
    263		case L1TF_MITIGATION_OFF:
    264			l1tf = VMENTER_L1D_FLUSH_NEVER;
    265			break;
    266		case L1TF_MITIGATION_FLUSH_NOWARN:
    267		case L1TF_MITIGATION_FLUSH:
    268		case L1TF_MITIGATION_FLUSH_NOSMT:
    269			l1tf = VMENTER_L1D_FLUSH_COND;
    270			break;
    271		case L1TF_MITIGATION_FULL:
    272		case L1TF_MITIGATION_FULL_FORCE:
    273			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
    274			break;
    275		}
    276	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
    277		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
    278	}
    279
    280	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
    281	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
    282		/*
    283		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
    284		 * lifetime and so should not be charged to a memcg.
    285		 */
    286		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
    287		if (!page)
    288			return -ENOMEM;
    289		vmx_l1d_flush_pages = page_address(page);
    290
    291		/*
    292		 * Initialize each page with a different pattern in
    293		 * order to protect against KSM in the nested
    294		 * virtualization case.
    295		 */
    296		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
    297			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
    298			       PAGE_SIZE);
    299		}
    300	}
    301
    302	l1tf_vmx_mitigation = l1tf;
    303
    304	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
    305		static_branch_enable(&vmx_l1d_should_flush);
    306	else
    307		static_branch_disable(&vmx_l1d_should_flush);
    308
    309	if (l1tf == VMENTER_L1D_FLUSH_COND)
    310		static_branch_enable(&vmx_l1d_flush_cond);
    311	else
    312		static_branch_disable(&vmx_l1d_flush_cond);
    313	return 0;
    314}
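
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  The pages
 * allocated above back the software L1D flush fallback used when the CPU
 * lacks X86_FEATURE_FLUSH_L1D (i.e. no MSR_IA32_FLUSH_CMD): the flush path
 * elsewhere in this file walks these pages in cache-line strides so every
 * L1D line gets evicted.  The #if 0 block below is a plain-C userspace model
 * of that fallback only; buffer size and stride are assumptions for the
 * sketch, and the real flush is done in hand-written asm.
 */
#if 0
#include <stdlib.h>

#define SKETCH_FLUSH_SIZE	(64 * 1024)	/* 16 pages, L1D_CACHE_ORDER 4 */
#define SKETCH_CACHE_LINE	64

static void sketch_software_l1d_flush(volatile unsigned char *buf)
{
	size_t i;

	/* Write, then read back, one cache line at a time. */
	for (i = 0; i < SKETCH_FLUSH_SIZE; i += SKETCH_CACHE_LINE)
		buf[i] = (unsigned char)i;
	for (i = 0; i < SKETCH_FLUSH_SIZE; i += SKETCH_CACHE_LINE)
		(void)buf[i];
}

int main(void)
{
	unsigned char *buf = malloc(SKETCH_FLUSH_SIZE);

	if (!buf)
		return 1;
	sketch_software_l1d_flush(buf);
	free(buf);
	return 0;
}
#endif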
    315
    316static int vmentry_l1d_flush_parse(const char *s)
    317{
    318	unsigned int i;
    319
    320	if (s) {
    321		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
    322			if (vmentry_l1d_param[i].for_parse &&
    323			    sysfs_streq(s, vmentry_l1d_param[i].option))
    324				return i;
    325		}
    326	}
    327	return -EINVAL;
    328}
    329
    330static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
    331{
    332	int l1tf, ret;
    333
    334	l1tf = vmentry_l1d_flush_parse(s);
    335	if (l1tf < 0)
    336		return l1tf;
    337
    338	if (!boot_cpu_has(X86_BUG_L1TF))
    339		return 0;
    340
    341	/*
    342	 * Has vmx_init() run already? If not then this is the pre init
    343	 * parameter parsing. In that case just store the value and let
    344	 * vmx_init() do the proper setup after enable_ept has been
    345	 * established.
    346	 */
    347	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
    348		vmentry_l1d_flush_param = l1tf;
    349		return 0;
    350	}
    351
    352	mutex_lock(&vmx_l1d_flush_mutex);
    353	ret = vmx_setup_l1d_flush(l1tf);
    354	mutex_unlock(&vmx_l1d_flush_mutex);
    355	return ret;
    356}
    357
    358static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
    359{
    360	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
    361		return sprintf(s, "???\n");
    362
    363	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
    364}
    365
    366static void vmx_setup_fb_clear_ctrl(void)
    367{
    368	u64 msr;
    369
    370	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
    371	    !boot_cpu_has_bug(X86_BUG_MDS) &&
    372	    !boot_cpu_has_bug(X86_BUG_TAA)) {
    373		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
    374		if (msr & ARCH_CAP_FB_CLEAR_CTRL)
    375			vmx_fb_clear_ctrl_available = true;
    376	}
    377}
    378
    379static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
    380{
    381	u64 msr;
    382
    383	if (!vmx->disable_fb_clear)
    384		return;
    385
    386	rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
    387	msr |= FB_CLEAR_DIS;
    388	wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
    389	/* Cache the MSR value to avoid reading it later */
    390	vmx->msr_ia32_mcu_opt_ctrl = msr;
    391}
    392
    393static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
    394{
    395	if (!vmx->disable_fb_clear)
    396		return;
    397
    398	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
    399	wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
    400}
    401
    402static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
    403{
    404	vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
    405
    406	/*
     407	 * If the guest will not execute VERW, there is no need to set FB_CLEAR_DIS
     408	 * at VM-Entry. Skip the MSR read/write when the guest has no reason to
     409	 * execute VERW.
    410	 */
    411	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
    412	   ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
    413	    (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
    414	    (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
    415	    (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
    416	    (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
    417		vmx->disable_fb_clear = false;
    418}
    419
    420static const struct kernel_param_ops vmentry_l1d_flush_ops = {
    421	.set = vmentry_l1d_flush_set,
    422	.get = vmentry_l1d_flush_get,
    423};
    424module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
    425
    426static u32 vmx_segment_access_rights(struct kvm_segment *var);
    427
    428void vmx_vmexit(void);
    429
    430#define vmx_insn_failed(fmt...)		\
    431do {					\
    432	WARN_ONCE(1, fmt);		\
    433	pr_warn_ratelimited(fmt);	\
    434} while (0)
    435
    436asmlinkage void vmread_error(unsigned long field, bool fault)
    437{
    438	if (fault)
    439		kvm_spurious_fault();
    440	else
    441		vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
    442}
    443
    444noinline void vmwrite_error(unsigned long field, unsigned long value)
    445{
    446	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
    447			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
    448}
    449
    450noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
    451{
    452	vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
    453}
    454
    455noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
    456{
    457	vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
    458}
    459
    460noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
    461{
    462	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
    463			ext, vpid, gva);
    464}
    465
    466noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
    467{
    468	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
    469			ext, eptp, gpa);
    470}
    471
    472static DEFINE_PER_CPU(struct vmcs *, vmxarea);
    473DEFINE_PER_CPU(struct vmcs *, current_vmcs);
    474/*
     475 * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
    476 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
    477 */
    478static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
    479
    480static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
    481static DEFINE_SPINLOCK(vmx_vpid_lock);
    482
    483struct vmcs_config vmcs_config;
    484struct vmx_capability vmx_capability;
    485
    486#define VMX_SEGMENT_FIELD(seg)					\
    487	[VCPU_SREG_##seg] = {                                   \
    488		.selector = GUEST_##seg##_SELECTOR,		\
    489		.base = GUEST_##seg##_BASE,		   	\
    490		.limit = GUEST_##seg##_LIMIT,		   	\
    491		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
    492	}
    493
    494static const struct kvm_vmx_segment_field {
    495	unsigned selector;
    496	unsigned base;
    497	unsigned limit;
    498	unsigned ar_bytes;
    499} kvm_vmx_segment_fields[] = {
    500	VMX_SEGMENT_FIELD(CS),
    501	VMX_SEGMENT_FIELD(DS),
    502	VMX_SEGMENT_FIELD(ES),
    503	VMX_SEGMENT_FIELD(FS),
    504	VMX_SEGMENT_FIELD(GS),
    505	VMX_SEGMENT_FIELD(SS),
    506	VMX_SEGMENT_FIELD(TR),
    507	VMX_SEGMENT_FIELD(LDTR),
    508};
    509
    510static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
    511{
    512	vmx->segment_cache.bitmask = 0;
    513}
    514
    515static unsigned long host_idt_base;
    516
    517#if IS_ENABLED(CONFIG_HYPERV)
    518static bool __read_mostly enlightened_vmcs = true;
    519module_param(enlightened_vmcs, bool, 0444);
    520
    521static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
    522{
    523	struct hv_enlightened_vmcs *evmcs;
    524	struct hv_partition_assist_pg **p_hv_pa_pg =
    525			&to_kvm_hv(vcpu->kvm)->hv_pa_pg;
    526	/*
     527	 * Synthetic VM-Exit is not enabled in the current code, so all eVMCSs
     528	 * in a single VM share the same assist page.
    529	 */
    530	if (!*p_hv_pa_pg)
    531		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
    532
    533	if (!*p_hv_pa_pg)
    534		return -ENOMEM;
    535
    536	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
    537
    538	evmcs->partition_assist_page =
    539		__pa(*p_hv_pa_pg);
    540	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
    541	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
    542
    543	return 0;
    544}
    545
    546#endif /* IS_ENABLED(CONFIG_HYPERV) */
    547
    548/*
     549 * Comment format: document - errata name - stepping - processor name.
     550 * Taken from
    551 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
    552 */
    553static u32 vmx_preemption_cpu_tfms[] = {
    554/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
    5550x000206E6,
    556/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
    557/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
    558/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
    5590x00020652,
    560/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
    5610x00020655,
    562/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
    563/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
    564/*
    565 * 320767.pdf - AAP86  - B1 -
    566 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
    567 */
    5680x000106E5,
    569/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
    5700x000106A0,
    571/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
    5720x000106A1,
    573/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
    5740x000106A4,
    575 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
    576 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
    577 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
    5780x000106A5,
    579 /* Xeon E3-1220 V2 */
    5800x000306A8,
    581};
    582
    583static inline bool cpu_has_broken_vmx_preemption_timer(void)
    584{
    585	u32 eax = cpuid_eax(0x00000001), i;
    586
    587	/* Clear the reserved bits */
    588	eax &= ~(0x3U << 14 | 0xfU << 28);
    589	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
    590		if (eax == vmx_preemption_cpu_tfms[i])
    591			return true;
    592
    593	return false;
    594}
    595
    596static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
    597{
    598	return flexpriority_enabled && lapic_in_kernel(vcpu);
    599}
    600
    601static int possible_passthrough_msr_slot(u32 msr)
    602{
    603	u32 i;
    604
    605	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
    606		if (vmx_possible_passthrough_msrs[i] == msr)
    607			return i;
    608
    609	return -ENOENT;
    610}
    611
    612static bool is_valid_passthrough_msr(u32 msr)
    613{
    614	bool r;
    615
    616	switch (msr) {
    617	case 0x800 ... 0x8ff:
    618		/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
    619		return true;
    620	case MSR_IA32_RTIT_STATUS:
    621	case MSR_IA32_RTIT_OUTPUT_BASE:
    622	case MSR_IA32_RTIT_OUTPUT_MASK:
    623	case MSR_IA32_RTIT_CR3_MATCH:
    624	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
    625		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
    626	case MSR_LBR_SELECT:
    627	case MSR_LBR_TOS:
    628	case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
    629	case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
    630	case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
    631	case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
    632	case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
    633		/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
    634		return true;
    635	}
    636
    637	r = possible_passthrough_msr_slot(msr) != -ENOENT;
    638
    639	WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
    640
    641	return r;
    642}
    643
    644struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
    645{
    646	int i;
    647
    648	i = kvm_find_user_return_msr(msr);
    649	if (i >= 0)
    650		return &vmx->guest_uret_msrs[i];
    651	return NULL;
    652}
    653
    654static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
    655				  struct vmx_uret_msr *msr, u64 data)
    656{
    657	unsigned int slot = msr - vmx->guest_uret_msrs;
    658	int ret = 0;
    659
    660	if (msr->load_into_hardware) {
    661		preempt_disable();
    662		ret = kvm_set_user_return_msr(slot, data, msr->mask);
    663		preempt_enable();
    664	}
    665	if (!ret)
    666		msr->data = data;
    667	return ret;
    668}
    669
    670#ifdef CONFIG_KEXEC_CORE
    671static void crash_vmclear_local_loaded_vmcss(void)
    672{
    673	int cpu = raw_smp_processor_id();
    674	struct loaded_vmcs *v;
    675
    676	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
    677			    loaded_vmcss_on_cpu_link)
    678		vmcs_clear(v->vmcs);
    679}
    680#endif /* CONFIG_KEXEC_CORE */
    681
    682static void __loaded_vmcs_clear(void *arg)
    683{
    684	struct loaded_vmcs *loaded_vmcs = arg;
    685	int cpu = raw_smp_processor_id();
    686
    687	if (loaded_vmcs->cpu != cpu)
    688		return; /* vcpu migration can race with cpu offline */
    689	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
    690		per_cpu(current_vmcs, cpu) = NULL;
    691
    692	vmcs_clear(loaded_vmcs->vmcs);
    693	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
    694		vmcs_clear(loaded_vmcs->shadow_vmcs);
    695
    696	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
    697
    698	/*
    699	 * Ensure all writes to loaded_vmcs, including deleting it from its
    700	 * current percpu list, complete before setting loaded_vmcs->cpu to
    701	 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
    702	 * and add loaded_vmcs to its percpu list before it's deleted from this
    703	 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
    704	 */
    705	smp_wmb();
    706
    707	loaded_vmcs->cpu = -1;
    708	loaded_vmcs->launched = 0;
    709}
    710
    711void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
    712{
    713	int cpu = loaded_vmcs->cpu;
    714
    715	if (cpu != -1)
    716		smp_call_function_single(cpu,
    717			 __loaded_vmcs_clear, loaded_vmcs, 1);
    718}
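
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  The smp_wmb() in
 * __loaded_vmcs_clear() above pairs with an smp_rmb() in
 * vmx_vcpu_load_vmcs(): the old CPU must unlink the VMCS before advertising
 * cpu == -1, and the new CPU must observe cpu == -1 before relinking it.  The
 * #if 0 block below models that publish/consume ordering in userspace with
 * C11 fences; the data structure is an invented stand-in, not KVM's.
 */
#if 0
#include <stdatomic.h>
#include <stdio.h>

struct sketch_vmcs {
	int linked;		/* stand-in for the percpu list linkage */
	_Atomic int cpu;	/* -1 means "not loaded on any CPU" */
};

/* Old CPU: unlink first, then advertise cpu == -1. */
static void sketch_clear(struct sketch_vmcs *v)
{
	v->linked = 0;					/* list_del() */
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	atomic_store_explicit(&v->cpu, -1, memory_order_relaxed);
}

/* New CPU: only relink after seeing cpu == -1. */
static int sketch_load(struct sketch_vmcs *v, int new_cpu)
{
	if (atomic_load_explicit(&v->cpu, memory_order_relaxed) != -1)
		return -1;				/* still being cleared */
	atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
	v->linked = 1;					/* list_add() is now safe */
	atomic_store_explicit(&v->cpu, new_cpu, memory_order_relaxed);
	return 0;
}

int main(void)
{
	struct sketch_vmcs v = { .linked = 1, .cpu = 3 };

	sketch_clear(&v);
	printf("load on cpu 5: %d\n", sketch_load(&v, 5));
	return 0;
}
#endif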
    719
    720static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
    721				       unsigned field)
    722{
    723	bool ret;
    724	u32 mask = 1 << (seg * SEG_FIELD_NR + field);
    725
    726	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
    727		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
    728		vmx->segment_cache.bitmask = 0;
    729	}
    730	ret = vmx->segment_cache.bitmask & mask;
    731	vmx->segment_cache.bitmask |= mask;
    732	return ret;
    733}
    734
    735static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
    736{
    737	u16 *p = &vmx->segment_cache.seg[seg].selector;
    738
    739	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
    740		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
    741	return *p;
    742}
    743
    744static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
    745{
    746	ulong *p = &vmx->segment_cache.seg[seg].base;
    747
    748	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
    749		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
    750	return *p;
    751}
    752
    753static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
    754{
    755	u32 *p = &vmx->segment_cache.seg[seg].limit;
    756
    757	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
    758		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
    759	return *p;
    760}
    761
    762static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
    763{
    764	u32 *p = &vmx->segment_cache.seg[seg].ar;
    765
    766	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
    767		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
    768	return *p;
    769}
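
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  The four
 * vmx_read_guest_seg_*() helpers above cache VMREAD results: one validity
 * bit per (segment, field) pair, at bit position seg * SEG_FIELD_NR + field,
 * so the VMCS is only read on the first access after the cache is cleared.
 * The #if 0 block below is a userspace model; the constants and the fake
 * vmread are assumptions made for the sketch.
 */
#if 0
#include <stdio.h>

#define SKETCH_NR_SEGS		8	/* CS ... LDTR */
#define SKETCH_SEG_FIELD_NR	4	/* selector, base, limit, ar */

struct sketch_seg_cache {
	unsigned int bitmask;
	unsigned int val[SKETCH_NR_SEGS][SKETCH_SEG_FIELD_NR];
};

static unsigned int fake_vmread(unsigned int seg, unsigned int field)
{
	printf("vmread seg=%u field=%u\n", seg, field);	/* the "expensive" path */
	return seg * 100 + field;
}

static unsigned int cached_read(struct sketch_seg_cache *c,
				unsigned int seg, unsigned int field)
{
	unsigned int bit = 1u << (seg * SKETCH_SEG_FIELD_NR + field);

	if (!(c->bitmask & bit)) {		/* miss: fill and mark valid */
		c->val[seg][field] = fake_vmread(seg, field);
		c->bitmask |= bit;
	}
	return c->val[seg][field];
}

int main(void)
{
	struct sketch_seg_cache c = { 0 };

	cached_read(&c, 1, 2);	/* first access: fake vmread fires */
	cached_read(&c, 1, 2);	/* served from the cache */
	c.bitmask = 0;		/* vmx_segment_cache_clear() equivalent */
	cached_read(&c, 1, 2);	/* vmread fires again */
	return 0;
}
#endif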
    770
    771void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
    772{
    773	u32 eb;
    774
    775	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
    776	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
    777	/*
    778	 * Guest access to VMware backdoor ports could legitimately
    779	 * trigger #GP because of TSS I/O permission bitmap.
    780	 * We intercept those #GP and allow access to them anyway
    781	 * as VMware does.
    782	 */
    783	if (enable_vmware_backdoor)
    784		eb |= (1u << GP_VECTOR);
    785	if ((vcpu->guest_debug &
    786	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
    787	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
    788		eb |= 1u << BP_VECTOR;
    789	if (to_vmx(vcpu)->rmode.vm86_active)
    790		eb = ~0;
    791	if (!vmx_need_pf_intercept(vcpu))
    792		eb &= ~(1u << PF_VECTOR);
    793
    794	/* When we are running a nested L2 guest and L1 specified for it a
    795	 * certain exception bitmap, we must trap the same exceptions and pass
    796	 * them to L1. When running L2, we will only handle the exceptions
    797	 * specified above if L1 did not want them.
    798	 */
    799	if (is_guest_mode(vcpu))
    800		eb |= get_vmcs12(vcpu)->exception_bitmap;
     801	else {
    802		int mask = 0, match = 0;
    803
    804		if (enable_ept && (eb & (1u << PF_VECTOR))) {
    805			/*
    806			 * If EPT is enabled, #PF is currently only intercepted
    807			 * if MAXPHYADDR is smaller on the guest than on the
    808			 * host.  In that case we only care about present,
    809			 * non-reserved faults.  For vmcs02, however, PFEC_MASK
    810			 * and PFEC_MATCH are set in prepare_vmcs02_rare.
    811			 */
    812			mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
    813			match = PFERR_PRESENT_MASK;
    814		}
    815		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
    816		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
    817	}
    818
    819	/*
    820	 * Disabling xfd interception indicates that dynamic xfeatures
    821	 * might be used in the guest. Always trap #NM in this case
     822	 * to save the guest's xfd_err in a timely manner.
    823	 */
    824	if (vcpu->arch.xfd_no_write_intercept)
    825		eb |= (1u << NM_VECTOR);
    826
    827	vmcs_write32(EXCEPTION_BITMAP, eb);
    828}
    829
    830/*
     831 * Check if the MSR is intercepted for the currently loaded MSR bitmap.
    832 */
    833static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
    834{
    835	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
    836		return true;
    837
    838	return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
    839					 MSR_IA32_SPEC_CTRL);
    840}
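
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  The bitmap that
 * msr_write_intercepted() consults above is the 4 KiB VMX MSR bitmap, which
 * the SDM lays out as four 1024-byte bitmaps: read-low, read-high, write-low
 * and write-high, where "low" covers MSRs 0x0-0x1fff and "high" covers
 * 0xc0000000-0xc0001fff.  The #if 0 block below is a userspace model of a
 * write-intercept lookup; the helper name is invented for the sketch.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static bool sketch_test_write_intercept(const uint8_t *bitmap, uint32_t msr)
{
	uint32_t offset;

	if (msr <= 0x1fff)
		offset = 2048 + msr / 8;		/* write-low bitmap */
	else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		offset = 3072 + (msr & 0x1fff) / 8;	/* write-high bitmap */
	else
		return true;				/* not covered: always intercepted */

	return bitmap[offset] & (1u << (msr & 7));
}

int main(void)
{
	uint8_t bitmap[4096];

	memset(bitmap, 0xff, sizeof(bitmap));		/* intercept everything */
	bitmap[2048 + 0x48 / 8] &= ~(1u << (0x48 & 7));	/* pass through 0x48 (SPEC_CTRL) */

	printf("0x48 intercepted: %d\n", sketch_test_write_intercept(bitmap, 0x48));
	printf("0x49 intercepted: %d\n", sketch_test_write_intercept(bitmap, 0x49));
	return 0;
}
#endif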
    841
    842static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
    843		unsigned long entry, unsigned long exit)
    844{
    845	vm_entry_controls_clearbit(vmx, entry);
    846	vm_exit_controls_clearbit(vmx, exit);
    847}
    848
    849int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
    850{
    851	unsigned int i;
    852
    853	for (i = 0; i < m->nr; ++i) {
    854		if (m->val[i].index == msr)
    855			return i;
    856	}
    857	return -ENOENT;
    858}
    859
    860static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
    861{
    862	int i;
    863	struct msr_autoload *m = &vmx->msr_autoload;
    864
    865	switch (msr) {
    866	case MSR_EFER:
    867		if (cpu_has_load_ia32_efer()) {
    868			clear_atomic_switch_msr_special(vmx,
    869					VM_ENTRY_LOAD_IA32_EFER,
    870					VM_EXIT_LOAD_IA32_EFER);
    871			return;
    872		}
    873		break;
    874	case MSR_CORE_PERF_GLOBAL_CTRL:
    875		if (cpu_has_load_perf_global_ctrl()) {
    876			clear_atomic_switch_msr_special(vmx,
    877					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
    878					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
    879			return;
    880		}
    881		break;
    882	}
    883	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
    884	if (i < 0)
    885		goto skip_guest;
    886	--m->guest.nr;
    887	m->guest.val[i] = m->guest.val[m->guest.nr];
    888	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
    889
    890skip_guest:
    891	i = vmx_find_loadstore_msr_slot(&m->host, msr);
    892	if (i < 0)
    893		return;
    894
    895	--m->host.nr;
    896	m->host.val[i] = m->host.val[m->host.nr];
    897	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
    898}
    899
    900static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
    901		unsigned long entry, unsigned long exit,
    902		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
    903		u64 guest_val, u64 host_val)
    904{
    905	vmcs_write64(guest_val_vmcs, guest_val);
    906	if (host_val_vmcs != HOST_IA32_EFER)
    907		vmcs_write64(host_val_vmcs, host_val);
    908	vm_entry_controls_setbit(vmx, entry);
    909	vm_exit_controls_setbit(vmx, exit);
    910}
    911
    912static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
    913				  u64 guest_val, u64 host_val, bool entry_only)
    914{
    915	int i, j = 0;
    916	struct msr_autoload *m = &vmx->msr_autoload;
    917
    918	switch (msr) {
    919	case MSR_EFER:
    920		if (cpu_has_load_ia32_efer()) {
    921			add_atomic_switch_msr_special(vmx,
    922					VM_ENTRY_LOAD_IA32_EFER,
    923					VM_EXIT_LOAD_IA32_EFER,
    924					GUEST_IA32_EFER,
    925					HOST_IA32_EFER,
    926					guest_val, host_val);
    927			return;
    928		}
    929		break;
    930	case MSR_CORE_PERF_GLOBAL_CTRL:
    931		if (cpu_has_load_perf_global_ctrl()) {
    932			add_atomic_switch_msr_special(vmx,
    933					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
    934					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
    935					GUEST_IA32_PERF_GLOBAL_CTRL,
    936					HOST_IA32_PERF_GLOBAL_CTRL,
    937					guest_val, host_val);
    938			return;
    939		}
    940		break;
    941	case MSR_IA32_PEBS_ENABLE:
    942		/* PEBS needs a quiescent period after being disabled (to write
    943		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
     944		 * provide that period, so a CPU could write the host's record into
     945		 * the guest's memory.
    946		 */
    947		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
    948	}
    949
    950	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
    951	if (!entry_only)
    952		j = vmx_find_loadstore_msr_slot(&m->host, msr);
    953
    954	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
    955	    (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
    956		printk_once(KERN_WARNING "Not enough msr switch entries. "
    957				"Can't add msr %x\n", msr);
    958		return;
    959	}
    960	if (i < 0) {
    961		i = m->guest.nr++;
    962		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
    963	}
    964	m->guest.val[i].index = msr;
    965	m->guest.val[i].value = guest_val;
    966
    967	if (entry_only)
    968		return;
    969
    970	if (j < 0) {
    971		j = m->host.nr++;
    972		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
    973	}
    974	m->host.val[j].index = msr;
    975	m->host.val[j].value = host_val;
    976}
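
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  The two functions
 * above maintain the VM-entry/VM-exit MSR autoload lists: packed arrays of
 * {index, value} entries plus a count that is mirrored into the VMCS, with
 * removal done by moving the last entry into the freed slot.  The #if 0 block
 * below is a userspace model; sizes and types are simplified assumptions.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SKETCH_MAX_ENTRIES 8

struct sketch_msr_entry { uint32_t index; uint64_t value; };
struct sketch_msr_list  { unsigned int nr; struct sketch_msr_entry val[SKETCH_MAX_ENTRIES]; };

static int sketch_find(const struct sketch_msr_list *m, uint32_t msr)
{
	for (unsigned int i = 0; i < m->nr; i++)
		if (m->val[i].index == msr)
			return (int)i;
	return -1;
}

static int sketch_add(struct sketch_msr_list *m, uint32_t msr, uint64_t value)
{
	int i = sketch_find(m, msr);

	if (i < 0) {
		if (m->nr == SKETCH_MAX_ENTRIES)
			return -1;	/* "Not enough msr switch entries" */
		i = (int)m->nr++;	/* the new count would also be written to the VMCS */
	}
	m->val[i].index = msr;
	m->val[i].value = value;
	return 0;
}

static void sketch_clear(struct sketch_msr_list *m, uint32_t msr)
{
	int i = sketch_find(m, msr);

	if (i < 0)
		return;
	m->val[i] = m->val[--m->nr];	/* swap-with-last keeps the array packed */
}

int main(void)
{
	struct sketch_msr_list guest = { 0 };

	sketch_add(&guest, 0xc0000080 /* EFER */, 0x500);
	sketch_add(&guest, 0x3f1 /* PEBS_ENABLE */, 0);
	sketch_clear(&guest, 0xc0000080);
	printf("entries: %u, first index: %#x\n", guest.nr,
	       (unsigned int)guest.val[0].index);
	return 0;
}
#endif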
    977
    978static bool update_transition_efer(struct vcpu_vmx *vmx)
    979{
    980	u64 guest_efer = vmx->vcpu.arch.efer;
    981	u64 ignore_bits = 0;
    982	int i;
    983
    984	/* Shadow paging assumes NX to be available.  */
    985	if (!enable_ept)
    986		guest_efer |= EFER_NX;
    987
    988	/*
    989	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
    990	 */
    991	ignore_bits |= EFER_SCE;
    992#ifdef CONFIG_X86_64
    993	ignore_bits |= EFER_LMA | EFER_LME;
    994	/* SCE is meaningful only in long mode on Intel */
    995	if (guest_efer & EFER_LMA)
    996		ignore_bits &= ~(u64)EFER_SCE;
    997#endif
    998
    999	/*
   1000	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
   1001	 * On CPUs that support "load IA32_EFER", always switch EFER
   1002	 * atomically, since it's faster than switching it manually.
   1003	 */
   1004	if (cpu_has_load_ia32_efer() ||
   1005	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
   1006		if (!(guest_efer & EFER_LMA))
   1007			guest_efer &= ~EFER_LME;
   1008		if (guest_efer != host_efer)
   1009			add_atomic_switch_msr(vmx, MSR_EFER,
   1010					      guest_efer, host_efer, false);
   1011		else
   1012			clear_atomic_switch_msr(vmx, MSR_EFER);
   1013		return false;
   1014	}
   1015
   1016	i = kvm_find_user_return_msr(MSR_EFER);
   1017	if (i < 0)
   1018		return false;
   1019
   1020	clear_atomic_switch_msr(vmx, MSR_EFER);
   1021
   1022	guest_efer &= ~ignore_bits;
   1023	guest_efer |= host_efer & ignore_bits;
   1024
   1025	vmx->guest_uret_msrs[i].data = guest_efer;
   1026	vmx->guest_uret_msrs[i].mask = ~ignore_bits;
   1027
   1028	return true;
   1029}
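
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  The tail of
 * update_transition_efer() above merges EFER bits: bits in ignore_bits keep
 * the host's value, everything else comes from the guest, so the user-return
 * MSR machinery only reloads EFER when a guest-controlled bit differs.  The
 * #if 0 block below shows the bit-merge idiom with made-up values.
 */
#if 0
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t merge_ignored_bits(uint64_t guest, uint64_t host,
				   uint64_t ignore_bits)
{
	/* Take "ignored" bits from the host, all other bits from the guest. */
	return (guest & ~ignore_bits) | (host & ignore_bits);
}

int main(void)
{
	uint64_t guest = 0xd00, host = 0x501, ignore = 0x1;	/* e.g. EFER.SCE */

	printf("%#" PRIx64 "\n", merge_ignored_bits(guest, host, ignore));	/* 0xd01 */
	return 0;
}
#endif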
   1030
   1031#ifdef CONFIG_X86_32
   1032/*
   1033 * On 32-bit kernels, VM exits still load the FS and GS bases from the
   1034 * VMCS rather than the segment table.  KVM uses this helper to figure
   1035 * out the current bases to poke them into the VMCS before entry.
   1036 */
   1037static unsigned long segment_base(u16 selector)
   1038{
   1039	struct desc_struct *table;
   1040	unsigned long v;
   1041
   1042	if (!(selector & ~SEGMENT_RPL_MASK))
   1043		return 0;
   1044
   1045	table = get_current_gdt_ro();
   1046
   1047	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
   1048		u16 ldt_selector = kvm_read_ldt();
   1049
   1050		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
   1051			return 0;
   1052
   1053		table = (struct desc_struct *)segment_base(ldt_selector);
   1054	}
   1055	v = get_desc_base(&table[selector >> 3]);
   1056	return v;
   1057}
   1058#endif
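
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  segment_base()
 * above relies on the x86 selector layout: RPL in bits 1:0, the table
 * indicator (0 = GDT, 1 = LDT) in bit 2, and the descriptor index in bits
 * 15:3, hence the SEGMENT_RPL_MASK/SEGMENT_TI_MASK tests and the
 * "selector >> 3" indexing.  The #if 0 block below just decodes a selector.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct sketch_selector { unsigned int rpl, ti, index; };

static struct sketch_selector sketch_decode(uint16_t sel)
{
	return (struct sketch_selector){
		.rpl   = sel & 0x3,
		.ti    = (sel >> 2) & 0x1,
		.index = sel >> 3,
	};
}

int main(void)
{
	struct sketch_selector s = sketch_decode(0x2b);	/* a typical user selector */

	printf("index=%u ti=%u rpl=%u\n", s.index, s.ti, s.rpl);
	return 0;
}
#endif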
   1059
   1060static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
   1061{
   1062	return vmx_pt_mode_is_host_guest() &&
   1063	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
   1064}
   1065
   1066static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
   1067{
   1068	/* The base must be 128-byte aligned and a legal physical address. */
   1069	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
   1070}
   1071
   1072static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
   1073{
   1074	u32 i;
   1075
   1076	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
   1077	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
   1078	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
   1079	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
   1080	for (i = 0; i < addr_range; i++) {
   1081		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
   1082		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
   1083	}
   1084}
   1085
   1086static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
   1087{
   1088	u32 i;
   1089
   1090	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
   1091	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
   1092	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
   1093	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
   1094	for (i = 0; i < addr_range; i++) {
   1095		rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
   1096		rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
   1097	}
   1098}
   1099
   1100static void pt_guest_enter(struct vcpu_vmx *vmx)
   1101{
   1102	if (vmx_pt_mode_is_system())
   1103		return;
   1104
   1105	/*
   1106	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
   1107	 * Save host state before VM entry.
   1108	 */
   1109	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
   1110	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
   1111		wrmsrl(MSR_IA32_RTIT_CTL, 0);
   1112		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
   1113		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
   1114	}
   1115}
   1116
   1117static void pt_guest_exit(struct vcpu_vmx *vmx)
   1118{
   1119	if (vmx_pt_mode_is_system())
   1120		return;
   1121
   1122	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
   1123		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
   1124		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
   1125	}
   1126
   1127	/*
   1128	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
   1129	 * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
   1130	 */
   1131	if (vmx->pt_desc.host.ctl)
   1132		wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
   1133}
   1134
   1135void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
   1136			unsigned long fs_base, unsigned long gs_base)
   1137{
   1138	if (unlikely(fs_sel != host->fs_sel)) {
   1139		if (!(fs_sel & 7))
   1140			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
   1141		else
   1142			vmcs_write16(HOST_FS_SELECTOR, 0);
   1143		host->fs_sel = fs_sel;
   1144	}
   1145	if (unlikely(gs_sel != host->gs_sel)) {
   1146		if (!(gs_sel & 7))
   1147			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
   1148		else
   1149			vmcs_write16(HOST_GS_SELECTOR, 0);
   1150		host->gs_sel = gs_sel;
   1151	}
   1152	if (unlikely(fs_base != host->fs_base)) {
   1153		vmcs_writel(HOST_FS_BASE, fs_base);
   1154		host->fs_base = fs_base;
   1155	}
   1156	if (unlikely(gs_base != host->gs_base)) {
   1157		vmcs_writel(HOST_GS_BASE, gs_base);
   1158		host->gs_base = gs_base;
   1159	}
   1160}
   1161
   1162void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
   1163{
   1164	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1165	struct vmcs_host_state *host_state;
   1166#ifdef CONFIG_X86_64
   1167	int cpu = raw_smp_processor_id();
   1168#endif
   1169	unsigned long fs_base, gs_base;
   1170	u16 fs_sel, gs_sel;
   1171	int i;
   1172
   1173	vmx->req_immediate_exit = false;
   1174
   1175	/*
   1176	 * Note that guest MSRs to be saved/restored can also be changed
    1177	 * when guest state is loaded. This happens when the guest transitions
    1178	 * to/from long mode by setting MSR_EFER.LMA.
   1179	 */
   1180	if (!vmx->guest_uret_msrs_loaded) {
   1181		vmx->guest_uret_msrs_loaded = true;
   1182		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
   1183			if (!vmx->guest_uret_msrs[i].load_into_hardware)
   1184				continue;
   1185
   1186			kvm_set_user_return_msr(i,
   1187						vmx->guest_uret_msrs[i].data,
   1188						vmx->guest_uret_msrs[i].mask);
   1189		}
   1190	}
   1191
    1192	if (vmx->nested.need_vmcs12_to_shadow_sync)
   1193		nested_sync_vmcs12_to_shadow(vcpu);
   1194
   1195	if (vmx->guest_state_loaded)
   1196		return;
   1197
   1198	host_state = &vmx->loaded_vmcs->host_state;
   1199
   1200	/*
   1201	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
   1202	 * allow segment selectors with cpl > 0 or ti == 1.
   1203	 */
   1204	host_state->ldt_sel = kvm_read_ldt();
   1205
   1206#ifdef CONFIG_X86_64
   1207	savesegment(ds, host_state->ds_sel);
   1208	savesegment(es, host_state->es_sel);
   1209
   1210	gs_base = cpu_kernelmode_gs_base(cpu);
   1211	if (likely(is_64bit_mm(current->mm))) {
   1212		current_save_fsgs();
   1213		fs_sel = current->thread.fsindex;
   1214		gs_sel = current->thread.gsindex;
   1215		fs_base = current->thread.fsbase;
   1216		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
   1217	} else {
   1218		savesegment(fs, fs_sel);
   1219		savesegment(gs, gs_sel);
   1220		fs_base = read_msr(MSR_FS_BASE);
   1221		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
   1222	}
   1223
   1224	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
   1225#else
   1226	savesegment(fs, fs_sel);
   1227	savesegment(gs, gs_sel);
   1228	fs_base = segment_base(fs_sel);
   1229	gs_base = segment_base(gs_sel);
   1230#endif
   1231
   1232	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
   1233	vmx->guest_state_loaded = true;
   1234}
   1235
   1236static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
   1237{
   1238	struct vmcs_host_state *host_state;
   1239
   1240	if (!vmx->guest_state_loaded)
   1241		return;
   1242
   1243	host_state = &vmx->loaded_vmcs->host_state;
   1244
   1245	++vmx->vcpu.stat.host_state_reload;
   1246
   1247#ifdef CONFIG_X86_64
   1248	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
   1249#endif
   1250	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
   1251		kvm_load_ldt(host_state->ldt_sel);
   1252#ifdef CONFIG_X86_64
   1253		load_gs_index(host_state->gs_sel);
   1254#else
   1255		loadsegment(gs, host_state->gs_sel);
   1256#endif
   1257	}
   1258	if (host_state->fs_sel & 7)
   1259		loadsegment(fs, host_state->fs_sel);
   1260#ifdef CONFIG_X86_64
   1261	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
   1262		loadsegment(ds, host_state->ds_sel);
   1263		loadsegment(es, host_state->es_sel);
   1264	}
   1265#endif
   1266	invalidate_tss_limit();
   1267#ifdef CONFIG_X86_64
   1268	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
   1269#endif
   1270	load_fixmap_gdt(raw_smp_processor_id());
   1271	vmx->guest_state_loaded = false;
   1272	vmx->guest_uret_msrs_loaded = false;
   1273}
   1274
   1275#ifdef CONFIG_X86_64
   1276static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
   1277{
   1278	preempt_disable();
   1279	if (vmx->guest_state_loaded)
   1280		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
   1281	preempt_enable();
   1282	return vmx->msr_guest_kernel_gs_base;
   1283}
   1284
   1285static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
   1286{
   1287	preempt_disable();
   1288	if (vmx->guest_state_loaded)
   1289		wrmsrl(MSR_KERNEL_GS_BASE, data);
   1290	preempt_enable();
   1291	vmx->msr_guest_kernel_gs_base = data;
   1292}
   1293#endif
   1294
   1295void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
   1296			struct loaded_vmcs *buddy)
   1297{
   1298	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1299	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
   1300	struct vmcs *prev;
   1301
   1302	if (!already_loaded) {
   1303		loaded_vmcs_clear(vmx->loaded_vmcs);
   1304		local_irq_disable();
   1305
   1306		/*
   1307		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
   1308		 * this cpu's percpu list, otherwise it may not yet be deleted
   1309		 * from its previous cpu's percpu list.  Pairs with the
    1310		 * smp_wmb() in __loaded_vmcs_clear().
   1311		 */
   1312		smp_rmb();
   1313
   1314		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
   1315			 &per_cpu(loaded_vmcss_on_cpu, cpu));
   1316		local_irq_enable();
   1317	}
   1318
   1319	prev = per_cpu(current_vmcs, cpu);
   1320	if (prev != vmx->loaded_vmcs->vmcs) {
   1321		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
   1322		vmcs_load(vmx->loaded_vmcs->vmcs);
   1323
   1324		/*
   1325		 * No indirect branch prediction barrier needed when switching
   1326		 * the active VMCS within a guest, e.g. on nested VM-Enter.
   1327		 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
   1328		 */
   1329		if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
   1330			indirect_branch_prediction_barrier();
   1331	}
   1332
   1333	if (!already_loaded) {
   1334		void *gdt = get_current_gdt_ro();
   1335
   1336		/*
   1337		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
   1338		 * TLB entries from its previous association with the vCPU.
   1339		 */
   1340		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   1341
   1342		/*
   1343		 * Linux uses per-cpu TSS and GDT, so set these when switching
   1344		 * processors.  See 22.2.4.
   1345		 */
   1346		vmcs_writel(HOST_TR_BASE,
   1347			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
   1348		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
   1349
   1350		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
   1351			/* 22.2.3 */
   1352			vmcs_writel(HOST_IA32_SYSENTER_ESP,
   1353				    (unsigned long)(cpu_entry_stack(cpu) + 1));
   1354		}
   1355
   1356		vmx->loaded_vmcs->cpu = cpu;
   1357	}
   1358}
   1359
   1360/*
   1361 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
   1362 * vcpu mutex is already taken.
   1363 */
   1364static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
   1365{
   1366	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1367
   1368	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
   1369
   1370	vmx_vcpu_pi_load(vcpu, cpu);
   1371
   1372	vmx->host_debugctlmsr = get_debugctlmsr();
   1373}
   1374
   1375static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
   1376{
   1377	vmx_vcpu_pi_put(vcpu);
   1378
   1379	vmx_prepare_switch_to_host(to_vmx(vcpu));
   1380}
   1381
   1382bool vmx_emulation_required(struct kvm_vcpu *vcpu)
   1383{
   1384	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
   1385}
   1386
   1387unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
   1388{
   1389	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1390	unsigned long rflags, save_rflags;
   1391
   1392	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
   1393		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
   1394		rflags = vmcs_readl(GUEST_RFLAGS);
   1395		if (vmx->rmode.vm86_active) {
   1396			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
   1397			save_rflags = vmx->rmode.save_rflags;
   1398			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
   1399		}
   1400		vmx->rflags = rflags;
   1401	}
   1402	return vmx->rflags;
   1403}
   1404
   1405void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
   1406{
   1407	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1408	unsigned long old_rflags;
   1409
   1410	if (is_unrestricted_guest(vcpu)) {
   1411		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
   1412		vmx->rflags = rflags;
   1413		vmcs_writel(GUEST_RFLAGS, rflags);
   1414		return;
   1415	}
   1416
   1417	old_rflags = vmx_get_rflags(vcpu);
   1418	vmx->rflags = rflags;
   1419	if (vmx->rmode.vm86_active) {
   1420		vmx->rmode.save_rflags = rflags;
   1421		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
   1422	}
   1423	vmcs_writel(GUEST_RFLAGS, rflags);
   1424
   1425	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
   1426		vmx->emulation_required = vmx_emulation_required(vcpu);
   1427}
   1428
   1429static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
   1430{
   1431	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
   1432}
   1433
   1434u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
   1435{
   1436	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
   1437	int ret = 0;
   1438
   1439	if (interruptibility & GUEST_INTR_STATE_STI)
   1440		ret |= KVM_X86_SHADOW_INT_STI;
   1441	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
   1442		ret |= KVM_X86_SHADOW_INT_MOV_SS;
   1443
   1444	return ret;
   1445}
   1446
   1447void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
   1448{
   1449	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
   1450	u32 interruptibility = interruptibility_old;
   1451
   1452	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
   1453
   1454	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
   1455		interruptibility |= GUEST_INTR_STATE_MOV_SS;
   1456	else if (mask & KVM_X86_SHADOW_INT_STI)
   1457		interruptibility |= GUEST_INTR_STATE_STI;
   1458
   1459	if ((interruptibility != interruptibility_old))
   1460		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
   1461}
   1462
   1463static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
   1464{
   1465	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1466	unsigned long value;
   1467
   1468	/*
   1469	 * Any MSR write that attempts to change bits marked reserved will
    1470	 * cause a #GP fault.
   1471	 */
   1472	if (data & vmx->pt_desc.ctl_bitmask)
   1473		return 1;
   1474
   1475	/*
   1476	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
   1477	 * result in a #GP unless the same write also clears TraceEn.
   1478	 */
   1479	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
   1480		((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
   1481		return 1;
   1482
   1483	/*
    1484	 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA and
    1485	 * FabricEn would cause a #GP if
    1486	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
   1487	 */
   1488	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
   1489		!(data & RTIT_CTL_FABRIC_EN) &&
   1490		!intel_pt_validate_cap(vmx->pt_desc.caps,
   1491					PT_CAP_single_range_output))
   1492		return 1;
   1493
   1494	/*
    1495	 * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
    1496	 * utilizes an encoding marked reserved will cause a #GP fault.
   1497	 */
   1498	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
   1499	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
   1500			!test_bit((data & RTIT_CTL_MTC_RANGE) >>
   1501			RTIT_CTL_MTC_RANGE_OFFSET, &value))
   1502		return 1;
   1503	value = intel_pt_validate_cap(vmx->pt_desc.caps,
   1504						PT_CAP_cycle_thresholds);
   1505	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
   1506			!test_bit((data & RTIT_CTL_CYC_THRESH) >>
   1507			RTIT_CTL_CYC_THRESH_OFFSET, &value))
   1508		return 1;
   1509	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
   1510	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
   1511			!test_bit((data & RTIT_CTL_PSB_FREQ) >>
   1512			RTIT_CTL_PSB_FREQ_OFFSET, &value))
   1513		return 1;
   1514
   1515	/*
    1516	 * A #GP fault is raised if ADDRx_CFG is reserved or its
    1517	 * encoding is greater than 2.
   1518	 */
   1519	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
   1520	if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
   1521		return 1;
   1522	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
   1523	if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
   1524		return 1;
   1525	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
   1526	if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
   1527		return 1;
   1528	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
   1529	if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
   1530		return 1;
   1531
   1532	return 0;
   1533}
   1534
   1535static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
   1536					void *insn, int insn_len)
   1537{
   1538	/*
   1539	 * Emulation of instructions in SGX enclaves is impossible as RIP does
   1540	 * not point at the failing instruction, and even if it did, the code
   1541	 * stream is inaccessible.  Inject #UD instead of exiting to userspace
   1542	 * so that guest userspace can't DoS the guest simply by triggering
   1543	 * emulation (enclaves are CPL3 only).
   1544	 */
   1545	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
   1546		kvm_queue_exception(vcpu, UD_VECTOR);
   1547		return false;
   1548	}
   1549	return true;
   1550}
   1551
   1552static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
   1553{
   1554	union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
   1555	unsigned long rip, orig_rip;
   1556	u32 instr_len;
   1557
   1558	/*
   1559	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
   1560	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
   1561	 * set when EPT misconfig occurs.  In practice, real hardware updates
   1562	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
   1563	 * (namely Hyper-V) don't set it due to it being undefined behavior,
   1564	 * i.e. we end up advancing IP with some random value.
   1565	 */
   1566	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
   1567	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
   1568		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
   1569
   1570		/*
   1571		 * Emulating an enclave's instructions isn't supported as KVM
   1572		 * cannot access the enclave's memory or its true RIP, e.g. the
   1573		 * vmcs.GUEST_RIP points at the exit point of the enclave, not
   1574		 * the RIP that actually triggered the VM-Exit.  But, because
   1575		 * most instructions that cause VM-Exit will #UD in an enclave,
   1576		 * most instruction-based VM-Exits simply do not occur.
   1577		 *
   1578		 * There are a few exceptions, notably the debug instructions
   1579		 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
   1580		 * and generate #DB/#BP as expected, which KVM might intercept.
   1581		 * But again, the CPU does the dirty work and saves an instr
   1582		 * length of zero so VMMs don't shoot themselves in the foot.
   1583		 * WARN if KVM tries to skip a non-zero length instruction on
   1584		 * a VM-Exit from an enclave.
   1585		 */
   1586		if (!instr_len)
   1587			goto rip_updated;
   1588
   1589		WARN(exit_reason.enclave_mode,
   1590		     "KVM: skipping instruction after SGX enclave VM-Exit");
   1591
   1592		orig_rip = kvm_rip_read(vcpu);
   1593		rip = orig_rip + instr_len;
   1594#ifdef CONFIG_X86_64
   1595		/*
   1596		 * We need to mask out the high 32 bits of RIP if not in 64-bit
   1597		 * mode, but just finding out that we are in 64-bit mode is
   1598		 * quite expensive.  Only do it if there was a carry.
   1599		 */
   1600		if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
   1601			rip = (u32)rip;
   1602#endif
   1603		kvm_rip_write(vcpu, rip);
   1604	} else {
   1605		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
   1606			return 0;
   1607	}
   1608
   1609rip_updated:
   1610	/* skipping an emulated instruction also counts */
   1611	vmx_set_interrupt_shadow(vcpu, 0);
   1612
   1613	return 1;
   1614}
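
/*
 * [Editor's note: illustrative sketch, not part of vmx.c.]  The carry check
 * in skip_emulated_instruction() above avoids the expensive is_64_bit_mode()
 * call unless the RIP addition carried out of bit 31, which is exactly when
 * bits 31 and 32 of (rip ^ orig_rip) are both set with nothing above them.
 * The #if 0 block below demonstrates the trick in userspace.
 */
#if 0
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t sketch_advance_rip(uint64_t orig_rip, uint32_t instr_len,
				   int is_64_bit_mode)
{
	uint64_t rip = orig_rip + instr_len;

	/* Only truncate when a carry crossed the 32-bit boundary. */
	if (((rip ^ orig_rip) >> 31) == 3 && !is_64_bit_mode)
		rip = (uint32_t)rip;
	return rip;
}

int main(void)
{
	printf("%#" PRIx64 "\n", sketch_advance_rip(0xfffffffeULL, 3, 0));	/* 0x1 */
	printf("%#" PRIx64 "\n", sketch_advance_rip(0xfffffffeULL, 3, 1));	/* 0x100000001 */
	return 0;
}
#endif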
   1615
   1616/*
   1617 * Recognizes a pending MTF VM-exit and records the nested state for later
   1618 * delivery.
   1619 */
   1620static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
   1621{
   1622	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   1623	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1624
   1625	if (!is_guest_mode(vcpu))
   1626		return;
   1627
   1628	/*
   1629	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
   1630	 * T-bit traps. As instruction emulation is completed (i.e. at the
   1631	 * instruction boundary), any #DB exception pending delivery must be a
   1632	 * debug-trap. Record the pending MTF state to be delivered in
   1633	 * vmx_check_nested_events().
   1634	 */
   1635	if (nested_cpu_has_mtf(vmcs12) &&
   1636	    (!vcpu->arch.exception.pending ||
   1637	     vcpu->arch.exception.nr == DB_VECTOR))
   1638		vmx->nested.mtf_pending = true;
   1639	else
   1640		vmx->nested.mtf_pending = false;
   1641}
   1642
   1643static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
   1644{
   1645	vmx_update_emulated_instruction(vcpu);
   1646	return skip_emulated_instruction(vcpu);
   1647}
   1648
   1649static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
   1650{
   1651	/*
   1652	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
   1653	 * explicitly skip the instruction because if the HLT state is set,
   1654	 * then the instruction is already executing and RIP has already been
   1655	 * advanced.
   1656	 */
   1657	if (kvm_hlt_in_guest(vcpu->kvm) &&
   1658			vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
   1659		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
   1660}
   1661
   1662static void vmx_queue_exception(struct kvm_vcpu *vcpu)
   1663{
   1664	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1665	unsigned nr = vcpu->arch.exception.nr;
   1666	bool has_error_code = vcpu->arch.exception.has_error_code;
   1667	u32 error_code = vcpu->arch.exception.error_code;
   1668	u32 intr_info = nr | INTR_INFO_VALID_MASK;
   1669
   1670	kvm_deliver_exception_payload(vcpu);
   1671
   1672	if (has_error_code) {
   1673		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
   1674		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
   1675	}
   1676
   1677	if (vmx->rmode.vm86_active) {
   1678		int inc_eip = 0;
   1679		if (kvm_exception_is_soft(nr))
   1680			inc_eip = vcpu->arch.event_exit_inst_len;
   1681		kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
   1682		return;
   1683	}
   1684
   1685	WARN_ON_ONCE(vmx->emulation_required);
   1686
   1687	if (kvm_exception_is_soft(nr)) {
   1688		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
   1689			     vmx->vcpu.arch.event_exit_inst_len);
   1690		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
   1691	} else
   1692		intr_info |= INTR_TYPE_HARD_EXCEPTION;
   1693
   1694	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
   1695
   1696	vmx_clear_hlt(vcpu);
   1697}
   1698
   1699static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
   1700			       bool load_into_hardware)
   1701{
   1702	struct vmx_uret_msr *uret_msr;
   1703
   1704	uret_msr = vmx_find_uret_msr(vmx, msr);
   1705	if (!uret_msr)
   1706		return;
   1707
   1708	uret_msr->load_into_hardware = load_into_hardware;
   1709}
   1710
   1711/*
   1712 * Configuring user return MSRs to automatically save, load, and restore MSRs
   1713 * that need to be shoved into hardware when running the guest.  Note, omitting
   1714 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
   1715 * loaded into hardware when running the guest.
   1716 */
   1717static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
   1718{
   1719#ifdef CONFIG_X86_64
   1720	bool load_syscall_msrs;
   1721
   1722	/*
   1723	 * The SYSCALL MSRs are only needed on long mode guests, and only
   1724	 * when EFER.SCE is set.
   1725	 */
   1726	load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
   1727			    (vmx->vcpu.arch.efer & EFER_SCE);
   1728
   1729	vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
   1730	vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
   1731	vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
   1732#endif
   1733	vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
   1734
   1735	vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
   1736			   guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
   1737			   guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
   1738
   1739	/*
   1740	 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
   1741	 * kernel and old userspace.  If those guests run on a tsx=off host, do
   1742	 * allow guests to use TSX_CTRL, but don't change the value in hardware
   1743	 * so that TSX remains always disabled.
   1744	 */
   1745	vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
   1746
   1747	/*
   1748	 * The set of MSRs to load may have changed, reload MSRs before the
   1749	 * next VM-Enter.
   1750	 */
   1751	vmx->guest_uret_msrs_loaded = false;
   1752}
   1753
   1754u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
   1755{
   1756	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   1757
   1758	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
   1759		return vmcs12->tsc_offset;
   1760
   1761	return 0;
   1762}
   1763
   1764u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
   1765{
   1766	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   1767
   1768	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
   1769	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
   1770		return vmcs12->tsc_multiplier;
   1771
   1772	return kvm_default_tsc_scaling_ratio;
   1773}
   1774
   1775static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
   1776{
   1777	vmcs_write64(TSC_OFFSET, offset);
   1778}
   1779
   1780static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
   1781{
   1782	vmcs_write64(TSC_MULTIPLIER, multiplier);
   1783}
   1784
   1785/*
   1786 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
   1787 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
   1788 * all guests if the "nested" module option is off, and can also be disabled
   1789 * for a single guest by disabling its VMX cpuid bit.
   1790 */
   1791bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
   1792{
   1793	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
   1794}
   1795
   1796static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
   1797						 uint64_t val)
   1798{
   1799	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
   1800
   1801	return !(val & ~valid_bits);
   1802}
   1803
   1804static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
   1805{
   1806	switch (msr->index) {
   1807	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
   1808		if (!nested)
   1809			return 1;
   1810		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
   1811	case MSR_IA32_PERF_CAPABILITIES:
   1812		msr->data = vmx_get_perf_capabilities();
   1813		return 0;
   1814	default:
   1815		return KVM_MSR_RET_INVALID;
   1816	}
   1817}
   1818
   1819/*
   1820 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
   1821 * Returns 0 on success, non-0 otherwise.
   1822 * Assumes vcpu_load() was already called.
   1823 */
   1824static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
   1825{
   1826	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1827	struct vmx_uret_msr *msr;
   1828	u32 index;
   1829
   1830	switch (msr_info->index) {
   1831#ifdef CONFIG_X86_64
   1832	case MSR_FS_BASE:
   1833		msr_info->data = vmcs_readl(GUEST_FS_BASE);
   1834		break;
   1835	case MSR_GS_BASE:
   1836		msr_info->data = vmcs_readl(GUEST_GS_BASE);
   1837		break;
   1838	case MSR_KERNEL_GS_BASE:
   1839		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
   1840		break;
   1841#endif
   1842	case MSR_EFER:
   1843		return kvm_get_msr_common(vcpu, msr_info);
   1844	case MSR_IA32_TSX_CTRL:
   1845		if (!msr_info->host_initiated &&
   1846		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
   1847			return 1;
   1848		goto find_uret_msr;
   1849	case MSR_IA32_UMWAIT_CONTROL:
   1850		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
   1851			return 1;
   1852
   1853		msr_info->data = vmx->msr_ia32_umwait_control;
   1854		break;
   1855	case MSR_IA32_SPEC_CTRL:
   1856		if (!msr_info->host_initiated &&
   1857		    !guest_has_spec_ctrl_msr(vcpu))
   1858			return 1;
   1859
   1860		msr_info->data = to_vmx(vcpu)->spec_ctrl;
   1861		break;
   1862	case MSR_IA32_SYSENTER_CS:
   1863		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
   1864		break;
   1865	case MSR_IA32_SYSENTER_EIP:
   1866		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
   1867		break;
   1868	case MSR_IA32_SYSENTER_ESP:
   1869		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
   1870		break;
   1871	case MSR_IA32_BNDCFGS:
   1872		if (!kvm_mpx_supported() ||
   1873		    (!msr_info->host_initiated &&
   1874		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
   1875			return 1;
   1876		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
   1877		break;
   1878	case MSR_IA32_MCG_EXT_CTL:
   1879		if (!msr_info->host_initiated &&
   1880		    !(vmx->msr_ia32_feature_control &
   1881		      FEAT_CTL_LMCE_ENABLED))
   1882			return 1;
   1883		msr_info->data = vcpu->arch.mcg_ext_ctl;
   1884		break;
   1885	case MSR_IA32_FEAT_CTL:
   1886		msr_info->data = vmx->msr_ia32_feature_control;
   1887		break;
   1888	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
   1889		if (!msr_info->host_initiated &&
   1890		    !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
   1891			return 1;
   1892		msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
   1893			[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
   1894		break;
   1895	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
   1896		if (!nested_vmx_allowed(vcpu))
   1897			return 1;
   1898		if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
   1899				    &msr_info->data))
   1900			return 1;
   1901		/*
   1902		 * Enlightened VMCS v1 doesn't have certain VMCS fields but
   1903		 * instead of just ignoring the features, different Hyper-V
   1904		 * versions are either trying to use them and fail or do some
   1905		 * sanity checking and refuse to boot. Filter all unsupported
   1906		 * features out.
   1907		 */
   1908		if (!msr_info->host_initiated &&
   1909		    vmx->nested.enlightened_vmcs_enabled)
   1910			nested_evmcs_filter_control_msr(msr_info->index,
   1911							&msr_info->data);
   1912		break;
   1913	case MSR_IA32_RTIT_CTL:
   1914		if (!vmx_pt_mode_is_host_guest())
   1915			return 1;
   1916		msr_info->data = vmx->pt_desc.guest.ctl;
   1917		break;
   1918	case MSR_IA32_RTIT_STATUS:
   1919		if (!vmx_pt_mode_is_host_guest())
   1920			return 1;
   1921		msr_info->data = vmx->pt_desc.guest.status;
   1922		break;
   1923	case MSR_IA32_RTIT_CR3_MATCH:
   1924		if (!vmx_pt_mode_is_host_guest() ||
   1925			!intel_pt_validate_cap(vmx->pt_desc.caps,
   1926						PT_CAP_cr3_filtering))
   1927			return 1;
   1928		msr_info->data = vmx->pt_desc.guest.cr3_match;
   1929		break;
   1930	case MSR_IA32_RTIT_OUTPUT_BASE:
   1931		if (!vmx_pt_mode_is_host_guest() ||
   1932			(!intel_pt_validate_cap(vmx->pt_desc.caps,
   1933					PT_CAP_topa_output) &&
   1934			 !intel_pt_validate_cap(vmx->pt_desc.caps,
   1935					PT_CAP_single_range_output)))
   1936			return 1;
   1937		msr_info->data = vmx->pt_desc.guest.output_base;
   1938		break;
   1939	case MSR_IA32_RTIT_OUTPUT_MASK:
   1940		if (!vmx_pt_mode_is_host_guest() ||
   1941			(!intel_pt_validate_cap(vmx->pt_desc.caps,
   1942					PT_CAP_topa_output) &&
   1943			 !intel_pt_validate_cap(vmx->pt_desc.caps,
   1944					PT_CAP_single_range_output)))
   1945			return 1;
   1946		msr_info->data = vmx->pt_desc.guest.output_mask;
   1947		break;
   1948	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
   1949		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
   1950		if (!vmx_pt_mode_is_host_guest() ||
   1951		    (index >= 2 * vmx->pt_desc.num_address_ranges))
   1952			return 1;
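        		/*
        		 * The A/B pairs are interleaved: e.g. MSR_IA32_RTIT_ADDR1_B
        		 * yields index 3, which is odd, so it maps to addr_b[1].
        		 */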
   1953		if (index % 2)
   1954			msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
   1955		else
   1956			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
   1957		break;
   1958	case MSR_IA32_DEBUGCTLMSR:
   1959		msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
   1960		break;
   1961	default:
   1962	find_uret_msr:
   1963		msr = vmx_find_uret_msr(vmx, msr_info->index);
   1964		if (msr) {
   1965			msr_info->data = msr->data;
   1966			break;
   1967		}
   1968		return kvm_get_msr_common(vcpu, msr_info);
   1969	}
   1970
   1971	return 0;
   1972}
   1973
   1974static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
   1975						    u64 data)
   1976{
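        	/*
        	 * Illustrative: for a guest without X86_FEATURE_LM, a write of
        	 * 0x100001000 is truncated to 0x1000 below.
        	 */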
   1977#ifdef CONFIG_X86_64
   1978	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
   1979		return (u32)data;
   1980#endif
   1981	return (unsigned long)data;
   1982}
   1983
   1984static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
   1985{
   1986	u64 debugctl = vmx_supported_debugctl();
   1987
   1988	if (!intel_pmu_lbr_is_enabled(vcpu))
   1989		debugctl &= ~DEBUGCTLMSR_LBR_MASK;
   1990
   1991	if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
   1992		debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
   1993
   1994	return debugctl;
   1995}
   1996
   1997/*
   1998 * Writes msr value into the appropriate "register".
   1999 * Returns 0 on success, non-0 otherwise.
   2000 * Assumes vcpu_load() was already called.
   2001 */
   2002static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
   2003{
   2004	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2005	struct vmx_uret_msr *msr;
   2006	int ret = 0;
   2007	u32 msr_index = msr_info->index;
   2008	u64 data = msr_info->data;
   2009	u32 index;
   2010
   2011	switch (msr_index) {
   2012	case MSR_EFER:
   2013		ret = kvm_set_msr_common(vcpu, msr_info);
   2014		break;
   2015#ifdef CONFIG_X86_64
   2016	case MSR_FS_BASE:
   2017		vmx_segment_cache_clear(vmx);
   2018		vmcs_writel(GUEST_FS_BASE, data);
   2019		break;
   2020	case MSR_GS_BASE:
   2021		vmx_segment_cache_clear(vmx);
   2022		vmcs_writel(GUEST_GS_BASE, data);
   2023		break;
   2024	case MSR_KERNEL_GS_BASE:
   2025		vmx_write_guest_kernel_gs_base(vmx, data);
   2026		break;
   2027	case MSR_IA32_XFD:
   2028		ret = kvm_set_msr_common(vcpu, msr_info);
   2029		/*
   2030		 * Always intercepting WRMSR could incur non-negligible
   2031		 * overhead given xfd might be changed frequently in
   2032		 * guest context switch. Disable write interception
   2033		 * upon the first write with a non-zero value (indicating
   2034		 * potential usage on dynamic xfeatures). Also update
   2035		 * exception bitmap to trap #NM for proper virtualization
   2036		 * of guest xfd_err.
   2037		 */
   2038		if (!ret && data) {
   2039			vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
   2040						      MSR_TYPE_RW);
   2041			vcpu->arch.xfd_no_write_intercept = true;
   2042			vmx_update_exception_bitmap(vcpu);
   2043		}
   2044		break;
   2045#endif
   2046	case MSR_IA32_SYSENTER_CS:
   2047		if (is_guest_mode(vcpu))
   2048			get_vmcs12(vcpu)->guest_sysenter_cs = data;
   2049		vmcs_write32(GUEST_SYSENTER_CS, data);
   2050		break;
   2051	case MSR_IA32_SYSENTER_EIP:
   2052		if (is_guest_mode(vcpu)) {
   2053			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
   2054			get_vmcs12(vcpu)->guest_sysenter_eip = data;
   2055		}
   2056		vmcs_writel(GUEST_SYSENTER_EIP, data);
   2057		break;
   2058	case MSR_IA32_SYSENTER_ESP:
   2059		if (is_guest_mode(vcpu)) {
   2060			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
   2061			get_vmcs12(vcpu)->guest_sysenter_esp = data;
   2062		}
   2063		vmcs_writel(GUEST_SYSENTER_ESP, data);
   2064		break;
   2065	case MSR_IA32_DEBUGCTLMSR: {
   2066		u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
   2067		if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
   2068			if (report_ignored_msrs)
   2069				vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
   2070					    __func__, data);
   2071			data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
   2072			invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
   2073		}
   2074
   2075		if (invalid)
   2076			return 1;
   2077
   2078		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
   2079						VM_EXIT_SAVE_DEBUG_CONTROLS)
   2080			get_vmcs12(vcpu)->guest_ia32_debugctl = data;
   2081
   2082		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
   2083		if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
   2084		    (data & DEBUGCTLMSR_LBR))
   2085			intel_pmu_create_guest_lbr_event(vcpu);
   2086		return 0;
   2087	}
   2088	case MSR_IA32_BNDCFGS:
   2089		if (!kvm_mpx_supported() ||
   2090		    (!msr_info->host_initiated &&
   2091		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
   2092			return 1;
   2093		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
   2094		    (data & MSR_IA32_BNDCFGS_RSVD))
   2095			return 1;
   2096		vmcs_write64(GUEST_BNDCFGS, data);
   2097		break;
   2098	case MSR_IA32_UMWAIT_CONTROL:
   2099		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
   2100			return 1;
   2101
    2102		/* Bit 1 is reserved and the upper bits [63:32] must be zero. */
   2103		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
   2104			return 1;
   2105
   2106		vmx->msr_ia32_umwait_control = data;
   2107		break;
   2108	case MSR_IA32_SPEC_CTRL:
   2109		if (!msr_info->host_initiated &&
   2110		    !guest_has_spec_ctrl_msr(vcpu))
   2111			return 1;
   2112
   2113		if (kvm_spec_ctrl_test_value(data))
   2114			return 1;
   2115
   2116		vmx->spec_ctrl = data;
   2117		if (!data)
   2118			break;
   2119
   2120		/*
   2121		 * For non-nested:
   2122		 * When it's written (to non-zero) for the first time, pass
   2123		 * it through.
   2124		 *
   2125		 * For nested:
   2126		 * The handling of the MSR bitmap for L2 guests is done in
   2127		 * nested_vmx_prepare_msr_bitmap. We should not touch the
   2128		 * vmcs02.msr_bitmap here since it gets completely overwritten
   2129		 * in the merging. We update the vmcs01 here for L1 as well
   2130		 * since it will end up touching the MSR anyway now.
   2131		 */
   2132		vmx_disable_intercept_for_msr(vcpu,
   2133					      MSR_IA32_SPEC_CTRL,
   2134					      MSR_TYPE_RW);
   2135		break;
   2136	case MSR_IA32_TSX_CTRL:
   2137		if (!msr_info->host_initiated &&
   2138		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
   2139			return 1;
   2140		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
   2141			return 1;
   2142		goto find_uret_msr;
   2143	case MSR_IA32_PRED_CMD:
   2144		if (!msr_info->host_initiated &&
   2145		    !guest_has_pred_cmd_msr(vcpu))
   2146			return 1;
   2147
   2148		if (data & ~PRED_CMD_IBPB)
   2149			return 1;
   2150		if (!boot_cpu_has(X86_FEATURE_IBPB))
   2151			return 1;
   2152		if (!data)
   2153			break;
   2154
   2155		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
   2156
   2157		/*
   2158		 * For non-nested:
   2159		 * When it's written (to non-zero) for the first time, pass
   2160		 * it through.
   2161		 *
   2162		 * For nested:
   2163		 * The handling of the MSR bitmap for L2 guests is done in
   2164		 * nested_vmx_prepare_msr_bitmap. We should not touch the
   2165		 * vmcs02.msr_bitmap here since it gets completely overwritten
   2166		 * in the merging.
   2167		 */
   2168		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
   2169		break;
   2170	case MSR_IA32_CR_PAT:
   2171		if (!kvm_pat_valid(data))
   2172			return 1;
   2173
   2174		if (is_guest_mode(vcpu) &&
   2175		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
   2176			get_vmcs12(vcpu)->guest_ia32_pat = data;
   2177
   2178		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
   2179			vmcs_write64(GUEST_IA32_PAT, data);
   2180			vcpu->arch.pat = data;
   2181			break;
   2182		}
   2183		ret = kvm_set_msr_common(vcpu, msr_info);
   2184		break;
   2185	case MSR_IA32_MCG_EXT_CTL:
   2186		if ((!msr_info->host_initiated &&
   2187		     !(to_vmx(vcpu)->msr_ia32_feature_control &
   2188		       FEAT_CTL_LMCE_ENABLED)) ||
   2189		    (data & ~MCG_EXT_CTL_LMCE_EN))
   2190			return 1;
   2191		vcpu->arch.mcg_ext_ctl = data;
   2192		break;
   2193	case MSR_IA32_FEAT_CTL:
   2194		if (!vmx_feature_control_msr_valid(vcpu, data) ||
   2195		    (to_vmx(vcpu)->msr_ia32_feature_control &
   2196		     FEAT_CTL_LOCKED && !msr_info->host_initiated))
   2197			return 1;
   2198		vmx->msr_ia32_feature_control = data;
   2199		if (msr_info->host_initiated && data == 0)
   2200			vmx_leave_nested(vcpu);
   2201
   2202		/* SGX may be enabled/disabled by guest's firmware */
   2203		vmx_write_encls_bitmap(vcpu, NULL);
   2204		break;
   2205	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
   2206		/*
   2207		 * On real hardware, the LE hash MSRs are writable before
   2208		 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
   2209		 * at which point SGX related bits in IA32_FEATURE_CONTROL
   2210		 * become writable.
   2211		 *
   2212		 * KVM does not emulate SGX activation for simplicity, so
   2213		 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
   2214		 * is unlocked.  This is technically not architectural
   2215		 * behavior, but it's close enough.
   2216		 */
   2217		if (!msr_info->host_initiated &&
   2218		    (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
   2219		    ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
   2220		    !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
   2221			return 1;
   2222		vmx->msr_ia32_sgxlepubkeyhash
   2223			[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
   2224		break;
   2225	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
   2226		if (!msr_info->host_initiated)
   2227			return 1; /* they are read-only */
   2228		if (!nested_vmx_allowed(vcpu))
   2229			return 1;
   2230		return vmx_set_vmx_msr(vcpu, msr_index, data);
   2231	case MSR_IA32_RTIT_CTL:
   2232		if (!vmx_pt_mode_is_host_guest() ||
   2233			vmx_rtit_ctl_check(vcpu, data) ||
   2234			vmx->nested.vmxon)
   2235			return 1;
   2236		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
   2237		vmx->pt_desc.guest.ctl = data;
   2238		pt_update_intercept_for_msr(vcpu);
   2239		break;
   2240	case MSR_IA32_RTIT_STATUS:
   2241		if (!pt_can_write_msr(vmx))
   2242			return 1;
   2243		if (data & MSR_IA32_RTIT_STATUS_MASK)
   2244			return 1;
   2245		vmx->pt_desc.guest.status = data;
   2246		break;
   2247	case MSR_IA32_RTIT_CR3_MATCH:
   2248		if (!pt_can_write_msr(vmx))
   2249			return 1;
   2250		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
   2251					   PT_CAP_cr3_filtering))
   2252			return 1;
   2253		vmx->pt_desc.guest.cr3_match = data;
   2254		break;
   2255	case MSR_IA32_RTIT_OUTPUT_BASE:
   2256		if (!pt_can_write_msr(vmx))
   2257			return 1;
   2258		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
   2259					   PT_CAP_topa_output) &&
   2260		    !intel_pt_validate_cap(vmx->pt_desc.caps,
   2261					   PT_CAP_single_range_output))
   2262			return 1;
   2263		if (!pt_output_base_valid(vcpu, data))
   2264			return 1;
   2265		vmx->pt_desc.guest.output_base = data;
   2266		break;
   2267	case MSR_IA32_RTIT_OUTPUT_MASK:
   2268		if (!pt_can_write_msr(vmx))
   2269			return 1;
   2270		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
   2271					   PT_CAP_topa_output) &&
   2272		    !intel_pt_validate_cap(vmx->pt_desc.caps,
   2273					   PT_CAP_single_range_output))
   2274			return 1;
   2275		vmx->pt_desc.guest.output_mask = data;
   2276		break;
   2277	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
   2278		if (!pt_can_write_msr(vmx))
   2279			return 1;
   2280		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
   2281		if (index >= 2 * vmx->pt_desc.num_address_ranges)
   2282			return 1;
   2283		if (is_noncanonical_address(data, vcpu))
   2284			return 1;
   2285		if (index % 2)
   2286			vmx->pt_desc.guest.addr_b[index / 2] = data;
   2287		else
   2288			vmx->pt_desc.guest.addr_a[index / 2] = data;
   2289		break;
   2290	case MSR_IA32_PERF_CAPABILITIES:
   2291		if (data && !vcpu_to_pmu(vcpu)->version)
   2292			return 1;
   2293		if (data & PMU_CAP_LBR_FMT) {
   2294			if ((data & PMU_CAP_LBR_FMT) !=
   2295			    (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
   2296				return 1;
   2297			if (!intel_pmu_lbr_is_compatible(vcpu))
   2298				return 1;
   2299		}
   2300		ret = kvm_set_msr_common(vcpu, msr_info);
   2301		break;
   2302
   2303	default:
   2304	find_uret_msr:
   2305		msr = vmx_find_uret_msr(vmx, msr_index);
   2306		if (msr)
   2307			ret = vmx_set_guest_uret_msr(vmx, msr, data);
   2308		else
   2309			ret = kvm_set_msr_common(vcpu, msr_info);
   2310	}
   2311
   2312	/* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
   2313	if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
   2314		vmx_update_fb_clear_dis(vcpu, vmx);
   2315
   2316	return ret;
   2317}
   2318
   2319static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
   2320{
   2321	unsigned long guest_owned_bits;
   2322
   2323	kvm_register_mark_available(vcpu, reg);
   2324
   2325	switch (reg) {
   2326	case VCPU_REGS_RSP:
   2327		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
   2328		break;
   2329	case VCPU_REGS_RIP:
   2330		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
   2331		break;
   2332	case VCPU_EXREG_PDPTR:
   2333		if (enable_ept)
   2334			ept_save_pdptrs(vcpu);
   2335		break;
   2336	case VCPU_EXREG_CR0:
   2337		guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
   2338
   2339		vcpu->arch.cr0 &= ~guest_owned_bits;
   2340		vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
   2341		break;
   2342	case VCPU_EXREG_CR3:
   2343		/*
    2344		 * When intercepting CR3 loads, e.g. for shadow paging, KVM's
   2345		 * CR3 is loaded into hardware, not the guest's CR3.
   2346		 */
   2347		if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
   2348			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
   2349		break;
   2350	case VCPU_EXREG_CR4:
   2351		guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
   2352
   2353		vcpu->arch.cr4 &= ~guest_owned_bits;
   2354		vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
   2355		break;
   2356	default:
   2357		KVM_BUG_ON(1, vcpu->kvm);
   2358		break;
   2359	}
   2360}
   2361
   2362static __init int cpu_has_kvm_support(void)
   2363{
   2364	return cpu_has_vmx();
   2365}
   2366
   2367static __init int vmx_disabled_by_bios(void)
   2368{
   2369	return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
   2370	       !boot_cpu_has(X86_FEATURE_VMX);
   2371}
   2372
   2373static int kvm_cpu_vmxon(u64 vmxon_pointer)
   2374{
   2375	u64 msr;
   2376
   2377	cr4_set_bits(X86_CR4_VMXE);
   2378
   2379	asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
   2380			  _ASM_EXTABLE(1b, %l[fault])
   2381			  : : [vmxon_pointer] "m"(vmxon_pointer)
   2382			  : : fault);
   2383	return 0;
   2384
   2385fault:
   2386	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
   2387		  rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
   2388	cr4_clear_bits(X86_CR4_VMXE);
   2389
   2390	return -EFAULT;
   2391}
   2392
   2393static int vmx_hardware_enable(void)
   2394{
   2395	int cpu = raw_smp_processor_id();
   2396	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
   2397	int r;
   2398
   2399	if (cr4_read_shadow() & X86_CR4_VMXE)
   2400		return -EBUSY;
   2401
   2402	/*
   2403	 * This can happen if we hot-added a CPU but failed to allocate
    2404		 * the VP assist page for it.
   2405	 */
   2406	if (static_branch_unlikely(&enable_evmcs) &&
   2407	    !hv_get_vp_assist_page(cpu))
   2408		return -EFAULT;
   2409
   2410	intel_pt_handle_vmx(1);
   2411
   2412	r = kvm_cpu_vmxon(phys_addr);
   2413	if (r) {
   2414		intel_pt_handle_vmx(0);
   2415		return r;
   2416	}
   2417
   2418	if (enable_ept)
   2419		ept_sync_global();
   2420
   2421	return 0;
   2422}
   2423
   2424static void vmclear_local_loaded_vmcss(void)
   2425{
   2426	int cpu = raw_smp_processor_id();
   2427	struct loaded_vmcs *v, *n;
   2428
   2429	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
   2430				 loaded_vmcss_on_cpu_link)
   2431		__loaded_vmcs_clear(v);
   2432}
   2433
   2434static void vmx_hardware_disable(void)
   2435{
   2436	vmclear_local_loaded_vmcss();
   2437
   2438	if (cpu_vmxoff())
   2439		kvm_spurious_fault();
   2440
   2441	intel_pt_handle_vmx(0);
   2442}
   2443
   2444/*
    2445	 * There is no X86_FEATURE for SGX yet, so query CPUID directly instead
    2446	 * of going through cpu_has(), to ensure KVM traps ENCLS whenever it's
    2447	 * supported in hardware.  It does not matter whether
   2448 * the host OS supports or has enabled SGX.
   2449 */
   2450static bool cpu_has_sgx(void)
   2451{
   2452	return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
   2453}
   2454
   2455static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
   2456				      u32 msr, u32 *result)
   2457{
   2458	u32 vmx_msr_low, vmx_msr_high;
   2459	u32 ctl = ctl_min | ctl_opt;
   2460
   2461	rdmsr(msr, vmx_msr_low, vmx_msr_high);
   2462
   2463	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
   2464	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
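        	/*
        	 * Worked example with hypothetical values: ctl_min = 0x2,
        	 * ctl_opt = 0x4, vmx_msr_high = 0x6 and vmx_msr_low = 0x2 give
        	 * ctl = ((0x2 | 0x4) & 0x6) | 0x2 = 0x6, so the check below
        	 * passes.  Had the high word left bit 1 clear (with the low word
        	 * also clear), ctl_min & ~ctl would be non-zero and the function
        	 * would return -EIO.
        	 */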
   2465
   2466	/* Ensure minimum (required) set of control bits are supported. */
   2467	if (ctl_min & ~ctl)
   2468		return -EIO;
   2469
   2470	*result = ctl;
   2471	return 0;
   2472}
   2473
   2474static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
   2475				    struct vmx_capability *vmx_cap)
   2476{
   2477	u32 vmx_msr_low, vmx_msr_high;
   2478	u32 min, opt, min2, opt2;
   2479	u32 _pin_based_exec_control = 0;
   2480	u32 _cpu_based_exec_control = 0;
   2481	u32 _cpu_based_2nd_exec_control = 0;
   2482	u32 _vmexit_control = 0;
   2483	u32 _vmentry_control = 0;
   2484
   2485	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
   2486	min = CPU_BASED_HLT_EXITING |
   2487#ifdef CONFIG_X86_64
   2488	      CPU_BASED_CR8_LOAD_EXITING |
   2489	      CPU_BASED_CR8_STORE_EXITING |
   2490#endif
   2491	      CPU_BASED_CR3_LOAD_EXITING |
   2492	      CPU_BASED_CR3_STORE_EXITING |
   2493	      CPU_BASED_UNCOND_IO_EXITING |
   2494	      CPU_BASED_MOV_DR_EXITING |
   2495	      CPU_BASED_USE_TSC_OFFSETTING |
   2496	      CPU_BASED_MWAIT_EXITING |
   2497	      CPU_BASED_MONITOR_EXITING |
   2498	      CPU_BASED_INVLPG_EXITING |
   2499	      CPU_BASED_RDPMC_EXITING;
   2500
   2501	opt = CPU_BASED_TPR_SHADOW |
   2502	      CPU_BASED_USE_MSR_BITMAPS |
   2503	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
   2504	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
   2505				&_cpu_based_exec_control) < 0)
   2506		return -EIO;
   2507#ifdef CONFIG_X86_64
   2508	if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
   2509		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
   2510					   ~CPU_BASED_CR8_STORE_EXITING;
   2511#endif
   2512	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
   2513		min2 = 0;
   2514		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
   2515			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
   2516			SECONDARY_EXEC_WBINVD_EXITING |
   2517			SECONDARY_EXEC_ENABLE_VPID |
   2518			SECONDARY_EXEC_ENABLE_EPT |
   2519			SECONDARY_EXEC_UNRESTRICTED_GUEST |
   2520			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
   2521			SECONDARY_EXEC_DESC |
   2522			SECONDARY_EXEC_ENABLE_RDTSCP |
   2523			SECONDARY_EXEC_ENABLE_INVPCID |
   2524			SECONDARY_EXEC_APIC_REGISTER_VIRT |
   2525			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
   2526			SECONDARY_EXEC_SHADOW_VMCS |
   2527			SECONDARY_EXEC_XSAVES |
   2528			SECONDARY_EXEC_RDSEED_EXITING |
   2529			SECONDARY_EXEC_RDRAND_EXITING |
   2530			SECONDARY_EXEC_ENABLE_PML |
   2531			SECONDARY_EXEC_TSC_SCALING |
   2532			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
   2533			SECONDARY_EXEC_PT_USE_GPA |
   2534			SECONDARY_EXEC_PT_CONCEAL_VMX |
   2535			SECONDARY_EXEC_ENABLE_VMFUNC |
   2536			SECONDARY_EXEC_BUS_LOCK_DETECTION;
   2537		if (cpu_has_sgx())
   2538			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
   2539		if (adjust_vmx_controls(min2, opt2,
   2540					MSR_IA32_VMX_PROCBASED_CTLS2,
   2541					&_cpu_based_2nd_exec_control) < 0)
   2542			return -EIO;
   2543	}
   2544#ifndef CONFIG_X86_64
   2545	if (!(_cpu_based_2nd_exec_control &
   2546				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
   2547		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
   2548#endif
   2549
   2550	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
   2551		_cpu_based_2nd_exec_control &= ~(
   2552				SECONDARY_EXEC_APIC_REGISTER_VIRT |
   2553				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
   2554				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
   2555
   2556	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
   2557		&vmx_cap->ept, &vmx_cap->vpid);
   2558
   2559	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
    2560		/* CR3 accesses and INVLPG don't need to cause VM-Exits when
    2561		 * EPT is enabled. */
   2562		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
   2563					     CPU_BASED_CR3_STORE_EXITING |
   2564					     CPU_BASED_INVLPG_EXITING);
   2565	} else if (vmx_cap->ept) {
   2566		vmx_cap->ept = 0;
    2567		pr_warn_once("EPT CAP should not exist when the 1-setting of the "
    2568				"enable EPT VM-execution control is unsupported\n");
   2569	}
   2570	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
   2571		vmx_cap->vpid) {
   2572		vmx_cap->vpid = 0;
    2573		pr_warn_once("VPID CAP should not exist when the 1-setting of the "
    2574				"enable VPID VM-execution control is unsupported\n");
   2575	}
   2576
   2577	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
   2578#ifdef CONFIG_X86_64
   2579	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
   2580#endif
   2581	opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
   2582	      VM_EXIT_LOAD_IA32_PAT |
   2583	      VM_EXIT_LOAD_IA32_EFER |
   2584	      VM_EXIT_CLEAR_BNDCFGS |
   2585	      VM_EXIT_PT_CONCEAL_PIP |
   2586	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
   2587	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
   2588				&_vmexit_control) < 0)
   2589		return -EIO;
   2590
   2591	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
   2592	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
   2593		 PIN_BASED_VMX_PREEMPTION_TIMER;
   2594	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
   2595				&_pin_based_exec_control) < 0)
   2596		return -EIO;
   2597
   2598	if (cpu_has_broken_vmx_preemption_timer())
   2599		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
   2600	if (!(_cpu_based_2nd_exec_control &
   2601		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
   2602		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
   2603
   2604	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
   2605	opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
   2606	      VM_ENTRY_LOAD_IA32_PAT |
   2607	      VM_ENTRY_LOAD_IA32_EFER |
   2608	      VM_ENTRY_LOAD_BNDCFGS |
   2609	      VM_ENTRY_PT_CONCEAL_PIP |
   2610	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
   2611	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
   2612				&_vmentry_control) < 0)
   2613		return -EIO;
   2614
   2615	/*
   2616	 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
    2617	 * can't be used due to an erratum where VM-Exit may incorrectly clear
    2618	 * IA32_PERF_GLOBAL_CTRL[34:32].  Work around the erratum by using the
   2619	 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
   2620	 */
   2621	if (boot_cpu_data.x86 == 0x6) {
   2622		switch (boot_cpu_data.x86_model) {
   2623		case 26: /* AAK155 */
   2624		case 30: /* AAP115 */
   2625		case 37: /* AAT100 */
   2626		case 44: /* BC86,AAY89,BD102 */
   2627		case 46: /* BA97 */
   2628			_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
   2629			_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
   2630			pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
   2631					"does not work properly. Using workaround\n");
   2632			break;
   2633		default:
   2634			break;
   2635		}
   2636	}
   2637
   2638
   2639	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
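        	/*
        	 * vmx_msr_high holds VMX_BASIC[63:32]: bits 44:32 are the VMCS
        	 * region size, bit 48 the 32-bit physical-address limitation and
        	 * bits 53:50 the required VMCS memory type (6 == write-back).
        	 */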
   2640
   2641	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
   2642	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
   2643		return -EIO;
   2644
   2645#ifdef CONFIG_X86_64
   2646	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
   2647	if (vmx_msr_high & (1u<<16))
   2648		return -EIO;
   2649#endif
   2650
   2651	/* Require Write-Back (WB) memory type for VMCS accesses. */
   2652	if (((vmx_msr_high >> 18) & 15) != 6)
   2653		return -EIO;
   2654
   2655	vmcs_conf->size = vmx_msr_high & 0x1fff;
   2656	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
   2657
   2658	vmcs_conf->revision_id = vmx_msr_low;
   2659
   2660	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
   2661	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
   2662	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
   2663	vmcs_conf->vmexit_ctrl         = _vmexit_control;
   2664	vmcs_conf->vmentry_ctrl        = _vmentry_control;
   2665
   2666#if IS_ENABLED(CONFIG_HYPERV)
   2667	if (enlightened_vmcs)
   2668		evmcs_sanitize_exec_ctrls(vmcs_conf);
   2669#endif
   2670
   2671	return 0;
   2672}
   2673
   2674struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
   2675{
   2676	int node = cpu_to_node(cpu);
   2677	struct page *pages;
   2678	struct vmcs *vmcs;
   2679
   2680	pages = __alloc_pages_node(node, flags, 0);
   2681	if (!pages)
   2682		return NULL;
   2683	vmcs = page_address(pages);
   2684	memset(vmcs, 0, vmcs_config.size);
   2685
   2686	/* KVM supports Enlightened VMCS v1 only */
   2687	if (static_branch_unlikely(&enable_evmcs))
   2688		vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
   2689	else
   2690		vmcs->hdr.revision_id = vmcs_config.revision_id;
   2691
   2692	if (shadow)
   2693		vmcs->hdr.shadow_vmcs = 1;
   2694	return vmcs;
   2695}
   2696
   2697void free_vmcs(struct vmcs *vmcs)
   2698{
   2699	free_page((unsigned long)vmcs);
   2700}
   2701
   2702/*
   2703 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
   2704 */
   2705void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
   2706{
   2707	if (!loaded_vmcs->vmcs)
   2708		return;
   2709	loaded_vmcs_clear(loaded_vmcs);
   2710	free_vmcs(loaded_vmcs->vmcs);
   2711	loaded_vmcs->vmcs = NULL;
   2712	if (loaded_vmcs->msr_bitmap)
   2713		free_page((unsigned long)loaded_vmcs->msr_bitmap);
   2714	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
   2715}
   2716
   2717int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
   2718{
   2719	loaded_vmcs->vmcs = alloc_vmcs(false);
   2720	if (!loaded_vmcs->vmcs)
   2721		return -ENOMEM;
   2722
   2723	vmcs_clear(loaded_vmcs->vmcs);
   2724
   2725	loaded_vmcs->shadow_vmcs = NULL;
   2726	loaded_vmcs->hv_timer_soft_disabled = false;
   2727	loaded_vmcs->cpu = -1;
   2728	loaded_vmcs->launched = 0;
   2729
   2730	if (cpu_has_vmx_msr_bitmap()) {
   2731		loaded_vmcs->msr_bitmap = (unsigned long *)
   2732				__get_free_page(GFP_KERNEL_ACCOUNT);
   2733		if (!loaded_vmcs->msr_bitmap)
   2734			goto out_vmcs;
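        		/*
        		 * All bits set: intercept every MSR until intercepts are
        		 * selectively disabled.
        		 */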
   2735		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
   2736	}
   2737
   2738	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
   2739	memset(&loaded_vmcs->controls_shadow, 0,
   2740		sizeof(struct vmcs_controls_shadow));
   2741
   2742	return 0;
   2743
   2744out_vmcs:
   2745	free_loaded_vmcs(loaded_vmcs);
   2746	return -ENOMEM;
   2747}
   2748
   2749static void free_kvm_area(void)
   2750{
   2751	int cpu;
   2752
   2753	for_each_possible_cpu(cpu) {
   2754		free_vmcs(per_cpu(vmxarea, cpu));
   2755		per_cpu(vmxarea, cpu) = NULL;
   2756	}
   2757}
   2758
   2759static __init int alloc_kvm_area(void)
   2760{
   2761	int cpu;
   2762
   2763	for_each_possible_cpu(cpu) {
   2764		struct vmcs *vmcs;
   2765
   2766		vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
   2767		if (!vmcs) {
   2768			free_kvm_area();
   2769			return -ENOMEM;
   2770		}
   2771
   2772		/*
   2773		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
   2774		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
   2775		 * revision_id reported by MSR_IA32_VMX_BASIC.
   2776		 *
    2777		 * However, even though it is not explicitly documented by the
    2778		 * TLFS, the VMXArea passed as the VMXON argument should
    2779		 * still be marked with the revision_id reported by the
    2780		 * physical CPU.
   2781		 */
   2782		if (static_branch_unlikely(&enable_evmcs))
   2783			vmcs->hdr.revision_id = vmcs_config.revision_id;
   2784
   2785		per_cpu(vmxarea, cpu) = vmcs;
   2786	}
   2787	return 0;
   2788}
   2789
   2790static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
   2791		struct kvm_segment *save)
   2792{
   2793	if (!emulate_invalid_guest_state) {
   2794		/*
   2795		 * CS and SS RPL should be equal during guest entry according
   2796		 * to VMX spec, but in reality it is not always so. Since vcpu
   2797		 * is in the middle of the transition from real mode to
   2798		 * protected mode it is safe to assume that RPL 0 is a good
   2799		 * default value.
   2800		 */
   2801		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
   2802			save->selector &= ~SEGMENT_RPL_MASK;
   2803		save->dpl = save->selector & SEGMENT_RPL_MASK;
   2804		save->s = 1;
   2805	}
   2806	__vmx_set_segment(vcpu, save, seg);
   2807}
   2808
   2809static void enter_pmode(struct kvm_vcpu *vcpu)
   2810{
   2811	unsigned long flags;
   2812	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2813
   2814	/*
    2815	 * Update the real mode segment cache. It may not be up to date if a
    2816	 * segment register was written while the vcpu was in guest mode.
   2817	 */
   2818	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
   2819	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
   2820	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
   2821	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
   2822	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
   2823	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
   2824
   2825	vmx->rmode.vm86_active = 0;
   2826
   2827	__vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
   2828
   2829	flags = vmcs_readl(GUEST_RFLAGS);
   2830	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
   2831	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
   2832	vmcs_writel(GUEST_RFLAGS, flags);
   2833
   2834	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
   2835			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
   2836
   2837	vmx_update_exception_bitmap(vcpu);
   2838
   2839	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
   2840	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
   2841	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
   2842	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
   2843	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
   2844	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
   2845}
   2846
   2847static void fix_rmode_seg(int seg, struct kvm_segment *save)
   2848{
   2849	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
   2850	struct kvm_segment var = *save;
   2851
   2852	var.dpl = 0x3;
   2853	if (seg == VCPU_SREG_CS)
   2854		var.type = 0x3;
   2855
   2856	if (!emulate_invalid_guest_state) {
   2857		var.selector = var.base >> 4;
   2858		var.base = var.base & 0xffff0;
   2859		var.limit = 0xffff;
   2860		var.g = 0;
   2861		var.db = 0;
   2862		var.present = 1;
   2863		var.s = 1;
   2864		var.l = 0;
   2865		var.unusable = 0;
   2866		var.type = 0x3;
   2867		var.avl = 0;
   2868		if (save->base & 0xf)
   2869			printk_once(KERN_WARNING "kvm: segment base is not "
   2870					"paragraph aligned when entering "
   2871					"protected mode (seg=%d)", seg);
   2872	}
   2873
   2874	vmcs_write16(sf->selector, var.selector);
   2875	vmcs_writel(sf->base, var.base);
   2876	vmcs_write32(sf->limit, var.limit);
   2877	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
   2878}
   2879
   2880static void enter_rmode(struct kvm_vcpu *vcpu)
   2881{
   2882	unsigned long flags;
   2883	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2884	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
   2885
   2886	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
   2887	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
   2888	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
   2889	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
   2890	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
   2891	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
   2892	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
   2893
   2894	vmx->rmode.vm86_active = 1;
   2895
   2896	/*
   2897	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
   2898	 * vcpu. Warn the user that an update is overdue.
   2899	 */
   2900	if (!kvm_vmx->tss_addr)
    2901		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
   2902			     "called before entering vcpu\n");
   2903
   2904	vmx_segment_cache_clear(vmx);
   2905
   2906	vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
   2907	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
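        	/* 0x008b: present, DPL 0, system segment, type 11 (busy 32-bit TSS). */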
   2908	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
   2909
   2910	flags = vmcs_readl(GUEST_RFLAGS);
   2911	vmx->rmode.save_rflags = flags;
   2912
   2913	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
   2914
   2915	vmcs_writel(GUEST_RFLAGS, flags);
   2916	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
   2917	vmx_update_exception_bitmap(vcpu);
   2918
   2919	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
   2920	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
   2921	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
   2922	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
   2923	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
   2924	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
   2925}
   2926
   2927int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
   2928{
   2929	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2930
   2931	/* Nothing to do if hardware doesn't support EFER. */
   2932	if (!vmx_find_uret_msr(vmx, MSR_EFER))
   2933		return 0;
   2934
   2935	vcpu->arch.efer = efer;
   2936	if (efer & EFER_LMA)
   2937		vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
   2938	else
   2939		vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
   2940
   2941	vmx_setup_uret_msrs(vmx);
   2942	return 0;
   2943}
   2944
   2945#ifdef CONFIG_X86_64
   2946
   2947static void enter_lmode(struct kvm_vcpu *vcpu)
   2948{
   2949	u32 guest_tr_ar;
   2950
   2951	vmx_segment_cache_clear(to_vmx(vcpu));
   2952
   2953	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
   2954	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
    2955		pr_debug_ratelimited("%s: tss fixup for long mode.\n",
   2956				     __func__);
   2957		vmcs_write32(GUEST_TR_AR_BYTES,
   2958			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
   2959			     | VMX_AR_TYPE_BUSY_64_TSS);
   2960	}
   2961	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
   2962}
   2963
   2964static void exit_lmode(struct kvm_vcpu *vcpu)
   2965{
   2966	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
   2967}
   2968
   2969#endif
   2970
   2971static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
   2972{
   2973	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2974
   2975	/*
   2976	 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
   2977	 * the CPU is not required to invalidate guest-physical mappings on
   2978	 * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
   2979	 * associated with the root EPT structure and not any particular VPID
   2980	 * (INVVPID also isn't required to invalidate guest-physical mappings).
   2981	 */
   2982	if (enable_ept) {
   2983		ept_sync_global();
   2984	} else if (enable_vpid) {
   2985		if (cpu_has_vmx_invvpid_global()) {
   2986			vpid_sync_vcpu_global();
   2987		} else {
   2988			vpid_sync_vcpu_single(vmx->vpid);
   2989			vpid_sync_vcpu_single(vmx->nested.vpid02);
   2990		}
   2991	}
   2992}
   2993
   2994static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
   2995{
   2996	if (is_guest_mode(vcpu))
   2997		return nested_get_vpid02(vcpu);
   2998	return to_vmx(vcpu)->vpid;
   2999}
   3000
   3001static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
   3002{
   3003	struct kvm_mmu *mmu = vcpu->arch.mmu;
   3004	u64 root_hpa = mmu->root.hpa;
   3005
   3006	/* No flush required if the current context is invalid. */
   3007	if (!VALID_PAGE(root_hpa))
   3008		return;
   3009
   3010	if (enable_ept)
   3011		ept_sync_context(construct_eptp(vcpu, root_hpa,
   3012						mmu->root_role.level));
   3013	else
   3014		vpid_sync_context(vmx_get_current_vpid(vcpu));
   3015}
   3016
   3017static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
   3018{
   3019	/*
   3020	 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
   3021	 * vmx_flush_tlb_guest() for an explanation of why this is ok.
   3022	 */
   3023	vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
   3024}
   3025
   3026static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
   3027{
   3028	/*
   3029	 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
   3030	 * vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit are
   3031	 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
   3032	 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
   3033	 * i.e. no explicit INVVPID is necessary.
   3034	 */
   3035	vpid_sync_context(vmx_get_current_vpid(vcpu));
   3036}
   3037
   3038void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
   3039{
   3040	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   3041
   3042	if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
   3043		return;
   3044
   3045	if (is_pae_paging(vcpu)) {
   3046		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
   3047		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
   3048		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
   3049		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
   3050	}
   3051}
   3052
   3053void ept_save_pdptrs(struct kvm_vcpu *vcpu)
   3054{
   3055	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
   3056
   3057	if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
   3058		return;
   3059
   3060	mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
   3061	mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
   3062	mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
   3063	mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
   3064
   3065	kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
   3066}
   3067
   3068#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
   3069			  CPU_BASED_CR3_STORE_EXITING)
   3070
   3071void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
   3072{
   3073	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3074	unsigned long hw_cr0, old_cr0_pg;
   3075	u32 tmp;
   3076
   3077	old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
   3078
   3079	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
   3080	if (is_unrestricted_guest(vcpu))
   3081		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
   3082	else {
   3083		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
   3084		if (!enable_ept)
   3085			hw_cr0 |= X86_CR0_WP;
   3086
   3087		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
   3088			enter_pmode(vcpu);
   3089
   3090		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
   3091			enter_rmode(vcpu);
   3092	}
   3093
   3094	vmcs_writel(CR0_READ_SHADOW, cr0);
   3095	vmcs_writel(GUEST_CR0, hw_cr0);
   3096	vcpu->arch.cr0 = cr0;
   3097	kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
   3098
   3099#ifdef CONFIG_X86_64
   3100	if (vcpu->arch.efer & EFER_LME) {
   3101		if (!old_cr0_pg && (cr0 & X86_CR0_PG))
   3102			enter_lmode(vcpu);
   3103		else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
   3104			exit_lmode(vcpu);
   3105	}
   3106#endif
   3107
   3108	if (enable_ept && !is_unrestricted_guest(vcpu)) {
   3109		/*
   3110		 * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
   3111		 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
   3112		 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
   3113		 * KVM's CR3 is installed.
   3114		 */
   3115		if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
   3116			vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
   3117
   3118		/*
   3119		 * When running with EPT but not unrestricted guest, KVM must
   3120		 * intercept CR3 accesses when paging is _disabled_.  This is
   3121		 * necessary because restricted guests can't actually run with
   3122		 * paging disabled, and so KVM stuffs its own CR3 in order to
    3123		 * run the guest with identity mapped page tables.
   3124		 *
   3125		 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
   3126		 * update, it may be stale with respect to CR3 interception,
   3127		 * e.g. after nested VM-Enter.
   3128		 *
   3129		 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
   3130		 * stores to forward them to L1, even if KVM does not need to
   3131		 * intercept them to preserve its identity mapped page tables.
   3132		 */
   3133		if (!(cr0 & X86_CR0_PG)) {
   3134			exec_controls_setbit(vmx, CR3_EXITING_BITS);
   3135		} else if (!is_guest_mode(vcpu)) {
   3136			exec_controls_clearbit(vmx, CR3_EXITING_BITS);
   3137		} else {
   3138			tmp = exec_controls_get(vmx);
   3139			tmp &= ~CR3_EXITING_BITS;
   3140			tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
   3141			exec_controls_set(vmx, tmp);
   3142		}
   3143
   3144		/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
   3145		if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
   3146			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
   3147
   3148		/*
   3149		 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
   3150		 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
   3151		 */
   3152		if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
   3153			kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
   3154	}
   3155
   3156	/* depends on vcpu->arch.cr0 to be set to a new value */
   3157	vmx->emulation_required = vmx_emulation_required(vcpu);
   3158}
   3159
   3160static int vmx_get_max_tdp_level(void)
   3161{
   3162	if (cpu_has_vmx_ept_5levels())
   3163		return 5;
   3164	return 4;
   3165}
   3166
   3167u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
   3168{
   3169	u64 eptp = VMX_EPTP_MT_WB;
   3170
   3171	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
   3172
   3173	if (enable_ept_ad_bits &&
   3174	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
   3175		eptp |= VMX_EPTP_AD_ENABLE_BIT;
   3176	eptp |= root_hpa;
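        	/*
        	 * Example with hypothetical values: root_hpa = 0x12345000, a
        	 * 4-level walk and A/D bits enabled yield
        	 * 0x12345000 | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 |
        	 * VMX_EPTP_AD_ENABLE_BIT = 0x1234505e.
        	 */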
   3177
   3178	return eptp;
   3179}
   3180
   3181static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
   3182			     int root_level)
   3183{
   3184	struct kvm *kvm = vcpu->kvm;
   3185	bool update_guest_cr3 = true;
   3186	unsigned long guest_cr3;
   3187	u64 eptp;
   3188
   3189	if (enable_ept) {
   3190		eptp = construct_eptp(vcpu, root_hpa, root_level);
   3191		vmcs_write64(EPT_POINTER, eptp);
   3192
   3193		hv_track_root_tdp(vcpu, root_hpa);
   3194
   3195		if (!enable_unrestricted_guest && !is_paging(vcpu))
   3196			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
   3197		else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
   3198			guest_cr3 = vcpu->arch.cr3;
   3199		else /* vmcs.GUEST_CR3 is already up-to-date. */
   3200			update_guest_cr3 = false;
   3201		vmx_ept_load_pdptrs(vcpu);
   3202	} else {
   3203		guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
   3204	}
   3205
   3206	if (update_guest_cr3)
   3207		vmcs_writel(GUEST_CR3, guest_cr3);
   3208}
   3209
   3210
   3211static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
   3212{
   3213	/*
   3214	 * We operate under the default treatment of SMM, so VMX cannot be
   3215	 * enabled under SMM.  Note, whether or not VMXE is allowed at all is
   3216	 * handled by kvm_is_valid_cr4().
   3217	 */
   3218	if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
   3219		return false;
   3220
   3221	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
   3222		return false;
   3223
   3224	return true;
   3225}
   3226
   3227void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
   3228{
   3229	unsigned long old_cr4 = vcpu->arch.cr4;
   3230	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3231	/*
   3232	 * Pass through host's Machine Check Enable value to hw_cr4, which
   3233	 * is in force while we are in guest mode.  Do not let guests control
   3234	 * this bit, even if host CR4.MCE == 0.
   3235	 */
   3236	unsigned long hw_cr4;
   3237
   3238	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
   3239	if (is_unrestricted_guest(vcpu))
   3240		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
   3241	else if (vmx->rmode.vm86_active)
   3242		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
   3243	else
   3244		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
   3245
   3246	if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
   3247		if (cr4 & X86_CR4_UMIP) {
   3248			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
   3249			hw_cr4 &= ~X86_CR4_UMIP;
   3250		} else if (!is_guest_mode(vcpu) ||
   3251			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
   3252			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
   3253		}
   3254	}
   3255
   3256	vcpu->arch.cr4 = cr4;
   3257	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
   3258
   3259	if (!is_unrestricted_guest(vcpu)) {
   3260		if (enable_ept) {
   3261			if (!is_paging(vcpu)) {
   3262				hw_cr4 &= ~X86_CR4_PAE;
   3263				hw_cr4 |= X86_CR4_PSE;
   3264			} else if (!(cr4 & X86_CR4_PAE)) {
   3265				hw_cr4 &= ~X86_CR4_PAE;
   3266			}
   3267		}
   3268
   3269		/*
   3270		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
   3271		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
   3272		 * to be manually disabled when guest switches to non-paging
   3273		 * mode.
   3274		 *
   3275		 * If !enable_unrestricted_guest, the CPU is always running
   3276		 * with CR0.PG=1 and CR4 needs to be modified.
   3277		 * If enable_unrestricted_guest, the CPU automatically
   3278		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
   3279		 */
   3280		if (!is_paging(vcpu))
   3281			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
   3282	}
   3283
   3284	vmcs_writel(CR4_READ_SHADOW, cr4);
   3285	vmcs_writel(GUEST_CR4, hw_cr4);
   3286
   3287	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
   3288		kvm_update_cpuid_runtime(vcpu);
   3289}
   3290
   3291void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
   3292{
   3293	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3294	u32 ar;
   3295
   3296	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
   3297		*var = vmx->rmode.segs[seg];
   3298		if (seg == VCPU_SREG_TR
   3299		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
   3300			return;
   3301		var->base = vmx_read_guest_seg_base(vmx, seg);
   3302		var->selector = vmx_read_guest_seg_selector(vmx, seg);
   3303		return;
   3304	}
   3305	var->base = vmx_read_guest_seg_base(vmx, seg);
   3306	var->limit = vmx_read_guest_seg_limit(vmx, seg);
   3307	var->selector = vmx_read_guest_seg_selector(vmx, seg);
   3308	ar = vmx_read_guest_seg_ar(vmx, seg);
   3309	var->unusable = (ar >> 16) & 1;
   3310	var->type = ar & 15;
   3311	var->s = (ar >> 4) & 1;
   3312	var->dpl = (ar >> 5) & 3;
   3313	/*
   3314		 * Some userspaces do not preserve the unusable property. Since a
   3315		 * usable segment has to be present according to the VMX spec, we can
   3316		 * use the present property to work around that userspace bug by making
   3317		 * an unusable segment always nonpresent. vmx_segment_access_rights()
   3318		 * already marks a nonpresent segment as unusable.
   3319	 */
   3320	var->present = !var->unusable;
   3321	var->avl = (ar >> 12) & 1;
   3322	var->l = (ar >> 13) & 1;
   3323	var->db = (ar >> 14) & 1;
   3324	var->g = (ar >> 15) & 1;
   3325}
   3326
   3327static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
   3328{
   3329	struct kvm_segment s;
   3330
   3331	if (to_vmx(vcpu)->rmode.vm86_active) {
   3332		vmx_get_segment(vcpu, &s, seg);
   3333		return s.base;
   3334	}
   3335	return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
   3336}
   3337
   3338int vmx_get_cpl(struct kvm_vcpu *vcpu)
   3339{
   3340	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3341
   3342	if (unlikely(vmx->rmode.vm86_active))
   3343		return 0;
   3344	else {
   3345		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
   3346		return VMX_AR_DPL(ar);
   3347	}
   3348}
   3349
   3350static u32 vmx_segment_access_rights(struct kvm_segment *var)
   3351{
   3352	u32 ar;
   3353
   3354	if (var->unusable || !var->present)
   3355		ar = 1 << 16;
   3356	else {
   3357		ar = var->type & 15;
   3358		ar |= (var->s & 1) << 4;
   3359		ar |= (var->dpl & 3) << 5;
   3360		ar |= (var->present & 1) << 7;
   3361		ar |= (var->avl & 1) << 12;
   3362		ar |= (var->l & 1) << 13;
   3363		ar |= (var->db & 1) << 14;
   3364		ar |= (var->g & 1) << 15;
   3365	}
   3366
   3367	return ar;
   3368}
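
/*
 * Worked example of the packing above, using the virtual-8086 values that
 * rmode_segment_valid() checks for below: type = 3 (read/write data,
 * accessed), s = 1, dpl = 3, present = 1 and avl = l = db = g = 0 gives
 * ar = 0x3 | (1 << 4) | (3 << 5) | (1 << 7) = 0xf3.
 */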
   3369
   3370void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
   3371{
   3372	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3373	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
   3374
   3375	vmx_segment_cache_clear(vmx);
   3376
   3377	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
   3378		vmx->rmode.segs[seg] = *var;
   3379		if (seg == VCPU_SREG_TR)
   3380			vmcs_write16(sf->selector, var->selector);
   3381		else if (var->s)
   3382			fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
   3383		return;
   3384	}
   3385
   3386	vmcs_writel(sf->base, var->base);
   3387	vmcs_write32(sf->limit, var->limit);
   3388	vmcs_write16(sf->selector, var->selector);
   3389
   3390	/*
   3391	 * Fix the "Accessed" bit in the AR field of segment registers for
   3392	 * older qemu binaries.
   3393	 * The IA32 architecture specifies that at processor reset the
   3394	 * "Accessed" bit in the AR field of segment registers is 1, but qemu
   3395	 * sets it to 0 in its userland code. This causes an invalid guest
   3396	 * state vmexit when "unrestricted guest" mode is turned on.
   3397	 * A fix for this setup issue in cpu_reset has been pushed to the qemu
   3398	 * tree, so newer qemu binaries with that fix do not need this kvm
   3399	 * hack.
   3400	 */
   3401	if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
   3402		var->type |= 0x1; /* Accessed */
   3403
   3404	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
   3405}
   3406
   3407static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
   3408{
   3409	__vmx_set_segment(vcpu, var, seg);
   3410
   3411	to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
   3412}
   3413
   3414static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
   3415{
   3416	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
   3417
   3418	*db = (ar >> 14) & 1;
   3419	*l = (ar >> 13) & 1;
   3420}
   3421
   3422static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
   3423{
   3424	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
   3425	dt->address = vmcs_readl(GUEST_IDTR_BASE);
   3426}
   3427
   3428static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
   3429{
   3430	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
   3431	vmcs_writel(GUEST_IDTR_BASE, dt->address);
   3432}
   3433
   3434static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
   3435{
   3436	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
   3437	dt->address = vmcs_readl(GUEST_GDTR_BASE);
   3438}
   3439
   3440static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
   3441{
   3442	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
   3443	vmcs_writel(GUEST_GDTR_BASE, dt->address);
   3444}
   3445
   3446static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
   3447{
   3448	struct kvm_segment var;
   3449	u32 ar;
   3450
   3451	vmx_get_segment(vcpu, &var, seg);
   3452	var.dpl = 0x3;
   3453	if (seg == VCPU_SREG_CS)
   3454		var.type = 0x3;
   3455	ar = vmx_segment_access_rights(&var);
   3456
   3457	if (var.base != (var.selector << 4))
   3458		return false;
   3459	if (var.limit != 0xffff)
   3460		return false;
   3461	if (ar != 0xf3)
   3462		return false;
   3463
   3464	return true;
   3465}
   3466
   3467static bool code_segment_valid(struct kvm_vcpu *vcpu)
   3468{
   3469	struct kvm_segment cs;
   3470	unsigned int cs_rpl;
   3471
   3472	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
   3473	cs_rpl = cs.selector & SEGMENT_RPL_MASK;
   3474
   3475	if (cs.unusable)
   3476		return false;
   3477	if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
   3478		return false;
   3479	if (!cs.s)
   3480		return false;
   3481	if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
   3482		if (cs.dpl > cs_rpl)
   3483			return false;
   3484	} else {
   3485		if (cs.dpl != cs_rpl)
   3486			return false;
   3487	}
   3488	if (!cs.present)
   3489		return false;
   3490
   3491	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
   3492	return true;
   3493}
   3494
   3495static bool stack_segment_valid(struct kvm_vcpu *vcpu)
   3496{
   3497	struct kvm_segment ss;
   3498	unsigned int ss_rpl;
   3499
   3500	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
   3501	ss_rpl = ss.selector & SEGMENT_RPL_MASK;
   3502
   3503	if (ss.unusable)
   3504		return true;
   3505	if (ss.type != 3 && ss.type != 7)
   3506		return false;
   3507	if (!ss.s)
   3508		return false;
   3509	if (ss.dpl != ss_rpl) /* DPL != RPL */
   3510		return false;
   3511	if (!ss.present)
   3512		return false;
   3513
   3514	return true;
   3515}
   3516
   3517static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
   3518{
   3519	struct kvm_segment var;
   3520	unsigned int rpl;
   3521
   3522	vmx_get_segment(vcpu, &var, seg);
   3523	rpl = var.selector & SEGMENT_RPL_MASK;
   3524
   3525	if (var.unusable)
   3526		return true;
   3527	if (!var.s)
   3528		return false;
   3529	if (!var.present)
   3530		return false;
   3531	if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
   3532		if (var.dpl < rpl) /* DPL < RPL */
   3533			return false;
   3534	}
   3535
   3536	/* TODO: Add other members to kvm_segment_field to allow checking for other access
   3537	 * rights flags
   3538	 */
   3539	return true;
   3540}
   3541
   3542static bool tr_valid(struct kvm_vcpu *vcpu)
   3543{
   3544	struct kvm_segment tr;
   3545
   3546	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
   3547
   3548	if (tr.unusable)
   3549		return false;
   3550	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
   3551		return false;
   3552	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
   3553		return false;
   3554	if (!tr.present)
   3555		return false;
   3556
   3557	return true;
   3558}
   3559
   3560static bool ldtr_valid(struct kvm_vcpu *vcpu)
   3561{
   3562	struct kvm_segment ldtr;
   3563
   3564	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
   3565
   3566	if (ldtr.unusable)
   3567		return true;
   3568	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
   3569		return false;
   3570	if (ldtr.type != 2)
   3571		return false;
   3572	if (!ldtr.present)
   3573		return false;
   3574
   3575	return true;
   3576}
   3577
   3578static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
   3579{
   3580	struct kvm_segment cs, ss;
   3581
   3582	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
   3583	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
   3584
   3585	return ((cs.selector & SEGMENT_RPL_MASK) ==
   3586		 (ss.selector & SEGMENT_RPL_MASK));
   3587}
   3588
   3589/*
   3590 * Check if guest state is valid. Returns true if valid, false if
   3591 * not.
   3592 * We assume that registers are always usable.
   3593 */
   3594bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
   3595{
   3596	/* real mode guest state checks */
   3597	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
   3598		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
   3599			return false;
   3600		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
   3601			return false;
   3602		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
   3603			return false;
   3604		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
   3605			return false;
   3606		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
   3607			return false;
   3608		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
   3609			return false;
   3610	} else {
   3611	/* protected mode guest state checks */
   3612		if (!cs_ss_rpl_check(vcpu))
   3613			return false;
   3614		if (!code_segment_valid(vcpu))
   3615			return false;
   3616		if (!stack_segment_valid(vcpu))
   3617			return false;
   3618		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
   3619			return false;
   3620		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
   3621			return false;
   3622		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
   3623			return false;
   3624		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
   3625			return false;
   3626		if (!tr_valid(vcpu))
   3627			return false;
   3628		if (!ldtr_valid(vcpu))
   3629			return false;
   3630	}
   3631	/* TODO:
   3632	 * - Add checks on RIP
   3633	 * - Add checks on RFLAGS
   3634	 */
   3635
   3636	return true;
   3637}
   3638
   3639static int init_rmode_tss(struct kvm *kvm, void __user *ua)
   3640{
   3641	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
   3642	u16 data;
   3643	int i;
   3644
   3645	for (i = 0; i < 3; i++) {
   3646		if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
   3647			return -EFAULT;
   3648	}
   3649
   3650	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
   3651	if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
   3652		return -EFAULT;
   3653
   3654	data = ~0;
   3655	if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
   3656		return -EFAULT;
   3657
   3658	return 0;
   3659}
   3660
   3661static int init_rmode_identity_map(struct kvm *kvm)
   3662{
   3663	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
   3664	int i, r = 0;
   3665	void __user *uaddr;
   3666	u32 tmp;
   3667
   3668	/* Protect kvm_vmx->ept_identity_pagetable_done. */
   3669	mutex_lock(&kvm->slots_lock);
   3670
   3671	if (likely(kvm_vmx->ept_identity_pagetable_done))
   3672		goto out;
   3673
   3674	if (!kvm_vmx->ept_identity_map_addr)
   3675		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
   3676
   3677	uaddr = __x86_set_memory_region(kvm,
   3678					IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
   3679					kvm_vmx->ept_identity_map_addr,
   3680					PAGE_SIZE);
   3681	if (IS_ERR(uaddr)) {
   3682		r = PTR_ERR(uaddr);
   3683		goto out;
   3684	}
   3685
   3686	/* Set up identity-mapping pagetable for EPT in real mode */
   3687	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
   3688		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
   3689			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
   3690		if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
   3691			r = -EFAULT;
   3692			goto out;
   3693		}
   3694	}
   3695	kvm_vmx->ept_identity_pagetable_done = true;
   3696
   3697out:
   3698	mutex_unlock(&kvm->slots_lock);
   3699	return r;
   3700}
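
/*
 * Illustration of the table built above: each of the PT32_ENT_PER_PAGE
 * (1024) PDEs maps a 4 MiB large page at guest physical address i << 22
 * with flags PRESENT | RW | USER | ACCESSED | DIRTY | PSE = 0xe7, e.g.
 * entry 1 is 0x4000e7 and covers GPAs 0x400000 - 0x7fffff, so the single
 * page identity-maps the first 4 GiB of the guest physical address space.
 */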
   3701
   3702static void seg_setup(int seg)
   3703{
   3704	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
   3705	unsigned int ar;
   3706
   3707	vmcs_write16(sf->selector, 0);
   3708	vmcs_writel(sf->base, 0);
   3709	vmcs_write32(sf->limit, 0xffff);
   3710	ar = 0x93;
   3711	if (seg == VCPU_SREG_CS)
   3712		ar |= 0x08; /* code segment */
   3713
   3714	vmcs_write32(sf->ar_bytes, ar);
   3715}
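
/*
 * Decoding the reset value written above (see vmx_segment_access_rights()
 * for the bit layout): ar = 0x93 is present (bit 7) | s = 1 (bit 4) |
 * type = 3 (read/write data, accessed); OR-ing in 0x08 for CS yields
 * ar = 0x9b, i.e. an execute/read, accessed code segment.
 */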
   3716
   3717static int alloc_apic_access_page(struct kvm *kvm)
   3718{
   3719	struct page *page;
   3720	void __user *hva;
   3721	int ret = 0;
   3722
   3723	mutex_lock(&kvm->slots_lock);
   3724	if (kvm->arch.apic_access_memslot_enabled)
   3725		goto out;
   3726	hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
   3727				      APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
   3728	if (IS_ERR(hva)) {
   3729		ret = PTR_ERR(hva);
   3730		goto out;
   3731	}
   3732
   3733	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
   3734	if (is_error_page(page)) {
   3735		ret = -EFAULT;
   3736		goto out;
   3737	}
   3738
   3739	/*
   3740	 * Do not pin the page in memory, so that memory hot-unplug
   3741	 * is able to migrate it.
   3742	 */
   3743	put_page(page);
   3744	kvm->arch.apic_access_memslot_enabled = true;
   3745out:
   3746	mutex_unlock(&kvm->slots_lock);
   3747	return ret;
   3748}
   3749
   3750int allocate_vpid(void)
   3751{
   3752	int vpid;
   3753
   3754	if (!enable_vpid)
   3755		return 0;
   3756	spin_lock(&vmx_vpid_lock);
   3757	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
   3758	if (vpid < VMX_NR_VPIDS)
   3759		__set_bit(vpid, vmx_vpid_bitmap);
   3760	else
   3761		vpid = 0;
   3762	spin_unlock(&vmx_vpid_lock);
   3763	return vpid;
   3764}
   3765
   3766void free_vpid(int vpid)
   3767{
   3768	if (!enable_vpid || vpid == 0)
   3769		return;
   3770	spin_lock(&vmx_vpid_lock);
   3771	__clear_bit(vpid, vmx_vpid_bitmap);
   3772	spin_unlock(&vmx_vpid_lock);
   3773}
   3774
   3775static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
   3776{
   3777	/*
   3778	 * When KVM is a nested hypervisor on top of Hyper-V and uses the
   3779	 * 'Enlightened MSR Bitmap' feature, L0 needs to know that the MSR
   3780	 * bitmap has changed.
   3781	 */
   3782	if (static_branch_unlikely(&enable_evmcs))
   3783		evmcs_touch_msr_bitmap();
   3784
   3785	vmx->nested.force_msr_bitmap_recalc = true;
   3786}
   3787
   3788void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
   3789{
   3790	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3791	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
   3792
   3793	if (!cpu_has_vmx_msr_bitmap())
   3794		return;
   3795
   3796	vmx_msr_bitmap_l01_changed(vmx);
   3797
   3798	/*
   3799	 * Mark the desired intercept state in the shadow bitmap; this is
   3800	 * needed for resync when the MSR filters change.
   3801	 */
   3802	if (is_valid_passthrough_msr(msr)) {
   3803		int idx = possible_passthrough_msr_slot(msr);
   3804
   3805		if (idx != -ENOENT) {
   3806			if (type & MSR_TYPE_R)
   3807				clear_bit(idx, vmx->shadow_msr_intercept.read);
   3808			if (type & MSR_TYPE_W)
   3809				clear_bit(idx, vmx->shadow_msr_intercept.write);
   3810		}
   3811	}
   3812
   3813	if ((type & MSR_TYPE_R) &&
   3814	    !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
   3815		vmx_set_msr_bitmap_read(msr_bitmap, msr);
   3816		type &= ~MSR_TYPE_R;
   3817	}
   3818
   3819	if ((type & MSR_TYPE_W) &&
   3820	    !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
   3821		vmx_set_msr_bitmap_write(msr_bitmap, msr);
   3822		type &= ~MSR_TYPE_W;
   3823	}
   3824
   3825	if (type & MSR_TYPE_R)
   3826		vmx_clear_msr_bitmap_read(msr_bitmap, msr);
   3827
   3828	if (type & MSR_TYPE_W)
   3829		vmx_clear_msr_bitmap_write(msr_bitmap, msr);
   3830}
   3831
   3832void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
   3833{
   3834	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3835	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
   3836
   3837	if (!cpu_has_vmx_msr_bitmap())
   3838		return;
   3839
   3840	vmx_msr_bitmap_l01_changed(vmx);
   3841
   3842	/*
   3843	 * Mark the desired intercept state in the shadow bitmap; this is
   3844	 * needed for resync when the MSR filter changes.
   3845	 */
   3846	if (is_valid_passthrough_msr(msr)) {
   3847		int idx = possible_passthrough_msr_slot(msr);
   3848
   3849		if (idx != -ENOENT) {
   3850			if (type & MSR_TYPE_R)
   3851				set_bit(idx, vmx->shadow_msr_intercept.read);
   3852			if (type & MSR_TYPE_W)
   3853				set_bit(idx, vmx->shadow_msr_intercept.write);
   3854		}
   3855	}
   3856
   3857	if (type & MSR_TYPE_R)
   3858		vmx_set_msr_bitmap_read(msr_bitmap, msr);
   3859
   3860	if (type & MSR_TYPE_W)
   3861		vmx_set_msr_bitmap_write(msr_bitmap, msr);
   3862}
   3863
   3864static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
   3865{
   3866	unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
   3867	unsigned long read_intercept;
   3868	int msr;
   3869
   3870	read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
   3871
   3872	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
   3873		unsigned int read_idx = msr / BITS_PER_LONG;
   3874		unsigned int write_idx = read_idx + (0x800 / sizeof(long));
   3875
   3876		msr_bitmap[read_idx] = read_intercept;
   3877		msr_bitmap[write_idx] = ~0ul;
   3878	}
   3879}
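
/*
 * Layout note for the loop above (assuming the standard 4 KiB VMX MSR
 * bitmap: bytes 0x000-0x3ff hold the read bits for MSRs 0x0-0x1fff and
 * bytes 0x800-0xbff the corresponding write bits): on a 64-bit host,
 * msr = 0x800 gives read_idx = 0x800 / 64 = 32 (byte 0x100 of the page)
 * and write_idx = 32 + 0x800 / 8 = 288 (byte 0x900), so the four
 * iterations cover the 0x100 x2APIC MSRs, passing reads through in APICv
 * mode while always intercepting writes.
 */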
   3880
   3881static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
   3882{
   3883	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3884	u8 mode;
   3885
   3886	if (!cpu_has_vmx_msr_bitmap())
   3887		return;
   3888
   3889	if (cpu_has_secondary_exec_ctrls() &&
   3890	    (secondary_exec_controls_get(vmx) &
   3891	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
   3892		mode = MSR_BITMAP_MODE_X2APIC;
   3893		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
   3894			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
   3895	} else {
   3896		mode = 0;
   3897	}
   3898
   3899	if (mode == vmx->x2apic_msr_bitmap_mode)
   3900		return;
   3901
   3902	vmx->x2apic_msr_bitmap_mode = mode;
   3903
   3904	vmx_reset_x2apic_msrs(vcpu, mode);
   3905
   3906	/*
   3907	 * TPR reads and writes can be virtualized even if virtual interrupt
   3908	 * delivery is not in use.
   3909	 */
   3910	vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
   3911				  !(mode & MSR_BITMAP_MODE_X2APIC));
   3912
   3913	if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
   3914		vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
   3915		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
   3916		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
   3917	}
   3918}
   3919
   3920void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
   3921{
   3922	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3923	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
   3924	u32 i;
   3925
   3926	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
   3927	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
   3928	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
   3929	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
   3930	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
   3931		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
   3932		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
   3933	}
   3934}
   3935
   3936static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
   3937{
   3938	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3939	void *vapic_page;
   3940	u32 vppr;
   3941	int rvi;
   3942
   3943	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
   3944		!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
   3945		WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
   3946		return false;
   3947
   3948	rvi = vmx_get_rvi();
   3949
   3950	vapic_page = vmx->nested.virtual_apic_map.hva;
   3951	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
   3952
   3953	return ((rvi & 0xf0) > (vppr & 0xf0));
   3954}
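
/*
 * Priority-class sketch for the comparison above: vectors are grouped into
 * classes of 16, and a pending virtual interrupt is deliverable only when
 * its class exceeds the processor-priority class, hence comparing the high
 * nibbles.  E.g. rvi = 0x61 with vppr = 0x50 (class 6 vs. class 5) reports
 * a pending interrupt, while rvi = 0x55 with vppr = 0x50 does not.
 */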
   3955
   3956static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
   3957{
   3958	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3959	u32 i;
   3960
   3961	/*
   3962	 * Set intercept permissions for all potentially passed through MSRs
   3963	 * again. They will automatically get filtered through the MSR filter,
   3964	 * so we are back in sync after this.
   3965	 */
   3966	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
   3967		u32 msr = vmx_possible_passthrough_msrs[i];
   3968		bool read = test_bit(i, vmx->shadow_msr_intercept.read);
   3969		bool write = test_bit(i, vmx->shadow_msr_intercept.write);
   3970
   3971		vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
   3972		vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
   3973	}
   3974
   3975	pt_update_intercept_for_msr(vcpu);
   3976}
   3977
   3978static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
   3979						     int pi_vec)
   3980{
   3981#ifdef CONFIG_SMP
   3982	if (vcpu->mode == IN_GUEST_MODE) {
   3983		/*
   3984		 * The vector of the virtual interrupt has already been set in the PIR.
   3985		 * Send a notification event to deliver the virtual interrupt
   3986		 * unless the vCPU is the currently running vCPU, i.e. the
   3987		 * event is being sent from a fastpath VM-Exit handler, in
   3988		 * which case the PIR will be synced to the vIRR before
   3989		 * re-entering the guest.
   3990		 *
   3991		 * When the target is not the running vCPU, the following
   3992		 * possibilities emerge:
   3993		 *
   3994		 * Case 1: vCPU stays in non-root mode. Sending a notification
   3995		 * event posts the interrupt to the vCPU.
   3996		 *
   3997		 * Case 2: vCPU exits to root mode and is still runnable. The
   3998		 * PIR will be synced to the vIRR before re-entering the guest.
   3999		 * Sending a notification event is ok as the host IRQ handler
   4000		 * will ignore the spurious event.
   4001		 *
   4002		 * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
   4003		 * has already synced PIR to vIRR and never blocks the vCPU if
   4004		 * the vIRR is not empty. Therefore, a blocked vCPU here does
   4005		 * not wait for any requested interrupts in PIR, and sending a
   4006		 * notification event also results in a benign, spurious event.
   4007		 */
   4008
   4009		if (vcpu != kvm_get_running_vcpu())
   4010			apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
   4011		return;
   4012	}
   4013#endif
   4014	/*
   4015	 * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
   4016	 * otherwise do nothing as KVM will grab the highest priority pending
   4017	 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
   4018	 */
   4019	kvm_vcpu_wake_up(vcpu);
   4020}
   4021
   4022static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
   4023						int vector)
   4024{
   4025	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4026
   4027	if (is_guest_mode(vcpu) &&
   4028	    vector == vmx->nested.posted_intr_nv) {
   4029		/*
   4030		 * If the posted interrupt is not recognized by hardware,
   4031		 * it will be delivered on the next vmentry.
   4032		 */
   4033		vmx->nested.pi_pending = true;
   4034		kvm_make_request(KVM_REQ_EVENT, vcpu);
   4035
   4036		/*
   4037		 * This pairs with the smp_mb_*() after setting vcpu->mode in
   4038		 * vcpu_enter_guest() to guarantee the vCPU sees the event
   4039		 * request if triggering a posted interrupt "fails" because
   4040		 * vcpu->mode != IN_GUEST_MODE.  The extra barrier is needed as
   4041		 * the smp_wmb() in kvm_make_request() only ensures everything
   4042		 * done before making the request is visible when the request
   4043		 * is visible, it doesn't ensure ordering between the store to
   4044		 * vcpu->requests and the load from vcpu->mode.
   4045		 */
   4046		smp_mb__after_atomic();
   4047
   4048		/* the PIR and ON have been set by L1. */
   4049		kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
   4050		return 0;
   4051	}
   4052	return -1;
   4053}
   4054/*
   4055 * Send an interrupt to a vcpu via posted interrupts:
   4056 * 1. If the target vcpu is running (non-root mode), send a posted interrupt
   4057 * notification and hardware will sync the PIR to the vIRR atomically.
   4058 * 2. If the target vcpu isn't running (root mode), kick it to pick up the
   4059 * interrupt from the PIR on the next vmentry.
   4060 */
   4061static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
   4062{
   4063	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4064	int r;
   4065
   4066	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
   4067	if (!r)
   4068		return 0;
   4069
   4070	if (!vcpu->arch.apicv_active)
   4071		return -1;
   4072
   4073	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
   4074		return 0;
   4075
   4076	/* If a previous notification has sent the IPI, nothing to do.  */
   4077	if (pi_test_and_set_on(&vmx->pi_desc))
   4078		return 0;
   4079
   4080	/*
   4081	 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
   4082	 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
   4083	 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
   4084	 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
   4085	 */
   4086	kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
   4087	return 0;
   4088}
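
/*
 * Example flow for the non-nested path above, with an assumed vector 0xec:
 * pi_test_and_set_pir() sets bit 0xec in the 256-bit PIR; if PID.ON was
 * already set a notification IPI is already outstanding and nothing more
 * is needed, otherwise ON is set and POSTED_INTR_VECTOR is sent.  A CPU
 * that receives the notification in non-root mode clears ON, ORs the PIR
 * into the vIRR and evaluates pending virtual interrupts, so the guest
 * sees the interrupt without a VM-Exit.
 */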
   4089
   4090static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
   4091				  int trig_mode, int vector)
   4092{
   4093	struct kvm_vcpu *vcpu = apic->vcpu;
   4094
   4095	if (vmx_deliver_posted_interrupt(vcpu, vector)) {
   4096		kvm_lapic_set_irr(vector, apic);
   4097		kvm_make_request(KVM_REQ_EVENT, vcpu);
   4098		kvm_vcpu_kick(vcpu);
   4099	} else {
   4100		trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
   4101					   trig_mode, vector);
   4102	}
   4103}
   4104
   4105/*
   4106 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
   4107 * will not change in the lifetime of the guest.
   4108 * Note that host-state that does change is set elsewhere. E.g., host-state
   4109 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
   4110 */
   4111void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
   4112{
   4113	u32 low32, high32;
   4114	unsigned long tmpl;
   4115	unsigned long cr0, cr3, cr4;
   4116
   4117	cr0 = read_cr0();
   4118	WARN_ON(cr0 & X86_CR0_TS);
   4119	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
   4120
   4121	/*
   4122	 * Save the most likely value for this task's CR3 in the VMCS.
   4123	 * We can't use __get_current_cr3_fast() because we're not atomic.
   4124	 */
   4125	cr3 = __read_cr3();
   4126	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
   4127	vmx->loaded_vmcs->host_state.cr3 = cr3;
   4128
   4129	/* Save the most likely value for this task's CR4 in the VMCS. */
   4130	cr4 = cr4_read_shadow();
   4131	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
   4132	vmx->loaded_vmcs->host_state.cr4 = cr4;
   4133
   4134	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
   4135#ifdef CONFIG_X86_64
   4136	/*
   4137	 * Load null selectors, so we can avoid reloading them in
   4138	 * vmx_prepare_switch_to_host(), in case userspace uses
   4139	 * the null selectors too (the expected case).
   4140	 */
   4141	vmcs_write16(HOST_DS_SELECTOR, 0);
   4142	vmcs_write16(HOST_ES_SELECTOR, 0);
   4143#else
   4144	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
   4145	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
   4146#endif
   4147	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
   4148	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
   4149
   4150	vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
   4151
   4152	vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
   4153
   4154	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
   4155	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
   4156
   4157	/*
   4158	 * SYSENTER is used for 32-bit system calls on either 32-bit or
   4159	 * 64-bit kernels.  It is always zero if neither is allowed, otherwise
   4160	 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
   4161	 * have already done so!).
   4162	 */
   4163	if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
   4164		vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
   4165
   4166	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
   4167	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
   4168
   4169	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
   4170		rdmsr(MSR_IA32_CR_PAT, low32, high32);
   4171		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
   4172	}
   4173
   4174	if (cpu_has_load_ia32_efer())
   4175		vmcs_write64(HOST_IA32_EFER, host_efer);
   4176}
   4177
   4178void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
   4179{
   4180	struct kvm_vcpu *vcpu = &vmx->vcpu;
   4181
   4182	vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
   4183					  ~vcpu->arch.cr4_guest_rsvd_bits;
   4184	if (!enable_ept) {
   4185		vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
   4186		vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
   4187	}
   4188	if (is_guest_mode(&vmx->vcpu))
   4189		vcpu->arch.cr4_guest_owned_bits &=
   4190			~get_vmcs12(vcpu)->cr4_guest_host_mask;
   4191	vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
   4192}
   4193
   4194static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
   4195{
   4196	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
   4197
   4198	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
   4199		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
   4200
   4201	if (!enable_vnmi)
   4202		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
   4203
   4204	if (!enable_preemption_timer)
   4205		pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
   4206
   4207	return pin_based_exec_ctrl;
   4208}
   4209
   4210static u32 vmx_vmentry_ctrl(void)
   4211{
   4212	u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
   4213
   4214	if (vmx_pt_mode_is_system())
   4215		vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
   4216				  VM_ENTRY_LOAD_IA32_RTIT_CTL);
   4217	/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
   4218	return vmentry_ctrl &
   4219		~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
   4220}
   4221
   4222static u32 vmx_vmexit_ctrl(void)
   4223{
   4224	u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
   4225
   4226	if (vmx_pt_mode_is_system())
   4227		vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
   4228				 VM_EXIT_CLEAR_IA32_RTIT_CTL);
   4229	/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
   4230	return vmexit_ctrl &
   4231		~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
   4232}
   4233
   4234static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
   4235{
   4236	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4237
   4238	if (is_guest_mode(vcpu)) {
   4239		vmx->nested.update_vmcs01_apicv_status = true;
   4240		return;
   4241	}
   4242
   4243	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
   4244	if (cpu_has_secondary_exec_ctrls()) {
   4245		if (kvm_vcpu_apicv_active(vcpu))
   4246			secondary_exec_controls_setbit(vmx,
   4247				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
   4248				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
   4249		else
   4250			secondary_exec_controls_clearbit(vmx,
   4251					SECONDARY_EXEC_APIC_REGISTER_VIRT |
   4252					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
   4253	}
   4254
   4255	vmx_update_msr_bitmap_x2apic(vcpu);
   4256}
   4257
   4258static u32 vmx_exec_control(struct vcpu_vmx *vmx)
   4259{
   4260	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
   4261
   4262	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
   4263		exec_control &= ~CPU_BASED_MOV_DR_EXITING;
   4264
   4265	if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
   4266		exec_control &= ~CPU_BASED_TPR_SHADOW;
   4267#ifdef CONFIG_X86_64
   4268		exec_control |= CPU_BASED_CR8_STORE_EXITING |
   4269				CPU_BASED_CR8_LOAD_EXITING;
   4270#endif
   4271	}
   4272	if (!enable_ept)
   4273		exec_control |= CPU_BASED_CR3_STORE_EXITING |
   4274				CPU_BASED_CR3_LOAD_EXITING  |
   4275				CPU_BASED_INVLPG_EXITING;
   4276	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
   4277		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
   4278				CPU_BASED_MONITOR_EXITING);
   4279	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
   4280		exec_control &= ~CPU_BASED_HLT_EXITING;
   4281	return exec_control;
   4282}
   4283
   4284/*
   4285 * Adjust a single secondary execution control bit to intercept/allow an
   4286 * instruction in the guest.  This is usually done based on whether or not a
   4287 * feature has been exposed to the guest in order to correctly emulate faults.
   4288 */
   4289static inline void
   4290vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
   4291				  u32 control, bool enabled, bool exiting)
   4292{
   4293	/*
   4294	 * If the control is for an opt-in feature, clear the control if the
   4295	 * feature is not exposed to the guest, i.e. not enabled.  If the
   4296	 * control is opt-out, i.e. an exiting control, clear the control if
   4297	 * the feature _is_ exposed to the guest, i.e. exiting/interception is
   4298	 * disabled for the associated instruction.  Note, the caller is
   4299	 * responsible for presetting exec_control to set all supported bits.
   4300	 */
   4301	if (enabled == exiting)
   4302		*exec_control &= ~control;
   4303
   4304	/*
   4305	 * Update the nested MSR settings so that a nested VMM can/can't set
   4306	 * controls for features that are/aren't exposed to the guest.
   4307	 */
   4308	if (nested) {
   4309		if (enabled)
   4310			vmx->nested.msrs.secondary_ctls_high |= control;
   4311		else
   4312			vmx->nested.msrs.secondary_ctls_high &= ~control;
   4313	}
   4314}
   4315
   4316/*
   4317 * Wrapper macro for the common case of adjusting a secondary execution control
   4318 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
   4319 * verifies that the control is actually supported by KVM and hardware.
   4320 */
   4321#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
   4322({									 \
   4323	bool __enabled;							 \
   4324									 \
   4325	if (cpu_has_vmx_##name()) {					 \
   4326		__enabled = guest_cpuid_has(&(vmx)->vcpu,		 \
   4327					    X86_FEATURE_##feat_name);	 \
   4328		vmx_adjust_secondary_exec_control(vmx, exec_control,	 \
   4329			SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
   4330	}								 \
   4331})
   4332
   4333/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
   4334#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
   4335	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
   4336
   4337#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
   4338	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
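
/*
 * Expansion sketch for the macros above, using the invpcid/INVPCID and
 * rdrand/RDRAND invocations found in vmx_secondary_exec_control() below:
 * vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID)
 * becomes a check of cpu_has_vmx_invpcid() and
 * guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) that clears
 * SECONDARY_EXEC_ENABLE_INVPCID when the guest lacks INVPCID, while
 * vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND) clears
 * SECONDARY_EXEC_RDRAND_EXITING when RDRAND *is* exposed, since for an
 * exiting control "enabled == exiting" means interception is not wanted.
 */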
   4339
   4340static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
   4341{
   4342	struct kvm_vcpu *vcpu = &vmx->vcpu;
   4343
   4344	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
   4345
   4346	if (vmx_pt_mode_is_system())
   4347		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
   4348	if (!cpu_need_virtualize_apic_accesses(vcpu))
   4349		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
   4350	if (vmx->vpid == 0)
   4351		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
   4352	if (!enable_ept) {
   4353		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
   4354		enable_unrestricted_guest = 0;
   4355	}
   4356	if (!enable_unrestricted_guest)
   4357		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
   4358	if (kvm_pause_in_guest(vmx->vcpu.kvm))
   4359		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
   4360	if (!kvm_vcpu_apicv_active(vcpu))
   4361		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
   4362				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
   4363	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
   4364
   4365	/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
   4366	 * in vmx_set_cr4.  */
   4367	exec_control &= ~SECONDARY_EXEC_DESC;
   4368
   4369	/*
   4370	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
   4371	 * (handle_vmptrld).  We can NOT enable shadow_vmcs here because we
   4372	 * don't yet have a current VMCS12.
   4373	 */
   4374	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
   4375
   4376	/*
   4377	 * PML is enabled/disabled when dirty logging of memslots changes, but
   4378	 * it needs to be set here when dirty logging is already active, e.g.
   4379	 * if this vCPU was created after dirty logging was enabled.
   4380	 */
   4381	if (!vcpu->kvm->arch.cpu_dirty_logging_count)
   4382		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
   4383
   4384	if (cpu_has_vmx_xsaves()) {
   4385		/* Exposing XSAVES only when XSAVE is exposed */
   4386		bool xsaves_enabled =
   4387			boot_cpu_has(X86_FEATURE_XSAVE) &&
   4388			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
   4389			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
   4390
   4391		vcpu->arch.xsaves_enabled = xsaves_enabled;
   4392
   4393		vmx_adjust_secondary_exec_control(vmx, &exec_control,
   4394						  SECONDARY_EXEC_XSAVES,
   4395						  xsaves_enabled, false);
   4396	}
   4397
   4398	/*
   4399	 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
   4400	 * feature is exposed to the guest.  This creates a virtualization hole
   4401	 * if both are supported in hardware but only one is exposed to the
   4402	 * guest, but letting the guest execute RDTSCP or RDPID when either one
   4403	 * is advertised is preferable to emulating the advertised instruction
   4404	 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
   4405	 */
   4406	if (cpu_has_vmx_rdtscp()) {
   4407		bool rdpid_or_rdtscp_enabled =
   4408			guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
   4409			guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
   4410
   4411		vmx_adjust_secondary_exec_control(vmx, &exec_control,
   4412						  SECONDARY_EXEC_ENABLE_RDTSCP,
   4413						  rdpid_or_rdtscp_enabled, false);
   4414	}
   4415	vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
   4416
   4417	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
   4418	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
   4419
   4420	vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
   4421				    ENABLE_USR_WAIT_PAUSE, false);
   4422
   4423	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
   4424		exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
   4425
   4426	return exec_control;
   4427}
   4428
   4429#define VMX_XSS_EXIT_BITMAP 0
   4430
   4431static void init_vmcs(struct vcpu_vmx *vmx)
   4432{
   4433	if (nested)
   4434		nested_vmx_set_vmcs_shadowing_bitmap();
   4435
   4436	if (cpu_has_vmx_msr_bitmap())
   4437		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
   4438
   4439	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
   4440
   4441	/* Control */
   4442	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
   4443
   4444	exec_controls_set(vmx, vmx_exec_control(vmx));
   4445
   4446	if (cpu_has_secondary_exec_ctrls())
   4447		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
   4448
   4449	if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
   4450		vmcs_write64(EOI_EXIT_BITMAP0, 0);
   4451		vmcs_write64(EOI_EXIT_BITMAP1, 0);
   4452		vmcs_write64(EOI_EXIT_BITMAP2, 0);
   4453		vmcs_write64(EOI_EXIT_BITMAP3, 0);
   4454
   4455		vmcs_write16(GUEST_INTR_STATUS, 0);
   4456
   4457		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
   4458		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
   4459	}
   4460
   4461	if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
   4462		vmcs_write32(PLE_GAP, ple_gap);
   4463		vmx->ple_window = ple_window;
   4464		vmx->ple_window_dirty = true;
   4465	}
   4466
   4467	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
   4468	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
   4469	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
   4470
   4471	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
   4472	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
   4473	vmx_set_constant_host_state(vmx);
   4474	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
   4475	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
   4476
   4477	if (cpu_has_vmx_vmfunc())
   4478		vmcs_write64(VM_FUNCTION_CONTROL, 0);
   4479
   4480	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
   4481	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
   4482	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
   4483	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
   4484	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
   4485
   4486	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
   4487		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
   4488
   4489	vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
   4490
   4491	/* 22.2.1, 20.8.1 */
   4492	vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
   4493
   4494	vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
   4495	vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
   4496
   4497	set_cr4_guest_host_mask(vmx);
   4498
   4499	if (vmx->vpid != 0)
   4500		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
   4501
   4502	if (cpu_has_vmx_xsaves())
   4503		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
   4504
   4505	if (enable_pml) {
   4506		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
   4507		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
   4508	}
   4509
   4510	vmx_write_encls_bitmap(&vmx->vcpu, NULL);
   4511
   4512	if (vmx_pt_mode_is_host_guest()) {
   4513		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
   4514		/* Bits[6:0] are forced to 1, writes are ignored. */
   4515		vmx->pt_desc.guest.output_mask = 0x7F;
   4516		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
   4517	}
   4518
   4519	vmcs_write32(GUEST_SYSENTER_CS, 0);
   4520	vmcs_writel(GUEST_SYSENTER_ESP, 0);
   4521	vmcs_writel(GUEST_SYSENTER_EIP, 0);
   4522	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
   4523
   4524	if (cpu_has_vmx_tpr_shadow()) {
   4525		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
   4526		if (cpu_need_tpr_shadow(&vmx->vcpu))
   4527			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
   4528				     __pa(vmx->vcpu.arch.apic->regs));
   4529		vmcs_write32(TPR_THRESHOLD, 0);
   4530	}
   4531
   4532	vmx_setup_uret_msrs(vmx);
   4533}
   4534
   4535static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
   4536{
   4537	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4538
   4539	init_vmcs(vmx);
   4540
   4541	if (nested)
   4542		memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
   4543
   4544	vcpu_setup_sgx_lepubkeyhash(vcpu);
   4545
   4546	vmx->nested.posted_intr_nv = -1;
   4547	vmx->nested.vmxon_ptr = INVALID_GPA;
   4548	vmx->nested.current_vmptr = INVALID_GPA;
   4549	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
   4550
   4551	vcpu->arch.microcode_version = 0x100000000ULL;
   4552	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
   4553
   4554	/*
   4555	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
   4556	 * or POSTED_INTR_WAKEUP_VECTOR.
   4557	 */
   4558	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
   4559	vmx->pi_desc.sn = 1;
   4560}
   4561
   4562static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
   4563{
   4564	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4565
   4566	if (!init_event)
   4567		__vmx_vcpu_reset(vcpu);
   4568
   4569	vmx->rmode.vm86_active = 0;
   4570	vmx->spec_ctrl = 0;
   4571
   4572	vmx->msr_ia32_umwait_control = 0;
   4573
   4574	vmx->hv_deadline_tsc = -1;
   4575	kvm_set_cr8(vcpu, 0);
   4576
   4577	vmx_segment_cache_clear(vmx);
   4578	kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
   4579
   4580	seg_setup(VCPU_SREG_CS);
   4581	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
   4582	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
   4583
   4584	seg_setup(VCPU_SREG_DS);
   4585	seg_setup(VCPU_SREG_ES);
   4586	seg_setup(VCPU_SREG_FS);
   4587	seg_setup(VCPU_SREG_GS);
   4588	seg_setup(VCPU_SREG_SS);
   4589
   4590	vmcs_write16(GUEST_TR_SELECTOR, 0);
   4591	vmcs_writel(GUEST_TR_BASE, 0);
   4592	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
   4593	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
   4594
   4595	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
   4596	vmcs_writel(GUEST_LDTR_BASE, 0);
   4597	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
   4598	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
   4599
   4600	vmcs_writel(GUEST_GDTR_BASE, 0);
   4601	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
   4602
   4603	vmcs_writel(GUEST_IDTR_BASE, 0);
   4604	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
   4605
   4606	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
   4607	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
   4608	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
   4609	if (kvm_mpx_supported())
   4610		vmcs_write64(GUEST_BNDCFGS, 0);
   4611
   4612	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
   4613
   4614	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
   4615
   4616	vpid_sync_context(vmx->vpid);
   4617
   4618	vmx_update_fb_clear_dis(vcpu, vmx);
   4619}
   4620
   4621static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
   4622{
   4623	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
   4624}
   4625
   4626static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
   4627{
   4628	if (!enable_vnmi ||
   4629	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
   4630		vmx_enable_irq_window(vcpu);
   4631		return;
   4632	}
   4633
   4634	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
   4635}
   4636
   4637static void vmx_inject_irq(struct kvm_vcpu *vcpu)
   4638{
   4639	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4640	uint32_t intr;
   4641	int irq = vcpu->arch.interrupt.nr;
   4642
   4643	trace_kvm_inj_virq(irq);
   4644
   4645	++vcpu->stat.irq_injections;
   4646	if (vmx->rmode.vm86_active) {
   4647		int inc_eip = 0;
   4648		if (vcpu->arch.interrupt.soft)
   4649			inc_eip = vcpu->arch.event_exit_inst_len;
   4650		kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
   4651		return;
   4652	}
   4653	intr = irq | INTR_INFO_VALID_MASK;
   4654	if (vcpu->arch.interrupt.soft) {
   4655		intr |= INTR_TYPE_SOFT_INTR;
   4656		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
   4657			     vmx->vcpu.arch.event_exit_inst_len);
   4658	} else
   4659		intr |= INTR_TYPE_EXT_INTR;
   4660	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
   4661
   4662	vmx_clear_hlt(vcpu);
   4663}
   4664
   4665static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
   4666{
   4667	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4668
   4669	if (!enable_vnmi) {
   4670		/*
   4671		 * Tracking the NMI-blocked state in software is built upon
   4672		 * finding the next open IRQ window. This, in turn, depends on
   4673		 * well-behaving guests: They have to keep IRQs disabled at
   4674		 * least as long as the NMI handler runs. Otherwise we may
   4675		 * cause NMI nesting, maybe breaking the guest. But as this is
   4676		 * highly unlikely, we can live with the residual risk.
   4677		 */
   4678		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
   4679		vmx->loaded_vmcs->vnmi_blocked_time = 0;
   4680	}
   4681
   4682	++vcpu->stat.nmi_injections;
   4683	vmx->loaded_vmcs->nmi_known_unmasked = false;
   4684
   4685	if (vmx->rmode.vm86_active) {
   4686		kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
   4687		return;
   4688	}
   4689
   4690	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
   4691			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
   4692
   4693	vmx_clear_hlt(vcpu);
   4694}
   4695
   4696bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
   4697{
   4698	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4699	bool masked;
   4700
   4701	if (!enable_vnmi)
   4702		return vmx->loaded_vmcs->soft_vnmi_blocked;
   4703	if (vmx->loaded_vmcs->nmi_known_unmasked)
   4704		return false;
   4705	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
   4706	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
   4707	return masked;
   4708}
   4709
   4710void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
   4711{
   4712	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4713
   4714	if (!enable_vnmi) {
   4715		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
   4716			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
   4717			vmx->loaded_vmcs->vnmi_blocked_time = 0;
   4718		}
   4719	} else {
   4720		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
   4721		if (masked)
   4722			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
   4723				      GUEST_INTR_STATE_NMI);
   4724		else
   4725			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
   4726					GUEST_INTR_STATE_NMI);
   4727	}
   4728}
   4729
   4730bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
   4731{
   4732	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
   4733		return false;
   4734
   4735	if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
   4736		return true;
   4737
   4738	return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
   4739		(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
   4740		 GUEST_INTR_STATE_NMI));
   4741}
   4742
   4743static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
   4744{
   4745	if (to_vmx(vcpu)->nested.nested_run_pending)
   4746		return -EBUSY;
   4747
   4748	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
   4749	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
   4750		return -EBUSY;
   4751
   4752	return !vmx_nmi_blocked(vcpu);
   4753}
   4754
   4755bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
   4756{
   4757	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
   4758		return false;
   4759
   4760	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
   4761	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
   4762		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
   4763}
   4764
   4765static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
   4766{
   4767	if (to_vmx(vcpu)->nested.nested_run_pending)
   4768		return -EBUSY;
   4769
   4770	/*
   4771	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
   4772	 * e.g. if the IRQ arrived asynchronously after checking nested events.
   4773	 */
   4774	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
   4775		return -EBUSY;
   4776
   4777	return !vmx_interrupt_blocked(vcpu);
   4778}
   4779
   4780static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
   4781{
   4782	void __user *ret;
   4783
   4784	if (enable_unrestricted_guest)
   4785		return 0;
   4786
   4787	mutex_lock(&kvm->slots_lock);
   4788	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
   4789				      PAGE_SIZE * 3);
   4790	mutex_unlock(&kvm->slots_lock);
   4791
   4792	if (IS_ERR(ret))
   4793		return PTR_ERR(ret);
   4794
   4795	to_kvm_vmx(kvm)->tss_addr = addr;
   4796
   4797	return init_rmode_tss(kvm, ret);
   4798}
   4799
   4800static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
   4801{
   4802	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
   4803	return 0;
   4804}
   4805
   4806static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
   4807{
   4808	switch (vec) {
   4809	case BP_VECTOR:
   4810		/*
   4811		 * Update instruction length as we may reinject the exception
   4812		 * from user space while in guest debugging mode.
   4813		 */
   4814		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
   4815			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
   4816		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
   4817			return false;
   4818		fallthrough;
   4819	case DB_VECTOR:
   4820		return !(vcpu->guest_debug &
   4821			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
   4822	case DE_VECTOR:
   4823	case OF_VECTOR:
   4824	case BR_VECTOR:
   4825	case UD_VECTOR:
   4826	case DF_VECTOR:
   4827	case SS_VECTOR:
   4828	case GP_VECTOR:
   4829	case MF_VECTOR:
   4830		return true;
   4831	}
   4832	return false;
   4833}
   4834
   4835static int handle_rmode_exception(struct kvm_vcpu *vcpu,
   4836				  int vec, u32 err_code)
   4837{
   4838	/*
   4839	 * An instruction with the address-size override prefix (opcode 0x67)
   4840	 * causes a #SS fault with error code 0 in VM86 mode.
   4841	 */
   4842	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
   4843		if (kvm_emulate_instruction(vcpu, 0)) {
   4844			if (vcpu->arch.halt_request) {
   4845				vcpu->arch.halt_request = 0;
   4846				return kvm_emulate_halt_noskip(vcpu);
   4847			}
   4848			return 1;
   4849		}
   4850		return 0;
   4851	}
   4852
   4853	/*
   4854	 * Forward all other exceptions that are valid in real mode.
   4855	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
   4856	 *        the required debugging infrastructure rework.
   4857	 */
   4858	kvm_queue_exception(vcpu, vec);
   4859	return 1;
   4860}
   4861
   4862static int handle_machine_check(struct kvm_vcpu *vcpu)
   4863{
   4864	/* handled by vmx_vcpu_run() */
   4865	return 1;
   4866}
   4867
   4868/*
   4869 * If the host has split lock detection disabled, then #AC is
   4870 * unconditionally injected into the guest, which is the pre split lock
   4871 * detection behaviour.
   4872 *
   4873 * If the host has split lock detection enabled then #AC is
   4874 * only injected into the guest when:
   4875 *  - Guest CPL == 3 (user mode)
   4876 *  - Guest has #AC detection enabled in CR0
   4877 *  - Guest EFLAGS has AC bit set
   4878 */
   4879bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
   4880{
   4881	if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
   4882		return true;
   4883
   4884	return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
   4885	       (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
   4886}
   4887
   4888static int handle_exception_nmi(struct kvm_vcpu *vcpu)
   4889{
   4890	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4891	struct kvm_run *kvm_run = vcpu->run;
   4892	u32 intr_info, ex_no, error_code;
   4893	unsigned long cr2, dr6;
   4894	u32 vect_info;
   4895
   4896	vect_info = vmx->idt_vectoring_info;
   4897	intr_info = vmx_get_intr_info(vcpu);
   4898
   4899	if (is_machine_check(intr_info) || is_nmi(intr_info))
   4900		return 1; /* handled by handle_exception_nmi_irqoff() */
   4901
   4902	/*
   4903	 * Queue the exception here instead of in handle_nm_fault_irqoff().
   4904	 * This ensures the nested_vmx check is not skipped so vmexit can
   4905	 * be reflected to L1 (when it intercepts #NM) before reaching this
   4906	 * point.
   4907	 */
   4908	if (is_nm_fault(intr_info)) {
   4909		kvm_queue_exception(vcpu, NM_VECTOR);
   4910		return 1;
   4911	}
   4912
   4913	if (is_invalid_opcode(intr_info))
   4914		return handle_ud(vcpu);
   4915
   4916	error_code = 0;
   4917	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
   4918		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
   4919
   4920	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
   4921		WARN_ON_ONCE(!enable_vmware_backdoor);
   4922
   4923		/*
   4924		 * VMware backdoor emulation on #GP interception only handles
   4925		 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
   4926		 * error code on #GP.
   4927		 */
   4928		if (error_code) {
   4929			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
   4930			return 1;
   4931		}
   4932		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
   4933	}
   4934
    4935	/*
    4936	 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
    4937	 * MMIO; it is better to report an internal error.
    4938	 * See the comments in vmx_handle_exit.
    4939	 */
   4940	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
   4941	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
   4942		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   4943		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
   4944		vcpu->run->internal.ndata = 4;
   4945		vcpu->run->internal.data[0] = vect_info;
   4946		vcpu->run->internal.data[1] = intr_info;
   4947		vcpu->run->internal.data[2] = error_code;
   4948		vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
   4949		return 0;
   4950	}
   4951
   4952	if (is_page_fault(intr_info)) {
   4953		cr2 = vmx_get_exit_qual(vcpu);
   4954		if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
    4955			/*
    4956			 * EPT will cause a page fault here only if we need
    4957			 * to detect illegal GPAs.
    4958			 */
   4959			WARN_ON_ONCE(!allow_smaller_maxphyaddr);
   4960			kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
   4961			return 1;
   4962		} else
   4963			return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
   4964	}
   4965
   4966	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
   4967
   4968	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
   4969		return handle_rmode_exception(vcpu, ex_no, error_code);
   4970
   4971	switch (ex_no) {
   4972	case DB_VECTOR:
   4973		dr6 = vmx_get_exit_qual(vcpu);
   4974		if (!(vcpu->guest_debug &
   4975		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
   4976			/*
   4977			 * If the #DB was due to ICEBP, a.k.a. INT1, skip the
   4978			 * instruction.  ICEBP generates a trap-like #DB, but
   4979			 * despite its interception control being tied to #DB,
   4980			 * is an instruction intercept, i.e. the VM-Exit occurs
   4981			 * on the ICEBP itself.  Note, skipping ICEBP also
   4982			 * clears STI and MOVSS blocking.
   4983			 *
   4984			 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
   4985			 * if single-step is enabled in RFLAGS and STI or MOVSS
   4986			 * blocking is active, as the CPU doesn't set the bit
   4987			 * on VM-Exit due to #DB interception.  VM-Entry has a
   4988			 * consistency check that a single-step #DB is pending
   4989			 * in this scenario as the previous instruction cannot
   4990			 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
   4991			 * don't modify RFLAGS), therefore the one instruction
   4992			 * delay when activating single-step breakpoints must
   4993			 * have already expired.  Note, the CPU sets/clears BS
   4994			 * as appropriate for all other VM-Exits types.
   4995			 */
   4996			if (is_icebp(intr_info))
   4997				WARN_ON(!skip_emulated_instruction(vcpu));
   4998			else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
   4999				 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
   5000				  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
   5001				vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
   5002					    vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
   5003
   5004			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
   5005			return 1;
   5006		}
   5007		kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
   5008		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
   5009		fallthrough;
   5010	case BP_VECTOR:
   5011		/*
   5012		 * Update instruction length as we may reinject #BP from
   5013		 * user space while in guest debugging mode. Reading it for
   5014		 * #DB as well causes no harm, it is not used in that case.
   5015		 */
   5016		vmx->vcpu.arch.event_exit_inst_len =
   5017			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
   5018		kvm_run->exit_reason = KVM_EXIT_DEBUG;
   5019		kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
   5020		kvm_run->debug.arch.exception = ex_no;
   5021		break;
   5022	case AC_VECTOR:
   5023		if (vmx_guest_inject_ac(vcpu)) {
   5024			kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
   5025			return 1;
   5026		}
   5027
   5028		/*
   5029		 * Handle split lock. Depending on detection mode this will
   5030		 * either warn and disable split lock detection for this
   5031		 * task or force SIGBUS on it.
   5032		 */
   5033		if (handle_guest_split_lock(kvm_rip_read(vcpu)))
   5034			return 1;
   5035		fallthrough;
   5036	default:
   5037		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
   5038		kvm_run->ex.exception = ex_no;
   5039		kvm_run->ex.error_code = error_code;
   5040		break;
   5041	}
   5042	return 0;
   5043}
   5044
   5045static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
   5046{
   5047	++vcpu->stat.irq_exits;
   5048	return 1;
   5049}
   5050
   5051static int handle_triple_fault(struct kvm_vcpu *vcpu)
   5052{
   5053	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
   5054	vcpu->mmio_needed = 0;
   5055	return 0;
   5056}
   5057
   5058static int handle_io(struct kvm_vcpu *vcpu)
   5059{
   5060	unsigned long exit_qualification;
   5061	int size, in, string;
   5062	unsigned port;
   5063
   5064	exit_qualification = vmx_get_exit_qual(vcpu);
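        	/*
        	 * I/O exit qualification (SDM): bits 2:0 = access size - 1,
        	 * bit 3 = direction (1 = IN), bit 4 = string instruction,
        	 * bits 31:16 = port number.
        	 */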
   5065	string = (exit_qualification & 16) != 0;
   5066
   5067	++vcpu->stat.io_exits;
   5068
   5069	if (string)
   5070		return kvm_emulate_instruction(vcpu, 0);
   5071
   5072	port = exit_qualification >> 16;
   5073	size = (exit_qualification & 7) + 1;
   5074	in = (exit_qualification & 8) != 0;
   5075
   5076	return kvm_fast_pio(vcpu, size, port, in);
   5077}
   5078
   5079static void
   5080vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
   5081{
   5082	/*
   5083	 * Patch in the VMCALL instruction:
   5084	 */
   5085	hypercall[0] = 0x0f;
   5086	hypercall[1] = 0x01;
   5087	hypercall[2] = 0xc1;
   5088}
   5089
   5090/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
   5091static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
   5092{
   5093	if (is_guest_mode(vcpu)) {
   5094		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   5095		unsigned long orig_val = val;
   5096
   5097		/*
   5098		 * We get here when L2 changed cr0 in a way that did not change
   5099		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
   5100		 * but did change L0 shadowed bits. So we first calculate the
   5101		 * effective cr0 value that L1 would like to write into the
   5102		 * hardware. It consists of the L2-owned bits from the new
   5103		 * value combined with the L1-owned bits from L1's guest_cr0.
   5104		 */
   5105		val = (val & ~vmcs12->cr0_guest_host_mask) |
   5106			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
   5107
   5108		if (!nested_guest_cr0_valid(vcpu, val))
   5109			return 1;
   5110
   5111		if (kvm_set_cr0(vcpu, val))
   5112			return 1;
   5113		vmcs_writel(CR0_READ_SHADOW, orig_val);
   5114		return 0;
   5115	} else {
   5116		if (to_vmx(vcpu)->nested.vmxon &&
   5117		    !nested_host_cr0_valid(vcpu, val))
   5118			return 1;
   5119
   5120		return kvm_set_cr0(vcpu, val);
   5121	}
   5122}
   5123
   5124static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
   5125{
   5126	if (is_guest_mode(vcpu)) {
   5127		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   5128		unsigned long orig_val = val;
   5129
   5130		/* analogously to handle_set_cr0 */
   5131		val = (val & ~vmcs12->cr4_guest_host_mask) |
   5132			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
   5133		if (kvm_set_cr4(vcpu, val))
   5134			return 1;
   5135		vmcs_writel(CR4_READ_SHADOW, orig_val);
   5136		return 0;
   5137	} else
   5138		return kvm_set_cr4(vcpu, val);
   5139}
   5140
   5141static int handle_desc(struct kvm_vcpu *vcpu)
   5142{
   5143	WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
   5144	return kvm_emulate_instruction(vcpu, 0);
   5145}
   5146
   5147static int handle_cr(struct kvm_vcpu *vcpu)
   5148{
   5149	unsigned long exit_qualification, val;
   5150	int cr;
   5151	int reg;
   5152	int err;
   5153	int ret;
   5154
   5155	exit_qualification = vmx_get_exit_qual(vcpu);
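        	/*
        	 * CR-access exit qualification (SDM): bits 3:0 = CR number,
        	 * bits 5:4 = access type (0 = MOV to CR, 1 = MOV from CR,
        	 * 2 = CLTS, 3 = LMSW), bits 11:8 = GPR operand,
        	 * bits 31:16 = LMSW source data.
        	 */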
   5156	cr = exit_qualification & 15;
   5157	reg = (exit_qualification >> 8) & 15;
   5158	switch ((exit_qualification >> 4) & 3) {
   5159	case 0: /* mov to cr */
   5160		val = kvm_register_read(vcpu, reg);
   5161		trace_kvm_cr_write(cr, val);
   5162		switch (cr) {
   5163		case 0:
   5164			err = handle_set_cr0(vcpu, val);
   5165			return kvm_complete_insn_gp(vcpu, err);
   5166		case 3:
   5167			WARN_ON_ONCE(enable_unrestricted_guest);
   5168
   5169			err = kvm_set_cr3(vcpu, val);
   5170			return kvm_complete_insn_gp(vcpu, err);
   5171		case 4:
   5172			err = handle_set_cr4(vcpu, val);
   5173			return kvm_complete_insn_gp(vcpu, err);
   5174		case 8: {
   5175				u8 cr8_prev = kvm_get_cr8(vcpu);
   5176				u8 cr8 = (u8)val;
   5177				err = kvm_set_cr8(vcpu, cr8);
   5178				ret = kvm_complete_insn_gp(vcpu, err);
   5179				if (lapic_in_kernel(vcpu))
   5180					return ret;
   5181				if (cr8_prev <= cr8)
   5182					return ret;
   5183				/*
   5184				 * TODO: we might be squashing a
   5185				 * KVM_GUESTDBG_SINGLESTEP-triggered
   5186				 * KVM_EXIT_DEBUG here.
   5187				 */
   5188				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
   5189				return 0;
   5190			}
   5191		}
   5192		break;
   5193	case 2: /* clts */
   5194		KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
   5195		return -EIO;
    5196	case 1: /* mov from cr */
   5197		switch (cr) {
   5198		case 3:
   5199			WARN_ON_ONCE(enable_unrestricted_guest);
   5200
   5201			val = kvm_read_cr3(vcpu);
   5202			kvm_register_write(vcpu, reg, val);
   5203			trace_kvm_cr_read(cr, val);
   5204			return kvm_skip_emulated_instruction(vcpu);
   5205		case 8:
   5206			val = kvm_get_cr8(vcpu);
   5207			kvm_register_write(vcpu, reg, val);
   5208			trace_kvm_cr_read(cr, val);
   5209			return kvm_skip_emulated_instruction(vcpu);
   5210		}
   5211		break;
   5212	case 3: /* lmsw */
   5213		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
   5214		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
   5215		kvm_lmsw(vcpu, val);
   5216
   5217		return kvm_skip_emulated_instruction(vcpu);
   5218	default:
   5219		break;
   5220	}
   5221	vcpu->run->exit_reason = 0;
   5222	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
   5223	       (int)(exit_qualification >> 4) & 3, cr);
   5224	return 0;
   5225}
   5226
   5227static int handle_dr(struct kvm_vcpu *vcpu)
   5228{
   5229	unsigned long exit_qualification;
   5230	int dr, dr7, reg;
   5231	int err = 1;
   5232
   5233	exit_qualification = vmx_get_exit_qual(vcpu);
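        	/*
        	 * MOV-DR exit qualification (SDM): bits 2:0 = debug register
        	 * number, bit 4 = direction (1 = MOV from DR), bits 11:8 = GPR
        	 * operand.
        	 */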
   5234	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
   5235
   5236	/* First, if DR does not exist, trigger UD */
   5237	if (!kvm_require_dr(vcpu, dr))
   5238		return 1;
   5239
   5240	if (vmx_get_cpl(vcpu) > 0)
   5241		goto out;
   5242
   5243	dr7 = vmcs_readl(GUEST_DR7);
   5244	if (dr7 & DR7_GD) {
   5245		/*
   5246		 * As the vm-exit takes precedence over the debug trap, we
   5247		 * need to emulate the latter, either for the host or the
   5248		 * guest debugging itself.
   5249		 */
   5250		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
   5251			vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
   5252			vcpu->run->debug.arch.dr7 = dr7;
   5253			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
   5254			vcpu->run->debug.arch.exception = DB_VECTOR;
   5255			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
   5256			return 0;
   5257		} else {
   5258			kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
   5259			return 1;
   5260		}
   5261	}
   5262
   5263	if (vcpu->guest_debug == 0) {
   5264		exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
   5265
   5266		/*
   5267		 * No more DR vmexits; force a reload of the debug registers
   5268		 * and reenter on this instruction.  The next vmexit will
   5269		 * retrieve the full state of the debug registers.
   5270		 */
   5271		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
   5272		return 1;
   5273	}
   5274
   5275	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
   5276	if (exit_qualification & TYPE_MOV_FROM_DR) {
   5277		unsigned long val;
   5278
   5279		kvm_get_dr(vcpu, dr, &val);
   5280		kvm_register_write(vcpu, reg, val);
   5281		err = 0;
   5282	} else {
   5283		err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
   5284	}
   5285
   5286out:
   5287	return kvm_complete_insn_gp(vcpu, err);
   5288}
   5289
   5290static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
   5291{
   5292	get_debugreg(vcpu->arch.db[0], 0);
   5293	get_debugreg(vcpu->arch.db[1], 1);
   5294	get_debugreg(vcpu->arch.db[2], 2);
   5295	get_debugreg(vcpu->arch.db[3], 3);
   5296	get_debugreg(vcpu->arch.dr6, 6);
   5297	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
   5298
   5299	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
   5300	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
   5301
    5302	/*
    5303	 * exc_debug expects dr6 to be cleared after it runs; avoid letting it
    5304	 * see a stale dr6 from the guest.
    5305	 */
   5306	set_debugreg(DR6_RESERVED, 6);
   5307}
   5308
   5309static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
   5310{
   5311	vmcs_writel(GUEST_DR7, val);
   5312}
   5313
   5314static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
   5315{
   5316	kvm_apic_update_ppr(vcpu);
   5317	return 1;
   5318}
   5319
   5320static int handle_interrupt_window(struct kvm_vcpu *vcpu)
   5321{
   5322	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
   5323
   5324	kvm_make_request(KVM_REQ_EVENT, vcpu);
   5325
   5326	++vcpu->stat.irq_window_exits;
   5327	return 1;
   5328}
   5329
   5330static int handle_invlpg(struct kvm_vcpu *vcpu)
   5331{
   5332	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
   5333
   5334	kvm_mmu_invlpg(vcpu, exit_qualification);
   5335	return kvm_skip_emulated_instruction(vcpu);
   5336}
   5337
   5338static int handle_apic_access(struct kvm_vcpu *vcpu)
   5339{
   5340	if (likely(fasteoi)) {
   5341		unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
   5342		int access_type, offset;
   5343
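        		/*
        		 * APIC-access exit qualification (SDM): bits 11:0 give the
        		 * offset into the APIC page, bits 15:12 give the access type.
        		 */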
   5344		access_type = exit_qualification & APIC_ACCESS_TYPE;
   5345		offset = exit_qualification & APIC_ACCESS_OFFSET;
    5346		/*
    5347		 * A sane guest uses MOV to write the EOI register, and the
    5348		 * written value is ignored.  Short-circuit that case here to
    5349		 * avoid heavy instruction emulation.
    5350		 */
   5351		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
   5352		    (offset == APIC_EOI)) {
   5353			kvm_lapic_set_eoi(vcpu);
   5354			return kvm_skip_emulated_instruction(vcpu);
   5355		}
   5356	}
   5357	return kvm_emulate_instruction(vcpu, 0);
   5358}
   5359
   5360static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
   5361{
   5362	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
   5363	int vector = exit_qualification & 0xff;
   5364
   5365	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
   5366	kvm_apic_set_eoi_accelerated(vcpu, vector);
   5367	return 1;
   5368}
   5369
   5370static int handle_apic_write(struct kvm_vcpu *vcpu)
   5371{
   5372	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
   5373
    5374	/*
    5375	 * The APIC-write VM-Exit is trap-like, so KVM doesn't need to advance
    5376	 * RIP, and hardware has already done any necessary aliasing, offset
    5377	 * adjustments, etc. for the access.  I.e. the correct value has already
    5378	 * been written to the vAPIC page for the correct 16-byte chunk.  KVM
    5379	 * needs only to retrieve the register value and emulate the access.
    5380	 */
   5381	u32 offset = exit_qualification & 0xff0;
   5382
   5383	kvm_apic_write_nodecode(vcpu, offset);
   5384	return 1;
   5385}
   5386
   5387static int handle_task_switch(struct kvm_vcpu *vcpu)
   5388{
   5389	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5390	unsigned long exit_qualification;
   5391	bool has_error_code = false;
   5392	u32 error_code = 0;
   5393	u16 tss_selector;
   5394	int reason, type, idt_v, idt_index;
   5395
   5396	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
   5397	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
   5398	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
   5399
   5400	exit_qualification = vmx_get_exit_qual(vcpu);
   5401
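        	/*
        	 * Task-switch exit qualification (SDM): bits 15:0 = TSS selector,
        	 * bits 31:30 = source of the task switch (3 = task gate in IDT).
        	 */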
   5402	reason = (u32)exit_qualification >> 30;
   5403	if (reason == TASK_SWITCH_GATE && idt_v) {
   5404		switch (type) {
   5405		case INTR_TYPE_NMI_INTR:
   5406			vcpu->arch.nmi_injected = false;
   5407			vmx_set_nmi_mask(vcpu, true);
   5408			break;
   5409		case INTR_TYPE_EXT_INTR:
   5410		case INTR_TYPE_SOFT_INTR:
   5411			kvm_clear_interrupt_queue(vcpu);
   5412			break;
   5413		case INTR_TYPE_HARD_EXCEPTION:
   5414			if (vmx->idt_vectoring_info &
   5415			    VECTORING_INFO_DELIVER_CODE_MASK) {
   5416				has_error_code = true;
   5417				error_code =
   5418					vmcs_read32(IDT_VECTORING_ERROR_CODE);
   5419			}
   5420			fallthrough;
   5421		case INTR_TYPE_SOFT_EXCEPTION:
   5422			kvm_clear_exception_queue(vcpu);
   5423			break;
   5424		default:
   5425			break;
   5426		}
   5427	}
   5428	tss_selector = exit_qualification;
   5429
   5430	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
   5431		       type != INTR_TYPE_EXT_INTR &&
   5432		       type != INTR_TYPE_NMI_INTR))
   5433		WARN_ON(!skip_emulated_instruction(vcpu));
   5434
   5435	/*
   5436	 * TODO: What about debug traps on tss switch?
   5437	 *       Are we supposed to inject them and update dr6?
   5438	 */
   5439	return kvm_task_switch(vcpu, tss_selector,
   5440			       type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
   5441			       reason, has_error_code, error_code);
   5442}
   5443
   5444static int handle_ept_violation(struct kvm_vcpu *vcpu)
   5445{
   5446	unsigned long exit_qualification;
   5447	gpa_t gpa;
   5448	u64 error_code;
   5449
   5450	exit_qualification = vmx_get_exit_qual(vcpu);
   5451
    5452	/*
    5453	 * If the EPT violation happened while executing IRET from an NMI,
    5454	 * the "blocked by NMI" bit has to be set before the next VM entry.
    5455	 * There are errata that may cause this bit to not be set:
    5456	 * AAK134, BY25.
    5457	 */
   5458	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
   5459			enable_vnmi &&
   5460			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
   5461		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
   5462
   5463	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
   5464	trace_kvm_page_fault(gpa, exit_qualification);
   5465
   5466	/* Is it a read fault? */
   5467	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
   5468		     ? PFERR_USER_MASK : 0;
   5469	/* Is it a write fault? */
   5470	error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
   5471		      ? PFERR_WRITE_MASK : 0;
   5472	/* Is it a fetch fault? */
   5473	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
   5474		      ? PFERR_FETCH_MASK : 0;
   5475	/* ept page table entry is present? */
   5476	error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
   5477		      ? PFERR_PRESENT_MASK : 0;
   5478
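        	/*
        	 * EPT_VIOLATION_GVA_TRANSLATED is set if the access was to the
        	 * GPA that is the final translation of the linear address, and
        	 * clear if it was to a guest paging-structure entry.
        	 */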
   5479	error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
   5480	       PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
   5481
   5482	vcpu->arch.exit_qualification = exit_qualification;
   5483
   5484	/*
   5485	 * Check that the GPA doesn't exceed physical memory limits, as that is
   5486	 * a guest page fault.  We have to emulate the instruction here, because
   5487	 * if the illegal address is that of a paging structure, then
   5488	 * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
   5489	 * would also use advanced VM-exit information for EPT violations to
   5490	 * reconstruct the page fault error code.
   5491	 */
   5492	if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
   5493		return kvm_emulate_instruction(vcpu, 0);
   5494
   5495	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
   5496}
   5497
   5498static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
   5499{
   5500	gpa_t gpa;
   5501
   5502	if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
   5503		return 1;
   5504
   5505	/*
   5506	 * A nested guest cannot optimize MMIO vmexits, because we have an
   5507	 * nGPA here instead of the required GPA.
   5508	 */
   5509	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
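        	/*
        	 * A zero-length write probes the fast MMIO bus for an ioeventfd
        	 * registered at this GPA (e.g. a virtio doorbell); if one exists,
        	 * the exit is completed without decoding the instruction.
        	 */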
   5510	if (!is_guest_mode(vcpu) &&
   5511	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
   5512		trace_kvm_fast_mmio(gpa);
   5513		return kvm_skip_emulated_instruction(vcpu);
   5514	}
   5515
   5516	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
   5517}
   5518
   5519static int handle_nmi_window(struct kvm_vcpu *vcpu)
   5520{
   5521	if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
   5522		return -EIO;
   5523
   5524	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
   5525	++vcpu->stat.nmi_window_exits;
   5526	kvm_make_request(KVM_REQ_EVENT, vcpu);
   5527
   5528	return 1;
   5529}
   5530
   5531static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
   5532{
   5533	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5534
   5535	return vmx->emulation_required && !vmx->rmode.vm86_active &&
   5536	       (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
   5537}
   5538
   5539static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
   5540{
   5541	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5542	bool intr_window_requested;
   5543	unsigned count = 130;
   5544
   5545	intr_window_requested = exec_controls_get(vmx) &
   5546				CPU_BASED_INTR_WINDOW_EXITING;
   5547
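        	/*
        	 * Emulate at most a bounded number of instructions per exit;
        	 * returning lets the main run loop re-check for pending work
        	 * before coming back here while emulation is still required.
        	 */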
   5548	while (vmx->emulation_required && count-- != 0) {
   5549		if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
   5550			return handle_interrupt_window(&vmx->vcpu);
   5551
   5552		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
   5553			return 1;
   5554
   5555		if (!kvm_emulate_instruction(vcpu, 0))
   5556			return 0;
   5557
   5558		if (vmx_emulation_required_with_pending_exception(vcpu)) {
   5559			kvm_prepare_emulation_failure_exit(vcpu);
   5560			return 0;
   5561		}
   5562
   5563		if (vcpu->arch.halt_request) {
   5564			vcpu->arch.halt_request = 0;
   5565			return kvm_emulate_halt_noskip(vcpu);
   5566		}
   5567
    5568		/*
    5569		 * Note, return 1 and not 0; vcpu_run() will invoke
    5570		 * xfer_to_guest_mode(), which will create a proper return
    5571		 * code.
    5572		 */
   5573		if (__xfer_to_guest_mode_work_pending())
   5574			return 1;
   5575	}
   5576
   5577	return 1;
   5578}
   5579
   5580static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
   5581{
   5582	if (vmx_emulation_required_with_pending_exception(vcpu)) {
   5583		kvm_prepare_emulation_failure_exit(vcpu);
   5584		return 0;
   5585	}
   5586
   5587	return 1;
   5588}
   5589
   5590static void grow_ple_window(struct kvm_vcpu *vcpu)
   5591{
   5592	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5593	unsigned int old = vmx->ple_window;
   5594
   5595	vmx->ple_window = __grow_ple_window(old, ple_window,
   5596					    ple_window_grow,
   5597					    ple_window_max);
   5598
   5599	if (vmx->ple_window != old) {
   5600		vmx->ple_window_dirty = true;
   5601		trace_kvm_ple_window_update(vcpu->vcpu_id,
   5602					    vmx->ple_window, old);
   5603	}
   5604}
   5605
   5606static void shrink_ple_window(struct kvm_vcpu *vcpu)
   5607{
   5608	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5609	unsigned int old = vmx->ple_window;
   5610
   5611	vmx->ple_window = __shrink_ple_window(old, ple_window,
   5612					      ple_window_shrink,
   5613					      ple_window);
   5614
   5615	if (vmx->ple_window != old) {
   5616		vmx->ple_window_dirty = true;
   5617		trace_kvm_ple_window_update(vcpu->vcpu_id,
   5618					    vmx->ple_window, old);
   5619	}
   5620}
   5621
    5622/*
    5623 * Indicate a vCPU that is busy-waiting on a spinlock.  PAUSE exiting is
    5624 * never enabled, so we only get here on CPUs with PAUSE-Loop Exiting.
    5625 */
   5626static int handle_pause(struct kvm_vcpu *vcpu)
   5627{
   5628	if (!kvm_pause_in_guest(vcpu->kvm))
   5629		grow_ple_window(vcpu);
   5630
    5631	/*
    5632	 * The Intel SDM, Vol. 3, Ch. 25.1.3 says: the "PAUSE-loop exiting"
    5633	 * VM-execution control is ignored if CPL > 0.  OTOH, KVM never sets
    5634	 * PAUSE_EXITING and only sets PLE if supported, so the vCPU must be
    5635	 * at CPL 0 if it gets a PAUSE exit.
    5636	 */
   5637	kvm_vcpu_on_spin(vcpu, true);
   5638	return kvm_skip_emulated_instruction(vcpu);
   5639}
   5640
   5641static int handle_monitor_trap(struct kvm_vcpu *vcpu)
   5642{
   5643	return 1;
   5644}
   5645
   5646static int handle_invpcid(struct kvm_vcpu *vcpu)
   5647{
   5648	u32 vmx_instruction_info;
   5649	unsigned long type;
   5650	gva_t gva;
   5651	struct {
   5652		u64 pcid;
   5653		u64 gla;
   5654	} operand;
   5655	int gpr_index;
   5656
   5657	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
   5658		kvm_queue_exception(vcpu, UD_VECTOR);
   5659		return 1;
   5660	}
   5661
   5662	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
   5663	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
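        	/*
        	 * The register operand holds the INVPCID type;
        	 * kvm_handle_invpcid() validates it and handles the flush.
        	 */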
   5664	type = kvm_register_read(vcpu, gpr_index);
   5665
    5666	/* According to the Intel instruction reference, the memory operand
    5667	 * is read even if it isn't needed (e.g., for type == all).
    5668	 */
   5669	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
   5670				vmx_instruction_info, false,
   5671				sizeof(operand), &gva))
   5672		return 1;
   5673
   5674	return kvm_handle_invpcid(vcpu, type, gva);
   5675}
   5676
   5677static int handle_pml_full(struct kvm_vcpu *vcpu)
   5678{
   5679	unsigned long exit_qualification;
   5680
   5681	trace_kvm_pml_full(vcpu->vcpu_id);
   5682
   5683	exit_qualification = vmx_get_exit_qual(vcpu);
   5684
    5685	/*
    5686	 * If the PML buffer filled up while executing IRET from an NMI, the
    5687	 * "blocked by NMI" bit has to be set before the next VM entry.
    5688	 */
   5689	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
   5690			enable_vnmi &&
   5691			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
   5692		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
   5693				GUEST_INTR_STATE_NMI);
   5694
    5695	/*
    5696	 * The PML buffer was already flushed at the beginning of the VM-Exit;
    5697	 * nothing to do here, and no userspace involvement is needed for PML.
    5698	 */
   5699	return 1;
   5700}
   5701
   5702static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
   5703{
   5704	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5705
   5706	if (!vmx->req_immediate_exit &&
   5707	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
   5708		kvm_lapic_expired_hv_timer(vcpu);
   5709		return EXIT_FASTPATH_REENTER_GUEST;
   5710	}
   5711
   5712	return EXIT_FASTPATH_NONE;
   5713}
   5714
   5715static int handle_preemption_timer(struct kvm_vcpu *vcpu)
   5716{
   5717	handle_fastpath_preemption_timer(vcpu);
   5718	return 1;
   5719}
   5720
   5721/*
   5722 * When nested=0, all VMX instruction VM Exits filter here.  The handlers
   5723 * are overwritten by nested_vmx_setup() when nested=1.
   5724 */
   5725static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
   5726{
   5727	kvm_queue_exception(vcpu, UD_VECTOR);
   5728	return 1;
   5729}
   5730
   5731#ifndef CONFIG_X86_SGX_KVM
   5732static int handle_encls(struct kvm_vcpu *vcpu)
   5733{
   5734	/*
   5735	 * SGX virtualization is disabled.  There is no software enable bit for
   5736	 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
   5737	 * the guest from executing ENCLS (when SGX is supported by hardware).
   5738	 */
   5739	kvm_queue_exception(vcpu, UD_VECTOR);
   5740	return 1;
   5741}
   5742#endif /* CONFIG_X86_SGX_KVM */
   5743
   5744static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
   5745{
   5746	/*
   5747	 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
   5748	 * VM-Exits. Unconditionally set the flag here and leave the handling to
   5749	 * vmx_handle_exit().
   5750	 */
   5751	to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
   5752	return 1;
   5753}
   5754
   5755/*
   5756 * The exit handlers return 1 if the exit was handled fully and guest execution
   5757 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
   5758 * to be done to userspace and return 0.
   5759 */
   5760static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
   5761	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
   5762	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
   5763	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
   5764	[EXIT_REASON_NMI_WINDOW]	      = handle_nmi_window,
   5765	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
   5766	[EXIT_REASON_CR_ACCESS]               = handle_cr,
   5767	[EXIT_REASON_DR_ACCESS]               = handle_dr,
   5768	[EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
   5769	[EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
   5770	[EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
   5771	[EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
   5772	[EXIT_REASON_HLT]                     = kvm_emulate_halt,
   5773	[EXIT_REASON_INVD]		      = kvm_emulate_invd,
   5774	[EXIT_REASON_INVLPG]		      = handle_invlpg,
   5775	[EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
   5776	[EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
   5777	[EXIT_REASON_VMCLEAR]		      = handle_vmx_instruction,
   5778	[EXIT_REASON_VMLAUNCH]		      = handle_vmx_instruction,
   5779	[EXIT_REASON_VMPTRLD]		      = handle_vmx_instruction,
   5780	[EXIT_REASON_VMPTRST]		      = handle_vmx_instruction,
   5781	[EXIT_REASON_VMREAD]		      = handle_vmx_instruction,
   5782	[EXIT_REASON_VMRESUME]		      = handle_vmx_instruction,
   5783	[EXIT_REASON_VMWRITE]		      = handle_vmx_instruction,
   5784	[EXIT_REASON_VMOFF]		      = handle_vmx_instruction,
   5785	[EXIT_REASON_VMON]		      = handle_vmx_instruction,
   5786	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
   5787	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
   5788	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
   5789	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
   5790	[EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
   5791	[EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
   5792	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
   5793	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
   5794	[EXIT_REASON_GDTR_IDTR]		      = handle_desc,
   5795	[EXIT_REASON_LDTR_TR]		      = handle_desc,
   5796	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
   5797	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
   5798	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
   5799	[EXIT_REASON_MWAIT_INSTRUCTION]	      = kvm_emulate_mwait,
   5800	[EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
   5801	[EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
   5802	[EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
   5803	[EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
   5804	[EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
   5805	[EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
   5806	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
   5807	[EXIT_REASON_INVPCID]                 = handle_invpcid,
   5808	[EXIT_REASON_VMFUNC]		      = handle_vmx_instruction,
   5809	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
   5810	[EXIT_REASON_ENCLS]		      = handle_encls,
   5811	[EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
   5812};
   5813
   5814static const int kvm_vmx_max_exit_handlers =
   5815	ARRAY_SIZE(kvm_vmx_exit_handlers);
   5816
   5817static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
   5818			      u64 *info1, u64 *info2,
   5819			      u32 *intr_info, u32 *error_code)
   5820{
   5821	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5822
   5823	*reason = vmx->exit_reason.full;
   5824	*info1 = vmx_get_exit_qual(vcpu);
   5825	if (!(vmx->exit_reason.failed_vmentry)) {
   5826		*info2 = vmx->idt_vectoring_info;
   5827		*intr_info = vmx_get_intr_info(vcpu);
   5828		if (is_exception_with_error_code(*intr_info))
   5829			*error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
   5830		else
   5831			*error_code = 0;
   5832	} else {
   5833		*info2 = 0;
   5834		*intr_info = 0;
   5835		*error_code = 0;
   5836	}
   5837}
   5838
   5839static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
   5840{
   5841	if (vmx->pml_pg) {
   5842		__free_page(vmx->pml_pg);
   5843		vmx->pml_pg = NULL;
   5844	}
   5845}
   5846
   5847static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
   5848{
   5849	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5850	u64 *pml_buf;
   5851	u16 pml_idx;
   5852
   5853	pml_idx = vmcs_read16(GUEST_PML_INDEX);
   5854
   5855	/* Do nothing if PML buffer is empty */
   5856	if (pml_idx == (PML_ENTITY_NUM - 1))
   5857		return;
   5858
   5859	/* PML index always points to next available PML buffer entity */
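        	/*
        	 * Hardware fills the buffer from the last entry towards entry 0.
        	 * If the index underflowed, the buffer is full and every entry is
        	 * valid; otherwise the first valid entry is at pml_idx + 1.
        	 */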
   5860	if (pml_idx >= PML_ENTITY_NUM)
   5861		pml_idx = 0;
   5862	else
   5863		pml_idx++;
   5864
   5865	pml_buf = page_address(vmx->pml_pg);
   5866	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
   5867		u64 gpa;
   5868
   5869		gpa = pml_buf[pml_idx];
   5870		WARN_ON(gpa & (PAGE_SIZE - 1));
   5871		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
   5872	}
   5873
   5874	/* reset PML index */
   5875	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
   5876}
   5877
   5878static void vmx_dump_sel(char *name, uint32_t sel)
   5879{
   5880	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
   5881	       name, vmcs_read16(sel),
   5882	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
   5883	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
   5884	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
   5885}
   5886
   5887static void vmx_dump_dtsel(char *name, uint32_t limit)
   5888{
   5889	pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
   5890	       name, vmcs_read32(limit),
   5891	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
   5892}
   5893
   5894static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
   5895{
   5896	unsigned int i;
   5897	struct vmx_msr_entry *e;
   5898
   5899	pr_err("MSR %s:\n", name);
   5900	for (i = 0, e = m->val; i < m->nr; ++i, ++e)
   5901		pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
   5902}
   5903
   5904void dump_vmcs(struct kvm_vcpu *vcpu)
   5905{
   5906	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5907	u32 vmentry_ctl, vmexit_ctl;
   5908	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
   5909	unsigned long cr4;
   5910	int efer_slot;
   5911
   5912	if (!dump_invalid_vmcs) {
   5913		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
   5914		return;
   5915	}
   5916
   5917	vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
   5918	vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
   5919	cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
   5920	pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
   5921	cr4 = vmcs_readl(GUEST_CR4);
   5922	secondary_exec_control = 0;
   5923	if (cpu_has_secondary_exec_ctrls())
   5924		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
   5925
   5926	pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
   5927	       vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
   5928	pr_err("*** Guest State ***\n");
   5929	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
   5930	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
   5931	       vmcs_readl(CR0_GUEST_HOST_MASK));
   5932	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
   5933	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
   5934	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
   5935	if (cpu_has_vmx_ept()) {
   5936		pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
   5937		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
   5938		pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
   5939		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
   5940	}
   5941	pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
   5942	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
   5943	pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
   5944	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
   5945	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
   5946	       vmcs_readl(GUEST_SYSENTER_ESP),
   5947	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
   5948	vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
   5949	vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
   5950	vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
   5951	vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
   5952	vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
   5953	vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
   5954	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
   5955	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
   5956	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
   5957	vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
   5958	efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
   5959	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
   5960		pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
   5961	else if (efer_slot >= 0)
   5962		pr_err("EFER= 0x%016llx (autoload)\n",
   5963		       vmx->msr_autoload.guest.val[efer_slot].value);
   5964	else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
   5965		pr_err("EFER= 0x%016llx (effective)\n",
   5966		       vcpu->arch.efer | (EFER_LMA | EFER_LME));
   5967	else
   5968		pr_err("EFER= 0x%016llx (effective)\n",
   5969		       vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
   5970	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
   5971		pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
   5972	pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
   5973	       vmcs_read64(GUEST_IA32_DEBUGCTL),
   5974	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
   5975	if (cpu_has_load_perf_global_ctrl() &&
   5976	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
   5977		pr_err("PerfGlobCtl = 0x%016llx\n",
   5978		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
   5979	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
   5980		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
   5981	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
   5982	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
   5983	       vmcs_read32(GUEST_ACTIVITY_STATE));
   5984	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
   5985		pr_err("InterruptStatus = %04x\n",
   5986		       vmcs_read16(GUEST_INTR_STATUS));
   5987	if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
   5988		vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
   5989	if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
   5990		vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
   5991
   5992	pr_err("*** Host State ***\n");
   5993	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
   5994	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
   5995	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
   5996	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
   5997	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
   5998	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
   5999	       vmcs_read16(HOST_TR_SELECTOR));
   6000	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
   6001	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
   6002	       vmcs_readl(HOST_TR_BASE));
   6003	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
   6004	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
   6005	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
   6006	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
   6007	       vmcs_readl(HOST_CR4));
   6008	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
   6009	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
   6010	       vmcs_read32(HOST_IA32_SYSENTER_CS),
   6011	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
   6012	if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
   6013		pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
   6014	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
   6015		pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
   6016	if (cpu_has_load_perf_global_ctrl() &&
   6017	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
   6018		pr_err("PerfGlobCtl = 0x%016llx\n",
   6019		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
   6020	if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
   6021		vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
   6022
   6023	pr_err("*** Control State ***\n");
   6024	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
   6025	       pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
   6026	pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
   6027	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
   6028	       vmcs_read32(EXCEPTION_BITMAP),
   6029	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
   6030	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
   6031	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
   6032	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
   6033	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
   6034	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
   6035	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
   6036	       vmcs_read32(VM_EXIT_INTR_INFO),
   6037	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
   6038	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
   6039	pr_err("        reason=%08x qualification=%016lx\n",
   6040	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
   6041	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
   6042	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
   6043	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
   6044	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
   6045	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
   6046		pr_err("TSC Multiplier = 0x%016llx\n",
   6047		       vmcs_read64(TSC_MULTIPLIER));
   6048	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
   6049		if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
   6050			u16 status = vmcs_read16(GUEST_INTR_STATUS);
   6051			pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
   6052		}
   6053		pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
   6054		if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
   6055			pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
   6056		pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
   6057	}
   6058	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
   6059		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
   6060	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
   6061		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
   6062	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
   6063		pr_err("PLE Gap=%08x Window=%08x\n",
   6064		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
   6065	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
   6066		pr_err("Virtual processor ID = 0x%04x\n",
   6067		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
   6068}
   6069
   6070/*
   6071 * The guest has exited.  See if we can fix it or if we need userspace
   6072 * assistance.
   6073 */
   6074static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
   6075{
   6076	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6077	union vmx_exit_reason exit_reason = vmx->exit_reason;
   6078	u32 vectoring_info = vmx->idt_vectoring_info;
   6079	u16 exit_handler_index;
   6080
    6081	/*
    6082	 * Flush the PML buffer of logged GPAs so that dirty_bitmap is kept up
    6083	 * to date.  Another benefit: in kvm_vm_ioctl_get_dirty_log, before
    6084	 * querying dirty_bitmap we only need to kick all vcpus out of guest
    6085	 * mode, because once a vcpu is in root mode its PML buffer must have
    6086	 * been flushed already.  Note, PML is never enabled in hardware while
    6087	 * running L2.
    6088	 */
   6089	if (enable_pml && !is_guest_mode(vcpu))
   6090		vmx_flush_pml_buffer(vcpu);
   6091
   6092	/*
   6093	 * KVM should never reach this point with a pending nested VM-Enter.
   6094	 * More specifically, short-circuiting VM-Entry to emulate L2 due to
   6095	 * invalid guest state should never happen as that means KVM knowingly
   6096	 * allowed a nested VM-Enter with an invalid vmcs12.  More below.
   6097	 */
   6098	if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
   6099		return -EIO;
   6100
   6101	if (is_guest_mode(vcpu)) {
   6102		/*
   6103		 * PML is never enabled when running L2, bail immediately if a
   6104		 * PML full exit occurs as something is horribly wrong.
   6105		 */
   6106		if (exit_reason.basic == EXIT_REASON_PML_FULL)
   6107			goto unexpected_vmexit;
   6108
   6109		/*
   6110		 * The host physical addresses of some pages of guest memory
   6111		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
   6112		 * Page). The CPU may write to these pages via their host
   6113		 * physical address while L2 is running, bypassing any
   6114		 * address-translation-based dirty tracking (e.g. EPT write
   6115		 * protection).
   6116		 *
   6117		 * Mark them dirty on every exit from L2 to prevent them from
   6118		 * getting out of sync with dirty tracking.
   6119		 */
   6120		nested_mark_vmcs12_pages_dirty(vcpu);
   6121
   6122		/*
   6123		 * Synthesize a triple fault if L2 state is invalid.  In normal
   6124		 * operation, nested VM-Enter rejects any attempt to enter L2
   6125		 * with invalid state.  However, those checks are skipped if
   6126		 * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
   6127		 * L2 state is invalid, it means either L1 modified SMRAM state
   6128		 * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
   6129		 * doing so is architecturally allowed in the RSM case, and is
   6130		 * the least awful solution for the userspace case without
   6131		 * risking false positives.
   6132		 */
   6133		if (vmx->emulation_required) {
   6134			nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
   6135			return 1;
   6136		}
   6137
   6138		if (nested_vmx_reflect_vmexit(vcpu))
   6139			return 1;
   6140	}
   6141
   6142	/* If guest state is invalid, start emulating.  L2 is handled above. */
   6143	if (vmx->emulation_required)
   6144		return handle_invalid_guest_state(vcpu);
   6145
   6146	if (exit_reason.failed_vmentry) {
   6147		dump_vmcs(vcpu);
   6148		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
   6149		vcpu->run->fail_entry.hardware_entry_failure_reason
   6150			= exit_reason.full;
   6151		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
   6152		return 0;
   6153	}
   6154
   6155	if (unlikely(vmx->fail)) {
   6156		dump_vmcs(vcpu);
   6157		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
   6158		vcpu->run->fail_entry.hardware_entry_failure_reason
   6159			= vmcs_read32(VM_INSTRUCTION_ERROR);
   6160		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
   6161		return 0;
   6162	}
   6163
    6164	/*
    6165	 * Note:
    6166	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a
    6167	 * delivery event, since that indicates the guest is accessing MMIO.
    6168	 * The VM-exit can be triggered again after returning to the guest,
    6169	 * which would cause an infinite loop.
    6170	 */
   6171	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
   6172	    (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
   6173	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
   6174	     exit_reason.basic != EXIT_REASON_PML_FULL &&
   6175	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
   6176	     exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
   6177		int ndata = 3;
   6178
   6179		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   6180		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
   6181		vcpu->run->internal.data[0] = vectoring_info;
   6182		vcpu->run->internal.data[1] = exit_reason.full;
   6183		vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
   6184		if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
   6185			vcpu->run->internal.data[ndata++] =
   6186				vmcs_read64(GUEST_PHYSICAL_ADDRESS);
   6187		}
   6188		vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
   6189		vcpu->run->internal.ndata = ndata;
   6190		return 0;
   6191	}
   6192
   6193	if (unlikely(!enable_vnmi &&
   6194		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
   6195		if (!vmx_interrupt_blocked(vcpu)) {
   6196			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
   6197		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
   6198			   vcpu->arch.nmi_pending) {
    6199			/*
    6200			 * This CPU doesn't help us find the end of an
    6201			 * NMI-blocked window if the guest runs with IRQs
    6202			 * disabled.  So pull the trigger after 1 s of
    6203			 * futile waiting, and inform the user about it.
    6204			 */
   6205			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
   6206			       "state on VCPU %d after 1 s timeout\n",
   6207			       __func__, vcpu->vcpu_id);
   6208			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
   6209		}
   6210	}
   6211
   6212	if (exit_fastpath != EXIT_FASTPATH_NONE)
   6213		return 1;
   6214
   6215	if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
   6216		goto unexpected_vmexit;
   6217#ifdef CONFIG_RETPOLINE
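        	/*
        	 * With retpolines, the indirect call through the exit-handler
        	 * table is relatively expensive; dispatch the most common exit
        	 * reasons via direct calls instead.
        	 */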
   6218	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
   6219		return kvm_emulate_wrmsr(vcpu);
   6220	else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
   6221		return handle_preemption_timer(vcpu);
   6222	else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
   6223		return handle_interrupt_window(vcpu);
   6224	else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
   6225		return handle_external_interrupt(vcpu);
   6226	else if (exit_reason.basic == EXIT_REASON_HLT)
   6227		return kvm_emulate_halt(vcpu);
   6228	else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
   6229		return handle_ept_misconfig(vcpu);
   6230#endif
   6231
   6232	exit_handler_index = array_index_nospec((u16)exit_reason.basic,
   6233						kvm_vmx_max_exit_handlers);
   6234	if (!kvm_vmx_exit_handlers[exit_handler_index])
   6235		goto unexpected_vmexit;
   6236
   6237	return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
   6238
   6239unexpected_vmexit:
   6240	vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
   6241		    exit_reason.full);
   6242	dump_vmcs(vcpu);
   6243	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   6244	vcpu->run->internal.suberror =
   6245			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
   6246	vcpu->run->internal.ndata = 2;
   6247	vcpu->run->internal.data[0] = exit_reason.full;
   6248	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
   6249	return 0;
   6250}
   6251
   6252static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
   6253{
   6254	int ret = __vmx_handle_exit(vcpu, exit_fastpath);
   6255
    6256	/*
    6257	 * Exit to user space when a bus lock is detected, to inform userspace
    6258	 * that the guest triggered a bus lock.
    6259	 */
   6260	if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
   6261		if (ret > 0)
   6262			vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
   6263
   6264		vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
   6265		return 0;
   6266	}
   6267	return ret;
   6268}
   6269
   6270/*
   6271 * Software based L1D cache flush which is used when microcode providing
   6272 * the cache control MSR is not loaded.
   6273 *
    6274 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
    6275 * flushing it requires reading in 64 KiB because the replacement
    6276 * algorithm is not exactly LRU.  This could be sized at runtime via
    6277 * topology information, but as all relevant affected CPUs have a 32 KiB
    6278 * L1D cache there is no point in doing so.
   6279 */
   6280static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
   6281{
   6282	int size = PAGE_SIZE << L1D_CACHE_ORDER;
   6283
   6284	/*
   6285	 * This code is only executed when the flush mode is 'cond' or
   6286	 * 'always'
   6287	 */
   6288	if (static_branch_likely(&vmx_l1d_flush_cond)) {
   6289		bool flush_l1d;
   6290
   6291		/*
   6292		 * Clear the per-vcpu flush bit, it gets set again
   6293		 * either from vcpu_run() or from one of the unsafe
   6294		 * VMEXIT handlers.
   6295		 */
   6296		flush_l1d = vcpu->arch.l1tf_flush_l1d;
   6297		vcpu->arch.l1tf_flush_l1d = false;
   6298
   6299		/*
   6300		 * Clear the per-cpu flush bit, it gets set again from
   6301		 * the interrupt handlers.
   6302		 */
   6303		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
   6304		kvm_clear_cpu_l1tf_flush_l1d();
   6305
   6306		if (!flush_l1d)
   6307			return;
   6308	}
   6309
   6310	vcpu->stat.l1d_flush++;
   6311
   6312	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
   6313		native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
   6314		return;
   6315	}
   6316
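        	/*
        	 * Touch one byte per page first so the flush pages are present in
        	 * the TLB, then (after serializing with CPUID) read one byte per
        	 * cache line to displace the entire L1D.
        	 */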
   6317	asm volatile(
   6318		/* First ensure the pages are in the TLB */
   6319		"xorl	%%eax, %%eax\n"
   6320		".Lpopulate_tlb:\n\t"
   6321		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
   6322		"addl	$4096, %%eax\n\t"
   6323		"cmpl	%%eax, %[size]\n\t"
   6324		"jne	.Lpopulate_tlb\n\t"
   6325		"xorl	%%eax, %%eax\n\t"
   6326		"cpuid\n\t"
   6327		/* Now fill the cache */
   6328		"xorl	%%eax, %%eax\n"
   6329		".Lfill_cache:\n"
   6330		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
   6331		"addl	$64, %%eax\n\t"
   6332		"cmpl	%%eax, %[size]\n\t"
   6333		"jne	.Lfill_cache\n\t"
   6334		"lfence\n"
   6335		:: [flush_pages] "r" (vmx_l1d_flush_pages),
   6336		    [size] "r" (size)
   6337		: "eax", "ebx", "ecx", "edx");
   6338}
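
        /*
         * A worked sizing example for the flush loop above (a sketch, assuming
         * 4 KiB pages and L1D_CACHE_ORDER == 4, which gives the 64 KiB quoted
         * in the comment before vmx_l1d_flush()):
         *
         *   size         = PAGE_SIZE << L1D_CACHE_ORDER = 4096 << 4 = 65536 bytes
         *   TLB populate = 65536 / 4096 = 16 iterations (one load per page)
         *   cache fill   = 65536 / 64   = 1024 iterations (one load per line)
         *
         * i.e. twice the 32 KiB L1D is read to defeat the not-quite-LRU
         * replacement policy.
         */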
   6339
   6340static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
   6341{
   6342	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   6343	int tpr_threshold;
   6344
   6345	if (is_guest_mode(vcpu) &&
   6346		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
   6347		return;
   6348
   6349	tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
   6350	if (is_guest_mode(vcpu))
   6351		to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
   6352	else
   6353		vmcs_write32(TPR_THRESHOLD, tpr_threshold);
   6354}
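
        /*
         * Illustration of the tpr_threshold calculation above, derived only
         * from the visible expression:
         *
         *   irr == -1 (nothing pending)       -> tpr_threshold = 0
         *   tpr == 2,  irr == 9  (tpr < irr)  -> tpr_threshold = 0
         *   tpr == 11, irr == 9  (tpr >= irr) -> tpr_threshold = 9
         *
         * so a non-zero threshold is programmed only while the pending
         * priority does not exceed the current TPR.
         */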
   6355
   6356void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
   6357{
   6358	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6359	u32 sec_exec_control;
   6360
   6361	if (!lapic_in_kernel(vcpu))
   6362		return;
   6363
   6364	if (!flexpriority_enabled &&
   6365	    !cpu_has_vmx_virtualize_x2apic_mode())
   6366		return;
   6367
   6368	/* Postpone execution until vmcs01 is the current VMCS. */
   6369	if (is_guest_mode(vcpu)) {
   6370		vmx->nested.change_vmcs01_virtual_apic_mode = true;
   6371		return;
   6372	}
   6373
   6374	sec_exec_control = secondary_exec_controls_get(vmx);
   6375	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
   6376			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
   6377
   6378	switch (kvm_get_apic_mode(vcpu)) {
   6379	case LAPIC_MODE_INVALID:
   6380		WARN_ONCE(true, "Invalid local APIC state");
   6381		break;
   6382	case LAPIC_MODE_DISABLED:
   6383		break;
   6384	case LAPIC_MODE_XAPIC:
   6385		if (flexpriority_enabled) {
   6386			sec_exec_control |=
   6387				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
   6388			kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
   6389
    6390			 * Flush the TLB; reloading the APIC access page will
   6391			 * Flush the TLB, reloading the APIC access page will
   6392			 * only do so if its physical address has changed, but
   6393			 * the guest may have inserted a non-APIC mapping into
   6394			 * the TLB while the APIC access page was disabled.
   6395			 */
   6396			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
   6397		}
   6398		break;
   6399	case LAPIC_MODE_X2APIC:
   6400		if (cpu_has_vmx_virtualize_x2apic_mode())
   6401			sec_exec_control |=
   6402				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
   6403		break;
   6404	}
   6405	secondary_exec_controls_set(vmx, sec_exec_control);
   6406
   6407	vmx_update_msr_bitmap_x2apic(vcpu);
   6408}
   6409
   6410static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
   6411{
   6412	struct page *page;
   6413
   6414	/* Defer reload until vmcs01 is the current VMCS. */
   6415	if (is_guest_mode(vcpu)) {
   6416		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
   6417		return;
   6418	}
   6419
   6420	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
   6421	    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
   6422		return;
   6423
   6424	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
   6425	if (is_error_page(page))
   6426		return;
   6427
   6428	vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
   6429	vmx_flush_tlb_current(vcpu);
   6430
   6431	/*
    6432	 * Do not pin the APIC access page in memory; the MMU notifier
   6433	 * will call us again if it is migrated or swapped out.
   6434	 */
   6435	put_page(page);
   6436}
   6437
   6438static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
   6439{
   6440	u16 status;
   6441	u8 old;
   6442
   6443	if (max_isr == -1)
   6444		max_isr = 0;
   6445
   6446	status = vmcs_read16(GUEST_INTR_STATUS);
   6447	old = status >> 8;
   6448	if (max_isr != old) {
   6449		status &= 0xff;
   6450		status |= max_isr << 8;
   6451		vmcs_write16(GUEST_INTR_STATUS, status);
   6452	}
   6453}
   6454
   6455static void vmx_set_rvi(int vector)
   6456{
   6457	u16 status;
   6458	u8 old;
   6459
   6460	if (vector == -1)
   6461		vector = 0;
   6462
   6463	status = vmcs_read16(GUEST_INTR_STATUS);
   6464	old = (u8)status & 0xff;
   6465	if ((u8)vector != old) {
   6466		status &= ~0xff;
   6467		status |= (u8)vector;
   6468		vmcs_write16(GUEST_INTR_STATUS, status);
   6469	}
   6470}
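
        /*
         * As the two helpers above show, GUEST_INTR_STATUS is a 16-bit field
         * packed as (SVI << 8) | RVI: vmx_hwapic_isr_update() rewrites the
         * high byte and vmx_set_rvi() the low byte.  A sketch based purely on
         * that code: starting from status 0x0000, vmx_set_rvi(0x31) yields
         * 0x0031, and a subsequent vmx_hwapic_isr_update(vcpu, 0x80) yields
         * 0x8031.
         */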
   6471
   6472static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
   6473{
   6474	/*
    6475	 * When running L2, updating RVI is only relevant when
    6476	 * vmcs12 virtual-interrupt-delivery is enabled.
    6477	 * However, it can be enabled only when L1 also
    6478	 * intercepts external interrupts, and in that case
    6479	 * we should not update vmcs02's RVI but instead intercept
    6480	 * the interrupt. Therefore, do nothing when running L2.
   6481	 */
   6482	if (!is_guest_mode(vcpu))
   6483		vmx_set_rvi(max_irr);
   6484}
   6485
   6486static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
   6487{
   6488	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6489	int max_irr;
   6490	bool got_posted_interrupt;
   6491
   6492	if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
   6493		return -EIO;
   6494
   6495	if (pi_test_on(&vmx->pi_desc)) {
   6496		pi_clear_on(&vmx->pi_desc);
   6497		/*
   6498		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
   6499		 * But on x86 this is just a compiler barrier anyway.
   6500		 */
   6501		smp_mb__after_atomic();
   6502		got_posted_interrupt =
   6503			kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
   6504	} else {
   6505		max_irr = kvm_lapic_find_highest_irr(vcpu);
   6506		got_posted_interrupt = false;
   6507	}
   6508
   6509	/*
   6510	 * Newly recognized interrupts are injected via either virtual interrupt
   6511	 * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
   6512	 * disabled in two cases:
   6513	 *
   6514	 * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
   6515	 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
   6516	 * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
   6517	 * into L2, but KVM doesn't use virtual interrupt delivery to inject
   6518	 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
   6519	 *
   6520	 * 2) If APICv is disabled for this vCPU, assigned devices may still
   6521	 * attempt to post interrupts.  The posted interrupt vector will cause
   6522	 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
   6523	 */
   6524	if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
   6525		vmx_set_rvi(max_irr);
   6526	else if (got_posted_interrupt)
   6527		kvm_make_request(KVM_REQ_EVENT, vcpu);
   6528
   6529	return max_irr;
   6530}
   6531
   6532static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
   6533{
   6534	if (!kvm_vcpu_apicv_active(vcpu))
   6535		return;
   6536
   6537	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
   6538	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
   6539	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
   6540	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
   6541}
   6542
   6543static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
   6544{
   6545	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6546
   6547	pi_clear_on(&vmx->pi_desc);
   6548	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
   6549}
   6550
   6551void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
   6552
   6553static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
   6554					unsigned long entry)
   6555{
   6556	bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
   6557
   6558	kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
   6559	vmx_do_interrupt_nmi_irqoff(entry);
   6560	kvm_after_interrupt(vcpu);
   6561}
   6562
   6563static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
   6564{
   6565	/*
    6566	 * Save xfd_err to guest_fpu before interrupts are enabled, so the
    6567	 * MSR value is not clobbered by host activity before the guest
    6568	 * has a chance to consume it.
   6569	 *
   6570	 * Do not blindly read xfd_err here, since this exception might
   6571	 * be caused by L1 interception on a platform which doesn't
   6572	 * support xfd at all.
   6573	 *
   6574	 * Do it conditionally upon guest_fpu::xfd. xfd_err matters
   6575	 * only when xfd contains a non-zero value.
   6576	 *
    6577	 * Queuing the exception is done in vmx_handle_exit. See the comment there.
   6578	 */
   6579	if (vcpu->arch.guest_fpu.fpstate->xfd)
   6580		rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
   6581}
   6582
   6583static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
   6584{
   6585	const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
   6586	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
   6587
   6588	/* if exit due to PF check for async PF */
   6589	if (is_page_fault(intr_info))
   6590		vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
   6591	/* if exit due to NM, handle before interrupts are enabled */
   6592	else if (is_nm_fault(intr_info))
   6593		handle_nm_fault_irqoff(&vmx->vcpu);
   6594	/* Handle machine checks before interrupts are enabled */
   6595	else if (is_machine_check(intr_info))
   6596		kvm_machine_check();
   6597	/* We need to handle NMIs before interrupts are enabled */
   6598	else if (is_nmi(intr_info))
   6599		handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
   6600}
   6601
   6602static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
   6603{
   6604	u32 intr_info = vmx_get_intr_info(vcpu);
   6605	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
   6606	gate_desc *desc = (gate_desc *)host_idt_base + vector;
   6607
   6608	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
   6609	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
   6610		return;
   6611
   6612	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
   6613	vcpu->arch.at_instruction_boundary = true;
   6614}
   6615
   6616static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
   6617{
   6618	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6619
   6620	if (vmx->emulation_required)
   6621		return;
   6622
   6623	if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
   6624		handle_external_interrupt_irqoff(vcpu);
   6625	else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
   6626		handle_exception_nmi_irqoff(vmx);
   6627}
   6628
   6629/*
   6630 * The kvm parameter can be NULL (module initialization, or invocation before
   6631 * VM creation). Be sure to check the kvm parameter before using it.
   6632 */
   6633static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
   6634{
   6635	switch (index) {
   6636	case MSR_IA32_SMBASE:
   6637		/*
   6638		 * We cannot do SMM unless we can run the guest in big
   6639		 * real mode.
   6640		 */
   6641		return enable_unrestricted_guest || emulate_invalid_guest_state;
   6642	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
   6643		return nested;
   6644	case MSR_AMD64_VIRT_SPEC_CTRL:
   6645	case MSR_AMD64_TSC_RATIO:
   6646		/* This is AMD only.  */
   6647		return false;
   6648	default:
   6649		return true;
   6650	}
   6651}
   6652
   6653static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
   6654{
   6655	u32 exit_intr_info;
   6656	bool unblock_nmi;
   6657	u8 vector;
   6658	bool idtv_info_valid;
   6659
   6660	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
   6661
   6662	if (enable_vnmi) {
   6663		if (vmx->loaded_vmcs->nmi_known_unmasked)
   6664			return;
   6665
   6666		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
   6667		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
   6668		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
   6669		/*
   6670		 * SDM 3: 27.7.1.2 (September 2008)
   6671		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
   6672		 * a guest IRET fault.
   6673		 * SDM 3: 23.2.2 (September 2008)
   6674		 * Bit 12 is undefined in any of the following cases:
   6675		 *  If the VM exit sets the valid bit in the IDT-vectoring
   6676		 *   information field.
   6677		 *  If the VM exit is due to a double fault.
   6678		 */
   6679		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
   6680		    vector != DF_VECTOR && !idtv_info_valid)
   6681			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
   6682				      GUEST_INTR_STATE_NMI);
   6683		else
   6684			vmx->loaded_vmcs->nmi_known_unmasked =
   6685				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
   6686				  & GUEST_INTR_STATE_NMI);
   6687	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
   6688		vmx->loaded_vmcs->vnmi_blocked_time +=
   6689			ktime_to_ns(ktime_sub(ktime_get(),
   6690					      vmx->loaded_vmcs->entry_time));
   6691}
   6692
   6693static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
   6694				      u32 idt_vectoring_info,
   6695				      int instr_len_field,
   6696				      int error_code_field)
   6697{
   6698	u8 vector;
   6699	int type;
   6700	bool idtv_info_valid;
   6701
   6702	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
   6703
   6704	vcpu->arch.nmi_injected = false;
   6705	kvm_clear_exception_queue(vcpu);
   6706	kvm_clear_interrupt_queue(vcpu);
   6707
   6708	if (!idtv_info_valid)
   6709		return;
   6710
   6711	kvm_make_request(KVM_REQ_EVENT, vcpu);
   6712
   6713	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
   6714	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
   6715
   6716	switch (type) {
   6717	case INTR_TYPE_NMI_INTR:
   6718		vcpu->arch.nmi_injected = true;
   6719		/*
   6720		 * SDM 3: 27.7.1.2 (September 2008)
    6721		 * Clear bit "block by NMI" before VM entry if an NMI
   6722		 * delivery faulted.
   6723		 */
   6724		vmx_set_nmi_mask(vcpu, false);
   6725		break;
   6726	case INTR_TYPE_SOFT_EXCEPTION:
   6727		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
   6728		fallthrough;
   6729	case INTR_TYPE_HARD_EXCEPTION:
   6730		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
   6731			u32 err = vmcs_read32(error_code_field);
   6732			kvm_requeue_exception_e(vcpu, vector, err);
   6733		} else
   6734			kvm_requeue_exception(vcpu, vector);
   6735		break;
   6736	case INTR_TYPE_SOFT_INTR:
   6737		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
   6738		fallthrough;
   6739	case INTR_TYPE_EXT_INTR:
   6740		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
   6741		break;
   6742	default:
   6743		break;
   6744	}
   6745}
   6746
   6747static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
   6748{
   6749	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
   6750				  VM_EXIT_INSTRUCTION_LEN,
   6751				  IDT_VECTORING_ERROR_CODE);
   6752}
   6753
   6754static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
   6755{
   6756	__vmx_complete_interrupts(vcpu,
   6757				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
   6758				  VM_ENTRY_INSTRUCTION_LEN,
   6759				  VM_ENTRY_EXCEPTION_ERROR_CODE);
   6760
   6761	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
   6762}
   6763
   6764static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
   6765{
   6766	int i, nr_msrs;
   6767	struct perf_guest_switch_msr *msrs;
   6768
   6769	/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
   6770	msrs = perf_guest_get_msrs(&nr_msrs);
   6771	if (!msrs)
   6772		return;
   6773
   6774	for (i = 0; i < nr_msrs; i++)
   6775		if (msrs[i].host == msrs[i].guest)
   6776			clear_atomic_switch_msr(vmx, msrs[i].msr);
   6777		else
   6778			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
   6779					msrs[i].host, false);
   6780}
   6781
   6782static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
   6783{
   6784	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6785	u64 tscl;
   6786	u32 delta_tsc;
   6787
   6788	if (vmx->req_immediate_exit) {
   6789		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
   6790		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
   6791	} else if (vmx->hv_deadline_tsc != -1) {
   6792		tscl = rdtsc();
   6793		if (vmx->hv_deadline_tsc > tscl)
   6794			/* set_hv_timer ensures the delta fits in 32-bits */
   6795			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
   6796				cpu_preemption_timer_multi);
   6797		else
   6798			delta_tsc = 0;
   6799
   6800		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
   6801		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
   6802	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
   6803		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
   6804		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
   6805	}
   6806}
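
        /*
         * Worked example for the deadline conversion above (a sketch; the
         * shift count cpu_preemption_timer_multi is assumed here to be 5, its
         * real value is read from MSR_IA32_VMX_MISC elsewhere): with the
         * deadline 1,000,000 TSC cycles in the future,
         *
         *   delta_tsc = (hv_deadline_tsc - tscl) >> 5 = 1000000 >> 5 = 31250
         *
         * is written to VMX_PREEMPTION_TIMER_VALUE, while writing -1 merely
         * "soft disables" the timer as tracked by hv_timer_soft_disabled.
         */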
   6807
   6808void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
   6809{
   6810	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
   6811		vmx->loaded_vmcs->host_state.rsp = host_rsp;
   6812		vmcs_writel(HOST_RSP, host_rsp);
   6813	}
   6814}
   6815
   6816static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
   6817{
   6818	switch (to_vmx(vcpu)->exit_reason.basic) {
   6819	case EXIT_REASON_MSR_WRITE:
   6820		return handle_fastpath_set_msr_irqoff(vcpu);
   6821	case EXIT_REASON_PREEMPTION_TIMER:
   6822		return handle_fastpath_preemption_timer(vcpu);
   6823	default:
   6824		return EXIT_FASTPATH_NONE;
   6825	}
   6826}
   6827
   6828static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
   6829					struct vcpu_vmx *vmx)
   6830{
   6831	guest_state_enter_irqoff();
   6832
   6833	/* L1D Flush includes CPU buffer clear to mitigate MDS */
   6834	if (static_branch_unlikely(&vmx_l1d_should_flush))
   6835		vmx_l1d_flush(vcpu);
   6836	else if (static_branch_unlikely(&mds_user_clear))
   6837		mds_clear_cpu_buffers();
   6838	else if (static_branch_unlikely(&mmio_stale_data_clear) &&
   6839		 kvm_arch_has_assigned_device(vcpu->kvm))
   6840		mds_clear_cpu_buffers();
   6841
   6842	vmx_disable_fb_clear(vmx);
   6843
   6844	if (vcpu->arch.cr2 != native_read_cr2())
   6845		native_write_cr2(vcpu->arch.cr2);
   6846
   6847	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
   6848				   vmx->loaded_vmcs->launched);
   6849
   6850	vcpu->arch.cr2 = native_read_cr2();
   6851
   6852	vmx_enable_fb_clear(vmx);
   6853
   6854	guest_state_exit_irqoff();
   6855}
   6856
   6857static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
   6858{
   6859	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6860	unsigned long cr3, cr4;
   6861
   6862	/* Record the guest's net vcpu time for enforced NMI injections. */
   6863	if (unlikely(!enable_vnmi &&
   6864		     vmx->loaded_vmcs->soft_vnmi_blocked))
   6865		vmx->loaded_vmcs->entry_time = ktime_get();
   6866
   6867	/*
    6868	 * Don't enter VMX if guest state is invalid; let the exit handler
    6869	 * start emulation until we arrive back at a valid state.  Synthesize a
   6870	 * consistency check VM-Exit due to invalid guest state and bail.
   6871	 */
   6872	if (unlikely(vmx->emulation_required)) {
   6873		vmx->fail = 0;
   6874
   6875		vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
   6876		vmx->exit_reason.failed_vmentry = 1;
   6877		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
   6878		vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
   6879		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
   6880		vmx->exit_intr_info = 0;
   6881		return EXIT_FASTPATH_NONE;
   6882	}
   6883
   6884	trace_kvm_entry(vcpu);
   6885
   6886	if (vmx->ple_window_dirty) {
   6887		vmx->ple_window_dirty = false;
   6888		vmcs_write32(PLE_WINDOW, vmx->ple_window);
   6889	}
   6890
   6891	/*
   6892	 * We did this in prepare_switch_to_guest, because it needs to
   6893	 * be within srcu_read_lock.
   6894	 */
   6895	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
   6896
   6897	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
   6898		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
   6899	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
   6900		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
   6901	vcpu->arch.regs_dirty = 0;
   6902
   6903	/*
   6904	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
   6905	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
   6906	 * it switches back to the current->mm, which can occur in KVM context
   6907	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
   6908	 * toggles a static key while handling a VM-Exit.
   6909	 */
   6910	cr3 = __get_current_cr3_fast();
   6911	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
   6912		vmcs_writel(HOST_CR3, cr3);
   6913		vmx->loaded_vmcs->host_state.cr3 = cr3;
   6914	}
   6915
   6916	cr4 = cr4_read_shadow();
   6917	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
   6918		vmcs_writel(HOST_CR4, cr4);
   6919		vmx->loaded_vmcs->host_state.cr4 = cr4;
   6920	}
   6921
   6922	/* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
   6923	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
   6924		set_debugreg(vcpu->arch.dr6, 6);
   6925
   6926	/* When single-stepping over STI and MOV SS, we must clear the
   6927	 * corresponding interruptibility bits in the guest state. Otherwise
    6928	 * vmentry fails as it then expects bit 14 (BS) of the pending debug
    6929	 * exceptions field to be set, but that's not correct for the guest
    6930	 * debugging case. */
   6931	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
   6932		vmx_set_interrupt_shadow(vcpu, 0);
   6933
   6934	kvm_load_guest_xsave_state(vcpu);
   6935
   6936	pt_guest_enter(vmx);
   6937
   6938	atomic_switch_perf_msrs(vmx);
   6939	if (intel_pmu_lbr_is_enabled(vcpu))
   6940		vmx_passthrough_lbr_msrs(vcpu);
   6941
   6942	if (enable_preemption_timer)
   6943		vmx_update_hv_timer(vcpu);
   6944
   6945	kvm_wait_lapic_expire(vcpu);
   6946
   6947	/*
   6948	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
   6949	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
   6950	 * is no need to worry about the conditional branch over the wrmsr
   6951	 * being speculatively taken.
   6952	 */
   6953	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
   6954
   6955	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
   6956	vmx_vcpu_enter_exit(vcpu, vmx);
   6957
   6958	/*
   6959	 * We do not use IBRS in the kernel. If this vCPU has used the
   6960	 * SPEC_CTRL MSR it may have left it on; save the value and
   6961	 * turn it off. This is much more efficient than blindly adding
    6962	 * it to the atomic save/restore list, especially as the "save" half
    6963	 * (saving guest MSRs on vmexit) doesn't even exist in KVM.
   6964	 *
   6965	 * For non-nested case:
   6966	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
   6967	 * save it.
   6968	 *
   6969	 * For nested case:
   6970	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
   6971	 * save it.
   6972	 */
   6973	if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
   6974		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
   6975
   6976	x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
   6977
   6978	/* All fields are clean at this point */
   6979	if (static_branch_unlikely(&enable_evmcs)) {
   6980		current_evmcs->hv_clean_fields |=
   6981			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
   6982
   6983		current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
   6984	}
   6985
   6986	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
   6987	if (vmx->host_debugctlmsr)
   6988		update_debugctlmsr(vmx->host_debugctlmsr);
   6989
   6990#ifndef CONFIG_X86_64
   6991	/*
   6992	 * The sysexit path does not restore ds/es, so we must set them to
   6993	 * a reasonable value ourselves.
   6994	 *
   6995	 * We can't defer this to vmx_prepare_switch_to_host() since that
   6996	 * function may be executed in interrupt context, which saves and
    6997	 * restores segments around it, nullifying its effect.
   6998	 */
   6999	loadsegment(ds, __USER_DS);
   7000	loadsegment(es, __USER_DS);
   7001#endif
   7002
   7003	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
   7004
   7005	pt_guest_exit(vmx);
   7006
   7007	kvm_load_host_xsave_state(vcpu);
   7008
   7009	if (is_guest_mode(vcpu)) {
   7010		/*
    7011		 * Track VMLAUNCH/VMRESUME that have made it past guest state
   7012		 * checking.
   7013		 */
   7014		if (vmx->nested.nested_run_pending &&
   7015		    !vmx->exit_reason.failed_vmentry)
   7016			++vcpu->stat.nested_run;
   7017
   7018		vmx->nested.nested_run_pending = 0;
   7019	}
   7020
   7021	vmx->idt_vectoring_info = 0;
   7022
   7023	if (unlikely(vmx->fail)) {
   7024		vmx->exit_reason.full = 0xdead;
   7025		return EXIT_FASTPATH_NONE;
   7026	}
   7027
   7028	vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
   7029	if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
   7030		kvm_machine_check();
   7031
   7032	if (likely(!vmx->exit_reason.failed_vmentry))
   7033		vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
   7034
   7035	trace_kvm_exit(vcpu, KVM_ISA_VMX);
   7036
   7037	if (unlikely(vmx->exit_reason.failed_vmentry))
   7038		return EXIT_FASTPATH_NONE;
   7039
   7040	vmx->loaded_vmcs->launched = 1;
   7041
   7042	vmx_recover_nmi_blocking(vmx);
   7043	vmx_complete_interrupts(vmx);
   7044
   7045	if (is_guest_mode(vcpu))
   7046		return EXIT_FASTPATH_NONE;
   7047
   7048	return vmx_exit_handlers_fastpath(vcpu);
   7049}
   7050
   7051static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
   7052{
   7053	struct vcpu_vmx *vmx = to_vmx(vcpu);
   7054
   7055	if (enable_pml)
   7056		vmx_destroy_pml_buffer(vmx);
   7057	free_vpid(vmx->vpid);
   7058	nested_vmx_free_vcpu(vcpu);
   7059	free_loaded_vmcs(vmx->loaded_vmcs);
   7060}
   7061
   7062static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
   7063{
   7064	struct vmx_uret_msr *tsx_ctrl;
   7065	struct vcpu_vmx *vmx;
   7066	int i, err;
   7067
   7068	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
   7069	vmx = to_vmx(vcpu);
   7070
   7071	INIT_LIST_HEAD(&vmx->pi_wakeup_list);
   7072
   7073	err = -ENOMEM;
   7074
   7075	vmx->vpid = allocate_vpid();
   7076
   7077	/*
    7078	 * If PML is turned on, failure to enable PML just results in failure
    7079	 * to create the vcpu, so we can simplify the PML logic by avoiding
    7080	 * corner cases such as enabling PML only partially across the
    7081	 * guest's vcpus.
   7082	 */
   7083	if (enable_pml) {
   7084		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
   7085		if (!vmx->pml_pg)
   7086			goto free_vpid;
   7087	}
   7088
   7089	for (i = 0; i < kvm_nr_uret_msrs; ++i)
   7090		vmx->guest_uret_msrs[i].mask = -1ull;
   7091	if (boot_cpu_has(X86_FEATURE_RTM)) {
   7092		/*
   7093		 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
   7094		 * Keep the host value unchanged to avoid changing CPUID bits
   7095		 * under the host kernel's feet.
   7096		 */
   7097		tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
   7098		if (tsx_ctrl)
   7099			tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
   7100	}
   7101
   7102	err = alloc_loaded_vmcs(&vmx->vmcs01);
   7103	if (err < 0)
   7104		goto free_pml;
   7105
   7106	/*
   7107	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
   7108	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
    7109	 * feature only for vmcs01; KVM currently isn't equipped to realize any
   7110	 * performance benefits from enabling it for vmcs02.
   7111	 */
   7112	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
   7113	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
   7114		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
   7115
   7116		evmcs->hv_enlightenments_control.msr_bitmap = 1;
   7117	}
   7118
   7119	/* The MSR bitmap starts with all ones */
   7120	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
   7121	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
   7122
   7123	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
   7124#ifdef CONFIG_X86_64
   7125	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
   7126	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
   7127	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
   7128#endif
   7129	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
   7130	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
   7131	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
   7132	if (kvm_cstate_in_guest(vcpu->kvm)) {
   7133		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
   7134		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
   7135		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
   7136		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
   7137	}
   7138
   7139	vmx->loaded_vmcs = &vmx->vmcs01;
   7140
   7141	if (cpu_need_virtualize_apic_accesses(vcpu)) {
   7142		err = alloc_apic_access_page(vcpu->kvm);
   7143		if (err)
   7144			goto free_vmcs;
   7145	}
   7146
   7147	if (enable_ept && !enable_unrestricted_guest) {
   7148		err = init_rmode_identity_map(vcpu->kvm);
   7149		if (err)
   7150			goto free_vmcs;
   7151	}
   7152
   7153	return 0;
   7154
   7155free_vmcs:
   7156	free_loaded_vmcs(vmx->loaded_vmcs);
   7157free_pml:
   7158	vmx_destroy_pml_buffer(vmx);
   7159free_vpid:
   7160	free_vpid(vmx->vpid);
   7161	return err;
   7162}
   7163
   7164#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
   7165#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
   7166
   7167static int vmx_vm_init(struct kvm *kvm)
   7168{
   7169	if (!ple_gap)
   7170		kvm->arch.pause_in_guest = true;
   7171
   7172	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
   7173		switch (l1tf_mitigation) {
   7174		case L1TF_MITIGATION_OFF:
   7175		case L1TF_MITIGATION_FLUSH_NOWARN:
   7176			/* 'I explicitly don't care' is set */
   7177			break;
   7178		case L1TF_MITIGATION_FLUSH:
   7179		case L1TF_MITIGATION_FLUSH_NOSMT:
   7180		case L1TF_MITIGATION_FULL:
   7181			/*
   7182			 * Warn upon starting the first VM in a potentially
   7183			 * insecure environment.
   7184			 */
   7185			if (sched_smt_active())
   7186				pr_warn_once(L1TF_MSG_SMT);
   7187			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
   7188				pr_warn_once(L1TF_MSG_L1D);
   7189			break;
   7190		case L1TF_MITIGATION_FULL_FORCE:
   7191			/* Flush is enforced */
   7192			break;
   7193		}
   7194	}
   7195	return 0;
   7196}
   7197
   7198static int __init vmx_check_processor_compat(void)
   7199{
   7200	struct vmcs_config vmcs_conf;
   7201	struct vmx_capability vmx_cap;
   7202
   7203	if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
   7204	    !this_cpu_has(X86_FEATURE_VMX)) {
   7205		pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
   7206		return -EIO;
   7207	}
   7208
   7209	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
   7210		return -EIO;
   7211	if (nested)
   7212		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
   7213	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
   7214		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
   7215				smp_processor_id());
   7216		return -EIO;
   7217	}
   7218	return 0;
   7219}
   7220
   7221static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
   7222{
   7223	u8 cache;
   7224
   7225	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
   7226	 * memory aliases with conflicting memory types and sometimes MCEs.
    7227	 * We have to be careful as to which of them are honored and when.
   7228	 *
   7229	 * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
   7230	 * UC.  The effective memory type is UC or WC depending on guest PAT.
   7231	 * This was historically the source of MCEs and we want to be
   7232	 * conservative.
   7233	 *
   7234	 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
   7235	 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
   7236	 * EPT memory type is set to WB.  The effective memory type is forced
   7237	 * WB.
   7238	 *
    7239	 * Otherwise, we trust the guest.  Guest CD/MTRR/PAT are all honored.  The
   7240	 * EPT memory type is used to emulate guest CD/MTRR.
   7241	 */
   7242
   7243	if (is_mmio)
   7244		return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
   7245
   7246	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
   7247		return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
   7248
   7249	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
   7250		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
   7251			cache = MTRR_TYPE_WRBACK;
   7252		else
   7253			cache = MTRR_TYPE_UNCACHABLE;
   7254
   7255		return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
   7256	}
   7257
   7258	return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
   7259}
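
        /*
         * Example encodings of the return value above, assuming the usual VMX
         * definitions (EPT memory type in EPTE bits 5:3, "ignore PAT" in bit 6,
         * MTRR_TYPE_WRBACK == 6, MTRR_TYPE_UNCACHABLE == 0):
         *
         *   is_mmio:                  0 << 3                      = 0x00 (UC, guest PAT applies)
         *   no noncoherent DMA:      (6 << 3) | VMX_EPT_IPAT_BIT  = 0x70 (forced WB)
         *   CR0.CD with CD/NW quirk: (6 << 3) | VMX_EPT_IPAT_BIT  = 0x70 (forced WB)
         */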
   7260
   7261static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
   7262{
   7263	/*
   7264	 * These bits in the secondary execution controls field
    7265	 * are dynamic; the others are mostly based on the hypervisor
   7266	 * architecture and the guest's CPUID.  Do not touch the
   7267	 * dynamic bits.
   7268	 */
   7269	u32 mask =
   7270		SECONDARY_EXEC_SHADOW_VMCS |
   7271		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
   7272		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
   7273		SECONDARY_EXEC_DESC;
   7274
   7275	u32 cur_ctl = secondary_exec_controls_get(vmx);
   7276
   7277	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
   7278}
   7279
   7280/*
   7281 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
   7282 * (indicating "allowed-1") if they are supported in the guest's CPUID.
   7283 */
   7284static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
   7285{
   7286	struct vcpu_vmx *vmx = to_vmx(vcpu);
   7287	struct kvm_cpuid_entry2 *entry;
   7288
   7289	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
   7290	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
   7291
   7292#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
   7293	if (entry && (entry->_reg & (_cpuid_mask)))			\
   7294		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);	\
   7295} while (0)
   7296
   7297	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
   7298	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
   7299	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
   7300	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
   7301	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
   7302	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
   7303	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
   7304	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
   7305	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
   7306	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
   7307	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
   7308	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
   7309	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
   7310	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
   7311	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
   7312
   7313	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
   7314	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
   7315	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
   7316	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
   7317	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
   7318	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
   7319	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
   7320
   7321#undef cr4_fixed1_update
   7322}
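
        /*
         * For reference, one expansion of the cr4_fixed1_update() macro used
         * above (modulo the do/while wrapper), taken directly from its
         * definition:
         *
         *   cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
         *
         * becomes
         *
         *   if (entry && (entry->ebx & feature_bit(SMEP)))
         *           vmx->nested.msrs.cr4_fixed1 |= X86_CR4_SMEP;
         *
         * i.e. a CR4 bit is reported as allowed-1 only when the corresponding
         * CPUID feature bit is exposed to the guest.
         */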
   7323
   7324static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
   7325{
   7326	struct vcpu_vmx *vmx = to_vmx(vcpu);
   7327
   7328	if (kvm_mpx_supported()) {
   7329		bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
   7330
   7331		if (mpx_enabled) {
   7332			vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
   7333			vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
   7334		} else {
   7335			vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
   7336			vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
   7337		}
   7338	}
   7339}
   7340
   7341static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
   7342{
   7343	struct vcpu_vmx *vmx = to_vmx(vcpu);
   7344	struct kvm_cpuid_entry2 *best = NULL;
   7345	int i;
   7346
   7347	for (i = 0; i < PT_CPUID_LEAVES; i++) {
   7348		best = kvm_find_cpuid_entry(vcpu, 0x14, i);
   7349		if (!best)
   7350			return;
   7351		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
   7352		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
   7353		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
   7354		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
   7355	}
   7356
   7357	/* Get the number of configurable Address Ranges for filtering */
   7358	vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
   7359						PT_CAP_num_address_ranges);
   7360
    7361	/* Initialize the ctl bitmask and clear the bits that have no CPUID dependency */
   7362	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
   7363			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
   7364			RTIT_CTL_BRANCH_EN);
   7365
   7366	/*
    7367	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
    7368	 * setting it will inject a #GP.
   7369	 */
   7370	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
   7371		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
   7372
   7373	/*
   7374	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
   7375	 * PSBFreq can be set
   7376	 */
   7377	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
   7378		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
   7379				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
   7380
   7381	/*
   7382	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
   7383	 */
   7384	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
   7385		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
   7386					      RTIT_CTL_MTC_RANGE);
   7387
   7388	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
   7389	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
   7390		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
   7391							RTIT_CTL_PTW_EN);
   7392
   7393	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
   7394	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
   7395		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
   7396
   7397	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
   7398	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
   7399		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
   7400
   7401	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
   7402	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
   7403		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
   7404
   7405	/* unmask address range configure area */
   7406	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
   7407		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
   7408}
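
        /*
         * Example for the address-range unmask loop above (a sketch, assuming
         * the RTIT_CTL ADDRn_CFG fields occupy bits 35:32, 39:36, ... as in
         * the Intel PT documentation): with num_address_ranges == 2 the loop
         * clears
         *
         *   i == 0: 0xfULL << 32   (ADDR0_CFG)
         *   i == 1: 0xfULL << 36   (ADDR1_CFG)
         *
         * from ctl_bitmask, so guest writes to those nibbles are no longer
         * treated as reserved-bit violations.
         */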
   7409
   7410static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
   7411{
   7412	struct vcpu_vmx *vmx = to_vmx(vcpu);
   7413
   7414	/* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
   7415	vcpu->arch.xsaves_enabled = false;
   7416
   7417	vmx_setup_uret_msrs(vmx);
   7418
   7419	if (cpu_has_secondary_exec_ctrls())
   7420		vmcs_set_secondary_exec_control(vmx,
   7421						vmx_secondary_exec_control(vmx));
   7422
   7423	if (nested_vmx_allowed(vcpu))
   7424		vmx->msr_ia32_feature_control_valid_bits |=
   7425			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
   7426			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
   7427	else
   7428		vmx->msr_ia32_feature_control_valid_bits &=
   7429			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
   7430			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
   7431
   7432	if (nested_vmx_allowed(vcpu)) {
   7433		nested_vmx_cr_fixed1_bits_update(vcpu);
   7434		nested_vmx_entry_exit_ctls_update(vcpu);
   7435	}
   7436
   7437	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
   7438			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
   7439		update_intel_pt_cfg(vcpu);
   7440
   7441	if (boot_cpu_has(X86_FEATURE_RTM)) {
   7442		struct vmx_uret_msr *msr;
   7443		msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
   7444		if (msr) {
   7445			bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
   7446			vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
   7447		}
   7448	}
   7449
   7450	if (kvm_cpu_cap_has(X86_FEATURE_XFD))
   7451		vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
   7452					  !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
   7453
   7454
   7455	set_cr4_guest_host_mask(vmx);
   7456
   7457	vmx_write_encls_bitmap(vcpu, NULL);
   7458	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
   7459		vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
   7460	else
   7461		vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
   7462
   7463	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
   7464		vmx->msr_ia32_feature_control_valid_bits |=
   7465			FEAT_CTL_SGX_LC_ENABLED;
   7466	else
   7467		vmx->msr_ia32_feature_control_valid_bits &=
   7468			~FEAT_CTL_SGX_LC_ENABLED;
   7469
   7470	/* Refresh #PF interception to account for MAXPHYADDR changes. */
   7471	vmx_update_exception_bitmap(vcpu);
   7472}
   7473
   7474static __init void vmx_set_cpu_caps(void)
   7475{
   7476	kvm_set_cpu_caps();
   7477
   7478	/* CPUID 0x1 */
   7479	if (nested)
   7480		kvm_cpu_cap_set(X86_FEATURE_VMX);
   7481
   7482	/* CPUID 0x7 */
   7483	if (kvm_mpx_supported())
   7484		kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
   7485	if (!cpu_has_vmx_invpcid())
   7486		kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
   7487	if (vmx_pt_mode_is_host_guest())
   7488		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
   7489
   7490	if (!enable_sgx) {
   7491		kvm_cpu_cap_clear(X86_FEATURE_SGX);
   7492		kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
   7493		kvm_cpu_cap_clear(X86_FEATURE_SGX1);
   7494		kvm_cpu_cap_clear(X86_FEATURE_SGX2);
   7495	}
   7496
   7497	if (vmx_umip_emulated())
   7498		kvm_cpu_cap_set(X86_FEATURE_UMIP);
   7499
   7500	/* CPUID 0xD.1 */
   7501	supported_xss = 0;
   7502	if (!cpu_has_vmx_xsaves())
   7503		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
   7504
   7505	/* CPUID 0x80000001 and 0x7 (RDPID) */
   7506	if (!cpu_has_vmx_rdtscp()) {
   7507		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
   7508		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
   7509	}
   7510
   7511	if (cpu_has_vmx_waitpkg())
   7512		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
   7513}
   7514
   7515static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
   7516{
   7517	to_vmx(vcpu)->req_immediate_exit = true;
   7518}
   7519
   7520static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
   7521				  struct x86_instruction_info *info)
   7522{
   7523	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   7524	unsigned short port;
   7525	bool intercept;
   7526	int size;
   7527
   7528	if (info->intercept == x86_intercept_in ||
   7529	    info->intercept == x86_intercept_ins) {
   7530		port = info->src_val;
   7531		size = info->dst_bytes;
   7532	} else {
   7533		port = info->dst_val;
   7534		size = info->src_bytes;
   7535	}
   7536
   7537	/*
   7538	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
   7539	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
   7540	 * control.
   7541	 *
   7542	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
   7543	 */
   7544	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
   7545		intercept = nested_cpu_has(vmcs12,
   7546					   CPU_BASED_UNCOND_IO_EXITING);
   7547	else
   7548		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
   7549
   7550	/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
   7551	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
   7552}
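
        /*
         * Decision summary for the helper above: with CPU_BASED_USE_IO_BITMAPS
         * clear in vmcs12, any IN/OUT is intercepted iff
         * CPU_BASED_UNCOND_IO_EXITING is set; with the bitmap control set, a
         * hypothetical 2-byte OUT to port 0x3f8, for example, is intercepted
         * iff nested_vmx_check_io_bitmaps(vcpu, 0x3f8, 2) reports that range
         * as covered by L1's IO bitmaps.
         */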
   7553
   7554static int vmx_check_intercept(struct kvm_vcpu *vcpu,
   7555			       struct x86_instruction_info *info,
   7556			       enum x86_intercept_stage stage,
   7557			       struct x86_exception *exception)
   7558{
   7559	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   7560
   7561	switch (info->intercept) {
   7562	/*
   7563	 * RDPID causes #UD if disabled through secondary execution controls.
   7564	 * Because it is marked as EmulateOnUD, we need to intercept it here.
   7565	 * Note, RDPID is hidden behind ENABLE_RDTSCP.
   7566	 */
   7567	case x86_intercept_rdpid:
   7568		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
   7569			exception->vector = UD_VECTOR;
   7570			exception->error_code_valid = false;
   7571			return X86EMUL_PROPAGATE_FAULT;
   7572		}
   7573		break;
   7574
   7575	case x86_intercept_in:
   7576	case x86_intercept_ins:
   7577	case x86_intercept_out:
   7578	case x86_intercept_outs:
   7579		return vmx_check_intercept_io(vcpu, info);
   7580
   7581	case x86_intercept_lgdt:
   7582	case x86_intercept_lidt:
   7583	case x86_intercept_lldt:
   7584	case x86_intercept_ltr:
   7585	case x86_intercept_sgdt:
   7586	case x86_intercept_sidt:
   7587	case x86_intercept_sldt:
   7588	case x86_intercept_str:
   7589		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
   7590			return X86EMUL_CONTINUE;
   7591
   7592		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
   7593		break;
   7594
   7595	/* TODO: check more intercepts... */
   7596	default:
   7597		break;
   7598	}
   7599
   7600	return X86EMUL_UNHANDLEABLE;
   7601}
   7602
   7603#ifdef CONFIG_X86_64
    7604/* (a << shift) / divisor; return 1 on overflow, otherwise 0 */
   7605static inline int u64_shl_div_u64(u64 a, unsigned int shift,
   7606				  u64 divisor, u64 *result)
   7607{
   7608	u64 low = a << shift, high = a >> (64 - shift);
   7609
   7610	/* To avoid the overflow on divq */
   7611	if (high >= divisor)
   7612		return 1;
   7613
    7614	/* low holds the result, high holds the remainder, which is discarded */
   7615	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
   7616	    "rm" (divisor), "0" (low), "1" (high));
   7617	*result = low;
   7618
   7619	return 0;
   7620}
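
        /*
         * u64_shl_div_u64() above treats (a << shift) as the 128-bit value
         * high:low and divides it by divisor with a single divq.  Two worked
         * examples, derived only from that code:
         *
         *   a = 10,         shift = 3, divisor = 4: low = 80, high = 0
         *     -> *result = 20, returns 0
         *   a = 1ULL << 63, shift = 2, divisor = 2: high = 2 >= divisor
         *     -> returns 1, as the true quotient (1 << 64) does not fit in 64 bits
         */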
   7621
   7622static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
   7623			    bool *expired)
   7624{
   7625	struct vcpu_vmx *vmx;
   7626	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
   7627	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
   7628
   7629	vmx = to_vmx(vcpu);
   7630	tscl = rdtsc();
   7631	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
   7632	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
   7633	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
   7634						    ktimer->timer_advance_ns);
   7635
   7636	if (delta_tsc > lapic_timer_advance_cycles)
   7637		delta_tsc -= lapic_timer_advance_cycles;
   7638	else
   7639		delta_tsc = 0;
   7640
   7641	/* Convert to host delta tsc if tsc scaling is enabled */
   7642	if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
   7643	    delta_tsc && u64_shl_div_u64(delta_tsc,
   7644				kvm_tsc_scaling_ratio_frac_bits,
   7645				vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
   7646		return -ERANGE;
   7647
   7648	/*
    7649	 * If the delta tsc can't fit in 32 bits after the multiplier shift,
   7650	 * we can't use the preemption timer.
   7651	 * It's possible that it fits on later vmentries, but checking
   7652	 * on every vmentry is costly so we just use an hrtimer.
   7653	 */
   7654	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
   7655		return -ERANGE;
   7656
   7657	vmx->hv_deadline_tsc = tscl + delta_tsc;
   7658	*expired = !delta_tsc;
   7659	return 0;
   7660}
   7661
   7662static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
   7663{
   7664	to_vmx(vcpu)->hv_deadline_tsc = -1;
   7665}
   7666#endif
   7667
   7668static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
   7669{
   7670	if (!kvm_pause_in_guest(vcpu->kvm))
   7671		shrink_ple_window(vcpu);
   7672}
   7673
   7674void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
   7675{
   7676	struct vcpu_vmx *vmx = to_vmx(vcpu);
   7677
   7678	if (is_guest_mode(vcpu)) {
   7679		vmx->nested.update_vmcs01_cpu_dirty_logging = true;
   7680		return;
   7681	}
   7682
   7683	/*
    7684	 * Note, cpu_dirty_logging_count can be changed concurrently with this
   7685	 * code, but in that case another update request will be made and so
   7686	 * the guest will never run with a stale PML value.
   7687	 */
   7688	if (vcpu->kvm->arch.cpu_dirty_logging_count)
   7689		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
   7690	else
   7691		secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
   7692}
   7693
   7694static void vmx_setup_mce(struct kvm_vcpu *vcpu)
   7695{
   7696	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
   7697		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
   7698			FEAT_CTL_LMCE_ENABLED;
   7699	else
   7700		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
   7701			~FEAT_CTL_LMCE_ENABLED;
   7702}
   7703
   7704static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
   7705{
   7706	/* we need a nested vmexit to enter SMM, postpone if run is pending */
   7707	if (to_vmx(vcpu)->nested.nested_run_pending)
   7708		return -EBUSY;
   7709	return !is_smm(vcpu);
   7710}
   7711
   7712static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
   7713{
   7714	struct vcpu_vmx *vmx = to_vmx(vcpu);
   7715
   7716	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
   7717	if (vmx->nested.smm.guest_mode)
   7718		nested_vmx_vmexit(vcpu, -1, 0, 0);
   7719
   7720	vmx->nested.smm.vmxon = vmx->nested.vmxon;
   7721	vmx->nested.vmxon = false;
   7722	vmx_clear_hlt(vcpu);
   7723	return 0;
   7724}
   7725
   7726static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
   7727{
   7728	struct vcpu_vmx *vmx = to_vmx(vcpu);
   7729	int ret;
   7730
   7731	if (vmx->nested.smm.vmxon) {
   7732		vmx->nested.vmxon = true;
   7733		vmx->nested.smm.vmxon = false;
   7734	}
   7735
   7736	if (vmx->nested.smm.guest_mode) {
   7737		ret = nested_vmx_enter_non_root_mode(vcpu, false);
   7738		if (ret)
   7739			return ret;
   7740
   7741		vmx->nested.nested_run_pending = 1;
   7742		vmx->nested.smm.guest_mode = false;
   7743	}
   7744	return 0;
   7745}
   7746
   7747static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
   7748{
   7749	/* RSM will cause a vmexit anyway.  */
   7750}
   7751
   7752static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
   7753{
   7754	return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
   7755}
   7756
   7757static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
   7758{
   7759	if (is_guest_mode(vcpu)) {
   7760		struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
   7761
   7762		if (hrtimer_try_to_cancel(timer) == 1)
   7763			hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
   7764	}
   7765}
   7766
   7767static void vmx_hardware_unsetup(void)
   7768{
   7769	kvm_set_posted_intr_wakeup_handler(NULL);
   7770
   7771	if (nested)
   7772		nested_vmx_hardware_unsetup();
   7773
   7774	free_kvm_area();
   7775}
   7776
   7777static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
   7778{
   7779	ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
   7780			  BIT(APICV_INHIBIT_REASON_ABSENT) |
   7781			  BIT(APICV_INHIBIT_REASON_HYPERV) |
   7782			  BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
   7783			  BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
   7784			  BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
   7785
   7786	return supported & BIT(reason);
   7787}
   7788
   7789static struct kvm_x86_ops vmx_x86_ops __initdata = {
   7790	.name = "kvm_intel",
   7791
   7792	.hardware_unsetup = vmx_hardware_unsetup,
   7793
   7794	.hardware_enable = vmx_hardware_enable,
   7795	.hardware_disable = vmx_hardware_disable,
   7796	.has_emulated_msr = vmx_has_emulated_msr,
   7797
   7798	.vm_size = sizeof(struct kvm_vmx),
   7799	.vm_init = vmx_vm_init,
   7800
   7801	.vcpu_create = vmx_vcpu_create,
   7802	.vcpu_free = vmx_vcpu_free,
   7803	.vcpu_reset = vmx_vcpu_reset,
   7804
   7805	.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
   7806	.vcpu_load = vmx_vcpu_load,
   7807	.vcpu_put = vmx_vcpu_put,
   7808
   7809	.update_exception_bitmap = vmx_update_exception_bitmap,
   7810	.get_msr_feature = vmx_get_msr_feature,
   7811	.get_msr = vmx_get_msr,
   7812	.set_msr = vmx_set_msr,
   7813	.get_segment_base = vmx_get_segment_base,
   7814	.get_segment = vmx_get_segment,
   7815	.set_segment = vmx_set_segment,
   7816	.get_cpl = vmx_get_cpl,
   7817	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
   7818	.set_cr0 = vmx_set_cr0,
   7819	.is_valid_cr4 = vmx_is_valid_cr4,
   7820	.set_cr4 = vmx_set_cr4,
   7821	.set_efer = vmx_set_efer,
   7822	.get_idt = vmx_get_idt,
   7823	.set_idt = vmx_set_idt,
   7824	.get_gdt = vmx_get_gdt,
   7825	.set_gdt = vmx_set_gdt,
   7826	.set_dr7 = vmx_set_dr7,
   7827	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
   7828	.cache_reg = vmx_cache_reg,
   7829	.get_rflags = vmx_get_rflags,
   7830	.set_rflags = vmx_set_rflags,
   7831	.get_if_flag = vmx_get_if_flag,
   7832
   7833	.flush_tlb_all = vmx_flush_tlb_all,
   7834	.flush_tlb_current = vmx_flush_tlb_current,
   7835	.flush_tlb_gva = vmx_flush_tlb_gva,
   7836	.flush_tlb_guest = vmx_flush_tlb_guest,
   7837
   7838	.vcpu_pre_run = vmx_vcpu_pre_run,
   7839	.vcpu_run = vmx_vcpu_run,
   7840	.handle_exit = vmx_handle_exit,
   7841	.skip_emulated_instruction = vmx_skip_emulated_instruction,
   7842	.update_emulated_instruction = vmx_update_emulated_instruction,
   7843	.set_interrupt_shadow = vmx_set_interrupt_shadow,
   7844	.get_interrupt_shadow = vmx_get_interrupt_shadow,
   7845	.patch_hypercall = vmx_patch_hypercall,
   7846	.inject_irq = vmx_inject_irq,
   7847	.inject_nmi = vmx_inject_nmi,
   7848	.queue_exception = vmx_queue_exception,
   7849	.cancel_injection = vmx_cancel_injection,
   7850	.interrupt_allowed = vmx_interrupt_allowed,
   7851	.nmi_allowed = vmx_nmi_allowed,
   7852	.get_nmi_mask = vmx_get_nmi_mask,
   7853	.set_nmi_mask = vmx_set_nmi_mask,
   7854	.enable_nmi_window = vmx_enable_nmi_window,
   7855	.enable_irq_window = vmx_enable_irq_window,
   7856	.update_cr8_intercept = vmx_update_cr8_intercept,
   7857	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
   7858	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
   7859	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
   7860	.load_eoi_exitmap = vmx_load_eoi_exitmap,
   7861	.apicv_post_state_restore = vmx_apicv_post_state_restore,
   7862	.check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
   7863	.hwapic_irr_update = vmx_hwapic_irr_update,
   7864	.hwapic_isr_update = vmx_hwapic_isr_update,
   7865	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
   7866	.sync_pir_to_irr = vmx_sync_pir_to_irr,
   7867	.deliver_interrupt = vmx_deliver_interrupt,
   7868	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
   7869
   7870	.set_tss_addr = vmx_set_tss_addr,
   7871	.set_identity_map_addr = vmx_set_identity_map_addr,
   7872	.get_mt_mask = vmx_get_mt_mask,
   7873
   7874	.get_exit_info = vmx_get_exit_info,
   7875
   7876	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
   7877
   7878	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
   7879
   7880	.get_l2_tsc_offset = vmx_get_l2_tsc_offset,
   7881	.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
   7882	.write_tsc_offset = vmx_write_tsc_offset,
   7883	.write_tsc_multiplier = vmx_write_tsc_multiplier,
   7884
   7885	.load_mmu_pgd = vmx_load_mmu_pgd,
   7886
   7887	.check_intercept = vmx_check_intercept,
   7888	.handle_exit_irqoff = vmx_handle_exit_irqoff,
   7889
   7890	.request_immediate_exit = vmx_request_immediate_exit,
   7891
   7892	.sched_in = vmx_sched_in,
   7893
   7894	.cpu_dirty_log_size = PML_ENTITY_NUM,
   7895	.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
   7896
   7897	.nested_ops = &vmx_nested_ops,
   7898
   7899	.pi_update_irte = vmx_pi_update_irte,
   7900	.pi_start_assignment = vmx_pi_start_assignment,
   7901
   7902#ifdef CONFIG_X86_64
   7903	.set_hv_timer = vmx_set_hv_timer,
   7904	.cancel_hv_timer = vmx_cancel_hv_timer,
   7905#endif
   7906
   7907	.setup_mce = vmx_setup_mce,
   7908
   7909	.smi_allowed = vmx_smi_allowed,
   7910	.enter_smm = vmx_enter_smm,
   7911	.leave_smm = vmx_leave_smm,
   7912	.enable_smi_window = vmx_enable_smi_window,
   7913
   7914	.can_emulate_instruction = vmx_can_emulate_instruction,
   7915	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
   7916	.migrate_timers = vmx_migrate_timers,
   7917
   7918	.msr_filter_changed = vmx_msr_filter_changed,
   7919	.complete_emulated_msr = kvm_complete_insn_gp,
   7920
   7921	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
   7922};
   7923
   7924static unsigned int vmx_handle_intel_pt_intr(void)
   7925{
   7926	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
   7927
   7928	/* '0' on failure so that the !PT case can use a RET0 static call. */
   7929	if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
   7930		return 0;
   7931
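        	/*
        	 * Forward the Intel PT ToPA PMI to the guest: flag it in the
        	 * guest's PERF_GLOBAL_STATUS and request a PMI injection.
        	 */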
   7932	kvm_make_request(KVM_REQ_PMI, vcpu);
   7933	__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
   7934		  (unsigned long *)&vcpu->arch.pmu.global_status);
   7935	return 1;
   7936}
   7937
   7938static __init void vmx_setup_user_return_msrs(void)
   7939{
   7941	/*
   7942	 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
   7943	 * will emulate SYSCALL in legacy mode if the vendor string in guest
    7944	 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!".  To
   7945	 * support this emulation, MSR_STAR is included in the list for i386,
   7946	 * but is never loaded into hardware.  MSR_CSTAR is also never loaded
   7947	 * into hardware and is here purely for emulation purposes.
   7948	 */
   7949	const u32 vmx_uret_msrs_list[] = {
   7950	#ifdef CONFIG_X86_64
   7951		MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
   7952	#endif
   7953		MSR_EFER, MSR_TSC_AUX, MSR_STAR,
   7954		MSR_IA32_TSX_CTRL,
   7955	};
   7956	int i;
   7957
   7958	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
   7959
   7960	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
   7961		kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
   7962}
   7963
   7964static void __init vmx_setup_me_spte_mask(void)
   7965{
   7966	u64 me_mask = 0;
   7967
   7968	/*
   7969	 * kvm_get_shadow_phys_bits() returns shadow_phys_bits.  Use
   7970	 * the former to avoid exposing shadow_phys_bits.
   7971	 *
    7972	 * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
    7973	 * shadow_phys_bits.  On MKTME and/or TDX capable systems,
    7974	 * boot_cpu_data.x86_phys_bits holds the actual physical address
    7975	 * width w/o the KeyID bits, and shadow_phys_bits equals MAXPHYADDR
    7976	 * as reported by CPUID.  The bits in between are the KeyID bits.
   7977	 */
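        	/*
        	 * Illustrative example (hypothetical numbers): with MAXPHYADDR = 46
        	 * and 6 MKTME KeyID bits, x86_phys_bits is reduced to 40, so the
        	 * resulting me_mask covers the KeyID bits 45:40.
        	 */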
   7978	if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
   7979		me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
   7980			kvm_get_shadow_phys_bits() - 1);
   7981	/*
    7982	 * Unlike SME, the host kernel doesn't support setting up any
    7983	 * MKTME KeyID on Intel platforms.  No memory encryption
    7984	 * bits should be included in the SPTE.
   7985	 */
   7986	kvm_mmu_set_me_spte_mask(0, me_mask);
   7987}
   7988
   7989static struct kvm_x86_init_ops vmx_init_ops __initdata;
   7990
   7991static __init int hardware_setup(void)
   7992{
   7993	unsigned long host_bndcfgs;
   7994	struct desc_ptr dt;
   7995	int r;
   7996
   7997	store_idt(&dt);
   7998	host_idt_base = dt.address;
   7999
   8000	vmx_setup_user_return_msrs();
   8001
   8002	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
   8003		return -EIO;
   8004
   8005	if (boot_cpu_has(X86_FEATURE_NX))
   8006		kvm_enable_efer_bits(EFER_NX);
   8007
   8008	if (boot_cpu_has(X86_FEATURE_MPX)) {
   8009		rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
   8010		WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
   8011	}
   8012
   8013	if (!cpu_has_vmx_mpx())
   8014		supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
   8015				    XFEATURE_MASK_BNDCSR);
   8016
   8017	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
   8018	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
   8019		enable_vpid = 0;
   8020
   8021	if (!cpu_has_vmx_ept() ||
   8022	    !cpu_has_vmx_ept_4levels() ||
   8023	    !cpu_has_vmx_ept_mt_wb() ||
   8024	    !cpu_has_vmx_invept_global())
   8025		enable_ept = 0;
   8026
   8027	/* NX support is required for shadow paging. */
   8028	if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
   8029		pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
   8030		return -EOPNOTSUPP;
   8031	}
   8032
   8033	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
   8034		enable_ept_ad_bits = 0;
   8035
   8036	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
   8037		enable_unrestricted_guest = 0;
   8038
   8039	if (!cpu_has_vmx_flexpriority())
   8040		flexpriority_enabled = 0;
   8041
   8042	if (!cpu_has_virtual_nmis())
   8043		enable_vnmi = 0;
   8044
   8045	/*
    8046	 * set_apic_access_page_addr() is used to reload the APIC access
    8047	 * page upon invalidation.  No need to do anything if not
   8048	 * using the APIC_ACCESS_ADDR VMCS field.
   8049	 */
   8050	if (!flexpriority_enabled)
   8051		vmx_x86_ops.set_apic_access_page_addr = NULL;
   8052
   8053	if (!cpu_has_vmx_tpr_shadow())
   8054		vmx_x86_ops.update_cr8_intercept = NULL;
   8055
   8056#if IS_ENABLED(CONFIG_HYPERV)
   8057	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
   8058	    && enable_ept) {
   8059		vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
   8060		vmx_x86_ops.tlb_remote_flush_with_range =
   8061				hv_remote_flush_tlb_with_range;
   8062	}
   8063#endif
   8064
   8065	if (!cpu_has_vmx_ple()) {
   8066		ple_gap = 0;
   8067		ple_window = 0;
   8068		ple_window_grow = 0;
   8069		ple_window_max = 0;
   8070		ple_window_shrink = 0;
   8071	}
   8072
   8073	if (!cpu_has_vmx_apicv())
   8074		enable_apicv = 0;
   8075	if (!enable_apicv)
   8076		vmx_x86_ops.sync_pir_to_irr = NULL;
   8077
   8078	if (cpu_has_vmx_tsc_scaling())
   8079		kvm_has_tsc_control = true;
   8080
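        	/*
        	 * The VMX TSC multiplier is a 64-bit fixed-point value with 48
        	 * fractional bits, i.e. a 1:1 ratio is 1ULL << 48.
        	 */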
   8081	kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
   8082	kvm_tsc_scaling_ratio_frac_bits = 48;
   8083	kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
   8084
   8085	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
   8086
   8087	if (enable_ept)
   8088		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
   8089				      cpu_has_vmx_ept_execute_only());
   8090
   8091	/*
    8092	 * Set up shadow_me_value/shadow_me_mask so that the MKTME KeyID
    8093	 * bits are included in shadow_zero_check.
   8094	 */
   8095	vmx_setup_me_spte_mask();
   8096
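        	/*
        	 * ept_caps_to_lpage_level() derives the max huge page level (4K,
        	 * 2M or 1G) supported for EPT from the EPT capability bits.
        	 */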
   8097	kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
   8098			  ept_caps_to_lpage_level(vmx_capability.ept));
   8099
   8100	/*
    8101	 * Only enable PML when the hardware supports the PML feature and both
    8102	 * EPT and EPT A/D bits are enabled -- PML depends on them to work.
   8103	 */
   8104	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
   8105		enable_pml = 0;
   8106
   8107	if (!enable_pml)
   8108		vmx_x86_ops.cpu_dirty_log_size = 0;
   8109
   8110	if (!cpu_has_vmx_preemption_timer())
   8111		enable_preemption_timer = false;
   8112
   8113	if (enable_preemption_timer) {
   8114		u64 use_timer_freq = 5000ULL * 1000 * 1000;
   8115		u64 vmx_msr;
   8116
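        		/*
        		 * VMX_MISC[4:0] is the rate shift: the preemption timer
        		 * counts down once every 2^N TSC cycles, so its tick
        		 * frequency is the TSC frequency >> N.
        		 */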
   8117		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
   8118		cpu_preemption_timer_multi =
   8119			vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
   8120
   8121		if (tsc_khz)
   8122			use_timer_freq = (u64)tsc_khz * 1000;
   8123		use_timer_freq >>= cpu_preemption_timer_multi;
   8124
   8125		/*
   8126		 * KVM "disables" the preemption timer by setting it to its max
   8127		 * value.  Don't use the timer if it might cause spurious exits
   8128		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
   8129		 */
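        		/*
        		 * The timer value is a 32-bit count, so the longest period
        		 * is 0xffffffff ticks; rejecting tick rates above
        		 * 0xffffffff / 10 Hz keeps even that maximum period at
        		 * 10 seconds or more.
        		 */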
   8130		if (use_timer_freq > 0xffffffffu / 10)
   8131			enable_preemption_timer = false;
   8132	}
   8133
   8134	if (!enable_preemption_timer) {
   8135		vmx_x86_ops.set_hv_timer = NULL;
   8136		vmx_x86_ops.cancel_hv_timer = NULL;
   8137		vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
   8138	}
   8139
   8140	kvm_mce_cap_supported |= MCG_LMCE_P;
   8141
   8142	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
   8143		return -EINVAL;
   8144	if (!enable_ept || !cpu_has_vmx_intel_pt())
   8145		pt_mode = PT_MODE_SYSTEM;
   8146	if (pt_mode == PT_MODE_HOST_GUEST)
   8147		vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
   8148	else
   8149		vmx_init_ops.handle_intel_pt_intr = NULL;
   8150
   8151	setup_default_sgx_lepubkeyhash();
   8152
   8153	if (nested) {
   8154		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
   8155					   vmx_capability.ept);
   8156
   8157		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
   8158		if (r)
   8159			return r;
   8160	}
   8161
   8162	vmx_set_cpu_caps();
   8163
   8164	r = alloc_kvm_area();
   8165	if (r && nested)
   8166		nested_vmx_hardware_unsetup();
   8167
   8168	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
   8169
   8170	return r;
   8171}
   8172
   8173static struct kvm_x86_init_ops vmx_init_ops __initdata = {
   8174	.cpu_has_kvm_support = cpu_has_kvm_support,
   8175	.disabled_by_bios = vmx_disabled_by_bios,
   8176	.check_processor_compatibility = vmx_check_processor_compat,
   8177	.hardware_setup = hardware_setup,
   8178	.handle_intel_pt_intr = NULL,
   8179
   8180	.runtime_ops = &vmx_x86_ops,
   8181	.pmu_ops = &intel_pmu_ops,
   8182};
   8183
   8184static void vmx_cleanup_l1d_flush(void)
   8185{
   8186	if (vmx_l1d_flush_pages) {
   8187		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
   8188		vmx_l1d_flush_pages = NULL;
   8189	}
   8190	/* Restore state so sysfs ignores VMX */
   8191	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
   8192}
   8193
   8194static void vmx_exit(void)
   8195{
   8196#ifdef CONFIG_KEXEC_CORE
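        	/*
        	 * Unhook the crash-time VMCLEAR callback and wait for any
        	 * in-flight (RCU-protected) users before the module goes away.
        	 */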
   8197	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
   8198	synchronize_rcu();
   8199#endif
   8200
   8201	kvm_exit();
   8202
   8203#if IS_ENABLED(CONFIG_HYPERV)
   8204	if (static_branch_unlikely(&enable_evmcs)) {
   8205		int cpu;
   8206		struct hv_vp_assist_page *vp_ap;
   8207		/*
   8208		 * Reset everything to support using non-enlightened VMCS
   8209		 * access later (e.g. when we reload the module with
   8210		 * enlightened_vmcs=0)
   8211		 */
   8212		for_each_online_cpu(cpu) {
   8213			vp_ap =	hv_get_vp_assist_page(cpu);
   8214
   8215			if (!vp_ap)
   8216				continue;
   8217
   8218			vp_ap->nested_control.features.directhypercall = 0;
   8219			vp_ap->current_nested_vmcs = 0;
   8220			vp_ap->enlighten_vmentry = 0;
   8221		}
   8222
   8223		static_branch_disable(&enable_evmcs);
   8224	}
   8225#endif
   8226	vmx_cleanup_l1d_flush();
   8227
   8228	allow_smaller_maxphyaddr = false;
   8229}
   8230module_exit(vmx_exit);
   8231
   8232static int __init vmx_init(void)
   8233{
   8234	int r, cpu;
   8235
   8236#if IS_ENABLED(CONFIG_HYPERV)
   8237	/*
    8238	 * Use the enlightened VMCS only if Hyper-V recommends it and the
    8239	 * host supports eVMCS v1 or above.  eVMCS support can also be
    8240	 * disabled via the 'enlightened_vmcs' module parameter.
   8241	 */
   8242	if (enlightened_vmcs &&
   8243	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
   8244	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
   8245	    KVM_EVMCS_VERSION) {
   8246
   8247		/* Check that we have assist pages on all online CPUs */
   8248		for_each_online_cpu(cpu) {
   8249			if (!hv_get_vp_assist_page(cpu)) {
   8250				enlightened_vmcs = false;
   8251				break;
   8252			}
   8253		}
   8254
   8255		if (enlightened_vmcs) {
   8256			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
   8257			static_branch_enable(&enable_evmcs);
   8258		}
   8259
   8260		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
   8261			vmx_x86_ops.enable_direct_tlbflush
   8262				= hv_enable_direct_tlbflush;
   8263
   8264	} else {
   8265		enlightened_vmcs = false;
   8266	}
   8267#endif
   8268
   8269	r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
   8270		     __alignof__(struct vcpu_vmx), THIS_MODULE);
   8271	if (r)
   8272		return r;
   8273
   8274	/*
    8275	 * Must be called after kvm_init() so that enable_ept is properly
    8276	 * set up.  Hand in the mitigation mode that was stored by the
    8277	 * pre-module-init parameter parser.  If no parameter was given, it
    8278	 * will contain 'auto', which is turned into the default 'cond'
    8279	 * mitigation mode.
   8280	 */
   8281	r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
   8282	if (r) {
   8283		vmx_exit();
   8284		return r;
   8285	}
   8286
   8287	vmx_setup_fb_clear_ctrl();
   8288
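        	/*
        	 * Initialize per-CPU state: the list of VMCSs loaded on each CPU
        	 * and the posted-interrupt wakeup bookkeeping.
        	 */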
   8289	for_each_possible_cpu(cpu) {
   8290		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
   8291
   8292		pi_init_cpu(cpu);
   8293	}
   8294
   8295#ifdef CONFIG_KEXEC_CORE
   8296	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
   8297			   crash_vmclear_local_loaded_vmcss);
   8298#endif
   8299	vmx_check_vmcs12_offsets();
   8300
   8301	/*
    8302	 * Shadow paging doesn't have a (further) performance penalty
    8303	 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so allow a smaller
    8304	 * guest MAXPHYADDR by default.
   8305	 */
   8306	if (!enable_ept)
   8307		allow_smaller_maxphyaddr = true;
   8308
   8309	return 0;
   8310}
   8311module_init(vmx_init);