cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

nested.c (214896B)


      1// SPDX-License-Identifier: GPL-2.0
      2
      3#include <linux/objtool.h>
      4#include <linux/percpu.h>
      5
      6#include <asm/debugreg.h>
      7#include <asm/mmu_context.h>
      8
      9#include "cpuid.h"
     10#include "evmcs.h"
     11#include "hyperv.h"
     12#include "mmu.h"
     13#include "nested.h"
     14#include "pmu.h"
     15#include "sgx.h"
     16#include "trace.h"
     17#include "vmx.h"
     18#include "x86.h"
     19
     20static bool __read_mostly enable_shadow_vmcs = 1;
     21module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
     22
     23static bool __read_mostly nested_early_check = 0;
     24module_param(nested_early_check, bool, S_IRUGO);
     25
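        /*
         * CC() wraps each individual VM-Enter consistency check below: it
         * evaluates to the wrapped condition and, when the check fails,
         * reports the failing expression (see
         * KVM_NESTED_VMENTER_CONSISTENCY_CHECK) so callers can simply
         * 'return -EINVAL'.
         */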
     26#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
     27
     28/*
     29 * Hyper-V requires all of these, so mark them as supported even though
     30 * they are just treated the same as all-context.
     31 */
     32#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
     33	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
     34	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
     35	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
     36	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
     37
     38#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
     39
     40enum {
     41	VMX_VMREAD_BITMAP,
     42	VMX_VMWRITE_BITMAP,
     43	VMX_BITMAP_NR
     44};
     45static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
     46
     47#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
     48#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
     49
     50struct shadow_vmcs_field {
     51	u16	encoding;
     52	u16	offset;
     53};
     54static struct shadow_vmcs_field shadow_read_only_fields[] = {
     55#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
     56#include "vmcs_shadow_fields.h"
     57};
     58static int max_shadow_read_only_fields =
     59	ARRAY_SIZE(shadow_read_only_fields);
     60
     61static struct shadow_vmcs_field shadow_read_write_fields[] = {
     62#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
     63#include "vmcs_shadow_fields.h"
     64};
     65static int max_shadow_read_write_fields =
     66	ARRAY_SIZE(shadow_read_write_fields);
     67
     68static void init_vmcs_shadow_fields(void)
     69{
     70	int i, j;
     71
     72	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
     73	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
     74
     75	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
     76		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
     77		u16 field = entry.encoding;
     78
     79		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
     80		    (i + 1 == max_shadow_read_only_fields ||
     81		     shadow_read_only_fields[i + 1].encoding != field + 1))
     82			pr_err("Missing field from shadow_read_only_field %x\n",
     83			       field + 1);
     84
     85		clear_bit(field, vmx_vmread_bitmap);
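        		/*
        		 * An odd encoding is the HIGH half of a 64-bit field.  On
        		 * 64-bit kernels the full field is copied via its even
        		 * encoding, so the HIGH entry can be dropped from the
        		 * table; on 32-bit kernels keep it, but point the offset
        		 * at the upper 32 bits of the vmcs12 field.
        		 */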
     86		if (field & 1)
     87#ifdef CONFIG_X86_64
     88			continue;
     89#else
     90			entry.offset += sizeof(u32);
     91#endif
     92		shadow_read_only_fields[j++] = entry;
     93	}
     94	max_shadow_read_only_fields = j;
     95
     96	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
     97		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
     98		u16 field = entry.encoding;
     99
    100		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
    101		    (i + 1 == max_shadow_read_write_fields ||
    102		     shadow_read_write_fields[i + 1].encoding != field + 1))
    103			pr_err("Missing field from shadow_read_write_field %x\n",
    104			       field + 1);
    105
    106		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
    107			  field <= GUEST_TR_AR_BYTES,
    108			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
    109
    110		/*
    111		 * PML and the preemption timer can be emulated, but the
    112		 * processor cannot vmwrite to fields that don't exist
    113		 * on bare metal.
    114		 */
    115		switch (field) {
    116		case GUEST_PML_INDEX:
    117			if (!cpu_has_vmx_pml())
    118				continue;
    119			break;
    120		case VMX_PREEMPTION_TIMER_VALUE:
    121			if (!cpu_has_vmx_preemption_timer())
    122				continue;
    123			break;
    124		case GUEST_INTR_STATUS:
    125			if (!cpu_has_vmx_apicv())
    126				continue;
    127			break;
    128		default:
    129			break;
    130		}
    131
    132		clear_bit(field, vmx_vmwrite_bitmap);
    133		clear_bit(field, vmx_vmread_bitmap);
    134		if (field & 1)
    135#ifdef CONFIG_X86_64
    136			continue;
    137#else
    138			entry.offset += sizeof(u32);
    139#endif
    140		shadow_read_write_fields[j++] = entry;
    141	}
    142	max_shadow_read_write_fields = j;
    143}
    144
    145/*
    146 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
    147 * set the success or error code of an emulated VMX instruction (as specified
    148 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
    149 * instruction.
    150 */
    151static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
    152{
    153	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
    154			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
    155			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
    156	return kvm_skip_emulated_instruction(vcpu);
    157}
    158
    159static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
    160{
    161	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
    162			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
    163			    X86_EFLAGS_SF | X86_EFLAGS_OF))
    164			| X86_EFLAGS_CF);
    165	return kvm_skip_emulated_instruction(vcpu);
    166}
    167
    168static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
    169				u32 vm_instruction_error)
    170{
    171	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
    172			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
    173			    X86_EFLAGS_SF | X86_EFLAGS_OF))
    174			| X86_EFLAGS_ZF);
    175	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
    176	/*
    177	 * We don't need to force sync to shadow VMCS because
    178	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
    179	 * fields and thus must be synced.
    180	 */
    181	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
    182		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
    183
    184	return kvm_skip_emulated_instruction(vcpu);
    185}
    186
    187static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
    188{
    189	struct vcpu_vmx *vmx = to_vmx(vcpu);
    190
    191	/*
    192	 * failValid writes the error number to the current VMCS, which
    193	 * can't be done if there isn't a current VMCS.
    194	 */
    195	if (vmx->nested.current_vmptr == INVALID_GPA &&
    196	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
    197		return nested_vmx_failInvalid(vcpu);
    198
    199	return nested_vmx_failValid(vcpu, vm_instruction_error);
    200}
    201
    202static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
    203{
     204	/* TODO: don't simply reset the guest here. */
    205	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
    206	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
    207}
    208
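        /*
         * VMX control capability MSRs report the bits that must be 1 in bits
         * 31:0 and the bits that may be 1 in bits 63:32.  vmx_control_msr()
         * packs a (low, high) pair into that layout and vmx_control_verify()
         * checks a control value against it.
         */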
    209static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
    210{
    211	return fixed_bits_valid(control, low, high);
    212}
    213
    214static inline u64 vmx_control_msr(u32 low, u32 high)
    215{
    216	return low | ((u64)high << 32);
    217}
    218
    219static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
    220{
    221	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
    222	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
    223	vmx->nested.need_vmcs12_to_shadow_sync = false;
    224}
    225
    226static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
    227{
    228	struct vcpu_vmx *vmx = to_vmx(vcpu);
    229
    230	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
    231		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
    232		vmx->nested.hv_evmcs = NULL;
    233	}
    234
    235	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
    236}
    237
    238static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
    239				     struct loaded_vmcs *prev)
    240{
    241	struct vmcs_host_state *dest, *src;
    242
    243	if (unlikely(!vmx->guest_state_loaded))
    244		return;
    245
    246	src = &prev->host_state;
    247	dest = &vmx->loaded_vmcs->host_state;
    248
    249	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
    250	dest->ldt_sel = src->ldt_sel;
    251#ifdef CONFIG_X86_64
    252	dest->ds_sel = src->ds_sel;
    253	dest->es_sel = src->es_sel;
    254#endif
    255}
    256
    257static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
    258{
    259	struct vcpu_vmx *vmx = to_vmx(vcpu);
    260	struct loaded_vmcs *prev;
    261	int cpu;
    262
    263	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
    264		return;
    265
    266	cpu = get_cpu();
    267	prev = vmx->loaded_vmcs;
    268	vmx->loaded_vmcs = vmcs;
    269	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
    270	vmx_sync_vmcs_host_state(vmx, prev);
    271	put_cpu();
    272
    273	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
    274
    275	/*
    276	 * All lazily updated registers will be reloaded from VMCS12 on both
    277	 * vmentry and vmexit.
    278	 */
    279	vcpu->arch.regs_dirty = 0;
    280}
    281
    282/*
    283 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
    284 * just stops using VMX.
    285 */
    286static void free_nested(struct kvm_vcpu *vcpu)
    287{
    288	struct vcpu_vmx *vmx = to_vmx(vcpu);
    289
    290	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
    291		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
    292
    293	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
    294		return;
    295
    296	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
    297
    298	vmx->nested.vmxon = false;
    299	vmx->nested.smm.vmxon = false;
    300	vmx->nested.vmxon_ptr = INVALID_GPA;
    301	free_vpid(vmx->nested.vpid02);
    302	vmx->nested.posted_intr_nv = -1;
    303	vmx->nested.current_vmptr = INVALID_GPA;
    304	if (enable_shadow_vmcs) {
    305		vmx_disable_shadow_vmcs(vmx);
    306		vmcs_clear(vmx->vmcs01.shadow_vmcs);
    307		free_vmcs(vmx->vmcs01.shadow_vmcs);
    308		vmx->vmcs01.shadow_vmcs = NULL;
    309	}
    310	kfree(vmx->nested.cached_vmcs12);
    311	vmx->nested.cached_vmcs12 = NULL;
    312	kfree(vmx->nested.cached_shadow_vmcs12);
    313	vmx->nested.cached_shadow_vmcs12 = NULL;
    314	/* Unpin physical memory we referred to in the vmcs02 */
    315	if (vmx->nested.apic_access_page) {
    316		kvm_release_page_clean(vmx->nested.apic_access_page);
    317		vmx->nested.apic_access_page = NULL;
    318	}
    319	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
    320	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
    321	vmx->nested.pi_desc = NULL;
    322
    323	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
    324
    325	nested_release_evmcs(vcpu);
    326
    327	free_loaded_vmcs(&vmx->nested.vmcs02);
    328}
    329
    330/*
    331 * Ensure that the current vmcs of the logical processor is the
    332 * vmcs01 of the vcpu before calling free_nested().
    333 */
    334void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
    335{
    336	vcpu_load(vcpu);
    337	vmx_leave_nested(vcpu);
    338	vcpu_put(vcpu);
    339}
    340
    341#define EPTP_PA_MASK   GENMASK_ULL(51, 12)
    342
    343static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
    344{
    345	return VALID_PAGE(root_hpa) &&
    346	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
    347}
    348
    349static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
    350				       gpa_t addr)
    351{
    352	uint i;
    353	struct kvm_mmu_root_info *cached_root;
    354
    355	WARN_ON_ONCE(!mmu_is_nested(vcpu));
    356
    357	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
    358		cached_root = &vcpu->arch.mmu->prev_roots[i];
    359
    360		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
    361					    eptp))
    362			vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
    363	}
    364}
    365
    366static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
    367		struct x86_exception *fault)
    368{
    369	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
    370	struct vcpu_vmx *vmx = to_vmx(vcpu);
    371	u32 vm_exit_reason;
    372	unsigned long exit_qualification = vcpu->arch.exit_qualification;
    373
    374	if (vmx->nested.pml_full) {
    375		vm_exit_reason = EXIT_REASON_PML_FULL;
    376		vmx->nested.pml_full = false;
    377		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
    378	} else {
    379		if (fault->error_code & PFERR_RSVD_MASK)
    380			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
    381		else
    382			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
    383
    384		/*
    385		 * Although the caller (kvm_inject_emulated_page_fault) would
    386		 * have already synced the faulting address in the shadow EPT
    387		 * tables for the current EPTP12, we also need to sync it for
    388		 * any other cached EPTP02s based on the same EP4TA, since the
    389		 * TLB associates mappings to the EP4TA rather than the full EPTP.
    390		 */
    391		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
    392					   fault->address);
    393	}
    394
    395	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
    396	vmcs12->guest_physical_address = fault->address;
    397}
    398
    399static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
    400{
    401	struct vcpu_vmx *vmx = to_vmx(vcpu);
    402	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
    403	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
    404
    405	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
    406				nested_ept_ad_enabled(vcpu),
    407				nested_ept_get_eptp(vcpu));
    408}
    409
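        /*
         * Switch the vCPU to the nested EPT MMU: arch.mmu (guest_mmu) shadows
         * the EPT tables referenced by vmcs12's EPT pointer for L2 GPA
         * translation, while walk_mmu (nested_mmu) is used to walk L2's own
         * page tables for GVA->GPA translation.
         */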
    410static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
    411{
    412	WARN_ON(mmu_is_nested(vcpu));
    413
    414	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
    415	nested_ept_new_eptp(vcpu);
    416	vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
    417	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
    418	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
    419
    420	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
    421}
    422
    423static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
    424{
    425	vcpu->arch.mmu = &vcpu->arch.root_mmu;
    426	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
    427}
    428
    429static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
    430					    u16 error_code)
    431{
    432	bool inequality, bit;
    433
    434	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
    435	inequality =
    436		(error_code & vmcs12->page_fault_error_code_mask) !=
    437		 vmcs12->page_fault_error_code_match;
    438	return inequality ^ bit;
    439}
    440
    441
    442/*
     443 * KVM wants to inject page faults that it received to the guest. For a nested
     444 * guest, this checks whether the fault is reflected to L1 (as a VM-exit) or injected into L2.
    445 */
    446static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
    447{
    448	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
    449	unsigned int nr = vcpu->arch.exception.nr;
    450	bool has_payload = vcpu->arch.exception.has_payload;
    451	unsigned long payload = vcpu->arch.exception.payload;
    452
    453	if (nr == PF_VECTOR) {
    454		if (vcpu->arch.exception.nested_apf) {
    455			*exit_qual = vcpu->arch.apf.nested_apf_token;
    456			return 1;
    457		}
    458		if (nested_vmx_is_page_fault_vmexit(vmcs12,
    459						    vcpu->arch.exception.error_code)) {
    460			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
    461			return 1;
    462		}
    463	} else if (vmcs12->exception_bitmap & (1u << nr)) {
    464		if (nr == DB_VECTOR) {
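        			/*
        			 * No payload was provided: derive the exit
        			 * qualification from the live DR6 value.  BT is not
        			 * reported, and XORing with DR6_ACTIVE_LOW strips
        			 * the fixed/active-low bits so that only the
        			 * triggered debug conditions remain.
        			 */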
    465			if (!has_payload) {
    466				payload = vcpu->arch.dr6;
    467				payload &= ~DR6_BT;
    468				payload ^= DR6_ACTIVE_LOW;
    469			}
    470			*exit_qual = payload;
    471		} else
    472			*exit_qual = 0;
    473		return 1;
    474	}
    475
    476	return 0;
    477}
    478
    479static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
    480						    struct x86_exception *fault)
    481{
    482	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
    483
    484	WARN_ON(!is_guest_mode(vcpu));
    485
    486	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
    487	    !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
    488		vmcs12->vm_exit_intr_error_code = fault->error_code;
    489		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
    490				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
    491				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
    492				  fault->address);
    493		return true;
    494	}
    495	return false;
    496}
    497
    498static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
    499					       struct vmcs12 *vmcs12)
    500{
    501	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
    502		return 0;
    503
    504	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
    505	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
    506		return -EINVAL;
    507
    508	return 0;
    509}
    510
    511static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
    512						struct vmcs12 *vmcs12)
    513{
    514	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
    515		return 0;
    516
    517	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
    518		return -EINVAL;
    519
    520	return 0;
    521}
    522
    523static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
    524						struct vmcs12 *vmcs12)
    525{
    526	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
    527		return 0;
    528
    529	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
    530		return -EINVAL;
    531
    532	return 0;
    533}
    534
    535/*
    536 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
    537 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
    538 * only the "disable intercept" case needs to be handled.
    539 */
    540static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
    541							unsigned long *msr_bitmap_l0,
    542							u32 msr, int type)
    543{
    544	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
    545		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
    546
    547	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
    548		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
    549}
    550
    551static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
    552{
    553	int msr;
    554
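        	/*
        	 * The read bitmap for low MSRs starts at offset 0 of the page and
        	 * the corresponding write bitmap at offset 0x800, so the second
        	 * index below sets the write-intercept bits for the same x2APIC
        	 * MSRs (0x800 - 0x8ff).
        	 */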
    555	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
    556		unsigned word = msr / BITS_PER_LONG;
    557
    558		msr_bitmap[word] = ~0;
    559		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
    560	}
    561}
    562
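        /*
         * Generate nested_vmx_set_msr_{read,write}_intercept().  An MSR access
         * is passed through to L2 only if both vmcs01 (L0's bitmap for L1) and
         * L1's vmcs12 bitmap leave it unintercepted; otherwise the bit is set
         * in the vmcs02 bitmap so the access is intercepted.
         */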
    563#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
    564static inline									\
    565void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
    566					 unsigned long *msr_bitmap_l1,		\
    567					 unsigned long *msr_bitmap_l0, u32 msr)	\
    568{										\
    569	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
    570	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
    571		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
    572	else									\
    573		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
    574}
    575BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
    576BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
    577
    578static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
    579						    unsigned long *msr_bitmap_l1,
    580						    unsigned long *msr_bitmap_l0,
    581						    u32 msr, int types)
    582{
    583	if (types & MSR_TYPE_R)
    584		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
    585						  msr_bitmap_l0, msr);
    586	if (types & MSR_TYPE_W)
    587		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
    588						   msr_bitmap_l0, msr);
    589}
    590
    591/*
     592 * Merge L0's and L1's MSR bitmaps; return false to indicate that we should
     593 * not use the hardware MSR bitmap (i.e. all of L2's MSR accesses are intercepted).
    594 */
    595static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
    596						 struct vmcs12 *vmcs12)
    597{
    598	struct vcpu_vmx *vmx = to_vmx(vcpu);
    599	int msr;
    600	unsigned long *msr_bitmap_l1;
    601	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
    602	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
    603	struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
    604
    605	/* Nothing to do if the MSR bitmap is not in use.  */
    606	if (!cpu_has_vmx_msr_bitmap() ||
    607	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
    608		return false;
    609
    610	/*
    611	 * MSR bitmap update can be skipped when:
    612	 * - MSR bitmap for L1 hasn't changed.
    613	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
    614	 *   before.
    615	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
    616	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
    617	 */
    618	if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
    619	    evmcs->hv_enlightenments_control.msr_bitmap &&
    620	    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
    621		return true;
    622
    623	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
    624		return false;
    625
    626	msr_bitmap_l1 = (unsigned long *)map->hva;
    627
    628	/*
    629	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
    630	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
    631	 * the x2APIC MSR range and selectively toggle those relevant to L2.
    632	 */
    633	enable_x2apic_msr_intercepts(msr_bitmap_l0);
    634
    635	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
    636		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
    637			/*
    638			 * L0 need not intercept reads for MSRs between 0x800
     639			 * and 0x8ff; it just lets the processor take the value
    640			 * from the virtual-APIC page; take those 256 bits
    641			 * directly from the L1 bitmap.
    642			 */
    643			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
    644				unsigned word = msr / BITS_PER_LONG;
    645
    646				msr_bitmap_l0[word] = msr_bitmap_l1[word];
    647			}
    648		}
    649
    650		nested_vmx_disable_intercept_for_x2apic_msr(
    651			msr_bitmap_l1, msr_bitmap_l0,
    652			X2APIC_MSR(APIC_TASKPRI),
    653			MSR_TYPE_R | MSR_TYPE_W);
    654
    655		if (nested_cpu_has_vid(vmcs12)) {
    656			nested_vmx_disable_intercept_for_x2apic_msr(
    657				msr_bitmap_l1, msr_bitmap_l0,
    658				X2APIC_MSR(APIC_EOI),
    659				MSR_TYPE_W);
    660			nested_vmx_disable_intercept_for_x2apic_msr(
    661				msr_bitmap_l1, msr_bitmap_l0,
    662				X2APIC_MSR(APIC_SELF_IPI),
    663				MSR_TYPE_W);
    664		}
    665	}
    666
    667	/*
    668	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
    669	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
    670	 */
    671#ifdef CONFIG_X86_64
    672	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
    673					 MSR_FS_BASE, MSR_TYPE_RW);
    674
    675	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
    676					 MSR_GS_BASE, MSR_TYPE_RW);
    677
    678	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
    679					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
    680#endif
    681	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
    682					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
    683
    684	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
    685					 MSR_IA32_PRED_CMD, MSR_TYPE_W);
    686
    687	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
    688
    689	vmx->nested.force_msr_bitmap_recalc = false;
    690
    691	return true;
    692}
    693
    694static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
    695				       struct vmcs12 *vmcs12)
    696{
    697	struct vcpu_vmx *vmx = to_vmx(vcpu);
    698	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
    699
    700	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
    701	    vmcs12->vmcs_link_pointer == INVALID_GPA)
    702		return;
    703
    704	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
    705	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
    706				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
    707		return;
    708
    709	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
    710			      VMCS12_SIZE);
    711}
    712
    713static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
    714					      struct vmcs12 *vmcs12)
    715{
    716	struct vcpu_vmx *vmx = to_vmx(vcpu);
    717	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
    718
    719	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
    720	    vmcs12->vmcs_link_pointer == INVALID_GPA)
    721		return;
    722
    723	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
    724	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
    725				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
    726		return;
    727
    728	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
    729			       VMCS12_SIZE);
    730}
    731
    732/*
    733 * In nested virtualization, check if L1 has set
    734 * VM_EXIT_ACK_INTR_ON_EXIT
    735 */
    736static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
    737{
    738	return get_vmcs12(vcpu)->vm_exit_controls &
    739		VM_EXIT_ACK_INTR_ON_EXIT;
    740}
    741
    742static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
    743					  struct vmcs12 *vmcs12)
    744{
    745	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
    746	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
    747		return -EINVAL;
    748	else
    749		return 0;
    750}
    751
    752static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
    753					   struct vmcs12 *vmcs12)
    754{
    755	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
    756	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
    757	    !nested_cpu_has_vid(vmcs12) &&
    758	    !nested_cpu_has_posted_intr(vmcs12))
    759		return 0;
    760
    761	/*
    762	 * If virtualize x2apic mode is enabled,
    763	 * virtualize apic access must be disabled.
    764	 */
    765	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
    766	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
    767		return -EINVAL;
    768
    769	/*
    770	 * If virtual interrupt delivery is enabled,
    771	 * we must exit on external interrupts.
    772	 */
    773	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
    774		return -EINVAL;
    775
    776	/*
    777	 * bits 15:8 should be zero in posted_intr_nv,
    778	 * the descriptor address has been already checked
    779	 * in nested_get_vmcs12_pages.
    780	 *
    781	 * bits 5:0 of posted_intr_desc_addr should be zero.
    782	 */
    783	if (nested_cpu_has_posted_intr(vmcs12) &&
    784	   (CC(!nested_cpu_has_vid(vmcs12)) ||
    785	    CC(!nested_exit_intr_ack_set(vcpu)) ||
    786	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
    787	    CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
    788		return -EINVAL;
    789
    790	/* tpr shadow is needed by all apicv features. */
    791	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
    792		return -EINVAL;
    793
    794	return 0;
    795}
    796
    797static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
    798				       u32 count, u64 addr)
    799{
    800	if (count == 0)
    801		return 0;
    802
    803	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
    804	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
    805		return -EINVAL;
    806
    807	return 0;
    808}
    809
    810static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
    811						     struct vmcs12 *vmcs12)
    812{
    813	if (CC(nested_vmx_check_msr_switch(vcpu,
    814					   vmcs12->vm_exit_msr_load_count,
    815					   vmcs12->vm_exit_msr_load_addr)) ||
    816	    CC(nested_vmx_check_msr_switch(vcpu,
    817					   vmcs12->vm_exit_msr_store_count,
    818					   vmcs12->vm_exit_msr_store_addr)))
    819		return -EINVAL;
    820
    821	return 0;
    822}
    823
    824static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
    825                                                      struct vmcs12 *vmcs12)
    826{
    827	if (CC(nested_vmx_check_msr_switch(vcpu,
    828					   vmcs12->vm_entry_msr_load_count,
    829					   vmcs12->vm_entry_msr_load_addr)))
    830                return -EINVAL;
    831
    832	return 0;
    833}
    834
    835static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
    836					 struct vmcs12 *vmcs12)
    837{
    838	if (!nested_cpu_has_pml(vmcs12))
    839		return 0;
    840
    841	if (CC(!nested_cpu_has_ept(vmcs12)) ||
    842	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
    843		return -EINVAL;
    844
    845	return 0;
    846}
    847
    848static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
    849							struct vmcs12 *vmcs12)
    850{
    851	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
    852	       !nested_cpu_has_ept(vmcs12)))
    853		return -EINVAL;
    854	return 0;
    855}
    856
    857static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
    858							 struct vmcs12 *vmcs12)
    859{
    860	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
    861	       !nested_cpu_has_ept(vmcs12)))
    862		return -EINVAL;
    863	return 0;
    864}
    865
    866static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
    867						 struct vmcs12 *vmcs12)
    868{
    869	if (!nested_cpu_has_shadow_vmcs(vmcs12))
    870		return 0;
    871
    872	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
    873	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
    874		return -EINVAL;
    875
    876	return 0;
    877}
    878
    879static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
    880				       struct vmx_msr_entry *e)
    881{
    882	/* x2APIC MSR accesses are not allowed */
    883	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
    884		return -EINVAL;
    885	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
    886	    CC(e->index == MSR_IA32_UCODE_REV))
    887		return -EINVAL;
    888	if (CC(e->reserved != 0))
    889		return -EINVAL;
    890	return 0;
    891}
    892
    893static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
    894				     struct vmx_msr_entry *e)
    895{
    896	if (CC(e->index == MSR_FS_BASE) ||
    897	    CC(e->index == MSR_GS_BASE) ||
    898	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
    899	    nested_vmx_msr_check_common(vcpu, e))
    900		return -EINVAL;
    901	return 0;
    902}
    903
    904static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
    905				      struct vmx_msr_entry *e)
    906{
    907	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
    908	    nested_vmx_msr_check_common(vcpu, e))
    909		return -EINVAL;
    910	return 0;
    911}
    912
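        /*
         * VMX_MISC advertises the recommended maximum number of entries in a
         * VM-entry/VM-exit MSR list as a multiple of
         * VMX_MISC_MSR_LIST_MULTIPLIER; KVM uses that value as a hard cap when
         * emulating the atomic MSR switch for L1.
         */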
    913static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
    914{
    915	struct vcpu_vmx *vmx = to_vmx(vcpu);
    916	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
    917				       vmx->nested.msrs.misc_high);
    918
    919	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
    920}
    921
    922/*
     923 * Load guest's/host's MSRs at nested entry/exit.
     924 * Return 0 on success, or the 1-based index of the failing entry on failure.
    925 *
    926 * One of the failure modes for MSR load/store is when a list exceeds the
    927 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
    928 * as possible, process all valid entries before failing rather than precheck
    929 * for a capacity violation.
    930 */
    931static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
    932{
    933	u32 i;
    934	struct vmx_msr_entry e;
    935	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
    936
    937	for (i = 0; i < count; i++) {
    938		if (unlikely(i >= max_msr_list_size))
    939			goto fail;
    940
    941		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
    942					&e, sizeof(e))) {
    943			pr_debug_ratelimited(
    944				"%s cannot read MSR entry (%u, 0x%08llx)\n",
    945				__func__, i, gpa + i * sizeof(e));
    946			goto fail;
    947		}
    948		if (nested_vmx_load_msr_check(vcpu, &e)) {
    949			pr_debug_ratelimited(
    950				"%s check failed (%u, 0x%x, 0x%x)\n",
    951				__func__, i, e.index, e.reserved);
    952			goto fail;
    953		}
    954		if (kvm_set_msr(vcpu, e.index, e.value)) {
    955			pr_debug_ratelimited(
    956				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
    957				__func__, i, e.index, e.value);
    958			goto fail;
    959		}
    960	}
    961	return 0;
    962fail:
    963	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
    964	return i + 1;
    965}
    966
    967static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
    968					    u32 msr_index,
    969					    u64 *data)
    970{
    971	struct vcpu_vmx *vmx = to_vmx(vcpu);
    972
    973	/*
    974	 * If the L0 hypervisor stored a more accurate value for the TSC that
    975	 * does not include the time taken for emulation of the L2->L1
    976	 * VM-exit in L0, use the more accurate value.
    977	 */
    978	if (msr_index == MSR_IA32_TSC) {
    979		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
    980						    MSR_IA32_TSC);
    981
    982		if (i >= 0) {
    983			u64 val = vmx->msr_autostore.guest.val[i].value;
    984
    985			*data = kvm_read_l1_tsc(vcpu, val);
    986			return true;
    987		}
    988	}
    989
    990	if (kvm_get_msr(vcpu, msr_index, data)) {
    991		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
    992			msr_index);
    993		return false;
    994	}
    995	return true;
    996}
    997
    998static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
    999				     struct vmx_msr_entry *e)
   1000{
   1001	if (kvm_vcpu_read_guest(vcpu,
   1002				gpa + i * sizeof(*e),
   1003				e, 2 * sizeof(u32))) {
   1004		pr_debug_ratelimited(
   1005			"%s cannot read MSR entry (%u, 0x%08llx)\n",
   1006			__func__, i, gpa + i * sizeof(*e));
   1007		return false;
   1008	}
   1009	if (nested_vmx_store_msr_check(vcpu, e)) {
   1010		pr_debug_ratelimited(
   1011			"%s check failed (%u, 0x%x, 0x%x)\n",
   1012			__func__, i, e->index, e->reserved);
   1013		return false;
   1014	}
   1015	return true;
   1016}
   1017
   1018static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
   1019{
   1020	u64 data;
   1021	u32 i;
   1022	struct vmx_msr_entry e;
   1023	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
   1024
   1025	for (i = 0; i < count; i++) {
   1026		if (unlikely(i >= max_msr_list_size))
   1027			return -EINVAL;
   1028
   1029		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
   1030			return -EINVAL;
   1031
   1032		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
   1033			return -EINVAL;
   1034
   1035		if (kvm_vcpu_write_guest(vcpu,
   1036					 gpa + i * sizeof(e) +
   1037					     offsetof(struct vmx_msr_entry, value),
   1038					 &data, sizeof(data))) {
   1039			pr_debug_ratelimited(
   1040				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
   1041				__func__, i, e.index, data);
   1042			return -EINVAL;
   1043		}
   1044	}
   1045	return 0;
   1046}
   1047
   1048static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
   1049{
   1050	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   1051	u32 count = vmcs12->vm_exit_msr_store_count;
   1052	u64 gpa = vmcs12->vm_exit_msr_store_addr;
   1053	struct vmx_msr_entry e;
   1054	u32 i;
   1055
   1056	for (i = 0; i < count; i++) {
   1057		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
   1058			return false;
   1059
   1060		if (e.index == msr_index)
   1061			return true;
   1062	}
   1063	return false;
   1064}
   1065
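        /*
         * Keep the vmcs02 VM-exit MSR-store area ("autostore") in sync with
         * vmcs12: add @msr_index if it is in vmcs12's VM-exit MSR-store list
         * but not yet in the autostore list, and remove it in the opposite
         * case.
         */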
   1066static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
   1067					   u32 msr_index)
   1068{
   1069	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1070	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
   1071	bool in_vmcs12_store_list;
   1072	int msr_autostore_slot;
   1073	bool in_autostore_list;
   1074	int last;
   1075
   1076	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
   1077	in_autostore_list = msr_autostore_slot >= 0;
   1078	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
   1079
   1080	if (in_vmcs12_store_list && !in_autostore_list) {
   1081		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
   1082			/*
   1083			 * Emulated VMEntry does not fail here.  Instead a less
   1084			 * accurate value will be returned by
   1085			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
   1086			 * instead of reading the value from the vmcs02 VMExit
   1087			 * MSR-store area.
   1088			 */
   1089			pr_warn_ratelimited(
   1090				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
   1091				msr_index);
   1092			return;
   1093		}
   1094		last = autostore->nr++;
   1095		autostore->val[last].index = msr_index;
   1096	} else if (!in_vmcs12_store_list && in_autostore_list) {
   1097		last = --autostore->nr;
   1098		autostore->val[msr_autostore_slot] = autostore->val[last];
   1099	}
   1100}
   1101
   1102/*
   1103 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
   1104 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
   1105 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
   1106 * @entry_failure_code.
   1107 */
   1108static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
   1109			       bool nested_ept, bool reload_pdptrs,
   1110			       enum vm_entry_failure_code *entry_failure_code)
   1111{
   1112	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
   1113		*entry_failure_code = ENTRY_FAIL_DEFAULT;
   1114		return -EINVAL;
   1115	}
   1116
   1117	/*
   1118	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
   1119	 * must not be dereferenced.
   1120	 */
   1121	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
   1122	    CC(!load_pdptrs(vcpu, cr3))) {
   1123		*entry_failure_code = ENTRY_FAIL_PDPTE;
   1124		return -EINVAL;
   1125	}
   1126
   1127	vcpu->arch.cr3 = cr3;
   1128	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
   1129
   1130	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
   1131	kvm_init_mmu(vcpu);
   1132
   1133	if (!nested_ept)
   1134		kvm_mmu_new_pgd(vcpu, cr3);
   1135
   1136	return 0;
   1137}
   1138
   1139/*
    1140 * Returns true if KVM is able to configure the CPU to tag TLB entries
    1141 * populated by L2 differently from TLB entries populated
    1142 * by L1.
   1143 *
   1144 * If L0 uses EPT, L1 and L2 run with different EPTP because
   1145 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
   1146 * are tagged with different EPTP.
   1147 *
   1148 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
   1149 * with different VPID (L1 entries are tagged with vmx->vpid
   1150 * while L2 entries are tagged with vmx->nested.vpid02).
   1151 */
   1152static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
   1153{
   1154	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   1155
   1156	return enable_ept ||
   1157	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
   1158}
   1159
   1160static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
   1161					    struct vmcs12 *vmcs12,
   1162					    bool is_vmenter)
   1163{
   1164	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1165
   1166	/*
   1167	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
   1168	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
   1169	 * full TLB flush from the guest's perspective.  This is required even
   1170	 * if VPID is disabled in the host as KVM may need to synchronize the
   1171	 * MMU in response to the guest TLB flush.
   1172	 *
   1173	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
   1174	 * EPT is a special snowflake, as guest-physical mappings aren't
   1175	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
   1176	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
   1177	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
   1178	 * those mappings.
   1179	 */
   1180	if (!nested_cpu_has_vpid(vmcs12)) {
   1181		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
   1182		return;
   1183	}
   1184
   1185	/* L2 should never have a VPID if VPID is disabled. */
   1186	WARN_ON(!enable_vpid);
   1187
   1188	/*
   1189	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
   1190	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
   1191	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
   1192	 * that the new vpid12 has never been used and thus represents a new
   1193	 * guest ASID that cannot have entries in the TLB.
   1194	 */
   1195	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
   1196		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
   1197		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
   1198		return;
   1199	}
   1200
    1201	 * If VPID is enabled and in use by vmcs12, and vpid12 is not changing but
   1202	 * If VPID is enabled, used by vmc12, and vpid12 is not changing but
   1203	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
   1204	 * KVM was unable to allocate a VPID for L2, flush the current context
   1205	 * as the effective ASID is common to both L1 and L2.
   1206	 */
   1207	if (!nested_has_guest_tlb_tag(vcpu))
   1208		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
   1209}
   1210
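        /* Return true if every bit of @subset covered by @mask is also set in @superset. */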
   1211static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
   1212{
   1213	superset &= mask;
   1214	subset &= mask;
   1215
   1216	return (superset | subset) == superset;
   1217}
   1218
   1219static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
   1220{
   1221	const u64 feature_and_reserved =
   1222		/* feature (except bit 48; see below) */
   1223		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
   1224		/* reserved */
   1225		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
   1226	u64 vmx_basic = vmx->nested.msrs.basic;
   1227
   1228	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
   1229		return -EINVAL;
   1230
   1231	/*
   1232	 * KVM does not emulate a version of VMX that constrains physical
   1233	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
   1234	 */
   1235	if (data & BIT_ULL(48))
   1236		return -EINVAL;
   1237
   1238	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
   1239	    vmx_basic_vmcs_revision_id(data))
   1240		return -EINVAL;
   1241
   1242	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
   1243		return -EINVAL;
   1244
   1245	vmx->nested.msrs.basic = data;
   1246	return 0;
   1247}
   1248
   1249static int
   1250vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
   1251{
   1252	u64 supported;
   1253	u32 *lowp, *highp;
   1254
   1255	switch (msr_index) {
   1256	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
   1257		lowp = &vmx->nested.msrs.pinbased_ctls_low;
   1258		highp = &vmx->nested.msrs.pinbased_ctls_high;
   1259		break;
   1260	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
   1261		lowp = &vmx->nested.msrs.procbased_ctls_low;
   1262		highp = &vmx->nested.msrs.procbased_ctls_high;
   1263		break;
   1264	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
   1265		lowp = &vmx->nested.msrs.exit_ctls_low;
   1266		highp = &vmx->nested.msrs.exit_ctls_high;
   1267		break;
   1268	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
   1269		lowp = &vmx->nested.msrs.entry_ctls_low;
   1270		highp = &vmx->nested.msrs.entry_ctls_high;
   1271		break;
   1272	case MSR_IA32_VMX_PROCBASED_CTLS2:
   1273		lowp = &vmx->nested.msrs.secondary_ctls_low;
   1274		highp = &vmx->nested.msrs.secondary_ctls_high;
   1275		break;
   1276	default:
   1277		BUG();
   1278	}
   1279
   1280	supported = vmx_control_msr(*lowp, *highp);
   1281
   1282	/* Check must-be-1 bits are still 1. */
   1283	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
   1284		return -EINVAL;
   1285
   1286	/* Check must-be-0 bits are still 0. */
   1287	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
   1288		return -EINVAL;
   1289
   1290	*lowp = data;
   1291	*highp = data >> 32;
   1292	return 0;
   1293}
   1294
   1295static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
   1296{
   1297	const u64 feature_and_reserved_bits =
   1298		/* feature */
   1299		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
   1300		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
   1301		/* reserved */
   1302		GENMASK_ULL(13, 9) | BIT_ULL(31);
   1303	u64 vmx_misc;
   1304
   1305	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
   1306				   vmx->nested.msrs.misc_high);
   1307
   1308	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
   1309		return -EINVAL;
   1310
   1311	if ((vmx->nested.msrs.pinbased_ctls_high &
   1312	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
   1313	    vmx_misc_preemption_timer_rate(data) !=
   1314	    vmx_misc_preemption_timer_rate(vmx_misc))
   1315		return -EINVAL;
   1316
   1317	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
   1318		return -EINVAL;
   1319
   1320	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
   1321		return -EINVAL;
   1322
   1323	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
   1324		return -EINVAL;
   1325
   1326	vmx->nested.msrs.misc_low = data;
   1327	vmx->nested.msrs.misc_high = data >> 32;
   1328
   1329	return 0;
   1330}
   1331
   1332static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
   1333{
   1334	u64 vmx_ept_vpid_cap;
   1335
   1336	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
   1337					   vmx->nested.msrs.vpid_caps);
   1338
   1339	/* Every bit is either reserved or a feature bit. */
   1340	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
   1341		return -EINVAL;
   1342
   1343	vmx->nested.msrs.ept_caps = data;
   1344	vmx->nested.msrs.vpid_caps = data >> 32;
   1345	return 0;
   1346}
   1347
   1348static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
   1349{
   1350	u64 *msr;
   1351
   1352	switch (msr_index) {
   1353	case MSR_IA32_VMX_CR0_FIXED0:
   1354		msr = &vmx->nested.msrs.cr0_fixed0;
   1355		break;
   1356	case MSR_IA32_VMX_CR4_FIXED0:
   1357		msr = &vmx->nested.msrs.cr4_fixed0;
   1358		break;
   1359	default:
   1360		BUG();
   1361	}
   1362
   1363	/*
   1364	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
   1365	 * must be 1 in the restored value.
   1366	 */
   1367	if (!is_bitwise_subset(data, *msr, -1ULL))
   1368		return -EINVAL;
   1369
   1370	*msr = data;
   1371	return 0;
   1372}
   1373
   1374/*
   1375 * Called when userspace is restoring VMX MSRs.
   1376 *
   1377 * Returns 0 on success, non-0 otherwise.
   1378 */
   1379int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
   1380{
   1381	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1382
   1383	/*
   1384	 * Don't allow changes to the VMX capability MSRs while the vCPU
   1385	 * is in VMX operation.
   1386	 */
   1387	if (vmx->nested.vmxon)
   1388		return -EBUSY;
   1389
   1390	switch (msr_index) {
   1391	case MSR_IA32_VMX_BASIC:
   1392		return vmx_restore_vmx_basic(vmx, data);
   1393	case MSR_IA32_VMX_PINBASED_CTLS:
   1394	case MSR_IA32_VMX_PROCBASED_CTLS:
   1395	case MSR_IA32_VMX_EXIT_CTLS:
   1396	case MSR_IA32_VMX_ENTRY_CTLS:
   1397		/*
   1398		 * The "non-true" VMX capability MSRs are generated from the
   1399		 * "true" MSRs, so we do not support restoring them directly.
   1400		 *
   1401		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
   1402		 * should restore the "true" MSRs with the must-be-1 bits
   1403		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
   1404		 * DEFAULT SETTINGS".
   1405		 */
   1406		return -EINVAL;
   1407	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
   1408	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
   1409	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
   1410	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
   1411	case MSR_IA32_VMX_PROCBASED_CTLS2:
   1412		return vmx_restore_control_msr(vmx, msr_index, data);
   1413	case MSR_IA32_VMX_MISC:
   1414		return vmx_restore_vmx_misc(vmx, data);
   1415	case MSR_IA32_VMX_CR0_FIXED0:
   1416	case MSR_IA32_VMX_CR4_FIXED0:
   1417		return vmx_restore_fixed0_msr(vmx, msr_index, data);
   1418	case MSR_IA32_VMX_CR0_FIXED1:
   1419	case MSR_IA32_VMX_CR4_FIXED1:
   1420		/*
   1421		 * These MSRs are generated based on the vCPU's CPUID, so we
   1422		 * do not support restoring them directly.
   1423		 */
   1424		return -EINVAL;
   1425	case MSR_IA32_VMX_EPT_VPID_CAP:
   1426		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
   1427	case MSR_IA32_VMX_VMCS_ENUM:
   1428		vmx->nested.msrs.vmcs_enum = data;
   1429		return 0;
   1430	case MSR_IA32_VMX_VMFUNC:
   1431		if (data & ~vmx->nested.msrs.vmfunc_controls)
   1432			return -EINVAL;
   1433		vmx->nested.msrs.vmfunc_controls = data;
   1434		return 0;
   1435	default:
   1436		/*
   1437		 * The rest of the VMX capability MSRs do not support restore.
   1438		 */
   1439		return -EINVAL;
   1440	}
   1441}
   1442
   1443/* Returns 0 on success, non-0 otherwise. */
   1444int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
   1445{
   1446	switch (msr_index) {
   1447	case MSR_IA32_VMX_BASIC:
   1448		*pdata = msrs->basic;
   1449		break;
   1450	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
   1451	case MSR_IA32_VMX_PINBASED_CTLS:
   1452		*pdata = vmx_control_msr(
   1453			msrs->pinbased_ctls_low,
   1454			msrs->pinbased_ctls_high);
   1455		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
   1456			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
   1457		break;
   1458	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
   1459	case MSR_IA32_VMX_PROCBASED_CTLS:
   1460		*pdata = vmx_control_msr(
   1461			msrs->procbased_ctls_low,
   1462			msrs->procbased_ctls_high);
   1463		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
   1464			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
   1465		break;
   1466	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
   1467	case MSR_IA32_VMX_EXIT_CTLS:
   1468		*pdata = vmx_control_msr(
   1469			msrs->exit_ctls_low,
   1470			msrs->exit_ctls_high);
   1471		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
   1472			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
   1473		break;
   1474	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
   1475	case MSR_IA32_VMX_ENTRY_CTLS:
   1476		*pdata = vmx_control_msr(
   1477			msrs->entry_ctls_low,
   1478			msrs->entry_ctls_high);
   1479		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
   1480			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
   1481		break;
   1482	case MSR_IA32_VMX_MISC:
   1483		*pdata = vmx_control_msr(
   1484			msrs->misc_low,
   1485			msrs->misc_high);
   1486		break;
   1487	case MSR_IA32_VMX_CR0_FIXED0:
   1488		*pdata = msrs->cr0_fixed0;
   1489		break;
   1490	case MSR_IA32_VMX_CR0_FIXED1:
   1491		*pdata = msrs->cr0_fixed1;
   1492		break;
   1493	case MSR_IA32_VMX_CR4_FIXED0:
   1494		*pdata = msrs->cr4_fixed0;
   1495		break;
   1496	case MSR_IA32_VMX_CR4_FIXED1:
   1497		*pdata = msrs->cr4_fixed1;
   1498		break;
   1499	case MSR_IA32_VMX_VMCS_ENUM:
   1500		*pdata = msrs->vmcs_enum;
   1501		break;
   1502	case MSR_IA32_VMX_PROCBASED_CTLS2:
   1503		*pdata = vmx_control_msr(
   1504			msrs->secondary_ctls_low,
   1505			msrs->secondary_ctls_high);
   1506		break;
   1507	case MSR_IA32_VMX_EPT_VPID_CAP:
   1508		*pdata = msrs->ept_caps |
   1509			((u64)msrs->vpid_caps << 32);
   1510		break;
   1511	case MSR_IA32_VMX_VMFUNC:
   1512		*pdata = msrs->vmfunc_controls;
   1513		break;
   1514	default:
   1515		return 1;
   1516	}
   1517
   1518	return 0;
   1519}
   1520
   1521/*
   1522 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
   1523 * been modified by the L1 guest.  Note, "writable" in this context means
   1524 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
   1525 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
   1526 * VM-exit information fields (which are actually writable if the vCPU is
   1527 * configured to support "VMWRITE to any supported field in the VMCS").
   1528 */
   1529static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
   1530{
   1531	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
   1532	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
   1533	struct shadow_vmcs_field field;
   1534	unsigned long val;
   1535	int i;
   1536
   1537	if (WARN_ON(!shadow_vmcs))
   1538		return;
   1539
   1540	preempt_disable();
   1541
   1542	vmcs_load(shadow_vmcs);
   1543
   1544	for (i = 0; i < max_shadow_read_write_fields; i++) {
   1545		field = shadow_read_write_fields[i];
   1546		val = __vmcs_readl(field.encoding);
   1547		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
   1548	}
   1549
   1550	vmcs_clear(shadow_vmcs);
   1551	vmcs_load(vmx->loaded_vmcs->vmcs);
   1552
   1553	preempt_enable();
   1554}
   1555
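        /*
         * Mirror the cached vmcs12 into the shadow VMCS (both the read/write
         * and read-only field tables) so that L1's VMREADs of shadowed fields
         * return up-to-date values without a VM-exit.
         */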
   1556static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
   1557{
   1558	const struct shadow_vmcs_field *fields[] = {
   1559		shadow_read_write_fields,
   1560		shadow_read_only_fields
   1561	};
   1562	const int max_fields[] = {
   1563		max_shadow_read_write_fields,
   1564		max_shadow_read_only_fields
   1565	};
   1566	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
   1567	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
   1568	struct shadow_vmcs_field field;
   1569	unsigned long val;
   1570	int i, q;
   1571
   1572	if (WARN_ON(!shadow_vmcs))
   1573		return;
   1574
   1575	vmcs_load(shadow_vmcs);
   1576
   1577	for (q = 0; q < ARRAY_SIZE(fields); q++) {
   1578		for (i = 0; i < max_fields[q]; i++) {
   1579			field = fields[q][i];
   1580			val = vmcs12_read_any(vmcs12, field.encoding,
   1581					      field.offset);
   1582			__vmcs_writel(field.encoding, val);
   1583		}
   1584	}
   1585
   1586	vmcs_clear(shadow_vmcs);
   1587	vmcs_load(vmx->loaded_vmcs->vmcs);
   1588}
   1589
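        /*
         * Copy fields from the enlightened VMCS into the cached vmcs12.  A set
         * bit in @hv_clean_fields means L1 has not touched the corresponding
         * group of eVMCS fields since the last sync, so that group can be
         * skipped.
         */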
   1590static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
   1591{
   1592	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
   1593	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
   1594
   1595	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
   1596	vmcs12->tpr_threshold = evmcs->tpr_threshold;
   1597	vmcs12->guest_rip = evmcs->guest_rip;
   1598
   1599	if (unlikely(!(hv_clean_fields &
   1600		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
   1601		vmcs12->guest_rsp = evmcs->guest_rsp;
   1602		vmcs12->guest_rflags = evmcs->guest_rflags;
   1603		vmcs12->guest_interruptibility_info =
   1604			evmcs->guest_interruptibility_info;
   1605	}
   1606
   1607	if (unlikely(!(hv_clean_fields &
   1608		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
   1609		vmcs12->cpu_based_vm_exec_control =
   1610			evmcs->cpu_based_vm_exec_control;
   1611	}
   1612
   1613	if (unlikely(!(hv_clean_fields &
   1614		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
   1615		vmcs12->exception_bitmap = evmcs->exception_bitmap;
   1616	}
   1617
   1618	if (unlikely(!(hv_clean_fields &
   1619		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
   1620		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
   1621	}
   1622
   1623	if (unlikely(!(hv_clean_fields &
   1624		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
   1625		vmcs12->vm_entry_intr_info_field =
   1626			evmcs->vm_entry_intr_info_field;
   1627		vmcs12->vm_entry_exception_error_code =
   1628			evmcs->vm_entry_exception_error_code;
   1629		vmcs12->vm_entry_instruction_len =
   1630			evmcs->vm_entry_instruction_len;
   1631	}
   1632
   1633	if (unlikely(!(hv_clean_fields &
   1634		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
   1635		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
   1636		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
   1637		vmcs12->host_cr0 = evmcs->host_cr0;
   1638		vmcs12->host_cr3 = evmcs->host_cr3;
   1639		vmcs12->host_cr4 = evmcs->host_cr4;
   1640		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
   1641		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
   1642		vmcs12->host_rip = evmcs->host_rip;
   1643		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
   1644		vmcs12->host_es_selector = evmcs->host_es_selector;
   1645		vmcs12->host_cs_selector = evmcs->host_cs_selector;
   1646		vmcs12->host_ss_selector = evmcs->host_ss_selector;
   1647		vmcs12->host_ds_selector = evmcs->host_ds_selector;
   1648		vmcs12->host_fs_selector = evmcs->host_fs_selector;
   1649		vmcs12->host_gs_selector = evmcs->host_gs_selector;
   1650		vmcs12->host_tr_selector = evmcs->host_tr_selector;
   1651	}
   1652
   1653	if (unlikely(!(hv_clean_fields &
   1654		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
   1655		vmcs12->pin_based_vm_exec_control =
   1656			evmcs->pin_based_vm_exec_control;
   1657		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
   1658		vmcs12->secondary_vm_exec_control =
   1659			evmcs->secondary_vm_exec_control;
   1660	}
   1661
   1662	if (unlikely(!(hv_clean_fields &
   1663		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
   1664		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
   1665		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
   1666	}
   1667
   1668	if (unlikely(!(hv_clean_fields &
   1669		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
   1670		vmcs12->msr_bitmap = evmcs->msr_bitmap;
   1671	}
   1672
   1673	if (unlikely(!(hv_clean_fields &
   1674		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
   1675		vmcs12->guest_es_base = evmcs->guest_es_base;
   1676		vmcs12->guest_cs_base = evmcs->guest_cs_base;
   1677		vmcs12->guest_ss_base = evmcs->guest_ss_base;
   1678		vmcs12->guest_ds_base = evmcs->guest_ds_base;
   1679		vmcs12->guest_fs_base = evmcs->guest_fs_base;
   1680		vmcs12->guest_gs_base = evmcs->guest_gs_base;
   1681		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
   1682		vmcs12->guest_tr_base = evmcs->guest_tr_base;
   1683		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
   1684		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
   1685		vmcs12->guest_es_limit = evmcs->guest_es_limit;
   1686		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
   1687		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
   1688		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
   1689		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
   1690		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
   1691		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
   1692		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
   1693		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
   1694		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
   1695		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
   1696		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
   1697		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
   1698		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
   1699		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
   1700		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
   1701		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
   1702		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
   1703		vmcs12->guest_es_selector = evmcs->guest_es_selector;
   1704		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
   1705		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
   1706		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
   1707		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
   1708		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
   1709		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
   1710		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
   1711	}
   1712
   1713	if (unlikely(!(hv_clean_fields &
   1714		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
   1715		vmcs12->tsc_offset = evmcs->tsc_offset;
   1716		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
   1717		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
   1718	}
   1719
   1720	if (unlikely(!(hv_clean_fields &
   1721		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
   1722		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
   1723		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
   1724		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
   1725		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
   1726		vmcs12->guest_cr0 = evmcs->guest_cr0;
   1727		vmcs12->guest_cr3 = evmcs->guest_cr3;
   1728		vmcs12->guest_cr4 = evmcs->guest_cr4;
   1729		vmcs12->guest_dr7 = evmcs->guest_dr7;
   1730	}
   1731
   1732	if (unlikely(!(hv_clean_fields &
   1733		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
   1734		vmcs12->host_fs_base = evmcs->host_fs_base;
   1735		vmcs12->host_gs_base = evmcs->host_gs_base;
   1736		vmcs12->host_tr_base = evmcs->host_tr_base;
   1737		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
   1738		vmcs12->host_idtr_base = evmcs->host_idtr_base;
   1739		vmcs12->host_rsp = evmcs->host_rsp;
   1740	}
   1741
   1742	if (unlikely(!(hv_clean_fields &
   1743		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
   1744		vmcs12->ept_pointer = evmcs->ept_pointer;
   1745		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
   1746	}
   1747
   1748	if (unlikely(!(hv_clean_fields &
   1749		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
   1750		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
   1751		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
   1752		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
   1753		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
   1754		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
   1755		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
   1756		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
   1757		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
   1758		vmcs12->guest_pending_dbg_exceptions =
   1759			evmcs->guest_pending_dbg_exceptions;
   1760		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
   1761		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
   1762		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
   1763		vmcs12->guest_activity_state = evmcs->guest_activity_state;
   1764		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
   1765	}
   1766
   1767	/*
   1768	 * Not used?
   1769	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
   1770	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
   1771	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
   1772	 * vmcs12->page_fault_error_code_mask =
   1773	 *		evmcs->page_fault_error_code_mask;
   1774	 * vmcs12->page_fault_error_code_match =
   1775	 *		evmcs->page_fault_error_code_match;
   1776	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
   1777	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
   1778	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
   1779	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
   1780	 */
   1781
   1782	/*
   1783	 * Read only fields:
   1784	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
   1785	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
   1786	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
   1787	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
   1788	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
   1789	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
   1790	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
   1791	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
   1792	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
   1793	 * vmcs12->exit_qualification = evmcs->exit_qualification;
   1794	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
   1795	 *
   1796	 * Not present in struct vmcs12:
   1797	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
   1798	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
   1799	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
   1800	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
   1801	 */
   1802
   1803	return;
   1804}
   1805
   1806static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
   1807{
   1808	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
   1809	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
   1810
   1811	/*
   1812	 * Should not be changed by KVM:
   1813	 *
   1814	 * evmcs->host_es_selector = vmcs12->host_es_selector;
   1815	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
   1816	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
   1817	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
   1818	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
   1819	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
   1820	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
   1821	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
   1822	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
   1823	 * evmcs->host_cr0 = vmcs12->host_cr0;
   1824	 * evmcs->host_cr3 = vmcs12->host_cr3;
   1825	 * evmcs->host_cr4 = vmcs12->host_cr4;
   1826	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
   1827	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
   1828	 * evmcs->host_rip = vmcs12->host_rip;
   1829	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
   1830	 * evmcs->host_fs_base = vmcs12->host_fs_base;
   1831	 * evmcs->host_gs_base = vmcs12->host_gs_base;
   1832	 * evmcs->host_tr_base = vmcs12->host_tr_base;
   1833	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
   1834	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
   1835	 * evmcs->host_rsp = vmcs12->host_rsp;
   1836	 * sync_vmcs02_to_vmcs12() doesn't read these:
   1837	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
   1838	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
   1839	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
   1840	 * evmcs->ept_pointer = vmcs12->ept_pointer;
   1841	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
   1842	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
   1843	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
   1844	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
   1845	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
   1846	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
   1847	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
   1848	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
   1849	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
   1850	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
   1851	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
   1852	 * evmcs->page_fault_error_code_mask =
   1853	 *		vmcs12->page_fault_error_code_mask;
   1854	 * evmcs->page_fault_error_code_match =
   1855	 *		vmcs12->page_fault_error_code_match;
   1856	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
   1857	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
   1858	 * evmcs->tsc_offset = vmcs12->tsc_offset;
   1859	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
   1860	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
   1861	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
   1862	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
   1863	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
   1864	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
   1865	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
   1866	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
   1867	 *
   1868	 * Not present in struct vmcs12:
   1869	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
   1870	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
   1871	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
   1872	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
   1873	 */
   1874
   1875	evmcs->guest_es_selector = vmcs12->guest_es_selector;
   1876	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
   1877	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
   1878	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
   1879	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
   1880	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
   1881	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
   1882	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
   1883
   1884	evmcs->guest_es_limit = vmcs12->guest_es_limit;
   1885	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
   1886	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
   1887	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
   1888	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
   1889	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
   1890	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
   1891	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
   1892	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
   1893	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
   1894
   1895	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
   1896	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
   1897	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
   1898	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
   1899	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
   1900	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
   1901	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
   1902	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
   1903
   1904	evmcs->guest_es_base = vmcs12->guest_es_base;
   1905	evmcs->guest_cs_base = vmcs12->guest_cs_base;
   1906	evmcs->guest_ss_base = vmcs12->guest_ss_base;
   1907	evmcs->guest_ds_base = vmcs12->guest_ds_base;
   1908	evmcs->guest_fs_base = vmcs12->guest_fs_base;
   1909	evmcs->guest_gs_base = vmcs12->guest_gs_base;
   1910	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
   1911	evmcs->guest_tr_base = vmcs12->guest_tr_base;
   1912	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
   1913	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
   1914
   1915	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
   1916	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
   1917
   1918	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
   1919	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
   1920	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
   1921	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
   1922
   1923	evmcs->guest_pending_dbg_exceptions =
   1924		vmcs12->guest_pending_dbg_exceptions;
   1925	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
   1926	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
   1927
   1928	evmcs->guest_activity_state = vmcs12->guest_activity_state;
   1929	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
   1930
   1931	evmcs->guest_cr0 = vmcs12->guest_cr0;
   1932	evmcs->guest_cr3 = vmcs12->guest_cr3;
   1933	evmcs->guest_cr4 = vmcs12->guest_cr4;
   1934	evmcs->guest_dr7 = vmcs12->guest_dr7;
   1935
   1936	evmcs->guest_physical_address = vmcs12->guest_physical_address;
   1937
   1938	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
   1939	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
   1940	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
   1941	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
   1942	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
   1943	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
   1944	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
   1945	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
   1946
   1947	evmcs->exit_qualification = vmcs12->exit_qualification;
   1948
   1949	evmcs->guest_linear_address = vmcs12->guest_linear_address;
   1950	evmcs->guest_rsp = vmcs12->guest_rsp;
   1951	evmcs->guest_rflags = vmcs12->guest_rflags;
   1952
   1953	evmcs->guest_interruptibility_info =
   1954		vmcs12->guest_interruptibility_info;
   1955	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
   1956	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
   1957	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
   1958	evmcs->vm_entry_exception_error_code =
   1959		vmcs12->vm_entry_exception_error_code;
   1960	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
   1961
   1962	evmcs->guest_rip = vmcs12->guest_rip;
   1963
   1964	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
   1965
   1966	return;
   1967}
   1968
   1969/*
   1970 * This is an equivalent of the nested hypervisor executing the vmptrld
   1971 * instruction.
   1972 */
   1973static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
   1974	struct kvm_vcpu *vcpu, bool from_launch)
   1975{
   1976	struct vcpu_vmx *vmx = to_vmx(vcpu);
   1977	bool evmcs_gpa_changed = false;
   1978	u64 evmcs_gpa;
   1979
   1980	if (likely(!vmx->nested.enlightened_vmcs_enabled))
   1981		return EVMPTRLD_DISABLED;
   1982
   1983	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
   1984		nested_release_evmcs(vcpu);
   1985		return EVMPTRLD_DISABLED;
   1986	}
   1987
   1988	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
   1989		vmx->nested.current_vmptr = INVALID_GPA;
   1990
   1991		nested_release_evmcs(vcpu);
   1992
   1993		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
   1994				 &vmx->nested.hv_evmcs_map))
   1995			return EVMPTRLD_ERROR;
   1996
   1997		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
   1998
   1999		/*
    2000		 * Currently, KVM only supports eVMCS version 1
    2001		 * (== KVM_EVMCS_VERSION) and thus expects the guest to set this
    2002		 * value in the first u32 field of the eVMCS, which specifies the
    2003		 * eVMCS VersionNumber.
    2004		 *
    2005		 * The guest learns which eVMCS versions the host supports by
    2006		 * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
    2007		 * expected to set this CPUID leaf according to the value returned
    2008		 * in vmcs_version from nested_enable_evmcs().
    2009		 *
    2010		 * However, it turns out that Microsoft Hyper-V fails to comply
    2011		 * with its own invented interface: when Hyper-V uses eVMCS, it
    2012		 * just sets the first u32 field of the eVMCS to the revision_id
    2013		 * specified in MSR_IA32_VMX_BASIC, instead of to the eVMCS version
    2014		 * number, which should be one of the supported versions specified
    2015		 * in CPUID.0x4000000A.EAX[0:15].
    2016		 *
    2017		 * To work around this Hyper-V bug, accept either a supported eVMCS
    2018		 * version or the VMCS12 revision_id as valid values for the first
    2019		 * u32 field of the eVMCS.
   2020		 */
   2021		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
   2022		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
   2023			nested_release_evmcs(vcpu);
   2024			return EVMPTRLD_VMFAIL;
   2025		}
   2026
   2027		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
   2028
   2029		evmcs_gpa_changed = true;
   2030		/*
   2031		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
   2032		 * reloaded from guest's memory (read only fields, fields not
   2033		 * present in struct hv_enlightened_vmcs, ...). Make sure there
   2034		 * are no leftovers.
   2035		 */
   2036		if (from_launch) {
   2037			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   2038			memset(vmcs12, 0, sizeof(*vmcs12));
   2039			vmcs12->hdr.revision_id = VMCS12_REVISION;
   2040		}
   2041
   2042	}
   2043
   2044	/*
    2045	 * Clean fields data can't be used on VMLAUNCH or when we switch
    2046	 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
   2047	 */
   2048	if (from_launch || evmcs_gpa_changed) {
   2049		vmx->nested.hv_evmcs->hv_clean_fields &=
   2050			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
   2051
   2052		vmx->nested.force_msr_bitmap_recalc = true;
   2053	}
   2054
   2055	return EVMPTRLD_SUCCEEDED;
   2056}
   2057
   2058void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
   2059{
   2060	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2061
   2062	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
   2063		copy_vmcs12_to_enlightened(vmx);
   2064	else
   2065		copy_vmcs12_to_shadow(vmx);
   2066
   2067	vmx->nested.need_vmcs12_to_shadow_sync = false;
   2068}
   2069
   2070static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
   2071{
   2072	struct vcpu_vmx *vmx =
   2073		container_of(timer, struct vcpu_vmx, nested.preemption_timer);
   2074
   2075	vmx->nested.preemption_timer_expired = true;
   2076	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
   2077	kvm_vcpu_kick(&vmx->vcpu);
   2078
   2079	return HRTIMER_NORESTART;
   2080}
   2081
   2082static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
   2083{
   2084	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2085	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   2086
   2087	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
   2088			    VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
   2089
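        	/*
        	 * The preemption timer counts in units of 2^rate TSC cycles, so
        	 * the L1 TSC is scaled down to timer ticks.  The absolute
        	 * deadline (in ticks) is latched the first time it is computed;
        	 * later calls simply return the time remaining until that
        	 * deadline.
        	 */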
   2090	if (!vmx->nested.has_preemption_timer_deadline) {
   2091		vmx->nested.preemption_timer_deadline =
   2092			vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
   2093		vmx->nested.has_preemption_timer_deadline = true;
   2094	}
   2095	return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
   2096}
   2097
   2098static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
   2099					u64 preemption_timeout)
   2100{
   2101	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2102
   2103	/*
   2104	 * A timer value of zero is architecturally guaranteed to cause
   2105	 * a VMExit prior to executing any instructions in the guest.
   2106	 */
   2107	if (preemption_timeout == 0) {
   2108		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
   2109		return;
   2110	}
   2111
   2112	if (vcpu->arch.virtual_tsc_khz == 0)
   2113		return;
   2114
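        	/*
        	 * Convert timer ticks to nanoseconds:
        	 *   ns = (ticks << rate) * 1000000 / virtual_tsc_khz
        	 * As a rough example, a 2 GHz virtual TSC (virtual_tsc_khz ==
        	 * 2000000) and a timer value of 1000 give
        	 * (1000 << 5) * 1000000 / 2000000 = 16000 ns.
        	 */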
   2115	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
   2116	preemption_timeout *= 1000000;
   2117	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
   2118	hrtimer_start(&vmx->nested.preemption_timer,
   2119		      ktime_add_ns(ktime_get(), preemption_timeout),
   2120		      HRTIMER_MODE_ABS_PINNED);
   2121}
   2122
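        /*
         * Compute the EFER value L2 will run with: if L1 asked to load EFER on
         * VM-entry (and that entry is still pending), use vmcs12's guest EFER;
         * otherwise derive it from the current EFER, forcing LMA/LME to match
         * the "IA-32e mode guest" VM-entry control.
         */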
   2123static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
   2124{
   2125	if (vmx->nested.nested_run_pending &&
   2126	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
   2127		return vmcs12->guest_ia32_efer;
   2128	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
   2129		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
   2130	else
   2131		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
   2132}
   2133
   2134static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
   2135{
   2136	/*
   2137	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
   2138	 * according to L0's settings (vmcs12 is irrelevant here).  Host
   2139	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
   2140	 * will be set as needed prior to VMLAUNCH/VMRESUME.
   2141	 */
   2142	if (vmx->nested.vmcs02_initialized)
   2143		return;
   2144	vmx->nested.vmcs02_initialized = true;
   2145
   2146	/*
    2147	 * We don't care what the EPTP value is; we just need to guarantee
    2148	 * it's valid so we don't get a false positive when doing early
   2149	 * consistency checks.
   2150	 */
   2151	if (enable_ept && nested_early_check)
   2152		vmcs_write64(EPT_POINTER,
   2153			     construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
   2154
   2155	/* All VMFUNCs are currently emulated through L0 vmexits.  */
   2156	if (cpu_has_vmx_vmfunc())
   2157		vmcs_write64(VM_FUNCTION_CONTROL, 0);
   2158
   2159	if (cpu_has_vmx_posted_intr())
   2160		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
   2161
   2162	if (cpu_has_vmx_msr_bitmap())
   2163		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
   2164
   2165	/*
   2166	 * PML is emulated for L2, but never enabled in hardware as the MMU
   2167	 * handles A/D emulation.  Disabling PML for L2 also avoids having to
   2168	 * deal with filtering out L2 GPAs from the buffer.
   2169	 */
   2170	if (enable_pml) {
   2171		vmcs_write64(PML_ADDRESS, 0);
   2172		vmcs_write16(GUEST_PML_INDEX, -1);
   2173	}
   2174
   2175	if (cpu_has_vmx_encls_vmexit())
   2176		vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
   2177
   2178	/*
   2179	 * Set the MSR load/store lists to match L0's settings.  Only the
   2180	 * addresses are constant (for vmcs02), the counts can change based
   2181	 * on L2's behavior, e.g. switching to/from long mode.
   2182	 */
   2183	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
   2184	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
   2185	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
   2186
   2187	vmx_set_constant_host_state(vmx);
   2188}
   2189
   2190static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
   2191				      struct vmcs12 *vmcs12)
   2192{
   2193	prepare_vmcs02_constant_state(vmx);
   2194
   2195	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
   2196
   2197	if (enable_vpid) {
   2198		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
   2199			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
   2200		else
   2201			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
   2202	}
   2203}
   2204
   2205static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
   2206				 struct vmcs12 *vmcs12)
   2207{
   2208	u32 exec_control;
   2209	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
   2210
   2211	if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
   2212		prepare_vmcs02_early_rare(vmx, vmcs12);
   2213
   2214	/*
   2215	 * PIN CONTROLS
   2216	 */
   2217	exec_control = __pin_controls_get(vmcs01);
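        	/*
        	 * The VMX-preemption timer bit is deliberately not taken from
        	 * vmcs12 here; the timer is emulated for L2 with an hrtimer
        	 * (see vmx_start_preemption_timer()) instead of the hardware
        	 * timer.
        	 */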
   2218	exec_control |= (vmcs12->pin_based_vm_exec_control &
   2219			 ~PIN_BASED_VMX_PREEMPTION_TIMER);
   2220
   2221	/* Posted interrupts setting is only taken from vmcs12.  */
   2222	vmx->nested.pi_pending = false;
   2223	if (nested_cpu_has_posted_intr(vmcs12))
   2224		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
   2225	else
   2226		exec_control &= ~PIN_BASED_POSTED_INTR;
   2227	pin_controls_set(vmx, exec_control);
   2228
   2229	/*
   2230	 * EXEC CONTROLS
   2231	 */
   2232	exec_control = __exec_controls_get(vmcs01); /* L0's desires */
   2233	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
   2234	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
   2235	exec_control &= ~CPU_BASED_TPR_SHADOW;
   2236	exec_control |= vmcs12->cpu_based_vm_exec_control;
   2237
   2238	vmx->nested.l1_tpr_threshold = -1;
   2239	if (exec_control & CPU_BASED_TPR_SHADOW)
   2240		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
   2241#ifdef CONFIG_X86_64
   2242	else
   2243		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
   2244				CPU_BASED_CR8_STORE_EXITING;
   2245#endif
   2246
   2247	/*
   2248	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
   2249	 * for I/O port accesses.
   2250	 */
   2251	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
   2252	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
   2253
   2254	/*
   2255	 * This bit will be computed in nested_get_vmcs12_pages, because
   2256	 * we do not have access to L1's MSR bitmap yet.  For now, keep
   2257	 * the same bit as before, hoping to avoid multiple VMWRITEs that
   2258	 * only set/clear this bit.
   2259	 */
   2260	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
   2261	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
   2262
   2263	exec_controls_set(vmx, exec_control);
   2264
   2265	/*
   2266	 * SECONDARY EXEC CONTROLS
   2267	 */
   2268	if (cpu_has_secondary_exec_ctrls()) {
   2269		exec_control = __secondary_exec_controls_get(vmcs01);
   2270
   2271		/* Take the following fields only from vmcs12 */
   2272		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
   2273				  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
   2274				  SECONDARY_EXEC_ENABLE_INVPCID |
   2275				  SECONDARY_EXEC_ENABLE_RDTSCP |
   2276				  SECONDARY_EXEC_XSAVES |
   2277				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
   2278				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
   2279				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
   2280				  SECONDARY_EXEC_ENABLE_VMFUNC |
   2281				  SECONDARY_EXEC_TSC_SCALING |
   2282				  SECONDARY_EXEC_DESC);
   2283
   2284		if (nested_cpu_has(vmcs12,
   2285				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
   2286			exec_control |= vmcs12->secondary_vm_exec_control;
   2287
   2288		/* PML is emulated and never enabled in hardware for L2. */
   2289		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
   2290
   2291		/* VMCS shadowing for L2 is emulated for now */
   2292		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
   2293
   2294		/*
   2295		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
   2296		 * will not have to rewrite the controls just for this bit.
   2297		 */
   2298		if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
   2299		    (vmcs12->guest_cr4 & X86_CR4_UMIP))
   2300			exec_control |= SECONDARY_EXEC_DESC;
   2301
   2302		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
   2303			vmcs_write16(GUEST_INTR_STATUS,
   2304				vmcs12->guest_intr_status);
   2305
   2306		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
    2307			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
   2308
   2309		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
   2310			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
   2311
   2312		secondary_exec_controls_set(vmx, exec_control);
   2313	}
   2314
   2315	/*
   2316	 * ENTRY CONTROLS
   2317	 *
   2318	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
   2319	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
   2320	 * on the related bits (if supported by the CPU) in the hope that
   2321	 * we can avoid VMWrites during vmx_set_efer().
   2322	 */
   2323	exec_control = __vm_entry_controls_get(vmcs01);
   2324	exec_control |= vmcs12->vm_entry_controls;
   2325	exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
   2326	if (cpu_has_load_ia32_efer()) {
   2327		if (guest_efer & EFER_LMA)
   2328			exec_control |= VM_ENTRY_IA32E_MODE;
   2329		if (guest_efer != host_efer)
   2330			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
   2331	}
   2332	vm_entry_controls_set(vmx, exec_control);
   2333
   2334	/*
   2335	 * EXIT CONTROLS
   2336	 *
   2337	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
   2338	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
   2339	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
   2340	 */
   2341	exec_control = __vm_exit_controls_get(vmcs01);
   2342	if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
   2343		exec_control |= VM_EXIT_LOAD_IA32_EFER;
   2344	else
   2345		exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
   2346	vm_exit_controls_set(vmx, exec_control);
   2347
   2348	/*
   2349	 * Interrupt/Exception Fields
   2350	 */
   2351	if (vmx->nested.nested_run_pending) {
   2352		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
   2353			     vmcs12->vm_entry_intr_info_field);
   2354		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
   2355			     vmcs12->vm_entry_exception_error_code);
   2356		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
   2357			     vmcs12->vm_entry_instruction_len);
   2358		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
   2359			     vmcs12->guest_interruptibility_info);
   2360		vmx->loaded_vmcs->nmi_known_unmasked =
   2361			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
   2362	} else {
   2363		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
   2364	}
   2365}
   2366
   2367static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
   2368{
   2369	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
   2370
   2371	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
   2372			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
   2373		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
   2374		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
   2375		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
   2376		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
   2377		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
   2378		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
   2379		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
   2380		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
   2381		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
   2382		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
   2383		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
   2384		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
   2385		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
   2386		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
   2387		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
   2388		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
   2389		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
   2390		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
   2391		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
   2392		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
   2393		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
   2394		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
   2395		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
   2396		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
   2397		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
   2398		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
   2399		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
   2400		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
   2401		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
   2402		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
   2403		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
   2404		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
   2405		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
   2406		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
   2407		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
   2408		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
   2409
   2410		vmx->segment_cache.bitmask = 0;
   2411	}
   2412
   2413	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
   2414			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
   2415		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
   2416		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
   2417			    vmcs12->guest_pending_dbg_exceptions);
   2418		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
   2419		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
   2420
   2421		/*
    2422		 * L1 may access L2's PDPTRs, so save them in order to construct
    2423		 * vmcs12.
   2424		 */
   2425		if (enable_ept) {
   2426			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
   2427			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
   2428			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
   2429			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
   2430		}
   2431
   2432		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
   2433		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
   2434			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
   2435	}
   2436
   2437	if (nested_cpu_has_xsaves(vmcs12))
   2438		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
   2439
   2440	/*
   2441	 * Whether page-faults are trapped is determined by a combination of
   2442	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
   2443	 * doesn't care about page faults then we should set all of these to
   2444	 * L1's desires. However, if L0 does care about (some) page faults, it
    2445	 * is not easy (if at all possible?) to merge L0 and L1's desires, so
    2446	 * we simply ask to exit on each and every L2 page fault. This is done by
   2447	 * setting MASK=MATCH=0 and (see below) EB.PF=1.
   2448	 * Note that below we don't need special code to set EB.PF beyond the
   2449	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
   2450	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
   2451	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
   2452	 */
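        	/*
        	 * Recap of the architectural rule (Intel SDM): a #PF with error
        	 * code EC causes a VM-exit iff
        	 *   EB.PF == ((EC & PFEC_MASK) == PFEC_MATCH).
        	 * With MASK == MATCH == 0 every error code "matches", so EB.PF=1
        	 * (from the "or" of the exception bitmaps) exits on every L2 #PF.
        	 */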
   2453	if (vmx_need_pf_intercept(&vmx->vcpu)) {
   2454		/*
   2455		 * TODO: if both L0 and L1 need the same MASK and MATCH,
   2456		 * go ahead and use it?
   2457		 */
   2458		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
   2459		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
   2460	} else {
   2461		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
   2462		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
   2463	}
   2464
   2465	if (cpu_has_vmx_apicv()) {
   2466		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
   2467		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
   2468		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
   2469		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
   2470	}
   2471
   2472	/*
   2473	 * Make sure the msr_autostore list is up to date before we set the
   2474	 * count in the vmcs02.
   2475	 */
   2476	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
   2477
   2478	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
   2479	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
   2480	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
   2481
   2482	set_cr4_guest_host_mask(vmx);
   2483}
   2484
   2485/*
   2486 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
   2487 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
   2488 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
   2489 * guest in a way that will both be appropriate to L1's requests, and our
   2490 * needs. In addition to modifying the active vmcs (which is vmcs02), this
    2491 * function also has necessary side effects, such as setting various
    2492 * vcpu->arch fields.
    2493 * Returns 0 on success, -EINVAL on failure. The invalid-state exit
    2494 * qualification code is assigned to *entry_failure_code on failure.
   2495 */
   2496static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
   2497			  bool from_vmentry,
   2498			  enum vm_entry_failure_code *entry_failure_code)
   2499{
   2500	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2501	bool load_guest_pdptrs_vmcs12 = false;
   2502
   2503	if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
   2504		prepare_vmcs02_rare(vmx, vmcs12);
   2505		vmx->nested.dirty_vmcs12 = false;
   2506
   2507		load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
   2508			!(vmx->nested.hv_evmcs->hv_clean_fields &
   2509			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
   2510	}
   2511
   2512	if (vmx->nested.nested_run_pending &&
   2513	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
   2514		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
   2515		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
   2516	} else {
   2517		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
   2518		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
   2519	}
   2520	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
   2521	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
   2522		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
   2523	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
   2524
   2525	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
   2526	 * bitwise-or of what L1 wants to trap for L2, and what we want to
   2527	 * trap. Note that CR0.TS also needs updating - we do this later.
   2528	 */
   2529	vmx_update_exception_bitmap(vcpu);
   2530	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
   2531	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
   2532
   2533	if (vmx->nested.nested_run_pending &&
   2534	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
   2535		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
   2536		vcpu->arch.pat = vmcs12->guest_ia32_pat;
   2537	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
   2538		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
   2539	}
   2540
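        	/*
        	 * Roughly, the L2-visible TSC is the composition of both levels:
        	 *   tsc02 = host_tsc * (mult01 * mult02)
        	 *           + (offset01 * mult02 + offset02),
        	 * which the two helpers below fold into a single offset and
        	 * multiplier for vmcs02.
        	 */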
   2541	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
   2542			vcpu->arch.l1_tsc_offset,
   2543			vmx_get_l2_tsc_offset(vcpu),
   2544			vmx_get_l2_tsc_multiplier(vcpu));
   2545
   2546	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
   2547			vcpu->arch.l1_tsc_scaling_ratio,
   2548			vmx_get_l2_tsc_multiplier(vcpu));
   2549
   2550	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
   2551	if (kvm_has_tsc_control)
   2552		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
   2553
   2554	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
   2555
   2556	if (nested_cpu_has_ept(vmcs12))
   2557		nested_ept_init_mmu_context(vcpu);
   2558
   2559	/*
    2560	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying the
    2561	 * bits which we consider mandatory to be enabled.
    2562	 * The CR0_READ_SHADOW is what L2 should have expected to read given
    2563	 * the specifications by L1; it's not enough to take
    2564	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have
    2565	 * more bits set than L1 expected.
   2566	 */
   2567	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
   2568	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
   2569
   2570	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
   2571	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
   2572
   2573	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
   2574	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
   2575	vmx_set_efer(vcpu, vcpu->arch.efer);
   2576
   2577	/*
   2578	 * Guest state is invalid and unrestricted guest is disabled,
   2579	 * which means L1 attempted VMEntry to L2 with invalid state.
   2580	 * Fail the VMEntry.
   2581	 *
    2582	 * However, when force loading the guest state (SMM exit or
    2583	 * loading nested state after migration), it is possible to
    2584	 * have invalid guest state at this point, which will later be
    2585	 * fixed up by restoring the L2 register state.
   2586	 */
   2587	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
   2588		*entry_failure_code = ENTRY_FAIL_DEFAULT;
   2589		return -EINVAL;
   2590	}
   2591
    2592	/* Load vmcs12->guest_cr3; the MMU uses either (nested) EPT or shadow page tables. */
   2593	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
   2594				from_vmentry, entry_failure_code))
   2595		return -EINVAL;
   2596
   2597	/*
   2598	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
   2599	 * on nested VM-Exit, which can occur without actually running L2 and
   2600	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
   2601	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
   2602	 * transition to HLT instead of running L2.
   2603	 */
   2604	if (enable_ept)
   2605		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
   2606
   2607	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
   2608	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
   2609	    is_pae_paging(vcpu)) {
   2610		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
   2611		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
   2612		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
   2613		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
   2614	}
   2615
   2616	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
   2617	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
   2618				     vmcs12->guest_ia32_perf_global_ctrl))) {
   2619		*entry_failure_code = ENTRY_FAIL_DEFAULT;
   2620		return -EINVAL;
   2621	}
   2622
   2623	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
   2624	kvm_rip_write(vcpu, vmcs12->guest_rip);
   2625
   2626	/*
   2627	 * It was observed that genuine Hyper-V running in L1 doesn't reset
    2628	 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
   2629	 * bits when it changes a field in eVMCS. Mark all fields as clean
   2630	 * here.
   2631	 */
   2632	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
   2633		vmx->nested.hv_evmcs->hv_clean_fields |=
   2634			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
   2635
   2636	return 0;
   2637}
   2638
   2639static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
   2640{
   2641	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
   2642	       nested_cpu_has_virtual_nmis(vmcs12)))
   2643		return -EINVAL;
   2644
   2645	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
   2646	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
   2647		return -EINVAL;
   2648
   2649	return 0;
   2650}
   2651
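        /*
         * Validate an EPTP value provided by L1.  For reference, the EPTP layout
         * is: bits 2:0 memory type, bits 5:3 page-walk length minus 1, bit 6
         * accessed/dirty enable, bits 11:7 treated as reserved here, and the
         * upper bits hold the page-aligned physical address of the root EPT table.
         */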
   2652static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
   2653{
   2654	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2655
   2656	/* Check for memory type validity */
   2657	switch (new_eptp & VMX_EPTP_MT_MASK) {
   2658	case VMX_EPTP_MT_UC:
   2659		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
   2660			return false;
   2661		break;
   2662	case VMX_EPTP_MT_WB:
   2663		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
   2664			return false;
   2665		break;
   2666	default:
   2667		return false;
   2668	}
   2669
   2670	/* Page-walk levels validity. */
   2671	switch (new_eptp & VMX_EPTP_PWL_MASK) {
   2672	case VMX_EPTP_PWL_5:
   2673		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
   2674			return false;
   2675		break;
   2676	case VMX_EPTP_PWL_4:
   2677		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
   2678			return false;
   2679		break;
   2680	default:
   2681		return false;
   2682	}
   2683
   2684	/* Reserved bits should not be set */
   2685	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
   2686		return false;
   2687
   2688	/* AD, if set, should be supported */
   2689	if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
   2690		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
   2691			return false;
   2692	}
   2693
   2694	return true;
   2695}
   2696
   2697/*
   2698 * Checks related to VM-Execution Control Fields
   2699 */
   2700static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
   2701                                              struct vmcs12 *vmcs12)
   2702{
   2703	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2704
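        	/*
        	 * Each capability MSR reports the allowed-0 settings in its low
        	 * 32 bits and the allowed-1 settings in its high 32 bits;
        	 * vmx_control_verify() checks that every required-1 bit is set
        	 * and that no disallowed bit is set.
        	 */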
   2705	if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
   2706				   vmx->nested.msrs.pinbased_ctls_low,
   2707				   vmx->nested.msrs.pinbased_ctls_high)) ||
   2708	    CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
   2709				   vmx->nested.msrs.procbased_ctls_low,
   2710				   vmx->nested.msrs.procbased_ctls_high)))
   2711		return -EINVAL;
   2712
   2713	if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
   2714	    CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
   2715				   vmx->nested.msrs.secondary_ctls_low,
   2716				   vmx->nested.msrs.secondary_ctls_high)))
   2717		return -EINVAL;
   2718
   2719	if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
   2720	    nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
   2721	    nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
   2722	    nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
   2723	    nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
   2724	    nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
   2725	    nested_vmx_check_nmi_controls(vmcs12) ||
   2726	    nested_vmx_check_pml_controls(vcpu, vmcs12) ||
   2727	    nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
   2728	    nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
   2729	    nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
   2730	    CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
   2731		return -EINVAL;
   2732
   2733	if (!nested_cpu_has_preemption_timer(vmcs12) &&
   2734	    nested_cpu_has_save_preemption_timer(vmcs12))
   2735		return -EINVAL;
   2736
   2737	if (nested_cpu_has_ept(vmcs12) &&
   2738	    CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
   2739		return -EINVAL;
   2740
   2741	if (nested_cpu_has_vmfunc(vmcs12)) {
   2742		if (CC(vmcs12->vm_function_control &
   2743		       ~vmx->nested.msrs.vmfunc_controls))
   2744			return -EINVAL;
   2745
   2746		if (nested_cpu_has_eptp_switching(vmcs12)) {
   2747			if (CC(!nested_cpu_has_ept(vmcs12)) ||
   2748			    CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
   2749				return -EINVAL;
   2750		}
   2751	}
   2752
   2753	return 0;
   2754}
   2755
   2756/*
   2757 * Checks related to VM-Exit Control Fields
   2758 */
   2759static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
   2760                                         struct vmcs12 *vmcs12)
   2761{
   2762	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2763
   2764	if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
   2765				    vmx->nested.msrs.exit_ctls_low,
   2766				    vmx->nested.msrs.exit_ctls_high)) ||
   2767	    CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
   2768		return -EINVAL;
   2769
   2770	return 0;
   2771}
   2772
   2773/*
   2774 * Checks related to VM-Entry Control Fields
   2775 */
   2776static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
   2777					  struct vmcs12 *vmcs12)
   2778{
   2779	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2780
   2781	if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
   2782				    vmx->nested.msrs.entry_ctls_low,
   2783				    vmx->nested.msrs.entry_ctls_high)))
   2784		return -EINVAL;
   2785
   2786	/*
   2787	 * From the Intel SDM, volume 3:
   2788	 * Fields relevant to VM-entry event injection must be set properly.
   2789	 * These fields are the VM-entry interruption-information field, the
   2790	 * VM-entry exception error code, and the VM-entry instruction length.
   2791	 */
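        	/*
        	 * For example, injecting #GP (hardware exception, vector 13)
        	 * into a protected-mode guest requires the "deliver error code"
        	 * bit to be set, an error code with bits 31:16 clear, and all
        	 * reserved bits of the interruption-information field clear.
        	 */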
   2792	if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
   2793		u32 intr_info = vmcs12->vm_entry_intr_info_field;
   2794		u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
   2795		u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
   2796		bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
   2797		bool should_have_error_code;
   2798		bool urg = nested_cpu_has2(vmcs12,
   2799					   SECONDARY_EXEC_UNRESTRICTED_GUEST);
   2800		bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
   2801
   2802		/* VM-entry interruption-info field: interruption type */
   2803		if (CC(intr_type == INTR_TYPE_RESERVED) ||
   2804		    CC(intr_type == INTR_TYPE_OTHER_EVENT &&
   2805		       !nested_cpu_supports_monitor_trap_flag(vcpu)))
   2806			return -EINVAL;
   2807
   2808		/* VM-entry interruption-info field: vector */
   2809		if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
   2810		    CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
   2811		    CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
   2812			return -EINVAL;
   2813
   2814		/* VM-entry interruption-info field: deliver error code */
   2815		should_have_error_code =
   2816			intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
   2817			x86_exception_has_error_code(vector);
   2818		if (CC(has_error_code != should_have_error_code))
   2819			return -EINVAL;
   2820
   2821		/* VM-entry exception error code */
   2822		if (CC(has_error_code &&
   2823		       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
   2824			return -EINVAL;
   2825
   2826		/* VM-entry interruption-info field: reserved bits */
   2827		if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
   2828			return -EINVAL;
   2829
   2830		/* VM-entry instruction length */
   2831		switch (intr_type) {
   2832		case INTR_TYPE_SOFT_EXCEPTION:
   2833		case INTR_TYPE_SOFT_INTR:
   2834		case INTR_TYPE_PRIV_SW_EXCEPTION:
   2835			if (CC(vmcs12->vm_entry_instruction_len > 15) ||
   2836			    CC(vmcs12->vm_entry_instruction_len == 0 &&
   2837			    CC(!nested_cpu_has_zero_length_injection(vcpu))))
   2838				return -EINVAL;
   2839		}
   2840	}
   2841
   2842	if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
   2843		return -EINVAL;
   2844
   2845	return 0;
   2846}
   2847
   2848static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
   2849				     struct vmcs12 *vmcs12)
   2850{
   2851	if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
   2852	    nested_check_vm_exit_controls(vcpu, vmcs12) ||
   2853	    nested_check_vm_entry_controls(vcpu, vmcs12))
   2854		return -EINVAL;
   2855
   2856	if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
   2857		return nested_evmcs_check_controls(vmcs12);
   2858
   2859	return 0;
   2860}
   2861
   2862static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
   2863				       struct vmcs12 *vmcs12)
   2864{
   2865#ifdef CONFIG_X86_64
   2866	if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
   2867		!!(vcpu->arch.efer & EFER_LMA)))
   2868		return -EINVAL;
   2869#endif
   2870	return 0;
   2871}
   2872
   2873static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
   2874				       struct vmcs12 *vmcs12)
   2875{
   2876	bool ia32e;
   2877
   2878	if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
   2879	    CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
   2880	    CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
   2881		return -EINVAL;
   2882
   2883	if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
   2884	    CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
   2885		return -EINVAL;
   2886
   2887	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
   2888	    CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
   2889		return -EINVAL;
   2890
   2891	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
   2892	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
   2893					   vmcs12->host_ia32_perf_global_ctrl)))
   2894		return -EINVAL;
   2895
   2896#ifdef CONFIG_X86_64
   2897	ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
   2898#else
   2899	ia32e = false;
   2900#endif
   2901
   2902	if (ia32e) {
   2903		if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
   2904			return -EINVAL;
   2905	} else {
   2906		if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
   2907		    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
   2908		    CC((vmcs12->host_rip) >> 32))
   2909			return -EINVAL;
   2910	}
   2911
   2912	if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
   2913	    CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
   2914	    CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
   2915	    CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
   2916	    CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
   2917	    CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
   2918	    CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
   2919	    CC(vmcs12->host_cs_selector == 0) ||
   2920	    CC(vmcs12->host_tr_selector == 0) ||
   2921	    CC(vmcs12->host_ss_selector == 0 && !ia32e))
   2922		return -EINVAL;
   2923
   2924	if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
   2925	    CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
   2926	    CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
   2927	    CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
   2928	    CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
   2929	    CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
   2930		return -EINVAL;
   2931
   2932	/*
   2933	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
   2934	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
   2935	 * the values of the LMA and LME bits in the field must each be that of
   2936	 * the host address-space size VM-exit control.
   2937	 */
   2938	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
   2939		if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
   2940		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
   2941		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
   2942			return -EINVAL;
   2943	}
   2944
   2945	return 0;
   2946}
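
/*
 * Concrete (hypothetical) example of the selector checks above: a host CS
 * selector of 0x0010 (index 2, GDT, RPL 0) is accepted, while 0x0013
 * fails because its RPL bits (SEGMENT_RPL_MASK) are non-zero, and 0x0000
 * fails the explicit NULL-selector check for CS.
 */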
   2947
   2948static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
   2949					  struct vmcs12 *vmcs12)
   2950{
   2951	struct vcpu_vmx *vmx = to_vmx(vcpu);
   2952	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
   2953	struct vmcs_hdr hdr;
   2954
   2955	if (vmcs12->vmcs_link_pointer == INVALID_GPA)
   2956		return 0;
   2957
   2958	if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
   2959		return -EINVAL;
   2960
   2961	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
   2962	    CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
   2963					 vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
    2964		return -EINVAL;
   2965
   2966	if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
   2967					    offsetof(struct vmcs12, hdr),
   2968					    sizeof(hdr))))
   2969		return -EINVAL;
   2970
   2971	if (CC(hdr.revision_id != VMCS12_REVISION) ||
   2972	    CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
   2973		return -EINVAL;
   2974
   2975	return 0;
   2976}
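
/*
 * Minimal sketch of what the link-pointer check accepts: a
 * vmcs_link_pointer of INVALID_GPA (all ones) means "no shadow VMCS" and
 * passes trivially; any other value must be a legal, page-aligned GPA
 * whose header carries VMCS12_REVISION and whose shadow-VMCS flag matches
 * the VMCS-shadowing secondary control in vmcs12.
 */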
   2977
   2978/*
   2979 * Checks related to Guest Non-register State
   2980 */
   2981static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
   2982{
   2983	if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
   2984	       vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
   2985	       vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
   2986		return -EINVAL;
   2987
   2988	return 0;
   2989}
   2990
   2991static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
   2992					struct vmcs12 *vmcs12,
   2993					enum vm_entry_failure_code *entry_failure_code)
   2994{
   2995	bool ia32e;
   2996
   2997	*entry_failure_code = ENTRY_FAIL_DEFAULT;
   2998
   2999	if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
   3000	    CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
   3001		return -EINVAL;
   3002
   3003	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
   3004	    CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
   3005		return -EINVAL;
   3006
   3007	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
   3008	    CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
   3009		return -EINVAL;
   3010
   3011	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
   3012		*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
   3013		return -EINVAL;
   3014	}
   3015
   3016	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
   3017	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
   3018					   vmcs12->guest_ia32_perf_global_ctrl)))
   3019		return -EINVAL;
   3020
   3021	/*
   3022	 * If the load IA32_EFER VM-entry control is 1, the following checks
   3023	 * are performed on the field for the IA32_EFER MSR:
   3024	 * - Bits reserved in the IA32_EFER MSR must be 0.
   3025	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
   3026	 *   the IA-32e mode guest VM-exit control. It must also be identical
   3027	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
   3028	 *   CR0.PG) is 1.
   3029	 */
   3030	if (to_vmx(vcpu)->nested.nested_run_pending &&
   3031	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
   3032		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
   3033		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
   3034		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
   3035		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
   3036		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
   3037			return -EINVAL;
   3038	}
   3039
   3040	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
   3041	    (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
   3042	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
   3043		return -EINVAL;
   3044
   3045	if (nested_check_guest_non_reg_state(vmcs12))
   3046		return -EINVAL;
   3047
   3048	return 0;
   3049}
   3050
   3051static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
   3052{
   3053	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3054	unsigned long cr3, cr4;
   3055	bool vm_fail;
   3056
   3057	if (!nested_early_check)
   3058		return 0;
   3059
   3060	if (vmx->msr_autoload.host.nr)
   3061		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
   3062	if (vmx->msr_autoload.guest.nr)
   3063		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
   3064
   3065	preempt_disable();
   3066
   3067	vmx_prepare_switch_to_guest(vcpu);
   3068
   3069	/*
   3070	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
   3071	 * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
   3072	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
   3073	 * there is no need to preserve other bits or save/restore the field.
   3074	 */
   3075	vmcs_writel(GUEST_RFLAGS, 0);
   3076
   3077	cr3 = __get_current_cr3_fast();
   3078	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
   3079		vmcs_writel(HOST_CR3, cr3);
   3080		vmx->loaded_vmcs->host_state.cr3 = cr3;
   3081	}
   3082
   3083	cr4 = cr4_read_shadow();
   3084	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
   3085		vmcs_writel(HOST_CR4, cr4);
   3086		vmx->loaded_vmcs->host_state.cr4 = cr4;
   3087	}
   3088
   3089	vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
   3090				 vmx->loaded_vmcs->launched);
   3091
   3092	if (vmx->msr_autoload.host.nr)
   3093		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
   3094	if (vmx->msr_autoload.guest.nr)
   3095		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
   3096
   3097	if (vm_fail) {
   3098		u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
   3099
   3100		preempt_enable();
   3101
   3102		trace_kvm_nested_vmenter_failed(
   3103			"early hardware check VM-instruction error: ", error);
   3104		WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
   3105		return 1;
   3106	}
   3107
   3108	/*
   3109	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
   3110	 */
   3111	if (hw_breakpoint_active())
   3112		set_debugreg(__this_cpu_read(cpu_dr7), 7);
   3113	local_irq_enable();
   3114	preempt_enable();
   3115
   3116	/*
   3117	 * A non-failing VMEntry means we somehow entered guest mode with
   3118	 * an illegal RIP, and that's just the tip of the iceberg.  There
   3119	 * is no telling what memory has been modified or what state has
   3120	 * been exposed to unknown code.  Hitting this all but guarantees
   3121	 * a (very critical) hardware issue.
   3122	 */
   3123	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
   3124		VMX_EXIT_REASONS_FAILED_VMENTRY));
   3125
   3126	return 0;
   3127}
   3128
   3129static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
   3130{
   3131	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3132
   3133	/*
   3134	 * hv_evmcs may end up being not mapped after migration (when
   3135	 * L2 was running), map it here to make sure vmcs12 changes are
   3136	 * properly reflected.
   3137	 */
   3138	if (vmx->nested.enlightened_vmcs_enabled &&
   3139	    vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
   3140		enum nested_evmptrld_status evmptrld_status =
   3141			nested_vmx_handle_enlightened_vmptrld(vcpu, false);
   3142
   3143		if (evmptrld_status == EVMPTRLD_VMFAIL ||
   3144		    evmptrld_status == EVMPTRLD_ERROR)
   3145			return false;
   3146
   3147		/*
    3148		 * The post-migration vmcs12 always provides the most up-to-date
    3149		 * information; copy it to the eVMCS upon entry.
   3150		 */
   3151		vmx->nested.need_vmcs12_to_shadow_sync = true;
   3152	}
   3153
   3154	return true;
   3155}
   3156
   3157static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
   3158{
   3159	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   3160	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3161	struct kvm_host_map *map;
   3162	struct page *page;
   3163	u64 hpa;
   3164
   3165	if (!vcpu->arch.pdptrs_from_userspace &&
   3166	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
   3167		/*
   3168		 * Reload the guest's PDPTRs since after a migration
    3169		 * the guest CR3 might be restored prior to setting the nested
    3170		 * state, which can lead to loading the wrong PDPTRs.
   3171		 */
   3172		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
   3173			return false;
   3174	}
   3175
   3176
   3177	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
   3178		/*
   3179		 * Translate L1 physical address to host physical
   3180		 * address for vmcs02. Keep the page pinned, so this
   3181		 * physical address remains valid. We keep a reference
   3182		 * to it so we can release it later.
   3183		 */
   3184		if (vmx->nested.apic_access_page) { /* shouldn't happen */
   3185			kvm_release_page_clean(vmx->nested.apic_access_page);
   3186			vmx->nested.apic_access_page = NULL;
   3187		}
   3188		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
   3189		if (!is_error_page(page)) {
   3190			vmx->nested.apic_access_page = page;
   3191			hpa = page_to_phys(vmx->nested.apic_access_page);
   3192			vmcs_write64(APIC_ACCESS_ADDR, hpa);
   3193		} else {
   3194			pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
   3195					     __func__);
   3196			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   3197			vcpu->run->internal.suberror =
   3198				KVM_INTERNAL_ERROR_EMULATION;
   3199			vcpu->run->internal.ndata = 0;
   3200			return false;
   3201		}
   3202	}
   3203
   3204	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
   3205		map = &vmx->nested.virtual_apic_map;
   3206
   3207		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
   3208			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
   3209		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
   3210		           nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
   3211			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
   3212			/*
   3213			 * The processor will never use the TPR shadow, simply
   3214			 * clear the bit from the execution control.  Such a
   3215			 * configuration is useless, but it happens in tests.
   3216			 * For any other configuration, failing the vm entry is
   3217			 * _not_ what the processor does but it's basically the
   3218			 * only possibility we have.
   3219			 */
   3220			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
   3221		} else {
   3222			/*
   3223			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
   3224			 * force VM-Entry to fail.
   3225			 */
   3226			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
   3227		}
   3228	}
   3229
   3230	if (nested_cpu_has_posted_intr(vmcs12)) {
   3231		map = &vmx->nested.pi_desc_map;
   3232
   3233		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
   3234			vmx->nested.pi_desc =
   3235				(struct pi_desc *)(((void *)map->hva) +
   3236				offset_in_page(vmcs12->posted_intr_desc_addr));
   3237			vmcs_write64(POSTED_INTR_DESC_ADDR,
   3238				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
   3239		} else {
   3240			/*
   3241			 * Defer the KVM_INTERNAL_EXIT until KVM tries to
   3242			 * access the contents of the VMCS12 posted interrupt
   3243			 * descriptor. (Note that KVM may do this when it
   3244			 * should not, per the architectural specification.)
   3245			 */
   3246			vmx->nested.pi_desc = NULL;
   3247			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
   3248		}
   3249	}
   3250	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
   3251		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
   3252	else
   3253		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
   3254
   3255	return true;
   3256}
   3257
   3258static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
   3259{
   3260	if (!nested_get_evmcs_page(vcpu)) {
   3261		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
   3262				     __func__);
   3263		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   3264		vcpu->run->internal.suberror =
   3265			KVM_INTERNAL_ERROR_EMULATION;
   3266		vcpu->run->internal.ndata = 0;
   3267
   3268		return false;
   3269	}
   3270
   3271	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
   3272		return false;
   3273
   3274	return true;
   3275}
   3276
   3277static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
   3278{
   3279	struct vmcs12 *vmcs12;
   3280	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3281	gpa_t dst;
   3282
   3283	if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
   3284		return 0;
   3285
   3286	if (WARN_ON_ONCE(vmx->nested.pml_full))
   3287		return 1;
   3288
   3289	/*
   3290	 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
   3291	 * set is already checked as part of A/D emulation.
   3292	 */
   3293	vmcs12 = get_vmcs12(vcpu);
   3294	if (!nested_cpu_has_pml(vmcs12))
   3295		return 0;
   3296
   3297	if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
   3298		vmx->nested.pml_full = true;
   3299		return 1;
   3300	}
   3301
   3302	gpa &= ~0xFFFull;
   3303	dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
   3304
   3305	if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
   3306				 offset_in_page(dst), sizeof(gpa)))
   3307		return 0;
   3308
   3309	vmcs12->guest_pml_index--;
   3310
   3311	return 0;
   3312}
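
/*
 * Worked example (hypothetical addresses): with vmcs12->pml_address =
 * 0x12340000 and guest_pml_index = 511, the dirty GPA (masked down to its
 * 4KiB page) is written to dst = 0x12340000 + 8 * 511 = 0x12340ff8 and
 * the index drops to 510, i.e. the 4KiB PML buffer is filled from its
 * last entry toward entry 0.  An index >= PML_ENTITY_NUM (512) instead
 * records a PML-full condition for L1.
 */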
   3313
   3314/*
   3315 * Intel's VMX Instruction Reference specifies a common set of prerequisites
   3316 * for running VMX instructions (except VMXON, whose prerequisites are
   3317 * slightly different). It also specifies what exception to inject otherwise.
   3318 * Note that many of these exceptions have priority over VM exits, so they
   3319 * don't have to be checked again here.
   3320 */
   3321static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
   3322{
   3323	if (!to_vmx(vcpu)->nested.vmxon) {
   3324		kvm_queue_exception(vcpu, UD_VECTOR);
   3325		return 0;
   3326	}
   3327
   3328	if (vmx_get_cpl(vcpu)) {
   3329		kvm_inject_gp(vcpu, 0);
   3330		return 0;
   3331	}
   3332
   3333	return 1;
   3334}
   3335
   3336static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
   3337{
   3338	u8 rvi = vmx_get_rvi();
   3339	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
   3340
   3341	return ((rvi & 0xf0) > (vppr & 0xf0));
   3342}
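
/*
 * Example of the priority comparison above (hypothetical values): with
 * RVI = 0x51 and VPPR = 0x40 the upper nibbles compare as 0x50 > 0x40, so
 * the highest pending virtual interrupt outranks the processor priority
 * and is considered deliverable; RVI = 0x41 with VPPR = 0x40 would not be.
 */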
   3343
   3344static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
   3345				   struct vmcs12 *vmcs12);
   3346
   3347/*
   3348 * If from_vmentry is false, this is being called from state restore (either RSM
   3349 * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
   3350 *
   3351 * Returns:
   3352 *	NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
   3353 *	NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
   3354 *	NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
   3355 *	NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
   3356 */
   3357enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
   3358							bool from_vmentry)
   3359{
   3360	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3361	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   3362	enum vm_entry_failure_code entry_failure_code;
   3363	bool evaluate_pending_interrupts;
   3364	union vmx_exit_reason exit_reason = {
   3365		.basic = EXIT_REASON_INVALID_STATE,
   3366		.failed_vmentry = 1,
   3367	};
   3368	u32 failed_index;
   3369
   3370	kvm_service_local_tlb_flush_requests(vcpu);
   3371
   3372	evaluate_pending_interrupts = exec_controls_get(vmx) &
   3373		(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
   3374	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
   3375		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
   3376
   3377	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
   3378		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
   3379	if (kvm_mpx_supported() &&
   3380		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
   3381		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
   3382
   3383	/*
   3384	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
   3385	 * nested early checks are disabled.  In the event of a "late" VM-Fail,
   3386	 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
   3387	 * software model to the pre-VMEntry host state.  When EPT is disabled,
   3388	 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
   3389	 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
   3390	 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
   3391	 * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
   3392	 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
   3393	 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
   3394	 * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
   3395	 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
   3396	 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
   3397	 * path would need to manually save/restore vmcs01.GUEST_CR3.
   3398	 */
   3399	if (!enable_ept && !nested_early_check)
   3400		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
   3401
   3402	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
   3403
   3404	prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
   3405
   3406	if (from_vmentry) {
   3407		if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
   3408			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
   3409			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
   3410		}
   3411
   3412		if (nested_vmx_check_vmentry_hw(vcpu)) {
   3413			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
   3414			return NVMX_VMENTRY_VMFAIL;
   3415		}
   3416
   3417		if (nested_vmx_check_guest_state(vcpu, vmcs12,
   3418						 &entry_failure_code)) {
   3419			exit_reason.basic = EXIT_REASON_INVALID_STATE;
   3420			vmcs12->exit_qualification = entry_failure_code;
   3421			goto vmentry_fail_vmexit;
   3422		}
   3423	}
   3424
   3425	enter_guest_mode(vcpu);
   3426
   3427	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
   3428		exit_reason.basic = EXIT_REASON_INVALID_STATE;
   3429		vmcs12->exit_qualification = entry_failure_code;
   3430		goto vmentry_fail_vmexit_guest_mode;
   3431	}
   3432
   3433	if (from_vmentry) {
   3434		failed_index = nested_vmx_load_msr(vcpu,
   3435						   vmcs12->vm_entry_msr_load_addr,
   3436						   vmcs12->vm_entry_msr_load_count);
   3437		if (failed_index) {
   3438			exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
   3439			vmcs12->exit_qualification = failed_index;
   3440			goto vmentry_fail_vmexit_guest_mode;
   3441		}
   3442	} else {
   3443		/*
   3444		 * The MMU is not initialized to point at the right entities yet and
   3445		 * "get pages" would need to read data from the guest (i.e. we will
   3446		 * need to perform gpa to hpa translation). Request a call
   3447		 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
   3448		 * have already been set at vmentry time and should not be reset.
   3449		 */
   3450		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
   3451	}
   3452
   3453	/*
   3454	 * If L1 had a pending IRQ/NMI until it executed
   3455	 * VMLAUNCH/VMRESUME which wasn't delivered because it was
   3456	 * disallowed (e.g. interrupts disabled), L0 needs to
   3457	 * evaluate if this pending event should cause an exit from L2
    3458	 * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
    3459	 * intercept EXTERNAL_INTERRUPT).
   3460	 *
   3461	 * Usually this would be handled by the processor noticing an
   3462	 * IRQ/NMI window request, or checking RVI during evaluation of
   3463	 * pending virtual interrupts.  However, this setting was done
   3464	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
   3465	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
   3466	 */
   3467	if (unlikely(evaluate_pending_interrupts))
   3468		kvm_make_request(KVM_REQ_EVENT, vcpu);
   3469
   3470	/*
   3471	 * Do not start the preemption timer hrtimer until after we know
   3472	 * we are successful, so that only nested_vmx_vmexit needs to cancel
   3473	 * the timer.
   3474	 */
   3475	vmx->nested.preemption_timer_expired = false;
   3476	if (nested_cpu_has_preemption_timer(vmcs12)) {
   3477		u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
   3478		vmx_start_preemption_timer(vcpu, timer_value);
   3479	}
   3480
   3481	/*
   3482	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
   3483	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
   3484	 * returned as far as L1 is concerned. It will only return (and set
   3485	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
   3486	 */
   3487	return NVMX_VMENTRY_SUCCESS;
   3488
   3489	/*
   3490	 * A failed consistency check that leads to a VMExit during L1's
   3491	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
   3492	 * 26.7 "VM-entry failures during or after loading guest state".
   3493	 */
   3494vmentry_fail_vmexit_guest_mode:
   3495	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
   3496		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
   3497	leave_guest_mode(vcpu);
   3498
   3499vmentry_fail_vmexit:
   3500	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
   3501
   3502	if (!from_vmentry)
   3503		return NVMX_VMENTRY_VMEXIT;
   3504
   3505	load_vmcs12_host_state(vcpu, vmcs12);
   3506	vmcs12->vm_exit_reason = exit_reason.full;
   3507	if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
   3508		vmx->nested.need_vmcs12_to_shadow_sync = true;
   3509	return NVMX_VMENTRY_VMEXIT;
   3510}
   3511
   3512/*
   3513 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
   3514 * for running an L2 nested guest.
   3515 */
   3516static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
   3517{
   3518	struct vmcs12 *vmcs12;
   3519	enum nvmx_vmentry_status status;
   3520	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3521	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
   3522	enum nested_evmptrld_status evmptrld_status;
   3523
   3524	if (!nested_vmx_check_permission(vcpu))
   3525		return 1;
   3526
   3527	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
   3528	if (evmptrld_status == EVMPTRLD_ERROR) {
   3529		kvm_queue_exception(vcpu, UD_VECTOR);
   3530		return 1;
   3531	}
   3532
   3533	kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
   3534
   3535	if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
   3536		return nested_vmx_failInvalid(vcpu);
   3537
   3538	if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
   3539	       vmx->nested.current_vmptr == INVALID_GPA))
   3540		return nested_vmx_failInvalid(vcpu);
   3541
   3542	vmcs12 = get_vmcs12(vcpu);
   3543
   3544	/*
   3545	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
   3546	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
   3547	 * rather than RFLAGS.ZF, and no error number is stored to the
   3548	 * VM-instruction error field.
   3549	 */
   3550	if (CC(vmcs12->hdr.shadow_vmcs))
   3551		return nested_vmx_failInvalid(vcpu);
   3552
   3553	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
   3554		copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
   3555		/* Enlightened VMCS doesn't have launch state */
   3556		vmcs12->launch_state = !launch;
   3557	} else if (enable_shadow_vmcs) {
   3558		copy_shadow_to_vmcs12(vmx);
   3559	}
   3560
   3561	/*
   3562	 * The nested entry process starts with enforcing various prerequisites
    3563	 * on vmcs12 as required by the Intel SDM, and acts appropriately when
    3564	 * they fail: as the SDM explains, some conditions should cause the
   3565	 * instruction to fail, while others will cause the instruction to seem
   3566	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
   3567	 * To speed up the normal (success) code path, we should avoid checking
   3568	 * for misconfigurations which will anyway be caught by the processor
   3569	 * when using the merged vmcs02.
   3570	 */
   3571	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
   3572		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
   3573
   3574	if (CC(vmcs12->launch_state == launch))
   3575		return nested_vmx_fail(vcpu,
   3576			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
   3577			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
   3578
   3579	if (nested_vmx_check_controls(vcpu, vmcs12))
   3580		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
   3581
   3582	if (nested_vmx_check_address_space_size(vcpu, vmcs12))
   3583		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
   3584
   3585	if (nested_vmx_check_host_state(vcpu, vmcs12))
   3586		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
   3587
   3588	/*
   3589	 * We're finally done with prerequisite checking, and can start with
   3590	 * the nested entry.
   3591	 */
   3592	vmx->nested.nested_run_pending = 1;
   3593	vmx->nested.has_preemption_timer_deadline = false;
   3594	status = nested_vmx_enter_non_root_mode(vcpu, true);
   3595	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
   3596		goto vmentry_failed;
   3597
   3598	/* Emulate processing of posted interrupts on VM-Enter. */
   3599	if (nested_cpu_has_posted_intr(vmcs12) &&
   3600	    kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
   3601		vmx->nested.pi_pending = true;
   3602		kvm_make_request(KVM_REQ_EVENT, vcpu);
   3603		kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
   3604	}
   3605
   3606	/* Hide L1D cache contents from the nested guest.  */
   3607	vmx->vcpu.arch.l1tf_flush_l1d = true;
   3608
   3609	/*
   3610	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
   3611	 * also be used as part of restoring nVMX state for
   3612	 * snapshot restore (migration).
   3613	 *
    3614	 * In this flow, it is assumed that the vmcs12 cache was
    3615	 * transferred as part of the captured nVMX state and should
   3616	 * therefore not be read from guest memory (which may not
   3617	 * exist on destination host yet).
   3618	 */
   3619	nested_cache_shadow_vmcs12(vcpu, vmcs12);
   3620
   3621	switch (vmcs12->guest_activity_state) {
   3622	case GUEST_ACTIVITY_HLT:
   3623		/*
   3624		 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
   3625		 * awakened by event injection or by an NMI-window VM-exit or
   3626		 * by an interrupt-window VM-exit, halt the vcpu.
   3627		 */
   3628		if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
   3629		    !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
   3630		    !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
   3631		      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
   3632			vmx->nested.nested_run_pending = 0;
   3633			return kvm_emulate_halt_noskip(vcpu);
   3634		}
   3635		break;
   3636	case GUEST_ACTIVITY_WAIT_SIPI:
   3637		vmx->nested.nested_run_pending = 0;
   3638		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
   3639		break;
   3640	default:
   3641		break;
   3642	}
   3643
   3644	return 1;
   3645
   3646vmentry_failed:
   3647	vmx->nested.nested_run_pending = 0;
   3648	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
   3649		return 0;
   3650	if (status == NVMX_VMENTRY_VMEXIT)
   3651		return 1;
   3652	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
   3653	return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
   3654}
   3655
   3656/*
   3657 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
   3658 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
   3659 * This function returns the new value we should put in vmcs12.guest_cr0.
   3660 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
   3661 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
   3662 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
   3663 *     didn't trap the bit, because if L1 did, so would L0).
   3664 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
   3665 *     been modified by L2, and L1 knows it. So just leave the old value of
   3666 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
   3667 *     isn't relevant, because if L0 traps this bit it can set it to anything.
   3668 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
   3669 *     changed these bits, and therefore they need to be updated, but L0
   3670 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
   3671 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
   3672 */
   3673static inline unsigned long
   3674vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
   3675{
   3676	return
   3677	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
   3678	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
   3679	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
   3680			vcpu->arch.cr0_guest_owned_bits));
   3681}
   3682
   3683static inline unsigned long
   3684vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
   3685{
   3686	return
   3687	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
   3688	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
   3689	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
   3690			vcpu->arch.cr4_guest_owned_bits));
   3691}
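
/*
 * Concrete illustration of the three cases above, using CR0.TS as a
 * hypothetical example: if neither L0 nor L1 owns TS, L2's writes landed
 * in vmcs02 GUEST_CR0, so the bit is taken from there (case 1); if L1
 * trapped TS, L2 cannot have changed it and the existing vmcs12 value is
 * kept (case 2); if only L0 trapped it, the value L2 wanted is held in
 * vmcs02's CR0_READ_SHADOW and is taken from there (case 3).
 */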
   3692
   3693static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
   3694				      struct vmcs12 *vmcs12,
   3695				      u32 vm_exit_reason, u32 exit_intr_info)
   3696{
   3697	u32 idt_vectoring;
   3698	unsigned int nr;
   3699
   3700	/*
   3701	 * Per the SDM, VM-Exits due to double and triple faults are never
   3702	 * considered to occur during event delivery, even if the double/triple
   3703	 * fault is the result of an escalating vectoring issue.
   3704	 *
   3705	 * Note, the SDM qualifies the double fault behavior with "The original
   3706	 * event results in a double-fault exception".  It's unclear why the
   3707	 * qualification exists since exits due to double fault can occur only
   3708	 * while vectoring a different exception (injected events are never
   3709	 * subject to interception), i.e. there's _always_ an original event.
   3710	 *
   3711	 * The SDM also uses NMI as a confusing example for the "original event
   3712	 * causes the VM exit directly" clause.  NMI isn't special in any way,
   3713	 * the same rule applies to all events that cause an exit directly.
   3714	 * NMI is an odd choice for the example because NMIs can only occur on
   3715	 * instruction boundaries, i.e. they _can't_ occur during vectoring.
   3716	 */
   3717	if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
   3718	    ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
   3719	     is_double_fault(exit_intr_info))) {
   3720		vmcs12->idt_vectoring_info_field = 0;
   3721	} else if (vcpu->arch.exception.injected) {
   3722		nr = vcpu->arch.exception.nr;
   3723		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
   3724
   3725		if (kvm_exception_is_soft(nr)) {
   3726			vmcs12->vm_exit_instruction_len =
   3727				vcpu->arch.event_exit_inst_len;
   3728			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
   3729		} else
   3730			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
   3731
   3732		if (vcpu->arch.exception.has_error_code) {
   3733			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
   3734			vmcs12->idt_vectoring_error_code =
   3735				vcpu->arch.exception.error_code;
   3736		}
   3737
   3738		vmcs12->idt_vectoring_info_field = idt_vectoring;
   3739	} else if (vcpu->arch.nmi_injected) {
   3740		vmcs12->idt_vectoring_info_field =
   3741			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
   3742	} else if (vcpu->arch.interrupt.injected) {
   3743		nr = vcpu->arch.interrupt.nr;
   3744		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
   3745
   3746		if (vcpu->arch.interrupt.soft) {
   3747			idt_vectoring |= INTR_TYPE_SOFT_INTR;
   3748			vmcs12->vm_entry_instruction_len =
   3749				vcpu->arch.event_exit_inst_len;
   3750		} else
   3751			idt_vectoring |= INTR_TYPE_EXT_INTR;
   3752
   3753		vmcs12->idt_vectoring_info_field = idt_vectoring;
   3754	} else {
   3755		vmcs12->idt_vectoring_info_field = 0;
   3756	}
   3757}
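
/*
 * Encoding example (hypothetical event): a #PF (vector 14) that was being
 * injected with an error code is recorded as idt_vectoring_info_field =
 * 14 | INTR_TYPE_HARD_EXCEPTION | VECTORING_INFO_DELIVER_CODE_MASK |
 * VECTORING_INFO_VALID_MASK = 0x80000b0e, with the error code itself in
 * idt_vectoring_error_code, so L1 sees the interrupted delivery as bare
 * metal would report it.
 */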
   3758
   3759
   3760void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
   3761{
   3762	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   3763	gfn_t gfn;
   3764
   3765	/*
   3766	 * Don't need to mark the APIC access page dirty; it is never
   3767	 * written to by the CPU during APIC virtualization.
   3768	 */
   3769
   3770	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
   3771		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
   3772		kvm_vcpu_mark_page_dirty(vcpu, gfn);
   3773	}
   3774
   3775	if (nested_cpu_has_posted_intr(vmcs12)) {
   3776		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
   3777		kvm_vcpu_mark_page_dirty(vcpu, gfn);
   3778	}
   3779}
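
/*
 * For reference, the gfn derivation above is a plain page-number shift;
 * e.g. a (hypothetical) virtual_apic_page_addr of 0xfee01000 dirties gfn
 * 0xfee01, so that dirty tracking picks up the page the CPU may have
 * written on L2's behalf during APIC virtualization.
 */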
   3780
   3781static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
   3782{
   3783	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3784	int max_irr;
   3785	void *vapic_page;
   3786	u16 status;
   3787
   3788	if (!vmx->nested.pi_pending)
   3789		return 0;
   3790
   3791	if (!vmx->nested.pi_desc)
   3792		goto mmio_needed;
   3793
   3794	vmx->nested.pi_pending = false;
   3795
   3796	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
   3797		return 0;
   3798
   3799	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
   3800	if (max_irr != 256) {
   3801		vapic_page = vmx->nested.virtual_apic_map.hva;
   3802		if (!vapic_page)
   3803			goto mmio_needed;
   3804
   3805		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
   3806			vapic_page, &max_irr);
   3807		status = vmcs_read16(GUEST_INTR_STATUS);
   3808		if ((u8)max_irr > ((u8)status & 0xff)) {
   3809			status &= ~0xff;
   3810			status |= (u8)max_irr;
   3811			vmcs_write16(GUEST_INTR_STATUS, status);
   3812		}
   3813	}
   3814
   3815	nested_mark_vmcs12_pages_dirty(vcpu);
   3816	return 0;
   3817
   3818mmio_needed:
   3819	kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
   3820	return -ENXIO;
   3821}
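
/*
 * Sketch of the update above with hypothetical values: if the posted
 * interrupt descriptor has ON set and only bit 0x90 set in its 256-bit
 * PIR, find_last_bit() yields max_irr = 0x90; the bit is merged into the
 * virtual-APIC page's IRR and, if 0x90 exceeds the current RVI in the low
 * byte of GUEST_INTR_STATUS, RVI is raised to 0x90 so the CPU evaluates
 * the interrupt on the next VM-entry.
 */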
   3822
   3823static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
   3824					       unsigned long exit_qual)
   3825{
   3826	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   3827	unsigned int nr = vcpu->arch.exception.nr;
   3828	u32 intr_info = nr | INTR_INFO_VALID_MASK;
   3829
   3830	if (vcpu->arch.exception.has_error_code) {
   3831		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
   3832		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
   3833	}
   3834
   3835	if (kvm_exception_is_soft(nr))
   3836		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
   3837	else
   3838		intr_info |= INTR_TYPE_HARD_EXCEPTION;
   3839
   3840	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
   3841	    vmx_get_nmi_mask(vcpu))
   3842		intr_info |= INTR_INFO_UNBLOCK_NMI;
   3843
   3844	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
   3845}
   3846
   3847/*
   3848 * Returns true if a debug trap is pending delivery.
   3849 *
   3850 * In KVM, debug traps bear an exception payload. As such, the class of a #DB
   3851 * exception may be inferred from the presence of an exception payload.
   3852 */
   3853static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
   3854{
   3855	return vcpu->arch.exception.pending &&
   3856			vcpu->arch.exception.nr == DB_VECTOR &&
   3857			vcpu->arch.exception.payload;
   3858}
   3859
   3860/*
   3861 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
   3862 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
   3863 * represents these debug traps with a payload that is said to be compatible
   3864 * with the 'pending debug exceptions' field, write the payload to the VMCS
   3865 * field if a VM-exit is delivered before the debug trap.
   3866 */
   3867static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
   3868{
   3869	if (vmx_pending_dbg_trap(vcpu))
   3870		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
   3871			    vcpu->arch.exception.payload);
   3872}
   3873
   3874static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
   3875{
   3876	return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
   3877	       to_vmx(vcpu)->nested.preemption_timer_expired;
   3878}
   3879
   3880static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
   3881{
   3882	struct vcpu_vmx *vmx = to_vmx(vcpu);
   3883	unsigned long exit_qual;
   3884	bool block_nested_events =
   3885	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
   3886	bool mtf_pending = vmx->nested.mtf_pending;
   3887	struct kvm_lapic *apic = vcpu->arch.apic;
   3888
   3889	/*
   3890	 * Clear the MTF state. If a higher priority VM-exit is delivered first,
   3891	 * this state is discarded.
   3892	 */
   3893	if (!block_nested_events)
   3894		vmx->nested.mtf_pending = false;
   3895
   3896	if (lapic_in_kernel(vcpu) &&
   3897		test_bit(KVM_APIC_INIT, &apic->pending_events)) {
   3898		if (block_nested_events)
   3899			return -EBUSY;
   3900		nested_vmx_update_pending_dbg(vcpu);
   3901		clear_bit(KVM_APIC_INIT, &apic->pending_events);
   3902		if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
   3903			nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
   3904		return 0;
   3905	}
   3906
   3907	if (lapic_in_kernel(vcpu) &&
   3908	    test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
   3909		if (block_nested_events)
   3910			return -EBUSY;
   3911
   3912		clear_bit(KVM_APIC_SIPI, &apic->pending_events);
   3913		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
   3914			nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
   3915						apic->sipi_vector & 0xFFUL);
   3916		return 0;
   3917	}
   3918
   3919	/*
   3920	 * Process any exceptions that are not debug traps before MTF.
   3921	 *
   3922	 * Note that only a pending nested run can block a pending exception.
   3923	 * Otherwise an injected NMI/interrupt should either be
   3924	 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
   3925	 * while delivering the pending exception.
   3926	 */
   3927
   3928	if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
   3929		if (vmx->nested.nested_run_pending)
   3930			return -EBUSY;
   3931		if (!nested_vmx_check_exception(vcpu, &exit_qual))
   3932			goto no_vmexit;
   3933		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
   3934		return 0;
   3935	}
   3936
   3937	if (mtf_pending) {
   3938		if (block_nested_events)
   3939			return -EBUSY;
   3940		nested_vmx_update_pending_dbg(vcpu);
   3941		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
   3942		return 0;
   3943	}
   3944
   3945	if (vcpu->arch.exception.pending) {
   3946		if (vmx->nested.nested_run_pending)
   3947			return -EBUSY;
   3948		if (!nested_vmx_check_exception(vcpu, &exit_qual))
   3949			goto no_vmexit;
   3950		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
   3951		return 0;
   3952	}
   3953
   3954	if (nested_vmx_preemption_timer_pending(vcpu)) {
   3955		if (block_nested_events)
   3956			return -EBUSY;
   3957		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
   3958		return 0;
   3959	}
   3960
   3961	if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
   3962		if (block_nested_events)
   3963			return -EBUSY;
   3964		goto no_vmexit;
   3965	}
   3966
   3967	if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
   3968		if (block_nested_events)
   3969			return -EBUSY;
   3970		if (!nested_exit_on_nmi(vcpu))
   3971			goto no_vmexit;
   3972
   3973		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
   3974				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
   3975				  INTR_INFO_VALID_MASK, 0);
   3976		/*
   3977		 * The NMI-triggered VM exit counts as injection:
   3978		 * clear this one and block further NMIs.
   3979		 */
   3980		vcpu->arch.nmi_pending = 0;
   3981		vmx_set_nmi_mask(vcpu, true);
   3982		return 0;
   3983	}
   3984
   3985	if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
   3986		if (block_nested_events)
   3987			return -EBUSY;
   3988		if (!nested_exit_on_intr(vcpu))
   3989			goto no_vmexit;
   3990		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
   3991		return 0;
   3992	}
   3993
   3994no_vmexit:
   3995	return vmx_complete_nested_posted_interrupt(vcpu);
   3996}
   3997
   3998static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
   3999{
   4000	ktime_t remaining =
   4001		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
   4002	u64 value;
   4003
   4004	if (ktime_to_ns(remaining) <= 0)
   4005		return 0;
   4006
   4007	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
   4008	do_div(value, 1000000);
   4009	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
   4010}
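
/*
 * Worked example of the conversion above (hypothetical numbers): with
 * 1 ms (1,000,000 ns) left on the hrtimer and virtual_tsc_khz = 2,000,000
 * (a 2 GHz guest TSC), value = 1,000,000 * 2,000,000 / 1,000,000 =
 * 2,000,000 TSC cycles, and shifting right by
 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE (5) yields 62,500 preemption
 * timer ticks to report back in vmcs12.
 */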
   4011
   4012static bool is_vmcs12_ext_field(unsigned long field)
   4013{
   4014	switch (field) {
   4015	case GUEST_ES_SELECTOR:
   4016	case GUEST_CS_SELECTOR:
   4017	case GUEST_SS_SELECTOR:
   4018	case GUEST_DS_SELECTOR:
   4019	case GUEST_FS_SELECTOR:
   4020	case GUEST_GS_SELECTOR:
   4021	case GUEST_LDTR_SELECTOR:
   4022	case GUEST_TR_SELECTOR:
   4023	case GUEST_ES_LIMIT:
   4024	case GUEST_CS_LIMIT:
   4025	case GUEST_SS_LIMIT:
   4026	case GUEST_DS_LIMIT:
   4027	case GUEST_FS_LIMIT:
   4028	case GUEST_GS_LIMIT:
   4029	case GUEST_LDTR_LIMIT:
   4030	case GUEST_TR_LIMIT:
   4031	case GUEST_GDTR_LIMIT:
   4032	case GUEST_IDTR_LIMIT:
   4033	case GUEST_ES_AR_BYTES:
   4034	case GUEST_DS_AR_BYTES:
   4035	case GUEST_FS_AR_BYTES:
   4036	case GUEST_GS_AR_BYTES:
   4037	case GUEST_LDTR_AR_BYTES:
   4038	case GUEST_TR_AR_BYTES:
   4039	case GUEST_ES_BASE:
   4040	case GUEST_CS_BASE:
   4041	case GUEST_SS_BASE:
   4042	case GUEST_DS_BASE:
   4043	case GUEST_FS_BASE:
   4044	case GUEST_GS_BASE:
   4045	case GUEST_LDTR_BASE:
   4046	case GUEST_TR_BASE:
   4047	case GUEST_GDTR_BASE:
   4048	case GUEST_IDTR_BASE:
   4049	case GUEST_PENDING_DBG_EXCEPTIONS:
   4050	case GUEST_BNDCFGS:
   4051		return true;
   4052	default:
   4053		break;
   4054	}
   4055
   4056	return false;
   4057}
   4058
   4059static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
   4060				       struct vmcs12 *vmcs12)
   4061{
   4062	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4063
   4064	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
   4065	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
   4066	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
   4067	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
   4068	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
   4069	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
   4070	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
   4071	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
   4072	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
   4073	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
   4074	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
   4075	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
   4076	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
   4077	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
   4078	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
   4079	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
   4080	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
   4081	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
   4082	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
   4083	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
   4084	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
   4085	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
   4086	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
   4087	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
   4088	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
   4089	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
   4090	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
   4091	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
   4092	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
   4093	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
   4094	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
   4095	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
   4096	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
   4097	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
   4098	vmcs12->guest_pending_dbg_exceptions =
   4099		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
   4100	if (kvm_mpx_supported())
   4101		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
   4102
   4103	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
   4104}
   4105
   4106static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
   4107				       struct vmcs12 *vmcs12)
   4108{
   4109	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4110	int cpu;
   4111
   4112	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
   4113		return;
   4114
   4115
   4116	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
   4117
   4118	cpu = get_cpu();
   4119	vmx->loaded_vmcs = &vmx->nested.vmcs02;
   4120	vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
   4121
   4122	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
   4123
   4124	vmx->loaded_vmcs = &vmx->vmcs01;
   4125	vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
   4126	put_cpu();
   4127}
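
/*
 * The dance above exists because the "rare" fields can only be read from
 * the hardware VMCS with VMREAD: vmcs02 is temporarily made the loaded
 * VMCS (with preemption disabled via get_cpu()) so the reads hit L2's
 * state, and vmcs01 is reloaded before returning so the caller still
 * observes the VMCS it expects.
 */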
   4128
   4129/*
   4130 * Update the guest state fields of vmcs12 to reflect changes that
   4131 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
   4132 * VM-entry controls is also updated, since this is really a guest
   4133 * state bit.)
   4134 */
   4135static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
   4136{
   4137	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4138
   4139	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
   4140		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
   4141
   4142	vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
   4143		!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
   4144
   4145	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
   4146	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
   4147
   4148	vmcs12->guest_rsp = kvm_rsp_read(vcpu);
   4149	vmcs12->guest_rip = kvm_rip_read(vcpu);
   4150	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
   4151
   4152	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
   4153	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
   4154
   4155	vmcs12->guest_interruptibility_info =
   4156		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
   4157
   4158	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
   4159		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
   4160	else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
   4161		vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
   4162	else
   4163		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
   4164
   4165	if (nested_cpu_has_preemption_timer(vmcs12) &&
   4166	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
   4167	    !vmx->nested.nested_run_pending)
   4168		vmcs12->vmx_preemption_timer_value =
   4169			vmx_get_preemption_timer_value(vcpu);
   4170
   4171	/*
   4172	 * In some cases (usually, nested EPT), L2 is allowed to change its
   4173	 * own CR3 without exiting. If it has changed it, we must keep it.
   4174	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
   4175	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
   4176	 *
    4177	 * Additionally, restore L2's PDPTRs to vmcs12.
   4178	 */
   4179	if (enable_ept) {
   4180		vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
   4181		if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
   4182			vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
   4183			vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
   4184			vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
   4185			vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
   4186		}
   4187	}
   4188
   4189	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
   4190
   4191	if (nested_cpu_has_vid(vmcs12))
   4192		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
   4193
   4194	vmcs12->vm_entry_controls =
   4195		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
   4196		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
   4197
   4198	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
   4199		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
   4200
   4201	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
   4202		vmcs12->guest_ia32_efer = vcpu->arch.efer;
   4203}
   4204
   4205/*
   4206 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
   4207 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
   4208 * and this function updates it to reflect the changes to the guest state while
   4209 * L2 was running (and perhaps made some exits which were handled directly by L0
   4210 * without going back to L1), and to reflect the exit reason.
   4211 * Note that we do not have to copy here all VMCS fields, just those that
   4212 * could have changed by the L2 guest or the exit - i.e., the guest-state and
   4213 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
   4214 * which already writes to vmcs12 directly.
   4215 */
   4216static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
   4217			   u32 vm_exit_reason, u32 exit_intr_info,
   4218			   unsigned long exit_qualification)
   4219{
   4220	/* update exit information fields: */
   4221	vmcs12->vm_exit_reason = vm_exit_reason;
   4222	if (to_vmx(vcpu)->exit_reason.enclave_mode)
   4223		vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
   4224	vmcs12->exit_qualification = exit_qualification;
   4225
   4226	/*
   4227	 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
   4228	 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
   4229	 * exit info fields are unmodified.
   4230	 */
   4231	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
   4232		vmcs12->launch_state = 1;
   4233
   4234		/* vm_entry_intr_info_field is cleared on exit. Emulate this
   4235		 * instead of reading the real value. */
   4236		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
   4237
   4238		/*
    4239		 * Transfer the event that L0 or L1 may have wanted to inject into
   4240		 * L2 to IDT_VECTORING_INFO_FIELD.
   4241		 */
   4242		vmcs12_save_pending_event(vcpu, vmcs12,
   4243					  vm_exit_reason, exit_intr_info);
   4244
   4245		vmcs12->vm_exit_intr_info = exit_intr_info;
   4246		vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
   4247		vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
   4248
   4249		/*
   4250		 * According to spec, there's no need to store the guest's
   4251		 * MSRs if the exit is due to a VM-entry failure that occurs
   4252		 * during or after loading the guest state. Since this exit
   4253		 * does not fall in that category, we need to save the MSRs.
   4254		 */
   4255		if (nested_vmx_store_msr(vcpu,
   4256					 vmcs12->vm_exit_msr_store_addr,
   4257					 vmcs12->vm_exit_msr_store_count))
   4258			nested_vmx_abort(vcpu,
   4259					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
   4260	}
   4261
   4262	/*
   4263	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
   4264	 * preserved above and would only end up incorrectly in L1.
   4265	 */
   4266	vcpu->arch.nmi_injected = false;
   4267	kvm_clear_exception_queue(vcpu);
   4268	kvm_clear_interrupt_queue(vcpu);
   4269}
   4270
   4271/*
    4272 * A part of what we need to do when the nested L2 guest exits and we want to
   4273 * run its L1 parent, is to reset L1's guest state to the host state specified
   4274 * in vmcs12.
   4275 * This function is to be called not only on normal nested exit, but also on
   4276 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
   4277 * Failures During or After Loading Guest State").
   4278 * This function should be called when the active VMCS is L1's (vmcs01).
   4279 */
   4280static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
   4281				   struct vmcs12 *vmcs12)
   4282{
   4283	enum vm_entry_failure_code ignored;
   4284	struct kvm_segment seg;
   4285
   4286	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
   4287		vcpu->arch.efer = vmcs12->host_ia32_efer;
   4288	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
   4289		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
   4290	else
   4291		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
   4292	vmx_set_efer(vcpu, vcpu->arch.efer);
   4293
   4294	kvm_rsp_write(vcpu, vmcs12->host_rsp);
   4295	kvm_rip_write(vcpu, vmcs12->host_rip);
   4296	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
   4297	vmx_set_interrupt_shadow(vcpu, 0);
   4298
   4299	/*
   4300	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
   4301	 * actually changed, because vmx_set_cr0 refers to efer set above.
   4302	 *
   4303	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
   4304	 * (KVM doesn't change it);
   4305	 */
   4306	vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
   4307	vmx_set_cr0(vcpu, vmcs12->host_cr0);
   4308
   4309	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
   4310	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
   4311	vmx_set_cr4(vcpu, vmcs12->host_cr4);
   4312
   4313	nested_ept_uninit_mmu_context(vcpu);
   4314
   4315	/*
    4316	 * Only the PDPTE load can fail, as the value of CR3 was checked on entry and
   4317	 * couldn't have changed.
   4318	 */
   4319	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
   4320		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
   4321
   4322	nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
   4323
   4324	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
   4325	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
   4326	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
   4327	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
   4328	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
   4329	vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
   4330	vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
   4331
   4332	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
   4333	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
   4334		vmcs_write64(GUEST_BNDCFGS, 0);
   4335
   4336	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
   4337		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
   4338		vcpu->arch.pat = vmcs12->host_ia32_pat;
   4339	}
   4340	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
   4341		WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
   4342					 vmcs12->host_ia32_perf_global_ctrl));
   4343
    4344	/* Set L1 segment info according to Intel SDM
    4345	 * 27.5.2 Loading Host Segment and Descriptor-Table Registers */
   4346	seg = (struct kvm_segment) {
   4347		.base = 0,
   4348		.limit = 0xFFFFFFFF,
   4349		.selector = vmcs12->host_cs_selector,
   4350		.type = 11,
   4351		.present = 1,
   4352		.s = 1,
   4353		.g = 1
   4354	};
   4355	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
   4356		seg.l = 1;
   4357	else
   4358		seg.db = 1;
   4359	__vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
   4360	seg = (struct kvm_segment) {
   4361		.base = 0,
   4362		.limit = 0xFFFFFFFF,
   4363		.type = 3,
   4364		.present = 1,
   4365		.s = 1,
   4366		.db = 1,
   4367		.g = 1
   4368	};
   4369	seg.selector = vmcs12->host_ds_selector;
   4370	__vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
   4371	seg.selector = vmcs12->host_es_selector;
   4372	__vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
   4373	seg.selector = vmcs12->host_ss_selector;
   4374	__vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
   4375	seg.selector = vmcs12->host_fs_selector;
   4376	seg.base = vmcs12->host_fs_base;
   4377	__vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
   4378	seg.selector = vmcs12->host_gs_selector;
   4379	seg.base = vmcs12->host_gs_base;
   4380	__vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
   4381	seg = (struct kvm_segment) {
   4382		.base = vmcs12->host_tr_base,
   4383		.limit = 0x67,
   4384		.selector = vmcs12->host_tr_selector,
   4385		.type = 11,
   4386		.present = 1
   4387	};
   4388	__vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
   4389
   4390	memset(&seg, 0, sizeof(seg));
   4391	seg.unusable = 1;
   4392	__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
   4393
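        	/*
        	 * Per the SDM, a VM-exit loads DR7 with 0x400 and clears
        	 * IA32_DEBUGCTL; mirror that architectural behavior for L1 here.
        	 */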
   4394	kvm_set_dr(vcpu, 7, 0x400);
   4395	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
   4396
   4397	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
   4398				vmcs12->vm_exit_msr_load_count))
   4399		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
   4400
   4401	to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
   4402}
   4403
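        /*
         * Recover the EFER value that vmcs01 establishes for L1, used to unwind
         * KVM's software model after a failed nested VM-Entry: prefer the
         * dedicated GUEST_IA32_EFER field; if the "load IA32_EFER" entry control
         * is supported but not used, assume L1's EFER matches host_efer;
         * otherwise fall back to the EFER slot in the MSR autoload list or the
         * user-return MSR cache.
         */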
   4404static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
   4405{
   4406	struct vmx_uret_msr *efer_msr;
   4407	unsigned int i;
   4408
   4409	if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
   4410		return vmcs_read64(GUEST_IA32_EFER);
   4411
   4412	if (cpu_has_load_ia32_efer())
   4413		return host_efer;
   4414
   4415	for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
   4416		if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
   4417			return vmx->msr_autoload.guest.val[i].value;
   4418	}
   4419
   4420	efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
   4421	if (efer_msr)
   4422		return efer_msr->data;
   4423
   4424	return host_efer;
   4425}
   4426
   4427static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
   4428{
   4429	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   4430	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4431	struct vmx_msr_entry g, h;
   4432	gpa_t gpa;
   4433	u32 i, j;
   4434
   4435	vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
   4436
   4437	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
   4438		/*
   4439		 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
   4440		 * as vmcs01.GUEST_DR7 contains a userspace defined value
   4441		 * and vcpu->arch.dr7 is not squirreled away before the
   4442		 * nested VMENTER (not worth adding a variable in nested_vmx).
   4443		 */
   4444		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
   4445			kvm_set_dr(vcpu, 7, DR7_FIXED_1);
   4446		else
   4447			WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
   4448	}
   4449
   4450	/*
   4451	 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
   4452	 * handle a variety of side effects to KVM's software model.
   4453	 */
   4454	vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
   4455
   4456	vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
   4457	vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
   4458
   4459	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
   4460	vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
   4461
   4462	nested_ept_uninit_mmu_context(vcpu);
   4463	vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
   4464	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
   4465
   4466	/*
   4467	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
   4468	 * from vmcs01 (if necessary).  The PDPTRs are not loaded on
   4469	 * VMFail, like everything else we just need to ensure our
   4470	 * software model is up-to-date.
   4471	 */
   4472	if (enable_ept && is_pae_paging(vcpu))
   4473		ept_save_pdptrs(vcpu);
   4474
   4475	kvm_mmu_reset_context(vcpu);
   4476
   4477	/*
   4478	 * This nasty bit of open coding is a compromise between blindly
   4479	 * loading L1's MSRs using the exit load lists (incorrect emulation
   4480	 * of VMFail), leaving the nested VM's MSRs in the software model
    4481	 * (incorrect behavior), and snapshotting the modified MSRs (too
    4482	 * expensive since the lists are not bounded by hardware).  For each
   4483	 * MSR that was (prematurely) loaded from the nested VMEntry load
   4484	 * list, reload it from the exit load list if it exists and differs
   4485	 * from the guest value.  The intent is to stuff host state as
   4486	 * silently as possible, not to fully process the exit load list.
   4487	 */
   4488	for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
   4489		gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
   4490		if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
   4491			pr_debug_ratelimited(
   4492				"%s read MSR index failed (%u, 0x%08llx)\n",
   4493				__func__, i, gpa);
   4494			goto vmabort;
   4495		}
   4496
   4497		for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
   4498			gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
   4499			if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
   4500				pr_debug_ratelimited(
   4501					"%s read MSR failed (%u, 0x%08llx)\n",
   4502					__func__, j, gpa);
   4503				goto vmabort;
   4504			}
   4505			if (h.index != g.index)
   4506				continue;
   4507			if (h.value == g.value)
   4508				break;
   4509
   4510			if (nested_vmx_load_msr_check(vcpu, &h)) {
   4511				pr_debug_ratelimited(
   4512					"%s check failed (%u, 0x%x, 0x%x)\n",
   4513					__func__, j, h.index, h.reserved);
   4514				goto vmabort;
   4515			}
   4516
   4517			if (kvm_set_msr(vcpu, h.index, h.value)) {
   4518				pr_debug_ratelimited(
   4519					"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
   4520					__func__, j, h.index, h.value);
   4521				goto vmabort;
   4522			}
   4523		}
   4524	}
   4525
   4526	return;
   4527
   4528vmabort:
   4529	nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
   4530}
   4531
   4532/*
   4533 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
   4534 * and modify vmcs12 to make it see what it would expect to see there if
   4535 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
   4536 */
   4537void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
   4538		       u32 exit_intr_info, unsigned long exit_qualification)
   4539{
   4540	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4541	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   4542
   4543	/* trying to cancel vmlaunch/vmresume is a bug */
   4544	WARN_ON_ONCE(vmx->nested.nested_run_pending);
   4545
   4546	if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
   4547		/*
    4548		 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map the
    4549		 * Enlightened VMCS after migration and we still need to
    4550		 * do that when something is forcing an L2->L1 exit prior
    4551		 * to the first L2 run.
   4552		 */
   4553		(void)nested_get_evmcs_page(vcpu);
   4554	}
   4555
   4556	/* Service pending TLB flush requests for L2 before switching to L1. */
   4557	kvm_service_local_tlb_flush_requests(vcpu);
   4558
   4559	/*
   4560	 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
   4561	 * now and the new vmentry.  Ensure that the VMCS02 PDPTR fields are
   4562	 * up-to-date before switching to L1.
   4563	 */
   4564	if (enable_ept && is_pae_paging(vcpu))
   4565		vmx_ept_load_pdptrs(vcpu);
   4566
   4567	leave_guest_mode(vcpu);
   4568
   4569	if (nested_cpu_has_preemption_timer(vmcs12))
   4570		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
   4571
   4572	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
   4573		vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
   4574		if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
   4575			vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
   4576	}
   4577
   4578	if (likely(!vmx->fail)) {
   4579		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
   4580
   4581		if (vm_exit_reason != -1)
   4582			prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
   4583				       exit_intr_info, exit_qualification);
   4584
   4585		/*
    4586		 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
    4587		 * also be used to capture the vmcs12 cache as part of
    4588		 * capturing nVMX state for snapshot (migration).
    4589		 *
    4590		 * Otherwise, this flush would dirty guest memory at a
    4591		 * point where user-space already assumes it to be
    4592		 * immutable.
   4593		 */
   4594		nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
   4595	} else {
   4596		/*
   4597		 * The only expected VM-instruction error is "VM entry with
   4598		 * invalid control field(s)." Anything else indicates a
   4599		 * problem with L0.  And we should never get here with a
   4600		 * VMFail of any type if early consistency checks are enabled.
   4601		 */
   4602		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
   4603			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
   4604		WARN_ON_ONCE(nested_early_check);
   4605	}
   4606
   4607	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
   4608
   4609	/* Update any VMCS fields that might have changed while L2 ran */
   4610	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
   4611	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
   4612	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
   4613	if (kvm_has_tsc_control)
   4614		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
   4615
   4616	if (vmx->nested.l1_tpr_threshold != -1)
   4617		vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
   4618
   4619	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
   4620		vmx->nested.change_vmcs01_virtual_apic_mode = false;
   4621		vmx_set_virtual_apic_mode(vcpu);
   4622	}
   4623
   4624	if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
   4625		vmx->nested.update_vmcs01_cpu_dirty_logging = false;
   4626		vmx_update_cpu_dirty_logging(vcpu);
   4627	}
   4628
   4629	/* Unpin physical memory we referred to in vmcs02 */
   4630	if (vmx->nested.apic_access_page) {
   4631		kvm_release_page_clean(vmx->nested.apic_access_page);
   4632		vmx->nested.apic_access_page = NULL;
   4633	}
   4634	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
   4635	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
   4636	vmx->nested.pi_desc = NULL;
   4637
   4638	if (vmx->nested.reload_vmcs01_apic_access_page) {
   4639		vmx->nested.reload_vmcs01_apic_access_page = false;
   4640		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
   4641	}
   4642
   4643	if (vmx->nested.update_vmcs01_apicv_status) {
   4644		vmx->nested.update_vmcs01_apicv_status = false;
   4645		kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
   4646	}
   4647
   4648	if ((vm_exit_reason != -1) &&
   4649	    (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
   4650		vmx->nested.need_vmcs12_to_shadow_sync = true;
   4651
   4652	/* in case we halted in L2 */
   4653	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
   4654
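        	/*
        	 * On a successful exit, acknowledge the interrupt on L1's behalf
        	 * if L1 requested it via the "acknowledge interrupt on exit"
        	 * VM-exit control, then load L1's host state from vmcs12.
        	 */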
   4655	if (likely(!vmx->fail)) {
   4656		if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
   4657		    nested_exit_intr_ack_set(vcpu)) {
   4658			int irq = kvm_cpu_get_interrupt(vcpu);
   4659			WARN_ON(irq < 0);
   4660			vmcs12->vm_exit_intr_info = irq |
   4661				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
   4662		}
   4663
   4664		if (vm_exit_reason != -1)
   4665			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
   4666						       vmcs12->exit_qualification,
   4667						       vmcs12->idt_vectoring_info_field,
   4668						       vmcs12->vm_exit_intr_info,
   4669						       vmcs12->vm_exit_intr_error_code,
   4670						       KVM_ISA_VMX);
   4671
   4672		load_vmcs12_host_state(vcpu, vmcs12);
   4673
   4674		return;
   4675	}
   4676
   4677	/*
   4678	 * After an early L2 VM-entry failure, we're now back
   4679	 * in L1 which thinks it just finished a VMLAUNCH or
   4680	 * VMRESUME instruction, so we need to set the failure
   4681	 * flag and the VM-instruction error field of the VMCS
   4682	 * accordingly, and skip the emulated instruction.
   4683	 */
   4684	(void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
   4685
   4686	/*
   4687	 * Restore L1's host state to KVM's software model.  We're here
   4688	 * because a consistency check was caught by hardware, which
   4689	 * means some amount of guest state has been propagated to KVM's
   4690	 * model and needs to be unwound to the host's state.
   4691	 */
   4692	nested_vmx_restore_host_state(vcpu);
   4693
   4694	vmx->fail = 0;
   4695}
   4696
   4697static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
   4698{
   4699	nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
   4700}
   4701
   4702/*
   4703 * Decode the memory-address operand of a vmx instruction, as recorded on an
   4704 * exit caused by such an instruction (run by a guest hypervisor).
    4705 * On success, returns 0. When the operand is invalid, returns 1 and queues
    4706 * a #UD, #GP, or #SS exception for the guest.
   4707 */
   4708int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
   4709			u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
   4710{
   4711	gva_t off;
   4712	bool exn;
   4713	struct kvm_segment s;
   4714
   4715	/*
   4716	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
   4717	 * Execution", on an exit, vmx_instruction_info holds most of the
   4718	 * addressing components of the operand. Only the displacement part
   4719	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
   4720	 * For how an actual address is calculated from all these components,
   4721	 * refer to Vol. 1, "Operand Addressing".
   4722	 */
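        	/*
        	 * Bit layout of vmx_instruction_info as decoded below:
        	 *   [1:0]   scaling            [9:7]   address size
        	 *   [10]    register operand    [17:15] segment register
        	 *   [21:18] index register      [22]    index register invalid
        	 *   [26:23] base register       [27]    base register invalid
        	 */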
   4723	int  scaling = vmx_instruction_info & 3;
   4724	int  addr_size = (vmx_instruction_info >> 7) & 7;
   4725	bool is_reg = vmx_instruction_info & (1u << 10);
   4726	int  seg_reg = (vmx_instruction_info >> 15) & 7;
   4727	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
   4728	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
   4729	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
   4730	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
   4731
   4732	if (is_reg) {
   4733		kvm_queue_exception(vcpu, UD_VECTOR);
   4734		return 1;
   4735	}
   4736
   4737	/* Addr = segment_base + offset */
   4738	/* offset = base + [index * scale] + displacement */
   4739	off = exit_qualification; /* holds the displacement */
   4740	if (addr_size == 1)
   4741		off = (gva_t)sign_extend64(off, 31);
   4742	else if (addr_size == 0)
   4743		off = (gva_t)sign_extend64(off, 15);
   4744	if (base_is_valid)
   4745		off += kvm_register_read(vcpu, base_reg);
   4746	if (index_is_valid)
   4747		off += kvm_register_read(vcpu, index_reg) << scaling;
   4748	vmx_get_segment(vcpu, &s, seg_reg);
   4749
   4750	/*
   4751	 * The effective address, i.e. @off, of a memory operand is truncated
   4752	 * based on the address size of the instruction.  Note that this is
   4753	 * the *effective address*, i.e. the address prior to accounting for
   4754	 * the segment's base.
   4755	 */
   4756	if (addr_size == 1) /* 32 bit */
   4757		off &= 0xffffffff;
   4758	else if (addr_size == 0) /* 16 bit */
   4759		off &= 0xffff;
   4760
   4761	/* Checks for #GP/#SS exceptions. */
   4762	exn = false;
   4763	if (is_long_mode(vcpu)) {
   4764		/*
   4765		 * The virtual/linear address is never truncated in 64-bit
   4766		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
   4767		 * address when using FS/GS with a non-zero base.
   4768		 */
   4769		if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
   4770			*ret = s.base + off;
   4771		else
   4772			*ret = off;
   4773
   4774		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
   4775		 * non-canonical form. This is the only check on the memory
   4776		 * destination for long mode!
   4777		 */
   4778		exn = is_noncanonical_address(*ret, vcpu);
   4779	} else {
   4780		/*
   4781		 * When not in long mode, the virtual/linear address is
   4782		 * unconditionally truncated to 32 bits regardless of the
   4783		 * address size.
   4784		 */
   4785		*ret = (s.base + off) & 0xffffffff;
   4786
   4787		/* Protected mode: apply checks for segment validity in the
   4788		 * following order:
   4789		 * - segment type check (#GP(0) may be thrown)
   4790		 * - usability check (#GP(0)/#SS(0))
   4791		 * - limit check (#GP(0)/#SS(0))
   4792		 */
   4793		if (wr)
   4794			/* #GP(0) if the destination operand is located in a
   4795			 * read-only data segment or any code segment.
   4796			 */
   4797			exn = ((s.type & 0xa) == 0 || (s.type & 8));
   4798		else
   4799			/* #GP(0) if the source operand is located in an
   4800			 * execute-only code segment
   4801			 */
   4802			exn = ((s.type & 0xa) == 8);
   4803		if (exn) {
   4804			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
   4805			return 1;
   4806		}
   4807		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
   4808		 */
   4809		exn = (s.unusable != 0);
   4810
   4811		/*
   4812		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
   4813		 * outside the segment limit.  All CPUs that support VMX ignore
   4814		 * limit checks for flat segments, i.e. segments with base==0,
   4815		 * limit==0xffffffff and of type expand-up data or code.
   4816		 */
   4817		if (!(s.base == 0 && s.limit == 0xffffffff &&
   4818		     ((s.type & 8) || !(s.type & 4))))
   4819			exn = exn || ((u64)off + len - 1 > s.limit);
   4820	}
   4821	if (exn) {
   4822		kvm_queue_exception_e(vcpu,
   4823				      seg_reg == VCPU_SREG_SS ?
   4824						SS_VECTOR : GP_VECTOR,
   4825				      0);
   4826		return 1;
   4827	}
   4828
   4829	return 0;
   4830}
   4831
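        /*
         * Advertise or hide the "load IA32_PERF_GLOBAL_CTRL" VM-entry/VM-exit
         * controls to L1 depending on whether the vCPU's PMU actually exposes
         * IA32_PERF_GLOBAL_CTRL.
         */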
   4832void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
   4833			    bool vcpu_has_perf_global_ctrl)
   4834{
   4835	struct vcpu_vmx *vmx;
   4836
   4837	if (!nested_vmx_allowed(vcpu))
   4838		return;
   4839
   4840	vmx = to_vmx(vcpu);
   4841	if (vcpu_has_perf_global_ctrl) {
   4842		vmx->nested.msrs.entry_ctls_high |=
   4843				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
   4844		vmx->nested.msrs.exit_ctls_high |=
   4845				VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
   4846	} else {
   4847		vmx->nested.msrs.entry_ctls_high &=
   4848				~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
   4849		vmx->nested.msrs.exit_ctls_high &=
   4850				~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
   4851	}
   4852}
   4853
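        /*
         * Read the 64-bit VMCS/VMXON pointer operand of the current instruction
         * from guest memory.  Returns 0 on success; on failure an exception has
         * been queued (or the memory failure handled) and *ret holds the value
         * the exit handler should return.
         */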
   4854static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
   4855				int *ret)
   4856{
   4857	gva_t gva;
   4858	struct x86_exception e;
   4859	int r;
   4860
   4861	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
   4862				vmcs_read32(VMX_INSTRUCTION_INFO), false,
   4863				sizeof(*vmpointer), &gva)) {
   4864		*ret = 1;
   4865		return -EINVAL;
   4866	}
   4867
   4868	r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
   4869	if (r != X86EMUL_CONTINUE) {
   4870		*ret = kvm_handle_memory_failure(vcpu, r, &e);
   4871		return -EINVAL;
   4872	}
   4873
   4874	return 0;
   4875}
   4876
   4877/*
   4878 * Allocate a shadow VMCS and associate it with the currently loaded
   4879 * VMCS, unless such a shadow VMCS already exists. The newly allocated
   4880 * VMCS is also VMCLEARed, so that it is ready for use.
   4881 */
   4882static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
   4883{
   4884	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4885	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
   4886
   4887	/*
   4888	 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
   4889	 * when L1 executes VMXOFF or the vCPU is forced out of nested
   4890	 * operation.  VMXON faults if the CPU is already post-VMXON, so it
   4891	 * should be impossible to already have an allocated shadow VMCS.  KVM
   4892	 * doesn't support virtualization of VMCS shadowing, so vmcs01 should
   4893	 * always be the loaded VMCS.
   4894	 */
   4895	if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
   4896		return loaded_vmcs->shadow_vmcs;
   4897
   4898	loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
   4899	if (loaded_vmcs->shadow_vmcs)
   4900		vmcs_clear(loaded_vmcs->shadow_vmcs);
   4901
   4902	return loaded_vmcs->shadow_vmcs;
   4903}
   4904
   4905static int enter_vmx_operation(struct kvm_vcpu *vcpu)
   4906{
   4907	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4908	int r;
   4909
   4910	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
   4911	if (r < 0)
   4912		goto out_vmcs02;
   4913
   4914	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
   4915	if (!vmx->nested.cached_vmcs12)
   4916		goto out_cached_vmcs12;
   4917
   4918	vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
   4919	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
   4920	if (!vmx->nested.cached_shadow_vmcs12)
   4921		goto out_cached_shadow_vmcs12;
   4922
   4923	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
   4924		goto out_shadow_vmcs;
   4925
   4926	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
   4927		     HRTIMER_MODE_ABS_PINNED);
   4928	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
   4929
   4930	vmx->nested.vpid02 = allocate_vpid();
   4931
   4932	vmx->nested.vmcs02_initialized = false;
   4933	vmx->nested.vmxon = true;
   4934
   4935	if (vmx_pt_mode_is_host_guest()) {
   4936		vmx->pt_desc.guest.ctl = 0;
   4937		pt_update_intercept_for_msr(vcpu);
   4938	}
   4939
   4940	return 0;
   4941
   4942out_shadow_vmcs:
   4943	kfree(vmx->nested.cached_shadow_vmcs12);
   4944
   4945out_cached_shadow_vmcs12:
   4946	kfree(vmx->nested.cached_vmcs12);
   4947
   4948out_cached_vmcs12:
   4949	free_loaded_vmcs(&vmx->nested.vmcs02);
   4950
   4951out_vmcs02:
   4952	return -ENOMEM;
   4953}
   4954
   4955/* Emulate the VMXON instruction. */
   4956static int handle_vmon(struct kvm_vcpu *vcpu)
   4957{
   4958	int ret;
   4959	gpa_t vmptr;
   4960	uint32_t revision;
   4961	struct vcpu_vmx *vmx = to_vmx(vcpu);
   4962	const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
   4963		| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
   4964
   4965	/*
   4966	 * The Intel VMX Instruction Reference lists a bunch of bits that are
   4967	 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
   4968	 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
   4969	 * Otherwise, we should fail with #UD.  But most faulting conditions
   4970	 * have already been checked by hardware, prior to the VM-exit for
   4971	 * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
   4972	 * that bit set to 1 in non-root mode.
   4973	 */
   4974	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
   4975		kvm_queue_exception(vcpu, UD_VECTOR);
   4976		return 1;
   4977	}
   4978
   4979	/* CPL=0 must be checked manually. */
   4980	if (vmx_get_cpl(vcpu)) {
   4981		kvm_inject_gp(vcpu, 0);
   4982		return 1;
   4983	}
   4984
   4985	if (vmx->nested.vmxon)
   4986		return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
   4987
   4988	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
   4989			!= VMXON_NEEDED_FEATURES) {
   4990		kvm_inject_gp(vcpu, 0);
   4991		return 1;
   4992	}
   4993
   4994	if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
   4995		return ret;
   4996
   4997	/*
   4998	 * SDM 3: 24.11.5
   4999	 * The first 4 bytes of VMXON region contain the supported
   5000	 * VMCS revision identifier
   5001	 *
    5002	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
    5003	 * setting it would limit the physical address width to 32 bits.
   5004	 */
   5005	if (!page_address_valid(vcpu, vmptr))
   5006		return nested_vmx_failInvalid(vcpu);
   5007
   5008	if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
   5009	    revision != VMCS12_REVISION)
   5010		return nested_vmx_failInvalid(vcpu);
   5011
   5012	vmx->nested.vmxon_ptr = vmptr;
   5013	ret = enter_vmx_operation(vcpu);
   5014	if (ret)
   5015		return ret;
   5016
   5017	return nested_vmx_succeed(vcpu);
   5018}
   5019
   5020static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
   5021{
   5022	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5023
   5024	if (vmx->nested.current_vmptr == INVALID_GPA)
   5025		return;
   5026
   5027	copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
   5028
   5029	if (enable_shadow_vmcs) {
   5030		/* copy to memory all shadowed fields in case
   5031		   they were modified */
   5032		copy_shadow_to_vmcs12(vmx);
   5033		vmx_disable_shadow_vmcs(vmx);
   5034	}
   5035	vmx->nested.posted_intr_nv = -1;
   5036
   5037	/* Flush VMCS12 to guest memory */
   5038	kvm_vcpu_write_guest_page(vcpu,
   5039				  vmx->nested.current_vmptr >> PAGE_SHIFT,
   5040				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
   5041
   5042	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
   5043
   5044	vmx->nested.current_vmptr = INVALID_GPA;
   5045}
   5046
   5047/* Emulate the VMXOFF instruction */
   5048static int handle_vmoff(struct kvm_vcpu *vcpu)
   5049{
   5050	if (!nested_vmx_check_permission(vcpu))
   5051		return 1;
   5052
   5053	free_nested(vcpu);
   5054
    5055	/* Process an INIT that was latched while the CPU was in VMX operation */
   5056	kvm_make_request(KVM_REQ_EVENT, vcpu);
   5057
   5058	return nested_vmx_succeed(vcpu);
   5059}
   5060
   5061/* Emulate the VMCLEAR instruction */
   5062static int handle_vmclear(struct kvm_vcpu *vcpu)
   5063{
   5064	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5065	u32 zero = 0;
   5066	gpa_t vmptr;
   5067	u64 evmcs_gpa;
   5068	int r;
   5069
   5070	if (!nested_vmx_check_permission(vcpu))
   5071		return 1;
   5072
   5073	if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
   5074		return r;
   5075
   5076	if (!page_address_valid(vcpu, vmptr))
   5077		return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
   5078
   5079	if (vmptr == vmx->nested.vmxon_ptr)
   5080		return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
   5081
   5082	/*
    5083	 * When Enlightened VMEntry is enabled on the calling CPU we treat the
    5084	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's no
    5085	 * good way to distinguish it from a VMCS12) and we must not corrupt it by
   5086	 * writing to the non-existent 'launch_state' field. The area doesn't
   5087	 * have to be the currently active EVMCS on the calling CPU and there's
   5088	 * nothing KVM has to do to transition it from 'active' to 'non-active'
   5089	 * state. It is possible that the area will stay mapped as
   5090	 * vmx->nested.hv_evmcs but this shouldn't be a problem.
   5091	 */
   5092	if (likely(!vmx->nested.enlightened_vmcs_enabled ||
   5093		   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
   5094		if (vmptr == vmx->nested.current_vmptr)
   5095			nested_release_vmcs12(vcpu);
   5096
   5097		kvm_vcpu_write_guest(vcpu,
   5098				     vmptr + offsetof(struct vmcs12,
   5099						      launch_state),
   5100				     &zero, sizeof(zero));
   5101	} else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
   5102		nested_release_evmcs(vcpu);
   5103	}
   5104
   5105	return nested_vmx_succeed(vcpu);
   5106}
   5107
   5108/* Emulate the VMLAUNCH instruction */
   5109static int handle_vmlaunch(struct kvm_vcpu *vcpu)
   5110{
   5111	return nested_vmx_run(vcpu, true);
   5112}
   5113
   5114/* Emulate the VMRESUME instruction */
   5115static int handle_vmresume(struct kvm_vcpu *vcpu)
   5116{
   5117
   5118	return nested_vmx_run(vcpu, false);
   5119}
   5120
   5121static int handle_vmread(struct kvm_vcpu *vcpu)
   5122{
   5123	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
   5124						    : get_vmcs12(vcpu);
   5125	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
   5126	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
   5127	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5128	struct x86_exception e;
   5129	unsigned long field;
   5130	u64 value;
   5131	gva_t gva = 0;
   5132	short offset;
   5133	int len, r;
   5134
   5135	if (!nested_vmx_check_permission(vcpu))
   5136		return 1;
   5137
   5138	/* Decode instruction info and find the field to read */
   5139	field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
   5140
   5141	if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
   5142		/*
   5143		 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
   5144		 * any VMREAD sets the ALU flags for VMfailInvalid.
   5145		 */
   5146		if (vmx->nested.current_vmptr == INVALID_GPA ||
   5147		    (is_guest_mode(vcpu) &&
   5148		     get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
   5149			return nested_vmx_failInvalid(vcpu);
   5150
   5151		offset = get_vmcs12_field_offset(field);
   5152		if (offset < 0)
   5153			return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
   5154
   5155		if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
   5156			copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
   5157
   5158		/* Read the field, zero-extended to a u64 value */
   5159		value = vmcs12_read_any(vmcs12, field, offset);
   5160	} else {
   5161		/*
   5162		 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
   5163		 * enlightened VMCS is active VMREAD/VMWRITE instructions are
   5164		 * unsupported. Unfortunately, certain versions of Windows 11
   5165		 * don't comply with this requirement which is not enforced in
   5166		 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
   5167		 * workaround, as misbehaving guests will panic on VM-Fail.
   5168		 * Note, enlightened VMCS is incompatible with shadow VMCS so
   5169		 * all VMREADs from L2 should go to L1.
   5170		 */
   5171		if (WARN_ON_ONCE(is_guest_mode(vcpu)))
   5172			return nested_vmx_failInvalid(vcpu);
   5173
   5174		offset = evmcs_field_offset(field, NULL);
   5175		if (offset < 0)
   5176			return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
   5177
   5178		/* Read the field, zero-extended to a u64 value */
   5179		value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
   5180	}
   5181
   5182	/*
   5183	 * Now copy part of this value to register or memory, as requested.
   5184	 * Note that the number of bits actually copied is 32 or 64 depending
   5185	 * on the guest's mode (32 or 64 bit), not on the given field's length.
   5186	 */
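        	/* Bit 10 of the instruction info selects a register (1) vs. memory (0) destination. */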
   5187	if (instr_info & BIT(10)) {
   5188		kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
   5189	} else {
   5190		len = is_64_bit_mode(vcpu) ? 8 : 4;
   5191		if (get_vmx_mem_address(vcpu, exit_qualification,
   5192					instr_info, true, len, &gva))
   5193			return 1;
   5194		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
   5195		r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
   5196		if (r != X86EMUL_CONTINUE)
   5197			return kvm_handle_memory_failure(vcpu, r, &e);
   5198	}
   5199
   5200	return nested_vmx_succeed(vcpu);
   5201}
   5202
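        /*
         * The SHADOW_FIELD_RW/SHADOW_FIELD_RO x-macros below expand
         * vmcs_shadow_fields.h into one case label per shadowed field, so these
         * lookups are simple switches over the shadowed field encodings.
         */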
   5203static bool is_shadow_field_rw(unsigned long field)
   5204{
   5205	switch (field) {
   5206#define SHADOW_FIELD_RW(x, y) case x:
   5207#include "vmcs_shadow_fields.h"
   5208		return true;
   5209	default:
   5210		break;
   5211	}
   5212	return false;
   5213}
   5214
   5215static bool is_shadow_field_ro(unsigned long field)
   5216{
   5217	switch (field) {
   5218#define SHADOW_FIELD_RO(x, y) case x:
   5219#include "vmcs_shadow_fields.h"
   5220		return true;
   5221	default:
   5222		break;
   5223	}
   5224	return false;
   5225}
   5226
   5227static int handle_vmwrite(struct kvm_vcpu *vcpu)
   5228{
   5229	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
   5230						    : get_vmcs12(vcpu);
   5231	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
   5232	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
   5233	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5234	struct x86_exception e;
   5235	unsigned long field;
   5236	short offset;
   5237	gva_t gva;
   5238	int len, r;
   5239
   5240	/*
   5241	 * The value to write might be 32 or 64 bits, depending on L1's long
   5242	 * mode, and eventually we need to write that into a field of several
   5243	 * possible lengths. The code below first zero-extends the value to 64
   5244	 * bit (value), and then copies only the appropriate number of
   5245	 * bits into the vmcs12 field.
   5246	 */
   5247	u64 value = 0;
   5248
   5249	if (!nested_vmx_check_permission(vcpu))
   5250		return 1;
   5251
   5252	/*
   5253	 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
   5254	 * any VMWRITE sets the ALU flags for VMfailInvalid.
   5255	 */
   5256	if (vmx->nested.current_vmptr == INVALID_GPA ||
   5257	    (is_guest_mode(vcpu) &&
   5258	     get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
   5259		return nested_vmx_failInvalid(vcpu);
   5260
   5261	if (instr_info & BIT(10))
   5262		value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
   5263	else {
   5264		len = is_64_bit_mode(vcpu) ? 8 : 4;
   5265		if (get_vmx_mem_address(vcpu, exit_qualification,
   5266					instr_info, false, len, &gva))
   5267			return 1;
   5268		r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
   5269		if (r != X86EMUL_CONTINUE)
   5270			return kvm_handle_memory_failure(vcpu, r, &e);
   5271	}
   5272
   5273	field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
   5274
   5275	offset = get_vmcs12_field_offset(field);
   5276	if (offset < 0)
   5277		return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
   5278
   5279	/*
   5280	 * If the vCPU supports "VMWRITE to any supported field in the
   5281	 * VMCS," then the "read-only" fields are actually read/write.
   5282	 */
   5283	if (vmcs_field_readonly(field) &&
   5284	    !nested_cpu_has_vmwrite_any_field(vcpu))
   5285		return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
   5286
   5287	/*
   5288	 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
    5289	 * vmcs12, else we may clobber a field or consume a stale value.
   5290	 */
   5291	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
   5292		copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
   5293
   5294	/*
   5295	 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
   5296	 * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
   5297	 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
   5298	 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
   5299	 * from L1 will return a different value than VMREAD from L2 (L1 sees
   5300	 * the stripped down value, L2 sees the full value as stored by KVM).
   5301	 */
   5302	if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
   5303		value &= 0x1f0ff;
   5304
   5305	vmcs12_write_any(vmcs12, field, offset, value);
   5306
   5307	/*
   5308	 * Do not track vmcs12 dirty-state if in guest-mode as we actually
   5309	 * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
   5310	 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
   5311	 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
   5312	 */
   5313	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
   5314		/*
   5315		 * L1 can read these fields without exiting, ensure the
   5316		 * shadow VMCS is up-to-date.
   5317		 */
   5318		if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
   5319			preempt_disable();
   5320			vmcs_load(vmx->vmcs01.shadow_vmcs);
   5321
   5322			__vmcs_writel(field, value);
   5323
   5324			vmcs_clear(vmx->vmcs01.shadow_vmcs);
   5325			vmcs_load(vmx->loaded_vmcs->vmcs);
   5326			preempt_enable();
   5327		}
   5328		vmx->nested.dirty_vmcs12 = true;
   5329	}
   5330
   5331	return nested_vmx_succeed(vcpu);
   5332}
   5333
   5334static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
   5335{
   5336	vmx->nested.current_vmptr = vmptr;
   5337	if (enable_shadow_vmcs) {
   5338		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
   5339		vmcs_write64(VMCS_LINK_POINTER,
   5340			     __pa(vmx->vmcs01.shadow_vmcs));
   5341		vmx->nested.need_vmcs12_to_shadow_sync = true;
   5342	}
   5343	vmx->nested.dirty_vmcs12 = true;
   5344	vmx->nested.force_msr_bitmap_recalc = true;
   5345}
   5346
   5347/* Emulate the VMPTRLD instruction */
   5348static int handle_vmptrld(struct kvm_vcpu *vcpu)
   5349{
   5350	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5351	gpa_t vmptr;
   5352	int r;
   5353
   5354	if (!nested_vmx_check_permission(vcpu))
   5355		return 1;
   5356
   5357	if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
   5358		return r;
   5359
   5360	if (!page_address_valid(vcpu, vmptr))
   5361		return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
   5362
   5363	if (vmptr == vmx->nested.vmxon_ptr)
   5364		return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
   5365
   5366	/* Forbid normal VMPTRLD if Enlightened version was used */
   5367	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
   5368		return 1;
   5369
   5370	if (vmx->nested.current_vmptr != vmptr) {
   5371		struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
   5372		struct vmcs_hdr hdr;
   5373
   5374		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
   5375			/*
   5376			 * Reads from an unbacked page return all 1s,
   5377			 * which means that the 32 bits located at the
   5378			 * given physical address won't match the required
   5379			 * VMCS12_REVISION identifier.
   5380			 */
   5381			return nested_vmx_fail(vcpu,
   5382				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
   5383		}
   5384
   5385		if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
   5386						 offsetof(struct vmcs12, hdr),
   5387						 sizeof(hdr))) {
   5388			return nested_vmx_fail(vcpu,
   5389				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
   5390		}
   5391
   5392		if (hdr.revision_id != VMCS12_REVISION ||
   5393		    (hdr.shadow_vmcs &&
   5394		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
   5395			return nested_vmx_fail(vcpu,
   5396				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
   5397		}
   5398
   5399		nested_release_vmcs12(vcpu);
   5400
   5401		/*
   5402		 * Load VMCS12 from guest memory since it is not already
   5403		 * cached.
   5404		 */
   5405		if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
   5406					  VMCS12_SIZE)) {
   5407			return nested_vmx_fail(vcpu,
   5408				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
   5409		}
   5410
   5411		set_current_vmptr(vmx, vmptr);
   5412	}
   5413
   5414	return nested_vmx_succeed(vcpu);
   5415}
   5416
   5417/* Emulate the VMPTRST instruction */
   5418static int handle_vmptrst(struct kvm_vcpu *vcpu)
   5419{
   5420	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
   5421	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
   5422	gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
   5423	struct x86_exception e;
   5424	gva_t gva;
   5425	int r;
   5426
   5427	if (!nested_vmx_check_permission(vcpu))
   5428		return 1;
   5429
   5430	if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
   5431		return 1;
   5432
   5433	if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
   5434				true, sizeof(gpa_t), &gva))
   5435		return 1;
   5436	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
   5437	r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
   5438					sizeof(gpa_t), &e);
   5439	if (r != X86EMUL_CONTINUE)
   5440		return kvm_handle_memory_failure(vcpu, r, &e);
   5441
   5442	return nested_vmx_succeed(vcpu);
   5443}
   5444
   5445/* Emulate the INVEPT instruction */
   5446static int handle_invept(struct kvm_vcpu *vcpu)
   5447{
   5448	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5449	u32 vmx_instruction_info, types;
   5450	unsigned long type, roots_to_free;
   5451	struct kvm_mmu *mmu;
   5452	gva_t gva;
   5453	struct x86_exception e;
   5454	struct {
   5455		u64 eptp, gpa;
   5456	} operand;
   5457	int i, r, gpr_index;
   5458
   5459	if (!(vmx->nested.msrs.secondary_ctls_high &
   5460	      SECONDARY_EXEC_ENABLE_EPT) ||
   5461	    !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
   5462		kvm_queue_exception(vcpu, UD_VECTOR);
   5463		return 1;
   5464	}
   5465
   5466	if (!nested_vmx_check_permission(vcpu))
   5467		return 1;
   5468
   5469	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
   5470	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
   5471	type = kvm_register_read(vcpu, gpr_index);
   5472
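        	/*
        	 * ept_caps advertises the supported INVEPT types; shifting by
        	 * VMX_EPT_EXTENT_SHIFT lines the capability bits up with INVEPT
        	 * types 1 (single-context) and 2 (all-context), which the "& 6"
        	 * mask keeps.
        	 */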
   5473	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
   5474
   5475	if (type >= 32 || !(types & (1 << type)))
   5476		return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
   5477
   5478	/* According to the Intel VMX instruction reference, the memory
   5479	 * operand is read even if it isn't needed (e.g., for type==global)
   5480	 */
   5481	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
   5482			vmx_instruction_info, false, sizeof(operand), &gva))
   5483		return 1;
   5484	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
   5485	if (r != X86EMUL_CONTINUE)
   5486		return kvm_handle_memory_failure(vcpu, r, &e);
   5487
   5488	/*
   5489	 * Nested EPT roots are always held through guest_mmu,
   5490	 * not root_mmu.
   5491	 */
   5492	mmu = &vcpu->arch.guest_mmu;
   5493
   5494	switch (type) {
   5495	case VMX_EPT_EXTENT_CONTEXT:
   5496		if (!nested_vmx_check_eptp(vcpu, operand.eptp))
   5497			return nested_vmx_fail(vcpu,
   5498				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
   5499
   5500		roots_to_free = 0;
   5501		if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
   5502					    operand.eptp))
   5503			roots_to_free |= KVM_MMU_ROOT_CURRENT;
   5504
   5505		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
   5506			if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
   5507						    mmu->prev_roots[i].pgd,
   5508						    operand.eptp))
   5509				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
   5510		}
   5511		break;
   5512	case VMX_EPT_EXTENT_GLOBAL:
   5513		roots_to_free = KVM_MMU_ROOTS_ALL;
   5514		break;
   5515	default:
   5516		BUG();
   5517		break;
   5518	}
   5519
   5520	if (roots_to_free)
   5521		kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
   5522
   5523	return nested_vmx_succeed(vcpu);
   5524}
   5525
   5526static int handle_invvpid(struct kvm_vcpu *vcpu)
   5527{
   5528	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5529	u32 vmx_instruction_info;
   5530	unsigned long type, types;
   5531	gva_t gva;
   5532	struct x86_exception e;
   5533	struct {
   5534		u64 vpid;
   5535		u64 gla;
   5536	} operand;
   5537	u16 vpid02;
   5538	int r, gpr_index;
   5539
   5540	if (!(vmx->nested.msrs.secondary_ctls_high &
   5541	      SECONDARY_EXEC_ENABLE_VPID) ||
   5542			!(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
   5543		kvm_queue_exception(vcpu, UD_VECTOR);
   5544		return 1;
   5545	}
   5546
   5547	if (!nested_vmx_check_permission(vcpu))
   5548		return 1;
   5549
   5550	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
   5551	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
   5552	type = kvm_register_read(vcpu, gpr_index);
   5553
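        	/*
        	 * vpid_caps advertises the supported INVVPID types in bits 8-11;
        	 * shifting by 8 makes the bit index match the INVVPID type number
        	 * checked below.
        	 */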
   5554	types = (vmx->nested.msrs.vpid_caps &
   5555			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
   5556
   5557	if (type >= 32 || !(types & (1 << type)))
   5558		return nested_vmx_fail(vcpu,
   5559			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
   5560
    5561	/* According to the Intel VMX instruction reference, the memory
    5562	 * operand is read even if it isn't needed (e.g., for type==global)
   5563	 */
   5564	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
   5565			vmx_instruction_info, false, sizeof(operand), &gva))
   5566		return 1;
   5567	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
   5568	if (r != X86EMUL_CONTINUE)
   5569		return kvm_handle_memory_failure(vcpu, r, &e);
   5570
   5571	if (operand.vpid >> 16)
   5572		return nested_vmx_fail(vcpu,
   5573			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
   5574
   5575	vpid02 = nested_get_vpid02(vcpu);
   5576	switch (type) {
   5577	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
   5578		if (!operand.vpid ||
   5579		    is_noncanonical_address(operand.gla, vcpu))
   5580			return nested_vmx_fail(vcpu,
   5581				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
   5582		vpid_sync_vcpu_addr(vpid02, operand.gla);
   5583		break;
   5584	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
   5585	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
   5586		if (!operand.vpid)
   5587			return nested_vmx_fail(vcpu,
   5588				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
   5589		vpid_sync_context(vpid02);
   5590		break;
   5591	case VMX_VPID_EXTENT_ALL_CONTEXT:
   5592		vpid_sync_context(vpid02);
   5593		break;
   5594	default:
   5595		WARN_ON_ONCE(1);
   5596		return kvm_skip_emulated_instruction(vcpu);
   5597	}
   5598
   5599	/*
    5600	 * Sync the shadow page tables if EPT is disabled, as L1 is invalidating
    5601	 * linear mappings for L2 (tagged with L2's VPID).  Free all guest
    5602	 * roots as VPIDs are not tracked in the MMU role.
    5603	 *
    5604	 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
    5605	 * an MMU when EPT is disabled.
    5606	 *
    5607	 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
   5608	 */
   5609	if (!enable_ept)
   5610		kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
   5611
   5612	return nested_vmx_succeed(vcpu);
   5613}
   5614
   5615static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
   5616				     struct vmcs12 *vmcs12)
   5617{
   5618	u32 index = kvm_rcx_read(vcpu);
   5619	u64 new_eptp;
   5620
   5621	if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
   5622		return 1;
   5623	if (index >= VMFUNC_EPTP_ENTRIES)
   5624		return 1;
   5625
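        	/*
        	 * The EPTP list is a guest page of 64-bit EPTP values; fetch the
        	 * entry L2 selected via ECX.
        	 */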
   5626	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
   5627				     &new_eptp, index * 8, 8))
   5628		return 1;
   5629
   5630	/*
   5631	 * If the (L2) guest does a vmfunc to the currently
   5632	 * active ept pointer, we don't have to do anything else
   5633	 */
   5634	if (vmcs12->ept_pointer != new_eptp) {
   5635		if (!nested_vmx_check_eptp(vcpu, new_eptp))
   5636			return 1;
   5637
   5638		vmcs12->ept_pointer = new_eptp;
   5639		nested_ept_new_eptp(vcpu);
   5640
   5641		if (!nested_cpu_has_vpid(vmcs12))
   5642			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
   5643	}
   5644
   5645	return 0;
   5646}
   5647
   5648static int handle_vmfunc(struct kvm_vcpu *vcpu)
   5649{
   5650	struct vcpu_vmx *vmx = to_vmx(vcpu);
   5651	struct vmcs12 *vmcs12;
   5652	u32 function = kvm_rax_read(vcpu);
   5653
   5654	/*
   5655	 * VMFUNC is only supported for nested guests, but we always enable the
   5656	 * secondary control for simplicity; for non-nested mode, fake that we
   5657	 * didn't by injecting #UD.
   5658	 */
   5659	if (!is_guest_mode(vcpu)) {
   5660		kvm_queue_exception(vcpu, UD_VECTOR);
   5661		return 1;
   5662	}
   5663
   5664	vmcs12 = get_vmcs12(vcpu);
   5665
   5666	/*
   5667	 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
   5668	 * is enabled in vmcs02 if and only if it's enabled in vmcs12.
   5669	 */
   5670	if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
   5671		kvm_queue_exception(vcpu, UD_VECTOR);
   5672		return 1;
   5673	}
   5674
   5675	if (!(vmcs12->vm_function_control & BIT_ULL(function)))
   5676		goto fail;
   5677
   5678	switch (function) {
   5679	case 0:
   5680		if (nested_vmx_eptp_switching(vcpu, vmcs12))
   5681			goto fail;
   5682		break;
   5683	default:
   5684		goto fail;
   5685	}
   5686	return kvm_skip_emulated_instruction(vcpu);
   5687
   5688fail:
   5689	/*
   5690	 * This is effectively a reflected VM-Exit, as opposed to a synthesized
   5691	 * nested VM-Exit.  Pass the original exit reason, i.e. don't hardcode
   5692	 * EXIT_REASON_VMFUNC as the exit reason.
   5693	 */
   5694	nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
   5695			  vmx_get_intr_info(vcpu),
   5696			  vmx_get_exit_qual(vcpu));
   5697	return 1;
   5698}
   5699
   5700/*
   5701 * Return true if an IO instruction with the specified port and size should cause
   5702 * a VM-exit into L1.
   5703 */
   5704bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
   5705				 int size)
   5706{
   5707	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   5708	gpa_t bitmap, last_bitmap;
   5709	u8 b;
   5710
   5711	last_bitmap = INVALID_GPA;
   5712	b = -1;
   5713
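        	/*
        	 * Walk the access one port at a time: I/O bitmap A covers ports
        	 * 0x0000-0x7fff and bitmap B covers 0x8000-0xffff, one bit per
        	 * port, so a multi-byte access may straddle bitmaps and must be
        	 * checked port by port.
        	 */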
   5714	while (size > 0) {
   5715		if (port < 0x8000)
   5716			bitmap = vmcs12->io_bitmap_a;
   5717		else if (port < 0x10000)
   5718			bitmap = vmcs12->io_bitmap_b;
   5719		else
   5720			return true;
   5721		bitmap += (port & 0x7fff) / 8;
   5722
   5723		if (last_bitmap != bitmap)
   5724			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
   5725				return true;
   5726		if (b & (1 << (port & 7)))
   5727			return true;
   5728
   5729		port++;
   5730		size--;
   5731		last_bitmap = bitmap;
   5732	}
   5733
   5734	return false;
   5735}
   5736
   5737static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
   5738				       struct vmcs12 *vmcs12)
   5739{
   5740	unsigned long exit_qualification;
   5741	unsigned short port;
   5742	int size;
   5743
   5744	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
   5745		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
   5746
   5747	exit_qualification = vmx_get_exit_qual(vcpu);
   5748
   5749	port = exit_qualification >> 16;
   5750	size = (exit_qualification & 7) + 1;
   5751
   5752	return nested_vmx_check_io_bitmaps(vcpu, port, size);
   5753}
   5754
   5755/*
    5756 * Return true if we should exit from L2 to L1 to handle an MSR access,
   5757 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
   5758 * disinterest in the current event (read or write a specific MSR) by using an
   5759 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
   5760 */
   5761static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
   5762					struct vmcs12 *vmcs12,
   5763					union vmx_exit_reason exit_reason)
   5764{
   5765	u32 msr_index = kvm_rcx_read(vcpu);
   5766	gpa_t bitmap;
   5767
   5768	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
   5769		return true;
   5770
   5771	/*
   5772	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
   5773	 * for the four combinations of read/write and low/high MSR numbers.
   5774	 * First we need to figure out which of the four to use:
   5775	 */
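        	/*
        	 * Resulting layout within the 4 KiB page:
        	 *   reads,  MSRs 0x00000000-0x00001fff: bytes    0-1023
        	 *   reads,  MSRs 0xc0000000-0xc0001fff: bytes 1024-2047
        	 *   writes, MSRs 0x00000000-0x00001fff: bytes 2048-3071
        	 *   writes, MSRs 0xc0000000-0xc0001fff: bytes 3072-4095
        	 */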
   5776	bitmap = vmcs12->msr_bitmap;
   5777	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
   5778		bitmap += 2048;
   5779	if (msr_index >= 0xc0000000) {
   5780		msr_index -= 0xc0000000;
   5781		bitmap += 1024;
   5782	}
   5783
   5784	/* Then read the msr_index'th bit from this bitmap: */
   5785	if (msr_index < 1024*8) {
   5786		unsigned char b;
   5787		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
   5788			return true;
   5789		return 1 & (b >> (msr_index & 7));
   5790	} else
   5791		return true; /* let L1 handle the wrong parameter */
   5792}
   5793
   5794/*
    5795 * Return true if we should exit from L2 to L1 to handle a CR access exit,
   5796 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
   5797 * intercept (via guest_host_mask etc.) the current event.
   5798 */
   5799static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
   5800	struct vmcs12 *vmcs12)
   5801{
   5802	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
   5803	int cr = exit_qualification & 15;
   5804	int reg;
   5805	unsigned long val;
   5806
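        	/*
        	 * Exit qualification for CR accesses: bits 3:0 hold the CR number,
        	 * bits 5:4 the access type (0 = MOV to CR, 1 = MOV from CR,
        	 * 2 = CLTS, 3 = LMSW), bits 11:8 the GPR and bits 31:16 the LMSW
        	 * source data.
        	 */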
   5807	switch ((exit_qualification >> 4) & 3) {
   5808	case 0: /* mov to cr */
   5809		reg = (exit_qualification >> 8) & 15;
   5810		val = kvm_register_read(vcpu, reg);
   5811		switch (cr) {
   5812		case 0:
   5813			if (vmcs12->cr0_guest_host_mask &
   5814			    (val ^ vmcs12->cr0_read_shadow))
   5815				return true;
   5816			break;
   5817		case 3:
   5818			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
   5819				return true;
   5820			break;
   5821		case 4:
   5822			if (vmcs12->cr4_guest_host_mask &
   5823			    (vmcs12->cr4_read_shadow ^ val))
   5824				return true;
   5825			break;
   5826		case 8:
   5827			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
   5828				return true;
   5829			break;
   5830		}
   5831		break;
   5832	case 2: /* clts */
   5833		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
   5834		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
   5835			return true;
   5836		break;
   5837	case 1: /* mov from cr */
   5838		switch (cr) {
   5839		case 3:
   5840			if (vmcs12->cpu_based_vm_exec_control &
   5841			    CPU_BASED_CR3_STORE_EXITING)
   5842				return true;
   5843			break;
   5844		case 8:
   5845			if (vmcs12->cpu_based_vm_exec_control &
   5846			    CPU_BASED_CR8_STORE_EXITING)
   5847				return true;
   5848			break;
   5849		}
   5850		break;
   5851	case 3: /* lmsw */
   5852		/*
   5853		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
   5854		 * cr0. Other attempted changes are ignored, with no exit.
   5855		 */
   5856		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
   5857		if (vmcs12->cr0_guest_host_mask & 0xe &
   5858		    (val ^ vmcs12->cr0_read_shadow))
   5859			return true;
   5860		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
   5861		    !(vmcs12->cr0_read_shadow & 0x1) &&
   5862		    (val & 0x1))
   5863			return true;
   5864		break;
   5865	}
   5866	return false;
   5867}
   5868
   5869static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
   5870					  struct vmcs12 *vmcs12)
   5871{
   5872	u32 encls_leaf;
   5873
   5874	if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
   5875	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
   5876		return false;
   5877
   5878	encls_leaf = kvm_rax_read(vcpu);
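        	/* Bit 63 of the bitmap covers ENCLS leaf 63 and all higher leaves. */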
   5879	if (encls_leaf > 62)
   5880		encls_leaf = 63;
   5881	return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
   5882}
   5883
   5884static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
   5885	struct vmcs12 *vmcs12, gpa_t bitmap)
   5886{
   5887	u32 vmx_instruction_info;
   5888	unsigned long field;
   5889	u8 b;
   5890
   5891	if (!nested_cpu_has_shadow_vmcs(vmcs12))
   5892		return true;
   5893
   5894	/* Decode instruction info and find the field to access */
   5895	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
   5896	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
   5897
   5898	/* Out-of-range fields always cause a VM exit from L2 to L1 */
   5899	if (field >> 15)
   5900		return true;
   5901
   5902	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
   5903		return true;
   5904
   5905	return 1 & (b >> (field & 7));
   5906}
   5907
   5908static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
   5909{
   5910	u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
   5911
   5912	if (nested_cpu_has_mtf(vmcs12))
   5913		return true;
   5914
   5915	/*
   5916	 * An MTF VM-exit may be injected into the guest by setting the
   5917	 * interruption-type to 7 (other event) and the vector field to 0. Such
   5918	 * is the case regardless of the 'monitor trap flag' VM-execution
   5919	 * control.
   5920	 */
   5921	return entry_intr_info == (INTR_INFO_VALID_MASK
   5922				   | INTR_TYPE_OTHER_EVENT);
   5923}
   5924
   5925/*
   5926 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
   5927 * L1 wants the exit.  Only call this when in is_guest_mode (L2).
   5928 */
   5929static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
   5930				     union vmx_exit_reason exit_reason)
   5931{
   5932	u32 intr_info;
   5933
   5934	switch ((u16)exit_reason.basic) {
   5935	case EXIT_REASON_EXCEPTION_NMI:
   5936		intr_info = vmx_get_intr_info(vcpu);
   5937		if (is_nmi(intr_info))
   5938			return true;
   5939		else if (is_page_fault(intr_info))
   5940			return vcpu->arch.apf.host_apf_flags ||
   5941			       vmx_need_pf_intercept(vcpu);
   5942		else if (is_debug(intr_info) &&
   5943			 vcpu->guest_debug &
   5944			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
   5945			return true;
   5946		else if (is_breakpoint(intr_info) &&
   5947			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
   5948			return true;
   5949		else if (is_alignment_check(intr_info) &&
   5950			 !vmx_guest_inject_ac(vcpu))
   5951			return true;
   5952		return false;
   5953	case EXIT_REASON_EXTERNAL_INTERRUPT:
   5954		return true;
   5955	case EXIT_REASON_MCE_DURING_VMENTRY:
   5956		return true;
   5957	case EXIT_REASON_EPT_VIOLATION:
   5958		/*
   5959		 * L0 always deals with the EPT violation. If nested EPT is
   5960		 * used, and the nested mmu code discovers that the address is
   5961		 * missing in the guest EPT table (EPT12), the EPT violation
   5962		 * will be injected with nested_ept_inject_page_fault()
   5963		 */
   5964		return true;
   5965	case EXIT_REASON_EPT_MISCONFIG:
   5966		/*
    5967		 * L2 never directly uses L1's EPT, but rather L0's own EPT
    5968		 * table (shadow on EPT) or a merged EPT table that L0 built
    5969		 * (EPT on EPT). So any problems with the structure of the
    5970		 * table are L0's fault.
   5971		 */
   5972		return true;
   5973	case EXIT_REASON_PREEMPTION_TIMER:
   5974		return true;
   5975	case EXIT_REASON_PML_FULL:
   5976		/*
   5977		 * PML is emulated for an L1 VMM and should never be enabled in
   5978		 * vmcs02, always "handle" PML_FULL by exiting to userspace.
   5979		 */
   5980		return true;
   5981	case EXIT_REASON_VMFUNC:
   5982		/* VM functions are emulated through L2->L0 vmexits. */
   5983		return true;
   5984	case EXIT_REASON_BUS_LOCK:
   5985		/*
   5986		 * At present, bus lock VM exit is never exposed to L1.
   5987		 * Handle L2's bus locks in L0 directly.
   5988		 */
   5989		return true;
   5990	default:
   5991		break;
   5992	}
   5993	return false;
   5994}
   5995
   5996/*
    5997 * Return true if L1 wants to intercept an exit from L2.  Only call this when in
   5998 * is_guest_mode (L2).
   5999 */
   6000static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
   6001				     union vmx_exit_reason exit_reason)
   6002{
   6003	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   6004	u32 intr_info;
   6005
   6006	switch ((u16)exit_reason.basic) {
   6007	case EXIT_REASON_EXCEPTION_NMI:
   6008		intr_info = vmx_get_intr_info(vcpu);
   6009		if (is_nmi(intr_info))
   6010			return true;
   6011		else if (is_page_fault(intr_info))
   6012			return true;
   6013		return vmcs12->exception_bitmap &
   6014				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
   6015	case EXIT_REASON_EXTERNAL_INTERRUPT:
   6016		return nested_exit_on_intr(vcpu);
   6017	case EXIT_REASON_TRIPLE_FAULT:
   6018		return true;
   6019	case EXIT_REASON_INTERRUPT_WINDOW:
   6020		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
   6021	case EXIT_REASON_NMI_WINDOW:
   6022		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
   6023	case EXIT_REASON_TASK_SWITCH:
   6024		return true;
   6025	case EXIT_REASON_CPUID:
   6026		return true;
   6027	case EXIT_REASON_HLT:
   6028		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
   6029	case EXIT_REASON_INVD:
   6030		return true;
   6031	case EXIT_REASON_INVLPG:
   6032		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
   6033	case EXIT_REASON_RDPMC:
   6034		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
   6035	case EXIT_REASON_RDRAND:
   6036		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
   6037	case EXIT_REASON_RDSEED:
   6038		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
   6039	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
   6040		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
   6041	case EXIT_REASON_VMREAD:
   6042		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
   6043			vmcs12->vmread_bitmap);
   6044	case EXIT_REASON_VMWRITE:
   6045		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
   6046			vmcs12->vmwrite_bitmap);
   6047	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
   6048	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
   6049	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
   6050	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
   6051	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
   6052		/*
   6053		 * VMX instructions trap unconditionally. This allows L1 to
   6054		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
   6055		 */
   6056		return true;
   6057	case EXIT_REASON_CR_ACCESS:
   6058		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
   6059	case EXIT_REASON_DR_ACCESS:
   6060		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
   6061	case EXIT_REASON_IO_INSTRUCTION:
   6062		return nested_vmx_exit_handled_io(vcpu, vmcs12);
   6063	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
   6064		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
   6065	case EXIT_REASON_MSR_READ:
   6066	case EXIT_REASON_MSR_WRITE:
   6067		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
   6068	case EXIT_REASON_INVALID_STATE:
   6069		return true;
   6070	case EXIT_REASON_MWAIT_INSTRUCTION:
   6071		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
   6072	case EXIT_REASON_MONITOR_TRAP_FLAG:
   6073		return nested_vmx_exit_handled_mtf(vmcs12);
   6074	case EXIT_REASON_MONITOR_INSTRUCTION:
   6075		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
   6076	case EXIT_REASON_PAUSE_INSTRUCTION:
   6077		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
   6078			nested_cpu_has2(vmcs12,
   6079				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
   6080	case EXIT_REASON_MCE_DURING_VMENTRY:
   6081		return true;
   6082	case EXIT_REASON_TPR_BELOW_THRESHOLD:
   6083		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
   6084	case EXIT_REASON_APIC_ACCESS:
   6085	case EXIT_REASON_APIC_WRITE:
   6086	case EXIT_REASON_EOI_INDUCED:
   6087		/*
   6088		 * The controls for "virtualize APIC accesses," "APIC-
   6089		 * register virtualization," and "virtual-interrupt
   6090		 * delivery" only come from vmcs12.
   6091		 */
   6092		return true;
   6093	case EXIT_REASON_INVPCID:
   6094		return
   6095			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
   6096			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
   6097	case EXIT_REASON_WBINVD:
   6098		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
   6099	case EXIT_REASON_XSETBV:
   6100		return true;
   6101	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
   6102		/*
   6103		 * This should never happen, since it is not possible to
   6104		 * set XSS to a non-zero value---neither in L1 nor in L2.
    6105		 * If it were, XSS would have to be checked against
   6106		 * the XSS exit bitmap in vmcs12.
   6107		 */
   6108		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
   6109	case EXIT_REASON_UMWAIT:
   6110	case EXIT_REASON_TPAUSE:
   6111		return nested_cpu_has2(vmcs12,
   6112			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
   6113	case EXIT_REASON_ENCLS:
   6114		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
   6115	default:
   6116		return true;
   6117	}
   6118}
   6119
   6120/*
   6121 * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
   6122 * reflected into L1.
   6123 */
   6124bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
   6125{
   6126	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6127	union vmx_exit_reason exit_reason = vmx->exit_reason;
   6128	unsigned long exit_qual;
   6129	u32 exit_intr_info;
   6130
   6131	WARN_ON_ONCE(vmx->nested.nested_run_pending);
   6132
   6133	/*
   6134	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
   6135	 * has already loaded L2's state.
   6136	 */
   6137	if (unlikely(vmx->fail)) {
   6138		trace_kvm_nested_vmenter_failed(
   6139			"hardware VM-instruction error: ",
   6140			vmcs_read32(VM_INSTRUCTION_ERROR));
   6141		exit_intr_info = 0;
   6142		exit_qual = 0;
   6143		goto reflect_vmexit;
   6144	}
   6145
   6146	trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
   6147
   6148	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
   6149	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
   6150		return false;
   6151
   6152	/* If L1 doesn't want the exit, handle it in L0. */
   6153	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
   6154		return false;
   6155
   6156	/*
   6157	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
   6158	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
   6159	 * need to be synthesized by querying the in-kernel LAPIC, but external
   6160	 * interrupts are never reflected to L1 so it's a non-issue.
   6161	 */
   6162	exit_intr_info = vmx_get_intr_info(vcpu);
   6163	if (is_exception_with_error_code(exit_intr_info)) {
   6164		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
   6165
   6166		vmcs12->vm_exit_intr_error_code =
   6167			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
   6168	}
   6169	exit_qual = vmx_get_exit_qual(vcpu);
   6170
   6171reflect_vmexit:
   6172	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
   6173	return true;
   6174}
   6175
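        /*
         * Save the vCPU's nested VMX state for KVM_GET_NESTED_STATE.  A NULL
         * vcpu only queries the maximum buffer size.  If user_data_size is too
         * small, nothing is copied and the required size is returned for the
         * caller to report back to userspace.
         */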
   6176static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
   6177				struct kvm_nested_state __user *user_kvm_nested_state,
   6178				u32 user_data_size)
   6179{
   6180	struct vcpu_vmx *vmx;
   6181	struct vmcs12 *vmcs12;
   6182	struct kvm_nested_state kvm_state = {
   6183		.flags = 0,
   6184		.format = KVM_STATE_NESTED_FORMAT_VMX,
   6185		.size = sizeof(kvm_state),
   6186		.hdr.vmx.flags = 0,
   6187		.hdr.vmx.vmxon_pa = INVALID_GPA,
   6188		.hdr.vmx.vmcs12_pa = INVALID_GPA,
   6189		.hdr.vmx.preemption_timer_deadline = 0,
   6190	};
   6191	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
   6192		&user_kvm_nested_state->data.vmx[0];
   6193
   6194	if (!vcpu)
   6195		return kvm_state.size + sizeof(*user_vmx_nested_state);
   6196
   6197	vmx = to_vmx(vcpu);
   6198	vmcs12 = get_vmcs12(vcpu);
   6199
   6200	if (nested_vmx_allowed(vcpu) &&
   6201	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
   6202		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
   6203		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
   6204
   6205		if (vmx_has_valid_vmcs12(vcpu)) {
   6206			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
   6207
   6208			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
   6209			if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
   6210				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
   6211
   6212			if (is_guest_mode(vcpu) &&
   6213			    nested_cpu_has_shadow_vmcs(vmcs12) &&
   6214			    vmcs12->vmcs_link_pointer != INVALID_GPA)
   6215				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
   6216		}
   6217
   6218		if (vmx->nested.smm.vmxon)
   6219			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
   6220
   6221		if (vmx->nested.smm.guest_mode)
   6222			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
   6223
   6224		if (is_guest_mode(vcpu)) {
   6225			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
   6226
   6227			if (vmx->nested.nested_run_pending)
   6228				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
   6229
   6230			if (vmx->nested.mtf_pending)
   6231				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
   6232
   6233			if (nested_cpu_has_preemption_timer(vmcs12) &&
   6234			    vmx->nested.has_preemption_timer_deadline) {
   6235				kvm_state.hdr.vmx.flags |=
   6236					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
   6237				kvm_state.hdr.vmx.preemption_timer_deadline =
   6238					vmx->nested.preemption_timer_deadline;
   6239			}
   6240		}
   6241	}
   6242
   6243	if (user_data_size < kvm_state.size)
   6244		goto out;
   6245
   6246	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
   6247		return -EFAULT;
   6248
   6249	if (!vmx_has_valid_vmcs12(vcpu))
   6250		goto out;
   6251
   6252	/*
   6253	 * When running L2, the authoritative vmcs12 state is in the
   6254	 * vmcs02. When running L1, the authoritative vmcs12 state is
   6255	 * in the shadow or enlightened vmcs linked to vmcs01, unless
   6256	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
   6257	 * vmcs12 state is in the vmcs12 already.
   6258	 */
   6259	if (is_guest_mode(vcpu)) {
   6260		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
   6261		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
   6262	} else  {
   6263		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
   6264		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
   6265			if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
   6266				/*
    6267				 * The L1 hypervisor is not obliged to keep the
    6268				 * eVMCS clean-fields data up to date while not
    6269				 * in guest mode; 'hv_clean_fields' is only
    6270				 * guaranteed to be accurate at VM-entry, so
    6271				 * ignore it here and do a full copy.
   6272				 */
   6273				copy_enlightened_to_vmcs12(vmx, 0);
   6274			else if (enable_shadow_vmcs)
   6275				copy_shadow_to_vmcs12(vmx);
   6276		}
   6277	}
   6278
   6279	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
   6280	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
   6281
   6282	/*
   6283	 * Copy over the full allocated size of vmcs12 rather than just the size
   6284	 * of the struct.
   6285	 */
   6286	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
   6287		return -EFAULT;
   6288
   6289	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
   6290	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
   6291		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
   6292				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
   6293			return -EFAULT;
   6294	}
   6295out:
   6296	return kvm_state.size;
   6297}
   6298
   6299/*
   6300 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
   6301 */
   6302void vmx_leave_nested(struct kvm_vcpu *vcpu)
   6303{
   6304	if (is_guest_mode(vcpu)) {
   6305		to_vmx(vcpu)->nested.nested_run_pending = 0;
   6306		nested_vmx_vmexit(vcpu, -1, 0, 0);
   6307	}
   6308	free_nested(vcpu);
   6309}
   6310
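        /*
         * Restore nested VMX state from KVM_SET_NESTED_STATE: validate the
         * header, re-enter VMX operation, reload vmcs12 (and the shadow
         * vmcs12, if any) and, if KVM_STATE_NESTED_GUEST_MODE is set, re-enter
         * non-root mode after running the usual nested_vmx_check_*()
         * consistency checks.
         */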
   6311static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
   6312				struct kvm_nested_state __user *user_kvm_nested_state,
   6313				struct kvm_nested_state *kvm_state)
   6314{
   6315	struct vcpu_vmx *vmx = to_vmx(vcpu);
   6316	struct vmcs12 *vmcs12;
   6317	enum vm_entry_failure_code ignored;
   6318	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
   6319		&user_kvm_nested_state->data.vmx[0];
   6320	int ret;
   6321
   6322	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
   6323		return -EINVAL;
   6324
   6325	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
   6326		if (kvm_state->hdr.vmx.smm.flags)
   6327			return -EINVAL;
   6328
   6329		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
   6330			return -EINVAL;
   6331
   6332		/*
   6333		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
    6334		 * enable the eVMCS capability on the vCPU.  However, the code
    6335		 * was since changed so that the flag now signals that vmcs12
    6336		 * should be copied into the eVMCS in guest memory.
   6337		 *
    6338		 * To preserve backwards compatibility, allow userspace
   6339		 * to set this flag even when there is no VMXON region.
   6340		 */
   6341		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
   6342			return -EINVAL;
   6343	} else {
   6344		if (!nested_vmx_allowed(vcpu))
   6345			return -EINVAL;
   6346
   6347		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
   6348			return -EINVAL;
   6349	}
   6350
   6351	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
   6352	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
   6353		return -EINVAL;
   6354
   6355	if (kvm_state->hdr.vmx.smm.flags &
   6356	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
   6357		return -EINVAL;
   6358
   6359	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
   6360		return -EINVAL;
   6361
   6362	/*
   6363	 * SMM temporarily disables VMX, so we cannot be in guest mode,
   6364	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
   6365	 * must be zero.
   6366	 */
   6367	if (is_smm(vcpu) ?
   6368		(kvm_state->flags &
   6369		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
   6370		: kvm_state->hdr.vmx.smm.flags)
   6371		return -EINVAL;
   6372
   6373	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
   6374	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
   6375		return -EINVAL;
   6376
   6377	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
   6378		(!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
   6379			return -EINVAL;
   6380
   6381	vmx_leave_nested(vcpu);
   6382
   6383	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
   6384		return 0;
   6385
   6386	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
   6387	ret = enter_vmx_operation(vcpu);
   6388	if (ret)
   6389		return ret;
   6390
   6391	/* Empty 'VMXON' state is permitted if no VMCS loaded */
   6392	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
   6393		/* See vmx_has_valid_vmcs12.  */
   6394		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
   6395		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
   6396		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
   6397			return -EINVAL;
   6398		else
   6399			return 0;
   6400	}
   6401
   6402	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
   6403		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
   6404		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
   6405			return -EINVAL;
   6406
   6407		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
   6408	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
   6409		/*
   6410		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
   6411		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
   6412		 * restored yet. EVMCS will be mapped from
   6413		 * nested_get_vmcs12_pages().
   6414		 */
   6415		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
   6416		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
   6417	} else {
   6418		return -EINVAL;
   6419	}
   6420
   6421	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
   6422		vmx->nested.smm.vmxon = true;
   6423		vmx->nested.vmxon = false;
   6424
   6425		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
   6426			vmx->nested.smm.guest_mode = true;
   6427	}
   6428
   6429	vmcs12 = get_vmcs12(vcpu);
   6430	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
   6431		return -EFAULT;
   6432
   6433	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
   6434		return -EINVAL;
   6435
   6436	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
   6437		return 0;
   6438
   6439	vmx->nested.nested_run_pending =
   6440		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
   6441
   6442	vmx->nested.mtf_pending =
   6443		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
   6444
   6445	ret = -EINVAL;
   6446	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
   6447	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
   6448		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
   6449
   6450		if (kvm_state->size <
   6451		    sizeof(*kvm_state) +
   6452		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
   6453			goto error_guest_mode;
   6454
   6455		if (copy_from_user(shadow_vmcs12,
   6456				   user_vmx_nested_state->shadow_vmcs12,
   6457				   sizeof(*shadow_vmcs12))) {
   6458			ret = -EFAULT;
   6459			goto error_guest_mode;
   6460		}
   6461
   6462		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
   6463		    !shadow_vmcs12->hdr.shadow_vmcs)
   6464			goto error_guest_mode;
   6465	}
   6466
   6467	vmx->nested.has_preemption_timer_deadline = false;
   6468	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
   6469		vmx->nested.has_preemption_timer_deadline = true;
   6470		vmx->nested.preemption_timer_deadline =
   6471			kvm_state->hdr.vmx.preemption_timer_deadline;
   6472	}
   6473
   6474	if (nested_vmx_check_controls(vcpu, vmcs12) ||
   6475	    nested_vmx_check_host_state(vcpu, vmcs12) ||
   6476	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
   6477		goto error_guest_mode;
   6478
   6479	vmx->nested.dirty_vmcs12 = true;
   6480	vmx->nested.force_msr_bitmap_recalc = true;
   6481	ret = nested_vmx_enter_non_root_mode(vcpu, false);
   6482	if (ret)
   6483		goto error_guest_mode;
   6484
   6485	return 0;
   6486
   6487error_guest_mode:
   6488	vmx->nested.nested_run_pending = 0;
   6489	return ret;
   6490}
   6491
   6492void nested_vmx_set_vmcs_shadowing_bitmap(void)
   6493{
   6494	if (enable_shadow_vmcs) {
   6495		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
   6496		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
   6497	}
   6498}
   6499
   6500/*
   6501 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
   6502 * that madness to get the encoding for comparison.
   6503 */
   6504#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
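        /*
         * For illustration: GUEST_ES_SELECTOR (encoding 0x0800) sits at table
         * index rol16(0x0800, 6) == 0x0002, and VMCS12_IDX_TO_ENC(0x0002) ==
         * (0x0002 >> 6) | (0x0002 << 10) == 0x0800 recovers the encoding.
         */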
   6505
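        /*
         * Compute the value reported in IA32_VMX_VMCS_ENUM, whose bits 9:1
         * hold the highest field index used by any VMCS encoding.
         */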
   6506static u64 nested_vmx_calc_vmcs_enum_msr(void)
   6507{
   6508	/*
    6509	 * Note these are the so-called "index" of the VMCS field encoding, not
   6510	 * the index into vmcs12.
   6511	 */
   6512	unsigned int max_idx, idx;
   6513	int i;
   6514
   6515	/*
   6516	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
   6517	 * vmcs12, regardless of whether or not the associated feature is
   6518	 * exposed to L1.  Simply find the field with the highest index.
   6519	 */
   6520	max_idx = 0;
   6521	for (i = 0; i < nr_vmcs12_fields; i++) {
   6522		/* The vmcs12 table is very, very sparsely populated. */
   6523		if (!vmcs12_field_offsets[i])
   6524			continue;
   6525
   6526		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
   6527		if (idx > max_idx)
   6528			max_idx = idx;
   6529	}
   6530
   6531	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
   6532}
   6533
   6534/*
   6535 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
   6536 * returned for the various VMX controls MSRs when nested VMX is enabled.
   6537 * The same values should also be used to verify that vmcs12 control fields are
   6538 * valid during nested entry from L1 to L2.
   6539 * Each of these control msrs has a low and high 32-bit half: A low bit is on
   6540 * if the corresponding bit in the (32-bit) control field *must* be on, and a
   6541 * bit in the high half is on if the corresponding bit in the control field
   6542 * may be on. See also vmx_control_verify().
   6543 */
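        /*
         * Roughly, a control value 'ctl' requested by L1 is acceptable iff
         * (ctl & low) == low (every must-be-1 bit is set) and
         * (ctl & ~high) == 0 (no bit outside the may-be-1 set is set); see
         * vmx_control_verify().
         */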
   6544void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
   6545{
   6546	/*
   6547	 * Note that as a general rule, the high half of the MSRs (bits in
   6548	 * the control fields which may be 1) should be initialized by the
   6549	 * intersection of the underlying hardware's MSR (i.e., features which
   6550	 * can be supported) and the list of features we want to expose -
   6551	 * because they are known to be properly supported in our code.
   6552	 * Also, usually, the low half of the MSRs (bits which must be 1) can
   6553	 * be set to 0, meaning that L1 may turn off any of these bits. The
    6554	 * reason is that if one of these bits is necessary, it will be set in
    6555	 * vmcs01; prepare_vmcs02, which ORs the control fields of vmcs01 and
    6556	 * vmcs12 to build vmcs02, keeps it set regardless of what L1 asked
    6557	 * for - and nested_vmx_l1_wants_exit() will not pass related exits to L1.
   6558	 * These rules have exceptions below.
   6559	 */
   6560
   6561	/* pin-based controls */
   6562	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
   6563		msrs->pinbased_ctls_low,
   6564		msrs->pinbased_ctls_high);
   6565	msrs->pinbased_ctls_low |=
   6566		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
   6567	msrs->pinbased_ctls_high &=
   6568		PIN_BASED_EXT_INTR_MASK |
   6569		PIN_BASED_NMI_EXITING |
   6570		PIN_BASED_VIRTUAL_NMIS |
   6571		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
   6572	msrs->pinbased_ctls_high |=
   6573		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
   6574		PIN_BASED_VMX_PREEMPTION_TIMER;
   6575
   6576	/* exit controls */
   6577	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
   6578		msrs->exit_ctls_low,
   6579		msrs->exit_ctls_high);
   6580	msrs->exit_ctls_low =
   6581		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
   6582
   6583	msrs->exit_ctls_high &=
   6584#ifdef CONFIG_X86_64
   6585		VM_EXIT_HOST_ADDR_SPACE_SIZE |
   6586#endif
   6587		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
   6588		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
   6589	msrs->exit_ctls_high |=
   6590		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
   6591		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
   6592		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
   6593
   6594	/* We support free control of debug control saving. */
   6595	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
   6596
   6597	/* entry controls */
   6598	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
   6599		msrs->entry_ctls_low,
   6600		msrs->entry_ctls_high);
   6601	msrs->entry_ctls_low =
   6602		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
   6603	msrs->entry_ctls_high &=
   6604#ifdef CONFIG_X86_64
   6605		VM_ENTRY_IA32E_MODE |
   6606#endif
   6607		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
   6608		VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
   6609	msrs->entry_ctls_high |=
   6610		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
   6611
   6612	/* We support free control of debug control loading. */
   6613	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
   6614
   6615	/* cpu-based controls */
   6616	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
   6617		msrs->procbased_ctls_low,
   6618		msrs->procbased_ctls_high);
   6619	msrs->procbased_ctls_low =
   6620		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
   6621	msrs->procbased_ctls_high &=
   6622		CPU_BASED_INTR_WINDOW_EXITING |
   6623		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
   6624		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
   6625		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
   6626		CPU_BASED_CR3_STORE_EXITING |
   6627#ifdef CONFIG_X86_64
   6628		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
   6629#endif
   6630		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
   6631		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
   6632		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
   6633		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
   6634		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
   6635	/*
   6636	 * We can allow some features even when not supported by the
   6637	 * hardware. For example, L1 can specify an MSR bitmap - and we
   6638	 * can use it to avoid exits to L1 - even when L0 runs L2
   6639	 * without MSR bitmaps.
   6640	 */
   6641	msrs->procbased_ctls_high |=
   6642		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
   6643		CPU_BASED_USE_MSR_BITMAPS;
   6644
   6645	/* We support free control of CR3 access interception. */
   6646	msrs->procbased_ctls_low &=
   6647		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
   6648
   6649	/*
   6650	 * secondary cpu-based controls.  Do not include those that
    6651	 * depend on CPUID bits; they are added later by
   6652	 * vmx_vcpu_after_set_cpuid.
   6653	 */
   6654	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
   6655		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
   6656		      msrs->secondary_ctls_low,
   6657		      msrs->secondary_ctls_high);
   6658
   6659	msrs->secondary_ctls_low = 0;
   6660	msrs->secondary_ctls_high &=
   6661		SECONDARY_EXEC_DESC |
   6662		SECONDARY_EXEC_ENABLE_RDTSCP |
   6663		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
   6664		SECONDARY_EXEC_WBINVD_EXITING |
   6665		SECONDARY_EXEC_APIC_REGISTER_VIRT |
   6666		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
   6667		SECONDARY_EXEC_RDRAND_EXITING |
   6668		SECONDARY_EXEC_ENABLE_INVPCID |
   6669		SECONDARY_EXEC_RDSEED_EXITING |
   6670		SECONDARY_EXEC_XSAVES |
   6671		SECONDARY_EXEC_TSC_SCALING;
   6672
   6673	/*
   6674	 * We can emulate "VMCS shadowing," even if the hardware
   6675	 * doesn't support it.
   6676	 */
   6677	msrs->secondary_ctls_high |=
   6678		SECONDARY_EXEC_SHADOW_VMCS;
   6679
   6680	if (enable_ept) {
   6681		/* nested EPT: emulate EPT also to L1 */
   6682		msrs->secondary_ctls_high |=
   6683			SECONDARY_EXEC_ENABLE_EPT;
   6684		msrs->ept_caps =
   6685			VMX_EPT_PAGE_WALK_4_BIT |
   6686			VMX_EPT_PAGE_WALK_5_BIT |
   6687			VMX_EPTP_WB_BIT |
   6688			VMX_EPT_INVEPT_BIT |
   6689			VMX_EPT_EXECUTE_ONLY_BIT;
   6690
   6691		msrs->ept_caps &= ept_caps;
   6692		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
   6693			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
   6694			VMX_EPT_1GB_PAGE_BIT;
   6695		if (enable_ept_ad_bits) {
   6696			msrs->secondary_ctls_high |=
   6697				SECONDARY_EXEC_ENABLE_PML;
   6698			msrs->ept_caps |= VMX_EPT_AD_BIT;
   6699		}
   6700	}
   6701
   6702	if (cpu_has_vmx_vmfunc()) {
   6703		msrs->secondary_ctls_high |=
   6704			SECONDARY_EXEC_ENABLE_VMFUNC;
   6705		/*
   6706		 * Advertise EPTP switching unconditionally
   6707		 * since we emulate it
   6708		 */
   6709		if (enable_ept)
   6710			msrs->vmfunc_controls =
   6711				VMX_VMFUNC_EPTP_SWITCHING;
   6712	}
   6713
   6714	/*
   6715	 * Old versions of KVM use the single-context version without
   6716	 * checking for support, so declare that it is supported even
   6717	 * though it is treated as global context.  The alternative is
   6718	 * not failing the single-context invvpid, and it is worse.
   6719	 */
   6720	if (enable_vpid) {
   6721		msrs->secondary_ctls_high |=
   6722			SECONDARY_EXEC_ENABLE_VPID;
   6723		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
   6724			VMX_VPID_EXTENT_SUPPORTED_MASK;
   6725	}
   6726
   6727	if (enable_unrestricted_guest)
   6728		msrs->secondary_ctls_high |=
   6729			SECONDARY_EXEC_UNRESTRICTED_GUEST;
   6730
   6731	if (flexpriority_enabled)
   6732		msrs->secondary_ctls_high |=
   6733			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
   6734
   6735	if (enable_sgx)
   6736		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
   6737
   6738	/* miscellaneous data */
   6739	rdmsr(MSR_IA32_VMX_MISC,
   6740		msrs->misc_low,
   6741		msrs->misc_high);
   6742	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
   6743	msrs->misc_low |=
   6744		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
   6745		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
   6746		VMX_MISC_ACTIVITY_HLT |
   6747		VMX_MISC_ACTIVITY_WAIT_SIPI;
   6748	msrs->misc_high = 0;
   6749
   6750	/*
   6751	 * This MSR reports some information about VMX support. We
   6752	 * should return information about the VMX we emulate for the
   6753	 * guest, and the VMCS structure we give it - not about the
   6754	 * VMX support of the underlying hardware.
   6755	 */
   6756	msrs->basic =
   6757		VMCS12_REVISION |
   6758		VMX_BASIC_TRUE_CTLS |
   6759		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
   6760		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
   6761
   6762	if (cpu_has_vmx_basic_inout())
   6763		msrs->basic |= VMX_BASIC_INOUT;
   6764
   6765	/*
   6766	 * These MSRs specify bits which the guest must keep fixed on
   6767	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
   6768	 * We picked the standard core2 setting.
   6769	 */
   6770#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
   6771#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
   6772	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
   6773	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
   6774
   6775	/* These MSRs specify bits which the guest must keep fixed off. */
   6776	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
   6777	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
   6778
   6779	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
   6780}
   6781
   6782void nested_vmx_hardware_unsetup(void)
   6783{
   6784	int i;
   6785
   6786	if (enable_shadow_vmcs) {
   6787		for (i = 0; i < VMX_BITMAP_NR; i++)
   6788			free_page((unsigned long)vmx_bitmap[i]);
   6789	}
   6790}
   6791
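        /*
         * Allocate the VMREAD/VMWRITE shadow bitmaps (when shadow VMCS is
         * usable) and install the exit handlers for the VMX instructions
         * that L1 may execute.
         */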
   6792__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
   6793{
   6794	int i;
   6795
   6796	if (!cpu_has_vmx_shadow_vmcs())
   6797		enable_shadow_vmcs = 0;
   6798	if (enable_shadow_vmcs) {
   6799		for (i = 0; i < VMX_BITMAP_NR; i++) {
   6800			/*
   6801			 * The vmx_bitmap is not tied to a VM and so should
   6802			 * not be charged to a memcg.
   6803			 */
   6804			vmx_bitmap[i] = (unsigned long *)
   6805				__get_free_page(GFP_KERNEL);
   6806			if (!vmx_bitmap[i]) {
   6807				nested_vmx_hardware_unsetup();
   6808				return -ENOMEM;
   6809			}
   6810		}
   6811
   6812		init_vmcs_shadow_fields();
   6813	}
   6814
   6815	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
   6816	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
   6817	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
   6818	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
   6819	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
   6820	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
   6821	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
   6822	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmoff;
   6823	exit_handlers[EXIT_REASON_VMON]		= handle_vmon;
   6824	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
   6825	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
   6826	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;
   6827
   6828	return 0;
   6829}
   6830
   6831struct kvm_x86_nested_ops vmx_nested_ops = {
   6832	.leave_nested = vmx_leave_nested,
   6833	.check_events = vmx_check_nested_events,
   6834	.handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
   6835	.hv_timer_pending = nested_vmx_preemption_timer_pending,
   6836	.triple_fault = nested_vmx_triple_fault,
   6837	.get_state = vmx_get_nested_state,
   6838	.set_state = vmx_set_nested_state,
   6839	.get_nested_state_pages = vmx_get_nested_state_pages,
   6840	.write_log_dirty = nested_vmx_write_pml_buffer,
   6841	.enable_evmcs = nested_enable_evmcs,
   6842	.get_evmcs_version = nested_get_evmcs_version,
   6843};