cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

svm.c (144700B)


      1#define pr_fmt(fmt) "SVM: " fmt
      2
      3#include <linux/kvm_host.h>
      4
      5#include "cachepc/cachepc.h"
      6#include "cachepc/event.h"
      7#include "cachepc/track.h"
      8
      9#include "irq.h"
     10#include "mmu.h"
     11#include "kvm_cache_regs.h"
     12#include "x86.h"
     13#include "cpuid.h"
     14#include "pmu.h"
     15
     16#include <linux/module.h>
     17#include <linux/mod_devicetable.h>
     18#include <linux/kernel.h>
     19#include <linux/vmalloc.h>
     20#include <linux/highmem.h>
     21#include <linux/amd-iommu.h>
     22#include <linux/sched.h>
     23#include <linux/trace_events.h>
     24#include <linux/slab.h>
     25#include <linux/hashtable.h>
     26#include <linux/objtool.h>
     27#include <linux/psp-sev.h>
     28#include <linux/file.h>
     29#include <linux/pagemap.h>
     30#include <linux/swap.h>
     31#include <linux/rwsem.h>
     32#include <linux/cc_platform.h>
     33
     34#include <asm/apic.h>
     35#include <asm/perf_event.h>
     36#include <asm/tlbflush.h>
     37#include <asm/desc.h>
     38#include <asm/debugreg.h>
     39#include <asm/kvm_para.h>
     40#include <asm/irq_remapping.h>
     41#include <asm/spec-ctrl.h>
     42#include <asm/cpu_device_id.h>
     43#include <asm/traps.h>
     44#include <asm/fpu/api.h>
     45
     46#include <asm/virtext.h>
     47#include "trace.h"
     48
     49#include "svm.h"
     50#include "svm_ops.h"
     51
     52#include "kvm_onhyperv.h"
     53#include "svm_onhyperv.h"
     54
     55MODULE_AUTHOR("Qumranet");
     56MODULE_LICENSE("GPL");
     57
     58#ifdef MODULE
     59static const struct x86_cpu_id svm_cpu_id[] = {
     60	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
     61	{}
     62};
     63MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
     64#endif
     65
     66#define SEG_TYPE_LDT 2
     67#define SEG_TYPE_BUSY_TSS16 3
     68
     69static bool erratum_383_found __read_mostly;
     70
     71u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
     72
     73/*
     74 * Set osvw_len to a higher value when updated Revision Guides
     75 * are published and we know what the new status bits are.
     76 */
     77static uint64_t osvw_len = 4, osvw_status;
     78
     79static DEFINE_PER_CPU(u64, current_tsc_ratio);
     80
     81static const struct svm_direct_access_msrs {
     82	u32 index;   /* Index of the MSR */
     83	bool always; /* True if intercept is initially cleared */
     84} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
     85	{ .index = MSR_STAR,				.always = true  },
     86	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
     87	{ .index = MSR_IA32_SYSENTER_EIP,		.always = false },
     88	{ .index = MSR_IA32_SYSENTER_ESP,		.always = false },
     89#ifdef CONFIG_X86_64
     90	{ .index = MSR_GS_BASE,				.always = true  },
     91	{ .index = MSR_FS_BASE,				.always = true  },
     92	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
     93	{ .index = MSR_LSTAR,				.always = true  },
     94	{ .index = MSR_CSTAR,				.always = true  },
     95	{ .index = MSR_SYSCALL_MASK,			.always = true  },
     96#endif
     97	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
     98	{ .index = MSR_IA32_PRED_CMD,			.always = false },
     99	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
    100	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
    101	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
    102	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
    103	{ .index = MSR_EFER,				.always = false },
    104	{ .index = MSR_IA32_CR_PAT,			.always = false },
    105	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
    106	{ .index = MSR_TSC_AUX,				.always = false },
    107	{ .index = MSR_INVALID,				.always = false },
    108};
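/*
 * Editor's note (added to this listing, not part of the original file):
 * a new passthrough MSR would be added as another entry above, before the
 * MSR_INVALID terminator, e.g.
 *
 *	{ .index = MSR_HYPOTHETICAL_EXAMPLE,	.always = false },
 *
 * with MAX_DIRECT_ACCESS_MSRS bumped to match (MSR_HYPOTHETICAL_EXAMPLE is
 * a placeholder, not a real MSR).  Entries with .always = true have their
 * read/write intercepts cleared unconditionally in svm_vcpu_init_msrpm().
 */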
    109
    110/*
    111 * These two parameters are used to configure the Pause-Loop Exiting controls:
    112 * pause_filter_count: On processors that support Pause filtering (indicated
    113 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
    114 *	count value. On VMRUN this value is loaded into an internal counter.
    115 *	Each time a pause instruction is executed, this counter is decremented
    116 *	until it reaches zero, at which time a #VMEXIT is generated if pause
    117 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
    118 *	Intercept Filtering for more details.
    119 *	This also indicates whether PLE logic is enabled.
    120 *
    121 * pause_filter_thresh: In addition, some processor families support advanced
    122 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
    123 *	upper bound on the amount of time a guest is allowed to execute in a
    124 *	pause loop. In this mode, a 16-bit pause filter threshold field is
    125 *	added to the VMCB. The threshold value is a cycle count that is used
    126 *	to reset the pause counter. As with simple pause filtering, VMRUN loads
    127 *	the pause count value from the VMCB into an internal counter. Then, on
    128 *	each pause instruction the hardware checks the elapsed number of cycles
    129 *	since the most recent pause instruction against the pause filter
    130 *	threshold. If the elapsed cycle count is greater than the pause filter
    131 *	threshold, then the internal pause count is reloaded from the VMCB and
    132 *	execution continues. If the elapsed cycle count is less than the pause
    133 *	filter threshold, then the internal pause count is decremented. If the
    134 *	count value is less than zero and PAUSE intercept is enabled, a #VMEXIT
    135 *	is triggered. If advanced pause filtering is supported and the pause
    136 *	filter threshold field is set to zero, the filter will operate in the
    137 *	simpler, count-only mode.
    138 */
    139
    140static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
    141module_param(pause_filter_thresh, ushort, 0444);
    142
    143static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
    144module_param(pause_filter_count, ushort, 0444);
    145
    146/* Default doubles per-vcpu window every exit. */
    147static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
    148module_param(pause_filter_count_grow, ushort, 0444);
    149
    150/* Default resets per-vcpu window every exit to pause_filter_count. */
    151static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
    152module_param(pause_filter_count_shrink, ushort, 0444);
    153
    154/* Default is to compute the maximum so we can never overflow. */
    155static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
    156module_param(pause_filter_count_max, ushort, 0444);
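/*
 * Editor's note: illustrative sketch added to this listing, not part of
 * KVM.  It models the count-only pause filter behaviour described in the
 * comment above: VMRUN loads pause_filter_count into an internal counter,
 * each PAUSE executed by the guest decrements it, and hitting zero raises
 * a #VMEXIT when the PAUSE intercept is enabled.
 */
static inline bool pause_filter_sketch_should_exit(u16 *counter)
{
	/* This PAUSE exhausted the counter: signal #VMEXIT. */
	return --(*counter) == 0;
}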
    157
    158/*
    159 * Use nested page tables by default.  Note, NPT may get forced off by
    160 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
    161 */
    162bool npt_enabled = true;
    163module_param_named(npt, npt_enabled, bool, 0444);
    164
    165/* allow nested virtualization in KVM/SVM */
    166static int nested = true;
    167module_param(nested, int, S_IRUGO);
    168
    169/* enable/disable Next RIP Save */
    170static int nrips = true;
    171module_param(nrips, int, 0444);
    172
    173/* enable/disable Virtual VMLOAD VMSAVE */
    174static int vls = true;
    175module_param(vls, int, 0444);
    176
    177/* enable/disable Virtual GIF */
    178int vgif = true;
    179module_param(vgif, int, 0444);
    180
    181/* enable/disable LBR virtualization */
    182static int lbrv = true;
    183module_param(lbrv, int, 0444);
    184
    185static int tsc_scaling = true;
    186module_param(tsc_scaling, int, 0444);
    187
    188/*
    189 * enable / disable AVIC.  Because the defaults differ for APICv
    190 * support between VMX and SVM we cannot use module_param_named.
    191 */
    192static bool avic;
    193module_param(avic, bool, 0444);
    194
    195static bool force_avic;
    196module_param_unsafe(force_avic, bool, 0444);
    197
    198bool __read_mostly dump_invalid_vmcb;
    199module_param(dump_invalid_vmcb, bool, 0644);
    200
    201
    202bool intercept_smi = true;
    203module_param(intercept_smi, bool, 0444);
    204
    205
    206static bool svm_gp_erratum_intercept = true;
    207
    208static u8 rsm_ins_bytes[] = "\x0f\xaa";
    209
    210static unsigned long iopm_base;
    211
    212struct kvm_ldttss_desc {
    213	u16 limit0;
    214	u16 base0;
    215	unsigned base1:8, type:5, dpl:2, p:1;
    216	unsigned limit1:4, zero0:3, g:1, base2:8;
    217	u32 base3;
    218	u32 zero1;
    219} __attribute__((packed));
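/*
 * Editor's note (added to this listing): the packed structure above is the
 * 16-byte expanded system-segment descriptor used for the LDT/TSS in long
 * mode; the compile-time check below just makes that size explicit.
 */
static_assert(sizeof(struct kvm_ldttss_desc) == 16,
	      "LDT/TSS descriptors are 16 bytes in long mode");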
    220
    221DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
    222
    223/*
    224 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
    225 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
    226 *
    227 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
    228 * defer the restoration of TSC_AUX until the CPU returns to userspace.
    229 */
    230static int tsc_aux_uret_slot __read_mostly = -1;
    231
    232static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
    233
    234#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
    235#define MSRS_RANGE_SIZE 2048
    236#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
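/*
 * Editor's note (added to this listing): each 2048-byte MSRPM range uses
 * two bits per MSR (one read-intercept bit, one write-intercept bit), so a
 * single range covers 2048 * 8 / 2 = 8192 MSRs.
 */
static_assert(MSRS_IN_RANGE == 8192, "one MSRPM range covers 8192 MSRs");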
    237
    238u32 svm_msrpm_offset(u32 msr)
    239{
    240	u32 offset;
    241	int i;
    242
    243	for (i = 0; i < NUM_MSR_MAPS; i++) {
    244		if (msr < msrpm_ranges[i] ||
    245		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
    246			continue;
    247
    248		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
    249		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
    250
    251		/* Now we have the u8 offset - but need the u32 offset */
    252		return offset / 4;
    253	}
    254
    255	/* MSR not in any range */
    256	return MSR_INVALID;
    257}
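/*
 * Editor's note: worked example added to this listing.  For an MSR in the
 * first range, e.g. MSR_IA32_SYSENTER_CS (0x174):
 *
 *	byte offset in the MSRPM  = 0x174 / 4         = 93  (4 MSRs per byte)
 *	u32 offset (return value) = 93 / 4            = 23
 *	bit within that u32       = 2 * (0x174 & 0xf) = 8 for reads, 9 for writes
 *
 * which matches the bit arithmetic used by msr_write_intercepted() and
 * set_msr_interception_bitmap() below.
 */
static_assert((0x174 / 4) / 4 == 23, "MSRPM offset example for MSR 0x174");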
    258
    259static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
    260
    261static int get_npt_level(void)
    262{
    263#ifdef CONFIG_X86_64
    264	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
    265#else
    266	return PT32E_ROOT_LEVEL;
    267#endif
    268}
    269
    270int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
    271{
    272	struct vcpu_svm *svm = to_svm(vcpu);
    273	u64 old_efer = vcpu->arch.efer;
    274	vcpu->arch.efer = efer;
    275
    276	if (!npt_enabled) {
    277		/* Shadow paging assumes NX to be available.  */
    278		efer |= EFER_NX;
    279
    280		if (!(efer & EFER_LMA))
    281			efer &= ~EFER_LME;
    282	}
    283
    284	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
    285		if (!(efer & EFER_SVME)) {
    286			svm_leave_nested(vcpu);
    287			svm_set_gif(svm, true);
    288			/* #GP intercept is still needed for vmware backdoor */
    289			if (!enable_vmware_backdoor)
    290				clr_exception_intercept(svm, GP_VECTOR);
    291
    292			/*
    293			 * Free the nested guest state, unless we are in SMM.
    294			 * In this case we will return to the nested guest
    295			 * as soon as we leave SMM.
    296			 */
    297			if (!is_smm(vcpu))
    298				svm_free_nested(svm);
    299
    300		} else {
    301			int ret = svm_allocate_nested(svm);
    302
    303			if (ret) {
    304				vcpu->arch.efer = old_efer;
    305				return ret;
    306			}
    307
    308			/*
    309			 * Never intercept #GP for SEV guests, KVM can't
    310			 * decrypt guest memory to workaround the erratum.
    311			 */
    312			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
    313				set_exception_intercept(svm, GP_VECTOR);
    314		}
    315	}
    316
    317	svm->vmcb->save.efer = efer | EFER_SVME;
    318	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
    319	return 0;
    320}
    321
    322static int is_external_interrupt(u32 info)
    323{
    324	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
    325	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
    326}
    327
    328static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
    329{
    330	struct vcpu_svm *svm = to_svm(vcpu);
    331	u32 ret = 0;
    332
    333	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
    334		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
    335	return ret;
    336}
    337
    338static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
    339{
    340	struct vcpu_svm *svm = to_svm(vcpu);
    341
    342	if (mask == 0)
    343		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
    344	else
    345		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
    346
    347}
    348
    349static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
    350{
    351	struct vcpu_svm *svm = to_svm(vcpu);
    352
    353	/*
    354	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
    355	 * the type of exit and the #VC handler in the guest.
    356	 */
    357	if (sev_es_guest(vcpu->kvm))
    358		goto done;
    359
    360	if (nrips && svm->vmcb->control.next_rip != 0) {
    361		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
    362		svm->next_rip = svm->vmcb->control.next_rip;
    363	}
    364
    365	if (!svm->next_rip) {
    366		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
    367			return 0;
    368	} else {
    369		kvm_rip_write(vcpu, svm->next_rip);
    370	}
    371
    372done:
    373	svm_set_interrupt_shadow(vcpu, 0);
    374
    375	return 1;
    376}
    377
    378static void svm_queue_exception(struct kvm_vcpu *vcpu)
    379{
    380	struct vcpu_svm *svm = to_svm(vcpu);
    381	unsigned nr = vcpu->arch.exception.nr;
    382	bool has_error_code = vcpu->arch.exception.has_error_code;
    383	u32 error_code = vcpu->arch.exception.error_code;
    384
    385	kvm_deliver_exception_payload(vcpu);
    386
    387	if (nr == BP_VECTOR && !nrips) {
    388		unsigned long rip, old_rip = kvm_rip_read(vcpu);
    389
    390		/*
    391		 * For guest debugging where we have to reinject #BP if some
    392		 * INT3 is guest-owned:
    393		 * Emulate nRIP by moving RIP forward. Will fail if injection
    394		 * raises a fault that is not intercepted. Still better than
    395		 * failing in all cases.
    396		 */
    397		(void)svm_skip_emulated_instruction(vcpu);
    398		rip = kvm_rip_read(vcpu);
    399		svm->int3_rip = rip + svm->vmcb->save.cs.base;
    400		svm->int3_injected = rip - old_rip;
    401	}
    402
    403	svm->vmcb->control.event_inj = nr
    404		| SVM_EVTINJ_VALID
    405		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
    406		| SVM_EVTINJ_TYPE_EXEPT;
    407	svm->vmcb->control.event_inj_err = error_code;
    408}
    409
    410static void svm_init_erratum_383(void)
    411{
    412	u32 low, high;
    413	int err;
    414	u64 val;
    415
    416	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
    417		return;
    418
    419	/* Use _safe variants to not break nested virtualization */
    420	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
    421	if (err)
    422		return;
    423
    424	val |= (1ULL << 47);
    425
    426	low  = lower_32_bits(val);
    427	high = upper_32_bits(val);
    428
    429	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
    430
    431	erratum_383_found = true;
    432}
    433
    434static void svm_init_osvw(struct kvm_vcpu *vcpu)
    435{
    436	/*
    437	 * Guests should see errata 400 and 415 as fixed (assuming that
    438	 * HLT and IO instructions are intercepted).
    439	 */
    440	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
    441	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
    442
    443	/*
    444	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
    445	 * all osvw.status bits inside that length, including bit 0 (which is
    446	 * reserved for erratum 298), are valid. However, if the host processor's
    447	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
    448	 * be conservative here and therefore we tell the guest that erratum 298
    449	 * is present (because we really don't know).
    450	 */
    451	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
    452		vcpu->arch.osvw.status |= 1;
    453}
    454
    455static int has_svm(void)
    456{
    457	const char *msg;
    458
    459	if (!cpu_has_svm(&msg)) {
    460		printk(KERN_INFO "has_svm: %s\n", msg);
    461		return 0;
    462	}
    463
    464	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
    465		pr_info("KVM is unsupported when running as an SEV guest\n");
    466		return 0;
    467	}
    468
    469	return 1;
    470}
    471
    472void __svm_write_tsc_multiplier(u64 multiplier)
    473{
    474	preempt_disable();
    475
    476	if (multiplier == __this_cpu_read(current_tsc_ratio))
    477		goto out;
    478
    479	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
    480	__this_cpu_write(current_tsc_ratio, multiplier);
    481out:
    482	preempt_enable();
    483}
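/*
 * Editor's note: worked example added to this listing.  MSR_AMD64_TSC_RATIO
 * holds the guest/host TSC frequency ratio in 8.32 fixed point (assumption
 * based on the AMD APM), so the default ratio of 1.0 is 1ULL << 32
 * (SVM_TSC_RATIO_DEFAULT) and, for instance, a ratio of 1.5 would be
 * encoded as 0x180000000.
 */
static_assert(((3ULL << 32) / 2) == 0x180000000ULL,
	      "8.32 fixed point: a ratio of 1.5 is 0x180000000");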
    484
    485static void svm_hardware_disable(void)
    486{
    487	/* Make sure we clean up behind us */
    488	if (tsc_scaling)
    489		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
    490
    491	cpu_svm_disable();
    492
    493	amd_pmu_disable_virt();
    494}
    495
    496static int svm_hardware_enable(void)
    497{
    498
    499	struct svm_cpu_data *sd;
    500	uint64_t efer;
    501	struct desc_struct *gdt;
    502	int me = raw_smp_processor_id();
    503
    504	rdmsrl(MSR_EFER, efer);
    505	if (efer & EFER_SVME)
    506		return -EBUSY;
    507
    508	if (!has_svm()) {
    509		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
    510		return -EINVAL;
    511	}
    512	sd = per_cpu(svm_data, me);
    513	if (!sd) {
    514		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
    515		return -EINVAL;
    516	}
    517
    518	sd->asid_generation = 1;
    519	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
    520	sd->next_asid = sd->max_asid + 1;
    521	sd->min_asid = max_sev_asid + 1;
    522
    523	gdt = get_current_gdt_rw();
    524	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
    525
    526	wrmsrl(MSR_EFER, efer | EFER_SVME);
    527
    528	wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
    529
    530	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
    531		/*
    532		 * Set the default value, even if we don't use TSC scaling,
    533		 * to avoid having a stale value in the MSR.
    534		 */
    535		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
    536	}
    537
    538
    539	/*
    540	 * Get OSVW bits.
    541	 *
    542	 * Note that it is possible to have a system with mixed processor
    543	 * revisions and therefore different OSVW bits. If bits are not the same
    544	 * on different processors then choose the worst case (i.e. if erratum
    545	 * is present on one processor and not on another then assume that the
    546	 * erratum is present everywhere).
    547	 */
    548	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
    549		uint64_t len, status = 0;
    550		int err;
    551
    552		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
    553		if (!err)
    554			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
    555						      &err);
    556
    557		if (err)
    558			osvw_status = osvw_len = 0;
    559		else {
    560			if (len < osvw_len)
    561				osvw_len = len;
    562			osvw_status |= status;
    563			osvw_status &= (1ULL << osvw_len) - 1;
    564		}
    565	} else
    566		osvw_status = osvw_len = 0;
    567
    568	svm_init_erratum_383();
    569
    570	amd_pmu_enable_virt();
    571
    572	return 0;
    573}
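/*
 * Editor's note: worked example added to this listing for the "worst case"
 * OSVW merge above.  If one CPU reports len = 3, status = 0b010 and another
 * reports len = 2, status = 0b001, the merged view keeps osvw_len = 2 and
 * osvw_status = (0b010 | 0b001) masked to the common 2 bits = 0b11, i.e.
 * both errata within the common length are reported as present.
 */
static_assert(((0x2 | 0x1) & ((1ULL << 2) - 1)) == 0x3,
	      "OSVW worst-case merge example");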
    574
    575static void svm_cpu_uninit(int cpu)
    576{
    577	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
    578
    579	if (!sd)
    580		return;
    581
    582	per_cpu(svm_data, cpu) = NULL;
    583	kfree(sd->sev_vmcbs);
    584	__free_page(sd->save_area);
    585	kfree(sd);
    586}
    587
    588static int svm_cpu_init(int cpu)
    589{
    590	struct svm_cpu_data *sd;
    591	int ret = -ENOMEM;
    592
    593	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
    594	if (!sd)
    595		return ret;
    596	sd->cpu = cpu;
    597	sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
    598	if (!sd->save_area)
    599		goto free_cpu_data;
    600
    601	ret = sev_cpu_init(sd);
    602	if (ret)
    603		goto free_save_area;
    604
    605	per_cpu(svm_data, cpu) = sd;
    606
    607	return 0;
    608
    609free_save_area:
    610	__free_page(sd->save_area);
    611free_cpu_data:
    612	kfree(sd);
    613	return ret;
    614
    615}
    616
    617static int direct_access_msr_slot(u32 msr)
    618{
    619	u32 i;
    620
    621	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
    622		if (direct_access_msrs[i].index == msr)
    623			return i;
    624
    625	return -ENOENT;
    626}
    627
    628static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
    629				     int write)
    630{
    631	struct vcpu_svm *svm = to_svm(vcpu);
    632	int slot = direct_access_msr_slot(msr);
    633
    634	if (slot == -ENOENT)
    635		return;
    636
    637	/* Set the shadow bitmaps to the desired intercept states */
    638	if (read)
    639		set_bit(slot, svm->shadow_msr_intercept.read);
    640	else
    641		clear_bit(slot, svm->shadow_msr_intercept.read);
    642
    643	if (write)
    644		set_bit(slot, svm->shadow_msr_intercept.write);
    645	else
    646		clear_bit(slot, svm->shadow_msr_intercept.write);
    647}
    648
    649static bool valid_msr_intercept(u32 index)
    650{
    651	return direct_access_msr_slot(index) != -ENOENT;
    652}
    653
    654static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
    655{
    656	u8 bit_write;
    657	unsigned long tmp;
    658	u32 offset;
    659	u32 *msrpm;
    660
    661	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
    662				      to_svm(vcpu)->msrpm;
    663
    664	offset    = svm_msrpm_offset(msr);
    665	bit_write = 2 * (msr & 0x0f) + 1;
    666	tmp       = msrpm[offset];
    667
    668	BUG_ON(offset == MSR_INVALID);
    669
    670	return !!test_bit(bit_write,  &tmp);
    671}
    672
    673static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
    674					u32 msr, int read, int write)
    675{
    676	struct vcpu_svm *svm = to_svm(vcpu);
    677	u8 bit_read, bit_write;
    678	unsigned long tmp;
    679	u32 offset;
    680
    681	/*
    682	 * If this warning triggers, extend the direct_access_msrs list at the
    683	 * beginning of the file.
    684	 */
    685	WARN_ON(!valid_msr_intercept(msr));
    686
    687	/* MSRs that the userspace MSR filter does not allow must keep trapping */
    688	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
    689		read = 0;
    690
    691	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
    692		write = 0;
    693
    694	offset    = svm_msrpm_offset(msr);
    695	bit_read  = 2 * (msr & 0x0f);
    696	bit_write = 2 * (msr & 0x0f) + 1;
    697	tmp       = msrpm[offset];
    698
    699	BUG_ON(offset == MSR_INVALID);
    700
    701	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
    702	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
    703
    704	msrpm[offset] = tmp;
    705
    706	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
    707	svm->nested.force_msr_bitmap_recalc = true;
    708}
    709
    710void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
    711			  int read, int write)
    712{
    713	set_shadow_msr_intercept(vcpu, msr, read, write);
    714	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
    715}
    716
    717u32 *svm_vcpu_alloc_msrpm(void)
    718{
    719	unsigned int order = get_order(MSRPM_SIZE);
    720	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
    721	u32 *msrpm;
    722
    723	if (!pages)
    724		return NULL;
    725
    726	msrpm = page_address(pages);
    727	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
    728
    729	return msrpm;
    730}
    731
    732void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
    733{
    734	int i;
    735
    736	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
    737		if (!direct_access_msrs[i].always)
    738			continue;
    739		set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
    740	}
    741}
    742
    743
    744void svm_vcpu_free_msrpm(u32 *msrpm)
    745{
    746	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
    747}
    748
    749static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
    750{
    751	struct vcpu_svm *svm = to_svm(vcpu);
    752	u32 i;
    753
    754	/*
    755	 * Set intercept permissions for all direct access MSRs again. They
    756	 * will automatically get filtered through the MSR filter, so we are
    757	 * back in sync after this.
    758	 */
    759	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
    760		u32 msr = direct_access_msrs[i].index;
    761		u32 read = test_bit(i, svm->shadow_msr_intercept.read);
    762		u32 write = test_bit(i, svm->shadow_msr_intercept.write);
    763
    764		set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
    765	}
    766}
    767
    768static void add_msr_offset(u32 offset)
    769{
    770	int i;
    771
    772	for (i = 0; i < MSRPM_OFFSETS; ++i) {
    773
    774		/* Offset already in list? */
    775		if (msrpm_offsets[i] == offset)
    776			return;
    777
    778		/* Slot used by another offset? */
    779		if (msrpm_offsets[i] != MSR_INVALID)
    780			continue;
    781
    782		/* Add offset to list */
    783		msrpm_offsets[i] = offset;
    784
    785		return;
    786	}
    787
    788	/*
    789	 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
    790	 * increase MSRPM_OFFSETS in this case.
    791	 */
    792	BUG();
    793}
    794
    795static void init_msrpm_offsets(void)
    796{
    797	int i;
    798
    799	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
    800
    801	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
    802		u32 offset;
    803
    804		offset = svm_msrpm_offset(direct_access_msrs[i].index);
    805		BUG_ON(offset == MSR_INVALID);
    806
    807		add_msr_offset(offset);
    808	}
    809}
    810
    811void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
    812{
    813	to_vmcb->save.dbgctl		= from_vmcb->save.dbgctl;
    814	to_vmcb->save.br_from		= from_vmcb->save.br_from;
    815	to_vmcb->save.br_to		= from_vmcb->save.br_to;
    816	to_vmcb->save.last_excp_from	= from_vmcb->save.last_excp_from;
    817	to_vmcb->save.last_excp_to	= from_vmcb->save.last_excp_to;
    818
    819	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
    820}
    821
    822static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
    823{
    824	struct vcpu_svm *svm = to_svm(vcpu);
    825
    826	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
    827	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
    828	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
    829	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
    830	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
    831
    832	/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
    833	if (is_guest_mode(vcpu))
    834		svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
    835}
    836
    837static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
    838{
    839	struct vcpu_svm *svm = to_svm(vcpu);
    840
    841	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
    842	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
    843	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
    844	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
    845	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
    846
    847	/*
    848	 * Move the LBR msrs back to the vmcb01 to avoid copying them
    849	 * on nested guest entries.
    850	 */
    851	if (is_guest_mode(vcpu))
    852		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
    853}
    854
    855static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
    856{
    857	/*
    858	 * If the LBR virtualization is disabled, the LBR msrs are always
    859	 * kept in the vmcb01 to avoid copying them on nested guest entries.
    860	 *
    861	 * If nested, and the LBR virtualization is enabled/disabled, the msrs
    862	 * are moved between the vmcb01 and vmcb02 as needed.
    863	 */
    864	struct vmcb *vmcb =
    865		(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
    866			svm->vmcb : svm->vmcb01.ptr;
    867
    868	switch (index) {
    869	case MSR_IA32_DEBUGCTLMSR:
    870		return vmcb->save.dbgctl;
    871	case MSR_IA32_LASTBRANCHFROMIP:
    872		return vmcb->save.br_from;
    873	case MSR_IA32_LASTBRANCHTOIP:
    874		return vmcb->save.br_to;
    875	case MSR_IA32_LASTINTFROMIP:
    876		return vmcb->save.last_excp_from;
    877	case MSR_IA32_LASTINTTOIP:
    878		return vmcb->save.last_excp_to;
    879	default:
    880		KVM_BUG(false, svm->vcpu.kvm,
    881			"%s: Unknown MSR 0x%x", __func__, index);
    882		return 0;
    883	}
    884}
    885
    886void svm_update_lbrv(struct kvm_vcpu *vcpu)
    887{
    888	struct vcpu_svm *svm = to_svm(vcpu);
    889
    890	bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
    891					   DEBUGCTLMSR_LBR;
    892
    893	bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
    894				      LBR_CTL_ENABLE_MASK);
    895
    896	if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
    897		if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
    898			enable_lbrv = true;
    899
    900	if (enable_lbrv == current_enable_lbrv)
    901		return;
    902
    903	if (enable_lbrv)
    904		svm_enable_lbrv(vcpu);
    905	else
    906		svm_disable_lbrv(vcpu);
    907}
    908
    909void disable_nmi_singlestep(struct vcpu_svm *svm)
    910{
    911	svm->nmi_singlestep = false;
    912
    913	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
    914		/* Clear our flags if they were not set by the guest */
    915		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
    916			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
    917		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
    918			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
    919	}
    920}
    921
    922static void grow_ple_window(struct kvm_vcpu *vcpu)
    923{
    924	struct vcpu_svm *svm = to_svm(vcpu);
    925	struct vmcb_control_area *control = &svm->vmcb->control;
    926	int old = control->pause_filter_count;
    927
    928	if (kvm_pause_in_guest(vcpu->kvm))
    929		return;
    930
    931	control->pause_filter_count = __grow_ple_window(old,
    932							pause_filter_count,
    933							pause_filter_count_grow,
    934							pause_filter_count_max);
    935
    936	if (control->pause_filter_count != old) {
    937		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
    938		trace_kvm_ple_window_update(vcpu->vcpu_id,
    939					    control->pause_filter_count, old);
    940	}
    941}
    942
    943static void shrink_ple_window(struct kvm_vcpu *vcpu)
    944{
    945	struct vcpu_svm *svm = to_svm(vcpu);
    946	struct vmcb_control_area *control = &svm->vmcb->control;
    947	int old = control->pause_filter_count;
    948
    949	if (kvm_pause_in_guest(vcpu->kvm))
    950		return;
    951
    952	control->pause_filter_count =
    953				__shrink_ple_window(old,
    954						    pause_filter_count,
    955						    pause_filter_count_shrink,
    956						    pause_filter_count);
    957	if (control->pause_filter_count != old) {
    958		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
    959		trace_kvm_ple_window_update(vcpu->vcpu_id,
    960					    control->pause_filter_count, old);
    961	}
    962}
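/*
 * Editor's note: illustrative sketch added to this listing, not the real
 * __grow_ple_window()/__shrink_ple_window() helpers (those live in common
 * KVM code).  It only shows the policy that the module parameters above
 * configure: grow the per-vCPU pause filter count by a factor, clamped to
 * a maximum, and shrink it by a factor, but never below the base value.
 */
static inline unsigned int ple_window_sketch_grow(unsigned int val,
						  unsigned int factor,
						  unsigned int cap)
{
	/* Multiplicative growth, clamped to the configured maximum. */
	return min(val * factor, cap);
}

static inline unsigned int ple_window_sketch_shrink(unsigned int val,
						    unsigned int factor,
						    unsigned int base)
{
	/* Divisive shrink; a zero factor resets straight to the base. */
	return factor ? max(val / factor, base) : base;
}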
    963
    964static void svm_hardware_unsetup(void)
    965{
    966	int cpu;
    967
    968	sev_hardware_unsetup();
    969
    970	for_each_possible_cpu(cpu)
    971		svm_cpu_uninit(cpu);
    972
    973	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
    974		     get_order(IOPM_SIZE));
    975	iopm_base = 0;
    976}
    977
    978static void init_seg(struct vmcb_seg *seg)
    979{
    980	seg->selector = 0;
    981	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
    982		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
    983	seg->limit = 0xffff;
    984	seg->base = 0;
    985}
    986
    987static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
    988{
    989	seg->selector = 0;
    990	seg->attrib = SVM_SELECTOR_P_MASK | type;
    991	seg->limit = 0xffff;
    992	seg->base = 0;
    993}
    994
    995static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
    996{
    997	struct vcpu_svm *svm = to_svm(vcpu);
    998
    999	return svm->nested.ctl.tsc_offset;
   1000}
   1001
   1002static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
   1003{
   1004	struct vcpu_svm *svm = to_svm(vcpu);
   1005
   1006	return svm->tsc_ratio_msr;
   1007}
   1008
   1009static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
   1010{
   1011	struct vcpu_svm *svm = to_svm(vcpu);
   1012
   1013	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
   1014	svm->vmcb->control.tsc_offset = offset;
   1015	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
   1016}
   1017
   1018static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
   1019{
   1020	__svm_write_tsc_multiplier(multiplier);
   1021}
   1022
   1023
   1024/* Evaluate instruction intercepts that depend on guest CPUID features. */
   1025static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
   1026					      struct vcpu_svm *svm)
   1027{
   1028	/*
   1029	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
   1030	 * roots, or if INVPCID is disabled in the guest to inject #UD.
   1031	 */
   1032	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
   1033		if (!npt_enabled ||
   1034		    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
   1035			svm_set_intercept(svm, INTERCEPT_INVPCID);
   1036		else
   1037			svm_clr_intercept(svm, INTERCEPT_INVPCID);
   1038	}
   1039
   1040	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
   1041		if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
   1042			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
   1043		else
   1044			svm_set_intercept(svm, INTERCEPT_RDTSCP);
   1045	}
   1046}
   1047
   1048static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
   1049{
   1050	struct vcpu_svm *svm = to_svm(vcpu);
   1051
   1052	if (guest_cpuid_is_intel(vcpu)) {
   1053		/*
   1054		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
   1055		 * accesses because the processor only stores 32 bits.
   1056		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
   1057		 */
   1058		svm_set_intercept(svm, INTERCEPT_VMLOAD);
   1059		svm_set_intercept(svm, INTERCEPT_VMSAVE);
   1060		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
   1061
   1062		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
   1063		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
   1064
   1065		svm->v_vmload_vmsave_enabled = false;
   1066	} else {
   1067		/*
   1068		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
   1069		 * in VMCB and clear intercepts to avoid #VMEXIT.
   1070		 */
   1071		if (vls) {
   1072			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
   1073			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
   1074			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
   1075		}
   1076		/* No need to intercept these MSRs */
   1077		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
   1078		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
   1079	}
   1080}
   1081
   1082static void init_vmcb(struct kvm_vcpu *vcpu)
   1083{
   1084	struct vcpu_svm *svm = to_svm(vcpu);
   1085	struct vmcb *vmcb = svm->vmcb01.ptr;
   1086	struct vmcb_control_area *control = &vmcb->control;
   1087	struct vmcb_save_area *save = &vmcb->save;
   1088
   1089	svm_set_intercept(svm, INTERCEPT_CR0_READ);
   1090	svm_set_intercept(svm, INTERCEPT_CR3_READ);
   1091	svm_set_intercept(svm, INTERCEPT_CR4_READ);
   1092	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
   1093	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
   1094	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
   1095	if (!kvm_vcpu_apicv_active(vcpu))
   1096		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
   1097
   1098	set_dr_intercepts(svm);
   1099
   1100	set_exception_intercept(svm, PF_VECTOR);
   1101	set_exception_intercept(svm, UD_VECTOR);
   1102	set_exception_intercept(svm, MC_VECTOR);
   1103	set_exception_intercept(svm, AC_VECTOR);
   1104	set_exception_intercept(svm, DB_VECTOR);
   1105	/*
   1106	 * Guest access to VMware backdoor ports could legitimately
   1107	 * trigger #GP because of TSS I/O permission bitmap.
   1108	 * We intercept those #GP and allow access to them anyway
   1109	 * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
   1110	 * decrypt guest memory to decode the faulting instruction.
   1111	 */
   1112	if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
   1113		set_exception_intercept(svm, GP_VECTOR);
   1114
   1115	svm_set_intercept(svm, INTERCEPT_INTR);
   1116	svm_set_intercept(svm, INTERCEPT_NMI);
   1117
   1118	if (intercept_smi)
   1119		svm_set_intercept(svm, INTERCEPT_SMI);
   1120
   1121	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
   1122	svm_set_intercept(svm, INTERCEPT_RDPMC);
   1123	svm_set_intercept(svm, INTERCEPT_CPUID);
   1124	svm_set_intercept(svm, INTERCEPT_INVD);
   1125	svm_set_intercept(svm, INTERCEPT_INVLPG);
   1126	svm_set_intercept(svm, INTERCEPT_INVLPGA);
   1127	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
   1128	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
   1129	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
   1130	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
   1131	svm_set_intercept(svm, INTERCEPT_VMRUN);
   1132	svm_set_intercept(svm, INTERCEPT_VMMCALL);
   1133	svm_set_intercept(svm, INTERCEPT_VMLOAD);
   1134	svm_set_intercept(svm, INTERCEPT_VMSAVE);
   1135	svm_set_intercept(svm, INTERCEPT_STGI);
   1136	svm_set_intercept(svm, INTERCEPT_CLGI);
   1137	svm_set_intercept(svm, INTERCEPT_SKINIT);
   1138	svm_set_intercept(svm, INTERCEPT_WBINVD);
   1139	svm_set_intercept(svm, INTERCEPT_XSETBV);
   1140	svm_set_intercept(svm, INTERCEPT_RDPRU);
   1141	svm_set_intercept(svm, INTERCEPT_RSM);
   1142
   1143	if (!kvm_mwait_in_guest(vcpu->kvm)) {
   1144		svm_set_intercept(svm, INTERCEPT_MONITOR);
   1145		svm_set_intercept(svm, INTERCEPT_MWAIT);
   1146	}
   1147
   1148	if (!kvm_hlt_in_guest(vcpu->kvm))
   1149		svm_set_intercept(svm, INTERCEPT_HLT);
   1150
   1151	control->iopm_base_pa = __sme_set(iopm_base);
   1152	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
   1153	control->int_ctl = V_INTR_MASKING_MASK;
   1154
   1155	init_seg(&save->es);
   1156	init_seg(&save->ss);
   1157	init_seg(&save->ds);
   1158	init_seg(&save->fs);
   1159	init_seg(&save->gs);
   1160
   1161	save->cs.selector = 0xf000;
   1162	save->cs.base = 0xffff0000;
   1163	/* Executable/Readable Code Segment */
   1164	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
   1165		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
   1166	save->cs.limit = 0xffff;
   1167
   1168	save->gdtr.base = 0;
   1169	save->gdtr.limit = 0xffff;
   1170	save->idtr.base = 0;
   1171	save->idtr.limit = 0xffff;
   1172
   1173	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
   1174	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
   1175
   1176	if (npt_enabled) {
   1177		/* Setup VMCB for Nested Paging */
   1178		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
   1179		svm_clr_intercept(svm, INTERCEPT_INVLPG);
   1180		clr_exception_intercept(svm, PF_VECTOR);
   1181		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
   1182		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
   1183		save->g_pat = vcpu->arch.pat;
   1184		save->cr3 = 0;
   1185	}
   1186	svm->current_vmcb->asid_generation = 0;
   1187	svm->asid = 0;
   1188
   1189	svm->nested.vmcb12_gpa = INVALID_GPA;
   1190	svm->nested.last_vmcb12_gpa = INVALID_GPA;
   1191
   1192	if (!kvm_pause_in_guest(vcpu->kvm)) {
   1193		control->pause_filter_count = pause_filter_count;
   1194		if (pause_filter_thresh)
   1195			control->pause_filter_thresh = pause_filter_thresh;
   1196		svm_set_intercept(svm, INTERCEPT_PAUSE);
   1197	} else {
   1198		svm_clr_intercept(svm, INTERCEPT_PAUSE);
   1199	}
   1200
   1201	svm_recalc_instruction_intercepts(vcpu, svm);
   1202
   1203	/*
   1204	 * If the host supports V_SPEC_CTRL then disable the interception
   1205	 * of MSR_IA32_SPEC_CTRL.
   1206	 */
   1207	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
   1208		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
   1209
   1210	if (kvm_vcpu_apicv_active(vcpu))
   1211		avic_init_vmcb(svm, vmcb);
   1212
   1213	if (vgif) {
   1214		svm_clr_intercept(svm, INTERCEPT_STGI);
   1215		svm_clr_intercept(svm, INTERCEPT_CLGI);
   1216		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
   1217	}
   1218
   1219	if (sev_guest(vcpu->kvm))
   1220		sev_init_vmcb(svm);
   1221
   1222	svm_hv_init_vmcb(vmcb);
   1223	init_vmcb_after_set_cpuid(vcpu);
   1224
   1225	vmcb_mark_all_dirty(vmcb);
   1226
   1227	enable_gif(svm);
   1228}
   1229
   1230static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
   1231{
   1232	struct vcpu_svm *svm = to_svm(vcpu);
   1233
   1234	svm_vcpu_init_msrpm(vcpu, svm->msrpm);
   1235
   1236	svm_init_osvw(vcpu);
   1237	vcpu->arch.microcode_version = 0x01000065;
   1238	svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;
   1239
   1240	if (sev_es_guest(vcpu->kvm))
   1241		sev_es_vcpu_reset(svm);
   1242}
   1243
   1244static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
   1245{
   1246	struct vcpu_svm *svm = to_svm(vcpu);
   1247
   1248	svm->spec_ctrl = 0;
   1249	svm->virt_spec_ctrl = 0;
   1250
   1251	if (init_event)
   1252		sev_snp_init_protected_guest_state(vcpu);
   1253
   1254	init_vmcb(vcpu);
   1255
   1256	if (!init_event)
   1257		__svm_vcpu_reset(vcpu);
   1258}
   1259
   1260void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
   1261{
   1262	svm->current_vmcb = target_vmcb;
   1263	svm->vmcb = target_vmcb->ptr;
   1264}
   1265
   1266static int svm_vcpu_create(struct kvm_vcpu *vcpu)
   1267{
   1268	struct vcpu_svm *svm;
   1269	struct page *vmcb01_page;
   1270	struct page *vmsa_page = NULL;
   1271	int err;
   1272
   1273	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
   1274	svm = to_svm(vcpu);
   1275
   1276	err = -ENOMEM;
   1277	vmcb01_page = snp_safe_alloc_page(vcpu);
   1278	if (!vmcb01_page)
   1279		goto out;
   1280
   1281	if (sev_es_guest(vcpu->kvm)) {
   1282		/*
   1283		 * SEV-ES guests require a separate VMSA page used to contain
   1284		 * the encrypted register state of the guest.
   1285		 */
   1286		vmsa_page = snp_safe_alloc_page(vcpu);
   1287		if (!vmsa_page)
   1288			goto error_free_vmcb_page;
   1289
   1290		/*
   1291		 * SEV-ES guests maintain an encrypted version of their FPU
   1292		 * state which is restored and saved on VMRUN and VMEXIT.
   1293		 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
   1294		 * do xsave/xrstor on it.
   1295		 */
   1296		fpstate_set_confidential(&vcpu->arch.guest_fpu);
   1297	}
   1298
   1299	err = avic_init_vcpu(svm);
   1300	if (err)
   1301		goto error_free_vmsa_page;
   1302
   1303	svm->msrpm = svm_vcpu_alloc_msrpm();
   1304	if (!svm->msrpm) {
   1305		err = -ENOMEM;
   1306		goto error_free_vmsa_page;
   1307	}
   1308
   1309	svm->vmcb01.ptr = page_address(vmcb01_page);
   1310	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
   1311	svm_switch_vmcb(svm, &svm->vmcb01);
   1312
   1313	if (vmsa_page) {
   1314		svm->sev_es.vmsa = page_address(vmsa_page);
   1315
   1316		/*
   1317		 * Do not include the encryption mask on the VMSA physical
   1318		 * address since hardware will access it using the guest key.
   1319		 */
   1320		svm->sev_es.vmsa_pa = __pa(svm->sev_es.vmsa);
   1321	}
   1322
   1323	svm->guest_state_loaded = false;
   1324
   1325	return 0;
   1326
   1327error_free_vmsa_page:
   1328	if (vmsa_page)
   1329		__free_page(vmsa_page);
   1330error_free_vmcb_page:
   1331	__free_page(vmcb01_page);
   1332out:
   1333	return err;
   1334}
   1335
   1336static void svm_clear_current_vmcb(struct vmcb *vmcb)
   1337{
   1338	int i;
   1339
   1340	for_each_online_cpu(i)
   1341		cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
   1342}
   1343
   1344static void svm_vcpu_free(struct kvm_vcpu *vcpu)
   1345{
   1346	struct vcpu_svm *svm = to_svm(vcpu);
   1347
   1348	/*
   1349	 * The vmcb page can be recycled, causing a false negative in
   1350	 * svm_vcpu_load(). So, ensure that no logical CPU has this
   1351	 * vmcb page recorded as its current vmcb.
   1352	 */
   1353	svm_clear_current_vmcb(svm->vmcb);
   1354
   1355	svm_free_nested(svm);
   1356
   1357	sev_free_vcpu(vcpu);
   1358
   1359	__free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
   1360	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
   1361}
   1362
   1363static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
   1364{
   1365	struct vcpu_svm *svm = to_svm(vcpu);
   1366	struct svm_cpu_data *sd;
   1367
   1368	if (sev_es_guest(vcpu->kvm))
   1369		sev_es_unmap_ghcb(svm);
   1370
   1371	if (svm->guest_state_loaded)
   1372		return;
   1373
   1374	/* sev_es_unmap_ghcb() can resched, so grab per-cpu pointer afterward. */
   1375	barrier();
   1376	sd = per_cpu(svm_data, vcpu->cpu);
   1377
   1378	/*
   1379	 * Save additional host state that will be restored on VMEXIT (sev-es)
   1380	 * or subsequent vmload of host save area.
   1381	 */
   1382	vmsave(__sme_page_pa(sd->save_area));
   1383	if (sev_es_guest(vcpu->kvm)) {
   1384		struct sev_es_save_area *hostsa;
   1385		hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
   1386
   1387		sev_es_prepare_switch_to_guest(hostsa);
   1388	}
   1389
   1390	if (tsc_scaling)
   1391		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
   1392
   1393	if (likely(tsc_aux_uret_slot >= 0))
   1394		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
   1395
   1396	svm->guest_state_loaded = true;
   1397}
   1398
   1399static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
   1400{
   1401	to_svm(vcpu)->guest_state_loaded = false;
   1402}
   1403
   1404static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
   1405{
   1406	struct vcpu_svm *svm = to_svm(vcpu);
   1407	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
   1408
   1409	if (sd->current_vmcb != svm->vmcb) {
   1410		sd->current_vmcb = svm->vmcb;
   1411		indirect_branch_prediction_barrier();
   1412	}
   1413	if (kvm_vcpu_apicv_active(vcpu))
   1414		avic_vcpu_load(vcpu, cpu);
   1415}
   1416
   1417static void svm_vcpu_put(struct kvm_vcpu *vcpu)
   1418{
   1419	if (kvm_vcpu_apicv_active(vcpu))
   1420		avic_vcpu_put(vcpu);
   1421
   1422	svm_prepare_host_switch(vcpu);
   1423
   1424	++vcpu->stat.host_state_reload;
   1425}
   1426
   1427static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
   1428{
   1429	struct vcpu_svm *svm = to_svm(vcpu);
   1430	unsigned long rflags = svm->vmcb->save.rflags;
   1431
   1432	if (svm->nmi_singlestep) {
   1433		/* Hide our flags if they were not set by the guest */
   1434		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
   1435			rflags &= ~X86_EFLAGS_TF;
   1436		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
   1437			rflags &= ~X86_EFLAGS_RF;
   1438	}
   1439	return rflags;
   1440}
   1441
   1442static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
   1443{
   1444	if (to_svm(vcpu)->nmi_singlestep)
   1445		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
   1446
   1447	/*
   1448	 * Any change of EFLAGS.VM is accompanied by a reload of SS
   1449	 * (caused by either a task switch or an inter-privilege IRET),
   1450	 * so we do not need to update the CPL here.
   1451	 */
   1452	to_svm(vcpu)->vmcb->save.rflags = rflags;
   1453}
   1454
   1455static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
   1456{
   1457	struct vmcb *vmcb = to_svm(vcpu)->vmcb;
   1458
   1459	return sev_es_guest(vcpu->kvm)
   1460		? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
   1461		: kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
   1462}
   1463
   1464static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
   1465{
   1466	kvm_register_mark_available(vcpu, reg);
   1467
   1468	switch (reg) {
   1469	case VCPU_EXREG_PDPTR:
   1470		/*
   1471		 * When !npt_enabled, mmu->pdptrs[] is already available since
   1472		 * it is always updated per SDM when moving to CRs.
   1473		 */
   1474		if (npt_enabled)
   1475			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
   1476		break;
   1477	default:
   1478		KVM_BUG_ON(1, vcpu->kvm);
   1479	}
   1480}
   1481
   1482static void svm_set_vintr(struct vcpu_svm *svm)
   1483{
   1484	struct vmcb_control_area *control;
   1485
   1486	/*
   1487	 * The following fields are ignored when AVIC is enabled
   1488	 */
   1489	WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
   1490
   1491	svm_set_intercept(svm, INTERCEPT_VINTR);
   1492
   1493	/*
   1494	 * This is just a dummy VINTR to actually cause a vmexit to happen.
   1495	 * Actual injection of virtual interrupts happens through EVENTINJ.
   1496	 */
   1497	control = &svm->vmcb->control;
   1498	control->int_vector = 0x0;
   1499	control->int_ctl &= ~V_INTR_PRIO_MASK;
   1500	control->int_ctl |= V_IRQ_MASK |
   1501		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
   1502	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
   1503}
   1504
   1505static void svm_clear_vintr(struct vcpu_svm *svm)
   1506{
   1507	svm_clr_intercept(svm, INTERCEPT_VINTR);
   1508
   1509	/* Drop int_ctl fields related to VINTR injection.  */
   1510	svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
   1511	if (is_guest_mode(&svm->vcpu)) {
   1512		svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
   1513
   1514		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
   1515			(svm->nested.ctl.int_ctl & V_TPR_MASK));
   1516
   1517		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
   1518			V_IRQ_INJECTION_BITS_MASK;
   1519
   1520		svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
   1521	}
   1522
   1523	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
   1524}
   1525
   1526static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
   1527{
   1528	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
   1529	struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
   1530
   1531	switch (seg) {
   1532	case VCPU_SREG_CS: return &save->cs;
   1533	case VCPU_SREG_DS: return &save->ds;
   1534	case VCPU_SREG_ES: return &save->es;
   1535	case VCPU_SREG_FS: return &save01->fs;
   1536	case VCPU_SREG_GS: return &save01->gs;
   1537	case VCPU_SREG_SS: return &save->ss;
   1538	case VCPU_SREG_TR: return &save01->tr;
   1539	case VCPU_SREG_LDTR: return &save01->ldtr;
   1540	}
   1541	BUG();
   1542	return NULL;
   1543}
   1544
   1545static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
   1546{
   1547	struct vmcb_seg *s = svm_seg(vcpu, seg);
   1548
   1549	return s->base;
   1550}
   1551
   1552static void svm_get_segment(struct kvm_vcpu *vcpu,
   1553			    struct kvm_segment *var, int seg)
   1554{
   1555	struct vmcb_seg *s = svm_seg(vcpu, seg);
   1556
   1557	var->base = s->base;
   1558	var->limit = s->limit;
   1559	var->selector = s->selector;
   1560	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
   1561	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
   1562	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
   1563	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
   1564	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
   1565	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
   1566	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
   1567
   1568	/*
   1569	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
   1570	 * However, the SVM spec states that the G bit is not observed by the
   1571	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
   1572	 * So let's synthesize a legal G bit for all segments; this helps
   1573	 * running KVM nested. It also helps cross-vendor migration, because
   1574	 * Intel's vmentry has a check on the 'G' bit.
   1575	 */
   1576	var->g = s->limit > 0xfffff;
   1577
   1578	/*
   1579	 * AMD's VMCB does not have an explicit unusable field, so emulate it
   1580	 * for cross vendor migration purposes by "not present"
   1581	 */
   1582	var->unusable = !var->present;
   1583
   1584	switch (seg) {
   1585	case VCPU_SREG_TR:
   1586		/*
   1587		 * Work around a bug where the busy flag in the tr selector
   1588		 * isn't exposed
   1589		 */
   1590		var->type |= 0x2;
   1591		break;
   1592	case VCPU_SREG_DS:
   1593	case VCPU_SREG_ES:
   1594	case VCPU_SREG_FS:
   1595	case VCPU_SREG_GS:
   1596		/*
   1597		 * The accessed bit must always be set in the segment
   1598		 * descriptor cache: although it can be cleared in the
   1599		 * descriptor itself, the cached bit always remains set. Since
   1600		 * Intel has a check on this, set it here to support
   1601		 * cross-vendor migration.
   1602		 */
   1603		if (!var->unusable)
   1604			var->type |= 0x1;
   1605		break;
   1606	case VCPU_SREG_SS:
   1607		/*
   1608		 * On AMD CPUs sometimes the DB bit in the segment
   1609		 * descriptor is left as 1, although the whole segment has
   1610		 * been made unusable. Clear it here to pass an Intel VMX
   1611		 * entry check when cross vendor migrating.
   1612		 */
   1613		if (var->unusable)
   1614			var->db = 0;
   1615		/* This is symmetric with svm_set_segment() */
   1616		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
   1617		break;
   1618	}
   1619}
   1620
   1621static int svm_get_cpl(struct kvm_vcpu *vcpu)
   1622{
   1623	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
   1624
   1625	return save->cpl;
   1626}
   1627
   1628static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
   1629{
   1630	struct kvm_segment cs;
   1631
   1632	svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
   1633	*db = cs.db;
   1634	*l = cs.l;
   1635}
   1636
   1637static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
   1638{
   1639	struct vcpu_svm *svm = to_svm(vcpu);
   1640
   1641	dt->size = svm->vmcb->save.idtr.limit;
   1642	dt->address = svm->vmcb->save.idtr.base;
   1643}
   1644
   1645static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
   1646{
   1647	struct vcpu_svm *svm = to_svm(vcpu);
   1648
   1649	svm->vmcb->save.idtr.limit = dt->size;
   1650	svm->vmcb->save.idtr.base = dt->address;
   1651	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
   1652}
   1653
   1654static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
   1655{
   1656	struct vcpu_svm *svm = to_svm(vcpu);
   1657
   1658	dt->size = svm->vmcb->save.gdtr.limit;
   1659	dt->address = svm->vmcb->save.gdtr.base;
   1660}
   1661
   1662static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
   1663{
   1664	struct vcpu_svm *svm = to_svm(vcpu);
   1665
   1666	svm->vmcb->save.gdtr.limit = dt->size;
   1667	svm->vmcb->save.gdtr.base = dt->address;
   1668	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
   1669}
   1670
   1671static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
   1672{
   1673	struct vcpu_svm *svm = to_svm(vcpu);
   1674
   1675	/*
   1676	 * For guests that don't set guest_state_protected, the cr3 update is
   1677	 * handled via kvm_mmu_load() while entering the guest. For guests
   1678	 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
   1679	 * VMCB save area now, since the save area will become the initial
   1680	 * contents of the VMSA, and future VMCB save area updates won't be
   1681	 * seen.
   1682	 */
   1683	if (sev_es_guest(vcpu->kvm)) {
   1684		svm->vmcb->save.cr3 = cr3;
   1685		vmcb_mark_dirty(svm->vmcb, VMCB_CR);
   1686	}
   1687}
   1688
   1689void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
   1690{
   1691	struct vcpu_svm *svm = to_svm(vcpu);
   1692	u64 hcr0 = cr0;
   1693	bool old_paging = is_paging(vcpu);
   1694
   1695#ifdef CONFIG_X86_64
   1696	if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
   1697		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
   1698			vcpu->arch.efer |= EFER_LMA;
   1699			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
   1700		}
   1701
   1702		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
   1703			vcpu->arch.efer &= ~EFER_LMA;
   1704			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
   1705		}
   1706	}
   1707#endif
   1708	vcpu->arch.cr0 = cr0;
   1709
   1710	if (!npt_enabled) {
   1711		hcr0 |= X86_CR0_PG | X86_CR0_WP;
   1712		if (old_paging != is_paging(vcpu))
   1713			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
   1714	}
   1715
   1716	/*
   1717	 * re-enable caching here because the QEMU bios
   1718	 * does not do it - this results in some delay at
   1719	 * reboot
   1720	 */
   1721	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
   1722		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
   1723
   1724	svm->vmcb->save.cr0 = hcr0;
   1725	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
   1726
   1727	/*
   1728	 * SEV-ES guests must always keep the CR intercepts cleared. CR
   1729	 * tracking is done using the CR write traps.
   1730	 */
   1731	if (sev_es_guest(vcpu->kvm))
   1732		return;
   1733
   1734	if (hcr0 == cr0) {
   1735		/* Selective CR0 write remains on.  */
   1736		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
   1737		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
   1738	} else {
   1739		svm_set_intercept(svm, INTERCEPT_CR0_READ);
   1740		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
   1741	}
   1742}
   1743
   1744static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
   1745{
   1746	return true;
   1747}
   1748
   1749void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
   1750{
   1751	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
   1752	unsigned long old_cr4 = vcpu->arch.cr4;
   1753
   1754	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
   1755		svm_flush_tlb_current(vcpu);
   1756
   1757	vcpu->arch.cr4 = cr4;
   1758	if (!npt_enabled) {
   1759		cr4 |= X86_CR4_PAE;
   1760
   1761		if (!is_paging(vcpu))
   1762			cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
   1763	}
   1764	cr4 |= host_cr4_mce;
   1765	to_svm(vcpu)->vmcb->save.cr4 = cr4;
   1766	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
   1767
   1768	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
   1769		kvm_update_cpuid_runtime(vcpu);
   1770}
   1771
   1772static void svm_set_segment(struct kvm_vcpu *vcpu,
   1773			    struct kvm_segment *var, int seg)
   1774{
   1775	struct vcpu_svm *svm = to_svm(vcpu);
   1776	struct vmcb_seg *s = svm_seg(vcpu, seg);
   1777
   1778	s->base = var->base;
   1779	s->limit = var->limit;
   1780	s->selector = var->selector;
   1781	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
   1782	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
   1783	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
   1784	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
   1785	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
   1786	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
   1787	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
   1788	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
   1789
   1790	/*
   1791	 * This is always accurate, except if SYSRET returned to a segment
   1792	 * with SS.DPL != 3.  Intel does not have this quirk, and always
   1793	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
   1794	 * would entail passing the CPL to userspace and back.
   1795	 */
   1796	if (seg == VCPU_SREG_SS)
   1797		/* This is symmetric with svm_get_segment() */
   1798		svm->vmcb->save.cpl = (var->dpl & 3);
   1799
   1800	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
   1801}
   1802
   1803static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
   1804{
   1805	struct vcpu_svm *svm = to_svm(vcpu);
   1806
   1807	clr_exception_intercept(svm, BP_VECTOR);
   1808
   1809	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
   1810		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
   1811			set_exception_intercept(svm, BP_VECTOR);
   1812	}
   1813}
   1814
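        /*
         * Hand out a new ASID for this vCPU. When the per-CPU pool is
         * exhausted, start a new generation and request a flush of all
         * ASIDs on the next VMRUN.
         */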
   1815static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
   1816{
   1817	if (sd->next_asid > sd->max_asid) {
   1818		++sd->asid_generation;
   1819		sd->next_asid = sd->min_asid;
   1820		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
   1821		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
   1822	}
   1823
   1824	svm->current_vmcb->asid_generation = sd->asid_generation;
   1825	svm->asid = sd->next_asid++;
   1826}
   1827
   1828static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
   1829{
   1830	struct vmcb *vmcb = svm->vmcb;
   1831
   1832	if (svm->vcpu.arch.guest_state_protected)
   1833		return;
   1834
   1835	if (unlikely(value != vmcb->save.dr6)) {
   1836		vmcb->save.dr6 = value;
   1837		vmcb_mark_dirty(vmcb, VMCB_DR);
   1838	}
   1839}
   1840
   1841static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
   1842{
   1843	struct vcpu_svm *svm = to_svm(vcpu);
   1844
   1845	if (vcpu->arch.guest_state_protected)
   1846		return;
   1847
   1848	get_debugreg(vcpu->arch.db[0], 0);
   1849	get_debugreg(vcpu->arch.db[1], 1);
   1850	get_debugreg(vcpu->arch.db[2], 2);
   1851	get_debugreg(vcpu->arch.db[3], 3);
   1852	/*
   1853	 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
   1854	 * because db_interception might need it.  We can do it before vmentry.
   1855	 */
   1856	vcpu->arch.dr6 = svm->vmcb->save.dr6;
   1857	vcpu->arch.dr7 = svm->vmcb->save.dr7;
   1858	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
   1859	set_dr_intercepts(svm);
   1860}
   1861
   1862static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
   1863{
   1864	struct vcpu_svm *svm = to_svm(vcpu);
   1865
   1866	if (vcpu->arch.guest_state_protected)
   1867		return;
   1868
   1869	svm->vmcb->save.dr7 = value;
   1870	vmcb_mark_dirty(svm->vmcb, VMCB_DR);
   1871}
   1872
   1873static int pf_interception(struct kvm_vcpu *vcpu)
   1874{
   1875	struct vcpu_svm *svm = to_svm(vcpu);
   1876
   1877	u64 fault_address = svm->vmcb->control.exit_info_2;
   1878	u64 error_code = svm->vmcb->control.exit_info_1;
   1879
   1880	return kvm_handle_page_fault(vcpu, error_code, fault_address,
   1881			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
   1882			svm->vmcb->control.insn_bytes : NULL,
   1883			svm->vmcb->control.insn_len);
   1884}
   1885
   1886static int npf_interception(struct kvm_vcpu *vcpu)
   1887{
   1888	struct vcpu_svm *svm = to_svm(vcpu);
   1889	int rc;
   1890
   1891	u64 fault_address = svm->vmcb->control.exit_info_2;
   1892	u64 error_code = svm->vmcb->control.exit_info_1;
   1893
   1894	trace_kvm_page_fault(fault_address, error_code);
   1895	rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
   1896				static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
   1897				svm->vmcb->control.insn_bytes : NULL,
   1898				svm->vmcb->control.insn_len);
   1899
   1900	if (error_code & PFERR_GUEST_RMP_MASK)
   1901		handle_rmp_page_fault(vcpu, fault_address, error_code);
   1902
   1903	return rc;
   1904}
   1905
   1906static int db_interception(struct kvm_vcpu *vcpu)
   1907{
   1908	struct kvm_run *kvm_run = vcpu->run;
   1909	struct vcpu_svm *svm = to_svm(vcpu);
   1910
   1911	if (!(vcpu->guest_debug &
   1912	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
   1913		!svm->nmi_singlestep) {
   1914		u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
   1915		kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
   1916		return 1;
   1917	}
   1918
   1919	if (svm->nmi_singlestep) {
   1920		disable_nmi_singlestep(svm);
   1921		/* Make sure we check for pending NMIs upon entry */
   1922		kvm_make_request(KVM_REQ_EVENT, vcpu);
   1923	}
   1924
   1925	if (vcpu->guest_debug &
   1926	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
   1927		kvm_run->exit_reason = KVM_EXIT_DEBUG;
   1928		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
   1929		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
   1930		kvm_run->debug.arch.pc =
   1931			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
   1932		kvm_run->debug.arch.exception = DB_VECTOR;
   1933		return 0;
   1934	}
   1935
   1936	return 1;
   1937}
   1938
   1939static int bp_interception(struct kvm_vcpu *vcpu)
   1940{
   1941	struct vcpu_svm *svm = to_svm(vcpu);
   1942	struct kvm_run *kvm_run = vcpu->run;
   1943
   1944	kvm_run->exit_reason = KVM_EXIT_DEBUG;
   1945	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
   1946	kvm_run->debug.arch.exception = BP_VECTOR;
   1947	return 0;
   1948}
   1949
   1950static int ud_interception(struct kvm_vcpu *vcpu)
   1951{
   1952	return handle_ud(vcpu);
   1953}
   1954
   1955static int ac_interception(struct kvm_vcpu *vcpu)
   1956{
   1957	kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
   1958	return 1;
   1959}
   1960
   1961static bool is_erratum_383(void)
   1962{
   1963	int err, i;
   1964	u64 value;
   1965
   1966	if (!erratum_383_found)
   1967		return false;
   1968
   1969	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
   1970	if (err)
   1971		return false;
   1972
   1973	/* Bit 62 may or may not be set for this mce */
   1974	value &= ~(1ULL << 62);
   1975
   1976	if (value != 0xb600000000010015ULL)
   1977		return false;
   1978
   1979	/* Clear MCi_STATUS registers */
   1980	for (i = 0; i < 6; ++i)
   1981		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
   1982
   1983	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
   1984	if (!err) {
   1985		u32 low, high;
   1986
   1987		value &= ~(1ULL << 2);
   1988		low    = lower_32_bits(value);
   1989		high   = upper_32_bits(value);
   1990
   1991		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
   1992	}
   1993
   1994	/* Flush tlb to evict multi-match entries */
   1995	__flush_tlb_all();
   1996
   1997	return true;
   1998}
   1999
   2000static void svm_handle_mce(struct kvm_vcpu *vcpu)
   2001{
   2002	if (is_erratum_383()) {
   2003		/*
   2004		 * Erratum 383 triggered. Guest state is corrupt so kill the
   2005		 * guest.
   2006		 */
   2007		pr_err("KVM: Guest triggered AMD Erratum 383\n");
   2008
   2009		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
   2010
   2011		return;
   2012	}
   2013
   2014	/*
   2015	 * On an #MC intercept the MCE handler is not called automatically in
   2016	 * the host. So do it by hand here.
   2017	 */
   2018	kvm_machine_check();
   2019}
   2020
   2021static int mc_interception(struct kvm_vcpu *vcpu)
   2022{
   2023	return 1;
   2024}
   2025
   2026static int shutdown_interception(struct kvm_vcpu *vcpu)
   2027{
   2028	struct kvm_run *kvm_run = vcpu->run;
   2029	struct vcpu_svm *svm = to_svm(vcpu);
   2030
   2031	/*
   2032	 * The VM save area has already been encrypted so it
   2033	 * cannot be reinitialized - just terminate.
   2034	 */
   2035	if (sev_es_guest(vcpu->kvm))
   2036		return -EINVAL;
   2037
   2038	/*
   2039	 * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
    2040	 * the VMCB in a known good state.  Unfortunately, KVM doesn't have
   2041	 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
   2042	 * userspace.  At a platform view, INIT is acceptable behavior as
   2043	 * there exist bare metal platforms that automatically INIT the CPU
   2044	 * in response to shutdown.
   2045	 */
   2046	clear_page(svm->vmcb);
   2047	kvm_vcpu_reset(vcpu, true);
   2048
   2049	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
   2050	return 0;
   2051}
   2052
   2053static int io_interception(struct kvm_vcpu *vcpu)
   2054{
   2055	struct vcpu_svm *svm = to_svm(vcpu);
   2056	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
   2057	int size, in, string;
   2058	unsigned port;
   2059
   2060	++vcpu->stat.io_exits;
   2061	string = (io_info & SVM_IOIO_STR_MASK) != 0;
   2062	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
   2063	port = io_info >> 16;
   2064	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
   2065
   2066	if (string) {
   2067		if (sev_es_guest(vcpu->kvm))
   2068			return sev_es_string_io(svm, size, port, in);
   2069		else
   2070			return kvm_emulate_instruction(vcpu, 0);
   2071	}
   2072
   2073	svm->next_rip = svm->vmcb->control.exit_info_2;
   2074
   2075	return kvm_fast_pio(vcpu, size, port, in);
   2076}
   2077
   2078static int nmi_interception(struct kvm_vcpu *vcpu)
   2079{
   2080	return 1;
   2081}
   2082
   2083static int smi_interception(struct kvm_vcpu *vcpu)
   2084{
   2085	return 1;
   2086}
   2087
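        /*
         * CachePC: external-interrupt exits drive the APIC-timer based
         * single-stepping. Compare the current guest RIP with the last
         * observed one to decide whether an instruction retired; if not,
         * increase the timer interval and retry, otherwise resolve the
         * recorded page faults according to the active tracking mode.
         */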
   2088static int intr_interception(struct kvm_vcpu *vcpu)
   2089{
   2090	struct vcpu_svm *svm;
   2091	struct cpc_fault *fault, *next;
   2092	bool inst_gfn_seen;
   2093	size_t count;
   2094
   2095	++vcpu->stat.irq_exits;
   2096
   2097	if (!cpc_singlestep)
   2098		return 1;
   2099
   2100	svm = to_svm(vcpu);
   2101
   2102	if (sev_es_guest(vcpu->kvm)) {
   2103		/* invalidate cached vmsa so rip is updated */
   2104		wbinvd();
   2105		cpc_rip = svm->sev_es.vmsa->rip;
   2106	} else {
   2107		cpc_rip = kvm_rip_read(vcpu);
   2108	}
   2109
   2110	WARN_ON(!cpc_rip_prev_set);
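        	/* no instruction retired: give the guest a longer timer slice and retry */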
   2111	if (cpc_rip == cpc_rip_prev) {
   2112		CPC_DBG("No RIP change (%016llx,%u,%llu)\n",
   2113			cpc_rip, cpc_apic_timer, cpc_retinst);
   2114		cpc_apic_timer += 1;
   2115		return 1;
   2116	}
   2117	cpc_rip_prev = cpc_rip;
   2118	CPC_INFO("Detected RIP change! (%016llx,%u,%llu)\n",
   2119		cpc_rip, cpc_apic_timer, cpc_retinst);
   2120
   2121	// if (!cpc_retinst_prev)
   2122	// 	cpc_retinst_prev = cpc_retinst;
   2123	// if (cpc_retinst_prev == cpc_retinst) {
   2124	// 	cpc_apic_timer += 1;
   2125	// 	return 1;
   2126	// }
   2127	// cpc_retinst_prev = cpc_retinst;
   2128	// CPC_INFO("Detected RETINST change! (%016llx,%u,%llu)\n",
   2129	// 	cpc_rip, cpc_apic_timer, cpc_retinst);
   2130
   2131	count = 0;
   2132	list_for_each_entry(fault, &cpc_faults, list)
   2133		count += 1;
   2134	CPC_INFO("Caught single step with %lu faults!\n", count);
   2135
   2136	switch (cpc_track_mode) {
   2137	case CPC_TRACK_PAGES:
   2138		cpc_track_pages.in_step = false;
   2139		cpc_singlestep = false;
   2140
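        		/*
        		 * The step resolved the boundary between the current and the
        		 * next tracked gfn: re-arm exec tracking on the page being
        		 * left and shift the tracking window forward by one page.
        		 */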
   2141		if (cpc_track_pages.cur_avail && cpc_track_pages.next_avail) {
   2142			CPC_INFO("Boundary %08llx -> %08llx resolved through step %llu\n",
   2143				cpc_track_pages.cur_gfn, cpc_track_pages.next_gfn,
   2144				cpc_track_pages.retinst);
   2145			cpc_track_single(vcpu, cpc_track_pages.cur_gfn,
   2146				KVM_PAGE_TRACK_EXEC);
   2147			cpc_track_pages.prev_gfn = cpc_track_pages.cur_gfn;
   2148			cpc_track_pages.prev_avail = true;
   2149			cpc_track_pages.cur_gfn = cpc_track_pages.next_gfn;
   2150			cpc_track_pages.cur_avail = true;
   2151			cpc_track_pages.next_avail = false;
   2152		}
   2153
   2154		/* reset retinst to something realistic for a singlestep */
   2155		WARN_ON(cpc_track_pages.retinst > 100);
   2156		cpc_track_pages.retinst = 3;
   2157
   2158		break;
   2159	case CPC_TRACK_STEPS:
   2160		inst_gfn_seen = false;
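        		/*
        		 * When the fault filter is enabled, drop all faults recorded
        		 * before the first instruction-fetch fault of this step.
        		 */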
   2161		list_for_each_entry_safe(fault, next, &cpc_faults, list) {
   2162			if (!inst_gfn_seen && (fault->err & PFERR_FETCH_MASK)) {
   2163				inst_gfn_seen = true;
   2164			}
   2165			if (!inst_gfn_seen && cpc_track_steps.use_filter) {
   2166				/* remove without retracking */
   2167				list_del(&fault->list);
   2168				kfree(fault);
   2169			}
   2170		}
   2171
   2172		if (cpc_track_steps.target_user && !cpc_retinst_user) {
   2173			/* stop single-stepping until we leave this page */
   2174			CPC_INFO("Target page not userspace, skipping..\n");
   2175			cpc_singlestep = false;
   2176			cpc_prime_probe = false;
   2177			cpc_track_steps.stepping = false;
   2178			break;
   2179		}
   2180
   2181		cpc_send_track_step_event(&cpc_faults, cpc_guest_misses);
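        		/*
        		 * Re-arm tracking on every faulted gfn: while stepping with
        		 * data tracking enabled, track accesses as well, otherwise
        		 * only execution.
        		 */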
   2182		list_for_each_entry_safe(fault, next, &cpc_faults, list) {
   2183			if (cpc_track_steps.with_data && cpc_track_steps.stepping)
   2184				cpc_track_single(vcpu, fault->gfn, KVM_PAGE_TRACK_ACCESS);
   2185			else
   2186				cpc_track_single(vcpu, fault->gfn, KVM_PAGE_TRACK_EXEC);
   2187			list_del(&fault->list);
   2188			kfree(fault);
   2189		}
   2190		cpc_singlestep_reset = true;
   2191		break;
   2192	}
   2193
   2194	list_for_each_entry_safe(fault, next, &cpc_faults, list) {
   2195		list_del(&fault->list);
   2196		kfree(fault);
   2197	}
   2198
   2199	return 1;
   2200}
   2201
   2202static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
   2203{
   2204	struct vcpu_svm *svm = to_svm(vcpu);
   2205	struct vmcb *vmcb12;
   2206	struct kvm_host_map map;
   2207	int ret;
   2208
   2209	if (nested_svm_check_permissions(vcpu))
   2210		return 1;
   2211
   2212	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
   2213	if (ret) {
   2214		if (ret == -EINVAL)
   2215			kvm_inject_gp(vcpu, 0);
   2216		return 1;
   2217	}
   2218
   2219	vmcb12 = map.hva;
   2220
   2221	ret = kvm_skip_emulated_instruction(vcpu);
   2222
   2223	if (vmload) {
   2224		svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
   2225		svm->sysenter_eip_hi = 0;
   2226		svm->sysenter_esp_hi = 0;
   2227	} else {
   2228		svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
   2229	}
   2230
   2231	kvm_vcpu_unmap(vcpu, &map, true);
   2232
   2233	return ret;
   2234}
   2235
   2236static int vmload_interception(struct kvm_vcpu *vcpu)
   2237{
   2238	return vmload_vmsave_interception(vcpu, true);
   2239}
   2240
   2241static int vmsave_interception(struct kvm_vcpu *vcpu)
   2242{
   2243	return vmload_vmsave_interception(vcpu, false);
   2244}
   2245
   2246static int vmrun_interception(struct kvm_vcpu *vcpu)
   2247{
   2248	if (nested_svm_check_permissions(vcpu))
   2249		return 1;
   2250
   2251	return nested_svm_vmrun(vcpu);
   2252}
   2253
   2254enum {
   2255	NONE_SVM_INSTR,
   2256	SVM_INSTR_VMRUN,
   2257	SVM_INSTR_VMLOAD,
   2258	SVM_INSTR_VMSAVE,
   2259};
   2260
   2261/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
   2262static int svm_instr_opcode(struct kvm_vcpu *vcpu)
   2263{
   2264	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
   2265
   2266	if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
   2267		return NONE_SVM_INSTR;
   2268
   2269	switch (ctxt->modrm) {
   2270	case 0xd8: /* VMRUN */
   2271		return SVM_INSTR_VMRUN;
   2272	case 0xda: /* VMLOAD */
   2273		return SVM_INSTR_VMLOAD;
   2274	case 0xdb: /* VMSAVE */
   2275		return SVM_INSTR_VMSAVE;
   2276	default:
   2277		break;
   2278	}
   2279
   2280	return NONE_SVM_INSTR;
   2281}
   2282
   2283static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
   2284{
   2285	const int guest_mode_exit_codes[] = {
   2286		[SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
   2287		[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
   2288		[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
   2289	};
   2290	int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
   2291		[SVM_INSTR_VMRUN] = vmrun_interception,
   2292		[SVM_INSTR_VMLOAD] = vmload_interception,
   2293		[SVM_INSTR_VMSAVE] = vmsave_interception,
   2294	};
   2295	struct vcpu_svm *svm = to_svm(vcpu);
   2296	int ret;
   2297
   2298	if (is_guest_mode(vcpu)) {
   2299		/* Returns '1' or -errno on failure, '0' on success. */
   2300		ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
   2301		if (ret)
   2302			return ret;
   2303		return 1;
   2304	}
   2305	return svm_instr_handlers[opcode](vcpu);
   2306}
   2307
   2308/*
   2309 * #GP handling code. Note that #GP can be triggered under the following two
   2310 * cases:
   2311 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
    2312 *      some AMD CPUs when the EAX of these instructions is in a reserved memory
    2313 *      region (e.g. SMM memory on the host).
   2314 *   2) VMware backdoor
   2315 */
   2316static int gp_interception(struct kvm_vcpu *vcpu)
   2317{
   2318	struct vcpu_svm *svm = to_svm(vcpu);
   2319	u32 error_code = svm->vmcb->control.exit_info_1;
   2320	int opcode;
   2321
   2322	/* Both #GP cases have zero error_code */
   2323	if (error_code)
   2324		goto reinject;
   2325
   2326	/* Decode the instruction for usage later */
   2327	if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
   2328		goto reinject;
   2329
   2330	opcode = svm_instr_opcode(vcpu);
   2331
   2332	if (opcode == NONE_SVM_INSTR) {
   2333		if (!enable_vmware_backdoor)
   2334			goto reinject;
   2335
   2336		/*
   2337		 * VMware backdoor emulation on #GP interception only handles
   2338		 * IN{S}, OUT{S}, and RDPMC.
   2339		 */
   2340		if (!is_guest_mode(vcpu))
   2341			return kvm_emulate_instruction(vcpu,
   2342				EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
   2343	} else {
   2344		/* All SVM instructions expect page aligned RAX */
   2345		if (svm->vmcb->save.rax & ~PAGE_MASK)
   2346			goto reinject;
   2347
   2348		return emulate_svm_instr(vcpu, opcode);
   2349	}
   2350
   2351reinject:
   2352	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
   2353	return 1;
   2354}
   2355
   2356void svm_set_gif(struct vcpu_svm *svm, bool value)
   2357{
   2358	if (value) {
   2359		/*
   2360		 * If VGIF is enabled, the STGI intercept is only added to
   2361		 * detect the opening of the SMI/NMI window; remove it now.
   2362		 * Likewise, clear the VINTR intercept, we will set it
   2363		 * again while processing KVM_REQ_EVENT if needed.
   2364		 */
   2365		if (vgif)
   2366			svm_clr_intercept(svm, INTERCEPT_STGI);
   2367		if (svm_is_intercept(svm, INTERCEPT_VINTR))
   2368			svm_clear_vintr(svm);
   2369
   2370		enable_gif(svm);
   2371		if (svm->vcpu.arch.smi_pending ||
   2372		    svm->vcpu.arch.nmi_pending ||
   2373		    kvm_cpu_has_injectable_intr(&svm->vcpu))
   2374			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
   2375	} else {
   2376		disable_gif(svm);
   2377
   2378		/*
   2379		 * After a CLGI no interrupts should come.  But if vGIF is
   2380		 * in use, we still rely on the VINTR intercept (rather than
   2381		 * STGI) to detect an open interrupt window.
    2382		 */
   2383		if (!vgif)
   2384			svm_clear_vintr(svm);
   2385	}
   2386}
   2387
   2388static int stgi_interception(struct kvm_vcpu *vcpu)
   2389{
   2390	int ret;
   2391
   2392	if (nested_svm_check_permissions(vcpu))
   2393		return 1;
   2394
   2395	ret = kvm_skip_emulated_instruction(vcpu);
   2396	svm_set_gif(to_svm(vcpu), true);
   2397	return ret;
   2398}
   2399
   2400static int clgi_interception(struct kvm_vcpu *vcpu)
   2401{
   2402	int ret;
   2403
   2404	if (nested_svm_check_permissions(vcpu))
   2405		return 1;
   2406
   2407	ret = kvm_skip_emulated_instruction(vcpu);
   2408	svm_set_gif(to_svm(vcpu), false);
   2409	return ret;
   2410}
   2411
   2412static int invlpga_interception(struct kvm_vcpu *vcpu)
   2413{
   2414	gva_t gva = kvm_rax_read(vcpu);
   2415	u32 asid = kvm_rcx_read(vcpu);
   2416
   2417	/* FIXME: Handle an address size prefix. */
   2418	if (!is_long_mode(vcpu))
   2419		gva = (u32)gva;
   2420
   2421	trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
   2422
   2423	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
   2424	kvm_mmu_invlpg(vcpu, gva);
   2425
   2426	return kvm_skip_emulated_instruction(vcpu);
   2427}
   2428
   2429static int skinit_interception(struct kvm_vcpu *vcpu)
   2430{
   2431	trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
   2432
   2433	kvm_queue_exception(vcpu, UD_VECTOR);
   2434	return 1;
   2435}
   2436
   2437static int task_switch_interception(struct kvm_vcpu *vcpu)
   2438{
   2439	struct vcpu_svm *svm = to_svm(vcpu);
   2440	u16 tss_selector;
   2441	int reason;
   2442	int int_type = svm->vmcb->control.exit_int_info &
   2443		SVM_EXITINTINFO_TYPE_MASK;
   2444	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
   2445	uint32_t type =
   2446		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
   2447	uint32_t idt_v =
   2448		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
   2449	bool has_error_code = false;
   2450	u32 error_code = 0;
   2451
   2452	tss_selector = (u16)svm->vmcb->control.exit_info_1;
   2453
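        	/*
        	 * Derive the task-switch reason from EXITINFO2: an IRET, a far
        	 * jump, an event delivered through the IDT, or a far call.
        	 */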
   2454	if (svm->vmcb->control.exit_info_2 &
   2455	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
   2456		reason = TASK_SWITCH_IRET;
   2457	else if (svm->vmcb->control.exit_info_2 &
   2458		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
   2459		reason = TASK_SWITCH_JMP;
   2460	else if (idt_v)
   2461		reason = TASK_SWITCH_GATE;
   2462	else
   2463		reason = TASK_SWITCH_CALL;
   2464
   2465	if (reason == TASK_SWITCH_GATE) {
   2466		switch (type) {
   2467		case SVM_EXITINTINFO_TYPE_NMI:
   2468			vcpu->arch.nmi_injected = false;
   2469			break;
   2470		case SVM_EXITINTINFO_TYPE_EXEPT:
   2471			if (svm->vmcb->control.exit_info_2 &
   2472			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
   2473				has_error_code = true;
   2474				error_code =
   2475					(u32)svm->vmcb->control.exit_info_2;
   2476			}
   2477			kvm_clear_exception_queue(vcpu);
   2478			break;
   2479		case SVM_EXITINTINFO_TYPE_INTR:
   2480			kvm_clear_interrupt_queue(vcpu);
   2481			break;
   2482		default:
   2483			break;
   2484		}
   2485	}
   2486
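        	/*
        	 * Advance RIP only for task switches initiated by an instruction
        	 * (JMP/CALL/IRET, a software interrupt, or #BP/#OF); hardware
        	 * events delivered through a gate must not skip the instruction.
        	 */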
   2487	if (reason != TASK_SWITCH_GATE ||
   2488	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
   2489	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
   2490	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
   2491		if (!svm_skip_emulated_instruction(vcpu))
   2492			return 0;
   2493	}
   2494
   2495	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
   2496		int_vec = -1;
   2497
   2498	return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
   2499			       has_error_code, error_code);
   2500}
   2501
   2502static int iret_interception(struct kvm_vcpu *vcpu)
   2503{
   2504	struct vcpu_svm *svm = to_svm(vcpu);
   2505
   2506	++vcpu->stat.nmi_window_exits;
   2507	vcpu->arch.hflags |= HF_IRET_MASK;
   2508	if (!sev_es_guest(vcpu->kvm)) {
   2509		svm_clr_intercept(svm, INTERCEPT_IRET);
   2510		svm->nmi_iret_rip = kvm_rip_read(vcpu);
   2511	}
   2512	kvm_make_request(KVM_REQ_EVENT, vcpu);
   2513	return 1;
   2514}
   2515
   2516static int invlpg_interception(struct kvm_vcpu *vcpu)
   2517{
   2518	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
   2519		return kvm_emulate_instruction(vcpu, 0);
   2520
   2521	kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
   2522	return kvm_skip_emulated_instruction(vcpu);
   2523}
   2524
   2525static int emulate_on_interception(struct kvm_vcpu *vcpu)
   2526{
   2527	return kvm_emulate_instruction(vcpu, 0);
   2528}
   2529
   2530static int rsm_interception(struct kvm_vcpu *vcpu)
   2531{
   2532	return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
   2533}
   2534
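        /*
         * With L1 intercepting selective CR0 writes, check whether this write
         * changes any CR0 bit outside SVM_CR0_SELECTIVE_MASK and, if so,
         * forward an SVM_EXIT_CR0_SEL_WRITE exit to L1.
         */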
   2535static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
   2536					    unsigned long val)
   2537{
   2538	struct vcpu_svm *svm = to_svm(vcpu);
   2539	unsigned long cr0 = vcpu->arch.cr0;
   2540	bool ret = false;
   2541
   2542	if (!is_guest_mode(vcpu) ||
   2543	    (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
   2544		return false;
   2545
   2546	cr0 &= ~SVM_CR0_SELECTIVE_MASK;
   2547	val &= ~SVM_CR0_SELECTIVE_MASK;
   2548
   2549	if (cr0 ^ val) {
   2550		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
   2551		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
   2552	}
   2553
   2554	return ret;
   2555}
   2556
   2557#define CR_VALID (1ULL << 63)
   2558
   2559static int cr_interception(struct kvm_vcpu *vcpu)
   2560{
   2561	struct vcpu_svm *svm = to_svm(vcpu);
   2562	int reg, cr;
   2563	unsigned long val;
   2564	int err;
   2565
   2566	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
   2567		return emulate_on_interception(vcpu);
   2568
   2569	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
   2570		return emulate_on_interception(vcpu);
   2571
   2572	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
   2573	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
   2574		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
   2575	else
   2576		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
   2577
   2578	err = 0;
   2579	if (cr >= 16) { /* mov to cr */
   2580		cr -= 16;
   2581		val = kvm_register_read(vcpu, reg);
   2582		trace_kvm_cr_write(cr, val);
   2583		switch (cr) {
   2584		case 0:
   2585			if (!check_selective_cr0_intercepted(vcpu, val))
   2586				err = kvm_set_cr0(vcpu, val);
   2587			else
   2588				return 1;
   2589
   2590			break;
   2591		case 3:
   2592			err = kvm_set_cr3(vcpu, val);
   2593			break;
   2594		case 4:
   2595			err = kvm_set_cr4(vcpu, val);
   2596			break;
   2597		case 8:
   2598			err = kvm_set_cr8(vcpu, val);
   2599			break;
   2600		default:
   2601			WARN(1, "unhandled write to CR%d", cr);
   2602			kvm_queue_exception(vcpu, UD_VECTOR);
   2603			return 1;
   2604		}
   2605	} else { /* mov from cr */
   2606		switch (cr) {
   2607		case 0:
   2608			val = kvm_read_cr0(vcpu);
   2609			break;
   2610		case 2:
   2611			val = vcpu->arch.cr2;
   2612			break;
   2613		case 3:
   2614			val = kvm_read_cr3(vcpu);
   2615			break;
   2616		case 4:
   2617			val = kvm_read_cr4(vcpu);
   2618			break;
   2619		case 8:
   2620			val = kvm_get_cr8(vcpu);
   2621			break;
   2622		default:
   2623			WARN(1, "unhandled read from CR%d", cr);
   2624			kvm_queue_exception(vcpu, UD_VECTOR);
   2625			return 1;
   2626		}
   2627		kvm_register_write(vcpu, reg, val);
   2628		trace_kvm_cr_read(cr, val);
   2629	}
   2630	return kvm_complete_insn_gp(vcpu, err);
   2631}
   2632
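        /*
         * Handle CR write traps (used for SEV-ES guests, which keep the CR
         * intercepts cleared): exit_info_1 carries the value being written.
         */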
   2633static int cr_trap(struct kvm_vcpu *vcpu)
   2634{
   2635	struct vcpu_svm *svm = to_svm(vcpu);
   2636	unsigned long old_value, new_value;
   2637	unsigned int cr;
   2638	int ret = 0;
   2639
   2640	new_value = (unsigned long)svm->vmcb->control.exit_info_1;
   2641
   2642	cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
   2643	switch (cr) {
   2644	case 0:
   2645		old_value = kvm_read_cr0(vcpu);
   2646		svm_set_cr0(vcpu, new_value);
   2647
   2648		kvm_post_set_cr0(vcpu, old_value, new_value);
   2649		break;
   2650	case 4:
   2651		old_value = kvm_read_cr4(vcpu);
   2652		svm_set_cr4(vcpu, new_value);
   2653
   2654		kvm_post_set_cr4(vcpu, old_value, new_value);
   2655		break;
   2656	case 8:
   2657		ret = kvm_set_cr8(vcpu, new_value);
   2658		break;
   2659	default:
   2660		WARN(1, "unhandled CR%d write trap", cr);
   2661		kvm_queue_exception(vcpu, UD_VECTOR);
   2662		return 1;
   2663	}
   2664
   2665	return kvm_complete_insn_gp(vcpu, ret);
   2666}
   2667
   2668static int dr_interception(struct kvm_vcpu *vcpu)
   2669{
   2670	struct vcpu_svm *svm = to_svm(vcpu);
   2671	int reg, dr;
   2672	unsigned long val;
   2673	int err = 0;
   2674
   2675	if (vcpu->guest_debug == 0) {
   2676		/*
   2677		 * No more DR vmexits; force a reload of the debug registers
   2678		 * and reenter on this instruction.  The next vmexit will
   2679		 * retrieve the full state of the debug registers.
   2680		 */
   2681		clr_dr_intercepts(svm);
   2682		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
   2683		return 1;
   2684	}
   2685
   2686	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
   2687		return emulate_on_interception(vcpu);
   2688
   2689	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
   2690	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
   2691	if (dr >= 16) { /* mov to DRn  */
   2692		dr -= 16;
   2693		val = kvm_register_read(vcpu, reg);
   2694		err = kvm_set_dr(vcpu, dr, val);
   2695	} else {
   2696		kvm_get_dr(vcpu, dr, &val);
   2697		kvm_register_write(vcpu, reg, val);
   2698	}
   2699
   2700	return kvm_complete_insn_gp(vcpu, err);
   2701}
   2702
   2703static int cr8_write_interception(struct kvm_vcpu *vcpu)
   2704{
   2705	int r;
   2706
   2707	u8 cr8_prev = kvm_get_cr8(vcpu);
   2708	/* instruction emulation calls kvm_set_cr8() */
   2709	r = cr_interception(vcpu);
   2710	if (lapic_in_kernel(vcpu))
   2711		return r;
   2712	if (cr8_prev <= kvm_get_cr8(vcpu))
   2713		return r;
   2714	vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
   2715	return 0;
   2716}
   2717
   2718static int efer_trap(struct kvm_vcpu *vcpu)
   2719{
   2720	struct msr_data msr_info;
   2721	int ret;
   2722
   2723	/*
   2724	 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
   2725	 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
   2726	 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
   2727	 * the guest doesn't have X86_FEATURE_SVM.
   2728	 */
   2729	msr_info.host_initiated = false;
   2730	msr_info.index = MSR_EFER;
   2731	msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
   2732	ret = kvm_set_msr_common(vcpu, &msr_info);
   2733
   2734	return kvm_complete_insn_gp(vcpu, ret);
   2735}
   2736
   2737static int svm_get_msr_feature(struct kvm_msr_entry *msr)
   2738{
   2739	msr->data = 0;
   2740
   2741	switch (msr->index) {
   2742	case MSR_F10H_DECFG:
   2743		if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
   2744			msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
   2745		break;
   2746	case MSR_IA32_PERF_CAPABILITIES:
   2747		return 0;
   2748	default:
   2749		return KVM_MSR_RET_INVALID;
   2750	}
   2751
   2752	return 0;
   2753}
   2754
   2755static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
   2756{
   2757	struct vcpu_svm *svm = to_svm(vcpu);
   2758
   2759	switch (msr_info->index) {
   2760	case MSR_AMD64_TSC_RATIO:
   2761		if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
   2762			return 1;
   2763		msr_info->data = svm->tsc_ratio_msr;
   2764		break;
   2765	case MSR_STAR:
   2766		msr_info->data = svm->vmcb01.ptr->save.star;
   2767		break;
   2768#ifdef CONFIG_X86_64
   2769	case MSR_LSTAR:
   2770		msr_info->data = svm->vmcb01.ptr->save.lstar;
   2771		break;
   2772	case MSR_CSTAR:
   2773		msr_info->data = svm->vmcb01.ptr->save.cstar;
   2774		break;
   2775	case MSR_KERNEL_GS_BASE:
   2776		msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
   2777		break;
   2778	case MSR_SYSCALL_MASK:
   2779		msr_info->data = svm->vmcb01.ptr->save.sfmask;
   2780		break;
   2781#endif
   2782	case MSR_IA32_SYSENTER_CS:
   2783		msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
   2784		break;
   2785	case MSR_IA32_SYSENTER_EIP:
   2786		msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
   2787		if (guest_cpuid_is_intel(vcpu))
   2788			msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
   2789		break;
   2790	case MSR_IA32_SYSENTER_ESP:
   2791		msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
   2792		if (guest_cpuid_is_intel(vcpu))
   2793			msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
   2794		break;
   2795	case MSR_TSC_AUX:
   2796		msr_info->data = svm->tsc_aux;
   2797		break;
   2798	case MSR_IA32_DEBUGCTLMSR:
   2799	case MSR_IA32_LASTBRANCHFROMIP:
   2800	case MSR_IA32_LASTBRANCHTOIP:
   2801	case MSR_IA32_LASTINTFROMIP:
   2802	case MSR_IA32_LASTINTTOIP:
   2803		msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
   2804		break;
   2805	case MSR_VM_HSAVE_PA:
   2806		msr_info->data = svm->nested.hsave_msr;
   2807		break;
   2808	case MSR_VM_CR:
   2809		msr_info->data = svm->nested.vm_cr_msr;
   2810		break;
   2811	case MSR_IA32_SPEC_CTRL:
   2812		if (!msr_info->host_initiated &&
   2813		    !guest_has_spec_ctrl_msr(vcpu))
   2814			return 1;
   2815
   2816		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
   2817			msr_info->data = svm->vmcb->save.spec_ctrl;
   2818		else
   2819			msr_info->data = svm->spec_ctrl;
   2820		break;
   2821	case MSR_AMD64_VIRT_SPEC_CTRL:
   2822		if (!msr_info->host_initiated &&
   2823		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
   2824			return 1;
   2825
   2826		msr_info->data = svm->virt_spec_ctrl;
   2827		break;
   2828	case MSR_F15H_IC_CFG: {
   2829
   2830		int family, model;
   2831
   2832		family = guest_cpuid_family(vcpu);
   2833		model  = guest_cpuid_model(vcpu);
   2834
   2835		if (family < 0 || model < 0)
   2836			return kvm_get_msr_common(vcpu, msr_info);
   2837
   2838		msr_info->data = 0;
   2839
   2840		if (family == 0x15 &&
   2841		    (model >= 0x2 && model < 0x20))
   2842			msr_info->data = 0x1E;
   2843		}
   2844		break;
   2845	case MSR_F10H_DECFG:
   2846		msr_info->data = svm->msr_decfg;
   2847		break;
   2848	default:
   2849		return kvm_get_msr_common(vcpu, msr_info);
   2850	}
   2851	return 0;
   2852}
   2853
   2854static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
   2855{
   2856	struct vcpu_svm *svm = to_svm(vcpu);
   2857	if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb_in_use))
   2858		return kvm_complete_insn_gp(vcpu, err);
   2859
   2860	svm_set_ghcb_sw_exit_info_1(vcpu, 1);
   2861	svm_set_ghcb_sw_exit_info_2(vcpu,
   2862				    X86_TRAP_GP |
   2863				    SVM_EVTINJ_TYPE_EXEPT |
   2864				    SVM_EVTINJ_VALID);
   2865	return 1;
   2866}
   2867
   2868static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
   2869{
   2870	struct vcpu_svm *svm = to_svm(vcpu);
   2871	int svm_dis, chg_mask;
   2872
   2873	if (data & ~SVM_VM_CR_VALID_MASK)
   2874		return 1;
   2875
   2876	chg_mask = SVM_VM_CR_VALID_MASK;
   2877
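        	/* once SVMDIS is set, the SVM_LOCK and SVMDIS bits become read-only */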
   2878	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
   2879		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
   2880
   2881	svm->nested.vm_cr_msr &= ~chg_mask;
   2882	svm->nested.vm_cr_msr |= (data & chg_mask);
   2883
   2884	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
   2885
   2886	/* check for svm_disable while efer.svme is set */
   2887	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
   2888		return 1;
   2889
   2890	return 0;
   2891}
   2892
   2893static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
   2894{
   2895	struct vcpu_svm *svm = to_svm(vcpu);
   2896	int r;
   2897
   2898	u32 ecx = msr->index;
   2899	u64 data = msr->data;
   2900	switch (ecx) {
   2901	case MSR_AMD64_TSC_RATIO:
   2902
   2903		if (!svm->tsc_scaling_enabled) {
   2904
   2905			if (!msr->host_initiated)
   2906				return 1;
   2907			/*
   2908			 * In case TSC scaling is not enabled, always
   2909			 * leave this MSR at the default value.
   2910			 *
    2911			 * Due to a bug in qemu 6.2.0, it would try to set
   2912			 * this msr to 0 if tsc scaling is not enabled.
   2913			 * Ignore this value as well.
   2914			 */
   2915			if (data != 0 && data != svm->tsc_ratio_msr)
   2916				return 1;
   2917			break;
   2918		}
   2919
   2920		if (data & SVM_TSC_RATIO_RSVD)
   2921			return 1;
   2922
   2923		svm->tsc_ratio_msr = data;
   2924
   2925		if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
   2926			nested_svm_update_tsc_ratio_msr(vcpu);
   2927
   2928		break;
   2929	case MSR_IA32_CR_PAT:
   2930		if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
   2931			return 1;
   2932		vcpu->arch.pat = data;
   2933		svm->vmcb01.ptr->save.g_pat = data;
   2934		if (is_guest_mode(vcpu))
   2935			nested_vmcb02_compute_g_pat(svm);
   2936		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
   2937		break;
   2938	case MSR_IA32_SPEC_CTRL:
   2939		if (!msr->host_initiated &&
   2940		    !guest_has_spec_ctrl_msr(vcpu))
   2941			return 1;
   2942
   2943		if (kvm_spec_ctrl_test_value(data))
   2944			return 1;
   2945
   2946		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
   2947			svm->vmcb->save.spec_ctrl = data;
   2948		else
   2949			svm->spec_ctrl = data;
   2950		if (!data)
   2951			break;
   2952
   2953		/*
   2954		 * For non-nested:
   2955		 * When it's written (to non-zero) for the first time, pass
   2956		 * it through.
   2957		 *
   2958		 * For nested:
   2959		 * The handling of the MSR bitmap for L2 guests is done in
   2960		 * nested_svm_vmrun_msrpm.
   2961		 * We update the L1 MSR bit as well since it will end up
   2962		 * touching the MSR anyway now.
   2963		 */
   2964		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
   2965		break;
   2966	case MSR_IA32_PRED_CMD:
   2967		if (!msr->host_initiated &&
   2968		    !guest_has_pred_cmd_msr(vcpu))
   2969			return 1;
   2970
   2971		if (data & ~PRED_CMD_IBPB)
   2972			return 1;
   2973		if (!boot_cpu_has(X86_FEATURE_IBPB))
   2974			return 1;
   2975		if (!data)
   2976			break;
   2977
   2978		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
   2979		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
   2980		break;
   2981	case MSR_AMD64_VIRT_SPEC_CTRL:
   2982		if (!msr->host_initiated &&
   2983		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
   2984			return 1;
   2985
   2986		if (data & ~SPEC_CTRL_SSBD)
   2987			return 1;
   2988
   2989		svm->virt_spec_ctrl = data;
   2990		break;
   2991	case MSR_STAR:
   2992		svm->vmcb01.ptr->save.star = data;
   2993		break;
   2994#ifdef CONFIG_X86_64
   2995	case MSR_LSTAR:
   2996		svm->vmcb01.ptr->save.lstar = data;
   2997		break;
   2998	case MSR_CSTAR:
   2999		svm->vmcb01.ptr->save.cstar = data;
   3000		break;
   3001	case MSR_KERNEL_GS_BASE:
   3002		svm->vmcb01.ptr->save.kernel_gs_base = data;
   3003		break;
   3004	case MSR_SYSCALL_MASK:
   3005		svm->vmcb01.ptr->save.sfmask = data;
   3006		break;
   3007#endif
   3008	case MSR_IA32_SYSENTER_CS:
   3009		svm->vmcb01.ptr->save.sysenter_cs = data;
   3010		break;
   3011	case MSR_IA32_SYSENTER_EIP:
   3012		svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
   3013		/*
   3014		 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
   3015		 * when we spoof an Intel vendor ID (for cross vendor migration).
   3016		 * In this case we use this intercept to track the high
   3017		 * 32 bit part of these msrs to support Intel's
   3018		 * implementation of SYSENTER/SYSEXIT.
   3019		 */
   3020		svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
   3021		break;
   3022	case MSR_IA32_SYSENTER_ESP:
   3023		svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
   3024		svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
   3025		break;
   3026	case MSR_TSC_AUX:
   3027		/*
   3028		 * TSC_AUX is usually changed only during boot and never read
   3029		 * directly.  Intercept TSC_AUX instead of exposing it to the
   3030		 * guest via direct_access_msrs, and switch it via user return.
   3031		 */
   3032		preempt_disable();
   3033		r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
   3034		preempt_enable();
   3035		if (r)
   3036			return 1;
   3037
   3038		svm->tsc_aux = data;
   3039		break;
   3040	case MSR_IA32_DEBUGCTLMSR:
   3041		if (!lbrv) {
   3042			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
   3043				    __func__, data);
   3044			break;
   3045		}
   3046		if (data & DEBUGCTL_RESERVED_BITS)
   3047			return 1;
   3048
   3049		if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
   3050			svm->vmcb->save.dbgctl = data;
   3051		else
   3052			svm->vmcb01.ptr->save.dbgctl = data;
   3053
   3054		svm_update_lbrv(vcpu);
   3055
   3056		break;
   3057	case MSR_VM_HSAVE_PA:
   3058		/*
   3059		 * Old kernels did not validate the value written to
   3060		 * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
   3061		 * value to allow live migrating buggy or malicious guests
   3062		 * originating from those kernels.
   3063		 */
   3064		if (!msr->host_initiated && !page_address_valid(vcpu, data))
   3065			return 1;
   3066
   3067		svm->nested.hsave_msr = data & PAGE_MASK;
   3068		break;
   3069	case MSR_VM_CR:
   3070		return svm_set_vm_cr(vcpu, data);
   3071	case MSR_VM_IGNNE:
   3072		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
   3073		break;
   3074	case MSR_F10H_DECFG: {
   3075		struct kvm_msr_entry msr_entry;
   3076
   3077		msr_entry.index = msr->index;
   3078		if (svm_get_msr_feature(&msr_entry))
   3079			return 1;
   3080
   3081		/* Check the supported bits */
   3082		if (data & ~msr_entry.data)
   3083			return 1;
   3084
   3085		/* Don't allow the guest to change a bit, #GP */
   3086		if (!msr->host_initiated && (data ^ msr_entry.data))
   3087			return 1;
   3088
   3089		svm->msr_decfg = data;
   3090		break;
   3091	}
   3092	default:
   3093		return kvm_set_msr_common(vcpu, msr);
   3094	}
   3095	return 0;
   3096}
   3097
   3098static int msr_interception(struct kvm_vcpu *vcpu)
   3099{
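        	/* exit_info_1 is 1 for WRMSR and 0 for RDMSR */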
   3100	if (to_svm(vcpu)->vmcb->control.exit_info_1)
   3101		return kvm_emulate_wrmsr(vcpu);
   3102	else
   3103		return kvm_emulate_rdmsr(vcpu);
   3104}
   3105
   3106static int interrupt_window_interception(struct kvm_vcpu *vcpu)
   3107{
   3108	kvm_make_request(KVM_REQ_EVENT, vcpu);
   3109	svm_clear_vintr(to_svm(vcpu));
   3110
   3111	/*
   3112	 * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
   3113	 * In this case AVIC was temporarily disabled for
   3114	 * requesting the IRQ window and we have to re-enable it.
   3115	 *
   3116	 * If running nested, still remove the VM wide AVIC inhibit to
    3117	 * support the case in which the interrupt window was requested when
    3118	 * the vCPU was not running nested.
    3119	 *
    3120	 * All vCPUs which still run nested will keep their AVIC inhibited
    3121	 * due to the per-vCPU AVIC inhibition.
   3122	 */
   3123	kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
   3124
   3125	++vcpu->stat.irq_window_exits;
   3126	return 1;
   3127}
   3128
   3129static int pause_interception(struct kvm_vcpu *vcpu)
   3130{
   3131	bool in_kernel;
   3132	/*
   3133	 * CPL is not made available for an SEV-ES guest, therefore
   3134	 * vcpu->arch.preempted_in_kernel can never be true.  Just
   3135	 * set in_kernel to false as well.
   3136	 */
   3137	in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
   3138
   3139	grow_ple_window(vcpu);
   3140
   3141	kvm_vcpu_on_spin(vcpu, in_kernel);
   3142	return kvm_skip_emulated_instruction(vcpu);
   3143}
   3144
   3145static int invpcid_interception(struct kvm_vcpu *vcpu)
   3146{
   3147	struct vcpu_svm *svm = to_svm(vcpu);
   3148	unsigned long type;
   3149	gva_t gva;
   3150
   3151	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
   3152		kvm_queue_exception(vcpu, UD_VECTOR);
   3153		return 1;
   3154	}
   3155
   3156	/*
   3157	 * For an INVPCID intercept:
   3158	 * EXITINFO1 provides the linear address of the memory operand.
   3159	 * EXITINFO2 provides the contents of the register operand.
   3160	 */
   3161	type = svm->vmcb->control.exit_info_2;
   3162	gva = svm->vmcb->control.exit_info_1;
   3163
   3164	return kvm_handle_invpcid(vcpu, type, gva);
   3165}
   3166
   3167static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
   3168	[SVM_EXIT_READ_CR0]			= cr_interception,
   3169	[SVM_EXIT_READ_CR3]			= cr_interception,
   3170	[SVM_EXIT_READ_CR4]			= cr_interception,
   3171	[SVM_EXIT_READ_CR8]			= cr_interception,
   3172	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
   3173	[SVM_EXIT_WRITE_CR0]			= cr_interception,
   3174	[SVM_EXIT_WRITE_CR3]			= cr_interception,
   3175	[SVM_EXIT_WRITE_CR4]			= cr_interception,
   3176	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
   3177	[SVM_EXIT_READ_DR0]			= dr_interception,
   3178	[SVM_EXIT_READ_DR1]			= dr_interception,
   3179	[SVM_EXIT_READ_DR2]			= dr_interception,
   3180	[SVM_EXIT_READ_DR3]			= dr_interception,
   3181	[SVM_EXIT_READ_DR4]			= dr_interception,
   3182	[SVM_EXIT_READ_DR5]			= dr_interception,
   3183	[SVM_EXIT_READ_DR6]			= dr_interception,
   3184	[SVM_EXIT_READ_DR7]			= dr_interception,
   3185	[SVM_EXIT_WRITE_DR0]			= dr_interception,
   3186	[SVM_EXIT_WRITE_DR1]			= dr_interception,
   3187	[SVM_EXIT_WRITE_DR2]			= dr_interception,
   3188	[SVM_EXIT_WRITE_DR3]			= dr_interception,
   3189	[SVM_EXIT_WRITE_DR4]			= dr_interception,
   3190	[SVM_EXIT_WRITE_DR5]			= dr_interception,
   3191	[SVM_EXIT_WRITE_DR6]			= dr_interception,
   3192	[SVM_EXIT_WRITE_DR7]			= dr_interception,
   3193	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
   3194	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
   3195	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
   3196	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
   3197	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
   3198	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
   3199	[SVM_EXIT_EXCP_BASE + GP_VECTOR]	= gp_interception,
   3200	[SVM_EXIT_INTR]				= intr_interception,
   3201	[SVM_EXIT_NMI]				= nmi_interception,
   3202	[SVM_EXIT_SMI]				= smi_interception,
   3203	[SVM_EXIT_VINTR]			= interrupt_window_interception,
   3204	[SVM_EXIT_RDPMC]			= kvm_emulate_rdpmc,
   3205	[SVM_EXIT_CPUID]			= kvm_emulate_cpuid,
   3206	[SVM_EXIT_IRET]                         = iret_interception,
   3207	[SVM_EXIT_INVD]                         = kvm_emulate_invd,
   3208	[SVM_EXIT_PAUSE]			= pause_interception,
   3209	[SVM_EXIT_HLT]				= kvm_emulate_halt,
   3210	[SVM_EXIT_INVLPG]			= invlpg_interception,
   3211	[SVM_EXIT_INVLPGA]			= invlpga_interception,
   3212	[SVM_EXIT_IOIO]				= io_interception,
   3213	[SVM_EXIT_MSR]				= msr_interception,
   3214	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
   3215	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
   3216	[SVM_EXIT_VMRUN]			= vmrun_interception,
   3217	[SVM_EXIT_VMMCALL]			= kvm_emulate_hypercall,
   3218	[SVM_EXIT_VMLOAD]			= vmload_interception,
   3219	[SVM_EXIT_VMSAVE]			= vmsave_interception,
   3220	[SVM_EXIT_STGI]				= stgi_interception,
   3221	[SVM_EXIT_CLGI]				= clgi_interception,
   3222	[SVM_EXIT_SKINIT]			= skinit_interception,
   3223	[SVM_EXIT_RDTSCP]			= kvm_handle_invalid_op,
   3224	[SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
   3225	[SVM_EXIT_MONITOR]			= kvm_emulate_monitor,
   3226	[SVM_EXIT_MWAIT]			= kvm_emulate_mwait,
   3227	[SVM_EXIT_XSETBV]			= kvm_emulate_xsetbv,
   3228	[SVM_EXIT_RDPRU]			= kvm_handle_invalid_op,
   3229	[SVM_EXIT_EFER_WRITE_TRAP]		= efer_trap,
   3230	[SVM_EXIT_CR0_WRITE_TRAP]		= cr_trap,
   3231	[SVM_EXIT_CR4_WRITE_TRAP]		= cr_trap,
   3232	[SVM_EXIT_CR8_WRITE_TRAP]		= cr_trap,
   3233	[SVM_EXIT_INVPCID]                      = invpcid_interception,
   3234	[SVM_EXIT_NPF]				= npf_interception,
   3235	[SVM_EXIT_RSM]                          = rsm_interception,
   3236	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
   3237	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
   3238	[SVM_EXIT_VMGEXIT]			= sev_handle_vmgexit,
   3239};
   3240
   3241static void dump_vmcb(struct kvm_vcpu *vcpu)
   3242{
   3243	struct vcpu_svm *svm = to_svm(vcpu);
   3244	struct vmcb_control_area *control = &svm->vmcb->control;
   3245	struct vmcb_save_area *save = &svm->vmcb->save;
   3246	struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
   3247
   3248	if (!dump_invalid_vmcb) {
   3249		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
   3250		return;
   3251	}
   3252
   3253	pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
   3254	       svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
   3255	pr_err("VMCB Control Area:\n");
   3256	pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
   3257	pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
   3258	pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
   3259	pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
   3260	pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
   3261	pr_err("%-20s%08x %08x\n", "intercepts:",
    3262	       control->intercepts[INTERCEPT_WORD3],
   3263	       control->intercepts[INTERCEPT_WORD4]);
   3264	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
   3265	pr_err("%-20s%d\n", "pause filter threshold:",
   3266	       control->pause_filter_thresh);
   3267	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
   3268	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
   3269	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
   3270	pr_err("%-20s%d\n", "asid:", control->asid);
   3271	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
   3272	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
   3273	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
   3274	pr_err("%-20s%08x\n", "int_state:", control->int_state);
   3275	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
   3276	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
   3277	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
   3278	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
   3279	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
   3280	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
   3281	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
   3282	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
   3283	pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
   3284	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
   3285	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
   3286	pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
   3287	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
   3288	pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
   3289	pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
   3290	pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
   3291	pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
   3292	pr_err("VMCB State Save Area:\n");
   3293	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3294	       "es:",
   3295	       save->es.selector, save->es.attrib,
   3296	       save->es.limit, save->es.base);
   3297	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3298	       "cs:",
   3299	       save->cs.selector, save->cs.attrib,
   3300	       save->cs.limit, save->cs.base);
   3301	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3302	       "ss:",
   3303	       save->ss.selector, save->ss.attrib,
   3304	       save->ss.limit, save->ss.base);
   3305	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3306	       "ds:",
   3307	       save->ds.selector, save->ds.attrib,
   3308	       save->ds.limit, save->ds.base);
   3309	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3310	       "fs:",
   3311	       save01->fs.selector, save01->fs.attrib,
   3312	       save01->fs.limit, save01->fs.base);
   3313	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3314	       "gs:",
   3315	       save01->gs.selector, save01->gs.attrib,
   3316	       save01->gs.limit, save01->gs.base);
   3317	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3318	       "gdtr:",
   3319	       save->gdtr.selector, save->gdtr.attrib,
   3320	       save->gdtr.limit, save->gdtr.base);
   3321	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3322	       "ldtr:",
   3323	       save01->ldtr.selector, save01->ldtr.attrib,
   3324	       save01->ldtr.limit, save01->ldtr.base);
   3325	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3326	       "idtr:",
   3327	       save->idtr.selector, save->idtr.attrib,
   3328	       save->idtr.limit, save->idtr.base);
   3329	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
   3330	       "tr:",
   3331	       save01->tr.selector, save01->tr.attrib,
   3332	       save01->tr.limit, save01->tr.base);
   3333	pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
   3334	       save->vmpl, save->cpl, save->efer);
   3335	pr_err("%-15s %016llx %-13s %016llx\n",
   3336	       "cr0:", save->cr0, "cr2:", save->cr2);
   3337	pr_err("%-15s %016llx %-13s %016llx\n",
   3338	       "cr3:", save->cr3, "cr4:", save->cr4);
   3339	pr_err("%-15s %016llx %-13s %016llx\n",
   3340	       "dr6:", save->dr6, "dr7:", save->dr7);
   3341	pr_err("%-15s %016llx %-13s %016llx\n",
   3342	       "rip:", save->rip, "rflags:", save->rflags);
   3343	pr_err("%-15s %016llx %-13s %016llx\n",
   3344	       "rsp:", save->rsp, "rax:", save->rax);
   3345	pr_err("%-15s %016llx %-13s %016llx\n",
   3346	       "star:", save01->star, "lstar:", save01->lstar);
   3347	pr_err("%-15s %016llx %-13s %016llx\n",
   3348	       "cstar:", save01->cstar, "sfmask:", save01->sfmask);
   3349	pr_err("%-15s %016llx %-13s %016llx\n",
   3350	       "kernel_gs_base:", save01->kernel_gs_base,
   3351	       "sysenter_cs:", save01->sysenter_cs);
   3352	pr_err("%-15s %016llx %-13s %016llx\n",
   3353	       "sysenter_esp:", save01->sysenter_esp,
   3354	       "sysenter_eip:", save01->sysenter_eip);
   3355	pr_err("%-15s %016llx %-13s %016llx\n",
   3356	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
   3357	pr_err("%-15s %016llx %-13s %016llx\n",
   3358	       "br_from:", save->br_from, "br_to:", save->br_to);
   3359	pr_err("%-15s %016llx %-13s %016llx\n",
   3360	       "excp_from:", save->last_excp_from,
   3361	       "excp_to:", save->last_excp_to);
   3362}
   3363
   3364static bool svm_check_exit_valid(u64 exit_code)
   3365{
   3366	return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
   3367		svm_exit_handlers[exit_code]);
   3368}
   3369
   3370static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
   3371{
   3372	vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
   3373	dump_vmcb(vcpu);
   3374	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   3375	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
   3376	vcpu->run->internal.ndata = 2;
   3377	vcpu->run->internal.data[0] = exit_code;
   3378	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
   3379	return 0;
   3380}
   3381
   3382int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
   3383{
   3384	static const struct {
   3385		u64 code;
   3386		const char *name;
   3387	} codelut[] = {
   3388		SVM_EXIT_REASONS,
   3389		{ -1, NULL }
   3390	};
   3391	size_t i;
   3392
   3393	if (!svm_check_exit_valid(exit_code))
   3394		return svm_handle_invalid_exit(vcpu, exit_code);
   3395
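       	/*
       	 * CachePC: at debug log level, resolve the raw exit code to its
       	 * symbolic SVM_EXIT_* name and log it together with the current
       	 * APIC timer value and retired-instruction count. SVM_EXIT_INTR
       	 * is skipped, presumably because single-step timer interrupts
       	 * would flood the log.
       	 */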
   3396	if (cpc_loglevel >= CPC_LOGLVL_DBG && exit_code != SVM_EXIT_INTR) {
   3397		for (i = 0; i < sizeof(codelut) / sizeof(codelut[0]); i++) {
   3398			if (codelut[i].code == exit_code)
   3399				CPC_INFO("KVM EXIT %s (%u,%llu)\n",
   3400					codelut[i].name, cpc_apic_timer, cpc_retinst);
   3401		}
   3402	}
   3403
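       	/*
       	 * Record the exit code so svm_vcpu_enter_exit() can adapt the
       	 * single-step APIC timer differently after NPF and INTR exits.
       	 */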
   3404	cpc_svm_exitcode = exit_code;
   3405
   3406#ifdef CONFIG_RETPOLINE
   3407	if (exit_code == SVM_EXIT_MSR)
   3408		return msr_interception(vcpu);
   3409	else if (exit_code == SVM_EXIT_VINTR)
   3410		return interrupt_window_interception(vcpu);
   3411	else if (exit_code == SVM_EXIT_INTR)
   3412		return intr_interception(vcpu);
   3413	else if (exit_code == SVM_EXIT_HLT)
   3414		return kvm_emulate_halt(vcpu);
   3415	else if (exit_code == SVM_EXIT_NPF)
   3416		return npf_interception(vcpu);
   3417#endif
   3418	return svm_exit_handlers[exit_code](vcpu);
   3419}
   3420
   3421static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
   3422			      u64 *info1, u64 *info2,
   3423			      u32 *intr_info, u32 *error_code)
   3424{
   3425	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
   3426
   3427	*reason = control->exit_code;
   3428	*info1 = control->exit_info_1;
   3429	*info2 = control->exit_info_2;
   3430	*intr_info = control->exit_int_info;
   3431	if ((*intr_info & SVM_EXITINTINFO_VALID) &&
   3432	    (*intr_info & SVM_EXITINTINFO_VALID_ERR))
   3433		*error_code = control->exit_int_info_err;
   3434	else
   3435		*error_code = 0;
   3436}
   3437
   3438static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
   3439{
   3440	struct vcpu_svm *svm = to_svm(vcpu);
   3441	struct kvm_run *kvm_run = vcpu->run;
   3442	u32 exit_code = svm->vmcb->control.exit_code;
   3443
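       	/*
       	 * CachePC: if userspace requested a VM pause, emit the pause
       	 * event before this exit is handled (cpc_send_pause_event() is
       	 * assumed to block until userspace acknowledges the pause).
       	 */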
   3444	if (cpc_pause_vm) {
   3445		CPC_DBG("Pausing vm..\n");
   3446		cpc_send_pause_event();
   3447	}
   3448
   3449	trace_kvm_exit(vcpu, KVM_ISA_SVM);
   3450
   3451	/* SEV-ES guests must use the CR write traps to track CR registers. */
   3452	if (!sev_es_guest(vcpu->kvm)) {
   3453		if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
   3454			vcpu->arch.cr0 = svm->vmcb->save.cr0;
   3455		if (npt_enabled)
   3456			vcpu->arch.cr3 = svm->vmcb->save.cr3;
   3457	}
   3458
   3459	if (is_guest_mode(vcpu)) {
   3460		int vmexit;
   3461
   3462		trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
   3463
   3464		vmexit = nested_svm_exit_special(svm);
   3465
   3466		if (vmexit == NESTED_EXIT_CONTINUE)
   3467			vmexit = nested_svm_exit_handled(svm);
   3468
   3469		if (vmexit == NESTED_EXIT_DONE)
   3470			return 1;
   3471	}
   3472
   3473	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
   3474		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
   3475		kvm_run->fail_entry.hardware_entry_failure_reason
   3476			= svm->vmcb->control.exit_code;
   3477		kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
   3478		dump_vmcb(vcpu);
   3479		return 0;
   3480	}
   3481
   3482	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
   3483	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
   3484	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
   3485	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
   3486		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
   3487		       "exit_code 0x%x\n",
   3488		       __func__, svm->vmcb->control.exit_int_info,
   3489		       exit_code);
   3490
   3491	if (exit_fastpath != EXIT_FASTPATH_NONE)
   3492		return 1;
   3493
   3494	return svm_invoke_exit_handler(vcpu, exit_code);
   3495}
   3496
   3497static void reload_tss(struct kvm_vcpu *vcpu)
   3498{
   3499	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
   3500
   3501	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
   3502	load_TR_desc();
   3503}
   3504
   3505static void pre_svm_run(struct kvm_vcpu *vcpu)
   3506{
   3507	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
   3508	struct vcpu_svm *svm = to_svm(vcpu);
   3509
   3510	/*
   3511	 * If the previous vmrun of the vmcb occurred on a different physical
   3512	 * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
   3513	 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
   3514	 */
   3515	if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
   3516		svm->current_vmcb->asid_generation = 0;
   3517		vmcb_mark_all_dirty(svm->vmcb);
   3518		svm->current_vmcb->cpu = vcpu->cpu;
   3519	}
   3520
   3521	if (sev_guest(vcpu->kvm))
   3522		return pre_sev_run(svm, vcpu->cpu);
   3523
   3524	/* FIXME: handle wraparound of asid_generation */
   3525	if (svm->current_vmcb->asid_generation != sd->asid_generation)
   3526		new_asid(svm, sd);
   3527}
   3528
   3529static void svm_inject_nmi(struct kvm_vcpu *vcpu)
   3530{
   3531	struct vcpu_svm *svm = to_svm(vcpu);
   3532
   3533	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
   3534	vcpu->arch.hflags |= HF_NMI_MASK;
   3535	if (!sev_es_guest(vcpu->kvm))
   3536		svm_set_intercept(svm, INTERCEPT_IRET);
   3537	++vcpu->stat.nmi_injections;
   3538}
   3539
   3540static void svm_inject_irq(struct kvm_vcpu *vcpu)
   3541{
   3542	struct vcpu_svm *svm = to_svm(vcpu);
   3543
   3544	BUG_ON(!(gif_set(svm)));
   3545
   3546	trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
   3547	++vcpu->stat.irq_injections;
   3548
   3549	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
   3550		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
   3551}
   3552
   3553void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
   3554				     int trig_mode, int vector)
   3555{
   3556	/*
   3557	 * vcpu->arch.apicv_active must be read after vcpu->mode.
   3558	 * Pairs with smp_store_release in vcpu_enter_guest.
   3559	 */
   3560	bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
   3561
   3562	if (!READ_ONCE(vcpu->arch.apicv_active)) {
   3563		/* Process the interrupt via inject_pending_event */
   3564		kvm_make_request(KVM_REQ_EVENT, vcpu);
   3565		kvm_vcpu_kick(vcpu);
   3566		return;
   3567	}
   3568
   3569	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
   3570	if (in_guest_mode) {
   3571		/*
   3572		 * Signal the doorbell to tell hardware to inject the IRQ.  If
   3573		 * the vCPU exits the guest before the doorbell chimes, hardware
   3574		 * will automatically process AVIC interrupts at the next VMRUN.
   3575		 */
   3576		avic_ring_doorbell(vcpu);
   3577	} else {
   3578		/*
   3579		 * Wake the vCPU if it was blocking.  KVM will then detect the
   3580		 * pending IRQ when checking if the vCPU has a wake event.
   3581		 */
   3582		kvm_vcpu_wake_up(vcpu);
   3583	}
   3584}
   3585
   3586static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
   3587				  int trig_mode, int vector)
   3588{
   3589	kvm_lapic_set_irr(vector, apic);
   3590
   3591	/*
   3592	 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
   3593	 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
   3594	 * the read of guest_mode.  This guarantees that either VMRUN will see
   3595	 * and process the new vIRR entry, or that svm_complete_interrupt_delivery
   3596	 * will signal the doorbell if the CPU has already entered the guest.
   3597	 */
   3598	smp_mb__after_atomic();
   3599	svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
   3600}
   3601
   3602static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
   3603{
   3604	struct vcpu_svm *svm = to_svm(vcpu);
   3605
   3606	/*
   3607	 * SEV-ES guests must always keep the CR intercepts cleared. CR
   3608	 * tracking is done using the CR write traps.
   3609	 */
   3610	if (sev_es_guest(vcpu->kvm))
   3611		return;
   3612
   3613	if (nested_svm_virtualize_tpr(vcpu))
   3614		return;
   3615
   3616	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
   3617
   3618	if (irr == -1)
   3619		return;
   3620
   3621	if (tpr >= irr)
   3622		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
   3623}
   3624
   3625bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
   3626{
   3627	struct vcpu_svm *svm = to_svm(vcpu);
   3628	struct vmcb *vmcb = svm->vmcb;
   3629	bool ret;
   3630
   3631	if (!gif_set(svm))
   3632		return true;
   3633
   3634	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
   3635		return false;
   3636
   3637	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
   3638	      (vcpu->arch.hflags & HF_NMI_MASK);
   3639
   3640	return ret;
   3641}
   3642
   3643static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
   3644{
   3645	struct vcpu_svm *svm = to_svm(vcpu);
   3646	if (svm->nested.nested_run_pending)
   3647		return -EBUSY;
   3648
   3649	if (svm_nmi_blocked(vcpu))
   3650		return 0;
   3651
   3652	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
   3653	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
   3654		return -EBUSY;
   3655	return 1;
   3656}
   3657
   3658static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
   3659{
   3660	return !!(vcpu->arch.hflags & HF_NMI_MASK);
   3661}
   3662
   3663static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
   3664{
   3665	struct vcpu_svm *svm = to_svm(vcpu);
   3666
   3667	if (masked) {
   3668		vcpu->arch.hflags |= HF_NMI_MASK;
   3669		if (!sev_es_guest(vcpu->kvm))
   3670			svm_set_intercept(svm, INTERCEPT_IRET);
   3671	} else {
   3672		vcpu->arch.hflags &= ~HF_NMI_MASK;
   3673		if (!sev_es_guest(vcpu->kvm))
   3674			svm_clr_intercept(svm, INTERCEPT_IRET);
   3675	}
   3676}
   3677
   3678bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
   3679{
   3680	struct vcpu_svm *svm = to_svm(vcpu);
   3681	struct vmcb *vmcb = svm->vmcb;
   3682
   3683	if (!gif_set(svm))
   3684		return true;
   3685
   3686	if (is_guest_mode(vcpu)) {
   3687		/* As long as interrupts are being delivered...  */
   3688		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
   3689		    ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
   3690		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
   3691			return true;
   3692
   3693		/* ... vmexits aren't blocked by the interrupt shadow  */
   3694		if (nested_exit_on_intr(svm))
   3695			return false;
   3696	} else {
   3697		if (!svm_get_if_flag(vcpu))
   3698			return true;
   3699	}
   3700
   3701	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
   3702}
   3703
   3704static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
   3705{
   3706	struct vcpu_svm *svm = to_svm(vcpu);
   3707
   3708	if (svm->nested.nested_run_pending)
   3709		return -EBUSY;
   3710
   3711	if (svm_interrupt_blocked(vcpu))
   3712		return 0;
   3713
   3714	/*
   3715	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
   3716	 * e.g. if the IRQ arrived asynchronously after checking nested events.
   3717	 */
   3718	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
   3719		return -EBUSY;
   3720
   3721	return 1;
   3722}
   3723
   3724static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
   3725{
   3726	struct vcpu_svm *svm = to_svm(vcpu);
   3727
   3728	/*
   3729	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
   3730	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
   3731	 * get that intercept, this function will be called again though and
   3732	 * we'll get the vintr intercept. However, if the vGIF feature is
   3733	 * enabled, the STGI interception will not occur. Enable the irq
   3734	 * window under the assumption that the hardware will set the GIF.
   3735	 */
   3736	if (vgif || gif_set(svm)) {
   3737		/*
   3738		 * IRQ window is not needed when AVIC is enabled,
   3739		 * unless we have pending ExtINT since it cannot be injected
   3740		 * via AVIC. In such case, KVM needs to temporarily disable AVIC,
   3741		 * and fallback to injecting IRQ via V_IRQ.
   3742		 *
   3743		 * If running nested, AVIC is already locally inhibited
   3744		 * on this vCPU, therefore there is no need to request
   3745		 * the VM wide AVIC inhibition.
   3746		 */
   3747		if (!is_guest_mode(vcpu))
   3748			kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
   3749
   3750		svm_set_vintr(svm);
   3751	}
   3752}
   3753
   3754static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
   3755{
   3756	struct vcpu_svm *svm = to_svm(vcpu);
   3757
   3758	if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
   3759		return; /* IRET will cause a vm exit */
   3760
   3761	if (!gif_set(svm)) {
   3762		if (vgif)
   3763			svm_set_intercept(svm, INTERCEPT_STGI);
   3764		return; /* STGI will cause a vm exit */
   3765	}
   3766
   3767	/*
   3768	 * Something prevents NMI from being injected. Single step over possible
   3769	 * problem (IRET or exception injection or interrupt shadow)
   3770	 */
   3771	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
   3772	svm->nmi_singlestep = true;
   3773	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
   3774}
   3775
   3776static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
   3777{
   3778	struct vcpu_svm *svm = to_svm(vcpu);
   3779
   3780	/*
   3781	 * Flush only the current ASID even if the TLB flush was invoked via
   3782	 * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
   3783	 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
   3784	 * unconditionally does a TLB flush on both nested VM-Enter and nested
   3785	 * VM-Exit (via kvm_mmu_reset_context()).
   3786	 */
   3787	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
   3788		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
   3789	else
   3790		svm->current_vmcb->asid_generation--;
   3791}
   3792
   3793static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
   3794{
   3795	struct vcpu_svm *svm = to_svm(vcpu);
   3796
   3797	invlpga(gva, svm->vmcb->control.asid);
   3798}
   3799
   3800static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
   3801{
   3802	struct vcpu_svm *svm = to_svm(vcpu);
   3803
   3804	if (nested_svm_virtualize_tpr(vcpu))
   3805		return;
   3806
   3807	if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
   3808		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
   3809		kvm_set_cr8(vcpu, cr8);
   3810	}
   3811}
   3812
   3813static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
   3814{
   3815	struct vcpu_svm *svm = to_svm(vcpu);
   3816	u64 cr8;
   3817
   3818	if (nested_svm_virtualize_tpr(vcpu) ||
   3819	    kvm_vcpu_apicv_active(vcpu))
   3820		return;
   3821
   3822	cr8 = kvm_get_cr8(vcpu);
   3823	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
   3824	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
   3825}
   3826
   3827static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
   3828{
   3829	struct vcpu_svm *svm = to_svm(vcpu);
   3830	u8 vector;
   3831	int type;
   3832	u32 exitintinfo = svm->vmcb->control.exit_int_info;
   3833	unsigned int3_injected = svm->int3_injected;
   3834
   3835	svm->int3_injected = 0;
   3836
   3837	/*
   3838	 * If we've made progress since setting HF_IRET_MASK, we've
   3839	 * executed an IRET and can allow NMI injection.
   3840	 */
   3841	if ((vcpu->arch.hflags & HF_IRET_MASK) &&
   3842	    (sev_es_guest(vcpu->kvm) ||
   3843	     kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
   3844		vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
   3845		kvm_make_request(KVM_REQ_EVENT, vcpu);
   3846	}
   3847
   3848	vcpu->arch.nmi_injected = false;
   3849	kvm_clear_exception_queue(vcpu);
   3850	kvm_clear_interrupt_queue(vcpu);
   3851
   3852	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
   3853		return;
   3854
   3855	kvm_make_request(KVM_REQ_EVENT, vcpu);
   3856
   3857	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
   3858	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
   3859
   3860	switch (type) {
   3861	case SVM_EXITINTINFO_TYPE_NMI:
   3862		vcpu->arch.nmi_injected = true;
   3863		break;
   3864	case SVM_EXITINTINFO_TYPE_EXEPT:
   3865		/*
   3866		 * Never re-inject a #VC exception.
   3867		 */
   3868		if (vector == X86_TRAP_VC)
   3869			break;
   3870
   3871		/*
   3872		 * In case of software exceptions, do not reinject the vector,
   3873		 * but re-execute the instruction instead. Rewind RIP first
   3874		 * if we emulated INT3 before.
   3875		 */
   3876		if (kvm_exception_is_soft(vector)) {
   3877			if (vector == BP_VECTOR && int3_injected &&
   3878			    kvm_is_linear_rip(vcpu, svm->int3_rip))
   3879				kvm_rip_write(vcpu,
   3880					      kvm_rip_read(vcpu) - int3_injected);
   3881			break;
   3882		}
   3883		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
   3884			u32 err = svm->vmcb->control.exit_int_info_err;
   3885			kvm_requeue_exception_e(vcpu, vector, err);
   3886
   3887		} else
   3888			kvm_requeue_exception(vcpu, vector);
   3889		break;
   3890	case SVM_EXITINTINFO_TYPE_INTR:
   3891		kvm_queue_interrupt(vcpu, vector, false);
   3892		break;
   3893	default:
   3894		break;
   3895	}
   3896}
   3897
   3898static void svm_cancel_injection(struct kvm_vcpu *vcpu)
   3899{
   3900	struct vcpu_svm *svm = to_svm(vcpu);
   3901	struct vmcb_control_area *control = &svm->vmcb->control;
   3902
   3903	control->exit_int_info = control->event_inj;
   3904	control->exit_int_info_err = control->event_inj_err;
   3905	control->event_inj = 0;
   3906	svm_complete_interrupts(vcpu);
   3907}
   3908
   3909static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
   3910{
   3911	return 1;
   3912}
   3913
   3914static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
   3915{
   3916	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
   3917	    to_svm(vcpu)->vmcb->control.exit_info_1)
   3918		return handle_fastpath_set_msr_irqoff(vcpu);
   3919
   3920	return EXIT_FASTPATH_NONE;
   3921}
   3922
   3923static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
   3924{
   3925	struct vcpu_svm *svm = to_svm(vcpu);
   3926	unsigned long vmcb_pa = svm->current_vmcb->pa;
   3927	int cpu;
   3928
   3929	/* debug prints influence single-stepping */
   3930	//WARN_ON_ONCE(cpc_singlestep && cpc_loglevel >= CPC_LOGLVL_DBG);
   3931
   3932	guest_state_enter_irqoff();
   3933
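       	/*
       	 * The measurement setup appears to assume the measured vCPU
       	 * thread is pinned to logical core 2; warn if it is not, since
       	 * prime+probe results from another core would not reflect the
       	 * intended L1 sets.
       	 */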
   3934	cpu = get_cpu();
   3935	WARN_ON(cpu != 2);
   3936
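       	/*
       	 * Clear the measurement buffer (L1_SETS bytes, presumably one
       	 * entry per L1 cache set) before the next prime+probe round.
       	 */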
   3937	memset(cpc_msrmts, 0, L1_SETS);
   3938
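       	/*
       	 * Re-arm single-stepping: clamp the APIC timer to its configured
       	 * minimum or decrement it based on the previous exit reason
       	 * (NPF vs. INTR), and record the current guest rip, presumably so
       	 * the next exit can detect whether an instruction was executed.
       	 */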
   3939	if (cpc_singlestep_reset) {
   3940		if (cpc_apic_timer < cpc_apic_timer_min) {
   3941			cpc_apic_timer = cpc_apic_timer_min;
   3942		} else if (cpc_svm_exitcode == SVM_EXIT_NPF) {
   3943			cpc_apic_timer -= cpc_apic_timer_dec_npf;
   3944		} else if (cpc_svm_exitcode == SVM_EXIT_INTR) {
   3945			cpc_apic_timer -= cpc_apic_timer_dec_intr;
   3946		}
   3947
   3948		if (sev_es_guest(vcpu->kvm)) {
   3949			/* invalidate cached vmsa so rip is updated */
   3950			wbinvd();
   3951			cpc_rip_prev = svm->sev_es.vmsa->rip;
   3952		} else {
   3953			cpc_rip_prev = kvm_rip_read(vcpu);
   3954		}
   3955		cpc_rip_prev_set = true;
   3956		cpc_retinst_prev = 0;
   3957
   3958		cpc_singlestep = true;
   3959		cpc_singlestep_reset = false;
   3960	}
   3961
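       	/*
       	 * Decide whether the APIC one-shot timer is armed for this entry:
       	 * long-step mode uses a large fixed budget, single-step mode uses
       	 * the adaptive cpc_apic_timer value, otherwise no one-shot timer
       	 * is used.
       	 */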
   3962	if (cpc_long_step) {
   3963		WARN_ON(cpc_singlestep);
   3964		cpc_apic_timer = 5000000;
   3965		cpc_apic_oneshot = true;
   3966	} else if (cpc_singlestep) {
   3967		cpc_apic_oneshot = true;
   3968	} else {
   3969		cpc_apic_oneshot = false;
   3970	}
   3971
   3972	if (cpc_apic_timer < cpc_apic_timer_min)
   3973		cpc_apic_timer = cpc_apic_timer_min;
   3974
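       	/*
       	 * Sample the retired-instruction and guest L1-miss PMCs right
       	 * before VMRUN; the values read after the exit are diffed against
       	 * these snapshots to attribute the counts to the guest.
       	 */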
   3975	cpc_retinst = cpc_read_pmc(CPC_RETINST_PMC);
   3976	cpc_retinst_user = cpc_read_pmc(CPC_RETINST_USER_PMC);
   3977	cpc_guest_misses = cpc_read_pmc(CPC_L1MISS_GUEST_PMC);
   3978
   3979	if (sev_es_guest(vcpu->kvm)) {
   3980		__svm_sev_es_vcpu_run(vmcb_pa);
   3981	} else {
   3982		struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
   3983
   3984		/*
   3985		 * Use a single vmcb (vmcb01 because it's always valid) for
   3986		 * context switching guest state via VMLOAD/VMSAVE, that way
   3987		 * the state doesn't need to be copied between vmcb01 and
   3988		 * vmcb02 when switching vmcbs for nested virtualization.
   3989		 */
   3990		vmload(svm->vmcb01.pa);
   3991		__svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
   3992		vmsave(svm->vmcb01.pa);
   3993
   3994		vmload(__sme_page_pa(sd->save_area));
   3995	}
   3996
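       	/* Convert the pre-entry PMC snapshots into per-entry deltas. */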
   3997	cpc_retinst = cpc_read_pmc(CPC_RETINST_PMC) - cpc_retinst;
   3998	cpc_retinst_user = cpc_read_pmc(CPC_RETINST_USER_PMC) - cpc_retinst_user;
   3999	cpc_guest_misses = cpc_read_pmc(CPC_L1MISS_GUEST_PMC) - cpc_guest_misses;
   4000
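       	/*
       	 * While measuring the baseline, track the minimum guest L1-miss
       	 * count observed for a single entry; once the baseline is active,
       	 * subtract it from each subsequent measurement.
       	 */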
   4001	if (cpc_baseline_measure && cpc_guest_misses < cpc_baseline_guest_misses)
   4002		cpc_baseline_guest_misses = cpc_guest_misses;
   4003
   4004	if (cpc_baseline_active) {
   4005		WARN_ON_ONCE(cpc_guest_misses < cpc_baseline_guest_misses);
   4006		cpc_guest_misses -= cpc_baseline_guest_misses;
   4007	}
   4008
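       	/*
       	 * With prime+probe active, record the probe results, presumably
       	 * into the cpc_msrmts buffer cleared above.
       	 */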
   4009	if (cpc_prime_probe)
   4010		cpc_save_msrmts(cpc_ds);
   4011
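       	/*
       	 * In page-wise tracking mode, accumulate the retired-instruction
       	 * counts; the "- 1" presumably discounts the VMRUN itself.
       	 */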
   4012	if (cpc_track_mode == CPC_TRACK_PAGES) {
   4013		if (cpc_retinst >= 1)
   4014			cpc_track_pages.retinst += cpc_retinst - 1;
   4015		if (cpc_retinst_user >= 1)
   4016			cpc_track_pages.retinst_user += cpc_retinst_user - 1;
   4017	}
   4018
   4019	put_cpu();
   4020
   4021	guest_state_exit_irqoff();
   4022
   4023	if (cpc_apic_oneshot)
   4024		CPC_DBG("Oneshot %i\n", cpc_apic_timer);
   4025	CPC_DBG("Retinst %llu\n", cpc_retinst);
   4026
   4027	cpc_apic_oneshot = false;
   4028}
   4029
   4030static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
   4031{
   4032	struct vcpu_svm *svm = to_svm(vcpu);
   4033
   4034	if (cpc_pause_vm) {
   4035		CPC_DBG("Pausing vm..\n");
   4036		cpc_send_pause_event();
   4037	}
   4038
   4039	trace_kvm_entry(vcpu);
   4040
   4041	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
   4042	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
   4043	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
   4044
   4045	/*
   4046	 * Disable singlestep if we're injecting an interrupt/exception.
   4047	 * We don't want our modified rflags to be pushed on the stack where
   4048	 * we might not be able to easily reset them if we disabled NMI
   4049	 * singlestep later.
   4050	 */
   4051	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
   4052		/*
   4053		 * Event injection happens before external interrupts cause a
   4054		 * vmexit and interrupts are disabled here, so smp_send_reschedule
   4055		 * is enough to force an immediate vmexit.
   4056		 */
   4057		disable_nmi_singlestep(svm);
   4058		smp_send_reschedule(vcpu->cpu);
   4059	}
   4060
   4061	pre_svm_run(vcpu);
   4062
   4063	sync_lapic_to_cr8(vcpu);
   4064
   4065	if (unlikely(svm->asid != svm->vmcb->control.asid)) {
   4066		svm->vmcb->control.asid = svm->asid;
   4067		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
   4068	}
   4069	svm->vmcb->save.cr2 = vcpu->arch.cr2;
   4070
   4071	svm_hv_update_vp_id(svm->vmcb, vcpu);
   4072
   4073	/*
   4074	 * Run with all-zero DR6 unless needed, so that we can get the exact cause
   4075	 * of a #DB.
   4076	 */
   4077	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
   4078		svm_set_dr6(svm, vcpu->arch.dr6);
   4079	else
   4080		svm_set_dr6(svm, DR6_ACTIVE_LOW);
   4081
   4082	clgi();
   4083	kvm_load_guest_xsave_state(vcpu);
   4084
   4085	kvm_wait_lapic_expire(vcpu);
   4086
   4087	/*
   4088	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
   4089	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
   4090	 * is no need to worry about the conditional branch over the wrmsr
   4091	 * being speculatively taken.
   4092	 */
   4093	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
   4094		x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
   4095
   4096	svm_vcpu_enter_exit(vcpu);
   4097
   4098	/*
   4099	 * We do not use IBRS in the kernel. If this vCPU has used the
   4100	 * SPEC_CTRL MSR it may have left it on; save the value and
   4101	 * turn it off. This is much more efficient than blindly adding
   4102	 * it to the atomic save/restore list. Especially as the former
   4103	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
   4104	 *
   4105	 * For non-nested case:
   4106	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
   4107	 * save it.
   4108	 *
   4109	 * For nested case:
   4110	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
   4111	 * save it.
   4112	 */
   4113	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
   4114	    unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
   4115		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
   4116
   4117	if (!sev_es_guest(vcpu->kvm))
   4118		reload_tss(vcpu);
   4119
   4120	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
   4121		x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
   4122
   4123	if (!sev_es_guest(vcpu->kvm)) {
   4124		vcpu->arch.cr2 = svm->vmcb->save.cr2;
   4125		vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
   4126		vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
   4127		vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
   4128	}
   4129	vcpu->arch.regs_dirty = 0;
   4130
   4131	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
   4132		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
   4133
   4134	kvm_load_host_xsave_state(vcpu);
   4135	stgi();
   4136
   4137	/* Any pending NMI will happen here */
   4138
   4139	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
   4140		kvm_after_interrupt(vcpu);
   4141
   4142	sync_cr8_to_lapic(vcpu);
   4143
   4144	svm->next_rip = 0;
   4145	if (is_guest_mode(vcpu)) {
   4146		nested_sync_control_from_vmcb02(svm);
   4147
   4148		/* Track VMRUNs that have made it past consistency checking */
   4149		if (svm->nested.nested_run_pending &&
   4150		    svm->vmcb->control.exit_code != SVM_EXIT_ERR)
   4151			++vcpu->stat.nested_run;
   4152
   4153		svm->nested.nested_run_pending = 0;
   4154	}
   4155
   4156	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
   4157	vmcb_mark_all_clean(svm->vmcb);
   4158
   4159	/* if exit due to PF check for async PF */
   4160	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
   4161		vcpu->arch.apf.host_apf_flags =
   4162			kvm_read_and_reset_apf_flags();
   4163
   4164	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
   4165
   4166	/*
   4167	 * We need to handle MC intercepts here before the vcpu has a chance to
   4168	 * change the physical cpu
   4169	 */
   4170	if (unlikely(svm->vmcb->control.exit_code ==
   4171		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
   4172		svm_handle_mce(vcpu);
   4173
   4174	svm_complete_interrupts(vcpu);
   4175
   4176	if (is_guest_mode(vcpu))
   4177		return EXIT_FASTPATH_NONE;
   4178
   4179	return svm_exit_handlers_fastpath(vcpu);
   4180}
   4181
   4182static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
   4183			     int root_level)
   4184{
   4185	struct vcpu_svm *svm = to_svm(vcpu);
   4186	unsigned long cr3;
   4187
   4188	if (npt_enabled) {
   4189		svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
   4190		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
   4191
   4192		hv_track_root_tdp(vcpu, root_hpa);
   4193
   4194		cr3 = vcpu->arch.cr3;
   4195	} else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
   4196		cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
   4197	} else {
   4198		/* PCID in the guest should be impossible with a 32-bit MMU. */
   4199		WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
   4200		cr3 = root_hpa;
   4201	}
   4202
   4203	svm->vmcb->save.cr3 = cr3;
   4204	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
   4205}
   4206
   4207static int is_disabled(void)
   4208{
   4209	u64 vm_cr;
   4210
   4211	rdmsrl(MSR_VM_CR, vm_cr);
   4212	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
   4213		return 1;
   4214
   4215	return 0;
   4216}
   4217
   4218static void
   4219svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
   4220{
   4221	/*
   4222	 * Patch in the VMMCALL instruction:
   4223	 */
   4224	hypercall[0] = 0x0f;
   4225	hypercall[1] = 0x01;
   4226	hypercall[2] = 0xd9;
   4227}
   4228
   4229static int __init svm_check_processor_compat(void)
   4230{
   4231	return 0;
   4232}
   4233
   4234/*
   4235 * The kvm parameter can be NULL (module initialization, or invocation before
   4236 * VM creation). Be sure to check the kvm parameter before using it.
   4237 */
   4238static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
   4239{
   4240	switch (index) {
   4241	case MSR_IA32_MCG_EXT_CTL:
   4242	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
   4243		return false;
   4244	case MSR_IA32_SMBASE:
   4245		/* SEV-ES guests do not support SMM, so report false */
   4246		if (kvm && sev_es_guest(kvm))
   4247			return false;
   4248		break;
   4249	default:
   4250		break;
   4251	}
   4252
   4253	return true;
   4254}
   4255
   4256static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
   4257{
   4258	return 0;
   4259}
   4260
   4261static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
   4262{
   4263	struct vcpu_svm *svm = to_svm(vcpu);
   4264	struct kvm_cpuid_entry2 *best;
   4265	struct kvm *kvm = vcpu->kvm;
   4266
   4267	vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
   4268				    boot_cpu_has(X86_FEATURE_XSAVE) &&
   4269				    boot_cpu_has(X86_FEATURE_XSAVES);
   4270
   4271	/* Update nrips enabled cache */
   4272	svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
   4273			     guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
   4274
   4275	svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
   4276	svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
   4277
   4278	svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
   4279
   4280	svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
   4281			guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
   4282
   4283	svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
   4284			guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
   4285
   4286	svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
   4287
   4288	svm_recalc_instruction_intercepts(vcpu, svm);
   4289
   4290	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
   4291	if (sev_guest(vcpu->kvm)) {
   4292		best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
   4293		if (best)
   4294			vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
   4295	}
   4296
   4297	if (kvm_vcpu_apicv_active(vcpu)) {
   4298		/*
   4299		 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
   4300		 * is exposed to the guest, disable AVIC.
   4301		 */
   4302		if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
   4303			kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
   4304	}
   4305	init_vmcb_after_set_cpuid(vcpu);
   4306}
   4307
   4308static bool svm_has_wbinvd_exit(void)
   4309{
   4310	return true;
   4311}
   4312
   4313#define PRE_EX(exit)  { .exit_code = (exit), \
   4314			.stage = X86_ICPT_PRE_EXCEPT, }
   4315#define POST_EX(exit) { .exit_code = (exit), \
   4316			.stage = X86_ICPT_POST_EXCEPT, }
   4317#define POST_MEM(exit) { .exit_code = (exit), \
   4318			.stage = X86_ICPT_POST_MEMACCESS, }
   4319
   4320static const struct __x86_intercept {
   4321	u32 exit_code;
   4322	enum x86_intercept_stage stage;
   4323} x86_intercept_map[] = {
   4324	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
   4325	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
   4326	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
   4327	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
   4328	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
   4329	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
   4330	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
   4331	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
   4332	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
   4333	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
   4334	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
   4335	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
   4336	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
   4337	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
   4338	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
   4339	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
   4340	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
   4341	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
   4342	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
   4343	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
   4344	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
   4345	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
   4346	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
   4347	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
   4348	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
   4349	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
   4350	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
   4351	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
   4352	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
   4353	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
   4354	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
   4355	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
   4356	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
   4357	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
   4358	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
   4359	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
   4360	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
   4361	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
   4362	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
   4363	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
   4364	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
   4365	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
   4366	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
   4367	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
   4368	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
   4369	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
   4370	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
   4371};
   4372
   4373#undef PRE_EX
   4374#undef POST_EX
   4375#undef POST_MEM
   4376
   4377static int svm_check_intercept(struct kvm_vcpu *vcpu,
   4378			       struct x86_instruction_info *info,
   4379			       enum x86_intercept_stage stage,
   4380			       struct x86_exception *exception)
   4381{
   4382	struct vcpu_svm *svm = to_svm(vcpu);
   4383	int vmexit, ret = X86EMUL_CONTINUE;
   4384	struct __x86_intercept icpt_info;
   4385	struct vmcb *vmcb = svm->vmcb;
   4386
   4387	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
   4388		goto out;
   4389
   4390	icpt_info = x86_intercept_map[info->intercept];
   4391
   4392	if (stage != icpt_info.stage)
   4393		goto out;
   4394
   4395	switch (icpt_info.exit_code) {
   4396	case SVM_EXIT_READ_CR0:
   4397		if (info->intercept == x86_intercept_cr_read)
   4398			icpt_info.exit_code += info->modrm_reg;
   4399		break;
   4400	case SVM_EXIT_WRITE_CR0: {
   4401		unsigned long cr0, val;
   4402
   4403		if (info->intercept == x86_intercept_cr_write)
   4404			icpt_info.exit_code += info->modrm_reg;
   4405
   4406		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
   4407		    info->intercept == x86_intercept_clts)
   4408			break;
   4409
   4410		if (!(vmcb12_is_intercept(&svm->nested.ctl,
   4411					INTERCEPT_SELECTIVE_CR0)))
   4412			break;
   4413
   4414		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
   4415		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
   4416
   4417		if (info->intercept == x86_intercept_lmsw) {
   4418			cr0 &= 0xfUL;
   4419			val &= 0xfUL;
   4420			/* lmsw can't clear PE - catch this here */
   4421			if (cr0 & X86_CR0_PE)
   4422				val |= X86_CR0_PE;
   4423		}
   4424
   4425		if (cr0 ^ val)
   4426			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
   4427
   4428		break;
   4429	}
   4430	case SVM_EXIT_READ_DR0:
   4431	case SVM_EXIT_WRITE_DR0:
   4432		icpt_info.exit_code += info->modrm_reg;
   4433		break;
   4434	case SVM_EXIT_MSR:
   4435		if (info->intercept == x86_intercept_wrmsr)
   4436			vmcb->control.exit_info_1 = 1;
   4437		else
   4438			vmcb->control.exit_info_1 = 0;
   4439		break;
   4440	case SVM_EXIT_PAUSE:
   4441		/*
   4442		 * We get this for NOP only, but pause
   4443		 * is rep nop, check this here
   4444		 */
   4445		if (info->rep_prefix != REPE_PREFIX)
   4446			goto out;
   4447		break;
   4448	case SVM_EXIT_IOIO: {
   4449		u64 exit_info;
   4450		u32 bytes;
   4451
   4452		if (info->intercept == x86_intercept_in ||
   4453		    info->intercept == x86_intercept_ins) {
   4454			exit_info = ((info->src_val & 0xffff) << 16) |
   4455				SVM_IOIO_TYPE_MASK;
   4456			bytes = info->dst_bytes;
   4457		} else {
   4458			exit_info = (info->dst_val & 0xffff) << 16;
   4459			bytes = info->src_bytes;
   4460		}
   4461
   4462		if (info->intercept == x86_intercept_outs ||
   4463		    info->intercept == x86_intercept_ins)
   4464			exit_info |= SVM_IOIO_STR_MASK;
   4465
   4466		if (info->rep_prefix)
   4467			exit_info |= SVM_IOIO_REP_MASK;
   4468
   4469		bytes = min(bytes, 4u);
   4470
   4471		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
   4472
   4473		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
   4474
   4475		vmcb->control.exit_info_1 = exit_info;
   4476		vmcb->control.exit_info_2 = info->next_rip;
   4477
   4478		break;
   4479	}
   4480	default:
   4481		break;
   4482	}
   4483
   4484	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
   4485	if (static_cpu_has(X86_FEATURE_NRIPS))
   4486		vmcb->control.next_rip  = info->next_rip;
   4487	vmcb->control.exit_code = icpt_info.exit_code;
   4488	vmexit = nested_svm_exit_handled(svm);
   4489
   4490	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
   4491					   : X86EMUL_CONTINUE;
   4492
   4493out:
   4494	return ret;
   4495}
   4496
   4497static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
   4498{
   4499	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
   4500		vcpu->arch.at_instruction_boundary = true;
   4501}
   4502
   4503static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
   4504{
   4505	if (!kvm_pause_in_guest(vcpu->kvm))
   4506		shrink_ple_window(vcpu);
   4507}
   4508
   4509static void svm_setup_mce(struct kvm_vcpu *vcpu)
   4510{
   4511	/* [63:9] are reserved. */
   4512	vcpu->arch.mcg_cap &= 0x1ff;
   4513}
   4514
   4515bool svm_smi_blocked(struct kvm_vcpu *vcpu)
   4516{
   4517	struct vcpu_svm *svm = to_svm(vcpu);
   4518
   4519	/* Per APM Vol.2 15.22.2 "Response to SMI" */
   4520	if (!gif_set(svm))
   4521		return true;
   4522
   4523	return is_smm(vcpu);
   4524}
   4525
   4526static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
   4527{
   4528	struct vcpu_svm *svm = to_svm(vcpu);
   4529	if (svm->nested.nested_run_pending)
   4530		return -EBUSY;
   4531
   4532	if (svm_smi_blocked(vcpu))
   4533		return 0;
   4534
   4535	/* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
   4536	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
   4537		return -EBUSY;
   4538
   4539	return 1;
   4540}
   4541
   4542static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
   4543{
   4544	struct vcpu_svm *svm = to_svm(vcpu);
   4545	struct kvm_host_map map_save;
   4546	int ret;
   4547
   4548	if (!is_guest_mode(vcpu))
   4549		return 0;
   4550
   4551	/* FED8h - SVM Guest */
   4552	put_smstate(u64, smstate, 0x7ed8, 1);
   4553	/* FEE0h - SVM Guest VMCB Physical Address */
   4554	put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
   4555
   4556	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
   4557	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
   4558	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
   4559
   4560	ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
   4561	if (ret)
   4562		return ret;
   4563
   4564	/*
   4565	 * KVM uses VMCB01 to store L1 host state while L2 runs but
   4566	 * VMCB01 is going to be used during SMM and thus the state will
   4567	 * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
   4568	 * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
   4569	 * format of the area is identical to the guest save area offset
   4570	 * by 0x400 (matches the offset of 'struct vmcb_save_area'
   4571	 * within 'struct vmcb'). Note: HSAVE area may also be used by
   4572	 * L1 hypervisor to save additional host context (e.g. KVM does
   4573	 * that, see svm_prepare_switch_to_guest()) which must be
   4574	 * preserved.
   4575	 */
   4576	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
   4577			 &map_save) == -EINVAL)
   4578		return 1;
   4579
   4580	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
   4581
   4582	svm_copy_vmrun_state(map_save.hva + 0x400,
   4583			     &svm->vmcb01.ptr->save);
   4584
   4585	kvm_vcpu_unmap(vcpu, &map_save, true);
   4586	return 0;
   4587}
   4588
   4589static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
   4590{
   4591	struct vcpu_svm *svm = to_svm(vcpu);
   4592	struct kvm_host_map map, map_save;
   4593	u64 saved_efer, vmcb12_gpa;
   4594	struct vmcb *vmcb12;
   4595	int ret;
   4596
   4597	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
   4598		return 0;
   4599
   4600	/* Non-zero if SMI arrived while vCPU was in guest mode. */
   4601	if (!GET_SMSTATE(u64, smstate, 0x7ed8))
   4602		return 0;
   4603
   4604	if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
   4605		return 1;
   4606
   4607	saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
   4608	if (!(saved_efer & EFER_SVME))
   4609		return 1;
   4610
   4611	vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
   4612	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
   4613		return 1;
   4614
   4615	ret = 1;
   4616	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
   4617		goto unmap_map;
   4618
   4619	if (svm_allocate_nested(svm))
   4620		goto unmap_save;
   4621
   4622	/*
   4623	 * Restore L1 host state from L1 HSAVE area as VMCB01 was
   4624	 * used during SMM (see svm_enter_smm())
   4625	 */
   4626
   4627	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
   4628
   4629	/*
   4630	 * Enter the nested guest now
   4631	 */
   4632
   4633	vmcb_mark_all_dirty(svm->vmcb01.ptr);
   4634
   4635	vmcb12 = map.hva;
   4636	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
   4637	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
   4638	ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
   4639
   4640	if (ret)
   4641		goto unmap_save;
   4642
   4643	svm->nested.nested_run_pending = 1;
   4644
   4645unmap_save:
   4646	kvm_vcpu_unmap(vcpu, &map_save, true);
   4647unmap_map:
   4648	kvm_vcpu_unmap(vcpu, &map, true);
   4649	return ret;
   4650}
   4651
   4652static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
   4653{
   4654	struct vcpu_svm *svm = to_svm(vcpu);
   4655
   4656	if (!gif_set(svm)) {
   4657		if (vgif)
   4658			svm_set_intercept(svm, INTERCEPT_STGI);
   4659		/* STGI will cause a vm exit */
   4660	} else {
   4661		/* We must be in SMM; RSM will cause a vmexit anyway.  */
   4662	}
   4663}
   4664
   4665static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
   4666					void *insn, int insn_len)
   4667{
   4668	bool smep, smap, is_user;
   4669	unsigned long cr4;
   4670	u64 error_code;
   4671
   4672	/* Emulation is always possible when KVM has access to all guest state. */
   4673	if (!sev_guest(vcpu->kvm))
   4674		return true;
   4675
   4676	/* #UD and #GP should never be intercepted for SEV guests. */
   4677	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
   4678				  EMULTYPE_TRAP_UD_FORCED |
   4679				  EMULTYPE_VMWARE_GP));
   4680
   4681	/*
   4682	 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
   4683	 * to guest register state.
   4684	 */
   4685	if (sev_es_guest(vcpu->kvm))
   4686		return false;
   4687
   4688	/*
   4689	 * Emulation is possible if the instruction is already decoded, e.g.
   4690	 * when completing I/O after returning from userspace.
   4691	 */
   4692	if (emul_type & EMULTYPE_NO_DECODE)
   4693		return true;
   4694
   4695	/*
   4696	 * Emulation is possible for SEV guests if and only if a prefilled
   4697	 * buffer containing the bytes of the intercepted instruction is
   4698	 * available. SEV guest memory is encrypted with a guest specific key
   4699	 * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
   4700	 * decode garbage.
   4701	 *
   4702	 * Inject #UD if KVM reached this point without an instruction buffer.
   4703	 * In practice, this path should never be hit by a well-behaved guest,
   4704	 * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
   4705	 * is still theoretically reachable, e.g. via unaccelerated fault-like
   4706	 * AVIC access, and needs to be handled by KVM to avoid putting the
   4707	 * guest into an infinite loop.   Injecting #UD is somewhat arbitrary,
   4708	 * but it's the least awful option given lack of insight into the guest.
   4709	 */
   4710	if (unlikely(!insn)) {
   4711		kvm_queue_exception(vcpu, UD_VECTOR);
   4712		return false;
   4713	}
   4714
   4715	/*
   4716	 * Emulate for SEV guests if the insn buffer is not empty.  The buffer
   4717	 * will be empty if the DecodeAssist microcode cannot fetch bytes for
   4718	 * the faulting instruction because the code fetch itself faulted, e.g.
   4719	 * the guest attempted to fetch from emulated MMIO or a guest page
   4720	 * table used to translate CS:RIP resides in emulated MMIO.
   4721	 */
   4722	if (likely(insn_len))
   4723		return true;
   4724
   4725	/*
   4726	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
   4727	 *
   4728	 * Errata:
   4729	 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
   4730	 * possible that CPU microcode implementing DecodeAssist will fail to
   4731	 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
   4732	 * be '0'.  This happens because microcode reads CS:RIP using a _data_
   4733	 * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
   4734	 * gives up and does not fill the instruction bytes buffer.
   4735	 *
   4736	 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
   4737	 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
   4738	 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
   4739	 * GuestIntrBytes field of the VMCB.
   4740	 *
   4741	 * This does _not_ mean that the erratum has been encountered, as the
   4742	 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
   4743	 * #PF, e.g. if the guest attempted to execute from emulated MMIO and
   4744	 * encountered a reserved/not-present #PF.
   4745	 *
   4746	 * To hit the erratum, the following conditions must be true:
   4747	 *    1. CR4.SMAP=1 (obviously).
   4748	 *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
   4749	 *       have been hit as the guest would have encountered a SMEP
   4750	 *       violation #PF, not a #NPF.
   4751	 *    3. The #NPF is not due to a code fetch, in which case failure to
   4752	 *       retrieve the instruction bytes is legitimate (see above).
   4753	 *
   4754	 * In addition, don't apply the erratum workaround if the #NPF occurred
   4755	 * while translating guest page tables (see below).
   4756	 */
   4757	error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
   4758	if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
   4759		goto resume_guest;
   4760
   4761	cr4 = kvm_read_cr4(vcpu);
   4762	smep = cr4 & X86_CR4_SMEP;
   4763	smap = cr4 & X86_CR4_SMAP;
   4764	is_user = svm_get_cpl(vcpu) == 3;
   4765	if (smap && (!smep || is_user)) {
   4766		pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
   4767
   4768		/*
   4769		 * If the fault occurred in userspace, arbitrarily inject #GP
   4770		 * to avoid killing the guest and to hopefully avoid confusing
   4771		 * the guest kernel too much, e.g. injecting #PF would not be
   4772		 * coherent with respect to the guest's page tables.  Request
   4773		 * triple fault if the fault occurred in the kernel as there's
   4774		 * no fault that KVM can inject without confusing the guest.
   4775		 * In practice, the triple fault is moot as no sane SEV kernel
   4776		 * will execute from user memory while also running with SMAP=1.
   4777		 */
   4778		if (is_user)
   4779			kvm_inject_gp(vcpu, 0);
   4780		else
   4781			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
   4782	}
   4783
   4784resume_guest:
   4785	/*
   4786	 * If the erratum was not hit, simply resume the guest and let it fault
   4787	 * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
   4788	 * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
   4789	 * userspace will kill the guest, and letting the emulator read garbage
   4790	 * will yield random behavior and potentially corrupt the guest.
   4791	 *
   4792	 * Simply resuming the guest is technically not a violation of the SEV
   4793	 * architecture.  AMD's APM states that all code fetches and page table
   4794	 * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
   4795	 * APM also states that encrypted accesses to MMIO are "ignored", but
   4796	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
   4797	 * the guest spin is technically "ignoring" the access.
   4798	 */
   4799	return false;
   4800}
   4801
   4802static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
   4803{
   4804	struct vcpu_svm *svm = to_svm(vcpu);
   4805
   4806	/*
   4807	 * TODO: Last condition latch INIT signals on vCPU when
   4808	 * vCPU is in guest-mode and vmcb12 defines intercept on INIT.
   4809	 * To properly emulate the INIT intercept,
   4810	 * svm_check_nested_events() should call nested_svm_vmexit()
   4811	 * if an INIT signal is pending.
   4812	 */
   4813	return !gif_set(svm) ||
   4814		   (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
   4815}
   4816
   4817static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
   4818{
   4819	if (!sev_es_guest(vcpu->kvm))
   4820		return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
   4821
   4822	sev_vcpu_deliver_sipi_vector(vcpu, vector);
   4823}
   4824
   4825static void svm_vm_destroy(struct kvm *kvm)
   4826{
   4827	avic_vm_destroy(kvm);
   4828	sev_vm_destroy(kvm);
   4829}
   4830
   4831static int svm_vm_init(struct kvm *kvm)
   4832{
   4833	if (!pause_filter_count || !pause_filter_thresh)
   4834		kvm->arch.pause_in_guest = true;
   4835
   4836	if (enable_apicv) {
   4837		int ret = avic_vm_init(kvm);
   4838		if (ret)
   4839			return ret;
   4840	}
   4841
   4842	return 0;
   4843}
   4844
   4845static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
   4846{
   4847	struct page *page = snp_safe_alloc_page(vcpu);
   4848
   4849	if (!page)
   4850		return NULL;
   4851
   4852	return page_address(page);
   4853}
   4854
   4855static struct kvm_x86_ops svm_x86_ops __initdata = {
   4856	.name = "kvm_amd",
   4857
   4858	.hardware_unsetup = svm_hardware_unsetup,
   4859	.hardware_enable = svm_hardware_enable,
   4860	.hardware_disable = svm_hardware_disable,
   4861	.has_emulated_msr = svm_has_emulated_msr,
   4862
   4863	.vcpu_create = svm_vcpu_create,
   4864	.vcpu_free = svm_vcpu_free,
   4865	.vcpu_reset = svm_vcpu_reset,
   4866
   4867	.vm_size = sizeof(struct kvm_svm),
   4868	.vm_init = svm_vm_init,
   4869	.vm_destroy = svm_vm_destroy,
   4870
   4871	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
   4872	.vcpu_load = svm_vcpu_load,
   4873	.vcpu_put = svm_vcpu_put,
   4874	.vcpu_blocking = avic_vcpu_blocking,
   4875	.vcpu_unblocking = avic_vcpu_unblocking,
   4876
   4877	.update_exception_bitmap = svm_update_exception_bitmap,
   4878	.get_msr_feature = svm_get_msr_feature,
   4879	.get_msr = svm_get_msr,
   4880	.set_msr = svm_set_msr,
   4881	.get_segment_base = svm_get_segment_base,
   4882	.get_segment = svm_get_segment,
   4883	.set_segment = svm_set_segment,
   4884	.get_cpl = svm_get_cpl,
   4885	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
   4886	.set_cr0 = svm_set_cr0,
   4887	.post_set_cr3 = sev_post_set_cr3,
   4888	.is_valid_cr4 = svm_is_valid_cr4,
   4889	.set_cr4 = svm_set_cr4,
   4890	.set_efer = svm_set_efer,
   4891	.get_idt = svm_get_idt,
   4892	.set_idt = svm_set_idt,
   4893	.get_gdt = svm_get_gdt,
   4894	.set_gdt = svm_set_gdt,
   4895	.set_dr7 = svm_set_dr7,
   4896	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
   4897	.cache_reg = svm_cache_reg,
   4898	.get_rflags = svm_get_rflags,
   4899	.set_rflags = svm_set_rflags,
   4900	.get_if_flag = svm_get_if_flag,
   4901
   4902	.flush_tlb_all = svm_flush_tlb_current,
   4903	.flush_tlb_current = svm_flush_tlb_current,
   4904	.flush_tlb_gva = svm_flush_tlb_gva,
   4905	.flush_tlb_guest = svm_flush_tlb_current,
   4906
   4907	.vcpu_pre_run = svm_vcpu_pre_run,
   4908	.vcpu_run = svm_vcpu_run,
   4909	.handle_exit = svm_handle_exit,
   4910	.skip_emulated_instruction = svm_skip_emulated_instruction,
   4911	.update_emulated_instruction = NULL,
   4912	.set_interrupt_shadow = svm_set_interrupt_shadow,
   4913	.get_interrupt_shadow = svm_get_interrupt_shadow,
   4914	.patch_hypercall = svm_patch_hypercall,
   4915	.inject_irq = svm_inject_irq,
   4916	.inject_nmi = svm_inject_nmi,
   4917	.queue_exception = svm_queue_exception,
   4918	.cancel_injection = svm_cancel_injection,
   4919	.interrupt_allowed = svm_interrupt_allowed,
   4920	.nmi_allowed = svm_nmi_allowed,
   4921	.get_nmi_mask = svm_get_nmi_mask,
   4922	.set_nmi_mask = svm_set_nmi_mask,
   4923	.enable_nmi_window = svm_enable_nmi_window,
   4924	.enable_irq_window = svm_enable_irq_window,
   4925	.update_cr8_intercept = svm_update_cr8_intercept,
   4926	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
   4927	.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
   4928	.apicv_post_state_restore = avic_apicv_post_state_restore,
   4929
   4930	.get_mt_mask = svm_get_mt_mask,
   4931	.get_exit_info = svm_get_exit_info,
   4932
   4933	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
   4934
   4935	.has_wbinvd_exit = svm_has_wbinvd_exit,
   4936
   4937	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
   4938	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
   4939	.write_tsc_offset = svm_write_tsc_offset,
   4940	.write_tsc_multiplier = svm_write_tsc_multiplier,
   4941
   4942	.load_mmu_pgd = svm_load_mmu_pgd,
   4943
   4944	.check_intercept = svm_check_intercept,
   4945	.handle_exit_irqoff = svm_handle_exit_irqoff,
   4946
   4947	.request_immediate_exit = __kvm_request_immediate_exit,
   4948
   4949	.sched_in = svm_sched_in,
   4950
   4951	.nested_ops = &svm_nested_ops,
   4952
   4953	.deliver_interrupt = svm_deliver_interrupt,
   4954	.pi_update_irte = avic_pi_update_irte,
   4955	.setup_mce = svm_setup_mce,
   4956
   4957	.smi_allowed = svm_smi_allowed,
   4958	.enter_smm = svm_enter_smm,
   4959	.leave_smm = svm_leave_smm,
   4960	.enable_smi_window = svm_enable_smi_window,
   4961
   4962	.mem_enc_ioctl = sev_mem_enc_ioctl,
   4963	.mem_enc_register_region = sev_mem_enc_register_region,
   4964	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
   4965
   4966	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
   4967	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
   4968
   4969	.can_emulate_instruction = svm_can_emulate_instruction,
   4970
   4971	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
   4972
   4973	.msr_filter_changed = svm_msr_filter_changed,
   4974	.complete_emulated_msr = svm_complete_emulated_msr,
   4975
   4976	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
   4977	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
   4978
   4979	.alloc_apic_backing_page = svm_alloc_apic_backing_page,
   4980
   4981	.rmp_page_level_adjust = sev_rmp_page_level_adjust,
   4982};
   4983
   4984/*
   4985 * The default MMIO mask is a single bit (excluding the present bit),
   4986 * which could conflict with the memory encryption bit. Check for
   4987 * memory encryption support and override the default MMIO mask if
   4988 * memory encryption is enabled.
   4989 */
   4990static __init void svm_adjust_mmio_mask(void)
   4991{
   4992	unsigned int enc_bit, mask_bit;
   4993	u64 msr, mask;
   4994
   4995	/* If there is no memory encryption support, use existing mask */
   4996	if (cpuid_eax(0x80000000) < 0x8000001f)
   4997		return;
   4998
   4999	/* If memory encryption is not enabled, use existing mask */
   5000	rdmsrl(MSR_AMD64_SYSCFG, msr);
   5001	if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
   5002		return;
   5003
   5004	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
   5005	mask_bit = boot_cpu_data.x86_phys_bits;
   5006
   5007	/* Increment the mask bit if it is the same as the encryption bit */
   5008	if (enc_bit == mask_bit)
   5009		mask_bit++;
   5010
   5011	/*
   5012	 * If the mask bit location is below 52, then some bits above the
   5013	 * physical addressing limit will always be reserved, so use the
   5014	 * rsvd_bits() function to generate the mask. This mask, along with
   5015	 * the present bit, will be used to generate a page fault with
   5016	 * PFER.RSV = 1.
   5017	 *
   5018	 * If the mask bit location is 52 (or above), then clear the mask.
   5019	 */
   5020	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
   5021
   5022	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
   5023}
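        /*
         * Worked example for svm_adjust_mmio_mask() above, using
         * hypothetical CPUID values: a part reporting the C-bit at
         * position 47 and 48 physical address bits gives enc_bit = 47
         * and mask_bit = 48.  The two differ, so mask_bit stays at 48
         * and the MMIO SPTE mask becomes rsvd_bits(48, 51) |
         * PT_PRESENT_MASK, i.e. reserved physical-address bits 51:48
         * plus the present bit, which is enough to force a page fault
         * with PFER.RSV = 1 while never colliding with the encryption
         * bit.
         */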
   5024
   5025static __init void svm_set_cpu_caps(void)
   5026{
   5027	kvm_set_cpu_caps();
   5028
   5029	supported_xss = 0;
   5030
   5031	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
   5032	if (nested) {
   5033		kvm_cpu_cap_set(X86_FEATURE_SVM);
   5034		kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
   5035
   5036		if (nrips)
   5037			kvm_cpu_cap_set(X86_FEATURE_NRIPS);
   5038
   5039		if (npt_enabled)
   5040			kvm_cpu_cap_set(X86_FEATURE_NPT);
   5041
   5042		if (tsc_scaling)
   5043			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
   5044
   5045		if (vls)
   5046			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
   5047		if (lbrv)
   5048			kvm_cpu_cap_set(X86_FEATURE_LBRV);
   5049
   5050		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
   5051			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
   5052
   5053		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
   5054			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
   5055
   5056		if (vgif)
   5057			kvm_cpu_cap_set(X86_FEATURE_VGIF);
   5058
   5059		/* Nested VM can receive #VMEXIT instead of triggering #GP */
   5060		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
   5061	}
   5062
   5063	/* CPUID 0x80000008 */
   5064	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
   5065	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
   5066		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
   5067
   5068	/* AMD PMU PERFCTR_CORE CPUID */
   5069	if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
   5070		kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
   5071
   5072	/* CPUID 0x8000001F (SME/SEV features) */
   5073	sev_set_cpu_caps();
   5074}
   5075
   5076static __init int svm_hardware_setup(void)
   5077{
   5078	int cpu;
   5079	struct page *iopm_pages;
   5080	void *iopm_va;
   5081	int r;
   5082	unsigned int order = get_order(IOPM_SIZE);
   5083
   5084	/*
   5085	 * NX is required for shadow paging and for NPT if the NX huge pages
   5086	 * mitigation is enabled.
   5087	 */
   5088	if (!boot_cpu_has(X86_FEATURE_NX)) {
   5089		pr_err_ratelimited("NX (Execute Disable) not supported\n");
   5090		return -EOPNOTSUPP;
   5091	}
   5092	kvm_enable_efer_bits(EFER_NX);
   5093
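        	/*
        	 * The I/O permission map holds one intercept bit per port for
        	 * the full 16-bit port range; filling it with 0xff below means
        	 * every guest IN/OUT instruction is intercepted (IOIO #VMEXIT).
        	 */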
   5094	iopm_pages = alloc_pages(GFP_KERNEL, order);
   5095
   5096	if (!iopm_pages)
   5097		return -ENOMEM;
   5098
   5099	iopm_va = page_address(iopm_pages);
   5100	memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
   5101	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
   5102
   5103	init_msrpm_offsets();
   5104
   5105	supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
   5106
   5107	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
   5108		kvm_enable_efer_bits(EFER_FFXSR);
   5109
   5110	if (tsc_scaling) {
   5111		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
   5112			tsc_scaling = false;
   5113		} else {
   5114			pr_info("TSC scaling supported\n");
   5115			kvm_has_tsc_control = true;
   5116		}
   5117	}
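        	/*
        	 * The TSC ratio is an 8.32 fixed-point multiplier: 0x1_0000_0000
        	 * is 1.0 (the default) and, roughly, the value for a given guest
        	 * frequency is (guest_khz << 32) / host_khz, so 1.5x scaling is
        	 * 0x1_8000_0000.  The frac_bits value of 32 tells common code
        	 * where the binary point sits.
        	 */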
   5118	kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
   5119	kvm_tsc_scaling_ratio_frac_bits = 32;
   5120
   5121	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
   5122
   5123	/* Check for pause filtering support */
   5124	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
   5125		pause_filter_count = 0;
   5126		pause_filter_thresh = 0;
   5127	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
   5128		pause_filter_thresh = 0;
   5129	}
   5130
   5131	if (nested) {
   5132		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
   5133		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
   5134	}
   5135
   5136	/*
   5137	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
   5138	 * NPT isn't supported if the host is using 2-level paging since host
   5139	 * CR4 is unchanged on VMRUN.
   5140	 */
   5141	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
   5142		npt_enabled = false;
   5143
   5144	if (!boot_cpu_has(X86_FEATURE_NPT))
   5145		npt_enabled = false;
   5146
   5147	/* Force VM NPT level equal to the host's paging level */
   5148	kvm_configure_mmu(npt_enabled, get_npt_level(),
   5149			  get_npt_level(), PG_LEVEL_1G);
   5150	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
   5151
   5152	/* Setup shadow_me_value and shadow_me_mask */
   5153	kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
   5154
   5155	svm_adjust_mmio_mask();
   5156
   5157	/*
   5158	 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
   5159	 * may be modified by svm_adjust_mmio_mask()).
   5160	 */
   5161	sev_hardware_setup();
   5162
   5163	svm_hv_hardware_setup();
   5164
   5165	for_each_possible_cpu(cpu) {
   5166		r = svm_cpu_init(cpu);
   5167		if (r)
   5168			goto err;
   5169	}
   5170
   5171	if (nrips) {
   5172		if (!boot_cpu_has(X86_FEATURE_NRIPS))
   5173			nrips = false;
   5174	}
   5175
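        	/*
        	 * AVIC needs NPT and is normally gated on the AVIC CPUID bit;
        	 * force_avic lets an administrator override the CPUID check (see
        	 * the warning below).  enable_apicv mirrors the final avic value
        	 * so common KVM code knows APIC virtualization is usable.
        	 */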
   5176	enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
   5177
   5178	if (enable_apicv) {
   5179		if (!boot_cpu_has(X86_FEATURE_AVIC)) {
    5180			pr_warn("AVIC is not supported in CPUID but force enabled\n");
    5181			pr_warn("Your system might crash and burn\n");
   5182		} else
   5183			pr_info("AVIC enabled\n");
   5184
   5185		amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
   5186	} else {
   5187		svm_x86_ops.vcpu_blocking = NULL;
   5188		svm_x86_ops.vcpu_unblocking = NULL;
   5189		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
   5190	}
   5191
   5192	if (vls) {
   5193		if (!npt_enabled ||
   5194		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
   5195		    !IS_ENABLED(CONFIG_X86_64)) {
   5196			vls = false;
   5197		} else {
   5198			pr_info("Virtual VMLOAD VMSAVE supported\n");
   5199		}
   5200	}
   5201
   5202	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
   5203		svm_gp_erratum_intercept = false;
   5204
   5205	if (vgif) {
   5206		if (!boot_cpu_has(X86_FEATURE_VGIF))
   5207			vgif = false;
   5208		else
   5209			pr_info("Virtual GIF supported\n");
   5210	}
   5211
   5212	if (lbrv) {
   5213		if (!boot_cpu_has(X86_FEATURE_LBRV))
   5214			lbrv = false;
   5215		else
   5216			pr_info("LBR virtualization supported\n");
   5217	}
   5218
   5219	if (!enable_pmu)
   5220		pr_info("PMU virtualization is disabled\n");
   5221
   5222	svm_set_cpu_caps();
   5223
    5224	/*
    5225	 * It seems that on AMD processors the PTE's accessed bit is
    5226	 * set by the CPU hardware before the NPF vmexit is taken.
    5227	 * This is not the expected behaviour and it causes our tests
    5228	 * to fail.
    5229	 * As a workaround, disable support for
    5230	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR when NPT is enabled.
    5231	 * In that case userspace can query the
    5232	 * KVM_CAP_SMALLER_MAXPHYADDR extension to find out whether the
    5233	 * feature is supported and decide how to handle its absence.
    5234	 * If future AMD CPU models change the behaviour described
    5235	 * above, this variable can be changed accordingly.
    5236	 */
   5237	allow_smaller_maxphyaddr = !npt_enabled;
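        	/*
        	 * A minimal userspace probe for the capability mentioned above
        	 * (illustrative sketch, not part of this file):
        	 *
        	 *	int kvm_fd = open("/dev/kvm", O_RDWR);
        	 *
        	 *	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION,
        	 *		  KVM_CAP_SMALLER_MAXPHYADDR) > 0)
        	 *		... guest MAXPHYADDR may be smaller than the host's ...
        	 */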
   5238
   5239	return 0;
   5240
   5241err:
   5242	svm_hardware_unsetup();
   5243	return r;
   5244}
   5245
   5246
   5247static struct kvm_x86_init_ops svm_init_ops __initdata = {
   5248	.cpu_has_kvm_support = has_svm,
   5249	.disabled_by_bios = is_disabled,
   5250	.hardware_setup = svm_hardware_setup,
   5251	.check_processor_compatibility = svm_check_processor_compat,
   5252
   5253	.runtime_ops = &svm_x86_ops,
   5254	.pmu_ops = &amd_pmu_ops,
   5255};
   5256
   5257static int __init svm_init(void)
   5258{
   5259	__unused_size_checks();
   5260
   5261	return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
   5262			__alignof__(struct vcpu_svm), THIS_MODULE);
   5263}
   5264
   5265static void __exit svm_exit(void)
   5266{
   5267	kvm_exit();
   5268}
   5269
   5270module_init(svm_init)
   5271module_exit(svm_exit)
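        /*
         * Example module load (illustrative; assumes the usual kvm-amd
         * parameters npt, avic and nested declared earlier in svm.c):
         *
         *	modprobe kvm-amd npt=1 avic=1 nested=1
         */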