avic.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
avic.c (28114B)
      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Kernel-based Virtual Machine driver for Linux
      4 *
      5 * AMD SVM support
      6 *
      7 * Copyright (C) 2006 Qumranet, Inc.
      8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
      9 *
     10 * Authors:
     11 *   Yaniv Kamay  <yaniv@qumranet.com>
     12 *   Avi Kivity   <avi@qumranet.com>
     13 */
     14
     15#define pr_fmt(fmt) "SVM: " fmt
     16
     17#include <linux/kvm_types.h>
     18#include <linux/hashtable.h>
     19#include <linux/amd-iommu.h>
     20#include <linux/kvm_host.h>
     21
     22#include <asm/irq_remapping.h>
     23
     24#include "trace.h"
     25#include "lapic.h"
     26#include "x86.h"
     27#include "irq.h"
     28#include "svm.h"
     29
     30/* AVIC GATAG is encoded using VM and VCPU IDs */
     31#define AVIC_VCPU_ID_BITS		8
     32#define AVIC_VCPU_ID_MASK		((1 << AVIC_VCPU_ID_BITS) - 1)
     33
     34#define AVIC_VM_ID_BITS			24
     35#define AVIC_VM_ID_NR			(1 << AVIC_VM_ID_BITS)
     36#define AVIC_VM_ID_MASK			((1 << AVIC_VM_ID_BITS) - 1)
     37
     38#define AVIC_GATAG(x, y)		(((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
     39						(y & AVIC_VCPU_ID_MASK))
     40#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
     41#define AVIC_GATAG_TO_VCPUID(x)		(x & AVIC_VCPU_ID_MASK)
     42
/* Note:
 * This hash table is used to map VM_ID to a struct kvm_svm,
 * when handling AMD IOMMU GALOG notification to schedule in
 * a particular vCPU.
 */
#define SVM_VM_DATA_HASH_BITS	8
static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
/* Last VM ID handed out by avic_vm_init(); valid IDs are 1-based. */
static u32 next_vm_id = 0;
/*
 * Set by avic_vm_init() once next_vm_id has wrapped around to zero; after
 * that, every candidate ID is checked against the hash table before use.
 */
static bool next_vm_id_wrapped = 0;
/* Taken around every access to the hash table and the two ID variables above. */
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
     53
/*
 * List element that wraps one IOMMU interrupt-remapping data pointer.
 * svm_ir_list_add() allocates one of these per posted interrupt and links it
 * into the target vCPU's ir_list; svm_ir_list_del() unlinks and frees it.
 */
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
};
     61
     62
     63/* Note:
     64 * This function is called from IOMMU driver to notify
     65 * SVM to schedule in a particular vCPU of a particular VM.
     66 */
     67int avic_ga_log_notifier(u32 ga_tag)
     68{
     69	unsigned long flags;
     70	struct kvm_svm *kvm_svm;
     71	struct kvm_vcpu *vcpu = NULL;
     72	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
     73	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
     74
     75	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
     76	trace_kvm_avic_ga_log(vm_id, vcpu_id);
     77
     78	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
     79	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
     80		if (kvm_svm->avic_vm_id != vm_id)
     81			continue;
     82		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
     83		break;
     84	}
     85	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
     86
     87	/* Note:
     88	 * At this point, the IOMMU should have already set the pending
     89	 * bit in the vAPIC backing page. So, we just need to schedule
     90	 * in the vcpu.
     91	 */
     92	if (vcpu)
     93		kvm_vcpu_wake_up(vcpu);
     94
     95	return 0;
     96}
     97
     98void avic_vm_destroy(struct kvm *kvm)
     99{
    100	unsigned long flags;
    101	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
    102
    103	if (!enable_apicv)
    104		return;
    105
    106	if (kvm_svm->avic_logical_id_table_page)
    107		__free_page(kvm_svm->avic_logical_id_table_page);
    108	if (kvm_svm->avic_physical_id_table_page)
    109		__free_page(kvm_svm->avic_physical_id_table_page);
    110
    111	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
    112	hash_del(&kvm_svm->hnode);
    113	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
    114}
    115
/*
 * Set up the per-VM AVIC state: allocate the zeroed physical and logical APIC
 * ID table pages, pick a VM ID that is not currently present in
 * svm_vm_data_hash, and add the VM to that hash table.
 *
 * Returns 0 on success (or right away when enable_apicv is false) and -ENOMEM
 * when a table page cannot be allocated.
 */
int avic_vm_init(struct kvm *kvm)
{
	unsigned long flags;
	int err = -ENOMEM;
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
	struct kvm_svm *k2;
	struct page *p_page;
	struct page *l_page;
	u32 vm_id;

	if (!enable_apicv)
		return 0;

	/* Allocating physical APIC ID table (4KB) */
	p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!p_page)
		goto free_avic;

	kvm_svm->avic_physical_id_table_page = p_page;

	/* Allocating logical APIC ID table (4KB) */
	l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!l_page)
		goto free_avic;

	kvm_svm->avic_logical_id_table_page = l_page;

	/* ID selection and hash insertion happen atomically under the lock. */
	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
 again:
	/* Advance the global counter, keeping it within AVIC_VM_ID_BITS bits. */
	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
	if (vm_id == 0) { /* id is 1-based, zero is not okay */
		next_vm_id_wrapped = 1;
		goto again;
	}
	/* Is it still in use? Only possible if wrapped at least once */
	if (next_vm_id_wrapped) {
		hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
			if (k2->avic_vm_id == vm_id)
				goto again;
		}
	}
	kvm_svm->avic_vm_id = vm_id;
	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);

	/* AVIC cannot be supported on SNP-enabled system. */
	if (cpu_feature_enabled(X86_FEATURE_SEV_SNP))
		kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SNP);

	return 0;

free_avic:
	/*
	 * avic_vm_destroy() frees whichever table page was already stored and
	 * also calls hash_del() on hnode, which has not been hashed on this
	 * path.  NOTE(review): this presumably relies on struct kvm_svm being
	 * zero-initialized so that hash_del() is a no-op -- confirm.
	 */
	avic_vm_destroy(kvm);
	return err;
}
    171
    172void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
    173{
    174	struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
    175	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
    176	phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
    177	phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
    178
    179	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
    180	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
    181	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
    182	vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
    183	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
    184
    185	if (kvm_apicv_activated(svm->vcpu.kvm))
    186		vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
    187	else
    188		vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
    189}
    190
    191static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
    192				       unsigned int index)
    193{
    194	u64 *avic_physical_id_table;
    195	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
    196
    197	if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
    198		return NULL;
    199
    200	avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
    201
    202	return &avic_physical_id_table[index];
    203}
    204
    205/*
    206 * Note:
    207 * AVIC hardware walks the nested page table to check permissions,
    208 * but does not use the SPA address specified in the leaf page
    209 * table entry since it uses  address in the AVIC_BACKING_PAGE pointer
    210 * field of the VMCB. Therefore, we set up the
    211 * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
    212 */
    213static int avic_alloc_access_page(struct kvm *kvm)
    214{
    215	void __user *ret;
    216	int r = 0;
    217
    218	mutex_lock(&kvm->slots_lock);
    219
    220	if (kvm->arch.apic_access_memslot_enabled)
    221		goto out;
    222
    223	ret = __x86_set_memory_region(kvm,
    224				      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
    225				      APIC_DEFAULT_PHYS_BASE,
    226				      PAGE_SIZE);
    227	if (IS_ERR(ret)) {
    228		r = PTR_ERR(ret);
    229		goto out;
    230	}
    231
    232	kvm->arch.apic_access_memslot_enabled = true;
    233out:
    234	mutex_unlock(&kvm->slots_lock);
    235	return r;
    236}
    237
    238static int avic_init_backing_page(struct kvm_vcpu *vcpu)
    239{
    240	u64 *entry, new_entry;
    241	int id = vcpu->vcpu_id;
    242	struct vcpu_svm *svm = to_svm(vcpu);
    243
    244	if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
    245		return -EINVAL;
    246
    247	if (!vcpu->arch.apic->regs)
    248		return -EINVAL;
    249
    250	if (kvm_apicv_activated(vcpu->kvm)) {
    251		int ret;
    252
    253		ret = avic_alloc_access_page(vcpu->kvm);
    254		if (ret)
    255			return ret;
    256	}
    257
    258	svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
    259
    260	/* Setting AVIC backing page address in the phy APIC ID table */
    261	entry = avic_get_physical_id_entry(vcpu, id);
    262	if (!entry)
    263		return -EINVAL;
    264
    265	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
    266			      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
    267			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
    268	WRITE_ONCE(*entry, new_entry);
    269
    270	svm->avic_physical_id_cache = entry;
    271
    272	return 0;
    273}
    274
    275void avic_ring_doorbell(struct kvm_vcpu *vcpu)
    276{
    277	/*
    278	 * Note, the vCPU could get migrated to a different pCPU at any point,
    279	 * which could result in signalling the wrong/previous pCPU.  But if
    280	 * that happens the vCPU is guaranteed to do a VMRUN (after being
    281	 * migrated) and thus will process pending interrupts, i.e. a doorbell
    282	 * is not needed (and the spurious one is harmless).
    283	 */
    284	int cpu = READ_ONCE(vcpu->cpu);
    285
    286	if (cpu != get_cpu())
    287		wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
    288	put_cpu();
    289}
    290
    291/*
    292 * A fast-path version of avic_kick_target_vcpus(), which attempts to match
    293 * destination APIC ID to vCPU without looping through all vCPUs.
    294 */
    295static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
    296				       u32 icrl, u32 icrh, u32 index)
    297{
    298	u32 l1_physical_id, dest;
    299	struct kvm_vcpu *target_vcpu;
    300	int dest_mode = icrl & APIC_DEST_MASK;
    301	int shorthand = icrl & APIC_SHORT_MASK;
    302	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
    303
    304	if (shorthand != APIC_DEST_NOSHORT)
    305		return -EINVAL;
    306
    307	if (apic_x2apic_mode(source))
    308		dest = icrh;
    309	else
    310		dest = GET_APIC_DEST_FIELD(icrh);
    311
    312	if (dest_mode == APIC_DEST_PHYSICAL) {
    313		/* broadcast destination, use slow path */
    314		if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
    315			return -EINVAL;
    316		if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
    317			return -EINVAL;
    318
    319		l1_physical_id = dest;
    320
    321		if (WARN_ON_ONCE(l1_physical_id != index))
    322			return -EINVAL;
    323
    324	} else {
    325		u32 bitmap, cluster;
    326		int logid_index;
    327
    328		if (apic_x2apic_mode(source)) {
    329			/* 16 bit dest mask, 16 bit cluster id */
    330			bitmap = dest & 0xFFFF0000;
    331			cluster = (dest >> 16) << 4;
    332		} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
    333			/* 8 bit dest mask*/
    334			bitmap = dest;
    335			cluster = 0;
    336		} else {
    337			/* 4 bit desk mask, 4 bit cluster id */
    338			bitmap = dest & 0xF;
    339			cluster = (dest >> 4) << 2;
    340		}
    341
    342		if (unlikely(!bitmap))
    343			/* guest bug: nobody to send the logical interrupt to */
    344			return 0;
    345
    346		if (!is_power_of_2(bitmap))
    347			/* multiple logical destinations, use slow path */
    348			return -EINVAL;
    349
    350		logid_index = cluster + __ffs(bitmap);
    351
    352		if (apic_x2apic_mode(source)) {
    353			l1_physical_id = logid_index;
    354		} else {
    355			u32 *avic_logical_id_table =
    356				page_address(kvm_svm->avic_logical_id_table_page);
    357
    358			u32 logid_entry = avic_logical_id_table[logid_index];
    359
    360			if (WARN_ON_ONCE(index != logid_index))
    361				return -EINVAL;
    362
    363			/* guest bug: non existing/reserved logical destination */
    364			if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
    365				return 0;
    366
    367			l1_physical_id = logid_entry &
    368					 AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
    369		}
    370	}
    371
    372	target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
    373	if (unlikely(!target_vcpu))
    374		/* guest bug: non existing vCPU is a target of this IPI*/
    375		return 0;
    376
    377	target_vcpu->arch.apic->irr_pending = true;
    378	svm_complete_interrupt_delivery(target_vcpu,
    379					icrl & APIC_MODE_MASK,
    380					icrl & APIC_INT_LEVELTRIG,
    381					icrl & APIC_VECTOR_MASK);
    382	return 0;
    383}
    384
    385static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
    386				   u32 icrl, u32 icrh, u32 index)
    387{
    388	unsigned long i;
    389	struct kvm_vcpu *vcpu;
    390
    391	if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
    392		return;
    393
    394	trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
    395
    396	/*
    397	 * Wake any target vCPUs that are blocking, i.e. waiting for a wake
    398	 * event.  There's no need to signal doorbells, as hardware has handled
    399	 * vCPUs that were in guest at the time of the IPI, and vCPUs that have
    400	 * since entered the guest will have processed pending IRQs at VMRUN.
    401	 */
    402	kvm_for_each_vcpu(i, vcpu, kvm) {
    403		if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
    404					GET_APIC_DEST_FIELD(icrh),
    405					icrl & APIC_DEST_MASK)) {
    406			vcpu->arch.apic->irr_pending = true;
    407			svm_complete_interrupt_delivery(vcpu,
    408							icrl & APIC_MODE_MASK,
    409							icrl & APIC_INT_LEVELTRIG,
    410							icrl & APIC_VECTOR_MASK);
    411		}
    412	}
    413}
    414
    415int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
    416{
    417	struct vcpu_svm *svm = to_svm(vcpu);
    418	u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
    419	u32 icrl = svm->vmcb->control.exit_info_1;
    420	u32 id = svm->vmcb->control.exit_info_2 >> 32;
    421	u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
    422	struct kvm_lapic *apic = vcpu->arch.apic;
    423
    424	trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
    425
    426	switch (id) {
    427	case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
    428		/*
    429		 * Emulate IPIs that are not handled by AVIC hardware, which
    430		 * only virtualizes Fixed, Edge-Triggered INTRs.  The exit is
    431		 * a trap, e.g. ICR holds the correct value and RIP has been
    432		 * advanced, KVM is responsible only for emulating the IPI.
    433		 * Sadly, hardware may sometimes leave the BUSY flag set, in
    434		 * which case KVM needs to emulate the ICR write as well in
    435		 * order to clear the BUSY flag.
    436		 */
    437		if (icrl & APIC_ICR_BUSY)
    438			kvm_apic_write_nodecode(vcpu, APIC_ICR);
    439		else
    440			kvm_apic_send_ipi(apic, icrl, icrh);
    441		break;
    442	case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
    443		/*
    444		 * At this point, we expect that the AVIC HW has already
    445		 * set the appropriate IRR bits on the valid target
    446		 * vcpus. So, we just need to kick the appropriate vcpu.
    447		 */
    448		avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
    449		break;
    450	case AVIC_IPI_FAILURE_INVALID_TARGET:
    451		break;
    452	case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
    453		WARN_ONCE(1, "Invalid backing page\n");
    454		break;
    455	default:
    456		pr_err("Unknown IPI interception\n");
    457	}
    458
    459	return 1;
    460}
    461
    462unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
    463{
    464	if (is_guest_mode(vcpu))
    465		return APICV_INHIBIT_REASON_NESTED;
    466	return 0;
    467}
    468
    469static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
    470{
    471	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
    472	int index;
    473	u32 *logical_apic_id_table;
    474	int dlid = GET_APIC_LOGICAL_ID(ldr);
    475
    476	if (!dlid)
    477		return NULL;
    478
    479	if (flat) { /* flat */
    480		index = ffs(dlid) - 1;
    481		if (index > 7)
    482			return NULL;
    483	} else { /* cluster */
    484		int cluster = (dlid & 0xf0) >> 4;
    485		int apic = ffs(dlid & 0x0f) - 1;
    486
    487		if ((apic < 0) || (apic > 7) ||
    488		    (cluster >= 0xf))
    489			return NULL;
    490		index = (cluster << 2) + apic;
    491	}
    492
    493	logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
    494
    495	return &logical_apic_id_table[index];
    496}
    497
    498static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
    499{
    500	bool flat;
    501	u32 *entry, new_entry;
    502
    503	flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
    504	entry = avic_get_logical_id_entry(vcpu, ldr, flat);
    505	if (!entry)
    506		return -EINVAL;
    507
    508	new_entry = READ_ONCE(*entry);
    509	new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
    510	new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
    511	new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
    512	WRITE_ONCE(*entry, new_entry);
    513
    514	return 0;
    515}
    516
    517static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
    518{
    519	struct vcpu_svm *svm = to_svm(vcpu);
    520	bool flat = svm->dfr_reg == APIC_DFR_FLAT;
    521	u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
    522
    523	if (entry)
    524		clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
    525}
    526
    527static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
    528{
    529	int ret = 0;
    530	struct vcpu_svm *svm = to_svm(vcpu);
    531	u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
    532	u32 id = kvm_xapic_id(vcpu->arch.apic);
    533
    534	if (ldr == svm->ldr_reg)
    535		return 0;
    536
    537	avic_invalidate_logical_id_entry(vcpu);
    538
    539	if (ldr)
    540		ret = avic_ldr_write(vcpu, id, ldr);
    541
    542	if (!ret)
    543		svm->ldr_reg = ldr;
    544
    545	return ret;
    546}
    547
    548static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
    549{
    550	struct vcpu_svm *svm = to_svm(vcpu);
    551	u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
    552
    553	if (svm->dfr_reg == dfr)
    554		return;
    555
    556	avic_invalidate_logical_id_entry(vcpu);
    557	svm->dfr_reg = dfr;
    558}
    559
    560static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
    561{
    562	u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
    563				AVIC_UNACCEL_ACCESS_OFFSET_MASK;
    564
    565	switch (offset) {
    566	case APIC_LDR:
    567		if (avic_handle_ldr_update(vcpu))
    568			return 0;
    569		break;
    570	case APIC_DFR:
    571		avic_handle_dfr_update(vcpu);
    572		break;
    573	default:
    574		break;
    575	}
    576
    577	kvm_apic_write_nodecode(vcpu, offset);
    578	return 1;
    579}
    580
    581static bool is_avic_unaccelerated_access_trap(u32 offset)
    582{
    583	bool ret = false;
    584
    585	switch (offset) {
    586	case APIC_ID:
    587	case APIC_EOI:
    588	case APIC_RRR:
    589	case APIC_LDR:
    590	case APIC_DFR:
    591	case APIC_SPIV:
    592	case APIC_ESR:
    593	case APIC_ICR:
    594	case APIC_LVTT:
    595	case APIC_LVTTHMR:
    596	case APIC_LVTPC:
    597	case APIC_LVT0:
    598	case APIC_LVT1:
    599	case APIC_LVTERR:
    600	case APIC_TMICT:
    601	case APIC_TDCR:
    602		ret = true;
    603		break;
    604	default:
    605		break;
    606	}
    607	return ret;
    608}
    609
    610int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
    611{
    612	struct vcpu_svm *svm = to_svm(vcpu);
    613	int ret = 0;
    614	u32 offset = svm->vmcb->control.exit_info_1 &
    615		     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
    616	u32 vector = svm->vmcb->control.exit_info_2 &
    617		     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
    618	bool write = (svm->vmcb->control.exit_info_1 >> 32) &
    619		     AVIC_UNACCEL_ACCESS_WRITE_MASK;
    620	bool trap = is_avic_unaccelerated_access_trap(offset);
    621
    622	trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
    623					    trap, write, vector);
    624	if (trap) {
    625		/* Handling Trap */
    626		WARN_ONCE(!write, "svm: Handling trap read.\n");
    627		ret = avic_unaccel_trap_write(vcpu);
    628	} else {
    629		/* Handling Fault */
    630		ret = kvm_emulate_instruction(vcpu, 0);
    631	}
    632
    633	return ret;
    634}
    635
    636int avic_init_vcpu(struct vcpu_svm *svm)
    637{
    638	int ret;
    639	struct kvm_vcpu *vcpu = &svm->vcpu;
    640
    641	if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
    642		return 0;
    643
    644	ret = avic_init_backing_page(vcpu);
    645	if (ret)
    646		return ret;
    647
    648	INIT_LIST_HEAD(&svm->ir_list);
    649	spin_lock_init(&svm->ir_list_lock);
    650	svm->dfr_reg = APIC_DFR_FLAT;
    651
    652	return ret;
    653}
    654
    655void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
    656{
    657	avic_handle_dfr_update(vcpu);
    658	avic_handle_ldr_update(vcpu);
    659}
    660
    661static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
    662{
    663	int ret = 0;
    664	unsigned long flags;
    665	struct amd_svm_iommu_ir *ir;
    666	struct vcpu_svm *svm = to_svm(vcpu);
    667
    668	if (!kvm_arch_has_assigned_device(vcpu->kvm))
    669		return 0;
    670
    671	/*
    672	 * Here, we go through the per-vcpu ir_list to update all existing
    673	 * interrupt remapping table entry targeting this vcpu.
    674	 */
    675	spin_lock_irqsave(&svm->ir_list_lock, flags);
    676
    677	if (list_empty(&svm->ir_list))
    678		goto out;
    679
    680	list_for_each_entry(ir, &svm->ir_list, node) {
    681		if (activate)
    682			ret = amd_iommu_activate_guest_mode(ir->data);
    683		else
    684			ret = amd_iommu_deactivate_guest_mode(ir->data);
    685		if (ret)
    686			break;
    687	}
    688out:
    689	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
    690	return ret;
    691}
    692
    693static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
    694{
    695	unsigned long flags;
    696	struct amd_svm_iommu_ir *cur;
    697
    698	spin_lock_irqsave(&svm->ir_list_lock, flags);
    699	list_for_each_entry(cur, &svm->ir_list, node) {
    700		if (cur->data != pi->ir_data)
    701			continue;
    702		list_del(&cur->node);
    703		kfree(cur);
    704		break;
    705	}
    706	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
    707}
    708
    709static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
    710{
    711	int ret = 0;
    712	unsigned long flags;
    713	struct amd_svm_iommu_ir *ir;
    714
    715	/**
    716	 * In some cases, the existing irte is updated and re-set,
    717	 * so we need to check here if it's already been * added
    718	 * to the ir_list.
    719	 */
    720	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
    721		struct kvm *kvm = svm->vcpu.kvm;
    722		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
    723		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
    724		struct vcpu_svm *prev_svm;
    725
    726		if (!prev_vcpu) {
    727			ret = -EINVAL;
    728			goto out;
    729		}
    730
    731		prev_svm = to_svm(prev_vcpu);
    732		svm_ir_list_del(prev_svm, pi);
    733	}
    734
    735	/**
    736	 * Allocating new amd_iommu_pi_data, which will get
    737	 * add to the per-vcpu ir_list.
    738	 */
    739	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
    740	if (!ir) {
    741		ret = -ENOMEM;
    742		goto out;
    743	}
    744	ir->data = pi->ir_data;
    745
    746	spin_lock_irqsave(&svm->ir_list_lock, flags);
    747	list_add(&ir->node, &svm->ir_list);
    748	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
    749out:
    750	return ret;
    751}
    752
    753/*
    754 * Note:
    755 * The HW cannot support posting multicast/broadcast
    756 * interrupts to a vCPU. So, we still use legacy interrupt
    757 * remapping for these kind of interrupts.
    758 *
    759 * For lowest-priority interrupts, we only support
    760 * those with single CPU as the destination, e.g. user
    761 * configures the interrupts via /proc/irq or uses
    762 * irqbalance to make the interrupts single-CPU.
    763 */
    764static int
    765get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
    766		 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
    767{
    768	struct kvm_lapic_irq irq;
    769	struct kvm_vcpu *vcpu = NULL;
    770
    771	kvm_set_msi_irq(kvm, e, &irq);
    772
    773	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
    774	    !kvm_irq_is_postable(&irq)) {
    775		pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
    776			 __func__, irq.vector);
    777		return -1;
    778	}
    779
    780	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
    781		 irq.vector);
    782	*svm = to_svm(vcpu);
    783	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
    784	vcpu_info->vector = irq.vector;
    785
    786	return 0;
    787}
    788
    789/*
    790 * avic_pi_update_irte - set IRTE for Posted-Interrupts
    791 *
    792 * @kvm: kvm
    793 * @host_irq: host irq of the interrupt
    794 * @guest_irq: gsi of the interrupt
    795 * @set: set or unset PI
    796 * returns 0 on success, < 0 on failure
    797 */
    798int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
    799			uint32_t guest_irq, bool set)
    800{
    801	struct kvm_kernel_irq_routing_entry *e;
    802	struct kvm_irq_routing_table *irq_rt;
    803	int idx, ret = 0;
    804
    805	if (!kvm_arch_has_assigned_device(kvm) ||
    806	    !irq_remapping_cap(IRQ_POSTING_CAP))
    807		return 0;
    808
    809	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
    810		 __func__, host_irq, guest_irq, set);
    811
    812	idx = srcu_read_lock(&kvm->irq_srcu);
    813	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
    814
    815	if (guest_irq >= irq_rt->nr_rt_entries ||
    816		hlist_empty(&irq_rt->map[guest_irq])) {
    817		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
    818			     guest_irq, irq_rt->nr_rt_entries);
    819		goto out;
    820	}
    821
    822	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
    823		struct vcpu_data vcpu_info;
    824		struct vcpu_svm *svm = NULL;
    825
    826		if (e->type != KVM_IRQ_ROUTING_MSI)
    827			continue;
    828
    829		/**
    830		 * Here, we setup with legacy mode in the following cases:
    831		 * 1. When cannot target interrupt to a specific vcpu.
    832		 * 2. Unsetting posted interrupt.
    833		 * 3. APIC virtualization is disabled for the vcpu.
    834		 * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
    835		 */
    836		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
    837		    kvm_vcpu_apicv_active(&svm->vcpu)) {
    838			struct amd_iommu_pi_data pi;
    839
    840			/* Try to enable guest_mode in IRTE */
    841			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
    842					    AVIC_HPA_MASK);
    843			pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
    844						     svm->vcpu.vcpu_id);
    845			pi.is_guest_mode = true;
    846			pi.vcpu_data = &vcpu_info;
    847			ret = irq_set_vcpu_affinity(host_irq, &pi);
    848
    849			/**
    850			 * Here, we successfully setting up vcpu affinity in
    851			 * IOMMU guest mode. Now, we need to store the posted
    852			 * interrupt information in a per-vcpu ir_list so that
    853			 * we can reference to them directly when we update vcpu
    854			 * scheduling information in IOMMU irte.
    855			 */
    856			if (!ret && pi.is_guest_mode)
    857				svm_ir_list_add(svm, &pi);
    858		} else {
    859			/* Use legacy mode in IRTE */
    860			struct amd_iommu_pi_data pi;
    861
    862			/**
    863			 * Here, pi is used to:
    864			 * - Tell IOMMU to use legacy mode for this interrupt.
    865			 * - Retrieve ga_tag of prior interrupt remapping data.
    866			 */
    867			pi.prev_ga_tag = 0;
    868			pi.is_guest_mode = false;
    869			ret = irq_set_vcpu_affinity(host_irq, &pi);
    870
    871			/**
    872			 * Check if the posted interrupt was previously
    873			 * setup with the guest_mode by checking if the ga_tag
    874			 * was cached. If so, we need to clean up the per-vcpu
    875			 * ir_list.
    876			 */
    877			if (!ret && pi.prev_ga_tag) {
    878				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
    879				struct kvm_vcpu *vcpu;
    880
    881				vcpu = kvm_get_vcpu_by_id(kvm, id);
    882				if (vcpu)
    883					svm_ir_list_del(to_svm(vcpu), &pi);
    884			}
    885		}
    886
    887		if (!ret && svm) {
    888			trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
    889						 e->gsi, vcpu_info.vector,
    890						 vcpu_info.pi_desc_addr, set);
    891		}
    892
    893		if (ret < 0) {
    894			pr_err("%s: failed to update PI IRTE\n", __func__);
    895			goto out;
    896		}
    897	}
    898
    899	ret = 0;
    900out:
    901	srcu_read_unlock(&kvm->irq_srcu, idx);
    902	return ret;
    903}
    904
    905bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
    906{
    907	ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
    908			  BIT(APICV_INHIBIT_REASON_ABSENT) |
    909			  BIT(APICV_INHIBIT_REASON_HYPERV) |
    910			  BIT(APICV_INHIBIT_REASON_NESTED) |
    911			  BIT(APICV_INHIBIT_REASON_IRQWIN) |
    912			  BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
    913			  BIT(APICV_INHIBIT_REASON_X2APIC) |
    914			  BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
    915			  BIT(APICV_INHIBIT_REASON_SEV)      |
    916			  BIT(APICV_INHIBIT_REASON_SNP)      |
    917			  BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
    918			  BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
    919
    920	return supported & BIT(reason);
    921}
    922
    923
    924static inline int
    925avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
    926{
    927	int ret = 0;
    928	unsigned long flags;
    929	struct amd_svm_iommu_ir *ir;
    930	struct vcpu_svm *svm = to_svm(vcpu);
    931
    932	if (!kvm_arch_has_assigned_device(vcpu->kvm))
    933		return 0;
    934
    935	/*
    936	 * Here, we go through the per-vcpu ir_list to update all existing
    937	 * interrupt remapping table entry targeting this vcpu.
    938	 */
    939	spin_lock_irqsave(&svm->ir_list_lock, flags);
    940
    941	if (list_empty(&svm->ir_list))
    942		goto out;
    943
    944	list_for_each_entry(ir, &svm->ir_list, node) {
    945		ret = amd_iommu_update_ga(cpu, r, ir->data);
    946		if (ret)
    947			break;
    948	}
    949out:
    950	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
    951	return ret;
    952}
    953
    954void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
    955{
    956	u64 entry;
    957	int h_physical_id = kvm_cpu_get_apicid(cpu);
    958	struct vcpu_svm *svm = to_svm(vcpu);
    959
    960	lockdep_assert_preemption_disabled();
    961
    962	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
    963		return;
    964
    965	/*
    966	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
    967	 * is being scheduled in after being preempted.  The CPU entries in the
    968	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
    969	 * If the vCPU was migrated, its new CPU value will be stuffed when the
    970	 * vCPU unblocks.
    971	 */
    972	if (kvm_vcpu_is_blocking(vcpu))
    973		return;
    974
    975	entry = READ_ONCE(*(svm->avic_physical_id_cache));
    976	WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
    977
    978	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
    979	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
    980	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
    981
    982	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
    983	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
    984}
    985
    986void avic_vcpu_put(struct kvm_vcpu *vcpu)
    987{
    988	u64 entry;
    989	struct vcpu_svm *svm = to_svm(vcpu);
    990
    991	lockdep_assert_preemption_disabled();
    992
    993	entry = READ_ONCE(*(svm->avic_physical_id_cache));
    994
    995	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
    996	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
    997		return;
    998
    999	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
   1000
   1001	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
   1002	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
   1003}
   1004
   1005
/*
 * Apply the vCPU's current APICv activation state (kvm_vcpu_apicv_active())
 * to AVIC: update AVIC_ENABLE_MASK in vmcb01's int_ctl and mark the AVIC
 * VMCB state dirty, load or put the AVIC for the vCPU, and switch the
 * vCPU's interrupt remapping entries to the matching mode.  Does nothing
 * when enable_apicv is false.
 */
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;
	bool activated = kvm_vcpu_apicv_active(vcpu);

	if (!enable_apicv)
		return;

	if (activated) {
		/**
		 * During AVIC temporary deactivation, guest could update
		 * APIC ID, DFR and LDR registers, which would not be trapped
		 * by avic_unaccelerated_access_interception(). In this case,
		 * we need to check and update the AVIC logical APIC ID table
		 * accordingly before re-activating.
		 */
		avic_apicv_post_state_restore(vcpu);
		vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
	} else {
		vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
	}
	/* int_ctl was modified on either path above. */
	vmcb_mark_dirty(vmcb, VMCB_AVIC);

	/* Update the physical APIC ID table entry to match the new state. */
	if (activated)
		avic_vcpu_load(vcpu, vcpu->cpu);
	else
		avic_vcpu_put(vcpu);

	/* Finally switch the IOMMU entries; the result is not checked. */
	avic_set_pi_irte_mode(vcpu, activated);
}
   1037
   1038void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
   1039{
   1040	if (!kvm_vcpu_apicv_active(vcpu))
   1041		return;
   1042
   1043       /*
   1044        * Unload the AVIC when the vCPU is about to block, _before_
   1045        * the vCPU actually blocks.
   1046        *
   1047        * Any IRQs that arrive before IsRunning=0 will not cause an
   1048        * incomplete IPI vmexit on the source, therefore vIRR will also
   1049        * be checked by kvm_vcpu_check_block() before blocking.  The
   1050        * memory barrier implicit in set_current_state orders writing
   1051        * IsRunning=0 before reading the vIRR.  The processor needs a
   1052        * matching memory barrier on interrupt delivery between writing
   1053        * IRR and reading IsRunning; the lack of this barrier might be
   1054        * the cause of errata #1235).
   1055        */
   1056	avic_vcpu_put(vcpu);
   1057}
   1058
   1059void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
   1060{
   1061	if (!kvm_vcpu_apicv_active(vcpu))
   1062		return;
   1063
   1064	avic_vcpu_load(vcpu, vcpu->cpu);
   1065}