cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

book3s_xive.c (77927B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
      4 */
      5
      6#define pr_fmt(fmt) "xive-kvm: " fmt
      7
      8#include <linux/kernel.h>
      9#include <linux/kvm_host.h>
     10#include <linux/err.h>
     11#include <linux/gfp.h>
     12#include <linux/spinlock.h>
     13#include <linux/delay.h>
     14#include <linux/percpu.h>
     15#include <linux/cpumask.h>
     16#include <linux/uaccess.h>
     17#include <linux/irqdomain.h>
     18#include <asm/kvm_book3s.h>
     19#include <asm/kvm_ppc.h>
     20#include <asm/hvcall.h>
     21#include <asm/xics.h>
     22#include <asm/xive.h>
     23#include <asm/xive-regs.h>
     24#include <asm/debug.h>
     25#include <asm/time.h>
     26#include <asm/opal.h>
     27
     28#include <linux/debugfs.h>
     29#include <linux/seq_file.h>
     30
     31#include "book3s_xive.h"
     32
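       /* Accessors for the ESB EOI and trigger MMIO pages of an interrupt source */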
     33#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
     34#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
     35
     36/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */
     37#define XICS_DUMMY	1
     38
     39static void xive_vm_ack_pending(struct kvmppc_xive_vcpu *xc)
     40{
     41	u8 cppr;
     42	u16 ack;
     43
     44	/*
     45	 * Ensure any previous store to CPPR is ordered vs.
     46	 * the subsequent loads from PIPR or ACK.
     47	 */
     48	eieio();
     49
     50	/* Perform the acknowledge OS to register cycle. */
     51	ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_OS_REG));
     52
     53	/* Synchronize subsequent queue accesses */
     54	mb();
     55
     56	/* XXX Check grouping level */
     57
     58	/* Anything ? */
     59	if (!((ack >> 8) & TM_QW1_NSR_EO))
     60		return;
     61
     62	/* Grab CPPR of the most favored pending interrupt */
     63	cppr = ack & 0xff;
     64	if (cppr < 8)
     65		xc->pending |= 1 << cppr;
     66
     67	/* Check consistency */
     68	if (cppr >= xc->hw_cppr)
     69		pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
     70			smp_processor_id(), cppr, xc->hw_cppr);
     71
     72	/*
     73	 * Update our image of the HW CPPR. We don't yet modify
     74	 * xc->cppr, this will be done as we scan for interrupts
     75	 * in the queues.
     76	 */
     77	xc->hw_cppr = cppr;
     78}
     79
     80static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
     81{
     82	u64 val;
     83
     84	if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
     85		offset |= XIVE_ESB_LD_ST_MO;
     86
     87	val = __raw_readq(__x_eoi_page(xd) + offset);
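       	/* The ESB data comes back in the last byte of the 8-byte MMIO word;
       	 * __raw_readq() does no byte swapping, so shift it down on LE. */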
     88#ifdef __LITTLE_ENDIAN__
     89	val >>= 64-8;
     90#endif
     91	return (u8)val;
     92}
     93
     94
     95static void xive_vm_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
     96{
     97	/* If the XIVE supports the new "store EOI" facility, use it */
     98	if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
     99		__raw_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI);
    100	else if (xd->flags & XIVE_IRQ_FLAG_LSI) {
    101		/*
    102		 * For LSIs the HW EOI cycle is used rather than PQ bits,
    103		 * as they are automatically re-triggered in HW when still
    104		 * pending.
    105		 */
    106		__raw_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI);
    107	} else {
    108		uint64_t eoi_val;
    109
    110		/*
    111		 * Otherwise for EOI, we use the special MMIO that does
    112		 * a clear of both P and Q and returns the old Q,
    113		 * except for LSIs where we use the "EOI cycle" special
    114		 * load.
    115		 *
    116		 * This allows us to then do a re-trigger if Q was set
    117		 * rather than synthesizing an interrupt in software
    118		 */
    119		eoi_val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_00);
    120
    121		/* Re-trigger if needed */
    122		if ((eoi_val & 1) && __x_trig_page(xd))
    123			__raw_writeq(0, __x_trig_page(xd));
    124	}
    125}
    126
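       /*
        * Scan modes for xive_vm_scan_interrupts(): scan_fetch consumes queue
        * entries and updates the CPPR, scan_poll only peeks without changing
        * any state, and scan_eoi refreshes the pending bits after an EOI
        * without touching the CPPR.
        */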
    127enum {
    128	scan_fetch,
    129	scan_poll,
    130	scan_eoi,
    131};
    132
    133static u32 xive_vm_scan_interrupts(struct kvmppc_xive_vcpu *xc,
    134				       u8 pending, int scan_type)
    135{
    136	u32 hirq = 0;
    137	u8 prio = 0xff;
    138
    139	/* Find highest pending priority */
    140	while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) {
    141		struct xive_q *q;
    142		u32 idx, toggle;
    143		__be32 *qpage;
    144
    145		/*
    146		 * If pending is 0 this will return 0xff which is what
    147		 * we want
    148		 */
    149		prio = ffs(pending) - 1;
    150
    151		/* Don't scan past the guest cppr */
    152		if (prio >= xc->cppr || prio > 7) {
    153			if (xc->mfrr < xc->cppr) {
    154				prio = xc->mfrr;
    155				hirq = XICS_IPI;
    156			}
    157			break;
    158		}
    159
    160		/* Grab queue and pointers */
    161		q = &xc->queues[prio];
    162		idx = q->idx;
    163		toggle = q->toggle;
    164
    165		/*
    166		 * Snapshot the queue page. The test further down for EOI
    167		 * must use the same "copy" that was used by __xive_read_eq
    168		 * since qpage can be set concurrently and we don't want
    169		 * to miss an EOI.
    170		 */
    171		qpage = READ_ONCE(q->qpage);
    172
    173skip_ipi:
    174		/*
    175		 * Try to fetch from the queue. Will return 0 for a
    176		 * non-queueing priority (ie, qpage = 0).
    177		 */
    178		hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
    179
    180		/*
    181		 * If this was a signal for an MFRR change done by
    182		 * H_IPI we skip it. Additionally, if we were fetching
    183		 * we EOI it now, thus re-enabling reception of a new
    184		 * such signal.
    185		 *
    186		 * We also need to do that if prio is 0 and we had no
    187		 * page for the queue. In this case, we have a non-queued
    188		 * IPI that needs to be EOId.
    189		 *
    190		 * This is safe because if we have another pending MFRR
    191		 * change that wasn't observed above, the Q bit will have
    192		 * been set and another occurrence of the IPI will trigger.
    193		 */
    194		if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
    195			if (scan_type == scan_fetch) {
    196				xive_vm_source_eoi(xc->vp_ipi,
    197						       &xc->vp_ipi_data);
    198				q->idx = idx;
    199				q->toggle = toggle;
    200			}
    201			/* Loop back on same queue with updated idx/toggle */
    202			WARN_ON(hirq && hirq != XICS_IPI);
    203			if (hirq)
    204				goto skip_ipi;
    205		}
    206
    207		/* If it's the dummy interrupt, continue searching */
    208		if (hirq == XICS_DUMMY)
    209			goto skip_ipi;
    210
    211		/* Clear the pending bit if the queue is now empty */
    212		if (!hirq) {
    213			pending &= ~(1 << prio);
    214
    215			/*
    216			 * Check if the queue count needs adjusting due to
    217			 * interrupts being moved away.
    218			 */
    219			if (atomic_read(&q->pending_count)) {
    220				int p = atomic_xchg(&q->pending_count, 0);
    221
    222				if (p) {
    223					WARN_ON(p > atomic_read(&q->count));
    224					atomic_sub(p, &q->count);
    225				}
    226			}
    227		}
    228
    229		/*
    230		 * If the most favored prio we found pending is less
    231		 * (or equally) favored than a pending IPI, we return
    232		 * the IPI instead.
    233		 */
    234		if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
    235			prio = xc->mfrr;
    236			hirq = XICS_IPI;
    237			break;
    238		}
    239
    240		/* If fetching, update queue pointers */
    241		if (scan_type == scan_fetch) {
    242			q->idx = idx;
    243			q->toggle = toggle;
    244		}
    245	}
    246
    247	/* If we are just taking a "peek", do nothing else */
    248	if (scan_type == scan_poll)
    249		return hirq;
    250
    251	/* Update the pending bits */
    252	xc->pending = pending;
    253
    254	/*
    255	 * If this is an EOI, that's it: no CPPR adjustment is done here,
    256	 * all we needed was to clean up the stale pending bits and check
    257	 * if there's anything left.
    258	 */
    259	if (scan_type == scan_eoi)
    260		return hirq;
    261
    262	/*
    263	 * If we found an interrupt, adjust what the guest CPPR should
    264	 * be as if we had just fetched that interrupt from HW.
    265	 *
    266	 * Note: This can only make xc->cppr smaller as the previous
    267	 * loop will only exit with hirq != 0 if prio is lower than
    268	 * the current xc->cppr. Thus we don't need to re-check xc->mfrr
    269	 * for pending IPIs.
    270	 */
    271	if (hirq)
    272		xc->cppr = prio;
    273	/*
    274	 * If it was an IPI the HW CPPR might have been lowered too much
    275	 * as the HW interrupt we use for IPIs is routed to priority 0.
    276	 *
    277	 * We re-sync it here.
    278	 */
    279	if (xc->cppr != xc->hw_cppr) {
    280		xc->hw_cppr = xc->cppr;
    281		__raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
    282	}
    283
    284	return hirq;
    285}
    286
    287static unsigned long xive_vm_h_xirr(struct kvm_vcpu *vcpu)
    288{
    289	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    290	u8 old_cppr;
    291	u32 hirq;
    292
    293	pr_devel("H_XIRR\n");
    294
    295	xc->stat_vm_h_xirr++;
    296
    297	/* First collect pending bits from HW */
    298	xive_vm_ack_pending(xc);
    299
    300	pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
    301		 xc->pending, xc->hw_cppr, xc->cppr);
    302
    303	/* Grab previous CPPR and reverse map it */
    304	old_cppr = xive_prio_to_guest(xc->cppr);
    305
    306	/* Scan for actual interrupts */
    307	hirq = xive_vm_scan_interrupts(xc, xc->pending, scan_fetch);
    308
    309	pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n",
    310		 hirq, xc->hw_cppr, xc->cppr);
    311
    312	/* That should never hit */
    313	if (hirq & 0xff000000)
    314		pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq);
    315
    316	/*
    317	 * XXX We could check if the interrupt is masked here and
    318	 * filter it. If we chose to do so, we would need to do:
    319	 *
    320	 *    if (masked) {
    321	 *        lock();
    322	 *        if (masked) {
    323	 *            old_Q = true;
    324	 *            hirq = 0;
    325	 *        }
    326	 *        unlock();
    327	 *    }
    328	 */
    329
    330	/* Return interrupt and old CPPR in GPR4 */
    331	vcpu->arch.regs.gpr[4] = hirq | (old_cppr << 24);
    332
    333	return H_SUCCESS;
    334}
    335
    336static unsigned long xive_vm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
    337{
    338	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    339	u8 pending = xc->pending;
    340	u32 hirq;
    341
    342	pr_devel("H_IPOLL(server=%ld)\n", server);
    343
    344	xc->stat_vm_h_ipoll++;
    345
    346	/* Grab the target VCPU if not the current one */
    347	if (xc->server_num != server) {
    348		vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
    349		if (!vcpu)
    350			return H_PARAMETER;
    351		xc = vcpu->arch.xive_vcpu;
    352
    353		/* Scan all priorities */
    354		pending = 0xff;
    355	} else {
    356		/* Grab pending interrupt if any */
    357		__be64 qw1 = __raw_readq(xive_tima + TM_QW1_OS);
    358		u8 pipr = be64_to_cpu(qw1) & 0xff;
    359
    360		if (pipr < 8)
    361			pending |= 1 << pipr;
    362	}
    363
    364	hirq = xive_vm_scan_interrupts(xc, pending, scan_poll);
    365
    366	/* Return interrupt and old CPPR in GPR4 */
    367	vcpu->arch.regs.gpr[4] = hirq | (xc->cppr << 24);
    368
    369	return H_SUCCESS;
    370}
    371
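       /*
        * Recompute the most favored pending priority (including a pending
        * MFRR/IPI) and set it in the OS pending state of the thread context
        * so that the VCPU gets notified if appropriate.
        */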
    372static void xive_vm_push_pending_to_hw(struct kvmppc_xive_vcpu *xc)
    373{
    374	u8 pending, prio;
    375
    376	pending = xc->pending;
    377	if (xc->mfrr != 0xff) {
    378		if (xc->mfrr < 8)
    379			pending |= 1 << xc->mfrr;
    380		else
    381			pending |= 0x80;
    382	}
    383	if (!pending)
    384		return;
    385	prio = ffs(pending) - 1;
    386
    387	__raw_writeb(prio, xive_tima + TM_SPC_SET_OS_PENDING);
    388}
    389
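       /*
        * Walk the now-masked queues of this VCPU and replace any interrupt
        * that has been re-routed to another server with XICS_DUMMY, then EOI
        * the source so it is re-delivered to its new target queue.
        */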
    390static void xive_vm_scan_for_rerouted_irqs(struct kvmppc_xive *xive,
    391					       struct kvmppc_xive_vcpu *xc)
    392{
    393	unsigned int prio;
    394
    395	/* For each priority that is now masked */
    396	for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
    397		struct xive_q *q = &xc->queues[prio];
    398		struct kvmppc_xive_irq_state *state;
    399		struct kvmppc_xive_src_block *sb;
    400		u32 idx, toggle, entry, irq, hw_num;
    401		struct xive_irq_data *xd;
    402		__be32 *qpage;
    403		u16 src;
    404
    405		idx = q->idx;
    406		toggle = q->toggle;
    407		qpage = READ_ONCE(q->qpage);
    408		if (!qpage)
    409			continue;
    410
    411		/* For each interrupt in the queue */
    412		for (;;) {
    413			entry = be32_to_cpup(qpage + idx);
    414
    415			/* No more ? */
    416			if ((entry >> 31) == toggle)
    417				break;
    418			irq = entry & 0x7fffffff;
    419
    420			/* Skip dummies and IPIs */
    421			if (irq == XICS_DUMMY || irq == XICS_IPI)
    422				goto next;
    423			sb = kvmppc_xive_find_source(xive, irq, &src);
    424			if (!sb)
    425				goto next;
    426			state = &sb->irq_state[src];
    427
    428			/* Has it been rerouted ? */
    429			if (xc->server_num == state->act_server)
    430				goto next;
    431
    432			/*
    433			 * All right, it *has* been re-routed, kill it from
    434			 * the queue.
    435			 */
    436			qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY);
    437
    438			/* Find the HW interrupt */
    439			kvmppc_xive_select_irq(state, &hw_num, &xd);
    440
    441			/* If it's not an LSI, set PQ to 11 so the EOI will force a resend */
    442			if (!(xd->flags & XIVE_IRQ_FLAG_LSI))
    443				xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
    444
    445			/* EOI the source */
    446			xive_vm_source_eoi(hw_num, xd);
    447
    448next:
    449			idx = (idx + 1) & q->msk;
    450			if (idx == 0)
    451				toggle ^= 1;
    452		}
    453	}
    454}
    455
    456static int xive_vm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
    457{
    458	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    459	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
    460	u8 old_cppr;
    461
    462	pr_devel("H_CPPR(cppr=%ld)\n", cppr);
    463
    464	xc->stat_vm_h_cppr++;
    465
    466	/* Map CPPR */
    467	cppr = xive_prio_from_guest(cppr);
    468
    469	/* Remember old and update SW state */
    470	old_cppr = xc->cppr;
    471	xc->cppr = cppr;
    472
    473	/*
    474	 * Order the above update of xc->cppr with the subsequent
    475	 * read of xc->mfrr inside push_pending_to_hw()
    476	 */
    477	smp_mb();
    478
    479	if (cppr > old_cppr) {
    480		/*
    481		 * We are masking less, we need to look for pending things
    482		 * to deliver and set VP pending bits accordingly to trigger
    483		 * a new interrupt, otherwise we might miss MFRR changes for
    484		 * which we have optimized out sending an IPI signal.
    485		 */
    486		xive_vm_push_pending_to_hw(xc);
    487	} else {
    488		/*
    489		 * We are masking more, we need to check the queue for any
    490		 * interrupt that has been routed to another CPU, take
    491		 * it out (replace it with the dummy) and retrigger it.
    492		 *
    493		 * This is necessary since those interrupts may otherwise
    494		 * never be processed, at least not until this CPU restores
    495		 * its CPPR.
    496		 *
    497		 * This is in theory racy vs. HW adding new interrupts to
    498		 * the queue. In practice this works because the interesting
    499		 * cases are when the guest has done a set_xive() to move the
    500		 * interrupt away, which flushes the xive, followed by the
    501		 * target CPU doing a H_CPPR. So any new interrupt coming into
    502		 * the queue must still be routed to us and isn't a source
    503		 * of concern.
    504		 */
    505		xive_vm_scan_for_rerouted_irqs(xive, xc);
    506	}
    507
    508	/* Apply new CPPR */
    509	xc->hw_cppr = cppr;
    510	__raw_writeb(cppr, xive_tima + TM_QW1_OS + TM_CPPR);
    511
    512	return H_SUCCESS;
    513}
    514
    515static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
    516{
    517	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
    518	struct kvmppc_xive_src_block *sb;
    519	struct kvmppc_xive_irq_state *state;
    520	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    521	struct xive_irq_data *xd;
    522	u8 new_cppr = xirr >> 24;
    523	u32 irq = xirr & 0x00ffffff, hw_num;
    524	u16 src;
    525	int rc = 0;
    526
    527	pr_devel("H_EOI(xirr=%08lx)\n", xirr);
    528
    529	xc->stat_vm_h_eoi++;
    530
    531	xc->cppr = xive_prio_from_guest(new_cppr);
    532
    533	/*
    534	 * IPIs are synthesized from MFRR and thus don't need
    535	 * any special EOI handling. The underlying interrupt
    536	 * used to signal MFRR changes is EOId when fetched from
    537	 * the queue.
    538	 */
    539	if (irq == XICS_IPI || irq == 0) {
    540		/*
    541		 * This barrier orders the setting of xc->cppr vs.
    542		 * subsequent test of xc->mfrr done inside
    543		 * scan_interrupts and push_pending_to_hw
    544		 */
    545		smp_mb();
    546		goto bail;
    547	}
    548
    549	/* Find interrupt source */
    550	sb = kvmppc_xive_find_source(xive, irq, &src);
    551	if (!sb) {
    552		pr_devel(" source not found !\n");
    553		rc = H_PARAMETER;
    554		/* Same as above */
    555		smp_mb();
    556		goto bail;
    557	}
    558	state = &sb->irq_state[src];
    559	kvmppc_xive_select_irq(state, &hw_num, &xd);
    560
    561	state->in_eoi = true;
    562
    563	/*
    564	 * This barrier orders both setting of in_eoi above vs.
    565	 * subsequent test of guest_priority, and the setting
    566	 * of xc->cppr vs. subsequent test of xc->mfrr done inside
    567	 * scan_interrupts and push_pending_to_hw
    568	 */
    569	smp_mb();
    570
    571again:
    572	if (state->guest_priority == MASKED) {
    573		arch_spin_lock(&sb->lock);
    574		if (state->guest_priority != MASKED) {
    575			arch_spin_unlock(&sb->lock);
    576			goto again;
    577		}
    578		pr_devel(" EOI on saved P...\n");
    579
    580		/* Clear old_p, that will cause unmask to perform an EOI */
    581		state->old_p = false;
    582
    583		arch_spin_unlock(&sb->lock);
    584	} else {
    585		pr_devel(" EOI on source...\n");
    586
    587		/* Perform EOI on the source */
    588		xive_vm_source_eoi(hw_num, xd);
    589
    590		/* If it's an emulated LSI, check level and resend */
    591		if (state->lsi && state->asserted)
    592			__raw_writeq(0, __x_trig_page(xd));
    593
    594	}
    595
    596	/*
    597	 * This barrier orders the above guest_priority check
    598	 * and spin_lock/unlock with clearing in_eoi below.
    599	 *
    600	 * It also has to be a full mb() as it must ensure
    601	 * the MMIOs done in source_eoi() are completed before
    602	 * state->in_eoi is visible.
    603	 */
    604	mb();
    605	state->in_eoi = false;
    606bail:
    607
    608	/* Re-evaluate pending IRQs and update HW */
    609	xive_vm_scan_interrupts(xc, xc->pending, scan_eoi);
    610	xive_vm_push_pending_to_hw(xc);
    611	pr_devel(" after scan pending=%02x\n", xc->pending);
    612
    613	/* Apply new CPPR */
    614	xc->hw_cppr = xc->cppr;
    615	__raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
    616
    617	return rc;
    618}
    619
    620static int xive_vm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
    621			       unsigned long mfrr)
    622{
    623	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    624
    625	pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr);
    626
    627	xc->stat_vm_h_ipi++;
    628
    629	/* Find target */
    630	vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
    631	if (!vcpu)
    632		return H_PARAMETER;
    633	xc = vcpu->arch.xive_vcpu;
    634
    635	/* Locklessly write over MFRR */
    636	xc->mfrr = mfrr;
    637
    638	/*
    639	 * The load of xc->cppr below and the subsequent MMIO store
    640	 * to the IPI must happen after the above mfrr update is
    641	 * globally visible so that:
    642	 *
    643	 * - Synchronize with another CPU doing an H_EOI or a H_CPPR
    644	 *   updating xc->cppr then reading xc->mfrr.
    645	 *
    646	 * - The target of the IPI sees the xc->mfrr update
    647	 */
    648	mb();
    649
    650	/* Shoot the IPI if more favored than the target cppr */
    651	if (mfrr < xc->cppr)
    652		__raw_writeq(0, __x_trig_page(&xc->vp_ipi_data));
    653
    654	return H_SUCCESS;
    655}
    656
    657/*
    658 * We leave a gap of a couple of interrupts in the queue to
    659 * account for the IPI and as an additional safety guard.
    660 */
    661#define XIVE_Q_GAP	2
    662
    663static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu)
    664{
    665	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    666
    667	/* Check enablement at VP level */
    668	return xc->vp_cam & TM_QW1W2_HO;
    669}
    670
    671bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu)
    672{
    673	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    674	struct kvmppc_xive *xive = xc->xive;
    675
    676	if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE)
    677		return kvmppc_xive_vcpu_has_save_restore(vcpu);
    678
    679	return true;
    680}
    681
    682/*
    683 * Push a vcpu's context to the XIVE on guest entry.
    684 * This assumes we are in virtual mode (MMU on)
    685 */
    686void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
    687{
    688	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
    689	u64 pq;
    690
    691	/*
    692	 * Nothing to do if the platform doesn't have a XIVE
    693	 * or this vCPU doesn't have its own XIVE context
    694	 * (e.g. because it's not using an in-kernel interrupt controller).
    695	 */
    696	if (!tima || !vcpu->arch.xive_cam_word)
    697		return;
    698
    699	eieio();
    700	if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
    701		__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
    702	__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
    703	vcpu->arch.xive_pushed = 1;
    704	eieio();
    705
    706	/*
    707	 * We clear the irq_pending flag. There is a small chance of a
    708	 * race vs. the escalation interrupt happening on another
    709	 * processor setting it again, but the only consequence is to
    710	 * cause a spurious wakeup on the next H_CEDE, which is not an
    711	 * issue.
    712	 */
    713	vcpu->arch.irq_pending = 0;
    714
    715	/*
    716	 * In single escalation mode, if the escalation interrupt is
    717	 * on, we mask it.
    718	 */
    719	if (vcpu->arch.xive_esc_on) {
    720		pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
    721						  XIVE_ESB_SET_PQ_01));
    722		mb();
    723
    724		/*
    725		 * We have a possible subtle race here: The escalation
    726		 * interrupt might have fired and be on its way to the
    727		 * host queue while we mask it, and if we unmask it
    728		 * early enough (re-cede right away), there is a
    729		 * theoretical possibility that it fires again, thus
    730		 * landing in the target queue more than once which is
    731		 * a big no-no.
    732		 *
    733		 * Fortunately, solving this is rather easy. If the
    734		 * above load setting PQ to 01 returns a previous
    735		 * value where P is set, then we know the escalation
    736		 * interrupt is somewhere on its way to the host. In
    737		 * that case we simply don't clear the xive_esc_on
    738		 * flag below. It will be eventually cleared by the
    739		 * handler for the escalation interrupt.
    740		 *
    741		 * Then, when doing a cede, we check that flag again
    742		 * before re-enabling the escalation interrupt, and if
    743		 * set, we abort the cede.
    744		 */
    745		if (!(pq & XIVE_ESB_VAL_P))
    746			/* Now P is 0, we can clear the flag */
    747			vcpu->arch.xive_esc_on = 0;
    748	}
    749}
    750EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
    751
    752/*
    753 * Pull a vcpu's context from the XIVE on guest exit.
    754 * This assumes we are in virtual mode (MMU on)
    755 */
    756void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
    757{
    758	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
    759
    760	if (!vcpu->arch.xive_pushed)
    761		return;
    762
    763	/*
    764	 * Should not have been pushed if there is no tima
    765	 */
    766	if (WARN_ON(!tima))
    767		return;
    768
    769	eieio();
    770	/* First load to pull the context, we ignore the value */
    771	__raw_readl(tima + TM_SPC_PULL_OS_CTX);
    772	/* Second load to recover the context state (Words 0 and 1) */
    773	if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
    774		vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);
    775
    776	/* Fixup some of the state for the next load */
    777	vcpu->arch.xive_saved_state.lsmfb = 0;
    778	vcpu->arch.xive_saved_state.ack = 0xff;
    779	vcpu->arch.xive_pushed = 0;
    780	eieio();
    781}
    782EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
    783
    784bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
    785{
    786	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
    787	bool ret = true;
    788
    789	if (!esc_vaddr)
    790		return ret;
    791
    792	/* we are using XIVE with single escalation */
    793
    794	if (vcpu->arch.xive_esc_on) {
    795		/*
    796		 * If we still have a pending escalation, abort the cede,
    797		 * and we must set PQ to 10 rather than 00 so that we don't
    798		 * potentially end up with two entries for the escalation
    799		 * interrupt in the XIVE interrupt queue.  In that case
    800		 * we also don't want to set xive_esc_on to 1 here in
    801		 * case we race with xive_esc_irq().
    802		 */
    803		ret = false;
    804		/*
    805		 * The escalation interrupts are special as we don't EOI them.
    806		 * There is no need to use the load-after-store ordering offset
    807		 * to set PQ to 10 as we won't use StoreEOI.
    808		 */
    809		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
    810	} else {
    811		vcpu->arch.xive_esc_on = true;
    812		mb();
    813		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
    814	}
    815	mb();
    816
    817	return ret;
    818}
    819EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);
    820
    821/*
    822 * This is a simple trigger for a generic XIVE IRQ. This must
    823 * only be called for interrupts that support a trigger page
    824 */
    825static bool xive_irq_trigger(struct xive_irq_data *xd)
    826{
    827	/* This should be only for MSIs */
    828	if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
    829		return false;
    830
    831	/* Those interrupts should always have a trigger page */
    832	if (WARN_ON(!xd->trig_mmio))
    833		return false;
    834
    835	out_be64(xd->trig_mmio, 0);
    836
    837	return true;
    838}
    839
    840static irqreturn_t xive_esc_irq(int irq, void *data)
    841{
    842	struct kvm_vcpu *vcpu = data;
    843
    844	vcpu->arch.irq_pending = 1;
    845	smp_mb();
    846	if (vcpu->arch.ceded || vcpu->arch.nested)
    847		kvmppc_fast_vcpu_kick(vcpu);
    848
    849	/* Since we have the no-EOI flag, the interrupt is effectively
    850	 * disabled now. Clearing xive_esc_on means we won't bother
    851	 * doing so on the next entry.
    852	 *
    853	 * This also allows the entry code to know that if a PQ combination
    854	 * of 10 is observed while xive_esc_on is true, it means the queue
    855	 * contains an unprocessed escalation interrupt. We don't make use of
    856	 * that knowledge today but might (see comment in book3s_hv_rmhandlers.S)
    857	 */
    858	vcpu->arch.xive_esc_on = false;
    859
    860	/* This orders xive_esc_on = false vs. subsequent stale_p = true */
    861	smp_wmb();	/* goes with smp_mb() in cleanup_single_escalation */
    862
    863	return IRQ_HANDLED;
    864}
    865
    866int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
    867				  bool single_escalation)
    868{
    869	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    870	struct xive_q *q = &xc->queues[prio];
    871	char *name = NULL;
    872	int rc;
    873
    874	/* Already there ? */
    875	if (xc->esc_virq[prio])
    876		return 0;
    877
    878	/* Hook up the escalation interrupt */
    879	xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq);
    880	if (!xc->esc_virq[prio]) {
    881		pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n",
    882		       prio, xc->server_num);
    883		return -EIO;
    884	}
    885
    886	if (single_escalation)
    887		name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
    888				 vcpu->kvm->arch.lpid, xc->server_num);
    889	else
    890		name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
    891				 vcpu->kvm->arch.lpid, xc->server_num, prio);
    892	if (!name) {
    893		pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
    894		       prio, xc->server_num);
    895		rc = -ENOMEM;
    896		goto error;
    897	}
    898
    899	pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio);
    900
    901	rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
    902			 IRQF_NO_THREAD, name, vcpu);
    903	if (rc) {
    904		pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n",
    905		       prio, xc->server_num);
    906		goto error;
    907	}
    908	xc->esc_virq_names[prio] = name;
    909
    910	/* In single escalation mode, we grab the ESB MMIO of the
    911	 * interrupt and mask it. Also populate the VCPU v/raddr
    912	 * of the ESB page for use by asm entry/exit code. Finally
    913	 * set the XIVE_IRQ_FLAG_NO_EOI flag which will prevent the
    914	 * core code from performing an EOI on the escalation
    915	 * interrupt, thus leaving it effectively masked after
    916	 * it fires once.
    917	 */
    918	if (single_escalation) {
    919		struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
    920		struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
    921
    922		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
    923		vcpu->arch.xive_esc_raddr = xd->eoi_page;
    924		vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio;
    925		xd->flags |= XIVE_IRQ_FLAG_NO_EOI;
    926	}
    927
    928	return 0;
    929error:
    930	irq_dispose_mapping(xc->esc_virq[prio]);
    931	xc->esc_virq[prio] = 0;
    932	kfree(name);
    933	return rc;
    934}
    935
    936static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
    937{
    938	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
    939	struct kvmppc_xive *xive = xc->xive;
    940	struct xive_q *q =  &xc->queues[prio];
    941	void *qpage;
    942	int rc;
    943
    944	if (WARN_ON(q->qpage))
    945		return 0;
    946
    947	/* Allocate the queue and retrieve info on the current node for now */
    948	qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order);
    949	if (!qpage) {
    950		pr_err("Failed to allocate queue %d for VCPU %d\n",
    951		       prio, xc->server_num);
    952		return -ENOMEM;
    953	}
    954	memset(qpage, 0, 1 << xive->q_order);
    955
    956	/*
    957	 * Reconfigure the queue. This will set q->qpage only once the
    958	 * queue is fully configured. This is a requirement for prio 0
    959	 * as we will stop doing EOIs for every IPI as soon as we observe
    960	 * qpage being non-NULL, and instead will only EOI when we receive
    961	 * corresponding queue 0 entries
    962	 */
    963	rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage,
    964					 xive->q_order, true);
    965	if (rc)
    966		pr_err("Failed to configure queue %d for VCPU %d\n",
    967		       prio, xc->server_num);
    968	return rc;
    969}
    970
    971/* Called with xive->lock held */
    972static int xive_check_provisioning(struct kvm *kvm, u8 prio)
    973{
    974	struct kvmppc_xive *xive = kvm->arch.xive;
    975	struct kvm_vcpu *vcpu;
    976	unsigned long i;
    977	int rc;
    978
    979	lockdep_assert_held(&xive->lock);
    980
    981	/* Already provisioned ? */
    982	if (xive->qmap & (1 << prio))
    983		return 0;
    984
    985	pr_devel("Provisioning prio... %d\n", prio);
    986
    987	/* Provision each VCPU and enable escalations if needed */
    988	kvm_for_each_vcpu(i, vcpu, kvm) {
    989		if (!vcpu->arch.xive_vcpu)
    990			continue;
    991		rc = xive_provision_queue(vcpu, prio);
    992		if (rc == 0 && !kvmppc_xive_has_single_escalation(xive))
    993			kvmppc_xive_attach_escalation(vcpu, prio,
    994						      kvmppc_xive_has_single_escalation(xive));
    995		if (rc)
    996			return rc;
    997	}
    998
    999	/* Order previous stores and mark it as provisioned */
   1000	mb();
   1001	xive->qmap |= (1 << prio);
   1002	return 0;
   1003}
   1004
   1005static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio)
   1006{
   1007	struct kvm_vcpu *vcpu;
   1008	struct kvmppc_xive_vcpu *xc;
   1009	struct xive_q *q;
   1010
   1011	/* Locate target server */
   1012	vcpu = kvmppc_xive_find_server(kvm, server);
   1013	if (!vcpu) {
   1014		pr_warn("%s: Can't find server %d\n", __func__, server);
   1015		return;
   1016	}
   1017	xc = vcpu->arch.xive_vcpu;
   1018	if (WARN_ON(!xc))
   1019		return;
   1020
   1021	q = &xc->queues[prio];
   1022	atomic_inc(&q->pending_count);
   1023}
   1024
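       /*
        * Try to account for one more interrupt on the given priority queue of
        * a VCPU. Fails with -EBUSY when the queue is full, always keeping
        * XIVE_Q_GAP entries spare.
        */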
   1025static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
   1026{
   1027	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   1028	struct xive_q *q;
   1029	u32 max;
   1030
   1031	if (WARN_ON(!xc))
   1032		return -ENXIO;
   1033	if (!xc->valid)
   1034		return -ENXIO;
   1035
   1036	q = &xc->queues[prio];
   1037	if (WARN_ON(!q->qpage))
   1038		return -ENXIO;
   1039
   1040	/* Calculate max number of interrupts in that queue. */
   1041	max = (q->msk + 1) - XIVE_Q_GAP;
   1042	return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
   1043}
   1044
   1045int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
   1046{
   1047	struct kvm_vcpu *vcpu;
   1048	unsigned long i;
   1049	int rc;
   1050
   1051	/* Locate target server */
   1052	vcpu = kvmppc_xive_find_server(kvm, *server);
   1053	if (!vcpu) {
   1054		pr_devel("Can't find server %d\n", *server);
   1055		return -EINVAL;
   1056	}
   1057
   1058	pr_devel("Finding irq target on 0x%x/%d...\n", *server, prio);
   1059
   1060	/* Try pick it */
   1061	rc = xive_try_pick_queue(vcpu, prio);
   1062	if (rc == 0)
   1063		return rc;
   1064
   1065	pr_devel(" .. failed, looking up candidate...\n");
   1066
   1067	/* Failed, pick another VCPU */
   1068	kvm_for_each_vcpu(i, vcpu, kvm) {
   1069		if (!vcpu->arch.xive_vcpu)
   1070			continue;
   1071		rc = xive_try_pick_queue(vcpu, prio);
   1072		if (rc == 0) {
   1073			*server = vcpu->arch.xive_vcpu->server_num;
   1074			pr_devel("  found on 0x%x/%d\n", *server, prio);
   1075			return rc;
   1076		}
   1077	}
   1078	pr_devel("  no available target !\n");
   1079
   1080	/* No available target ! */
   1081	return -EBUSY;
   1082}
   1083
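       /*
        * Lock the source block and mask the source: set PQ to 10 and record
        * the old P/Q bits, unless it was already masked. Retries if racing
        * with a concurrent H_EOI. Returns the previous guest priority.
        */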
   1084static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
   1085			     struct kvmppc_xive_src_block *sb,
   1086			     struct kvmppc_xive_irq_state *state)
   1087{
   1088	struct xive_irq_data *xd;
   1089	u32 hw_num;
   1090	u8 old_prio;
   1091	u64 val;
   1092
   1093	/*
   1094	 * Take the lock, set masked, try again if racing
   1095	 * with H_EOI
   1096	 */
   1097	for (;;) {
   1098		arch_spin_lock(&sb->lock);
   1099		old_prio = state->guest_priority;
   1100		state->guest_priority = MASKED;
   1101		mb();
   1102		if (!state->in_eoi)
   1103			break;
   1104		state->guest_priority = old_prio;
   1105		arch_spin_unlock(&sb->lock);
   1106	}
   1107
   1108	/* No change ? Bail */
   1109	if (old_prio == MASKED)
   1110		return old_prio;
   1111
   1112	/* Get the right irq */
   1113	kvmppc_xive_select_irq(state, &hw_num, &xd);
   1114
   1115	/* Set PQ to 10, return old P and old Q and remember them */
   1116	val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10);
   1117	state->old_p = !!(val & 2);
   1118	state->old_q = !!(val & 1);
   1119
   1120	/*
   1121	 * Synchronize hardware to ensure the queues are updated when
   1122	 * masking
   1123	 */
   1124	xive_native_sync_source(hw_num);
   1125
   1126	return old_prio;
   1127}
   1128
   1129static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb,
   1130				 struct kvmppc_xive_irq_state *state)
   1131{
   1132	/*
   1133	 * Take the lock, try again if racing with H_EOI
   1134	 */
   1135	for (;;) {
   1136		arch_spin_lock(&sb->lock);
   1137		if (!state->in_eoi)
   1138			break;
   1139		arch_spin_unlock(&sb->lock);
   1140	}
   1141}
   1142
   1143static void xive_finish_unmask(struct kvmppc_xive *xive,
   1144			       struct kvmppc_xive_src_block *sb,
   1145			       struct kvmppc_xive_irq_state *state,
   1146			       u8 prio)
   1147{
   1148	struct xive_irq_data *xd;
   1149	u32 hw_num;
   1150
   1151	/* If we aren't changing a thing, move on */
   1152	if (state->guest_priority != MASKED)
   1153		goto bail;
   1154
   1155	/* Get the right irq */
   1156	kvmppc_xive_select_irq(state, &hw_num, &xd);
   1157
   1158	/* Old Q set, set PQ to 11 */
   1159	if (state->old_q)
   1160		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
   1161
   1162	/*
   1163	 * If not old P, then perform an "effective" EOI
   1164	 * on the source. This will handle the cases where
   1165	 * FW EOI is needed.
   1166	 */
   1167	if (!state->old_p)
   1168		xive_vm_source_eoi(hw_num, xd);
   1169
   1170	/* Synchronize ordering and mark unmasked */
   1171	mb();
   1172bail:
   1173	state->guest_priority = prio;
   1174}
   1175
   1176/*
   1177 * Target an interrupt to a given server/prio. This will fall back
   1178 * to another server if necessary and perform the HW targetting
   1179 * updates as needed
   1180 *
   1181 * NOTE: Must be called with the state lock held
   1182 */
   1183static int xive_target_interrupt(struct kvm *kvm,
   1184				 struct kvmppc_xive_irq_state *state,
   1185				 u32 server, u8 prio)
   1186{
   1187	struct kvmppc_xive *xive = kvm->arch.xive;
   1188	u32 hw_num;
   1189	int rc;
   1190
   1191	/*
   1192	 * This will return a tentative server and actual
   1193	 * priority. The count for that new target will have
   1194	 * already been incremented.
   1195	 */
   1196	rc = kvmppc_xive_select_target(kvm, &server, prio);
   1197
   1198	/*
   1199	 * We failed to find a target ? Not much we can do
   1200	 * at least until we support the GIQ.
   1201	 */
   1202	if (rc)
   1203		return rc;
   1204
   1205	/*
   1206	 * Increment the old queue pending count if there
   1207	 * was one so that the old queue count gets adjusted later
   1208	 * when observed to be empty.
   1209	 */
   1210	if (state->act_priority != MASKED)
   1211		xive_inc_q_pending(kvm,
   1212				   state->act_server,
   1213				   state->act_priority);
   1214	/*
   1215	 * Update state and HW
   1216	 */
   1217	state->act_priority = prio;
   1218	state->act_server = server;
   1219
   1220	/* Get the right irq */
   1221	kvmppc_xive_select_irq(state, &hw_num, NULL);
   1222
   1223	return xive_native_configure_irq(hw_num,
   1224					 kvmppc_xive_vp(xive, server),
   1225					 prio, state->number);
   1226}
   1227
   1228/*
   1229 * Targetting rules: In order to avoid losing track of
   1230 * pending interrupts across mask and unmask, which would
   1231 * allow queue overflows, we implement the following rules:
   1232 *
   1233 *  - Unless it was never enabled (or we run out of capacity)
   1234 *    an interrupt is always targetted at a valid server/queue
   1235 *    pair even when "masked" by the guest. This pair tends to
   1236 *    be the last one used but it can be changed under some
   1237 *    circumstances. That allows us to separate targetting
   1238 *    from masking: we only handle accounting during (re)targetting,
   1239 *    and this also allows us to let an interrupt drain into its target
   1240 *    queue after masking, avoiding complex schemes to remove
   1241 *    interrupts from remote processor queues.
   1242 *
   1243 *  - When masking, we set PQ to 10 and save the previous value
   1244 *    of P and Q.
   1245 *
   1246 *  - When unmasking, if saved Q was set, we set PQ to 11
   1247 *    otherwise we leave PQ to the HW state which will be either
   1248 *    10 if nothing happened or 11 if the interrupt fired while
   1249 *    masked. Effectively we are OR'ing the previous Q into the
   1250 *    HW Q.
   1251 *
   1252 *    Then if saved P is clear, we do an effective EOI (Q->P->Trigger)
   1253 *    which will unmask the interrupt and shoot a new one if Q was
   1254 *    set.
   1255 *
   1256 *    Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11,
   1257 *    effectively meaning an H_EOI from the guest is still expected
   1258 *    for that interrupt).
   1259 *
   1260 *  - If H_EOI occurs while masked, we clear the saved P.
   1261 *
   1262 *  - When changing target, we account on the new target and
   1263 *    increment a separate "pending" counter on the old one.
   1264 *    This pending counter will be used to decrement the old
   1265 *    target's count when its queue has been observed empty.
   1266 */
   1267
   1268int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
   1269			 u32 priority)
   1270{
   1271	struct kvmppc_xive *xive = kvm->arch.xive;
   1272	struct kvmppc_xive_src_block *sb;
   1273	struct kvmppc_xive_irq_state *state;
   1274	u8 new_act_prio;
   1275	int rc = 0;
   1276	u16 idx;
   1277
   1278	if (!xive)
   1279		return -ENODEV;
   1280
   1281	pr_devel("set_xive ! irq 0x%x server 0x%x prio %d\n",
   1282		 irq, server, priority);
   1283
   1284	/* First, check provisioning of queues */
   1285	if (priority != MASKED) {
   1286		mutex_lock(&xive->lock);
   1287		rc = xive_check_provisioning(xive->kvm,
   1288			      xive_prio_from_guest(priority));
   1289		mutex_unlock(&xive->lock);
   1290	}
   1291	if (rc) {
   1292		pr_devel("  provisioning failure %d !\n", rc);
   1293		return rc;
   1294	}
   1295
   1296	sb = kvmppc_xive_find_source(xive, irq, &idx);
   1297	if (!sb)
   1298		return -EINVAL;
   1299	state = &sb->irq_state[idx];
   1300
   1301	/*
   1302	 * We first handle masking/unmasking since the locking
   1303	 * might need to be retried due to EOIs; we'll handle
   1304	 * targetting changes later. These functions will return
   1305	 * with the SB lock held.
   1306	 *
   1307	 * xive_lock_and_mask() will also set state->guest_priority
   1308	 * but won't otherwise change other fields of the state.
   1309	 *
   1310	 * xive_lock_for_unmask will not actually unmask, this will
   1311	 * be done later by xive_finish_unmask() once the targetting
   1312	 * has been done, so we don't try to unmask an interrupt
   1313	 * that hasn't yet been targetted.
   1314	 */
   1315	if (priority == MASKED)
   1316		xive_lock_and_mask(xive, sb, state);
   1317	else
   1318		xive_lock_for_unmask(sb, state);
   1319
   1320
   1321	/*
   1322	 * Then we handle targetting.
   1323	 *
   1324	 * First calculate a new "actual priority"
   1325	 */
   1326	new_act_prio = state->act_priority;
   1327	if (priority != MASKED)
   1328		new_act_prio = xive_prio_from_guest(priority);
   1329
   1330	pr_devel(" new_act_prio=%x act_server=%x act_prio=%x\n",
   1331		 new_act_prio, state->act_server, state->act_priority);
   1332
   1333	/*
   1334	 * Then check if we actually need to change anything.
   1335	 *
   1336	 * The condition for re-targetting the interrupt is that
   1337	 * we have a valid new priority (new_act_prio is not 0xff)
   1338	 * and either the server or the priority changed.
   1339	 *
   1340	 * Note: If act_priority was ff and the new priority is
   1341	 *       also ff, we don't do anything and leave the interrupt
   1342	 *       untargetted. An attempt of doing an int_on on an
   1343	 *       untargetted interrupt will fail. If that is a problem
   1344	 *       we could initialize interrupts with a valid default priority.
   1345	 */
   1346
   1347	if (new_act_prio != MASKED &&
   1348	    (state->act_server != server ||
   1349	     state->act_priority != new_act_prio))
   1350		rc = xive_target_interrupt(kvm, state, server, new_act_prio);
   1351
   1352	/*
   1353	 * Perform the final unmasking of the interrupt source
   1354	 * if necessary
   1355	 */
   1356	if (priority != MASKED)
   1357		xive_finish_unmask(xive, sb, state, priority);
   1358
   1359	/*
   1360	 * Finally, update saved_priority to match. Only int_on/off
   1361	 * set this field to a different value.
   1362	 */
   1363	state->saved_priority = priority;
   1364
   1365	arch_spin_unlock(&sb->lock);
   1366	return rc;
   1367}
   1368
   1369int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
   1370			 u32 *priority)
   1371{
   1372	struct kvmppc_xive *xive = kvm->arch.xive;
   1373	struct kvmppc_xive_src_block *sb;
   1374	struct kvmppc_xive_irq_state *state;
   1375	u16 idx;
   1376
   1377	if (!xive)
   1378		return -ENODEV;
   1379
   1380	sb = kvmppc_xive_find_source(xive, irq, &idx);
   1381	if (!sb)
   1382		return -EINVAL;
   1383	state = &sb->irq_state[idx];
   1384	arch_spin_lock(&sb->lock);
   1385	*server = state->act_server;
   1386	*priority = state->guest_priority;
   1387	arch_spin_unlock(&sb->lock);
   1388
   1389	return 0;
   1390}
   1391
   1392int kvmppc_xive_int_on(struct kvm *kvm, u32 irq)
   1393{
   1394	struct kvmppc_xive *xive = kvm->arch.xive;
   1395	struct kvmppc_xive_src_block *sb;
   1396	struct kvmppc_xive_irq_state *state;
   1397	u16 idx;
   1398
   1399	if (!xive)
   1400		return -ENODEV;
   1401
   1402	sb = kvmppc_xive_find_source(xive, irq, &idx);
   1403	if (!sb)
   1404		return -EINVAL;
   1405	state = &sb->irq_state[idx];
   1406
   1407	pr_devel("int_on(irq=0x%x)\n", irq);
   1408
   1409	/*
   1410	 * Check if interrupt was not targetted
   1411	 */
   1412	if (state->act_priority == MASKED) {
   1413		pr_devel("int_on on untargetted interrupt\n");
   1414		return -EINVAL;
   1415	}
   1416
   1417	/* If saved_priority is 0xff, do nothing */
   1418	if (state->saved_priority == MASKED)
   1419		return 0;
   1420
   1421	/*
   1422	 * Lock and unmask it.
   1423	 */
   1424	xive_lock_for_unmask(sb, state);
   1425	xive_finish_unmask(xive, sb, state, state->saved_priority);
   1426	arch_spin_unlock(&sb->lock);
   1427
   1428	return 0;
   1429}
   1430
   1431int kvmppc_xive_int_off(struct kvm *kvm, u32 irq)
   1432{
   1433	struct kvmppc_xive *xive = kvm->arch.xive;
   1434	struct kvmppc_xive_src_block *sb;
   1435	struct kvmppc_xive_irq_state *state;
   1436	u16 idx;
   1437
   1438	if (!xive)
   1439		return -ENODEV;
   1440
   1441	sb = kvmppc_xive_find_source(xive, irq, &idx);
   1442	if (!sb)
   1443		return -EINVAL;
   1444	state = &sb->irq_state[idx];
   1445
   1446	pr_devel("int_off(irq=0x%x)\n", irq);
   1447
   1448	/*
   1449	 * Lock and mask
   1450	 */
   1451	state->saved_priority = xive_lock_and_mask(xive, sb, state);
   1452	arch_spin_unlock(&sb->lock);
   1453
   1454	return 0;
   1455}
   1456
   1457static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq)
   1458{
   1459	struct kvmppc_xive_src_block *sb;
   1460	struct kvmppc_xive_irq_state *state;
   1461	u16 idx;
   1462
   1463	sb = kvmppc_xive_find_source(xive, irq, &idx);
   1464	if (!sb)
   1465		return false;
   1466	state = &sb->irq_state[idx];
   1467	if (!state->valid)
   1468		return false;
   1469
   1470	/*
   1471	 * Trigger the IPI. This assumes we never restore a pass-through
   1472	 * interrupt which should be safe enough
   1473	 */
   1474	xive_irq_trigger(&state->ipi_data);
   1475
   1476	return true;
   1477}
   1478
   1479u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
   1480{
   1481	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   1482
   1483	if (!xc)
   1484		return 0;
   1485
   1486	/* Return the per-cpu state for state saving/migration */
   1487	return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
   1488	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
   1489	       (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
   1490}
   1491
   1492int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
   1493{
   1494	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   1495	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
   1496	u8 cppr, mfrr;
   1497	u32 xisr;
   1498
   1499	if (!xc || !xive)
   1500		return -ENOENT;
   1501
   1502	/* Grab individual state fields. We don't use pending_pri */
   1503	cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
   1504	xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
   1505		KVM_REG_PPC_ICP_XISR_MASK;
   1506	mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
   1507
   1508	pr_devel("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n",
   1509		 xc->server_num, cppr, mfrr, xisr);
   1510
   1511	/*
   1512	 * We can't update the state of a "pushed" VCPU, but that
   1513	 * shouldn't happen because the vcpu->mutex makes running a
   1514	 * vcpu mutually exclusive with doing one_reg get/set on it.
   1515	 */
   1516	if (WARN_ON(vcpu->arch.xive_pushed))
   1517		return -EIO;
   1518
   1519	/* Update VCPU HW saved state */
   1520	vcpu->arch.xive_saved_state.cppr = cppr;
   1521	xc->hw_cppr = xc->cppr = cppr;
   1522
   1523	/*
   1524	 * Update MFRR state. If it's not 0xff, we mark the VCPU as
   1525	 * having a pending MFRR change, which will re-evaluate the
   1526	 * target. The VCPU will thus potentially get a spurious
   1527	 * interrupt but that's not a big deal.
   1528	 */
   1529	xc->mfrr = mfrr;
   1530	if (mfrr < cppr)
   1531		xive_irq_trigger(&xc->vp_ipi_data);
   1532
   1533	/*
   1534	 * Now saved XIRR is "interesting". It means there's something in
   1535	 * the legacy "1 element" queue... for an IPI we simply ignore it,
   1536	 * as the MFRR restore will handle that. For anything else we need
   1537	 * to force a resend of the source.
   1538	 * However, the source may not have been set up yet. If that's the
   1539	 * case, we keep that info and increment a counter in the xive to
   1540	 * tell subsequent xive_set_source() to go look.
   1541	 */
   1542	if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) {
   1543		xc->delayed_irq = xisr;
   1544		xive->delayed_irqs++;
   1545		pr_devel("  xisr restore delayed\n");
   1546	}
   1547
   1548	return 0;
   1549}
   1550
   1551int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
   1552			   unsigned long host_irq)
   1553{
   1554	struct kvmppc_xive *xive = kvm->arch.xive;
   1555	struct kvmppc_xive_src_block *sb;
   1556	struct kvmppc_xive_irq_state *state;
   1557	struct irq_data *host_data =
   1558		irq_domain_get_irq_data(irq_get_default_host(), host_irq);
   1559	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
   1560	u16 idx;
   1561	u8 prio;
   1562	int rc;
   1563
   1564	if (!xive)
   1565		return -ENODEV;
   1566
   1567	pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n",
   1568		 __func__, guest_irq, host_irq, hw_irq);
   1569
   1570	sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
   1571	if (!sb)
   1572		return -EINVAL;
   1573	state = &sb->irq_state[idx];
   1574
   1575	/*
   1576	 * Mark the passed-through interrupt as going to a VCPU,
   1577	 * this will prevent further EOIs and similar operations
   1578	 * from the XIVE code. It will also mask the interrupt
   1579	 * to either PQ=10 or 11 state, the latter if the interrupt
   1580	 * is pending. This will allow us to unmask or retrigger it
   1581	 * after routing it to the guest with a simple EOI.
   1582	 *
   1583	 * The "state" argument is a "token", all it needs is to be
   1584	 * non-NULL to switch to passed-through or NULL for the
   1585	 * other way around. We may not yet have an actual VCPU
   1586	 * target here and we don't really care.
   1587	 */
   1588	rc = irq_set_vcpu_affinity(host_irq, state);
   1589	if (rc) {
   1590		pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq);
   1591		return rc;
   1592	}
   1593
   1594	/*
   1595	 * Mask and read state of IPI. We need to know if its P bit
   1596	 * is set as that means it's potentially already using a
   1597	 * queue entry in the target
   1598	 */
   1599	prio = xive_lock_and_mask(xive, sb, state);
   1600	pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio,
   1601		 state->old_p, state->old_q);
   1602
   1603	/* Turn the IPI hard off */
   1604	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
   1605
   1606	/*
   1607	 * Reset ESB guest mapping. Needed when ESB pages are exposed
   1608	 * to the guest in XIVE native mode
   1609	 */
   1610	if (xive->ops && xive->ops->reset_mapped)
   1611		xive->ops->reset_mapped(kvm, guest_irq);
   1612
   1613	/* Grab info about irq */
   1614	state->pt_number = hw_irq;
   1615	state->pt_data = irq_data_get_irq_handler_data(host_data);
   1616
   1617	/*
   1618	 * Configure the IRQ to match the existing configuration of
   1619	 * the IPI if it was already targetted. Otherwise this will
   1620	 * mask the interrupt in a lossy way (act_priority is 0xff)
   1621	 * which is fine for a never started interrupt.
   1622	 */
   1623	xive_native_configure_irq(hw_irq,
   1624				  kvmppc_xive_vp(xive, state->act_server),
   1625				  state->act_priority, state->number);
   1626
   1627	/*
   1628	 * We do an EOI to enable the interrupt (and retrigger if needed)
   1629	 * if the guest has the interrupt unmasked and the P bit was *not*
   1630	 * set in the IPI. If it was set, we know a slot may still be in
   1631	 * use in the target queue thus we have to wait for a guest
   1632	 * originated EOI
   1633	 */
   1634	if (prio != MASKED && !state->old_p)
   1635		xive_vm_source_eoi(hw_irq, state->pt_data);
   1636
   1637	/* Clear old_p/old_q as they are no longer relevant */
   1638	state->old_p = state->old_q = false;
   1639
   1640	/* Restore guest prio (unlocks EOI) */
   1641	mb();
   1642	state->guest_priority = prio;
   1643	arch_spin_unlock(&sb->lock);
   1644
   1645	return 0;
   1646}
   1647EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
   1648
   1649int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
   1650			   unsigned long host_irq)
   1651{
   1652	struct kvmppc_xive *xive = kvm->arch.xive;
   1653	struct kvmppc_xive_src_block *sb;
   1654	struct kvmppc_xive_irq_state *state;
   1655	u16 idx;
   1656	u8 prio;
   1657	int rc;
   1658
   1659	if (!xive)
   1660		return -ENODEV;
   1661
   1662	pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__, guest_irq, host_irq);
   1663
   1664	sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
   1665	if (!sb)
   1666		return -EINVAL;
   1667	state = &sb->irq_state[idx];
   1668
   1669	/*
   1670	 * Mask and read state of IRQ. We need to know if its P bit
   1671	 * is set as that means it's potentially already using a
   1672	 * queue entry in the target
   1673	 */
   1674	prio = xive_lock_and_mask(xive, sb, state);
   1675	pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio,
   1676		 state->old_p, state->old_q);
   1677
   1678	/*
   1679	 * If old_p is set, the interrupt is pending, we switch it to
   1680	 * PQ=11. This will force a resend in the host so the interrupt
   1681	 * isn't lost to whatever host driver may pick it up
   1682	 */
   1683	if (state->old_p)
   1684		xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);
   1685
   1686	/* Release the passed-through interrupt to the host */
   1687	rc = irq_set_vcpu_affinity(host_irq, NULL);
   1688	if (rc) {
   1689		pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq);
   1690		return rc;
   1691	}
   1692
   1693	/* Forget about the IRQ */
   1694	state->pt_number = 0;
   1695	state->pt_data = NULL;
   1696
   1697	/*
   1698	 * Reset ESB guest mapping. Needed when ESB pages are exposed
   1699	 * to the guest in XIVE native mode
   1700	 */
   1701	if (xive->ops && xive->ops->reset_mapped) {
   1702		xive->ops->reset_mapped(kvm, guest_irq);
   1703	}
   1704
   1705	/* Reconfigure the IPI */
   1706	xive_native_configure_irq(state->ipi_number,
   1707				  kvmppc_xive_vp(xive, state->act_server),
   1708				  state->act_priority, state->number);
   1709
   1710	/*
   1711	 * If old_p is set (we have a queue entry potentially
   1712	 * occupied) or the interrupt is masked, we set the IPI
   1713	 * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
   1714	 */
   1715	if (prio == MASKED || state->old_p)
   1716		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10);
   1717	else
   1718		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00);
   1719
   1720	/* Restore guest prio (unlocks EOI) */
   1721	mb();
   1722	state->guest_priority = prio;
   1723	arch_spin_unlock(&sb->lock);
   1724
   1725	return 0;
   1726}
   1727EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
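/*
 * Hedged usage sketch (the callers live outside this file): the host
 * passthrough / irq-bypass code is expected to pair the two entry
 * points above roughly as
 *
 *	kvmppc_xive_set_mapped(kvm, guest_irq, host_irq);
 *	...
 *	kvmppc_xive_clr_mapped(kvm, guest_irq, host_irq);
 *
 * when a passed-through device interrupt is mapped to, and later
 * unmapped from, a guest interrupt number.
 */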
   1728
   1729void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
   1730{
   1731	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   1732	struct kvm *kvm = vcpu->kvm;
   1733	struct kvmppc_xive *xive = kvm->arch.xive;
   1734	int i, j;
   1735
   1736	for (i = 0; i <= xive->max_sbid; i++) {
   1737		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
   1738
   1739		if (!sb)
   1740			continue;
   1741		for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
   1742			struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
   1743
   1744			if (!state->valid)
   1745				continue;
   1746			if (state->act_priority == MASKED)
   1747				continue;
   1748			if (state->act_server != xc->server_num)
   1749				continue;
   1750
   1751			/* Clean it up */
   1752			arch_spin_lock(&sb->lock);
   1753			state->act_priority = MASKED;
   1754			xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
   1755			xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
   1756			if (state->pt_number) {
   1757				xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
   1758				xive_native_configure_irq(state->pt_number, 0, MASKED, 0);
   1759			}
   1760			arch_spin_unlock(&sb->lock);
   1761		}
   1762	}
   1763
   1764	/* Disable vcpu's escalation interrupt */
   1765	if (vcpu->arch.xive_esc_on) {
   1766		__raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
   1767					     XIVE_ESB_SET_PQ_01));
   1768		vcpu->arch.xive_esc_on = false;
   1769	}
   1770
   1771	/*
   1772	 * Clear pointers to escalation interrupt ESB.
   1773	 * This is safe because the vcpu->mutex is held, preventing
   1774	 * any other CPU from concurrently executing a KVM_RUN ioctl.
   1775	 */
   1776	vcpu->arch.xive_esc_vaddr = 0;
   1777	vcpu->arch.xive_esc_raddr = 0;
   1778}
   1779
   1780/*
   1781 * In single escalation mode, the escalation interrupt is marked so
   1782 * that EOI doesn't re-enable it, but just sets the stale_p flag to
   1783 * indicate that the P bit has already been dealt with.  However, the
   1784 * assembly code that enters the guest sets PQ to 00 without clearing
   1785 * stale_p (because it has no easy way to address it).  Hence we have
   1786 * to adjust stale_p before shutting down the interrupt.
   1787 */
   1788void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
   1789				    struct kvmppc_xive_vcpu *xc, int irq)
   1790{
   1791	struct irq_data *d = irq_get_irq_data(irq);
   1792	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
   1793
   1794	/*
   1795	 * This slightly odd sequence gives the right result
   1796	 * (i.e. stale_p set if xive_esc_on is false) even if
   1797	 * we race with xive_esc_irq() and xive_irq_eoi().
   1798	 */
   1799	xd->stale_p = false;
    1800	smp_mb();		/* paired with smp_wmb() in xive_esc_irq() */
   1801	if (!vcpu->arch.xive_esc_on)
   1802		xd->stale_p = true;
   1803}
   1804
   1805void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
   1806{
   1807	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   1808	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
   1809	int i;
   1810
   1811	if (!kvmppc_xics_enabled(vcpu))
   1812		return;
   1813
   1814	if (!xc)
   1815		return;
   1816
   1817	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
   1818
   1819	/* Ensure no interrupt is still routed to that VP */
   1820	xc->valid = false;
   1821	kvmppc_xive_disable_vcpu_interrupts(vcpu);
   1822
   1823	/* Mask the VP IPI */
   1824	xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
   1825
   1826	/* Free escalations */
   1827	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
   1828		if (xc->esc_virq[i]) {
   1829			if (kvmppc_xive_has_single_escalation(xc->xive))
   1830				xive_cleanup_single_escalation(vcpu, xc,
   1831							xc->esc_virq[i]);
   1832			free_irq(xc->esc_virq[i], vcpu);
   1833			irq_dispose_mapping(xc->esc_virq[i]);
   1834			kfree(xc->esc_virq_names[i]);
   1835		}
   1836	}
   1837
   1838	/* Disable the VP */
   1839	xive_native_disable_vp(xc->vp_id);
   1840
   1841	/* Clear the cam word so guest entry won't try to push context */
   1842	vcpu->arch.xive_cam_word = 0;
   1843
   1844	/* Free the queues */
   1845	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
   1846		struct xive_q *q = &xc->queues[i];
   1847
   1848		xive_native_disable_queue(xc->vp_id, q, i);
   1849		if (q->qpage) {
   1850			free_pages((unsigned long)q->qpage,
   1851				   xive->q_page_order);
   1852			q->qpage = NULL;
   1853		}
   1854	}
   1855
   1856	/* Free the IPI */
   1857	if (xc->vp_ipi) {
   1858		xive_cleanup_irq_data(&xc->vp_ipi_data);
   1859		xive_native_free_irq(xc->vp_ipi);
   1860	}
    1861	/* Free the per-vCPU XIVE state (the VP itself was disabled above) */
   1862	kfree(xc);
   1863
   1864	/* Cleanup the vcpu */
   1865	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
   1866	vcpu->arch.xive_vcpu = NULL;
   1867}
   1868
   1869static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
   1870{
   1871	/* We have a block of xive->nr_servers VPs. We just need to check
   1872	 * packed vCPU ids are below that.
   1873	 */
   1874	return kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers;
   1875}
   1876
   1877int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp)
   1878{
   1879	u32 vp_id;
   1880
   1881	if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) {
   1882		pr_devel("Out of bounds !\n");
   1883		return -EINVAL;
   1884	}
   1885
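	/*
	 * The VP block is allocated lazily here, on the first vCPU
	 * connection, and sized for xive->nr_servers packed vCPU ids.
	 */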
   1886	if (xive->vp_base == XIVE_INVALID_VP) {
   1887		xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers);
   1888		pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers);
   1889
   1890		if (xive->vp_base == XIVE_INVALID_VP)
   1891			return -ENOSPC;
   1892	}
   1893
   1894	vp_id = kvmppc_xive_vp(xive, cpu);
   1895	if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
   1896		pr_devel("Duplicate !\n");
   1897		return -EEXIST;
   1898	}
   1899
   1900	*vp = vp_id;
   1901
   1902	return 0;
   1903}
   1904
   1905int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
   1906			     struct kvm_vcpu *vcpu, u32 cpu)
   1907{
   1908	struct kvmppc_xive *xive = dev->private;
   1909	struct kvmppc_xive_vcpu *xc;
   1910	int i, r = -EBUSY;
   1911	u32 vp_id;
   1912
   1913	pr_devel("connect_vcpu(cpu=%d)\n", cpu);
   1914
   1915	if (dev->ops != &kvm_xive_ops) {
   1916		pr_devel("Wrong ops !\n");
   1917		return -EPERM;
   1918	}
   1919	if (xive->kvm != vcpu->kvm)
   1920		return -EPERM;
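	/* The vCPU must not already be attached to an interrupt controller */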
   1921	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
   1922		return -EBUSY;
   1923
   1924	/* We need to synchronize with queue provisioning */
   1925	mutex_lock(&xive->lock);
   1926
   1927	r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id);
   1928	if (r)
   1929		goto bail;
   1930
   1931	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
   1932	if (!xc) {
   1933		r = -ENOMEM;
   1934		goto bail;
   1935	}
   1936
   1937	vcpu->arch.xive_vcpu = xc;
   1938	xc->xive = xive;
   1939	xc->vcpu = vcpu;
   1940	xc->server_num = cpu;
   1941	xc->vp_id = vp_id;
   1942	xc->mfrr = 0xff;
   1943	xc->valid = true;
   1944
   1945	r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
   1946	if (r)
   1947		goto bail;
   1948
   1949	if (!kvmppc_xive_check_save_restore(vcpu)) {
   1950		pr_err("inconsistent save-restore setup for VCPU %d\n", cpu);
   1951		r = -EIO;
   1952		goto bail;
   1953	}
   1954
   1955	/* Configure VCPU fields for use by assembly push/pull */
   1956	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
   1957	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
   1958
   1959	/* Allocate IPI */
   1960	xc->vp_ipi = xive_native_alloc_irq();
   1961	if (!xc->vp_ipi) {
   1962		pr_err("Failed to allocate xive irq for VCPU IPI\n");
   1963		r = -EIO;
   1964		goto bail;
   1965	}
   1966	pr_devel(" IPI=0x%x\n", xc->vp_ipi);
   1967
   1968	r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data);
   1969	if (r)
   1970		goto bail;
   1971
   1972	/*
   1973	 * Enable the VP first as the single escalation mode will
   1974	 * affect escalation interrupts numbering
   1975	 */
   1976	r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
   1977	if (r) {
   1978		pr_err("Failed to enable VP in OPAL, err %d\n", r);
   1979		goto bail;
   1980	}
   1981
   1982	/*
   1983	 * Initialize queues. Initially we set them all for no queueing
    1984	 * and we enable escalation for queue 0 only, which we'll use for
    1985	 * our mfrr change notifications. If the VCPU is hot-plugged, we
    1986	 * do, however, handle provisioning based on the existing "map"
   1987	 * of enabled queues.
   1988	 */
   1989	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
   1990		struct xive_q *q = &xc->queues[i];
   1991
   1992		/* Single escalation, no queue 7 */
   1993		if (i == 7 && kvmppc_xive_has_single_escalation(xive))
   1994			break;
   1995
   1996		/* Is queue already enabled ? Provision it */
   1997		if (xive->qmap & (1 << i)) {
   1998			r = xive_provision_queue(vcpu, i);
   1999			if (r == 0 && !kvmppc_xive_has_single_escalation(xive))
   2000				kvmppc_xive_attach_escalation(
   2001					vcpu, i, kvmppc_xive_has_single_escalation(xive));
   2002			if (r)
   2003				goto bail;
   2004		} else {
   2005			r = xive_native_configure_queue(xc->vp_id,
   2006							q, i, NULL, 0, true);
   2007			if (r) {
   2008				pr_err("Failed to configure queue %d for VCPU %d\n",
   2009				       i, cpu);
   2010				goto bail;
   2011			}
   2012		}
   2013	}
   2014
   2015	/* If not done above, attach priority 0 escalation */
   2016	r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive));
   2017	if (r)
   2018		goto bail;
   2019
   2020	/* Route the IPI */
   2021	r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
   2022	if (!r)
   2023		xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
   2024
   2025bail:
   2026	mutex_unlock(&xive->lock);
   2027	if (r) {
   2028		kvmppc_xive_cleanup_vcpu(vcpu);
   2029		return r;
   2030	}
   2031
   2032	vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
   2033	return 0;
   2034}
   2035
   2036/*
   2037 * Scanning of queues before/after migration save
   2038 */
   2039static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq)
   2040{
   2041	struct kvmppc_xive_src_block *sb;
   2042	struct kvmppc_xive_irq_state *state;
   2043	u16 idx;
   2044
   2045	sb = kvmppc_xive_find_source(xive, irq, &idx);
   2046	if (!sb)
   2047		return;
   2048
   2049	state = &sb->irq_state[idx];
   2050
   2051	/* Some sanity checking */
   2052	if (!state->valid) {
   2053		pr_err("invalid irq 0x%x in cpu queue!\n", irq);
   2054		return;
   2055	}
   2056
   2057	/*
   2058	 * If the interrupt is in a queue it should have P set.
    2059	 * We warn so that it gets reported. A backtrace isn't useful
   2060	 * so no need to use a WARN_ON.
   2061	 */
   2062	if (!state->saved_p)
   2063		pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq);
   2064
   2065	/* Set flag */
   2066	state->in_queue = true;
   2067}
   2068
   2069static void xive_pre_save_mask_irq(struct kvmppc_xive *xive,
   2070				   struct kvmppc_xive_src_block *sb,
   2071				   u32 irq)
   2072{
   2073	struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
   2074
   2075	if (!state->valid)
   2076		return;
   2077
   2078	/* Mask and save state, this will also sync HW queues */
   2079	state->saved_scan_prio = xive_lock_and_mask(xive, sb, state);
   2080
   2081	/* Transfer P and Q */
   2082	state->saved_p = state->old_p;
   2083	state->saved_q = state->old_q;
   2084
   2085	/* Unlock */
   2086	arch_spin_unlock(&sb->lock);
   2087}
   2088
   2089static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive,
   2090				     struct kvmppc_xive_src_block *sb,
   2091				     u32 irq)
   2092{
   2093	struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
   2094
   2095	if (!state->valid)
   2096		return;
   2097
   2098	/*
   2099	 * Lock / exclude EOI (not technically necessary if the
    2100	 * guest isn't running concurrently). If this becomes a
   2101	 * performance issue we can probably remove the lock.
   2102	 */
   2103	xive_lock_for_unmask(sb, state);
   2104
   2105	/* Restore mask/prio if it wasn't masked */
   2106	if (state->saved_scan_prio != MASKED)
   2107		xive_finish_unmask(xive, sb, state, state->saved_scan_prio);
   2108
   2109	/* Unlock */
   2110	arch_spin_unlock(&sb->lock);
   2111}
   2112
   2113static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q)
   2114{
   2115	u32 idx = q->idx;
   2116	u32 toggle = q->toggle;
   2117	u32 irq;
   2118
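	/*
	 * Walk the EQ page from the current index and toggle bit, and
	 * flag every interrupt found there (other than IPIs) as queued.
	 */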
   2119	do {
   2120		irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle);
   2121		if (irq > XICS_IPI)
   2122			xive_pre_save_set_queued(xive, irq);
    2123	} while (irq);
   2124}
   2125
   2126static void xive_pre_save_scan(struct kvmppc_xive *xive)
   2127{
   2128	struct kvm_vcpu *vcpu = NULL;
   2129	unsigned long i;
   2130	int j;
   2131
   2132	/*
   2133	 * See comment in xive_get_source() about how this
    2134	 * works. Collect a stable state for all interrupts.
   2135	 */
   2136	for (i = 0; i <= xive->max_sbid; i++) {
   2137		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
   2138		if (!sb)
   2139			continue;
   2140		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
   2141			xive_pre_save_mask_irq(xive, sb, j);
   2142	}
   2143
   2144	/* Then scan the queues and update the "in_queue" flag */
   2145	kvm_for_each_vcpu(i, vcpu, xive->kvm) {
   2146		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   2147		if (!xc)
   2148			continue;
   2149		for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
   2150			if (xc->queues[j].qpage)
   2151				xive_pre_save_queue(xive, &xc->queues[j]);
   2152		}
   2153	}
   2154
   2155	/* Finally restore interrupt states */
   2156	for (i = 0; i <= xive->max_sbid; i++) {
   2157		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
   2158		if (!sb)
   2159			continue;
   2160		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
   2161			xive_pre_save_unmask_irq(xive, sb, j);
   2162	}
   2163}
   2164
   2165static void xive_post_save_scan(struct kvmppc_xive *xive)
   2166{
   2167	u32 i, j;
   2168
   2169	/* Clear all the in_queue flags */
   2170	for (i = 0; i <= xive->max_sbid; i++) {
   2171		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
   2172		if (!sb)
   2173			continue;
   2174		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
   2175			sb->irq_state[j].in_queue = false;
   2176	}
   2177
   2178	/* Next get_source() will do a new scan */
   2179	xive->saved_src_count = 0;
   2180}
   2181
   2182/*
   2183 * This returns the source configuration and state to user space.
   2184 */
   2185static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
   2186{
   2187	struct kvmppc_xive_src_block *sb;
   2188	struct kvmppc_xive_irq_state *state;
   2189	u64 __user *ubufp = (u64 __user *) addr;
   2190	u64 val, prio;
   2191	u16 idx;
   2192
   2193	sb = kvmppc_xive_find_source(xive, irq, &idx);
   2194	if (!sb)
   2195		return -ENOENT;
   2196
   2197	state = &sb->irq_state[idx];
   2198
   2199	if (!state->valid)
   2200		return -ENOENT;
   2201
   2202	pr_devel("get_source(%ld)...\n", irq);
   2203
   2204	/*
   2205	 * So to properly save the state into something that looks like a
   2206	 * XICS migration stream we cannot treat interrupts individually.
   2207	 *
    2208	 * We need, instead, to mask them all (and save their previous PQ
    2209	 * state) to get a stable state in the HW, then sync them to ensure that
   2210	 * any interrupt that had already fired hits its queue, and finally
   2211	 * scan all the queues to collect which interrupts are still present
   2212	 * in the queues, so we can set the "pending" flag on them and
   2213	 * they can be resent on restore.
   2214	 *
    2215	 * So we do it all when the "first" interrupt gets saved; all the
    2216	 * state is collected at that point, and the rest of xive_get_source()
   2217	 * will merely collect and convert that state to the expected
   2218	 * userspace bit mask.
   2219	 */
   2220	if (xive->saved_src_count == 0)
   2221		xive_pre_save_scan(xive);
   2222	xive->saved_src_count++;
   2223
   2224	/* Convert saved state into something compatible with xics */
   2225	val = state->act_server;
   2226	prio = state->saved_scan_prio;
   2227
   2228	if (prio == MASKED) {
   2229		val |= KVM_XICS_MASKED;
   2230		prio = state->saved_priority;
   2231	}
   2232	val |= prio << KVM_XICS_PRIORITY_SHIFT;
   2233	if (state->lsi) {
   2234		val |= KVM_XICS_LEVEL_SENSITIVE;
   2235		if (state->saved_p)
   2236			val |= KVM_XICS_PENDING;
   2237	} else {
   2238		if (state->saved_p)
   2239			val |= KVM_XICS_PRESENTED;
   2240
   2241		if (state->saved_q)
   2242			val |= KVM_XICS_QUEUED;
   2243
   2244		/*
   2245		 * We mark it pending (which will attempt a re-delivery)
   2246		 * if we are in a queue *or* we were masked and had
    2247		 * Q set, which is equivalent to the XICS "masked pending"
    2248		 * state.
   2249		 */
   2250		if (state->in_queue || (prio == MASKED && state->saved_q))
   2251			val |= KVM_XICS_PENDING;
   2252	}
   2253
   2254	/*
   2255	 * If that was the last interrupt saved, reset the
   2256	 * in_queue flags
   2257	 */
   2258	if (xive->saved_src_count == xive->src_count)
   2259		xive_post_save_scan(xive);
   2260
   2261	/* Copy the result to userspace */
   2262	if (put_user(val, ubufp))
   2263		return -EFAULT;
   2264
   2265	return 0;
   2266}
   2267
   2268struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
   2269	struct kvmppc_xive *xive, int irq)
   2270{
   2271	struct kvmppc_xive_src_block *sb;
   2272	int i, bid;
   2273
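	/* The source block id is the upper part of the guest IRQ number */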
   2274	bid = irq >> KVMPPC_XICS_ICS_SHIFT;
   2275
   2276	mutex_lock(&xive->lock);
   2277
   2278	/* block already exists - somebody else got here first */
   2279	if (xive->src_blocks[bid])
   2280		goto out;
   2281
   2282	/* Create the ICS */
   2283	sb = kzalloc(sizeof(*sb), GFP_KERNEL);
   2284	if (!sb)
   2285		goto out;
   2286
   2287	sb->id = bid;
   2288
   2289	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
   2290		sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
   2291		sb->irq_state[i].eisn = 0;
   2292		sb->irq_state[i].guest_priority = MASKED;
   2293		sb->irq_state[i].saved_priority = MASKED;
   2294		sb->irq_state[i].act_priority = MASKED;
   2295	}
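	/* Order the block initialization above before publishing the pointer */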
   2296	smp_wmb();
   2297	xive->src_blocks[bid] = sb;
   2298
   2299	if (bid > xive->max_sbid)
   2300		xive->max_sbid = bid;
   2301
   2302out:
   2303	mutex_unlock(&xive->lock);
   2304	return xive->src_blocks[bid];
   2305}
   2306
   2307static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq)
   2308{
   2309	struct kvm *kvm = xive->kvm;
   2310	struct kvm_vcpu *vcpu = NULL;
   2311	unsigned long i;
   2312
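	/* Look for a vCPU that has this interrupt stashed as "delayed" */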
   2313	kvm_for_each_vcpu(i, vcpu, kvm) {
   2314		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   2315
   2316		if (!xc)
   2317			continue;
   2318
   2319		if (xc->delayed_irq == irq) {
   2320			xc->delayed_irq = 0;
   2321			xive->delayed_irqs--;
   2322			return true;
   2323		}
   2324	}
   2325	return false;
   2326}
   2327
   2328static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
   2329{
   2330	struct kvmppc_xive_src_block *sb;
   2331	struct kvmppc_xive_irq_state *state;
   2332	u64 __user *ubufp = (u64 __user *) addr;
   2333	u16 idx;
   2334	u64 val;
   2335	u8 act_prio, guest_prio;
   2336	u32 server;
   2337	int rc = 0;
   2338
   2339	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
   2340		return -ENOENT;
   2341
   2342	pr_devel("set_source(irq=0x%lx)\n", irq);
   2343
   2344	/* Find the source */
   2345	sb = kvmppc_xive_find_source(xive, irq, &idx);
   2346	if (!sb) {
   2347		pr_devel("No source, creating source block...\n");
   2348		sb = kvmppc_xive_create_src_block(xive, irq);
   2349		if (!sb) {
   2350			pr_devel("Failed to create block...\n");
   2351			return -ENOMEM;
   2352		}
   2353	}
   2354	state = &sb->irq_state[idx];
   2355
   2356	/* Read user passed data */
   2357	if (get_user(val, ubufp)) {
   2358		pr_devel("fault getting user info !\n");
   2359		return -EFAULT;
   2360	}
   2361
   2362	server = val & KVM_XICS_DESTINATION_MASK;
   2363	guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;
   2364
    2365	pr_devel("  val=0x%016llx (server=0x%x, guest_prio=%d)\n",
   2366		 val, server, guest_prio);
   2367
   2368	/*
   2369	 * If the source doesn't already have an IPI, allocate
   2370	 * one and get the corresponding data
   2371	 */
   2372	if (!state->ipi_number) {
   2373		state->ipi_number = xive_native_alloc_irq();
   2374		if (state->ipi_number == 0) {
   2375			pr_devel("Failed to allocate IPI !\n");
   2376			return -ENOMEM;
   2377		}
   2378		xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
   2379		pr_devel(" src_ipi=0x%x\n", state->ipi_number);
   2380	}
   2381
   2382	/*
   2383	 * We use lock_and_mask() to set us in the right masked
   2384	 * state. We will override that state from the saved state
   2385	 * further down, but this will handle the cases of interrupts
   2386	 * that need FW masking. We set the initial guest_priority to
   2387	 * 0 before calling it to ensure it actually performs the masking.
   2388	 */
   2389	state->guest_priority = 0;
   2390	xive_lock_and_mask(xive, sb, state);
   2391
   2392	/*
    2393	 * Now, we select a target if we have one. If we don't, we
    2394	 * leave the interrupt untargeted. It means that an interrupt
    2395	 * can become "untargeted" across migration if it was masked
    2396	 * by set_xive(), but there is little we can do about it.
   2397	 */
   2398
    2399	/* First convert prio and mark interrupt as untargeted */
   2400	act_prio = xive_prio_from_guest(guest_prio);
   2401	state->act_priority = MASKED;
   2402
   2403	/*
   2404	 * We need to drop the lock due to the mutex below. Hopefully
    2405	 * nothing is touching that interrupt since it hasn't been
    2406	 * advertised to a running guest yet.
   2407	 */
   2408	arch_spin_unlock(&sb->lock);
   2409
    2410	/* If we have a priority, target the interrupt */
   2411	if (act_prio != MASKED) {
   2412		/* First, check provisioning of queues */
   2413		mutex_lock(&xive->lock);
   2414		rc = xive_check_provisioning(xive->kvm, act_prio);
   2415		mutex_unlock(&xive->lock);
   2416
   2417		/* Target interrupt */
   2418		if (rc == 0)
   2419			rc = xive_target_interrupt(xive->kvm, state,
   2420						   server, act_prio);
   2421		/*
    2422		 * If provisioning or targeting failed, leave it
   2423		 * alone and masked. It will remain disabled until
   2424		 * the guest re-targets it.
   2425		 */
   2426	}
   2427
   2428	/*
   2429	 * Find out if this was a delayed irq stashed in an ICP,
   2430	 * in which case, treat it as pending
   2431	 */
   2432	if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
   2433		val |= KVM_XICS_PENDING;
   2434		pr_devel("  Found delayed ! forcing PENDING !\n");
   2435	}
   2436
   2437	/* Cleanup the SW state */
   2438	state->old_p = false;
   2439	state->old_q = false;
   2440	state->lsi = false;
   2441	state->asserted = false;
   2442
   2443	/* Restore LSI state */
   2444	if (val & KVM_XICS_LEVEL_SENSITIVE) {
   2445		state->lsi = true;
   2446		if (val & KVM_XICS_PENDING)
   2447			state->asserted = true;
   2448		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
   2449	}
   2450
   2451	/*
   2452	 * Restore P and Q. If the interrupt was pending, we
   2453	 * force Q and !P, which will trigger a resend.
   2454	 *
   2455	 * That means that a guest that had both an interrupt
   2456	 * pending (queued) and Q set will restore with only
   2457	 * one instance of that interrupt instead of 2, but that
   2458	 * is perfectly fine as coalescing interrupts that haven't
   2459	 * been presented yet is always allowed.
   2460	 */
   2461	if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
   2462		state->old_p = true;
   2463	if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
   2464		state->old_q = true;
   2465
   2466	pr_devel("  P=%d, Q=%d\n", state->old_p, state->old_q);
   2467
   2468	/*
   2469	 * If the interrupt was unmasked, update guest priority and
   2470	 * perform the appropriate state transition and do a
   2471	 * re-trigger if necessary.
   2472	 */
   2473	if (val & KVM_XICS_MASKED) {
   2474		pr_devel("  masked, saving prio\n");
   2475		state->guest_priority = MASKED;
   2476		state->saved_priority = guest_prio;
   2477	} else {
   2478		pr_devel("  unmasked, restoring to prio %d\n", guest_prio);
   2479		xive_finish_unmask(xive, sb, state, guest_prio);
   2480		state->saved_priority = guest_prio;
   2481	}
   2482
   2483	/* Increment the number of valid sources and mark this one valid */
   2484	if (!state->valid)
   2485		xive->src_count++;
   2486	state->valid = true;
   2487
   2488	return 0;
   2489}
   2490
   2491int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
   2492			bool line_status)
   2493{
   2494	struct kvmppc_xive *xive = kvm->arch.xive;
   2495	struct kvmppc_xive_src_block *sb;
   2496	struct kvmppc_xive_irq_state *state;
   2497	u16 idx;
   2498
   2499	if (!xive)
   2500		return -ENODEV;
   2501
   2502	sb = kvmppc_xive_find_source(xive, irq, &idx);
   2503	if (!sb)
   2504		return -EINVAL;
   2505
   2506	/* Perform locklessly .... (we need to do some RCUisms here...) */
   2507	state = &sb->irq_state[idx];
   2508	if (!state->valid)
   2509		return -EINVAL;
   2510
   2511	/* We don't allow a trigger on a passed-through interrupt */
   2512	if (state->pt_number)
   2513		return -EINVAL;
   2514
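	/*
	 * Track the asserted state for level interrupts; lowering the
	 * line only clears it and does not trigger anything.
	 */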
   2515	if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
   2516		state->asserted = true;
   2517	else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
   2518		state->asserted = false;
   2519		return 0;
   2520	}
   2521
   2522	/* Trigger the IPI */
   2523	xive_irq_trigger(&state->ipi_data);
   2524
   2525	return 0;
   2526}
   2527
   2528int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr)
   2529{
   2530	u32 __user *ubufp = (u32 __user *) addr;
   2531	u32 nr_servers;
   2532	int rc = 0;
   2533
   2534	if (get_user(nr_servers, ubufp))
   2535		return -EFAULT;
   2536
   2537	pr_devel("%s nr_servers=%u\n", __func__, nr_servers);
   2538
   2539	if (!nr_servers || nr_servers > KVM_MAX_VCPU_IDS)
   2540		return -EINVAL;
   2541
   2542	mutex_lock(&xive->lock);
   2543	if (xive->vp_base != XIVE_INVALID_VP)
   2544		/* The VP block is allocated once and freed when the device
    2545		 * is released. Better not allow changing its size since it's
    2546		 * used by connect_vcpu to validate that vCPU ids are valid (e.g.,
   2547		 * setting it back to a higher value could allow connect_vcpu
   2548		 * to come up with a VP id that goes beyond the VP block, which
   2549		 * is likely to cause a crash in OPAL).
   2550		 */
   2551		rc = -EBUSY;
   2552	else if (nr_servers > KVM_MAX_VCPUS)
   2553		/* We don't need more servers. Higher vCPU ids get packed
   2554		 * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id().
   2555		 */
   2556		xive->nr_servers = KVM_MAX_VCPUS;
   2557	else
   2558		xive->nr_servers = nr_servers;
   2559
   2560	mutex_unlock(&xive->lock);
   2561
   2562	return rc;
   2563}
   2564
   2565static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
   2566{
   2567	struct kvmppc_xive *xive = dev->private;
   2568
   2569	/* We honor the existing XICS ioctl */
   2570	switch (attr->group) {
   2571	case KVM_DEV_XICS_GRP_SOURCES:
   2572		return xive_set_source(xive, attr->attr, attr->addr);
   2573	case KVM_DEV_XICS_GRP_CTRL:
   2574		switch (attr->attr) {
   2575		case KVM_DEV_XICS_NR_SERVERS:
   2576			return kvmppc_xive_set_nr_servers(xive, attr->addr);
   2577		}
   2578	}
   2579	return -ENXIO;
   2580}
   2581
   2582static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
   2583{
   2584	struct kvmppc_xive *xive = dev->private;
   2585
   2586	/* We honor the existing XICS ioctl */
   2587	switch (attr->group) {
   2588	case KVM_DEV_XICS_GRP_SOURCES:
   2589		return xive_get_source(xive, attr->attr, attr->addr);
   2590	}
   2591	return -ENXIO;
   2592}
   2593
   2594static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
   2595{
   2596	/* We honor the same limits as XICS, at least for now */
   2597	switch (attr->group) {
   2598	case KVM_DEV_XICS_GRP_SOURCES:
   2599		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
   2600		    attr->attr < KVMPPC_XICS_NR_IRQS)
   2601			return 0;
   2602		break;
   2603	case KVM_DEV_XICS_GRP_CTRL:
   2604		switch (attr->attr) {
   2605		case KVM_DEV_XICS_NR_SERVERS:
   2606			return 0;
   2607		}
   2608	}
   2609	return -ENXIO;
   2610}
   2611
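/* Mask the source in HW (PQ=01) and detarget it */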
   2612static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
   2613{
   2614	xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
   2615	xive_native_configure_irq(hw_num, 0, MASKED, 0);
   2616}
   2617
   2618void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
   2619{
   2620	int i;
   2621
   2622	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
   2623		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
   2624
   2625		if (!state->valid)
   2626			continue;
   2627
   2628		kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
   2629		xive_cleanup_irq_data(&state->ipi_data);
   2630		xive_native_free_irq(state->ipi_number);
   2631
   2632		/* Pass-through, cleanup too but keep IRQ hw data */
   2633		if (state->pt_number)
   2634			kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);
   2635
   2636		state->valid = false;
   2637	}
   2638}
   2639
   2640/*
   2641 * Called when device fd is closed.  kvm->lock is held.
   2642 */
   2643static void kvmppc_xive_release(struct kvm_device *dev)
   2644{
   2645	struct kvmppc_xive *xive = dev->private;
   2646	struct kvm *kvm = xive->kvm;
   2647	struct kvm_vcpu *vcpu;
   2648	unsigned long i;
   2649
   2650	pr_devel("Releasing xive device\n");
   2651
   2652	/*
   2653	 * Since this is the device release function, we know that
   2654	 * userspace does not have any open fd referring to the
    2655	 * device.  Therefore none of the device attribute set/get
    2656	 * functions can be executing concurrently, and similarly,
    2657	 * the connect_vcpu and set/clr_mapped functions cannot be
    2658	 * executing either.
   2659	 */
   2660
   2661	debugfs_remove(xive->dentry);
   2662
   2663	/*
   2664	 * We should clean up the vCPU interrupt presenters first.
   2665	 */
   2666	kvm_for_each_vcpu(i, vcpu, kvm) {
   2667		/*
   2668		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
   2669		 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
   2670		 * Holding the vcpu->mutex also means that the vcpu cannot
   2671		 * be executing the KVM_RUN ioctl, and therefore it cannot
   2672		 * be executing the XIVE push or pull code or accessing
   2673		 * the XIVE MMIO regions.
   2674		 */
   2675		mutex_lock(&vcpu->mutex);
   2676		kvmppc_xive_cleanup_vcpu(vcpu);
   2677		mutex_unlock(&vcpu->mutex);
   2678	}
   2679
   2680	/*
   2681	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
   2682	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
   2683	 * against xive code getting called during vcpu execution or
   2684	 * set/get one_reg operations.
   2685	 */
   2686	kvm->arch.xive = NULL;
   2687
   2688	/* Mask and free interrupts */
   2689	for (i = 0; i <= xive->max_sbid; i++) {
   2690		if (xive->src_blocks[i])
   2691			kvmppc_xive_free_sources(xive->src_blocks[i]);
   2692		kfree(xive->src_blocks[i]);
   2693		xive->src_blocks[i] = NULL;
   2694	}
   2695
   2696	if (xive->vp_base != XIVE_INVALID_VP)
   2697		xive_native_free_vp_block(xive->vp_base);
   2698
   2699	/*
    2700	 * A reference to the kvmppc_xive pointer is now kept under
    2701	 * the xive_devices struct of the machine for reuse. For now it
    2702	 * is freed when the VM is destroyed, until we fix all the
   2703	 * execution paths.
   2704	 */
   2705
   2706	kfree(dev);
   2707}
   2708
   2709/*
   2710 * When the guest chooses the interrupt mode (XICS legacy or XIVE
    2711 * native), the VM will switch KVM devices. The previous device will
   2712 * be "released" before the new one is created.
   2713 *
   2714 * Until we are sure all execution paths are well protected, provide a
    2715 * fail-safe (transitional) method for device destruction, in which
   2716 * the XIVE device pointer is recycled and not directly freed.
   2717 */
   2718struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
   2719{
   2720	struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ?
   2721		&kvm->arch.xive_devices.native :
   2722		&kvm->arch.xive_devices.xics_on_xive;
   2723	struct kvmppc_xive *xive = *kvm_xive_device;
   2724
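	/* Reuse the recycled device structure if the VM already has one */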
   2725	if (!xive) {
   2726		xive = kzalloc(sizeof(*xive), GFP_KERNEL);
   2727		*kvm_xive_device = xive;
   2728	} else {
   2729		memset(xive, 0, sizeof(*xive));
   2730	}
   2731
   2732	return xive;
   2733}
   2734
   2735/*
   2736 * Create a XICS device with XIVE backend.  kvm->lock is held.
   2737 */
   2738static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
   2739{
   2740	struct kvmppc_xive *xive;
   2741	struct kvm *kvm = dev->kvm;
   2742
   2743	pr_devel("Creating xive for partition\n");
   2744
   2745	/* Already there ? */
   2746	if (kvm->arch.xive)
   2747		return -EEXIST;
   2748
   2749	xive = kvmppc_xive_get_device(kvm, type);
   2750	if (!xive)
   2751		return -ENOMEM;
   2752
   2753	dev->private = xive;
   2754	xive->dev = dev;
   2755	xive->kvm = kvm;
   2756	mutex_init(&xive->lock);
   2757
   2758	/* We use the default queue size set by the host */
   2759	xive->q_order = xive_native_default_eq_shift();
   2760	if (xive->q_order < PAGE_SHIFT)
   2761		xive->q_page_order = 0;
   2762	else
   2763		xive->q_page_order = xive->q_order - PAGE_SHIFT;
   2764
   2765	/* VP allocation is delayed to the first call to connect_vcpu */
   2766	xive->vp_base = XIVE_INVALID_VP;
    2767	/* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per socket
   2768	 * on a POWER9 system.
   2769	 */
   2770	xive->nr_servers = KVM_MAX_VCPUS;
   2771
   2772	if (xive_native_has_single_escalation())
   2773		xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
   2774
   2775	if (xive_native_has_save_restore())
   2776		xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;
   2777
   2778	kvm->arch.xive = xive;
   2779	return 0;
   2780}
   2781
   2782int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
   2783{
   2784	struct kvmppc_vcore *vc = vcpu->arch.vcore;
   2785
   2786	/* The VM should have configured XICS mode before doing XICS hcalls. */
   2787	if (!kvmppc_xics_enabled(vcpu))
   2788		return H_TOO_HARD;
   2789
   2790	switch (req) {
   2791	case H_XIRR:
   2792		return xive_vm_h_xirr(vcpu);
   2793	case H_CPPR:
   2794		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
   2795	case H_EOI:
   2796		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
   2797	case H_IPI:
   2798		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
   2799					  kvmppc_get_gpr(vcpu, 5));
   2800	case H_IPOLL:
   2801		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
   2802	case H_XIRR_X:
   2803		xive_vm_h_xirr(vcpu);
   2804		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
   2805		return H_SUCCESS;
   2806	}
   2807
   2808	return H_UNSUPPORTED;
   2809}
   2810EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
   2811
   2812int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
   2813{
   2814	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   2815	unsigned int i;
   2816
   2817	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
   2818		struct xive_q *q = &xc->queues[i];
   2819		u32 i0, i1, idx;
   2820
   2821		if (!q->qpage && !xc->esc_virq[i])
   2822			continue;
   2823
   2824		if (q->qpage) {
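			/* Dump the two entries at the current queue index */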
   2825			seq_printf(m, "    q[%d]: ", i);
   2826			idx = q->idx;
   2827			i0 = be32_to_cpup(q->qpage + idx);
   2828			idx = (idx + 1) & q->msk;
   2829			i1 = be32_to_cpup(q->qpage + idx);
   2830			seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
   2831				   i0, i1);
   2832		}
   2833		if (xc->esc_virq[i]) {
   2834			struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
   2835			struct xive_irq_data *xd =
   2836				irq_data_get_irq_handler_data(d);
   2837			u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
   2838
   2839			seq_printf(m, "    ESC %d %c%c EOI @%llx",
   2840				   xc->esc_virq[i],
   2841				   (pq & XIVE_ESB_VAL_P) ? 'P' : '-',
   2842				   (pq & XIVE_ESB_VAL_Q) ? 'Q' : '-',
   2843				   xd->eoi_page);
   2844			seq_puts(m, "\n");
   2845		}
   2846	}
   2847	return 0;
   2848}
   2849
   2850void kvmppc_xive_debug_show_sources(struct seq_file *m,
   2851				    struct kvmppc_xive_src_block *sb)
   2852{
   2853	int i;
   2854
   2855	seq_puts(m, "    LISN      HW/CHIP   TYPE    PQ      EISN    CPU/PRIO\n");
   2856	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
   2857		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
   2858		struct xive_irq_data *xd;
   2859		u64 pq;
   2860		u32 hw_num;
   2861
   2862		if (!state->valid)
   2863			continue;
   2864
   2865		kvmppc_xive_select_irq(state, &hw_num, &xd);
   2866
   2867		pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
   2868
   2869		seq_printf(m, "%08x  %08x/%02x", state->number, hw_num,
   2870			   xd->src_chip);
   2871		if (state->lsi)
   2872			seq_printf(m, " %cLSI", state->asserted ? '^' : ' ');
   2873		else
   2874			seq_puts(m, "  MSI");
   2875
   2876		seq_printf(m, " %s  %c%c  %08x   % 4d/%d",
   2877			   state->ipi_number == hw_num ? "IPI" : " PT",
   2878			   pq & XIVE_ESB_VAL_P ? 'P' : '-',
   2879			   pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
   2880			   state->eisn, state->act_server,
   2881			   state->act_priority);
   2882
   2883		seq_puts(m, "\n");
   2884	}
   2885}
   2886
   2887static int xive_debug_show(struct seq_file *m, void *private)
   2888{
   2889	struct kvmppc_xive *xive = m->private;
   2890	struct kvm *kvm = xive->kvm;
   2891	struct kvm_vcpu *vcpu;
   2892	u64 t_rm_h_xirr = 0;
   2893	u64 t_rm_h_ipoll = 0;
   2894	u64 t_rm_h_cppr = 0;
   2895	u64 t_rm_h_eoi = 0;
   2896	u64 t_rm_h_ipi = 0;
   2897	u64 t_vm_h_xirr = 0;
   2898	u64 t_vm_h_ipoll = 0;
   2899	u64 t_vm_h_cppr = 0;
   2900	u64 t_vm_h_eoi = 0;
   2901	u64 t_vm_h_ipi = 0;
   2902	unsigned long i;
   2903
   2904	if (!kvm)
   2905		return 0;
   2906
   2907	seq_puts(m, "=========\nVCPU state\n=========\n");
   2908
   2909	kvm_for_each_vcpu(i, vcpu, kvm) {
   2910		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
   2911
   2912		if (!xc)
   2913			continue;
   2914
   2915		seq_printf(m, "VCPU %d: VP:%#x/%02x\n"
   2916			 "    CPPR:%#x HWCPPR:%#x MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
   2917			 xc->server_num, xc->vp_id, xc->vp_chip_id,
   2918			 xc->cppr, xc->hw_cppr,
   2919			 xc->mfrr, xc->pending,
   2920			 xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
   2921
   2922		kvmppc_xive_debug_show_queues(m, vcpu);
   2923
   2924		t_rm_h_xirr += xc->stat_rm_h_xirr;
   2925		t_rm_h_ipoll += xc->stat_rm_h_ipoll;
   2926		t_rm_h_cppr += xc->stat_rm_h_cppr;
   2927		t_rm_h_eoi += xc->stat_rm_h_eoi;
   2928		t_rm_h_ipi += xc->stat_rm_h_ipi;
   2929		t_vm_h_xirr += xc->stat_vm_h_xirr;
   2930		t_vm_h_ipoll += xc->stat_vm_h_ipoll;
   2931		t_vm_h_cppr += xc->stat_vm_h_cppr;
   2932		t_vm_h_eoi += xc->stat_vm_h_eoi;
   2933		t_vm_h_ipi += xc->stat_vm_h_ipi;
   2934	}
   2935
   2936	seq_puts(m, "Hcalls totals\n");
   2937	seq_printf(m, " H_XIRR  R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr);
   2938	seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll);
   2939	seq_printf(m, " H_CPPR  R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr);
   2940	seq_printf(m, " H_EOI   R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi);
   2941	seq_printf(m, " H_IPI   R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi);
   2942
   2943	seq_puts(m, "=========\nSources\n=========\n");
   2944
   2945	for (i = 0; i <= xive->max_sbid; i++) {
   2946		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
   2947
   2948		if (sb) {
   2949			arch_spin_lock(&sb->lock);
   2950			kvmppc_xive_debug_show_sources(m, sb);
   2951			arch_spin_unlock(&sb->lock);
   2952		}
   2953	}
   2954
   2955	return 0;
   2956}
   2957
   2958DEFINE_SHOW_ATTRIBUTE(xive_debug);
   2959
   2960static void xive_debugfs_init(struct kvmppc_xive *xive)
   2961{
   2962	xive->dentry = debugfs_create_file("xive", S_IRUGO, xive->kvm->debugfs_dentry,
   2963					   xive, &xive_debug_fops);
   2964
   2965	pr_debug("%s: created\n", __func__);
   2966}
   2967
   2968static void kvmppc_xive_init(struct kvm_device *dev)
   2969{
   2970	struct kvmppc_xive *xive = dev->private;
   2971
   2972	/* Register some debug interfaces */
   2973	xive_debugfs_init(xive);
   2974}
   2975
   2976struct kvm_device_ops kvm_xive_ops = {
   2977	.name = "kvm-xive",
   2978	.create = kvmppc_xive_create,
   2979	.init = kvmppc_xive_init,
   2980	.release = kvmppc_xive_release,
   2981	.set_attr = xive_set_attr,
   2982	.get_attr = xive_get_attr,
   2983	.has_attr = xive_has_attr,
   2984};