cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

book3s_hv.c (163451B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
      4 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
      5 *
      6 * Authors:
      7 *    Paul Mackerras <paulus@au1.ibm.com>
      8 *    Alexander Graf <agraf@suse.de>
      9 *    Kevin Wolf <mail@kevin-wolf.de>
     10 *
     11 * Description: KVM functions specific to running on Book 3S
     12 * processors in hypervisor mode (specifically POWER7 and later).
     13 *
     14 * This file is derived from arch/powerpc/kvm/book3s.c,
     15 * by Alexander Graf <agraf@suse.de>.
     16 */
     17
     18#include <linux/kvm_host.h>
     19#include <linux/kernel.h>
     20#include <linux/err.h>
     21#include <linux/slab.h>
     22#include <linux/preempt.h>
     23#include <linux/sched/signal.h>
     24#include <linux/sched/stat.h>
     25#include <linux/delay.h>
     26#include <linux/export.h>
     27#include <linux/fs.h>
     28#include <linux/anon_inodes.h>
     29#include <linux/cpu.h>
     30#include <linux/cpumask.h>
     31#include <linux/spinlock.h>
     32#include <linux/page-flags.h>
     33#include <linux/srcu.h>
     34#include <linux/miscdevice.h>
     35#include <linux/debugfs.h>
     36#include <linux/gfp.h>
     37#include <linux/vmalloc.h>
     38#include <linux/highmem.h>
     39#include <linux/hugetlb.h>
     40#include <linux/kvm_irqfd.h>
     41#include <linux/irqbypass.h>
     42#include <linux/module.h>
     43#include <linux/compiler.h>
     44#include <linux/of.h>
     45#include <linux/irqdomain.h>
     46
     47#include <asm/ftrace.h>
     48#include <asm/reg.h>
     49#include <asm/ppc-opcode.h>
     50#include <asm/asm-prototypes.h>
     51#include <asm/archrandom.h>
     52#include <asm/debug.h>
     53#include <asm/disassemble.h>
     54#include <asm/cputable.h>
     55#include <asm/cacheflush.h>
     56#include <linux/uaccess.h>
     57#include <asm/interrupt.h>
     58#include <asm/io.h>
     59#include <asm/kvm_ppc.h>
     60#include <asm/kvm_book3s.h>
     61#include <asm/mmu_context.h>
     62#include <asm/lppaca.h>
     63#include <asm/pmc.h>
     64#include <asm/processor.h>
     65#include <asm/cputhreads.h>
     66#include <asm/page.h>
     67#include <asm/hvcall.h>
     68#include <asm/switch_to.h>
     69#include <asm/smp.h>
     70#include <asm/dbell.h>
     71#include <asm/hmi.h>
     72#include <asm/pnv-pci.h>
     73#include <asm/mmu.h>
     74#include <asm/opal.h>
     75#include <asm/xics.h>
     76#include <asm/xive.h>
     77#include <asm/hw_breakpoint.h>
     78#include <asm/kvm_book3s_uvmem.h>
     79#include <asm/ultravisor.h>
     80#include <asm/dtl.h>
     81#include <asm/plpar_wrappers.h>
     82
     83#include "book3s.h"
     84#include "book3s_hv.h"
     85
     86#define CREATE_TRACE_POINTS
     87#include "trace_hv.h"
     88
     89/* #define EXIT_DEBUG */
     90/* #define EXIT_DEBUG_SIMPLE */
     91/* #define EXIT_DEBUG_INT */
     92
     93/* Used to indicate that a guest page fault needs to be handled */
     94#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
     95/* Used to indicate that a guest passthrough interrupt needs to be handled */
     96#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)
     97
     98/* Used as a "null" value for timebase values */
     99#define TB_NIL	(~(u64)0)
    100
    101static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
    102
    103static int dynamic_mt_modes = 6;
    104module_param(dynamic_mt_modes, int, 0644);
    105MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
    106static int target_smt_mode;
    107module_param(target_smt_mode, int, 0644);
    108MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
    109
    110static bool one_vm_per_core;
    111module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
    112MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");
    113
    114#ifdef CONFIG_KVM_XICS
    115static const struct kernel_param_ops module_param_ops = {
    116	.set = param_set_int,
    117	.get = param_get_int,
    118};
    119
    120module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, 0644);
    121MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
    122
    123module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
    124MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
    125#endif
    126
    127/* If set, guests are allowed to create and control nested guests */
    128static bool nested = true;
    129module_param(nested, bool, S_IRUGO | S_IWUSR);
    130MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
    131
    132static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
    133
    134/*
    135 * RWMR values for POWER8.  These control the rate at which PURR
    136 * and SPURR count and should be set according to the number of
    137 * online threads in the vcore being run.
    138 */
    139#define RWMR_RPA_P8_1THREAD	0x164520C62609AECAUL
    140#define RWMR_RPA_P8_2THREAD	0x7FFF2908450D8DA9UL
    141#define RWMR_RPA_P8_3THREAD	0x164520C62609AECAUL
    142#define RWMR_RPA_P8_4THREAD	0x199A421245058DA9UL
    143#define RWMR_RPA_P8_5THREAD	0x164520C62609AECAUL
    144#define RWMR_RPA_P8_6THREAD	0x164520C62609AECAUL
    145#define RWMR_RPA_P8_7THREAD	0x164520C62609AECAUL
    146#define RWMR_RPA_P8_8THREAD	0x164520C62609AECAUL
    147
    148static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = {
    149	RWMR_RPA_P8_1THREAD,
    150	RWMR_RPA_P8_1THREAD,
    151	RWMR_RPA_P8_2THREAD,
    152	RWMR_RPA_P8_3THREAD,
    153	RWMR_RPA_P8_4THREAD,
    154	RWMR_RPA_P8_5THREAD,
    155	RWMR_RPA_P8_6THREAD,
    156	RWMR_RPA_P8_7THREAD,
    157	RWMR_RPA_P8_8THREAD,
    158};
    159
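/*
 * Find the next non-NULL entry in vc->runnable_threads after index *ip,
 * update *ip to that index and return the vcpu, or NULL if none is left.
 * This is the helper behind the for_each_runnable_thread() iterator below.
 */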
    160static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
    161		int *ip)
    162{
    163	int i = *ip;
    164	struct kvm_vcpu *vcpu;
    165
    166	while (++i < MAX_SMT_THREADS) {
    167		vcpu = READ_ONCE(vc->runnable_threads[i]);
    168		if (vcpu) {
    169			*ip = i;
    170			return vcpu;
    171		}
    172	}
    173	return NULL;
    174}
    175
    176/* Used to traverse the list of runnable threads for a given vcore */
    177#define for_each_runnable_thread(i, vcpu, vc) \
    178	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
    179
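/*
 * Try to wake the target cpu with a lightweight IPI: msgsnd on POWER9
 * (any cpu), msgsnd on POWER8 for threads in the same core, or an XICS
 * IPI where available.  Returns false if the caller must fall back to
 * smp_send_reschedule().
 */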
    180static bool kvmppc_ipi_thread(int cpu)
    181{
    182	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
    183
    184	/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
    185	if (kvmhv_on_pseries())
    186		return false;
    187
    188	/* On POWER9 we can use msgsnd to IPI any cpu */
    189	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
    190		msg |= get_hard_smp_processor_id(cpu);
    191		smp_mb();
    192		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
    193		return true;
    194	}
    195
    196	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
    197	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
    198		preempt_disable();
    199		if (cpu_first_thread_sibling(cpu) ==
    200		    cpu_first_thread_sibling(smp_processor_id())) {
    201			msg |= cpu_thread_in_core(cpu);
    202			smp_mb();
    203			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
    204			preempt_enable();
    205			return true;
    206		}
    207		preempt_enable();
    208	}
    209
    210#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
    211	if (cpu >= 0 && cpu < nr_cpu_ids) {
    212		if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
    213			xics_wake_cpu(cpu);
    214			return true;
    215		}
    216		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
    217		return true;
    218	}
    219#endif
    220
    221	return false;
    222}
    223
    224static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
    225{
    226	int cpu;
    227	struct rcuwait *waitp;
    228
    229	/*
    230	 * rcuwait_wake_up contains smp_mb() which orders prior stores that
    231	 * create pending work vs below loads of cpu fields. The other side
    232	 * is the barrier in vcpu run that orders setting the cpu fields vs
    233	 * testing for pending work.
    234	 */
    235
    236	waitp = kvm_arch_vcpu_get_wait(vcpu);
    237	if (rcuwait_wake_up(waitp))
    238		++vcpu->stat.generic.halt_wakeup;
    239
    240	cpu = READ_ONCE(vcpu->arch.thread_cpu);
    241	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
    242		return;
    243
    244	/* CPU points to the first thread of the core */
    245	cpu = vcpu->cpu;
    246	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
    247		smp_send_reschedule(cpu);
    248}
    249
    250/*
    251 * We use the vcpu_load/put functions to measure stolen time.
    252 * Stolen time is counted as time when either the vcpu is able to
    253 * run as part of a virtual core, but the task running the vcore
    254 * is preempted or sleeping, or when the vcpu needs something done
    255 * in the kernel by the task running the vcpu, but that task is
    256 * preempted or sleeping.  Those two things have to be counted
    257 * separately, since one of the vcpu tasks will take on the job
    258 * of running the core, and the other vcpu tasks in the vcore will
    259 * sleep waiting for it to do that, but that sleep shouldn't count
    260 * as stolen time.
    261 *
    262 * Hence we accumulate stolen time when the vcpu can run as part of
    263 * a vcore using vc->stolen_tb, and the stolen time when the vcpu
    264 * needs its task to do other things in the kernel (for example,
    265 * service a page fault) in busy_stolen.  We don't accumulate
    266 * stolen time for a vcore when it is inactive, or for a vcpu
    267 * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
    268 * a misnomer; it means that the vcpu task is not executing in
    269 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
    270 * the kernel.  We don't have any way of dividing up that time
    271 * between time that the vcpu is genuinely stopped, time that
    272 * the task is actively working on behalf of the vcpu, and time
    273 * that the task is preempted, so we don't count any of it as
    274 * stolen.
    275 *
    276 * Updates to busy_stolen are protected by arch.tbacct_lock;
    277 * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
    278 * lock.  The stolen times are measured in units of timebase ticks.
    279 * (Note that the != TB_NIL checks below are purely defensive;
    280 * they should never fail.)
    281 */
    282
    283static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb)
    284{
    285	unsigned long flags;
    286
    287	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
    288
    289	spin_lock_irqsave(&vc->stoltb_lock, flags);
    290	vc->preempt_tb = tb;
    291	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
    292}
    293
    294static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb)
    295{
    296	unsigned long flags;
    297
    298	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
    299
    300	spin_lock_irqsave(&vc->stoltb_lock, flags);
    301	if (vc->preempt_tb != TB_NIL) {
    302		vc->stolen_tb += tb - vc->preempt_tb;
    303		vc->preempt_tb = TB_NIL;
    304	}
    305	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
    306}
    307
    308static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
    309{
    310	struct kvmppc_vcore *vc = vcpu->arch.vcore;
    311	unsigned long flags;
    312	u64 now;
    313
    314	if (cpu_has_feature(CPU_FTR_ARCH_300))
    315		return;
    316
    317	now = mftb();
    318
    319	/*
    320	 * We can test vc->runner without taking the vcore lock,
    321	 * because only this task ever sets vc->runner to this
    322	 * vcpu, and once it is set to this vcpu, only this task
    323	 * ever sets it to NULL.
    324	 */
    325	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
    326		kvmppc_core_end_stolen(vc, now);
    327
    328	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
    329	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
    330	    vcpu->arch.busy_preempt != TB_NIL) {
    331		vcpu->arch.busy_stolen += now - vcpu->arch.busy_preempt;
    332		vcpu->arch.busy_preempt = TB_NIL;
    333	}
    334	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
    335}
    336
    337static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
    338{
    339	struct kvmppc_vcore *vc = vcpu->arch.vcore;
    340	unsigned long flags;
    341	u64 now;
    342
    343	if (cpu_has_feature(CPU_FTR_ARCH_300))
    344		return;
    345
    346	now = mftb();
    347
    348	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
    349		kvmppc_core_start_stolen(vc, now);
    350
    351	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
    352	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
    353		vcpu->arch.busy_preempt = now;
    354	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
    355}
    356
    357static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
    358{
    359	vcpu->arch.pvr = pvr;
    360}
    361
    362/* Dummy value used in computing PCR value below */
    363#define PCR_ARCH_31    (PCR_ARCH_300 << 1)
    364
    365static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
    366{
    367	unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
    368	struct kvmppc_vcore *vc = vcpu->arch.vcore;
    369
    370	/* We can (emulate) our own architecture version and anything older */
    371	if (cpu_has_feature(CPU_FTR_ARCH_31))
    372		host_pcr_bit = PCR_ARCH_31;
    373	else if (cpu_has_feature(CPU_FTR_ARCH_300))
    374		host_pcr_bit = PCR_ARCH_300;
    375	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
    376		host_pcr_bit = PCR_ARCH_207;
    377	else if (cpu_has_feature(CPU_FTR_ARCH_206))
    378		host_pcr_bit = PCR_ARCH_206;
    379	else
    380		host_pcr_bit = PCR_ARCH_205;
    381
    382	/* Determine lowest PCR bit needed to run guest in given PVR level */
    383	guest_pcr_bit = host_pcr_bit;
    384	if (arch_compat) {
    385		switch (arch_compat) {
    386		case PVR_ARCH_205:
    387			guest_pcr_bit = PCR_ARCH_205;
    388			break;
    389		case PVR_ARCH_206:
    390		case PVR_ARCH_206p:
    391			guest_pcr_bit = PCR_ARCH_206;
    392			break;
    393		case PVR_ARCH_207:
    394			guest_pcr_bit = PCR_ARCH_207;
    395			break;
    396		case PVR_ARCH_300:
    397			guest_pcr_bit = PCR_ARCH_300;
    398			break;
    399		case PVR_ARCH_31:
    400			guest_pcr_bit = PCR_ARCH_31;
    401			break;
    402		default:
    403			return -EINVAL;
    404		}
    405	}
    406
    407	/* Check requested PCR bits don't exceed our capabilities */
    408	if (guest_pcr_bit > host_pcr_bit)
    409		return -EINVAL;
    410
    411	spin_lock(&vc->lock);
    412	vc->arch_compat = arch_compat;
    413	/*
    414	 * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
    415	 * Also set all reserved PCR bits
    416	 */
    417	vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
    418	spin_unlock(&vc->lock);
    419
    420	return 0;
    421}
    422
    423static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
    424{
    425	int r;
    426
    427	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
    428	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
    429	       vcpu->arch.regs.nip, vcpu->arch.shregs.msr, vcpu->arch.trap);
    430	for (r = 0; r < 16; ++r)
    431		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
    432		       r, kvmppc_get_gpr(vcpu, r),
    433		       r+16, kvmppc_get_gpr(vcpu, r+16));
    434	pr_err("ctr = %.16lx  lr  = %.16lx\n",
    435	       vcpu->arch.regs.ctr, vcpu->arch.regs.link);
    436	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
    437	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
    438	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
    439	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
    440	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
    441	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
    442	pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
    443	       vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
    444	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
    445	pr_err("fault dar = %.16lx dsisr = %.8x\n",
    446	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
    447	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
    448	for (r = 0; r < vcpu->arch.slb_max; ++r)
    449		pr_err("  ESID = %.16llx VSID = %.16llx\n",
    450		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
    451	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
    452	       vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
    453	       vcpu->arch.last_inst);
    454}
    455
    456static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
    457{
    458	return kvm_get_vcpu_by_id(kvm, id);
    459}
    460
    461static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
    462{
    463	vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
    464	vpa->yield_count = cpu_to_be32(1);
    465}
    466
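/*
 * Record a requested VPA/SLB-shadow/DTL area for this vcpu.  Only the
 * next_gpa/len/update_pending fields are set here; the page itself is
 * pinned later by kvmppc_update_vpa().
 */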
    467static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
    468		   unsigned long addr, unsigned long len)
    469{
    470	/* check address is cacheline aligned */
    471	if (addr & (L1_CACHE_BYTES - 1))
    472		return -EINVAL;
    473	spin_lock(&vcpu->arch.vpa_update_lock);
    474	if (v->next_gpa != addr || v->len != len) {
    475		v->next_gpa = addr;
    476		v->len = addr ? len : 0;
    477		v->update_pending = 1;
    478	}
    479	spin_unlock(&vcpu->arch.vpa_update_lock);
    480	return 0;
    481}
    482
    483/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
    484struct reg_vpa {
    485	u32 dummy;
    486	union {
    487		__be16 hword;
    488		__be32 word;
    489	} length;
    490};
    491
    492static int vpa_is_registered(struct kvmppc_vpa *vpap)
    493{
    494	if (vpap->update_pending)
    495		return vpap->next_gpa != 0;
    496	return vpap->pinned_addr != NULL;
    497}
    498
    499static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
    500				       unsigned long flags,
    501				       unsigned long vcpuid, unsigned long vpa)
    502{
    503	struct kvm *kvm = vcpu->kvm;
    504	unsigned long len, nb;
    505	void *va;
    506	struct kvm_vcpu *tvcpu;
    507	int err;
    508	int subfunc;
    509	struct kvmppc_vpa *vpap;
    510
    511	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
    512	if (!tvcpu)
    513		return H_PARAMETER;
    514
    515	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
    516	if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
    517	    subfunc == H_VPA_REG_SLB) {
    518		/* Registering new area - address must be cache-line aligned */
    519		if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
    520			return H_PARAMETER;
    521
    522		/* convert logical addr to kernel addr and read length */
    523		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
    524		if (va == NULL)
    525			return H_PARAMETER;
    526		if (subfunc == H_VPA_REG_VPA)
    527			len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
    528		else
    529			len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
    530		kvmppc_unpin_guest_page(kvm, va, vpa, false);
    531
    532		/* Check length */
    533		if (len > nb || len < sizeof(struct reg_vpa))
    534			return H_PARAMETER;
    535	} else {
    536		vpa = 0;
    537		len = 0;
    538	}
    539
    540	err = H_PARAMETER;
    541	vpap = NULL;
    542	spin_lock(&tvcpu->arch.vpa_update_lock);
    543
    544	switch (subfunc) {
    545	case H_VPA_REG_VPA:		/* register VPA */
    546		/*
    547		 * The size of our lppaca is 1kB because of the way we align
    548		 * it for the guest to avoid crossing a 4kB boundary. We only
    549		 * use 640 bytes of the structure though, so we should accept
    550		 * clients that set a size of 640.
    551		 */
    552		BUILD_BUG_ON(sizeof(struct lppaca) != 640);
    553		if (len < sizeof(struct lppaca))
    554			break;
    555		vpap = &tvcpu->arch.vpa;
    556		err = 0;
    557		break;
    558
    559	case H_VPA_REG_DTL:		/* register DTL */
    560		if (len < sizeof(struct dtl_entry))
    561			break;
    562		len -= len % sizeof(struct dtl_entry);
    563
    564		/* Check that they have previously registered a VPA */
    565		err = H_RESOURCE;
    566		if (!vpa_is_registered(&tvcpu->arch.vpa))
    567			break;
    568
    569		vpap = &tvcpu->arch.dtl;
    570		err = 0;
    571		break;
    572
    573	case H_VPA_REG_SLB:		/* register SLB shadow buffer */
    574		/* Check that they have previously registered a VPA */
    575		err = H_RESOURCE;
    576		if (!vpa_is_registered(&tvcpu->arch.vpa))
    577			break;
    578
    579		vpap = &tvcpu->arch.slb_shadow;
    580		err = 0;
    581		break;
    582
    583	case H_VPA_DEREG_VPA:		/* deregister VPA */
    584		/* Check they don't still have a DTL or SLB buf registered */
    585		err = H_RESOURCE;
    586		if (vpa_is_registered(&tvcpu->arch.dtl) ||
    587		    vpa_is_registered(&tvcpu->arch.slb_shadow))
    588			break;
    589
    590		vpap = &tvcpu->arch.vpa;
    591		err = 0;
    592		break;
    593
    594	case H_VPA_DEREG_DTL:		/* deregister DTL */
    595		vpap = &tvcpu->arch.dtl;
    596		err = 0;
    597		break;
    598
    599	case H_VPA_DEREG_SLB:		/* deregister SLB shadow buffer */
    600		vpap = &tvcpu->arch.slb_shadow;
    601		err = 0;
    602		break;
    603	}
    604
    605	if (vpap) {
    606		vpap->next_gpa = vpa;
    607		vpap->len = len;
    608		vpap->update_pending = 1;
    609	}
    610
    611	spin_unlock(&tvcpu->arch.vpa_update_lock);
    612
    613	return err;
    614}
    615
    616static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
    617{
    618	struct kvm *kvm = vcpu->kvm;
    619	void *va;
    620	unsigned long nb;
    621	unsigned long gpa;
    622
    623	/*
    624	 * We need to pin the page pointed to by vpap->next_gpa,
    625	 * but we can't call kvmppc_pin_guest_page under the lock
    626	 * as it does get_user_pages() and down_read().  So we
    627	 * have to drop the lock, pin the page, then get the lock
    628	 * again and check that a new area didn't get registered
    629	 * in the meantime.
    630	 */
    631	for (;;) {
    632		gpa = vpap->next_gpa;
    633		spin_unlock(&vcpu->arch.vpa_update_lock);
    634		va = NULL;
    635		nb = 0;
    636		if (gpa)
    637			va = kvmppc_pin_guest_page(kvm, gpa, &nb);
    638		spin_lock(&vcpu->arch.vpa_update_lock);
    639		if (gpa == vpap->next_gpa)
    640			break;
    641		/* sigh... unpin that one and try again */
    642		if (va)
    643			kvmppc_unpin_guest_page(kvm, va, gpa, false);
    644	}
    645
    646	vpap->update_pending = 0;
    647	if (va && nb < vpap->len) {
    648		/*
    649		 * If it's now too short, it must be that userspace
    650		 * has changed the mappings underlying guest memory,
    651		 * so unregister the region.
    652		 */
    653		kvmppc_unpin_guest_page(kvm, va, gpa, false);
    654		va = NULL;
    655	}
    656	if (vpap->pinned_addr)
    657		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
    658					vpap->dirty);
    659	vpap->gpa = gpa;
    660	vpap->pinned_addr = va;
    661	vpap->dirty = false;
    662	if (va)
    663		vpap->pinned_end = va + vpap->len;
    664}
    665
    666static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
    667{
    668	if (!(vcpu->arch.vpa.update_pending ||
    669	      vcpu->arch.slb_shadow.update_pending ||
    670	      vcpu->arch.dtl.update_pending))
    671		return;
    672
    673	spin_lock(&vcpu->arch.vpa_update_lock);
    674	if (vcpu->arch.vpa.update_pending) {
    675		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
    676		if (vcpu->arch.vpa.pinned_addr)
    677			init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
    678	}
    679	if (vcpu->arch.dtl.update_pending) {
    680		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
    681		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
    682		vcpu->arch.dtl_index = 0;
    683	}
    684	if (vcpu->arch.slb_shadow.update_pending)
    685		kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
    686	spin_unlock(&vcpu->arch.vpa_update_lock);
    687}
    688
    689/*
    690 * Return the accumulated stolen time for the vcore up until `now'.
    691 * The caller should hold the vcore lock.
    692 */
    693static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
    694{
    695	u64 p;
    696	unsigned long flags;
    697
    698	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
    699
    700	spin_lock_irqsave(&vc->stoltb_lock, flags);
    701	p = vc->stolen_tb;
    702	if (vc->vcore_state != VCORE_INACTIVE &&
    703	    vc->preempt_tb != TB_NIL)
    704		p += now - vc->preempt_tb;
    705	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
    706	return p;
    707}
    708
    709static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
    710					unsigned int pcpu, u64 now,
    711					unsigned long stolen)
    712{
    713	struct dtl_entry *dt;
    714	struct lppaca *vpa;
    715
    716	dt = vcpu->arch.dtl_ptr;
    717	vpa = vcpu->arch.vpa.pinned_addr;
    718
    719	if (!dt || !vpa)
    720		return;
    721
    722	dt->dispatch_reason = 7;
    723	dt->preempt_reason = 0;
    724	dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid);
    725	dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
    726	dt->ready_to_enqueue_time = 0;
    727	dt->waiting_to_ready_time = 0;
    728	dt->timebase = cpu_to_be64(now);
    729	dt->fault_addr = 0;
    730	dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
    731	dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
    732
    733	++dt;
    734	if (dt == vcpu->arch.dtl.pinned_end)
    735		dt = vcpu->arch.dtl.pinned_addr;
    736	vcpu->arch.dtl_ptr = dt;
    737	/* order writing *dt vs. writing vpa->dtl_idx */
    738	smp_wmb();
    739	vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
    740	vcpu->arch.dtl.dirty = true;
    741}
    742
    743static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
    744				    struct kvmppc_vcore *vc)
    745{
    746	unsigned long stolen;
    747	unsigned long core_stolen;
    748	u64 now;
    749	unsigned long flags;
    750
    751	now = mftb();
    752
    753	core_stolen = vcore_stolen_time(vc, now);
    754	stolen = core_stolen - vcpu->arch.stolen_logged;
    755	vcpu->arch.stolen_logged = core_stolen;
    756	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
    757	stolen += vcpu->arch.busy_stolen;
    758	vcpu->arch.busy_stolen = 0;
    759	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
    760
    761	__kvmppc_create_dtl_entry(vcpu, vc->pcpu, now + vc->tb_offset, stolen);
    762}
    763
    764/* See if there is a doorbell interrupt pending for a vcpu */
    765static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
    766{
    767	int thr;
    768	struct kvmppc_vcore *vc;
    769
    770	if (vcpu->arch.doorbell_request)
    771		return true;
    772	if (cpu_has_feature(CPU_FTR_ARCH_300))
    773		return false;
    774	/*
    775	 * Ensure that the read of vcore->dpdes comes after the read
    776	 * of vcpu->doorbell_request.  This barrier matches the
    777	 * smp_wmb() in kvmppc_guest_entry_inject().
    778	 */
    779	smp_rmb();
    780	vc = vcpu->arch.vcore;
    781	thr = vcpu->vcpu_id - vc->first_vcpuid;
    782	return !!(vc->dpdes & (1 << thr));
    783}
    784
    785static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
    786{
    787	if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
    788		return true;
    789	if ((!vcpu->arch.vcore->arch_compat) &&
    790	    cpu_has_feature(CPU_FTR_ARCH_207S))
    791		return true;
    792	return false;
    793}
    794
    795static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
    796			     unsigned long resource, unsigned long value1,
    797			     unsigned long value2)
    798{
    799	switch (resource) {
    800	case H_SET_MODE_RESOURCE_SET_CIABR:
    801		if (!kvmppc_power8_compatible(vcpu))
    802			return H_P2;
    803		if (value2)
    804			return H_P4;
    805		if (mflags)
    806			return H_UNSUPPORTED_FLAG_START;
    807		/* Guests can't breakpoint the hypervisor */
    808		if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
    809			return H_P3;
    810		vcpu->arch.ciabr  = value1;
    811		return H_SUCCESS;
    812	case H_SET_MODE_RESOURCE_SET_DAWR0:
    813		if (!kvmppc_power8_compatible(vcpu))
    814			return H_P2;
    815		if (!ppc_breakpoint_available())
    816			return H_P2;
    817		if (mflags)
    818			return H_UNSUPPORTED_FLAG_START;
    819		if (value2 & DABRX_HYP)
    820			return H_P4;
    821		vcpu->arch.dawr0  = value1;
    822		vcpu->arch.dawrx0 = value2;
    823		return H_SUCCESS;
    824	case H_SET_MODE_RESOURCE_SET_DAWR1:
    825		if (!kvmppc_power8_compatible(vcpu))
    826			return H_P2;
    827		if (!ppc_breakpoint_available())
    828			return H_P2;
    829		if (!cpu_has_feature(CPU_FTR_DAWR1))
    830			return H_P2;
    831		if (!vcpu->kvm->arch.dawr1_enabled)
    832			return H_FUNCTION;
    833		if (mflags)
    834			return H_UNSUPPORTED_FLAG_START;
    835		if (value2 & DABRX_HYP)
    836			return H_P4;
    837		vcpu->arch.dawr1  = value1;
    838		vcpu->arch.dawrx1 = value2;
    839		return H_SUCCESS;
    840	case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
    841		/*
    842		 * KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
    843		 * Keep this in synch with kvmppc_filter_guest_lpcr_hv.
    844		 */
    845		if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
    846				kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
    847			return H_UNSUPPORTED_FLAG_START;
    848		return H_TOO_HARD;
    849	default:
    850		return H_TOO_HARD;
    851	}
    852}
    853
    854/* Copy guest memory in place - must reside within a single memslot */
    855static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
    856				  unsigned long len)
    857{
    858	struct kvm_memory_slot *to_memslot = NULL;
    859	struct kvm_memory_slot *from_memslot = NULL;
    860	unsigned long to_addr, from_addr;
    861	int r;
    862
    863	/* Get HPA for from address */
    864	from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
    865	if (!from_memslot)
    866		return -EFAULT;
    867	if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
    868			     << PAGE_SHIFT))
    869		return -EINVAL;
    870	from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
    871	if (kvm_is_error_hva(from_addr))
    872		return -EFAULT;
    873	from_addr |= (from & (PAGE_SIZE - 1));
    874
    875	/* Get HPA for to address */
    876	to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
    877	if (!to_memslot)
    878		return -EFAULT;
    879	if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
    880			   << PAGE_SHIFT))
    881		return -EINVAL;
    882	to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
    883	if (kvm_is_error_hva(to_addr))
    884		return -EFAULT;
    885	to_addr |= (to & (PAGE_SIZE - 1));
    886
    887	/* Perform copy */
    888	r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
    889			     len);
    890	if (r)
    891		return -EFAULT;
    892	mark_page_dirty(kvm, to >> PAGE_SHIFT);
    893	return 0;
    894}
    895
    896static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
    897			       unsigned long dest, unsigned long src)
    898{
    899	u64 pg_sz = SZ_4K;		/* 4K page size */
    900	u64 pg_mask = SZ_4K - 1;
    901	int ret;
    902
    903	/* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
    904	if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
    905		      H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
    906		return H_PARAMETER;
    907
    908	/* dest (and src if copy_page flag set) must be page aligned */
    909	if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
    910		return H_PARAMETER;
    911
    912	/* zero and/or copy the page as determined by the flags */
    913	if (flags & H_COPY_PAGE) {
    914		ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
    915		if (ret < 0)
    916			return H_PARAMETER;
    917	} else if (flags & H_ZERO_PAGE) {
    918		ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
    919		if (ret < 0)
    920			return H_PARAMETER;
    921	}
    922
    923	/* We can ignore the remaining flags */
    924
    925	return H_SUCCESS;
    926}
    927
    928static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
    929{
    930	struct kvmppc_vcore *vcore = target->arch.vcore;
    931
    932	/*
    933	 * We expect to have been called by the real mode handler
    934	 * (kvmppc_rm_h_confer()) which would have directly returned
    935	 * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
    936	 * have useful work to do and should not confer) so we don't
    937	 * recheck that here.
    938	 *
    939	 * In the case of the P9 single vcpu per vcore case, the real
    940	 * mode handler is not called but no other threads are in the
    941	 * source vcore.
    942	 */
    943	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
    944		spin_lock(&vcore->lock);
    945		if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
    946		    vcore->vcore_state != VCORE_INACTIVE &&
    947		    vcore->runner)
    948			target = vcore->runner;
    949		spin_unlock(&vcore->lock);
    950	}
    951
    952	return kvm_vcpu_yield_to(target);
    953}
    954
    955static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
    956{
    957	int yield_count = 0;
    958	struct lppaca *lppaca;
    959
    960	spin_lock(&vcpu->arch.vpa_update_lock);
    961	lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
    962	if (lppaca)
    963		yield_count = be32_to_cpu(lppaca->yield_count);
    964	spin_unlock(&vcpu->arch.vpa_update_lock);
    965	return yield_count;
    966}
    967
    968/*
    969 * H_RPT_INVALIDATE hcall handler for nested guests.
    970 *
    971 * Handles only nested process-scoped invalidation requests in L0.
    972 */
    973static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
    974{
    975	unsigned long type = kvmppc_get_gpr(vcpu, 6);
    976	unsigned long pid, pg_sizes, start, end;
    977
    978	/*
    979	 * The partition-scoped invalidations aren't handled here in L0.
    980	 */
    981	if (type & H_RPTI_TYPE_NESTED)
    982		return RESUME_HOST;
    983
    984	pid = kvmppc_get_gpr(vcpu, 4);
    985	pg_sizes = kvmppc_get_gpr(vcpu, 7);
    986	start = kvmppc_get_gpr(vcpu, 8);
    987	end = kvmppc_get_gpr(vcpu, 9);
    988
    989	do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
    990				type, pg_sizes, start, end);
    991
    992	kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
    993	return RESUME_GUEST;
    994}
    995
    996static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
    997				    unsigned long id, unsigned long target,
    998				    unsigned long type, unsigned long pg_sizes,
    999				    unsigned long start, unsigned long end)
   1000{
   1001	if (!kvm_is_radix(vcpu->kvm))
   1002		return H_UNSUPPORTED;
   1003
   1004	if (end < start)
   1005		return H_P5;
   1006
   1007	/*
   1008	 * Partition-scoped invalidation for nested guests.
   1009	 */
   1010	if (type & H_RPTI_TYPE_NESTED) {
   1011		if (!nesting_enabled(vcpu->kvm))
   1012			return H_FUNCTION;
   1013
   1014		/* Support only cores as target */
   1015		if (target != H_RPTI_TARGET_CMMU)
   1016			return H_P2;
   1017
   1018		return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
   1019					       start, end);
   1020	}
   1021
   1022	/*
   1023	 * Process-scoped invalidation for L1 guests.
   1024	 */
   1025	do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
   1026				type, pg_sizes, start, end);
   1027	return H_SUCCESS;
   1028}
   1029
   1030int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
   1031{
   1032	struct kvm *kvm = vcpu->kvm;
   1033	unsigned long req = kvmppc_get_gpr(vcpu, 3);
   1034	unsigned long target, ret = H_SUCCESS;
   1035	int yield_count;
   1036	struct kvm_vcpu *tvcpu;
   1037	int idx, rc;
   1038
   1039	if (req <= MAX_HCALL_OPCODE &&
   1040	    !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
   1041		return RESUME_HOST;
   1042
   1043	switch (req) {
   1044	case H_REMOVE:
   1045		ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
   1046					kvmppc_get_gpr(vcpu, 5),
   1047					kvmppc_get_gpr(vcpu, 6));
   1048		if (ret == H_TOO_HARD)
   1049			return RESUME_HOST;
   1050		break;
   1051	case H_ENTER:
   1052		ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
   1053					kvmppc_get_gpr(vcpu, 5),
   1054					kvmppc_get_gpr(vcpu, 6),
   1055					kvmppc_get_gpr(vcpu, 7));
   1056		if (ret == H_TOO_HARD)
   1057			return RESUME_HOST;
   1058		break;
   1059	case H_READ:
   1060		ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
   1061					kvmppc_get_gpr(vcpu, 5));
   1062		if (ret == H_TOO_HARD)
   1063			return RESUME_HOST;
   1064		break;
   1065	case H_CLEAR_MOD:
   1066		ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
   1067					kvmppc_get_gpr(vcpu, 5));
   1068		if (ret == H_TOO_HARD)
   1069			return RESUME_HOST;
   1070		break;
   1071	case H_CLEAR_REF:
   1072		ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
   1073					kvmppc_get_gpr(vcpu, 5));
   1074		if (ret == H_TOO_HARD)
   1075			return RESUME_HOST;
   1076		break;
   1077	case H_PROTECT:
   1078		ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
   1079					kvmppc_get_gpr(vcpu, 5),
   1080					kvmppc_get_gpr(vcpu, 6));
   1081		if (ret == H_TOO_HARD)
   1082			return RESUME_HOST;
   1083		break;
   1084	case H_BULK_REMOVE:
   1085		ret = kvmppc_h_bulk_remove(vcpu);
   1086		if (ret == H_TOO_HARD)
   1087			return RESUME_HOST;
   1088		break;
   1089
   1090	case H_CEDE:
   1091		break;
   1092	case H_PROD:
   1093		target = kvmppc_get_gpr(vcpu, 4);
   1094		tvcpu = kvmppc_find_vcpu(kvm, target);
   1095		if (!tvcpu) {
   1096			ret = H_PARAMETER;
   1097			break;
   1098		}
   1099		tvcpu->arch.prodded = 1;
   1100		smp_mb(); /* This orders prodded store vs ceded load */
   1101		if (tvcpu->arch.ceded)
   1102			kvmppc_fast_vcpu_kick_hv(tvcpu);
   1103		break;
   1104	case H_CONFER:
   1105		target = kvmppc_get_gpr(vcpu, 4);
   1106		if (target == -1)
   1107			break;
   1108		tvcpu = kvmppc_find_vcpu(kvm, target);
   1109		if (!tvcpu) {
   1110			ret = H_PARAMETER;
   1111			break;
   1112		}
   1113		yield_count = kvmppc_get_gpr(vcpu, 5);
   1114		if (kvmppc_get_yield_count(tvcpu) != yield_count)
   1115			break;
   1116		kvm_arch_vcpu_yield_to(tvcpu);
   1117		break;
   1118	case H_REGISTER_VPA:
   1119		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
   1120					kvmppc_get_gpr(vcpu, 5),
   1121					kvmppc_get_gpr(vcpu, 6));
   1122		break;
   1123	case H_RTAS:
   1124		if (list_empty(&kvm->arch.rtas_tokens))
   1125			return RESUME_HOST;
   1126
   1127		idx = srcu_read_lock(&kvm->srcu);
   1128		rc = kvmppc_rtas_hcall(vcpu);
   1129		srcu_read_unlock(&kvm->srcu, idx);
   1130
   1131		if (rc == -ENOENT)
   1132			return RESUME_HOST;
   1133		else if (rc == 0)
   1134			break;
   1135
   1136		/* Send the error out to userspace via KVM_RUN */
   1137		return rc;
   1138	case H_LOGICAL_CI_LOAD:
   1139		ret = kvmppc_h_logical_ci_load(vcpu);
   1140		if (ret == H_TOO_HARD)
   1141			return RESUME_HOST;
   1142		break;
   1143	case H_LOGICAL_CI_STORE:
   1144		ret = kvmppc_h_logical_ci_store(vcpu);
   1145		if (ret == H_TOO_HARD)
   1146			return RESUME_HOST;
   1147		break;
   1148	case H_SET_MODE:
   1149		ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
   1150					kvmppc_get_gpr(vcpu, 5),
   1151					kvmppc_get_gpr(vcpu, 6),
   1152					kvmppc_get_gpr(vcpu, 7));
   1153		if (ret == H_TOO_HARD)
   1154			return RESUME_HOST;
   1155		break;
   1156	case H_XIRR:
   1157	case H_CPPR:
   1158	case H_EOI:
   1159	case H_IPI:
   1160	case H_IPOLL:
   1161	case H_XIRR_X:
   1162		if (kvmppc_xics_enabled(vcpu)) {
   1163			if (xics_on_xive()) {
   1164				ret = H_NOT_AVAILABLE;
   1165				return RESUME_GUEST;
   1166			}
   1167			ret = kvmppc_xics_hcall(vcpu, req);
   1168			break;
   1169		}
   1170		return RESUME_HOST;
   1171	case H_SET_DABR:
   1172		ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
   1173		break;
   1174	case H_SET_XDABR:
   1175		ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
   1176						kvmppc_get_gpr(vcpu, 5));
   1177		break;
   1178#ifdef CONFIG_SPAPR_TCE_IOMMU
   1179	case H_GET_TCE:
   1180		ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
   1181						kvmppc_get_gpr(vcpu, 5));
   1182		if (ret == H_TOO_HARD)
   1183			return RESUME_HOST;
   1184		break;
   1185	case H_PUT_TCE:
   1186		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
   1187						kvmppc_get_gpr(vcpu, 5),
   1188						kvmppc_get_gpr(vcpu, 6));
   1189		if (ret == H_TOO_HARD)
   1190			return RESUME_HOST;
   1191		break;
   1192	case H_PUT_TCE_INDIRECT:
   1193		ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
   1194						kvmppc_get_gpr(vcpu, 5),
   1195						kvmppc_get_gpr(vcpu, 6),
   1196						kvmppc_get_gpr(vcpu, 7));
   1197		if (ret == H_TOO_HARD)
   1198			return RESUME_HOST;
   1199		break;
   1200	case H_STUFF_TCE:
   1201		ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
   1202						kvmppc_get_gpr(vcpu, 5),
   1203						kvmppc_get_gpr(vcpu, 6),
   1204						kvmppc_get_gpr(vcpu, 7));
   1205		if (ret == H_TOO_HARD)
   1206			return RESUME_HOST;
   1207		break;
   1208#endif
   1209	case H_RANDOM:
   1210		if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4]))
   1211			ret = H_HARDWARE;
   1212		break;
   1213	case H_RPT_INVALIDATE:
   1214		ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
   1215					      kvmppc_get_gpr(vcpu, 5),
   1216					      kvmppc_get_gpr(vcpu, 6),
   1217					      kvmppc_get_gpr(vcpu, 7),
   1218					      kvmppc_get_gpr(vcpu, 8),
   1219					      kvmppc_get_gpr(vcpu, 9));
   1220		break;
   1221
   1222	case H_SET_PARTITION_TABLE:
   1223		ret = H_FUNCTION;
   1224		if (nesting_enabled(kvm))
   1225			ret = kvmhv_set_partition_table(vcpu);
   1226		break;
   1227	case H_ENTER_NESTED:
   1228		ret = H_FUNCTION;
   1229		if (!nesting_enabled(kvm))
   1230			break;
   1231		ret = kvmhv_enter_nested_guest(vcpu);
   1232		if (ret == H_INTERRUPT) {
   1233			kvmppc_set_gpr(vcpu, 3, 0);
   1234			vcpu->arch.hcall_needed = 0;
   1235			return -EINTR;
   1236		} else if (ret == H_TOO_HARD) {
   1237			kvmppc_set_gpr(vcpu, 3, 0);
   1238			vcpu->arch.hcall_needed = 0;
   1239			return RESUME_HOST;
   1240		}
   1241		break;
   1242	case H_TLB_INVALIDATE:
   1243		ret = H_FUNCTION;
   1244		if (nesting_enabled(kvm))
   1245			ret = kvmhv_do_nested_tlbie(vcpu);
   1246		break;
   1247	case H_COPY_TOFROM_GUEST:
   1248		ret = H_FUNCTION;
   1249		if (nesting_enabled(kvm))
   1250			ret = kvmhv_copy_tofrom_guest_nested(vcpu);
   1251		break;
   1252	case H_PAGE_INIT:
   1253		ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
   1254					 kvmppc_get_gpr(vcpu, 5),
   1255					 kvmppc_get_gpr(vcpu, 6));
   1256		break;
   1257	case H_SVM_PAGE_IN:
   1258		ret = H_UNSUPPORTED;
   1259		if (kvmppc_get_srr1(vcpu) & MSR_S)
   1260			ret = kvmppc_h_svm_page_in(kvm,
   1261						   kvmppc_get_gpr(vcpu, 4),
   1262						   kvmppc_get_gpr(vcpu, 5),
   1263						   kvmppc_get_gpr(vcpu, 6));
   1264		break;
   1265	case H_SVM_PAGE_OUT:
   1266		ret = H_UNSUPPORTED;
   1267		if (kvmppc_get_srr1(vcpu) & MSR_S)
   1268			ret = kvmppc_h_svm_page_out(kvm,
   1269						    kvmppc_get_gpr(vcpu, 4),
   1270						    kvmppc_get_gpr(vcpu, 5),
   1271						    kvmppc_get_gpr(vcpu, 6));
   1272		break;
   1273	case H_SVM_INIT_START:
   1274		ret = H_UNSUPPORTED;
   1275		if (kvmppc_get_srr1(vcpu) & MSR_S)
   1276			ret = kvmppc_h_svm_init_start(kvm);
   1277		break;
   1278	case H_SVM_INIT_DONE:
   1279		ret = H_UNSUPPORTED;
   1280		if (kvmppc_get_srr1(vcpu) & MSR_S)
   1281			ret = kvmppc_h_svm_init_done(kvm);
   1282		break;
   1283	case H_SVM_INIT_ABORT:
   1284		/*
    1285		 * Even if that call is made by the Ultravisor, the SRR1 value
   1286		 * is the guest context one, with the secure bit clear as it has
   1287		 * not yet been secured. So we can't check it here.
   1288		 * Instead the kvm->arch.secure_guest flag is checked inside
   1289		 * kvmppc_h_svm_init_abort().
   1290		 */
   1291		ret = kvmppc_h_svm_init_abort(kvm);
   1292		break;
   1293
   1294	default:
   1295		return RESUME_HOST;
   1296	}
   1297	WARN_ON_ONCE(ret == H_TOO_HARD);
   1298	kvmppc_set_gpr(vcpu, 3, ret);
   1299	vcpu->arch.hcall_needed = 0;
   1300	return RESUME_GUEST;
   1301}
   1302
   1303/*
   1304 * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
   1305 * handlers in book3s_hv_rmhandlers.S.
   1306 *
   1307 * This has to be done early, not in kvmppc_pseries_do_hcall(), so
   1308 * that the cede logic in kvmppc_run_single_vcpu() works properly.
   1309 */
   1310static void kvmppc_cede(struct kvm_vcpu *vcpu)
   1311{
   1312	vcpu->arch.shregs.msr |= MSR_EE;
   1313	vcpu->arch.ceded = 1;
   1314	smp_mb();
   1315	if (vcpu->arch.prodded) {
   1316		vcpu->arch.prodded = 0;
   1317		smp_mb();
   1318		vcpu->arch.ceded = 0;
   1319	}
   1320}
   1321
   1322static int kvmppc_hcall_impl_hv(unsigned long cmd)
   1323{
   1324	switch (cmd) {
   1325	case H_CEDE:
   1326	case H_PROD:
   1327	case H_CONFER:
   1328	case H_REGISTER_VPA:
   1329	case H_SET_MODE:
   1330#ifdef CONFIG_SPAPR_TCE_IOMMU
   1331	case H_GET_TCE:
   1332	case H_PUT_TCE:
   1333	case H_PUT_TCE_INDIRECT:
   1334	case H_STUFF_TCE:
   1335#endif
   1336	case H_LOGICAL_CI_LOAD:
   1337	case H_LOGICAL_CI_STORE:
   1338#ifdef CONFIG_KVM_XICS
   1339	case H_XIRR:
   1340	case H_CPPR:
   1341	case H_EOI:
   1342	case H_IPI:
   1343	case H_IPOLL:
   1344	case H_XIRR_X:
   1345#endif
   1346	case H_PAGE_INIT:
   1347	case H_RPT_INVALIDATE:
   1348		return 1;
   1349	}
   1350
   1351	/* See if it's in the real-mode table */
   1352	return kvmppc_hcall_impl_hv_realmode(cmd);
   1353}
   1354
   1355static int kvmppc_emulate_debug_inst(struct kvm_vcpu *vcpu)
   1356{
   1357	u32 last_inst;
   1358
   1359	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
   1360					EMULATE_DONE) {
   1361		/*
   1362		 * Fetch failed, so return to guest and
   1363		 * try executing it again.
   1364		 */
   1365		return RESUME_GUEST;
   1366	}
   1367
   1368	if (last_inst == KVMPPC_INST_SW_BREAKPOINT) {
   1369		vcpu->run->exit_reason = KVM_EXIT_DEBUG;
   1370		vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
   1371		return RESUME_HOST;
   1372	} else {
   1373		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
   1374		return RESUME_GUEST;
   1375	}
   1376}
   1377
   1378static void do_nothing(void *x)
   1379{
   1380}
   1381
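/*
 * Collect the doorbell-pending state of all vcpus that the guest sees
 * as sibling threads of this one, to emulate a read of the DPDES SPR.
 * Running vcpus are briefly interrupted so that their vcore->dpdes
 * value is up to date.
 */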
   1382static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
   1383{
   1384	int thr, cpu, pcpu, nthreads;
   1385	struct kvm_vcpu *v;
   1386	unsigned long dpdes;
   1387
   1388	nthreads = vcpu->kvm->arch.emul_smt_mode;
   1389	dpdes = 0;
   1390	cpu = vcpu->vcpu_id & ~(nthreads - 1);
   1391	for (thr = 0; thr < nthreads; ++thr, ++cpu) {
   1392		v = kvmppc_find_vcpu(vcpu->kvm, cpu);
   1393		if (!v)
   1394			continue;
   1395		/*
   1396		 * If the vcpu is currently running on a physical cpu thread,
   1397		 * interrupt it in order to pull it out of the guest briefly,
   1398		 * which will update its vcore->dpdes value.
   1399		 */
   1400		pcpu = READ_ONCE(v->cpu);
   1401		if (pcpu >= 0)
   1402			smp_call_function_single(pcpu, do_nothing, NULL, 1);
   1403		if (kvmppc_doorbell_pending(v))
   1404			dpdes |= 1 << thr;
   1405	}
   1406	return dpdes;
   1407}
   1408
   1409/*
   1410 * On POWER9, emulate doorbell-related instructions in order to
   1411 * give the guest the illusion of running on a multi-threaded core.
   1412 * The instructions emulated are msgsndp, msgclrp, mfspr TIR,
   1413 * and mfspr DPDES.
   1414 */
   1415static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
   1416{
   1417	u32 inst, rb, thr;
   1418	unsigned long arg;
   1419	struct kvm *kvm = vcpu->kvm;
   1420	struct kvm_vcpu *tvcpu;
   1421
   1422	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
   1423		return RESUME_GUEST;
   1424	if (get_op(inst) != 31)
   1425		return EMULATE_FAIL;
   1426	rb = get_rb(inst);
   1427	thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
   1428	switch (get_xop(inst)) {
   1429	case OP_31_XOP_MSGSNDP:
   1430		arg = kvmppc_get_gpr(vcpu, rb);
   1431		if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
   1432			break;
   1433		arg &= 0x7f;
   1434		if (arg >= kvm->arch.emul_smt_mode)
   1435			break;
   1436		tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
   1437		if (!tvcpu)
   1438			break;
   1439		if (!tvcpu->arch.doorbell_request) {
   1440			tvcpu->arch.doorbell_request = 1;
   1441			kvmppc_fast_vcpu_kick_hv(tvcpu);
   1442		}
   1443		break;
   1444	case OP_31_XOP_MSGCLRP:
   1445		arg = kvmppc_get_gpr(vcpu, rb);
   1446		if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
   1447			break;
   1448		vcpu->arch.vcore->dpdes = 0;
   1449		vcpu->arch.doorbell_request = 0;
   1450		break;
   1451	case OP_31_XOP_MFSPR:
   1452		switch (get_sprn(inst)) {
   1453		case SPRN_TIR:
   1454			arg = thr;
   1455			break;
   1456		case SPRN_DPDES:
   1457			arg = kvmppc_read_dpdes(vcpu);
   1458			break;
   1459		default:
   1460			return EMULATE_FAIL;
   1461		}
   1462		kvmppc_set_gpr(vcpu, get_rt(inst), arg);
   1463		break;
   1464	default:
   1465		return EMULATE_FAIL;
   1466	}
   1467	kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
   1468	return RESUME_GUEST;
   1469}
   1470
   1471/*
   1472 * If the lppaca had pmcregs_in_use clear when we exited the guest, then
   1473 * HFSCR_PM is cleared for next entry. If the guest then tries to access
   1474 * the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM
   1475 * back in the guest HFSCR will cause the next entry to load the PMU SPRs and
   1476 * allow the guest access to continue.
   1477 */
   1478static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
   1479{
   1480	if (!(vcpu->arch.hfscr_permitted & HFSCR_PM))
   1481		return EMULATE_FAIL;
   1482
   1483	vcpu->arch.hfscr |= HFSCR_PM;
   1484
   1485	return RESUME_GUEST;
   1486}
   1487
   1488static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu)
   1489{
   1490	if (!(vcpu->arch.hfscr_permitted & HFSCR_EBB))
   1491		return EMULATE_FAIL;
   1492
   1493	vcpu->arch.hfscr |= HFSCR_EBB;
   1494
   1495	return RESUME_GUEST;
   1496}
   1497
   1498static int kvmppc_tm_unavailable(struct kvm_vcpu *vcpu)
   1499{
   1500	if (!(vcpu->arch.hfscr_permitted & HFSCR_TM))
   1501		return EMULATE_FAIL;
   1502
   1503	vcpu->arch.hfscr |= HFSCR_TM;
   1504
   1505	return RESUME_GUEST;
   1506}
   1507
   1508static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
   1509				 struct task_struct *tsk)
   1510{
   1511	struct kvm_run *run = vcpu->run;
   1512	int r = RESUME_HOST;
   1513
   1514	vcpu->stat.sum_exits++;
   1515
   1516	/*
   1517	 * This can happen if an interrupt occurs in the last stages
   1518	 * of guest entry or the first stages of guest exit (i.e. after
   1519	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
   1520	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
   1521	 * That can happen due to a bug, or due to a machine check
   1522	 * occurring at just the wrong time.
   1523	 */
   1524	if (vcpu->arch.shregs.msr & MSR_HV) {
   1525		printk(KERN_EMERG "KVM trap in HV mode!\n");
   1526		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
   1527			vcpu->arch.trap, kvmppc_get_pc(vcpu),
   1528			vcpu->arch.shregs.msr);
   1529		kvmppc_dump_regs(vcpu);
   1530		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   1531		run->hw.hardware_exit_reason = vcpu->arch.trap;
   1532		return RESUME_HOST;
   1533	}
   1534	run->exit_reason = KVM_EXIT_UNKNOWN;
   1535	run->ready_for_interrupt_injection = 1;
   1536	switch (vcpu->arch.trap) {
   1537	/* We're good on these - the host merely wanted to get our attention */
   1538	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
   1539		WARN_ON_ONCE(1); /* Should never happen */
   1540		vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
   1541		fallthrough;
   1542	case BOOK3S_INTERRUPT_HV_DECREMENTER:
   1543		vcpu->stat.dec_exits++;
   1544		r = RESUME_GUEST;
   1545		break;
   1546	case BOOK3S_INTERRUPT_EXTERNAL:
   1547	case BOOK3S_INTERRUPT_H_DOORBELL:
   1548	case BOOK3S_INTERRUPT_H_VIRT:
   1549		vcpu->stat.ext_intr_exits++;
   1550		r = RESUME_GUEST;
   1551		break;
    1552	/* SRESET/HMI/PMI are HV interrupts that the host has handled. Resume guest. */
   1553	case BOOK3S_INTERRUPT_HMI:
   1554	case BOOK3S_INTERRUPT_PERFMON:
   1555	case BOOK3S_INTERRUPT_SYSTEM_RESET:
   1556		r = RESUME_GUEST;
   1557		break;
   1558	case BOOK3S_INTERRUPT_MACHINE_CHECK: {
   1559		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
   1560					      DEFAULT_RATELIMIT_BURST);
   1561		/*
   1562		 * Print the MCE event to host console. Ratelimit so the guest
   1563		 * can't flood the host log.
   1564		 */
   1565		if (__ratelimit(&rs))
   1566			machine_check_print_event_info(&vcpu->arch.mce_evt,false, true);
   1567
   1568		/*
   1569		 * If the guest can do FWNMI, exit to userspace so it can
   1570		 * deliver a FWNMI to the guest.
   1571		 * Otherwise we synthesize a machine check for the guest
   1572		 * so that it knows that the machine check occurred.
   1573		 */
   1574		if (!vcpu->kvm->arch.fwnmi_enabled) {
   1575			ulong flags = vcpu->arch.shregs.msr & 0x083c0000;
   1576			kvmppc_core_queue_machine_check(vcpu, flags);
   1577			r = RESUME_GUEST;
   1578			break;
   1579		}
   1580
    1581		/* Exit to userspace with KVM_EXIT_NMI as exit reason */
   1582		run->exit_reason = KVM_EXIT_NMI;
   1583		run->hw.hardware_exit_reason = vcpu->arch.trap;
   1584		/* Clear out the old NMI status from run->flags */
   1585		run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
   1586		/* Now set the NMI status */
   1587		if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
   1588			run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
   1589		else
   1590			run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
   1591
   1592		r = RESUME_HOST;
   1593		break;
   1594	}
   1595	case BOOK3S_INTERRUPT_PROGRAM:
   1596	{
   1597		ulong flags;
   1598		/*
   1599		 * Normally program interrupts are delivered directly
   1600		 * to the guest by the hardware, but we can get here
   1601		 * as a result of a hypervisor emulation interrupt
   1602		 * (e40) getting turned into a 700 by BML RTAS.
   1603		 */
   1604		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
   1605		kvmppc_core_queue_program(vcpu, flags);
   1606		r = RESUME_GUEST;
   1607		break;
   1608	}
   1609	case BOOK3S_INTERRUPT_SYSCALL:
   1610	{
   1611		int i;
   1612
   1613		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
   1614			/*
   1615			 * Guest userspace executed sc 1. This can only be
   1616			 * reached by the P9 path because the old path
   1617			 * handles this case in realmode hcall handlers.
   1618			 */
   1619			if (!kvmhv_vcpu_is_radix(vcpu)) {
   1620				/*
   1621				 * A guest could be running PR KVM, so this
   1622				 * may be a PR KVM hcall. It must be reflected
   1623				 * to the guest kernel as a sc interrupt.
   1624				 */
   1625				kvmppc_core_queue_syscall(vcpu);
   1626			} else {
   1627				/*
   1628				 * Radix guests can not run PR KVM or nested HV
   1629				 * hash guests which might run PR KVM, so this
   1630				 * is always a privilege fault. Send a program
   1631				 * check to guest kernel.
   1632				 */
   1633				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
   1634			}
   1635			r = RESUME_GUEST;
   1636			break;
   1637		}
   1638
   1639		/*
   1640		 * hcall - gather args and set exit_reason. This will next be
   1641		 * handled by kvmppc_pseries_do_hcall which may be able to deal
   1642		 * with it and resume guest, or may punt to userspace.
   1643		 */
   1644		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
   1645		for (i = 0; i < 9; ++i)
   1646			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
   1647		run->exit_reason = KVM_EXIT_PAPR_HCALL;
   1648		vcpu->arch.hcall_needed = 1;
   1649		r = RESUME_HOST;
   1650		break;
   1651	}
   1652	/*
   1653	 * We get these next two if the guest accesses a page which it thinks
   1654	 * it has mapped but which is not actually present, either because
    1655	 * it is for an emulated I/O device or because the corresponding
   1656	 * host page has been paged out.
   1657	 *
   1658	 * Any other HDSI/HISI interrupts have been handled already for P7/8
   1659	 * guests. For POWER9 hash guests not using rmhandlers, basic hash
   1660	 * fault handling is done here.
   1661	 */
   1662	case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
   1663		unsigned long vsid;
   1664		long err;
   1665
   1666		if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
   1667		    unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) {
   1668			r = RESUME_GUEST; /* Just retry if it's the canary */
   1669			break;
   1670		}
   1671
   1672		if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
   1673			/*
   1674			 * Radix doesn't require anything, and pre-ISAv3.0 hash
   1675			 * already attempted to handle this in rmhandlers. The
   1676			 * hash fault handling below is v3 only (it uses ASDR
   1677			 * via fault_gpa).
   1678			 */
   1679			r = RESUME_PAGE_FAULT;
   1680			break;
   1681		}
   1682
   1683		if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
   1684			kvmppc_core_queue_data_storage(vcpu,
   1685				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
   1686			r = RESUME_GUEST;
   1687			break;
   1688		}
   1689
   1690		if (!(vcpu->arch.shregs.msr & MSR_DR))
   1691			vsid = vcpu->kvm->arch.vrma_slb_v;
   1692		else
   1693			vsid = vcpu->arch.fault_gpa;
   1694
   1695		err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
   1696				vsid, vcpu->arch.fault_dsisr, true);
   1697		if (err == 0) {
   1698			r = RESUME_GUEST;
   1699		} else if (err == -1 || err == -2) {
   1700			r = RESUME_PAGE_FAULT;
   1701		} else {
   1702			kvmppc_core_queue_data_storage(vcpu,
   1703				vcpu->arch.fault_dar, err);
   1704			r = RESUME_GUEST;
   1705		}
   1706		break;
   1707	}
   1708	case BOOK3S_INTERRUPT_H_INST_STORAGE: {
   1709		unsigned long vsid;
   1710		long err;
   1711
   1712		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
   1713		vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
   1714			DSISR_SRR1_MATCH_64S;
   1715		if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
   1716			/*
   1717			 * Radix doesn't require anything, and pre-ISAv3.0 hash
   1718			 * already attempted to handle this in rmhandlers. The
   1719			 * hash fault handling below is v3 only (it uses ASDR
   1720			 * via fault_gpa).
   1721			 */
   1722			if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
   1723				vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
   1724			r = RESUME_PAGE_FAULT;
   1725			break;
   1726		}
   1727
   1728		if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
   1729			kvmppc_core_queue_inst_storage(vcpu,
   1730				vcpu->arch.fault_dsisr);
   1731			r = RESUME_GUEST;
   1732			break;
   1733		}
   1734
   1735		if (!(vcpu->arch.shregs.msr & MSR_IR))
   1736			vsid = vcpu->kvm->arch.vrma_slb_v;
   1737		else
   1738			vsid = vcpu->arch.fault_gpa;
   1739
   1740		err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
   1741				vsid, vcpu->arch.fault_dsisr, false);
   1742		if (err == 0) {
   1743			r = RESUME_GUEST;
   1744		} else if (err == -1) {
   1745			r = RESUME_PAGE_FAULT;
   1746		} else {
   1747			kvmppc_core_queue_inst_storage(vcpu, err);
   1748			r = RESUME_GUEST;
   1749		}
   1750		break;
   1751	}
   1752
   1753	/*
   1754	 * This occurs if the guest executes an illegal instruction.
   1755	 * If the guest debug is disabled, generate a program interrupt
   1756	 * to the guest. If guest debug is enabled, we need to check
   1757	 * whether the instruction is a software breakpoint instruction.
   1758	 * Accordingly return to Guest or Host.
   1759	 */
   1760	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
   1761		if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
   1762			vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
   1763				swab32(vcpu->arch.emul_inst) :
   1764				vcpu->arch.emul_inst;
   1765		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
   1766			r = kvmppc_emulate_debug_inst(vcpu);
   1767		} else {
   1768			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
   1769			r = RESUME_GUEST;
   1770		}
   1771		break;
   1772
   1773#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
   1774	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
   1775		/*
   1776		 * This occurs for various TM-related instructions that
   1777		 * we need to emulate on POWER9 DD2.2.  We have already
   1778		 * handled the cases where the guest was in real-suspend
   1779		 * mode and was transitioning to transactional state.
   1780		 */
   1781		r = kvmhv_p9_tm_emulation(vcpu);
   1782		if (r != -1)
   1783			break;
   1784		fallthrough; /* go to facility unavailable handler */
   1785#endif
   1786
   1787	/*
    1788	 * This occurs if the guest (kernel or userspace) does something that
   1789	 * is prohibited by HFSCR.
   1790	 * On POWER9, this could be a doorbell instruction that we need
   1791	 * to emulate.
   1792	 * Otherwise, we just generate a program interrupt to the guest.
   1793	 */
   1794	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
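        		/*
        		 * HFSCR[0:7] (the top byte) is the interrupt cause,
        		 * identifying which facility was unavailable.
        		 */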
   1795		u64 cause = vcpu->arch.hfscr >> 56;
   1796
   1797		r = EMULATE_FAIL;
   1798		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
   1799			if (cause == FSCR_MSGP_LG)
   1800				r = kvmppc_emulate_doorbell_instr(vcpu);
   1801			if (cause == FSCR_PM_LG)
   1802				r = kvmppc_pmu_unavailable(vcpu);
   1803			if (cause == FSCR_EBB_LG)
   1804				r = kvmppc_ebb_unavailable(vcpu);
   1805			if (cause == FSCR_TM_LG)
   1806				r = kvmppc_tm_unavailable(vcpu);
   1807		}
   1808		if (r == EMULATE_FAIL) {
   1809			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
   1810			r = RESUME_GUEST;
   1811		}
   1812		break;
   1813	}
   1814
   1815	case BOOK3S_INTERRUPT_HV_RM_HARD:
   1816		r = RESUME_PASSTHROUGH;
   1817		break;
   1818	default:
   1819		kvmppc_dump_regs(vcpu);
   1820		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
   1821			vcpu->arch.trap, kvmppc_get_pc(vcpu),
   1822			vcpu->arch.shregs.msr);
   1823		run->hw.hardware_exit_reason = vcpu->arch.trap;
   1824		r = RESUME_HOST;
   1825		break;
   1826	}
   1827
   1828	return r;
   1829}
   1830
   1831static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
   1832{
   1833	int r;
   1834	int srcu_idx;
   1835
   1836	vcpu->stat.sum_exits++;
   1837
   1838	/*
   1839	 * This can happen if an interrupt occurs in the last stages
   1840	 * of guest entry or the first stages of guest exit (i.e. after
   1841	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
   1842	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
   1843	 * That can happen due to a bug, or due to a machine check
   1844	 * occurring at just the wrong time.
   1845	 */
   1846	if (vcpu->arch.shregs.msr & MSR_HV) {
   1847		pr_emerg("KVM trap in HV mode while nested!\n");
   1848		pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
   1849			 vcpu->arch.trap, kvmppc_get_pc(vcpu),
   1850			 vcpu->arch.shregs.msr);
   1851		kvmppc_dump_regs(vcpu);
   1852		return RESUME_HOST;
   1853	}
   1854	switch (vcpu->arch.trap) {
   1855	/* We're good on these - the host merely wanted to get our attention */
   1856	case BOOK3S_INTERRUPT_HV_DECREMENTER:
   1857		vcpu->stat.dec_exits++;
   1858		r = RESUME_GUEST;
   1859		break;
   1860	case BOOK3S_INTERRUPT_EXTERNAL:
   1861		vcpu->stat.ext_intr_exits++;
   1862		r = RESUME_HOST;
   1863		break;
   1864	case BOOK3S_INTERRUPT_H_DOORBELL:
   1865	case BOOK3S_INTERRUPT_H_VIRT:
   1866		vcpu->stat.ext_intr_exits++;
   1867		r = RESUME_GUEST;
   1868		break;
   1869	/* These need to go to the nested HV */
   1870	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
   1871		vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
   1872		vcpu->stat.dec_exits++;
   1873		r = RESUME_HOST;
   1874		break;
    1875	/* SR/HMI/PMI are HV interrupts that the host has handled. Resume guest. */
   1876	case BOOK3S_INTERRUPT_HMI:
   1877	case BOOK3S_INTERRUPT_PERFMON:
   1878	case BOOK3S_INTERRUPT_SYSTEM_RESET:
   1879		r = RESUME_GUEST;
   1880		break;
   1881	case BOOK3S_INTERRUPT_MACHINE_CHECK:
   1882	{
   1883		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
   1884					      DEFAULT_RATELIMIT_BURST);
   1885		/* Pass the machine check to the L1 guest */
   1886		r = RESUME_HOST;
   1887		/* Print the MCE event to host console. */
   1888		if (__ratelimit(&rs))
   1889			machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
   1890		break;
   1891	}
   1892	/*
   1893	 * We get these next two if the guest accesses a page which it thinks
   1894	 * it has mapped but which is not actually present, either because
    1895	 * it is for an emulated I/O device or because the corresponding
   1896	 * host page has been paged out.
   1897	 */
   1898	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
   1899		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
   1900		r = kvmhv_nested_page_fault(vcpu);
   1901		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
   1902		break;
   1903	case BOOK3S_INTERRUPT_H_INST_STORAGE:
   1904		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
   1905		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
   1906					 DSISR_SRR1_MATCH_64S;
   1907		if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
   1908			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
   1909		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
   1910		r = kvmhv_nested_page_fault(vcpu);
   1911		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
   1912		break;
   1913
   1914#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
   1915	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
   1916		/*
   1917		 * This occurs for various TM-related instructions that
   1918		 * we need to emulate on POWER9 DD2.2.  We have already
   1919		 * handled the cases where the guest was in real-suspend
   1920		 * mode and was transitioning to transactional state.
   1921		 */
   1922		r = kvmhv_p9_tm_emulation(vcpu);
   1923		if (r != -1)
   1924			break;
   1925		fallthrough; /* go to facility unavailable handler */
   1926#endif
   1927
   1928	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
   1929		u64 cause = vcpu->arch.hfscr >> 56;
   1930
   1931		/*
   1932		 * Only pass HFU interrupts to the L1 if the facility is
   1933		 * permitted but disabled by the L1's HFSCR, otherwise
   1934		 * the interrupt does not make sense to the L1 so turn
   1935		 * it into a HEAI.
   1936		 */
   1937		if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) ||
   1938				(vcpu->arch.nested_hfscr & (1UL << cause))) {
   1939			vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST;
   1940
   1941			/*
   1942			 * If the fetch failed, return to guest and
   1943			 * try executing it again.
   1944			 */
   1945			r = kvmppc_get_last_inst(vcpu, INST_GENERIC,
   1946						 &vcpu->arch.emul_inst);
   1947			if (r != EMULATE_DONE)
   1948				r = RESUME_GUEST;
   1949			else
   1950				r = RESUME_HOST;
   1951		} else {
   1952			r = RESUME_HOST;
   1953		}
   1954
   1955		break;
   1956	}
   1957
   1958	case BOOK3S_INTERRUPT_HV_RM_HARD:
   1959		vcpu->arch.trap = 0;
   1960		r = RESUME_GUEST;
   1961		if (!xics_on_xive())
   1962			kvmppc_xics_rm_complete(vcpu, 0);
   1963		break;
   1964	case BOOK3S_INTERRUPT_SYSCALL:
   1965	{
   1966		unsigned long req = kvmppc_get_gpr(vcpu, 3);
   1967
   1968		/*
   1969		 * The H_RPT_INVALIDATE hcalls issued by nested
   1970		 * guests for process-scoped invalidations when
    1971		 * GTSE=0 are handled here in L0.
   1972		 */
   1973		if (req == H_RPT_INVALIDATE) {
   1974			r = kvmppc_nested_h_rpt_invalidate(vcpu);
   1975			break;
   1976		}
   1977
   1978		r = RESUME_HOST;
   1979		break;
   1980	}
   1981	default:
   1982		r = RESUME_HOST;
   1983		break;
   1984	}
   1985
   1986	return r;
   1987}
   1988
   1989static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
   1990					    struct kvm_sregs *sregs)
   1991{
   1992	int i;
   1993
   1994	memset(sregs, 0, sizeof(struct kvm_sregs));
   1995	sregs->pvr = vcpu->arch.pvr;
   1996	for (i = 0; i < vcpu->arch.slb_max; i++) {
   1997		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
   1998		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
   1999	}
   2000
   2001	return 0;
   2002}
   2003
   2004static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
   2005					    struct kvm_sregs *sregs)
   2006{
   2007	int i, j;
   2008
   2009	/* Only accept the same PVR as the host's, since we can't spoof it */
   2010	if (sregs->pvr != vcpu->arch.pvr)
   2011		return -EINVAL;
   2012
   2013	j = 0;
   2014	for (i = 0; i < vcpu->arch.slb_nr; i++) {
   2015		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
   2016			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
   2017			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
   2018			++j;
   2019		}
   2020	}
   2021	vcpu->arch.slb_max = j;
   2022
   2023	return 0;
   2024}
   2025
   2026/*
   2027 * Enforce limits on guest LPCR values based on hardware availability,
   2028 * guest configuration, and possibly hypervisor support and security
   2029 * concerns.
   2030 */
   2031unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr)
   2032{
   2033	/* LPCR_TC only applies to HPT guests */
   2034	if (kvm_is_radix(kvm))
   2035		lpcr &= ~LPCR_TC;
   2036
   2037	/* On POWER8 and above, userspace can modify AIL */
   2038	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
   2039		lpcr &= ~LPCR_AIL;
   2040	if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
   2041		lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
   2042	/*
   2043	 * On some POWER9s we force AIL off for radix guests to prevent
   2044	 * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
    2045	 * guest, which can result in Q0 translations with LPID=0 PID=PIDR
    2046	 * being cached, which the host TLB management does not expect.
   2047	 */
   2048	if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
   2049		lpcr &= ~LPCR_AIL;
   2050
   2051	/*
   2052	 * On POWER9, allow userspace to enable large decrementer for the
   2053	 * guest, whether or not the host has it enabled.
   2054	 */
   2055	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   2056		lpcr &= ~LPCR_LD;
   2057
   2058	return lpcr;
   2059}
   2060
   2061static void verify_lpcr(struct kvm *kvm, unsigned long lpcr)
   2062{
   2063	if (lpcr != kvmppc_filter_lpcr_hv(kvm, lpcr)) {
   2064		WARN_ONCE(1, "lpcr 0x%lx differs from filtered 0x%lx\n",
   2065			  lpcr, kvmppc_filter_lpcr_hv(kvm, lpcr));
   2066	}
   2067}
   2068
   2069static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
   2070		bool preserve_top32)
   2071{
   2072	struct kvm *kvm = vcpu->kvm;
   2073	struct kvmppc_vcore *vc = vcpu->arch.vcore;
   2074	u64 mask;
   2075
   2076	spin_lock(&vc->lock);
   2077
   2078	/*
   2079	 * Userspace can only modify
   2080	 * DPFD (default prefetch depth), ILE (interrupt little-endian),
   2081	 * TC (translation control), AIL (alternate interrupt location),
   2082	 * LD (large decrementer).
    2083	 * These are subject to restrictions from kvmppc_filter_lpcr_hv().
   2084	 */
   2085	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD;
   2086
   2087	/* Broken 32-bit version of LPCR must not clear top bits */
   2088	if (preserve_top32)
   2089		mask &= 0xFFFFFFFF;
   2090
   2091	new_lpcr = kvmppc_filter_lpcr_hv(kvm,
   2092			(vc->lpcr & ~mask) | (new_lpcr & mask));
   2093
   2094	/*
   2095	 * If ILE (interrupt little-endian) has changed, update the
   2096	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
   2097	 */
   2098	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
   2099		struct kvm_vcpu *vcpu;
   2100		unsigned long i;
   2101
   2102		kvm_for_each_vcpu(i, vcpu, kvm) {
   2103			if (vcpu->arch.vcore != vc)
   2104				continue;
   2105			if (new_lpcr & LPCR_ILE)
   2106				vcpu->arch.intr_msr |= MSR_LE;
   2107			else
   2108				vcpu->arch.intr_msr &= ~MSR_LE;
   2109		}
   2110	}
   2111
   2112	vc->lpcr = new_lpcr;
   2113
   2114	spin_unlock(&vc->lock);
   2115}
   2116
   2117static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
   2118				 union kvmppc_one_reg *val)
   2119{
   2120	int r = 0;
   2121	long int i;
   2122
   2123	switch (id) {
   2124	case KVM_REG_PPC_DEBUG_INST:
   2125		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
   2126		break;
   2127	case KVM_REG_PPC_HIOR:
   2128		*val = get_reg_val(id, 0);
   2129		break;
   2130	case KVM_REG_PPC_DABR:
   2131		*val = get_reg_val(id, vcpu->arch.dabr);
   2132		break;
   2133	case KVM_REG_PPC_DABRX:
   2134		*val = get_reg_val(id, vcpu->arch.dabrx);
   2135		break;
   2136	case KVM_REG_PPC_DSCR:
   2137		*val = get_reg_val(id, vcpu->arch.dscr);
   2138		break;
   2139	case KVM_REG_PPC_PURR:
   2140		*val = get_reg_val(id, vcpu->arch.purr);
   2141		break;
   2142	case KVM_REG_PPC_SPURR:
   2143		*val = get_reg_val(id, vcpu->arch.spurr);
   2144		break;
   2145	case KVM_REG_PPC_AMR:
   2146		*val = get_reg_val(id, vcpu->arch.amr);
   2147		break;
   2148	case KVM_REG_PPC_UAMOR:
   2149		*val = get_reg_val(id, vcpu->arch.uamor);
   2150		break;
   2151	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
   2152		i = id - KVM_REG_PPC_MMCR0;
   2153		*val = get_reg_val(id, vcpu->arch.mmcr[i]);
   2154		break;
   2155	case KVM_REG_PPC_MMCR2:
   2156		*val = get_reg_val(id, vcpu->arch.mmcr[2]);
   2157		break;
   2158	case KVM_REG_PPC_MMCRA:
   2159		*val = get_reg_val(id, vcpu->arch.mmcra);
   2160		break;
   2161	case KVM_REG_PPC_MMCRS:
   2162		*val = get_reg_val(id, vcpu->arch.mmcrs);
   2163		break;
   2164	case KVM_REG_PPC_MMCR3:
   2165		*val = get_reg_val(id, vcpu->arch.mmcr[3]);
   2166		break;
   2167	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
   2168		i = id - KVM_REG_PPC_PMC1;
   2169		*val = get_reg_val(id, vcpu->arch.pmc[i]);
   2170		break;
   2171	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
   2172		i = id - KVM_REG_PPC_SPMC1;
   2173		*val = get_reg_val(id, vcpu->arch.spmc[i]);
   2174		break;
   2175	case KVM_REG_PPC_SIAR:
   2176		*val = get_reg_val(id, vcpu->arch.siar);
   2177		break;
   2178	case KVM_REG_PPC_SDAR:
   2179		*val = get_reg_val(id, vcpu->arch.sdar);
   2180		break;
   2181	case KVM_REG_PPC_SIER:
   2182		*val = get_reg_val(id, vcpu->arch.sier[0]);
   2183		break;
   2184	case KVM_REG_PPC_SIER2:
   2185		*val = get_reg_val(id, vcpu->arch.sier[1]);
   2186		break;
   2187	case KVM_REG_PPC_SIER3:
   2188		*val = get_reg_val(id, vcpu->arch.sier[2]);
   2189		break;
   2190	case KVM_REG_PPC_IAMR:
   2191		*val = get_reg_val(id, vcpu->arch.iamr);
   2192		break;
   2193	case KVM_REG_PPC_PSPB:
   2194		*val = get_reg_val(id, vcpu->arch.pspb);
   2195		break;
   2196	case KVM_REG_PPC_DPDES:
   2197		/*
   2198		 * On POWER9, where we are emulating msgsndp etc.,
   2199		 * we return 1 bit for each vcpu, which can come from
   2200		 * either vcore->dpdes or doorbell_request.
   2201		 * On POWER8, doorbell_request is 0.
   2202		 */
   2203		if (cpu_has_feature(CPU_FTR_ARCH_300))
   2204			*val = get_reg_val(id, vcpu->arch.doorbell_request);
   2205		else
   2206			*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
   2207		break;
   2208	case KVM_REG_PPC_VTB:
   2209		*val = get_reg_val(id, vcpu->arch.vcore->vtb);
   2210		break;
   2211	case KVM_REG_PPC_DAWR:
   2212		*val = get_reg_val(id, vcpu->arch.dawr0);
   2213		break;
   2214	case KVM_REG_PPC_DAWRX:
   2215		*val = get_reg_val(id, vcpu->arch.dawrx0);
   2216		break;
   2217	case KVM_REG_PPC_DAWR1:
   2218		*val = get_reg_val(id, vcpu->arch.dawr1);
   2219		break;
   2220	case KVM_REG_PPC_DAWRX1:
   2221		*val = get_reg_val(id, vcpu->arch.dawrx1);
   2222		break;
   2223	case KVM_REG_PPC_CIABR:
   2224		*val = get_reg_val(id, vcpu->arch.ciabr);
   2225		break;
   2226	case KVM_REG_PPC_CSIGR:
   2227		*val = get_reg_val(id, vcpu->arch.csigr);
   2228		break;
   2229	case KVM_REG_PPC_TACR:
   2230		*val = get_reg_val(id, vcpu->arch.tacr);
   2231		break;
   2232	case KVM_REG_PPC_TCSCR:
   2233		*val = get_reg_val(id, vcpu->arch.tcscr);
   2234		break;
   2235	case KVM_REG_PPC_PID:
   2236		*val = get_reg_val(id, vcpu->arch.pid);
   2237		break;
   2238	case KVM_REG_PPC_ACOP:
   2239		*val = get_reg_val(id, vcpu->arch.acop);
   2240		break;
   2241	case KVM_REG_PPC_WORT:
   2242		*val = get_reg_val(id, vcpu->arch.wort);
   2243		break;
   2244	case KVM_REG_PPC_TIDR:
   2245		*val = get_reg_val(id, vcpu->arch.tid);
   2246		break;
   2247	case KVM_REG_PPC_PSSCR:
   2248		*val = get_reg_val(id, vcpu->arch.psscr);
   2249		break;
   2250	case KVM_REG_PPC_VPA_ADDR:
   2251		spin_lock(&vcpu->arch.vpa_update_lock);
   2252		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
   2253		spin_unlock(&vcpu->arch.vpa_update_lock);
   2254		break;
   2255	case KVM_REG_PPC_VPA_SLB:
   2256		spin_lock(&vcpu->arch.vpa_update_lock);
   2257		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
   2258		val->vpaval.length = vcpu->arch.slb_shadow.len;
   2259		spin_unlock(&vcpu->arch.vpa_update_lock);
   2260		break;
   2261	case KVM_REG_PPC_VPA_DTL:
   2262		spin_lock(&vcpu->arch.vpa_update_lock);
   2263		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
   2264		val->vpaval.length = vcpu->arch.dtl.len;
   2265		spin_unlock(&vcpu->arch.vpa_update_lock);
   2266		break;
   2267	case KVM_REG_PPC_TB_OFFSET:
   2268		*val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
   2269		break;
   2270	case KVM_REG_PPC_LPCR:
   2271	case KVM_REG_PPC_LPCR_64:
   2272		*val = get_reg_val(id, vcpu->arch.vcore->lpcr);
   2273		break;
   2274	case KVM_REG_PPC_PPR:
   2275		*val = get_reg_val(id, vcpu->arch.ppr);
   2276		break;
   2277#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
   2278	case KVM_REG_PPC_TFHAR:
   2279		*val = get_reg_val(id, vcpu->arch.tfhar);
   2280		break;
   2281	case KVM_REG_PPC_TFIAR:
   2282		*val = get_reg_val(id, vcpu->arch.tfiar);
   2283		break;
   2284	case KVM_REG_PPC_TEXASR:
   2285		*val = get_reg_val(id, vcpu->arch.texasr);
   2286		break;
   2287	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
   2288		i = id - KVM_REG_PPC_TM_GPR0;
   2289		*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
   2290		break;
   2291	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
   2292	{
   2293		int j;
   2294		i = id - KVM_REG_PPC_TM_VSR0;
   2295		if (i < 32)
   2296			for (j = 0; j < TS_FPRWIDTH; j++)
   2297				val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
   2298		else {
   2299			if (cpu_has_feature(CPU_FTR_ALTIVEC))
   2300				val->vval = vcpu->arch.vr_tm.vr[i-32];
   2301			else
   2302				r = -ENXIO;
   2303		}
   2304		break;
   2305	}
   2306	case KVM_REG_PPC_TM_CR:
   2307		*val = get_reg_val(id, vcpu->arch.cr_tm);
   2308		break;
   2309	case KVM_REG_PPC_TM_XER:
   2310		*val = get_reg_val(id, vcpu->arch.xer_tm);
   2311		break;
   2312	case KVM_REG_PPC_TM_LR:
   2313		*val = get_reg_val(id, vcpu->arch.lr_tm);
   2314		break;
   2315	case KVM_REG_PPC_TM_CTR:
   2316		*val = get_reg_val(id, vcpu->arch.ctr_tm);
   2317		break;
   2318	case KVM_REG_PPC_TM_FPSCR:
   2319		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
   2320		break;
   2321	case KVM_REG_PPC_TM_AMR:
   2322		*val = get_reg_val(id, vcpu->arch.amr_tm);
   2323		break;
   2324	case KVM_REG_PPC_TM_PPR:
   2325		*val = get_reg_val(id, vcpu->arch.ppr_tm);
   2326		break;
   2327	case KVM_REG_PPC_TM_VRSAVE:
   2328		*val = get_reg_val(id, vcpu->arch.vrsave_tm);
   2329		break;
   2330	case KVM_REG_PPC_TM_VSCR:
   2331		if (cpu_has_feature(CPU_FTR_ALTIVEC))
   2332			*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
   2333		else
   2334			r = -ENXIO;
   2335		break;
   2336	case KVM_REG_PPC_TM_DSCR:
   2337		*val = get_reg_val(id, vcpu->arch.dscr_tm);
   2338		break;
   2339	case KVM_REG_PPC_TM_TAR:
   2340		*val = get_reg_val(id, vcpu->arch.tar_tm);
   2341		break;
   2342#endif
   2343	case KVM_REG_PPC_ARCH_COMPAT:
   2344		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
   2345		break;
   2346	case KVM_REG_PPC_DEC_EXPIRY:
   2347		*val = get_reg_val(id, vcpu->arch.dec_expires);
   2348		break;
   2349	case KVM_REG_PPC_ONLINE:
   2350		*val = get_reg_val(id, vcpu->arch.online);
   2351		break;
   2352	case KVM_REG_PPC_PTCR:
   2353		*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
   2354		break;
   2355	default:
   2356		r = -EINVAL;
   2357		break;
   2358	}
   2359
   2360	return r;
   2361}
   2362
   2363static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
   2364				 union kvmppc_one_reg *val)
   2365{
   2366	int r = 0;
   2367	long int i;
   2368	unsigned long addr, len;
   2369
   2370	switch (id) {
   2371	case KVM_REG_PPC_HIOR:
   2372		/* Only allow this to be set to zero */
   2373		if (set_reg_val(id, *val))
   2374			r = -EINVAL;
   2375		break;
   2376	case KVM_REG_PPC_DABR:
   2377		vcpu->arch.dabr = set_reg_val(id, *val);
   2378		break;
   2379	case KVM_REG_PPC_DABRX:
   2380		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
   2381		break;
   2382	case KVM_REG_PPC_DSCR:
   2383		vcpu->arch.dscr = set_reg_val(id, *val);
   2384		break;
   2385	case KVM_REG_PPC_PURR:
   2386		vcpu->arch.purr = set_reg_val(id, *val);
   2387		break;
   2388	case KVM_REG_PPC_SPURR:
   2389		vcpu->arch.spurr = set_reg_val(id, *val);
   2390		break;
   2391	case KVM_REG_PPC_AMR:
   2392		vcpu->arch.amr = set_reg_val(id, *val);
   2393		break;
   2394	case KVM_REG_PPC_UAMOR:
   2395		vcpu->arch.uamor = set_reg_val(id, *val);
   2396		break;
   2397	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
   2398		i = id - KVM_REG_PPC_MMCR0;
   2399		vcpu->arch.mmcr[i] = set_reg_val(id, *val);
   2400		break;
   2401	case KVM_REG_PPC_MMCR2:
   2402		vcpu->arch.mmcr[2] = set_reg_val(id, *val);
   2403		break;
   2404	case KVM_REG_PPC_MMCRA:
   2405		vcpu->arch.mmcra = set_reg_val(id, *val);
   2406		break;
   2407	case KVM_REG_PPC_MMCRS:
   2408		vcpu->arch.mmcrs = set_reg_val(id, *val);
   2409		break;
   2410	case KVM_REG_PPC_MMCR3:
    2411		vcpu->arch.mmcr[3] = set_reg_val(id, *val);
   2412		break;
   2413	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
   2414		i = id - KVM_REG_PPC_PMC1;
   2415		vcpu->arch.pmc[i] = set_reg_val(id, *val);
   2416		break;
   2417	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
   2418		i = id - KVM_REG_PPC_SPMC1;
   2419		vcpu->arch.spmc[i] = set_reg_val(id, *val);
   2420		break;
   2421	case KVM_REG_PPC_SIAR:
   2422		vcpu->arch.siar = set_reg_val(id, *val);
   2423		break;
   2424	case KVM_REG_PPC_SDAR:
   2425		vcpu->arch.sdar = set_reg_val(id, *val);
   2426		break;
   2427	case KVM_REG_PPC_SIER:
   2428		vcpu->arch.sier[0] = set_reg_val(id, *val);
   2429		break;
   2430	case KVM_REG_PPC_SIER2:
   2431		vcpu->arch.sier[1] = set_reg_val(id, *val);
   2432		break;
   2433	case KVM_REG_PPC_SIER3:
   2434		vcpu->arch.sier[2] = set_reg_val(id, *val);
   2435		break;
   2436	case KVM_REG_PPC_IAMR:
   2437		vcpu->arch.iamr = set_reg_val(id, *val);
   2438		break;
   2439	case KVM_REG_PPC_PSPB:
   2440		vcpu->arch.pspb = set_reg_val(id, *val);
   2441		break;
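        	/*
        	 * As with the get case above: on POWER9 the doorbell request
        	 * is tracked per-vcpu, while on POWER8 DPDES lives in the vcore.
        	 */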
   2442	case KVM_REG_PPC_DPDES:
   2443		if (cpu_has_feature(CPU_FTR_ARCH_300))
   2444			vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
   2445		else
   2446			vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
   2447		break;
   2448	case KVM_REG_PPC_VTB:
   2449		vcpu->arch.vcore->vtb = set_reg_val(id, *val);
   2450		break;
   2451	case KVM_REG_PPC_DAWR:
   2452		vcpu->arch.dawr0 = set_reg_val(id, *val);
   2453		break;
   2454	case KVM_REG_PPC_DAWRX:
   2455		vcpu->arch.dawrx0 = set_reg_val(id, *val) & ~DAWRX_HYP;
   2456		break;
   2457	case KVM_REG_PPC_DAWR1:
   2458		vcpu->arch.dawr1 = set_reg_val(id, *val);
   2459		break;
   2460	case KVM_REG_PPC_DAWRX1:
   2461		vcpu->arch.dawrx1 = set_reg_val(id, *val) & ~DAWRX_HYP;
   2462		break;
   2463	case KVM_REG_PPC_CIABR:
   2464		vcpu->arch.ciabr = set_reg_val(id, *val);
   2465		/* Don't allow setting breakpoints in hypervisor code */
   2466		if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
   2467			vcpu->arch.ciabr &= ~CIABR_PRIV;	/* disable */
   2468		break;
   2469	case KVM_REG_PPC_CSIGR:
   2470		vcpu->arch.csigr = set_reg_val(id, *val);
   2471		break;
   2472	case KVM_REG_PPC_TACR:
   2473		vcpu->arch.tacr = set_reg_val(id, *val);
   2474		break;
   2475	case KVM_REG_PPC_TCSCR:
   2476		vcpu->arch.tcscr = set_reg_val(id, *val);
   2477		break;
   2478	case KVM_REG_PPC_PID:
   2479		vcpu->arch.pid = set_reg_val(id, *val);
   2480		break;
   2481	case KVM_REG_PPC_ACOP:
   2482		vcpu->arch.acop = set_reg_val(id, *val);
   2483		break;
   2484	case KVM_REG_PPC_WORT:
   2485		vcpu->arch.wort = set_reg_val(id, *val);
   2486		break;
   2487	case KVM_REG_PPC_TIDR:
   2488		vcpu->arch.tid = set_reg_val(id, *val);
   2489		break;
   2490	case KVM_REG_PPC_PSSCR:
   2491		vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
   2492		break;
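        	/*
        	 * Reject unregistering the VPA (addr == 0) while the SLB shadow
        	 * buffer or the dispatch trace log is still registered, since
        	 * those areas are only valid alongside a VPA.
        	 */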
   2493	case KVM_REG_PPC_VPA_ADDR:
   2494		addr = set_reg_val(id, *val);
   2495		r = -EINVAL;
   2496		if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
   2497			      vcpu->arch.dtl.next_gpa))
   2498			break;
   2499		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
   2500		break;
   2501	case KVM_REG_PPC_VPA_SLB:
   2502		addr = val->vpaval.addr;
   2503		len = val->vpaval.length;
   2504		r = -EINVAL;
   2505		if (addr && !vcpu->arch.vpa.next_gpa)
   2506			break;
   2507		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
   2508		break;
   2509	case KVM_REG_PPC_VPA_DTL:
   2510		addr = val->vpaval.addr;
   2511		len = val->vpaval.length;
   2512		r = -EINVAL;
   2513		if (addr && (len < sizeof(struct dtl_entry) ||
   2514			     !vcpu->arch.vpa.next_gpa))
   2515			break;
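        		/* Trim the buffer to a whole number of DTL entries */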
   2516		len -= len % sizeof(struct dtl_entry);
   2517		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
   2518		break;
   2519	case KVM_REG_PPC_TB_OFFSET:
   2520		/* round up to multiple of 2^24 */
   2521		vcpu->arch.vcore->tb_offset =
   2522			ALIGN(set_reg_val(id, *val), 1UL << 24);
   2523		break;
   2524	case KVM_REG_PPC_LPCR:
   2525		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
   2526		break;
   2527	case KVM_REG_PPC_LPCR_64:
   2528		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
   2529		break;
   2530	case KVM_REG_PPC_PPR:
   2531		vcpu->arch.ppr = set_reg_val(id, *val);
   2532		break;
   2533#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
   2534	case KVM_REG_PPC_TFHAR:
   2535		vcpu->arch.tfhar = set_reg_val(id, *val);
   2536		break;
   2537	case KVM_REG_PPC_TFIAR:
   2538		vcpu->arch.tfiar = set_reg_val(id, *val);
   2539		break;
   2540	case KVM_REG_PPC_TEXASR:
   2541		vcpu->arch.texasr = set_reg_val(id, *val);
   2542		break;
   2543	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
   2544		i = id - KVM_REG_PPC_TM_GPR0;
   2545		vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
   2546		break;
   2547	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
   2548	{
   2549		int j;
   2550		i = id - KVM_REG_PPC_TM_VSR0;
   2551		if (i < 32)
   2552			for (j = 0; j < TS_FPRWIDTH; j++)
   2553				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
   2554		else
   2555			if (cpu_has_feature(CPU_FTR_ALTIVEC))
   2556				vcpu->arch.vr_tm.vr[i-32] = val->vval;
   2557			else
   2558				r = -ENXIO;
   2559		break;
   2560	}
   2561	case KVM_REG_PPC_TM_CR:
   2562		vcpu->arch.cr_tm = set_reg_val(id, *val);
   2563		break;
   2564	case KVM_REG_PPC_TM_XER:
   2565		vcpu->arch.xer_tm = set_reg_val(id, *val);
   2566		break;
   2567	case KVM_REG_PPC_TM_LR:
   2568		vcpu->arch.lr_tm = set_reg_val(id, *val);
   2569		break;
   2570	case KVM_REG_PPC_TM_CTR:
   2571		vcpu->arch.ctr_tm = set_reg_val(id, *val);
   2572		break;
   2573	case KVM_REG_PPC_TM_FPSCR:
   2574		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
   2575		break;
   2576	case KVM_REG_PPC_TM_AMR:
   2577		vcpu->arch.amr_tm = set_reg_val(id, *val);
   2578		break;
   2579	case KVM_REG_PPC_TM_PPR:
   2580		vcpu->arch.ppr_tm = set_reg_val(id, *val);
   2581		break;
   2582	case KVM_REG_PPC_TM_VRSAVE:
   2583		vcpu->arch.vrsave_tm = set_reg_val(id, *val);
   2584		break;
   2585	case KVM_REG_PPC_TM_VSCR:
   2586		if (cpu_has_feature(CPU_FTR_ALTIVEC))
    2587			vcpu->arch.vr_tm.vscr.u[3] = set_reg_val(id, *val);
   2588		else
    2589			r = -ENXIO;
   2590		break;
   2591	case KVM_REG_PPC_TM_DSCR:
   2592		vcpu->arch.dscr_tm = set_reg_val(id, *val);
   2593		break;
   2594	case KVM_REG_PPC_TM_TAR:
   2595		vcpu->arch.tar_tm = set_reg_val(id, *val);
   2596		break;
   2597#endif
   2598	case KVM_REG_PPC_ARCH_COMPAT:
   2599		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
   2600		break;
   2601	case KVM_REG_PPC_DEC_EXPIRY:
   2602		vcpu->arch.dec_expires = set_reg_val(id, *val);
   2603		break;
   2604	case KVM_REG_PPC_ONLINE:
   2605		i = set_reg_val(id, *val);
   2606		if (i && !vcpu->arch.online)
   2607			atomic_inc(&vcpu->arch.vcore->online_count);
   2608		else if (!i && vcpu->arch.online)
   2609			atomic_dec(&vcpu->arch.vcore->online_count);
   2610		vcpu->arch.online = i;
   2611		break;
   2612	case KVM_REG_PPC_PTCR:
   2613		vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
   2614		break;
   2615	default:
   2616		r = -EINVAL;
   2617		break;
   2618	}
   2619
   2620	return r;
   2621}
   2622
   2623/*
   2624 * On POWER9, threads are independent and can be in different partitions.
   2625 * Therefore we consider each thread to be a subcore.
   2626 * There is a restriction that all threads have to be in the same
   2627 * MMU mode (radix or HPT), unfortunately, but since we only support
   2628 * HPT guests on a HPT host so far, that isn't an impediment yet.
   2629 */
   2630static int threads_per_vcore(struct kvm *kvm)
   2631{
   2632	if (cpu_has_feature(CPU_FTR_ARCH_300))
   2633		return 1;
   2634	return threads_per_subcore;
   2635}
   2636
   2637static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
   2638{
   2639	struct kvmppc_vcore *vcore;
   2640
   2641	vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
   2642
   2643	if (vcore == NULL)
   2644		return NULL;
   2645
   2646	spin_lock_init(&vcore->lock);
   2647	spin_lock_init(&vcore->stoltb_lock);
   2648	rcuwait_init(&vcore->wait);
   2649	vcore->preempt_tb = TB_NIL;
   2650	vcore->lpcr = kvm->arch.lpcr;
   2651	vcore->first_vcpuid = id;
   2652	vcore->kvm = kvm;
   2653	INIT_LIST_HEAD(&vcore->preempt_list);
   2654
   2655	return vcore;
   2656}
   2657
   2658#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
   2659static struct debugfs_timings_element {
   2660	const char *name;
   2661	size_t offset;
   2662} timings[] = {
   2663	{"rm_entry",	offsetof(struct kvm_vcpu, arch.rm_entry)},
   2664	{"rm_intr",	offsetof(struct kvm_vcpu, arch.rm_intr)},
   2665	{"rm_exit",	offsetof(struct kvm_vcpu, arch.rm_exit)},
   2666	{"guest",	offsetof(struct kvm_vcpu, arch.guest_time)},
   2667	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
   2668};
   2669
   2670#define N_TIMINGS	(ARRAY_SIZE(timings))
   2671
   2672struct debugfs_timings_state {
   2673	struct kvm_vcpu	*vcpu;
   2674	unsigned int	buflen;
   2675	char		buf[N_TIMINGS * 100];
   2676};
   2677
   2678static int debugfs_timings_open(struct inode *inode, struct file *file)
   2679{
   2680	struct kvm_vcpu *vcpu = inode->i_private;
   2681	struct debugfs_timings_state *p;
   2682
   2683	p = kzalloc(sizeof(*p), GFP_KERNEL);
   2684	if (!p)
   2685		return -ENOMEM;
   2686
   2687	kvm_get_kvm(vcpu->kvm);
   2688	p->vcpu = vcpu;
   2689	file->private_data = p;
   2690
   2691	return nonseekable_open(inode, file);
   2692}
   2693
   2694static int debugfs_timings_release(struct inode *inode, struct file *file)
   2695{
   2696	struct debugfs_timings_state *p = file->private_data;
   2697
   2698	kvm_put_kvm(p->vcpu->kvm);
   2699	kfree(p);
   2700	return 0;
   2701}
   2702
   2703static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
   2704				    size_t len, loff_t *ppos)
   2705{
   2706	struct debugfs_timings_state *p = file->private_data;
   2707	struct kvm_vcpu *vcpu = p->vcpu;
   2708	char *s, *buf_end;
   2709	struct kvmhv_tb_accumulator tb;
   2710	u64 count;
   2711	loff_t pos;
   2712	ssize_t n;
   2713	int i, loops;
   2714	bool ok;
   2715
   2716	if (!p->buflen) {
   2717		s = p->buf;
   2718		buf_end = s + sizeof(p->buf);
   2719		for (i = 0; i < N_TIMINGS; ++i) {
   2720			struct kvmhv_tb_accumulator *acc;
   2721
   2722			acc = (struct kvmhv_tb_accumulator *)
   2723				((unsigned long)vcpu + timings[i].offset);
   2724			ok = false;
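        			/*
        			 * The accumulator uses a seqcount-style protocol:
        			 * an odd count means an update is in progress, and
        			 * a changed count after the copy means we raced with
        			 * an update, so retry.
        			 */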
   2725			for (loops = 0; loops < 1000; ++loops) {
   2726				count = acc->seqcount;
   2727				if (!(count & 1)) {
   2728					smp_rmb();
   2729					tb = *acc;
   2730					smp_rmb();
   2731					if (count == acc->seqcount) {
   2732						ok = true;
   2733						break;
   2734					}
   2735				}
   2736				udelay(1);
   2737			}
   2738			if (!ok)
   2739				snprintf(s, buf_end - s, "%s: stuck\n",
   2740					timings[i].name);
   2741			else
   2742				snprintf(s, buf_end - s,
   2743					"%s: %llu %llu %llu %llu\n",
   2744					timings[i].name, count / 2,
   2745					tb_to_ns(tb.tb_total),
   2746					tb_to_ns(tb.tb_min),
   2747					tb_to_ns(tb.tb_max));
   2748			s += strlen(s);
   2749		}
   2750		p->buflen = s - p->buf;
   2751	}
   2752
   2753	pos = *ppos;
   2754	if (pos >= p->buflen)
   2755		return 0;
   2756	if (len > p->buflen - pos)
   2757		len = p->buflen - pos;
   2758	n = copy_to_user(buf, p->buf + pos, len);
   2759	if (n) {
   2760		if (n == len)
   2761			return -EFAULT;
   2762		len -= n;
   2763	}
   2764	*ppos = pos + len;
   2765	return len;
   2766}
   2767
   2768static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
   2769				     size_t len, loff_t *ppos)
   2770{
   2771	return -EACCES;
   2772}
   2773
   2774static const struct file_operations debugfs_timings_ops = {
   2775	.owner	 = THIS_MODULE,
   2776	.open	 = debugfs_timings_open,
   2777	.release = debugfs_timings_release,
   2778	.read	 = debugfs_timings_read,
   2779	.write	 = debugfs_timings_write,
   2780	.llseek	 = generic_file_llseek,
   2781};
   2782
   2783/* Create a debugfs directory for the vcpu */
   2784static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
   2785{
   2786	debugfs_create_file("timings", 0444, debugfs_dentry, vcpu,
   2787			    &debugfs_timings_ops);
   2788	return 0;
   2789}
   2790
   2791#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
   2792static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
   2793{
   2794	return 0;
   2795}
   2796#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
   2797
   2798static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
   2799{
   2800	int err;
   2801	int core;
   2802	struct kvmppc_vcore *vcore;
   2803	struct kvm *kvm;
   2804	unsigned int id;
   2805
   2806	kvm = vcpu->kvm;
   2807	id = vcpu->vcpu_id;
   2808
   2809	vcpu->arch.shared = &vcpu->arch.shregs;
   2810#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
   2811	/*
   2812	 * The shared struct is never shared on HV,
   2813	 * so we can always use host endianness
   2814	 */
   2815#ifdef __BIG_ENDIAN__
   2816	vcpu->arch.shared_big_endian = true;
   2817#else
   2818	vcpu->arch.shared_big_endian = false;
   2819#endif
   2820#endif
   2821	vcpu->arch.mmcr[0] = MMCR0_FC;
   2822	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
   2823		vcpu->arch.mmcr[0] |= MMCR0_PMCCEXT;
   2824		vcpu->arch.mmcra = MMCRA_BHRB_DISABLE;
   2825	}
   2826
   2827	vcpu->arch.ctrl = CTRL_RUNLATCH;
   2828	/* default to host PVR, since we can't spoof it */
   2829	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
   2830	spin_lock_init(&vcpu->arch.vpa_update_lock);
   2831	spin_lock_init(&vcpu->arch.tbacct_lock);
   2832	vcpu->arch.busy_preempt = TB_NIL;
   2833	vcpu->arch.shregs.msr = MSR_ME;
   2834	vcpu->arch.intr_msr = MSR_SF | MSR_ME;
   2835
   2836	/*
   2837	 * Set the default HFSCR for the guest from the host value.
   2838	 * This value is only used on POWER9.
   2839	 * On POWER9, we want to virtualize the doorbell facility, so we
   2840	 * don't set the HFSCR_MSGP bit, and that causes those instructions
   2841	 * to trap and then we emulate them.
   2842	 */
   2843	vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
   2844		HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
   2845	if (cpu_has_feature(CPU_FTR_HVMODE)) {
   2846		vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
   2847#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
   2848		if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
   2849			vcpu->arch.hfscr |= HFSCR_TM;
   2850#endif
   2851	}
   2852	if (cpu_has_feature(CPU_FTR_TM_COMP))
   2853		vcpu->arch.hfscr |= HFSCR_TM;
   2854
   2855	vcpu->arch.hfscr_permitted = vcpu->arch.hfscr;
   2856
   2857	/*
    2858	 * PM, EBB, TM are demand-faulted so start with them clear.
   2859	 */
   2860	vcpu->arch.hfscr &= ~(HFSCR_PM | HFSCR_EBB | HFSCR_TM);
   2861
   2862	kvmppc_mmu_book3s_hv_init(vcpu);
   2863
   2864	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
   2865
   2866	init_waitqueue_head(&vcpu->arch.cpu_run);
   2867
   2868	mutex_lock(&kvm->lock);
   2869	vcore = NULL;
   2870	err = -EINVAL;
   2871	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
   2872		if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) {
   2873			pr_devel("KVM: VCPU ID too high\n");
   2874			core = KVM_MAX_VCORES;
   2875		} else {
   2876			BUG_ON(kvm->arch.smt_mode != 1);
   2877			core = kvmppc_pack_vcpu_id(kvm, id);
   2878		}
   2879	} else {
   2880		core = id / kvm->arch.smt_mode;
   2881	}
   2882	if (core < KVM_MAX_VCORES) {
   2883		vcore = kvm->arch.vcores[core];
   2884		if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) {
    2885			pr_devel("KVM: collision on id %u\n", id);
   2886			vcore = NULL;
   2887		} else if (!vcore) {
   2888			/*
   2889			 * Take mmu_setup_lock for mutual exclusion
   2890			 * with kvmppc_update_lpcr().
   2891			 */
   2892			err = -ENOMEM;
   2893			vcore = kvmppc_vcore_create(kvm,
   2894					id & ~(kvm->arch.smt_mode - 1));
   2895			mutex_lock(&kvm->arch.mmu_setup_lock);
   2896			kvm->arch.vcores[core] = vcore;
   2897			kvm->arch.online_vcores++;
   2898			mutex_unlock(&kvm->arch.mmu_setup_lock);
   2899		}
   2900	}
   2901	mutex_unlock(&kvm->lock);
   2902
   2903	if (!vcore)
   2904		return err;
   2905
   2906	spin_lock(&vcore->lock);
   2907	++vcore->num_threads;
   2908	spin_unlock(&vcore->lock);
   2909	vcpu->arch.vcore = vcore;
   2910	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
   2911	vcpu->arch.thread_cpu = -1;
   2912	vcpu->arch.prev_cpu = -1;
   2913
   2914	vcpu->arch.cpu_type = KVM_CPU_3S_64;
   2915	kvmppc_sanity_check(vcpu);
   2916
   2917	return 0;
   2918}
   2919
   2920static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
   2921			      unsigned long flags)
   2922{
   2923	int err;
   2924	int esmt = 0;
   2925
   2926	if (flags)
   2927		return -EINVAL;
   2928	if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
   2929		return -EINVAL;
   2930	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
   2931		/*
   2932		 * On POWER8 (or POWER7), the threading mode is "strict",
   2933		 * so we pack smt_mode vcpus per vcore.
   2934		 */
   2935		if (smt_mode > threads_per_subcore)
   2936			return -EINVAL;
   2937	} else {
   2938		/*
   2939		 * On POWER9, the threading mode is "loose",
   2940		 * so each vcpu gets its own vcore.
   2941		 */
   2942		esmt = smt_mode;
   2943		smt_mode = 1;
   2944	}
   2945	mutex_lock(&kvm->lock);
   2946	err = -EBUSY;
   2947	if (!kvm->arch.online_vcores) {
   2948		kvm->arch.smt_mode = smt_mode;
   2949		kvm->arch.emul_smt_mode = esmt;
   2950		err = 0;
   2951	}
   2952	mutex_unlock(&kvm->lock);
   2953
   2954	return err;
   2955}
   2956
   2957static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
   2958{
   2959	if (vpa->pinned_addr)
   2960		kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
   2961					vpa->dirty);
   2962}
   2963
   2964static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
   2965{
   2966	spin_lock(&vcpu->arch.vpa_update_lock);
   2967	unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
   2968	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
   2969	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
   2970	spin_unlock(&vcpu->arch.vpa_update_lock);
   2971}
   2972
   2973static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
   2974{
   2975	/* Indicate we want to get back into the guest */
   2976	return 1;
   2977}
   2978
   2979static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
   2980{
   2981	unsigned long dec_nsec, now;
   2982
   2983	now = get_tb();
   2984	if (now > kvmppc_dec_expires_host_tb(vcpu)) {
   2985		/* decrementer has already gone negative */
   2986		kvmppc_core_queue_dec(vcpu);
   2987		kvmppc_core_prepare_to_enter(vcpu);
   2988		return;
   2989	}
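        	/*
        	 * Otherwise arm an hrtimer for the time remaining, converted
        	 * from timebase ticks to nanoseconds.
        	 */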
   2990	dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
   2991	hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
   2992	vcpu->arch.timer_running = 1;
   2993}
   2994
   2995extern int __kvmppc_vcore_entry(void);
   2996
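        /*
         * Remove a vcpu from its vcore's runnable list, updating its stolen
         * time accounting and marking it busy in the host.
         */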
   2997static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
   2998				   struct kvm_vcpu *vcpu, u64 tb)
   2999{
   3000	u64 now;
   3001
   3002	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
   3003		return;
   3004	spin_lock_irq(&vcpu->arch.tbacct_lock);
   3005	now = tb;
   3006	vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
   3007		vcpu->arch.stolen_logged;
   3008	vcpu->arch.busy_preempt = now;
   3009	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
   3010	spin_unlock_irq(&vcpu->arch.tbacct_lock);
   3011	--vc->n_runnable;
   3012	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
   3013}
   3014
   3015static int kvmppc_grab_hwthread(int cpu)
   3016{
   3017	struct paca_struct *tpaca;
   3018	long timeout = 10000;
   3019
   3020	tpaca = paca_ptrs[cpu];
   3021
   3022	/* Ensure the thread won't go into the kernel if it wakes */
   3023	tpaca->kvm_hstate.kvm_vcpu = NULL;
   3024	tpaca->kvm_hstate.kvm_vcore = NULL;
   3025	tpaca->kvm_hstate.napping = 0;
   3026	smp_wmb();
   3027	tpaca->kvm_hstate.hwthread_req = 1;
   3028
   3029	/*
   3030	 * If the thread is already executing in the kernel (e.g. handling
   3031	 * a stray interrupt), wait for it to get back to nap mode.
   3032	 * The smp_mb() is to ensure that our setting of hwthread_req
   3033	 * is visible before we look at hwthread_state, so if this
   3034	 * races with the code at system_reset_pSeries and the thread
   3035	 * misses our setting of hwthread_req, we are sure to see its
   3036	 * setting of hwthread_state, and vice versa.
   3037	 */
   3038	smp_mb();
   3039	while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
   3040		if (--timeout <= 0) {
   3041			pr_err("KVM: couldn't grab cpu %d\n", cpu);
   3042			return -EBUSY;
   3043		}
   3044		udelay(1);
   3045	}
   3046	return 0;
   3047}
   3048
   3049static void kvmppc_release_hwthread(int cpu)
   3050{
   3051	struct paca_struct *tpaca;
   3052
   3053	tpaca = paca_ptrs[cpu];
   3054	tpaca->kvm_hstate.hwthread_req = 0;
   3055	tpaca->kvm_hstate.kvm_vcpu = NULL;
   3056	tpaca->kvm_hstate.kvm_vcore = NULL;
   3057	tpaca->kvm_hstate.kvm_split_mode = NULL;
   3058}
   3059
   3060static DEFINE_PER_CPU(struct kvm *, cpu_in_guest);
   3061
   3062static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
   3063{
   3064	struct kvm_nested_guest *nested = vcpu->arch.nested;
   3065	cpumask_t *need_tlb_flush;
   3066	int i;
   3067
   3068	if (nested)
   3069		need_tlb_flush = &nested->need_tlb_flush;
   3070	else
   3071		need_tlb_flush = &kvm->arch.need_tlb_flush;
   3072
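        	/*
        	 * Mark every hardware thread that shares a TLB with this pcpu
        	 * as needing a TLB flush before it next runs this guest.
        	 */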
   3073	cpu = cpu_first_tlb_thread_sibling(cpu);
   3074	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
   3075					i += cpu_tlb_thread_sibling_step())
   3076		cpumask_set_cpu(i, need_tlb_flush);
   3077
   3078	/*
   3079	 * Make sure setting of bit in need_tlb_flush precedes testing of
   3080	 * cpu_in_guest. The matching barrier on the other side is hwsync
   3081	 * when switching to guest MMU mode, which happens between
   3082	 * cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
   3083	 * being tested.
   3084	 */
   3085	smp_mb();
   3086
   3087	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
   3088					i += cpu_tlb_thread_sibling_step()) {
   3089		struct kvm *running = *per_cpu_ptr(&cpu_in_guest, i);
   3090
   3091		if (running == kvm)
   3092			smp_call_function_single(i, do_nothing, NULL, 1);
   3093	}
   3094}
   3095
   3096static void do_migrate_away_vcpu(void *arg)
   3097{
   3098	struct kvm_vcpu *vcpu = arg;
   3099	struct kvm *kvm = vcpu->kvm;
   3100
   3101	/*
    3102	 * If the guest has GTSE, it may execute tlbie, so do an eieio; tlbsync;
   3103	 * ptesync sequence on the old CPU before migrating to a new one, in
   3104	 * case we interrupted the guest between a tlbie ; eieio ;
   3105	 * tlbsync; ptesync sequence.
   3106	 *
   3107	 * Otherwise, ptesync is sufficient for ordering tlbiel sequences.
   3108	 */
   3109	if (kvm->arch.lpcr & LPCR_GTSE)
   3110		asm volatile("eieio; tlbsync; ptesync");
   3111	else
   3112		asm volatile("ptesync");
   3113}
   3114
   3115static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
   3116{
   3117	struct kvm_nested_guest *nested = vcpu->arch.nested;
   3118	struct kvm *kvm = vcpu->kvm;
   3119	int prev_cpu;
   3120
   3121	if (!cpu_has_feature(CPU_FTR_HVMODE))
   3122		return;
   3123
   3124	if (nested)
   3125		prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
   3126	else
   3127		prev_cpu = vcpu->arch.prev_cpu;
   3128
   3129	/*
   3130	 * With radix, the guest can do TLB invalidations itself,
   3131	 * and it could choose to use the local form (tlbiel) if
   3132	 * it is invalidating a translation that has only ever been
   3133	 * used on one vcpu.  However, that doesn't mean it has
   3134	 * only ever been used on one physical cpu, since vcpus
   3135	 * can move around between pcpus.  To cope with this, when
   3136	 * a vcpu moves from one pcpu to another, we need to tell
   3137	 * any vcpus running on the same core as this vcpu previously
   3138	 * ran to flush the TLB.
   3139	 */
   3140	if (prev_cpu != pcpu) {
   3141		if (prev_cpu >= 0) {
   3142			if (cpu_first_tlb_thread_sibling(prev_cpu) !=
   3143			    cpu_first_tlb_thread_sibling(pcpu))
   3144				radix_flush_cpu(kvm, prev_cpu, vcpu);
   3145
   3146			smp_call_function_single(prev_cpu,
   3147					do_migrate_away_vcpu, vcpu, 1);
   3148		}
   3149		if (nested)
   3150			nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
   3151		else
   3152			vcpu->arch.prev_cpu = pcpu;
   3153	}
   3154}
   3155
   3156static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
   3157{
   3158	int cpu;
   3159	struct paca_struct *tpaca;
   3160
   3161	cpu = vc->pcpu;
   3162	if (vcpu) {
   3163		if (vcpu->arch.timer_running) {
   3164			hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
   3165			vcpu->arch.timer_running = 0;
   3166		}
   3167		cpu += vcpu->arch.ptid;
   3168		vcpu->cpu = vc->pcpu;
   3169		vcpu->arch.thread_cpu = cpu;
   3170	}
   3171	tpaca = paca_ptrs[cpu];
   3172	tpaca->kvm_hstate.kvm_vcpu = vcpu;
   3173	tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
   3174	tpaca->kvm_hstate.fake_suspend = 0;
   3175	/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
   3176	smp_wmb();
   3177	tpaca->kvm_hstate.kvm_vcore = vc;
   3178	if (cpu != smp_processor_id())
   3179		kvmppc_ipi_thread(cpu);
   3180}
   3181
   3182static void kvmppc_wait_for_nap(int n_threads)
   3183{
   3184	int cpu = smp_processor_id();
   3185	int i, loops;
   3186
   3187	if (n_threads <= 1)
   3188		return;
   3189	for (loops = 0; loops < 1000000; ++loops) {
   3190		/*
   3191		 * Check if all threads are finished.
   3192		 * We set the vcore pointer when starting a thread
   3193		 * and the thread clears it when finished, so we look
   3194		 * for any threads that still have a non-NULL vcore ptr.
   3195		 */
   3196		for (i = 1; i < n_threads; ++i)
   3197			if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
   3198				break;
   3199		if (i == n_threads) {
   3200			HMT_medium();
   3201			return;
   3202		}
   3203		HMT_low();
   3204	}
   3205	HMT_medium();
   3206	for (i = 1; i < n_threads; ++i)
   3207		if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
   3208			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
   3209}
   3210
   3211/*
   3212 * Check that we are on thread 0 and that any other threads in
   3213 * this core are off-line.  Then grab the threads so they can't
   3214 * enter the kernel.
   3215 */
   3216static int on_primary_thread(void)
   3217{
   3218	int cpu = smp_processor_id();
   3219	int thr;
   3220
   3221	/* Are we on a primary subcore? */
   3222	if (cpu_thread_in_subcore(cpu))
   3223		return 0;
   3224
   3225	thr = 0;
   3226	while (++thr < threads_per_subcore)
   3227		if (cpu_online(cpu + thr))
   3228			return 0;
   3229
   3230	/* Grab all hw threads so they can't go into the kernel */
   3231	for (thr = 1; thr < threads_per_subcore; ++thr) {
   3232		if (kvmppc_grab_hwthread(cpu + thr)) {
   3233			/* Couldn't grab one; let the others go */
   3234			do {
   3235				kvmppc_release_hwthread(cpu + thr);
   3236			} while (--thr > 0);
   3237			return 0;
   3238		}
   3239	}
   3240	return 1;
   3241}
   3242
   3243/*
   3244 * A list of virtual cores for each physical CPU.
   3245 * These are vcores that could run but their runner VCPU tasks are
   3246 * (or may be) preempted.
   3247 */
   3248struct preempted_vcore_list {
   3249	struct list_head	list;
   3250	spinlock_t		lock;
   3251};
   3252
   3253static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
   3254
   3255static void init_vcore_lists(void)
   3256{
   3257	int cpu;
   3258
   3259	for_each_possible_cpu(cpu) {
   3260		struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
   3261		spin_lock_init(&lp->lock);
   3262		INIT_LIST_HEAD(&lp->list);
   3263	}
   3264}
   3265
   3266static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
   3267{
   3268	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
   3269
   3270	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
   3271
   3272	vc->vcore_state = VCORE_PREEMPT;
   3273	vc->pcpu = smp_processor_id();
   3274	if (vc->num_threads < threads_per_vcore(vc->kvm)) {
   3275		spin_lock(&lp->lock);
   3276		list_add_tail(&vc->preempt_list, &lp->list);
   3277		spin_unlock(&lp->lock);
   3278	}
   3279
   3280	/* Start accumulating stolen time */
   3281	kvmppc_core_start_stolen(vc, mftb());
   3282}
   3283
   3284static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
   3285{
   3286	struct preempted_vcore_list *lp;
   3287
   3288	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
   3289
   3290	kvmppc_core_end_stolen(vc, mftb());
   3291	if (!list_empty(&vc->preempt_list)) {
   3292		lp = &per_cpu(preempted_vcores, vc->pcpu);
   3293		spin_lock(&lp->lock);
   3294		list_del_init(&vc->preempt_list);
   3295		spin_unlock(&lp->lock);
   3296	}
   3297	vc->vcore_state = VCORE_INACTIVE;
   3298}
   3299
   3300/*
   3301 * This stores information about the virtual cores currently
   3302 * assigned to a physical core.
   3303 */
   3304struct core_info {
   3305	int		n_subcores;
   3306	int		max_subcore_threads;
   3307	int		total_threads;
   3308	int		subcore_threads[MAX_SUBCORES];
   3309	struct kvmppc_vcore *vc[MAX_SUBCORES];
   3310};
   3311
   3312/*
   3313 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
   3314 * respectively in 2-way micro-threading (split-core) mode on POWER8.
   3315 */
   3316static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
   3317
   3318static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
   3319{
   3320	memset(cip, 0, sizeof(*cip));
   3321	cip->n_subcores = 1;
   3322	cip->max_subcore_threads = vc->num_threads;
   3323	cip->total_threads = vc->num_threads;
   3324	cip->subcore_threads[0] = vc->num_threads;
   3325	cip->vc[0] = vc;
   3326}
   3327
   3328static bool subcore_config_ok(int n_subcores, int n_threads)
   3329{
   3330	/*
   3331	 * POWER9 "SMT4" cores are permanently in what is effectively a 4-way
   3332	 * split-core mode, with one thread per subcore.
   3333	 */
   3334	if (cpu_has_feature(CPU_FTR_ARCH_300))
   3335		return n_subcores <= 4 && n_threads == 1;
   3336
   3337	/* On POWER8, can only dynamically split if unsplit to begin with */
   3338	if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
   3339		return false;
   3340	if (n_subcores > MAX_SUBCORES)
   3341		return false;
   3342	if (n_subcores > 1) {
   3343		if (!(dynamic_mt_modes & 2))
   3344			n_subcores = 4;
   3345		if (n_subcores > 2 && !(dynamic_mt_modes & 4))
   3346			return false;
   3347	}
   3348
   3349	return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
   3350}
   3351
   3352static void init_vcore_to_run(struct kvmppc_vcore *vc)
   3353{
   3354	vc->entry_exit_map = 0;
   3355	vc->in_guest = 0;
   3356	vc->napping_threads = 0;
   3357	vc->conferring_threads = 0;
   3358	vc->tb_offset_applied = 0;
   3359}
   3360
   3361static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
   3362{
   3363	int n_threads = vc->num_threads;
   3364	int sub;
   3365
   3366	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
   3367		return false;
   3368
   3369	/* In one_vm_per_core mode, require all vcores to be from the same vm */
   3370	if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
   3371		return false;
   3372
   3373	if (n_threads < cip->max_subcore_threads)
   3374		n_threads = cip->max_subcore_threads;
   3375	if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
   3376		return false;
   3377	cip->max_subcore_threads = n_threads;
   3378
   3379	sub = cip->n_subcores;
   3380	++cip->n_subcores;
   3381	cip->total_threads += vc->num_threads;
   3382	cip->subcore_threads[sub] = vc->num_threads;
   3383	cip->vc[sub] = vc;
   3384	init_vcore_to_run(vc);
   3385	list_del_init(&vc->preempt_list);
   3386
   3387	return true;
   3388}
   3389
   3390/*
   3391 * Work out whether it is possible to piggyback the execution of
   3392 * vcore *pvc onto the execution of the other vcores described in *cip.
   3393 */
   3394static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
   3395			  int target_threads)
   3396{
   3397	if (cip->total_threads + pvc->num_threads > target_threads)
   3398		return false;
   3399
   3400	return can_dynamic_split(pvc, cip);
   3401}
   3402
   3403static void prepare_threads(struct kvmppc_vcore *vc)
   3404{
   3405	int i;
   3406	struct kvm_vcpu *vcpu;
   3407
   3408	for_each_runnable_thread(i, vcpu, vc) {
   3409		if (signal_pending(vcpu->arch.run_task))
   3410			vcpu->arch.ret = -EINTR;
   3411		else if (vcpu->arch.vpa.update_pending ||
   3412			 vcpu->arch.slb_shadow.update_pending ||
   3413			 vcpu->arch.dtl.update_pending)
   3414			vcpu->arch.ret = RESUME_GUEST;
   3415		else
   3416			continue;
   3417		kvmppc_remove_runnable(vc, vcpu, mftb());
   3418		wake_up(&vcpu->arch.cpu_run);
   3419	}
   3420}
   3421
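        /*
         * Scan this CPU's preempted-vcore list for vcores that can share the
         * physical core with the one we are about to run.  Each vcore accepted
         * here is left locked; kvmppc_run_core() drops the locks of all the
         * collected vcores just before entering the guest.
         */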
   3422static void collect_piggybacks(struct core_info *cip, int target_threads)
   3423{
   3424	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
   3425	struct kvmppc_vcore *pvc, *vcnext;
   3426
   3427	spin_lock(&lp->lock);
   3428	list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
   3429		if (!spin_trylock(&pvc->lock))
   3430			continue;
   3431		prepare_threads(pvc);
   3432		if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
   3433			list_del_init(&pvc->preempt_list);
   3434			if (pvc->runner == NULL) {
   3435				pvc->vcore_state = VCORE_INACTIVE;
   3436				kvmppc_core_end_stolen(pvc, mftb());
   3437			}
   3438			spin_unlock(&pvc->lock);
   3439			continue;
   3440		}
   3441		if (!can_piggyback(pvc, cip, target_threads)) {
   3442			spin_unlock(&pvc->lock);
   3443			continue;
   3444		}
   3445		kvmppc_core_end_stolen(pvc, mftb());
   3446		pvc->vcore_state = VCORE_PIGGYBACK;
   3447		if (cip->total_threads >= target_threads)
   3448			break;
   3449	}
   3450	spin_unlock(&lp->lock);
   3451}
   3452
   3453static bool recheck_signals_and_mmu(struct core_info *cip)
   3454{
   3455	int sub, i;
   3456	struct kvm_vcpu *vcpu;
   3457	struct kvmppc_vcore *vc;
   3458
   3459	for (sub = 0; sub < cip->n_subcores; ++sub) {
   3460		vc = cip->vc[sub];
   3461		if (!vc->kvm->arch.mmu_ready)
   3462			return true;
   3463		for_each_runnable_thread(i, vcpu, vc)
   3464			if (signal_pending(vcpu->arch.run_task))
   3465				return true;
   3466	}
   3467	return false;
   3468}
   3469
   3470static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
   3471{
   3472	int still_running = 0, i;
   3473	u64 now;
   3474	long ret;
   3475	struct kvm_vcpu *vcpu;
   3476
   3477	spin_lock(&vc->lock);
   3478	now = get_tb();
   3479	for_each_runnable_thread(i, vcpu, vc) {
   3480		/*
   3481		 * It's safe to unlock the vcore in the loop here, because
   3482		 * for_each_runnable_thread() is safe against removal of
   3483		 * the vcpu, and the vcore state is VCORE_EXITING here,
   3484		 * so any vcpus becoming runnable will have their arch.trap
   3485		 * set to zero and can't actually run in the guest.
   3486		 */
   3487		spin_unlock(&vc->lock);
   3488		/* cancel pending dec exception if dec is positive */
   3489		if (now < kvmppc_dec_expires_host_tb(vcpu) &&
   3490		    kvmppc_core_pending_dec(vcpu))
   3491			kvmppc_core_dequeue_dec(vcpu);
   3492
   3493		trace_kvm_guest_exit(vcpu);
   3494
   3495		ret = RESUME_GUEST;
   3496		if (vcpu->arch.trap)
   3497			ret = kvmppc_handle_exit_hv(vcpu,
   3498						    vcpu->arch.run_task);
   3499
   3500		vcpu->arch.ret = ret;
   3501		vcpu->arch.trap = 0;
   3502
   3503		spin_lock(&vc->lock);
   3504		if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
   3505			if (vcpu->arch.pending_exceptions)
   3506				kvmppc_core_prepare_to_enter(vcpu);
   3507			if (vcpu->arch.ceded)
   3508				kvmppc_set_timer(vcpu);
   3509			else
   3510				++still_running;
   3511		} else {
   3512			kvmppc_remove_runnable(vc, vcpu, mftb());
   3513			wake_up(&vcpu->arch.cpu_run);
   3514		}
   3515	}
   3516	if (!is_master) {
   3517		if (still_running > 0) {
   3518			kvmppc_vcore_preempt(vc);
   3519		} else if (vc->runner) {
   3520			vc->vcore_state = VCORE_PREEMPT;
   3521			kvmppc_core_start_stolen(vc, mftb());
   3522		} else {
   3523			vc->vcore_state = VCORE_INACTIVE;
   3524		}
   3525		if (vc->n_runnable > 0 && vc->runner == NULL) {
   3526			/* make sure there's a candidate runner awake */
   3527			i = -1;
   3528			vcpu = next_runnable_thread(vc, &i);
   3529			wake_up(&vcpu->arch.cpu_run);
   3530		}
   3531	}
   3532	spin_unlock(&vc->lock);
   3533}
   3534
   3535/*
   3536 * Clear core from the list of active host cores as we are about to
   3537 * enter the guest. Only do this if it is the primary thread of the
   3538 * core (not if a subcore) that is entering the guest.
   3539 */
   3540static inline int kvmppc_clear_host_core(unsigned int cpu)
   3541{
   3542	int core;
   3543
   3544	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
   3545		return 0;
   3546	/*
   3547	 * Memory barrier can be omitted here as we will do a smp_wmb()
    3548	 * later in kvmppc_start_thread and we need to ensure that state is
    3549	 * visible to other CPUs only after we enter the guest.
   3550	 */
   3551	core = cpu >> threads_shift;
   3552	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
   3553	return 0;
   3554}
   3555
   3556/*
    3557 * Advertise this core as an active host core since we exited the guest.
   3558 * Only need to do this if it is the primary thread of the core that is
   3559 * exiting.
   3560 */
   3561static inline int kvmppc_set_host_core(unsigned int cpu)
   3562{
   3563	int core;
   3564
   3565	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
   3566		return 0;
   3567
   3568	/*
   3569	 * Memory barrier can be omitted here because we do a spin_unlock
   3570	 * immediately after this which provides the memory barrier.
   3571	 */
   3572	core = cpu >> threads_shift;
   3573	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
   3574	return 0;
   3575}
   3576
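        /*
         * The guest was entered with interrupts hard-disabled, so the host
         * interrupt that caused this exit has not actually been taken yet.
         * Record it in the PACA's lazy-interrupt state so that it gets
         * replayed when interrupts are re-enabled; a system reset is replayed
         * straight away.
         */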
   3577static void set_irq_happened(int trap)
   3578{
   3579	switch (trap) {
   3580	case BOOK3S_INTERRUPT_EXTERNAL:
   3581		local_paca->irq_happened |= PACA_IRQ_EE;
   3582		break;
   3583	case BOOK3S_INTERRUPT_H_DOORBELL:
   3584		local_paca->irq_happened |= PACA_IRQ_DBELL;
   3585		break;
   3586	case BOOK3S_INTERRUPT_HMI:
   3587		local_paca->irq_happened |= PACA_IRQ_HMI;
   3588		break;
   3589	case BOOK3S_INTERRUPT_SYSTEM_RESET:
   3590		replay_system_reset();
   3591		break;
   3592	}
   3593}
   3594
   3595/*
   3596 * Run a set of guest threads on a physical core.
   3597 * Called with vc->lock held.
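         *
         * Roughly: prune vcpus that can no longer run, collect other preempted
         * vcores to piggyback on this core, switch the core into split mode on
         * POWER8 if required, start one hardware thread per runnable vcpu,
         * enter the guest, wait for all threads to exit and nap, undo any
         * split, and finally let each vcore post-process its guest exits.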
   3598 */
   3599static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
   3600{
   3601	struct kvm_vcpu *vcpu;
   3602	int i;
   3603	int srcu_idx;
   3604	struct core_info core_info;
   3605	struct kvmppc_vcore *pvc;
   3606	struct kvm_split_mode split_info, *sip;
   3607	int split, subcore_size, active;
   3608	int sub;
   3609	bool thr0_done;
   3610	unsigned long cmd_bit, stat_bit;
   3611	int pcpu, thr;
   3612	int target_threads;
   3613	int controlled_threads;
   3614	int trap;
   3615	bool is_power8;
   3616
   3617	if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
   3618		return;
   3619
   3620	/*
   3621	 * Remove from the list any threads that have a signal pending
   3622	 * or need a VPA update done
   3623	 */
   3624	prepare_threads(vc);
   3625
   3626	/* if the runner is no longer runnable, let the caller pick a new one */
   3627	if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
   3628		return;
   3629
   3630	/*
   3631	 * Initialize *vc.
   3632	 */
   3633	init_vcore_to_run(vc);
   3634	vc->preempt_tb = TB_NIL;
   3635
   3636	/*
   3637	 * Number of threads that we will be controlling: the same as
   3638	 * the number of threads per subcore, except on POWER9,
   3639	 * where it's 1 because the threads are (mostly) independent.
   3640	 */
   3641	controlled_threads = threads_per_vcore(vc->kvm);
   3642
   3643	/*
   3644	 * Make sure we are running on primary threads, and that secondary
   3645	 * threads are offline.  Also check if the number of threads in this
    3646	 * guest is greater than the current system threads per guest.
   3647	 */
   3648	if ((controlled_threads > 1) &&
   3649	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
   3650		for_each_runnable_thread(i, vcpu, vc) {
   3651			vcpu->arch.ret = -EBUSY;
   3652			kvmppc_remove_runnable(vc, vcpu, mftb());
   3653			wake_up(&vcpu->arch.cpu_run);
   3654		}
   3655		goto out;
   3656	}
   3657
   3658	/*
   3659	 * See if we could run any other vcores on the physical core
   3660	 * along with this one.
   3661	 */
   3662	init_core_info(&core_info, vc);
   3663	pcpu = smp_processor_id();
   3664	target_threads = controlled_threads;
   3665	if (target_smt_mode && target_smt_mode < target_threads)
   3666		target_threads = target_smt_mode;
   3667	if (vc->num_threads < target_threads)
   3668		collect_piggybacks(&core_info, target_threads);
   3669
   3670	/*
   3671	 * Hard-disable interrupts, and check resched flag and signals.
   3672	 * If we need to reschedule or deliver a signal, clean up
   3673	 * and return without going into the guest(s).
   3674	 * If the mmu_ready flag has been cleared, don't go into the
   3675	 * guest because that means a HPT resize operation is in progress.
   3676	 */
   3677	local_irq_disable();
   3678	hard_irq_disable();
   3679	if (lazy_irq_pending() || need_resched() ||
   3680	    recheck_signals_and_mmu(&core_info)) {
   3681		local_irq_enable();
   3682		vc->vcore_state = VCORE_INACTIVE;
   3683		/* Unlock all except the primary vcore */
   3684		for (sub = 1; sub < core_info.n_subcores; ++sub) {
   3685			pvc = core_info.vc[sub];
   3686			/* Put back on to the preempted vcores list */
   3687			kvmppc_vcore_preempt(pvc);
   3688			spin_unlock(&pvc->lock);
   3689		}
   3690		for (i = 0; i < controlled_threads; ++i)
   3691			kvmppc_release_hwthread(pcpu + i);
   3692		return;
   3693	}
   3694
   3695	kvmppc_clear_host_core(pcpu);
   3696
   3697	/* Decide on micro-threading (split-core) mode */
   3698	subcore_size = threads_per_subcore;
   3699	cmd_bit = stat_bit = 0;
   3700	split = core_info.n_subcores;
   3701	sip = NULL;
   3702	is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);
   3703
   3704	if (split > 1) {
   3705		sip = &split_info;
   3706		memset(&split_info, 0, sizeof(split_info));
   3707		for (sub = 0; sub < core_info.n_subcores; ++sub)
   3708			split_info.vc[sub] = core_info.vc[sub];
   3709
   3710		if (is_power8) {
   3711			if (split == 2 && (dynamic_mt_modes & 2)) {
   3712				cmd_bit = HID0_POWER8_1TO2LPAR;
   3713				stat_bit = HID0_POWER8_2LPARMODE;
   3714			} else {
   3715				split = 4;
   3716				cmd_bit = HID0_POWER8_1TO4LPAR;
   3717				stat_bit = HID0_POWER8_4LPARMODE;
   3718			}
   3719			subcore_size = MAX_SMT_THREADS / split;
   3720			split_info.rpr = mfspr(SPRN_RPR);
   3721			split_info.pmmar = mfspr(SPRN_PMMAR);
   3722			split_info.ldbar = mfspr(SPRN_LDBAR);
   3723			split_info.subcore_size = subcore_size;
   3724		} else {
   3725			split_info.subcore_size = 1;
   3726		}
   3727
   3728		/* order writes to split_info before kvm_split_mode pointer */
   3729		smp_wmb();
   3730	}
   3731
   3732	for (thr = 0; thr < controlled_threads; ++thr) {
   3733		struct paca_struct *paca = paca_ptrs[pcpu + thr];
   3734
   3735		paca->kvm_hstate.napping = 0;
   3736		paca->kvm_hstate.kvm_split_mode = sip;
   3737	}
   3738
   3739	/* Initiate micro-threading (split-core) on POWER8 if required */
   3740	if (cmd_bit) {
   3741		unsigned long hid0 = mfspr(SPRN_HID0);
   3742
   3743		hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
   3744		mb();
   3745		mtspr(SPRN_HID0, hid0);
   3746		isync();
   3747		for (;;) {
   3748			hid0 = mfspr(SPRN_HID0);
   3749			if (hid0 & stat_bit)
   3750				break;
   3751			cpu_relax();
   3752		}
   3753	}
   3754
   3755	/*
   3756	 * On POWER8, set RWMR register.
   3757	 * Since it only affects PURR and SPURR, it doesn't affect
   3758	 * the host, so we don't save/restore the host value.
   3759	 */
   3760	if (is_power8) {
   3761		unsigned long rwmr_val = RWMR_RPA_P8_8THREAD;
   3762		int n_online = atomic_read(&vc->online_count);
   3763
   3764		/*
   3765		 * Use the 8-thread value if we're doing split-core
   3766		 * or if the vcore's online count looks bogus.
   3767		 */
   3768		if (split == 1 && threads_per_subcore == MAX_SMT_THREADS &&
   3769		    n_online >= 1 && n_online <= MAX_SMT_THREADS)
   3770			rwmr_val = p8_rwmr_values[n_online];
   3771		mtspr(SPRN_RWMR, rwmr_val);
   3772	}
   3773
   3774	/* Start all the threads */
   3775	active = 0;
   3776	for (sub = 0; sub < core_info.n_subcores; ++sub) {
   3777		thr = is_power8 ? subcore_thread_map[sub] : sub;
   3778		thr0_done = false;
   3779		active |= 1 << thr;
   3780		pvc = core_info.vc[sub];
   3781		pvc->pcpu = pcpu + thr;
   3782		for_each_runnable_thread(i, vcpu, pvc) {
   3783			/*
   3784			 * XXX: is kvmppc_start_thread called too late here?
   3785			 * It updates vcpu->cpu and vcpu->arch.thread_cpu
   3786			 * which are used by kvmppc_fast_vcpu_kick_hv(), but
   3787			 * kick is called after new exceptions become available
   3788			 * and exceptions are checked earlier than here, by
   3789			 * kvmppc_core_prepare_to_enter.
   3790			 */
   3791			kvmppc_start_thread(vcpu, pvc);
   3792			kvmppc_create_dtl_entry(vcpu, pvc);
   3793			trace_kvm_guest_enter(vcpu);
   3794			if (!vcpu->arch.ptid)
   3795				thr0_done = true;
   3796			active |= 1 << (thr + vcpu->arch.ptid);
   3797		}
   3798		/*
   3799		 * We need to start the first thread of each subcore
   3800		 * even if it doesn't have a vcpu.
   3801		 */
   3802		if (!thr0_done)
   3803			kvmppc_start_thread(NULL, pvc);
   3804	}
   3805
   3806	/*
   3807	 * Ensure that split_info.do_nap is set after setting
   3808	 * the vcore pointer in the PACA of the secondaries.
   3809	 */
   3810	smp_mb();
   3811
   3812	/*
   3813	 * When doing micro-threading, poke the inactive threads as well.
   3814	 * This gets them to the nap instruction after kvm_do_nap,
   3815	 * which reduces the time taken to unsplit later.
   3816	 */
   3817	if (cmd_bit) {
   3818		split_info.do_nap = 1;	/* ask secondaries to nap when done */
   3819		for (thr = 1; thr < threads_per_subcore; ++thr)
   3820			if (!(active & (1 << thr)))
   3821				kvmppc_ipi_thread(pcpu + thr);
   3822	}
   3823
   3824	vc->vcore_state = VCORE_RUNNING;
   3825	preempt_disable();
   3826
   3827	trace_kvmppc_run_core(vc, 0);
   3828
   3829	for (sub = 0; sub < core_info.n_subcores; ++sub)
   3830		spin_unlock(&core_info.vc[sub]->lock);
   3831
   3832	guest_enter_irqoff();
   3833
   3834	srcu_idx = srcu_read_lock(&vc->kvm->srcu);
   3835
   3836	this_cpu_disable_ftrace();
   3837
   3838	/*
   3839	 * Interrupts will be enabled once we get into the guest,
   3840	 * so tell lockdep that we're about to enable interrupts.
   3841	 */
   3842	trace_hardirqs_on();
   3843
   3844	trap = __kvmppc_vcore_entry();
   3845
   3846	trace_hardirqs_off();
   3847
   3848	this_cpu_enable_ftrace();
   3849
   3850	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
   3851
   3852	set_irq_happened(trap);
   3853
   3854	spin_lock(&vc->lock);
   3855	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
   3856	vc->vcore_state = VCORE_EXITING;
   3857
   3858	/* wait for secondary threads to finish writing their state to memory */
   3859	kvmppc_wait_for_nap(controlled_threads);
   3860
   3861	/* Return to whole-core mode if we split the core earlier */
   3862	if (cmd_bit) {
   3863		unsigned long hid0 = mfspr(SPRN_HID0);
   3864		unsigned long loops = 0;
   3865
   3866		hid0 &= ~HID0_POWER8_DYNLPARDIS;
   3867		stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
   3868		mb();
   3869		mtspr(SPRN_HID0, hid0);
   3870		isync();
   3871		for (;;) {
   3872			hid0 = mfspr(SPRN_HID0);
   3873			if (!(hid0 & stat_bit))
   3874				break;
   3875			cpu_relax();
   3876			++loops;
   3877		}
   3878		split_info.do_nap = 0;
   3879	}
   3880
   3881	kvmppc_set_host_core(pcpu);
   3882
   3883	context_tracking_guest_exit();
   3884	if (!vtime_accounting_enabled_this_cpu()) {
   3885		local_irq_enable();
   3886		/*
   3887		 * Service IRQs here before vtime_account_guest_exit() so any
   3888		 * ticks that occurred while running the guest are accounted to
   3889		 * the guest. If vtime accounting is enabled, accounting uses
   3890		 * TB rather than ticks, so it can be done without enabling
   3891		 * interrupts here, which has the problem that it accounts
   3892		 * interrupt processing overhead to the host.
   3893		 */
   3894		local_irq_disable();
   3895	}
   3896	vtime_account_guest_exit();
   3897
   3898	local_irq_enable();
   3899
   3900	/* Let secondaries go back to the offline loop */
   3901	for (i = 0; i < controlled_threads; ++i) {
   3902		kvmppc_release_hwthread(pcpu + i);
   3903		if (sip && sip->napped[i])
   3904			kvmppc_ipi_thread(pcpu + i);
   3905	}
   3906
   3907	spin_unlock(&vc->lock);
   3908
   3909	/* make sure updates to secondary vcpu structs are visible now */
   3910	smp_mb();
   3911
   3912	preempt_enable();
   3913
   3914	for (sub = 0; sub < core_info.n_subcores; ++sub) {
   3915		pvc = core_info.vc[sub];
   3916		post_guest_process(pvc, pvc == vc);
   3917	}
   3918
   3919	spin_lock(&vc->lock);
   3920
   3921 out:
   3922	vc->vcore_state = VCORE_INACTIVE;
   3923	trace_kvmppc_run_core(vc, 1);
   3924}
   3925
   3926static inline bool hcall_is_xics(unsigned long req)
   3927{
   3928	return req == H_EOI || req == H_CPPR || req == H_IPI ||
   3929		req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
   3930}
   3931
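        /*
         * Bump the dispatch (yield) count in the guest's VPA.  This is done
         * once on the way into the guest and once on the way out (see
         * kvmhv_p9_guest_entry()), so the count advances by two per dispatch
         * cycle and guest code (e.g. lock-conferring paths) can use it to
         * detect whether a vcpu has been preempted or re-dispatched since it
         * was last sampled.
         */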
   3932static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu)
   3933{
   3934	struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
   3935	if (lp) {
   3936		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
   3937		lp->yield_count = cpu_to_be32(yield_count);
   3938		vcpu->arch.vpa.dirty = 1;
   3939	}
   3940}
   3941
   3942/* call our hypervisor to load up HV regs and go */
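        /*
         * This is the path taken when KVM itself runs as a guest
         * (kvmhv_on_pseries()): rather than switching the MMU and HV registers
         * ourselves, we pack the guest state into a struct hv_guest_state and
         * ask the hypervisor underneath us to run the vcpu via H_ENTER_NESTED.
         */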
   3943static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb)
   3944{
   3945	struct kvmppc_vcore *vc = vcpu->arch.vcore;
   3946	unsigned long host_psscr;
   3947	unsigned long msr;
   3948	struct hv_guest_state hvregs;
   3949	struct p9_host_os_sprs host_os_sprs;
   3950	s64 dec;
   3951	int trap;
   3952
   3953	msr = mfmsr();
   3954
   3955	save_p9_host_os_sprs(&host_os_sprs);
   3956
   3957	/*
   3958	 * We need to save and restore the guest visible part of the
   3959	 * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
   3960	 * doesn't do this for us. Note only required if pseries since
    3961	 * doesn't do this for us. Note this is only required on pseries,
    3962	 * since otherwise it is done in kvmhv_vcpu_entry_p9() below.
   3963	host_psscr = mfspr(SPRN_PSSCR_PR);
   3964
   3965	kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
   3966	if (lazy_irq_pending())
   3967		return 0;
   3968
   3969	if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
   3970		msr = mfmsr(); /* TM restore can update msr */
   3971
   3972	if (vcpu->arch.psscr != host_psscr)
   3973		mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
   3974
   3975	kvmhv_save_hv_regs(vcpu, &hvregs);
   3976	hvregs.lpcr = lpcr;
   3977	hvregs.amor = ~0;
   3978	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
   3979	hvregs.version = HV_GUEST_STATE_VERSION;
   3980	if (vcpu->arch.nested) {
   3981		hvregs.lpid = vcpu->arch.nested->shadow_lpid;
   3982		hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
   3983	} else {
   3984		hvregs.lpid = vcpu->kvm->arch.lpid;
   3985		hvregs.vcpu_token = vcpu->vcpu_id;
   3986	}
   3987	hvregs.hdec_expiry = time_limit;
   3988
   3989	/*
   3990	 * When setting DEC, we must always deal with irq_work_raise
   3991	 * via NMI vs setting DEC. The problem occurs right as we
    3992	 * switch into guest mode: if an NMI hits and sets pending work
   3993	 * and sets DEC, then that will apply to the guest and not
   3994	 * bring us back to the host.
   3995	 *
   3996	 * irq_work_raise could check a flag (or possibly LPCR[HDICE]
   3997	 * for example) and set HDEC to 1? That wouldn't solve the
   3998	 * nested hv case which needs to abort the hcall or zero the
   3999	 * time limit.
   4000	 *
   4001	 * XXX: Another day's problem.
   4002	 */
   4003	mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb);
   4004
   4005	mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
   4006	mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
   4007	switch_pmu_to_guest(vcpu, &host_os_sprs);
   4008	trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
   4009				  __pa(&vcpu->arch.regs));
   4010	kvmhv_restore_hv_return_state(vcpu, &hvregs);
   4011	switch_pmu_to_host(vcpu, &host_os_sprs);
   4012	vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
   4013	vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
   4014	vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
   4015	vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
   4016
   4017	store_vcpu_state(vcpu);
   4018
   4019	dec = mfspr(SPRN_DEC);
   4020	if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
   4021		dec = (s32) dec;
   4022	*tb = mftb();
   4023	vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset);
   4024
   4025	timer_rearm_host_dec(*tb);
   4026
   4027	restore_p9_host_os_sprs(vcpu, &host_os_sprs);
   4028	if (vcpu->arch.psscr != host_psscr)
   4029		mtspr(SPRN_PSSCR_PR, host_psscr);
   4030
   4031	return trap;
   4032}
   4033
   4034/*
   4035 * Guest entry for POWER9 and later CPUs.
   4036 */
   4037static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
   4038			 unsigned long lpcr, u64 *tb)
   4039{
   4040	struct kvm *kvm = vcpu->kvm;
   4041	struct kvm_nested_guest *nested = vcpu->arch.nested;
   4042	u64 next_timer;
   4043	int trap;
   4044
   4045	next_timer = timer_get_next_tb();
   4046	if (*tb >= next_timer)
   4047		return BOOK3S_INTERRUPT_HV_DECREMENTER;
   4048	if (next_timer < time_limit)
   4049		time_limit = next_timer;
   4050	else if (*tb >= time_limit) /* nested time limit */
   4051		return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
   4052
   4053	vcpu->arch.ceded = 0;
   4054
   4055	vcpu_vpa_increment_dispatch(vcpu);
   4056
   4057	if (kvmhv_on_pseries()) {
   4058		trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
   4059
   4060		/* H_CEDE has to be handled now, not later */
   4061		if (trap == BOOK3S_INTERRUPT_SYSCALL && !nested &&
   4062		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
   4063			kvmppc_cede(vcpu);
   4064			kvmppc_set_gpr(vcpu, 3, 0);
   4065			trap = 0;
   4066		}
   4067
   4068	} else if (nested) {
   4069		__this_cpu_write(cpu_in_guest, kvm);
   4070		trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
   4071		__this_cpu_write(cpu_in_guest, NULL);
   4072
   4073	} else {
   4074		kvmppc_xive_push_vcpu(vcpu);
   4075
   4076		__this_cpu_write(cpu_in_guest, kvm);
   4077		trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
   4078		__this_cpu_write(cpu_in_guest, NULL);
   4079
   4080		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
   4081		    !(vcpu->arch.shregs.msr & MSR_PR)) {
   4082			unsigned long req = kvmppc_get_gpr(vcpu, 3);
   4083
   4084			/*
   4085			 * XIVE rearm and XICS hcalls must be handled
   4086			 * before xive context is pulled (is this
   4087			 * true?)
   4088			 */
   4089			if (req == H_CEDE) {
   4090				/* H_CEDE has to be handled now */
   4091				kvmppc_cede(vcpu);
   4092				if (!kvmppc_xive_rearm_escalation(vcpu)) {
   4093					/*
   4094					 * Pending escalation so abort
   4095					 * the cede.
   4096					 */
   4097					vcpu->arch.ceded = 0;
   4098				}
   4099				kvmppc_set_gpr(vcpu, 3, 0);
   4100				trap = 0;
   4101
   4102			} else if (req == H_ENTER_NESTED) {
   4103				/*
   4104				 * L2 should not run with the L1
   4105				 * context so rearm and pull it.
   4106				 */
   4107				if (!kvmppc_xive_rearm_escalation(vcpu)) {
   4108					/*
   4109					 * Pending escalation so abort
   4110					 * H_ENTER_NESTED.
   4111					 */
   4112					kvmppc_set_gpr(vcpu, 3, 0);
   4113					trap = 0;
   4114				}
   4115
   4116			} else if (hcall_is_xics(req)) {
   4117				int ret;
   4118
   4119				ret = kvmppc_xive_xics_hcall(vcpu, req);
   4120				if (ret != H_TOO_HARD) {
   4121					kvmppc_set_gpr(vcpu, 3, ret);
   4122					trap = 0;
   4123				}
   4124			}
   4125		}
   4126		kvmppc_xive_pull_vcpu(vcpu);
   4127
   4128		if (kvm_is_radix(kvm))
   4129			vcpu->arch.slb_max = 0;
   4130	}
   4131
   4132	vcpu_vpa_increment_dispatch(vcpu);
   4133
   4134	return trap;
   4135}
   4136
   4137/*
   4138 * Wait for some other vcpu thread to execute us, and
   4139 * wake us up when we need to handle something in the host.
   4140 */
   4141static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
   4142				 struct kvm_vcpu *vcpu, int wait_state)
   4143{
   4144	DEFINE_WAIT(wait);
   4145
   4146	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
   4147	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
   4148		spin_unlock(&vc->lock);
   4149		schedule();
   4150		spin_lock(&vc->lock);
   4151	}
   4152	finish_wait(&vcpu->arch.cpu_run, &wait);
   4153}
   4154
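        /*
         * The vcore-level halt-polling window follows the usual heuristic:
         * it grows when a somewhat longer poll would have avoided sleeping,
         * shrinks (or resets to zero) when the vcore slept for longer than
         * the halt_poll_ns maximum anyway, and is always capped at
         * halt_poll_ns; see the adjustment at the end of
         * kvmppc_vcore_blocked().
         */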
   4155static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
   4156{
   4157	if (!halt_poll_ns_grow)
   4158		return;
   4159
   4160	vc->halt_poll_ns *= halt_poll_ns_grow;
   4161	if (vc->halt_poll_ns < halt_poll_ns_grow_start)
   4162		vc->halt_poll_ns = halt_poll_ns_grow_start;
   4163}
   4164
   4165static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
   4166{
   4167	if (halt_poll_ns_shrink == 0)
   4168		vc->halt_poll_ns = 0;
   4169	else
   4170		vc->halt_poll_ns /= halt_poll_ns_shrink;
   4171}
   4172
   4173#ifdef CONFIG_KVM_XICS
   4174static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
   4175{
   4176	if (!xics_on_xive())
   4177		return false;
   4178	return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
   4179		vcpu->arch.xive_saved_state.cppr;
   4180}
   4181#else
   4182static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
   4183{
   4184	return false;
   4185}
   4186#endif /* CONFIG_KVM_XICS */
   4187
   4188static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
   4189{
   4190	if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
   4191	    kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
   4192		return true;
   4193
   4194	return false;
   4195}
   4196
   4197static bool kvmppc_vcpu_check_block(struct kvm_vcpu *vcpu)
   4198{
   4199	if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
   4200		return true;
   4201	return false;
   4202}
   4203
   4204/*
   4205 * Check to see if any of the runnable vcpus on the vcore have pending
   4206 * exceptions or are no longer ceded
   4207 */
   4208static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
   4209{
   4210	struct kvm_vcpu *vcpu;
   4211	int i;
   4212
   4213	for_each_runnable_thread(i, vcpu, vc) {
   4214		if (kvmppc_vcpu_check_block(vcpu))
   4215			return 1;
   4216	}
   4217
   4218	return 0;
   4219}
   4220
   4221/*
   4222 * All the vcpus in this vcore are idle, so wait for a decrementer
   4223 * or external interrupt to one of the vcpus.  vc->lock is held.
   4224 */
   4225static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
   4226{
   4227	ktime_t cur, start_poll, start_wait;
   4228	int do_sleep = 1;
   4229	u64 block_ns;
   4230
   4231	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
   4232
   4233	/* Poll for pending exceptions and ceded state */
   4234	cur = start_poll = ktime_get();
   4235	if (vc->halt_poll_ns) {
   4236		ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
   4237		++vc->runner->stat.generic.halt_attempted_poll;
   4238
   4239		vc->vcore_state = VCORE_POLLING;
   4240		spin_unlock(&vc->lock);
   4241
   4242		do {
   4243			if (kvmppc_vcore_check_block(vc)) {
   4244				do_sleep = 0;
   4245				break;
   4246			}
   4247			cur = ktime_get();
   4248		} while (kvm_vcpu_can_poll(cur, stop));
   4249
   4250		spin_lock(&vc->lock);
   4251		vc->vcore_state = VCORE_INACTIVE;
   4252
   4253		if (!do_sleep) {
   4254			++vc->runner->stat.generic.halt_successful_poll;
   4255			goto out;
   4256		}
   4257	}
   4258
   4259	prepare_to_rcuwait(&vc->wait);
   4260	set_current_state(TASK_INTERRUPTIBLE);
   4261	if (kvmppc_vcore_check_block(vc)) {
   4262		finish_rcuwait(&vc->wait);
   4263		do_sleep = 0;
   4264		/* If we polled, count this as a successful poll */
   4265		if (vc->halt_poll_ns)
   4266			++vc->runner->stat.generic.halt_successful_poll;
   4267		goto out;
   4268	}
   4269
   4270	start_wait = ktime_get();
   4271
   4272	vc->vcore_state = VCORE_SLEEPING;
   4273	trace_kvmppc_vcore_blocked(vc->runner, 0);
   4274	spin_unlock(&vc->lock);
   4275	schedule();
   4276	finish_rcuwait(&vc->wait);
   4277	spin_lock(&vc->lock);
   4278	vc->vcore_state = VCORE_INACTIVE;
   4279	trace_kvmppc_vcore_blocked(vc->runner, 1);
   4280	++vc->runner->stat.halt_successful_wait;
   4281
   4282	cur = ktime_get();
   4283
   4284out:
   4285	block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
   4286
   4287	/* Attribute wait time */
   4288	if (do_sleep) {
   4289		vc->runner->stat.generic.halt_wait_ns +=
   4290			ktime_to_ns(cur) - ktime_to_ns(start_wait);
   4291		KVM_STATS_LOG_HIST_UPDATE(
   4292				vc->runner->stat.generic.halt_wait_hist,
   4293				ktime_to_ns(cur) - ktime_to_ns(start_wait));
   4294		/* Attribute failed poll time */
   4295		if (vc->halt_poll_ns) {
   4296			vc->runner->stat.generic.halt_poll_fail_ns +=
   4297				ktime_to_ns(start_wait) -
   4298				ktime_to_ns(start_poll);
   4299			KVM_STATS_LOG_HIST_UPDATE(
   4300				vc->runner->stat.generic.halt_poll_fail_hist,
   4301				ktime_to_ns(start_wait) -
   4302				ktime_to_ns(start_poll));
   4303		}
   4304	} else {
   4305		/* Attribute successful poll time */
   4306		if (vc->halt_poll_ns) {
   4307			vc->runner->stat.generic.halt_poll_success_ns +=
   4308				ktime_to_ns(cur) -
   4309				ktime_to_ns(start_poll);
   4310			KVM_STATS_LOG_HIST_UPDATE(
   4311				vc->runner->stat.generic.halt_poll_success_hist,
   4312				ktime_to_ns(cur) - ktime_to_ns(start_poll));
   4313		}
   4314	}
   4315
   4316	/* Adjust poll time */
   4317	if (halt_poll_ns) {
   4318		if (block_ns <= vc->halt_poll_ns)
   4319			;
   4320		/* We slept and blocked for longer than the max halt time */
   4321		else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
   4322			shrink_halt_poll_ns(vc);
   4323		/* We slept and our poll time is too small */
   4324		else if (vc->halt_poll_ns < halt_poll_ns &&
   4325				block_ns < halt_poll_ns)
   4326			grow_halt_poll_ns(vc);
   4327		if (vc->halt_poll_ns > halt_poll_ns)
   4328			vc->halt_poll_ns = halt_poll_ns;
   4329	} else
   4330		vc->halt_poll_ns = 0;
   4331
   4332	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
   4333}
   4334
   4335/*
   4336 * This never fails for a radix guest, as none of the operations it does
   4337 * for a radix guest can fail or have a way to report failure.
   4338 */
   4339static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
   4340{
   4341	int r = 0;
   4342	struct kvm *kvm = vcpu->kvm;
   4343
   4344	mutex_lock(&kvm->arch.mmu_setup_lock);
   4345	if (!kvm->arch.mmu_ready) {
   4346		if (!kvm_is_radix(kvm))
   4347			r = kvmppc_hv_setup_htab_rma(vcpu);
   4348		if (!r) {
   4349			if (cpu_has_feature(CPU_FTR_ARCH_300))
   4350				kvmppc_setup_partition_table(kvm);
   4351			kvm->arch.mmu_ready = 1;
   4352		}
   4353	}
   4354	mutex_unlock(&kvm->arch.mmu_setup_lock);
   4355	return r;
   4356}
   4357
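        /*
         * Entry point for a vcpu task on pre-POWER9 hardware.  Each task adds
         * its vcpu to the vcore and then either joins a run that is already in
         * progress, sleeps until another task runs it, or becomes the "runner"
         * that calls kvmppc_run_core() on behalf of all runnable threads of
         * the vcore.
         */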
   4358static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
   4359{
   4360	struct kvm_run *run = vcpu->run;
   4361	int n_ceded, i, r;
   4362	struct kvmppc_vcore *vc;
   4363	struct kvm_vcpu *v;
   4364
   4365	trace_kvmppc_run_vcpu_enter(vcpu);
   4366
   4367	run->exit_reason = 0;
   4368	vcpu->arch.ret = RESUME_GUEST;
   4369	vcpu->arch.trap = 0;
   4370	kvmppc_update_vpas(vcpu);
   4371
   4372	/*
   4373	 * Synchronize with other threads in this virtual core
   4374	 */
   4375	vc = vcpu->arch.vcore;
   4376	spin_lock(&vc->lock);
   4377	vcpu->arch.ceded = 0;
   4378	vcpu->arch.run_task = current;
   4379	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
   4380	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
   4381	vcpu->arch.busy_preempt = TB_NIL;
   4382	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
   4383	++vc->n_runnable;
   4384
   4385	/*
   4386	 * This happens the first time this is called for a vcpu.
   4387	 * If the vcore is already running, we may be able to start
   4388	 * this thread straight away and have it join in.
   4389	 */
   4390	if (!signal_pending(current)) {
   4391		if ((vc->vcore_state == VCORE_PIGGYBACK ||
   4392		     vc->vcore_state == VCORE_RUNNING) &&
   4393			   !VCORE_IS_EXITING(vc)) {
   4394			kvmppc_create_dtl_entry(vcpu, vc);
   4395			kvmppc_start_thread(vcpu, vc);
   4396			trace_kvm_guest_enter(vcpu);
   4397		} else if (vc->vcore_state == VCORE_SLEEPING) {
    4398			rcuwait_wake_up(&vc->wait);
   4399		}
   4400
   4401	}
   4402
   4403	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
   4404	       !signal_pending(current)) {
   4405		/* See if the MMU is ready to go */
   4406		if (!vcpu->kvm->arch.mmu_ready) {
   4407			spin_unlock(&vc->lock);
   4408			r = kvmhv_setup_mmu(vcpu);
   4409			spin_lock(&vc->lock);
   4410			if (r) {
   4411				run->exit_reason = KVM_EXIT_FAIL_ENTRY;
   4412				run->fail_entry.
   4413					hardware_entry_failure_reason = 0;
   4414				vcpu->arch.ret = r;
   4415				break;
   4416			}
   4417		}
   4418
   4419		if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
   4420			kvmppc_vcore_end_preempt(vc);
   4421
   4422		if (vc->vcore_state != VCORE_INACTIVE) {
   4423			kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
   4424			continue;
   4425		}
   4426		for_each_runnable_thread(i, v, vc) {
   4427			kvmppc_core_prepare_to_enter(v);
   4428			if (signal_pending(v->arch.run_task)) {
   4429				kvmppc_remove_runnable(vc, v, mftb());
   4430				v->stat.signal_exits++;
   4431				v->run->exit_reason = KVM_EXIT_INTR;
   4432				v->arch.ret = -EINTR;
   4433				wake_up(&v->arch.cpu_run);
   4434			}
   4435		}
   4436		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
   4437			break;
   4438		n_ceded = 0;
   4439		for_each_runnable_thread(i, v, vc) {
   4440			if (!kvmppc_vcpu_woken(v))
   4441				n_ceded += v->arch.ceded;
   4442			else
   4443				v->arch.ceded = 0;
   4444		}
   4445		vc->runner = vcpu;
   4446		if (n_ceded == vc->n_runnable) {
   4447			kvmppc_vcore_blocked(vc);
   4448		} else if (need_resched()) {
   4449			kvmppc_vcore_preempt(vc);
   4450			/* Let something else run */
   4451			cond_resched_lock(&vc->lock);
   4452			if (vc->vcore_state == VCORE_PREEMPT)
   4453				kvmppc_vcore_end_preempt(vc);
   4454		} else {
   4455			kvmppc_run_core(vc);
   4456		}
   4457		vc->runner = NULL;
   4458	}
   4459
   4460	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
   4461	       (vc->vcore_state == VCORE_RUNNING ||
   4462		vc->vcore_state == VCORE_EXITING ||
   4463		vc->vcore_state == VCORE_PIGGYBACK))
   4464		kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
   4465
   4466	if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
   4467		kvmppc_vcore_end_preempt(vc);
   4468
   4469	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
   4470		kvmppc_remove_runnable(vc, vcpu, mftb());
   4471		vcpu->stat.signal_exits++;
   4472		run->exit_reason = KVM_EXIT_INTR;
   4473		vcpu->arch.ret = -EINTR;
   4474	}
   4475
   4476	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
   4477		/* Wake up some vcpu to run the core */
   4478		i = -1;
   4479		v = next_runnable_thread(vc, &i);
   4480		wake_up(&v->arch.cpu_run);
   4481	}
   4482
   4483	trace_kvmppc_run_vcpu_exit(vcpu);
   4484	spin_unlock(&vc->lock);
   4485	return vcpu->arch.ret;
   4486}
   4487
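        /*
         * Entry path for POWER9 and later, where each vcpu gets its own
         * hardware thread and no vcore-wide runner is needed: this task enters
         * the guest directly via kvmhv_p9_guest_entry() and handles its own
         * exits.
         */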
   4488int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
   4489			  unsigned long lpcr)
   4490{
   4491	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
   4492	struct kvm_run *run = vcpu->run;
   4493	int trap, r, pcpu;
   4494	int srcu_idx;
   4495	struct kvmppc_vcore *vc;
   4496	struct kvm *kvm = vcpu->kvm;
   4497	struct kvm_nested_guest *nested = vcpu->arch.nested;
   4498	unsigned long flags;
   4499	u64 tb;
   4500
   4501	trace_kvmppc_run_vcpu_enter(vcpu);
   4502
   4503	run->exit_reason = 0;
   4504	vcpu->arch.ret = RESUME_GUEST;
   4505	vcpu->arch.trap = 0;
   4506
   4507	vc = vcpu->arch.vcore;
   4508	vcpu->arch.ceded = 0;
   4509	vcpu->arch.run_task = current;
   4510	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
   4511	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
   4512
   4513	/* See if the MMU is ready to go */
   4514	if (unlikely(!kvm->arch.mmu_ready)) {
   4515		r = kvmhv_setup_mmu(vcpu);
   4516		if (r) {
   4517			run->exit_reason = KVM_EXIT_FAIL_ENTRY;
   4518			run->fail_entry.hardware_entry_failure_reason = 0;
   4519			vcpu->arch.ret = r;
   4520			return r;
   4521		}
   4522	}
   4523
   4524	if (need_resched())
   4525		cond_resched();
   4526
   4527	kvmppc_update_vpas(vcpu);
   4528
   4529	preempt_disable();
   4530	pcpu = smp_processor_id();
   4531	if (kvm_is_radix(kvm))
   4532		kvmppc_prepare_radix_vcpu(vcpu, pcpu);
   4533
   4534	/* flags save not required, but irq_pmu has no disable/enable API */
   4535	powerpc_local_irq_pmu_save(flags);
   4536
   4537	if (signal_pending(current))
   4538		goto sigpend;
   4539	if (need_resched() || !kvm->arch.mmu_ready)
   4540		goto out;
   4541
   4542	vcpu->cpu = pcpu;
   4543	vcpu->arch.thread_cpu = pcpu;
   4544	vc->pcpu = pcpu;
   4545	local_paca->kvm_hstate.kvm_vcpu = vcpu;
   4546	local_paca->kvm_hstate.ptid = 0;
   4547	local_paca->kvm_hstate.fake_suspend = 0;
   4548
   4549	/*
   4550	 * Orders set cpu/thread_cpu vs testing for pending interrupts and
   4551	 * doorbells below. The other side is when these fields are set vs
   4552	 * kvmppc_fast_vcpu_kick_hv reading the cpu/thread_cpu fields to
   4553	 * kick a vCPU to notice the pending interrupt.
   4554	 */
   4555	smp_mb();
   4556
   4557	if (!nested) {
   4558		kvmppc_core_prepare_to_enter(vcpu);
   4559		if (vcpu->arch.shregs.msr & MSR_EE) {
   4560			if (xive_interrupt_pending(vcpu))
   4561				kvmppc_inject_interrupt_hv(vcpu,
   4562						BOOK3S_INTERRUPT_EXTERNAL, 0);
   4563		} else if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
   4564			     &vcpu->arch.pending_exceptions)) {
   4565			lpcr |= LPCR_MER;
   4566		}
   4567	} else if (vcpu->arch.pending_exceptions ||
   4568		   vcpu->arch.doorbell_request ||
   4569		   xive_interrupt_pending(vcpu)) {
   4570		vcpu->arch.ret = RESUME_HOST;
   4571		goto out;
   4572	}
   4573
   4574	if (vcpu->arch.timer_running) {
   4575		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
   4576		vcpu->arch.timer_running = 0;
   4577	}
   4578
   4579	tb = mftb();
   4580
   4581	__kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0);
   4582
   4583	trace_kvm_guest_enter(vcpu);
   4584
   4585	guest_enter_irqoff();
   4586
   4587	srcu_idx = srcu_read_lock(&kvm->srcu);
   4588
   4589	this_cpu_disable_ftrace();
   4590
   4591	/* Tell lockdep that we're about to enable interrupts */
   4592	trace_hardirqs_on();
   4593
   4594	trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, &tb);
   4595	vcpu->arch.trap = trap;
   4596
   4597	trace_hardirqs_off();
   4598
   4599	this_cpu_enable_ftrace();
   4600
   4601	srcu_read_unlock(&kvm->srcu, srcu_idx);
   4602
   4603	set_irq_happened(trap);
   4604
   4605	context_tracking_guest_exit();
   4606	if (!vtime_accounting_enabled_this_cpu()) {
   4607		local_irq_enable();
   4608		/*
   4609		 * Service IRQs here before vtime_account_guest_exit() so any
   4610		 * ticks that occurred while running the guest are accounted to
   4611		 * the guest. If vtime accounting is enabled, accounting uses
   4612		 * TB rather than ticks, so it can be done without enabling
   4613		 * interrupts here, which has the problem that it accounts
   4614		 * interrupt processing overhead to the host.
   4615		 */
   4616		local_irq_disable();
   4617	}
   4618	vtime_account_guest_exit();
   4619
   4620	vcpu->cpu = -1;
   4621	vcpu->arch.thread_cpu = -1;
   4622
   4623	powerpc_local_irq_pmu_restore(flags);
   4624
   4625	preempt_enable();
   4626
   4627	/*
   4628	 * cancel pending decrementer exception if DEC is now positive, or if
   4629	 * entering a nested guest in which case the decrementer is now owned
   4630	 * by L2 and the L1 decrementer is provided in hdec_expires
   4631	 */
   4632	if (kvmppc_core_pending_dec(vcpu) &&
   4633			((tb < kvmppc_dec_expires_host_tb(vcpu)) ||
   4634			 (trap == BOOK3S_INTERRUPT_SYSCALL &&
   4635			  kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
   4636		kvmppc_core_dequeue_dec(vcpu);
   4637
   4638	trace_kvm_guest_exit(vcpu);
   4639	r = RESUME_GUEST;
   4640	if (trap) {
   4641		if (!nested)
   4642			r = kvmppc_handle_exit_hv(vcpu, current);
   4643		else
   4644			r = kvmppc_handle_nested_exit(vcpu);
   4645	}
   4646	vcpu->arch.ret = r;
   4647
   4648	if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
   4649		kvmppc_set_timer(vcpu);
   4650
   4651		prepare_to_rcuwait(wait);
   4652		for (;;) {
   4653			set_current_state(TASK_INTERRUPTIBLE);
   4654			if (signal_pending(current)) {
   4655				vcpu->stat.signal_exits++;
   4656				run->exit_reason = KVM_EXIT_INTR;
   4657				vcpu->arch.ret = -EINTR;
   4658				break;
   4659			}
   4660
   4661			if (kvmppc_vcpu_check_block(vcpu))
   4662				break;
   4663
   4664			trace_kvmppc_vcore_blocked(vcpu, 0);
   4665			schedule();
   4666			trace_kvmppc_vcore_blocked(vcpu, 1);
   4667		}
   4668		finish_rcuwait(wait);
   4669	}
   4670	vcpu->arch.ceded = 0;
   4671
   4672 done:
   4673	trace_kvmppc_run_vcpu_exit(vcpu);
   4674
   4675	return vcpu->arch.ret;
   4676
   4677 sigpend:
   4678	vcpu->stat.signal_exits++;
   4679	run->exit_reason = KVM_EXIT_INTR;
   4680	vcpu->arch.ret = -EINTR;
   4681 out:
   4682	vcpu->cpu = -1;
   4683	vcpu->arch.thread_cpu = -1;
   4684	powerpc_local_irq_pmu_restore(flags);
   4685	preempt_enable();
   4686	goto done;
   4687}
   4688
   4689static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
   4690{
   4691	struct kvm_run *run = vcpu->run;
   4692	int r;
   4693	int srcu_idx;
   4694	struct kvm *kvm;
   4695	unsigned long msr;
   4696
   4697	if (!vcpu->arch.sane) {
   4698		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
   4699		return -EINVAL;
   4700	}
   4701
   4702	/* No need to go into the guest when all we'll do is come back out */
   4703	if (signal_pending(current)) {
   4704		run->exit_reason = KVM_EXIT_INTR;
   4705		return -EINTR;
   4706	}
   4707
   4708#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
   4709	/*
   4710	 * Don't allow entry with a suspended transaction, because
   4711	 * the guest entry/exit code will lose it.
   4712	 */
   4713	if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
   4714	    (current->thread.regs->msr & MSR_TM)) {
   4715		if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
   4716			run->exit_reason = KVM_EXIT_FAIL_ENTRY;
   4717			run->fail_entry.hardware_entry_failure_reason = 0;
   4718			return -EINVAL;
   4719		}
   4720	}
   4721#endif
   4722
   4723	/*
   4724	 * Force online to 1 for the sake of old userspace which doesn't
   4725	 * set it.
   4726	 */
   4727	if (!vcpu->arch.online) {
   4728		atomic_inc(&vcpu->arch.vcore->online_count);
   4729		vcpu->arch.online = 1;
   4730	}
   4731
   4732	kvmppc_core_prepare_to_enter(vcpu);
   4733
   4734	kvm = vcpu->kvm;
   4735	atomic_inc(&kvm->arch.vcpus_running);
   4736	/* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
   4737	smp_mb();
   4738
   4739	msr = 0;
   4740	if (IS_ENABLED(CONFIG_PPC_FPU))
   4741		msr |= MSR_FP;
   4742	if (cpu_has_feature(CPU_FTR_ALTIVEC))
   4743		msr |= MSR_VEC;
   4744	if (cpu_has_feature(CPU_FTR_VSX))
   4745		msr |= MSR_VSX;
   4746	if ((cpu_has_feature(CPU_FTR_TM) ||
   4747	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
   4748			(vcpu->arch.hfscr & HFSCR_TM))
   4749		msr |= MSR_TM;
   4750	msr = msr_check_and_set(msr);
   4751
   4752	kvmppc_save_user_regs();
   4753
   4754	kvmppc_save_current_sprs();
   4755
   4756	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   4757		vcpu->arch.waitp = &vcpu->arch.vcore->wait;
   4758	vcpu->arch.pgdir = kvm->mm->pgd;
   4759	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
   4760
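        	/*
        	 * Keep (re-)entering the guest until an exit has to be handled by
        	 * userspace: hypercalls that the kernel can complete itself, guest
        	 * page faults and XICS passthrough completions are dealt with right
        	 * here in the loop.
        	 */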
   4761	do {
   4762		if (cpu_has_feature(CPU_FTR_ARCH_300))
   4763			r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
   4764						  vcpu->arch.vcore->lpcr);
   4765		else
   4766			r = kvmppc_run_vcpu(vcpu);
   4767
   4768		if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
   4769			if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
   4770				/*
    4771				 * These should have been caught and reflected
   4772				 * into the guest by now. Final sanity check:
   4773				 * don't allow userspace to execute hcalls in
   4774				 * the hypervisor.
   4775				 */
   4776				r = RESUME_GUEST;
   4777				continue;
   4778			}
   4779			trace_kvm_hcall_enter(vcpu);
   4780			r = kvmppc_pseries_do_hcall(vcpu);
   4781			trace_kvm_hcall_exit(vcpu, r);
   4782			kvmppc_core_prepare_to_enter(vcpu);
   4783		} else if (r == RESUME_PAGE_FAULT) {
   4784			srcu_idx = srcu_read_lock(&kvm->srcu);
   4785			r = kvmppc_book3s_hv_page_fault(vcpu,
   4786				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
   4787			srcu_read_unlock(&kvm->srcu, srcu_idx);
   4788		} else if (r == RESUME_PASSTHROUGH) {
   4789			if (WARN_ON(xics_on_xive()))
   4790				r = H_SUCCESS;
   4791			else
   4792				r = kvmppc_xics_rm_complete(vcpu, 0);
   4793		}
   4794	} while (is_kvmppc_resume_guest(r));
   4795
   4796	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
   4797	atomic_dec(&kvm->arch.vcpus_running);
   4798
   4799	srr_regs_clobbered();
   4800
   4801	return r;
   4802}
   4803
   4804static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
   4805				     int shift, int sllp)
   4806{
   4807	(*sps)->page_shift = shift;
   4808	(*sps)->slb_enc = sllp;
   4809	(*sps)->enc[0].page_shift = shift;
   4810	(*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
   4811	/*
   4812	 * Add 16MB MPSS support (may get filtered out by userspace)
   4813	 */
   4814	if (shift != 24) {
   4815		int penc = kvmppc_pgsize_lp_encoding(shift, 24);
   4816		if (penc != -1) {
   4817			(*sps)->enc[1].page_shift = 24;
   4818			(*sps)->enc[1].pte_enc = penc;
   4819		}
   4820	}
   4821	(*sps)++;
   4822}
   4823
   4824static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
   4825					 struct kvm_ppc_smmu_info *info)
   4826{
   4827	struct kvm_ppc_one_seg_page_size *sps;
   4828
   4829	/*
   4830	 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
   4831	 * POWER7 doesn't support keys for instruction accesses,
   4832	 * POWER8 and POWER9 do.
   4833	 */
   4834	info->data_keys = 32;
   4835	info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
   4836
   4837	/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
   4838	info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
   4839	info->slb_size = 32;
   4840
    4841	/* We only support these sizes for now, and no multi-size segments */
   4842	sps = &info->sps[0];
   4843	kvmppc_add_seg_page_size(&sps, 12, 0);
   4844	kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
   4845	kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
   4846
   4847	/* If running as a nested hypervisor, we don't support HPT guests */
   4848	if (kvmhv_on_pseries())
   4849		info->flags |= KVM_PPC_NO_HASH;
   4850
   4851	return 0;
   4852}
   4853
   4854/*
   4855 * Get (and clear) the dirty memory log for a memory slot.
   4856 */
   4857static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
   4858					 struct kvm_dirty_log *log)
   4859{
   4860	struct kvm_memslots *slots;
   4861	struct kvm_memory_slot *memslot;
   4862	int r;
   4863	unsigned long n, i;
   4864	unsigned long *buf, *p;
   4865	struct kvm_vcpu *vcpu;
   4866
   4867	mutex_lock(&kvm->slots_lock);
   4868
   4869	r = -EINVAL;
   4870	if (log->slot >= KVM_USER_MEM_SLOTS)
   4871		goto out;
   4872
   4873	slots = kvm_memslots(kvm);
   4874	memslot = id_to_memslot(slots, log->slot);
   4875	r = -ENOENT;
   4876	if (!memslot || !memslot->dirty_bitmap)
   4877		goto out;
   4878
   4879	/*
   4880	 * Use second half of bitmap area because both HPT and radix
   4881	 * accumulate bits in the first half.
   4882	 */
   4883	n = kvm_dirty_bitmap_bytes(memslot);
   4884	buf = memslot->dirty_bitmap + n / sizeof(long);
   4885	memset(buf, 0, n);
   4886
   4887	if (kvm_is_radix(kvm))
   4888		r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
   4889	else
   4890		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
   4891	if (r)
   4892		goto out;
   4893
   4894	/*
   4895	 * We accumulate dirty bits in the first half of the
   4896	 * memslot's dirty_bitmap area, for when pages are paged
   4897	 * out or modified by the host directly.  Pick up these
   4898	 * bits and add them to the map.
   4899	 */
   4900	p = memslot->dirty_bitmap;
   4901	for (i = 0; i < n / sizeof(long); ++i)
   4902		buf[i] |= xchg(&p[i], 0);
   4903
   4904	/* Harvest dirty bits from VPA and DTL updates */
   4905	/* Note: we never modify the SLB shadow buffer areas */
   4906	kvm_for_each_vcpu(i, vcpu, kvm) {
   4907		spin_lock(&vcpu->arch.vpa_update_lock);
   4908		kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
   4909		kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
   4910		spin_unlock(&vcpu->arch.vpa_update_lock);
   4911	}
   4912
   4913	r = -EFAULT;
   4914	if (copy_to_user(log->dirty_bitmap, buf, n))
   4915		goto out;
   4916
   4917	r = 0;
   4918out:
   4919	mutex_unlock(&kvm->slots_lock);
   4920	return r;
   4921}
   4922
   4923static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
   4924{
   4925	vfree(slot->arch.rmap);
   4926	slot->arch.rmap = NULL;
   4927}
   4928
   4929static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
   4930				const struct kvm_memory_slot *old,
   4931				struct kvm_memory_slot *new,
   4932				enum kvm_mr_change change)
   4933{
   4934	if (change == KVM_MR_CREATE) {
   4935		unsigned long size = array_size(new->npages, sizeof(*new->arch.rmap));
   4936
   4937		if ((size >> PAGE_SHIFT) > totalram_pages())
   4938			return -ENOMEM;
   4939
   4940		new->arch.rmap = vzalloc(size);
   4941		if (!new->arch.rmap)
   4942			return -ENOMEM;
   4943	} else if (change != KVM_MR_DELETE) {
   4944		new->arch.rmap = old->arch.rmap;
   4945	}
   4946
   4947	return 0;
   4948}
   4949
   4950static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
   4951				struct kvm_memory_slot *old,
   4952				const struct kvm_memory_slot *new,
   4953				enum kvm_mr_change change)
   4954{
   4955	/*
   4956	 * If we are creating or modifying a memslot, it might make
   4957	 * some address that was previously cached as emulated
   4958	 * MMIO be no longer emulated MMIO, so invalidate
   4959	 * all the caches of emulated MMIO translations.
   4960	 */
   4961	if (change != KVM_MR_DELETE)
   4962		atomic64_inc(&kvm->arch.mmio_update);
   4963
   4964	/*
   4965	 * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
   4966	 * have already called kvm_arch_flush_shadow_memslot() to
   4967	 * flush shadow mappings.  For KVM_MR_CREATE we have no
   4968	 * previous mappings.  So the only case to handle is
   4969	 * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
   4970	 * has been changed.
   4971	 * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
   4972	 * to get rid of any THP PTEs in the partition-scoped page tables
   4973	 * so we can track dirtiness at the page level; we flush when
   4974	 * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
   4975	 * using THP PTEs.
   4976	 */
   4977	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
   4978	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
   4979		kvmppc_radix_flush_memslot(kvm, old);
   4980	/*
   4981	 * If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
   4982	 */
   4983	if (!kvm->arch.secure_guest)
   4984		return;
   4985
   4986	switch (change) {
   4987	case KVM_MR_CREATE:
   4988		/*
   4989		 * @TODO kvmppc_uvmem_memslot_create() can fail and
   4990		 * return error. Fix this.
   4991		 */
   4992		kvmppc_uvmem_memslot_create(kvm, new);
   4993		break;
   4994	case KVM_MR_DELETE:
   4995		kvmppc_uvmem_memslot_delete(kvm, old);
   4996		break;
   4997	default:
   4998		/* TODO: Handle KVM_MR_MOVE */
   4999		break;
   5000	}
   5001}
   5002
   5003/*
   5004 * Update LPCR values in kvm->arch and in vcores.
   5005 * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
   5006 * of kvm->arch.lpcr update).
   5007 */
   5008void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
   5009{
   5010	long int i;
   5011	u32 cores_done = 0;
   5012
   5013	if ((kvm->arch.lpcr & mask) == lpcr)
   5014		return;
   5015
   5016	kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
   5017
   5018	for (i = 0; i < KVM_MAX_VCORES; ++i) {
   5019		struct kvmppc_vcore *vc = kvm->arch.vcores[i];
   5020		if (!vc)
   5021			continue;
   5022
   5023		spin_lock(&vc->lock);
   5024		vc->lpcr = (vc->lpcr & ~mask) | lpcr;
   5025		verify_lpcr(kvm, vc->lpcr);
   5026		spin_unlock(&vc->lock);
   5027		if (++cores_done >= kvm->arch.online_vcores)
   5028			break;
   5029	}
   5030}
   5031
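        /*
         * Write this guest's partition-table entry.  The first doubleword
         * describes the guest's translation (HPT origin and size plus the VRMA
         * page size for a hash guest, or the radix tree root for a radix
         * guest); the second doubleword points at the guest's process table.
         */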
   5032void kvmppc_setup_partition_table(struct kvm *kvm)
   5033{
   5034	unsigned long dw0, dw1;
   5035
   5036	if (!kvm_is_radix(kvm)) {
   5037		/* PS field - page size for VRMA */
   5038		dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
   5039			((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
   5040		/* HTABSIZE and HTABORG fields */
   5041		dw0 |= kvm->arch.sdr1;
   5042
   5043		/* Second dword as set by userspace */
   5044		dw1 = kvm->arch.process_table;
   5045	} else {
   5046		dw0 = PATB_HR | radix__get_tree_size() |
   5047			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
   5048		dw1 = PATB_GR | kvm->arch.process_table;
   5049	}
   5050	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
   5051}
   5052
   5053/*
   5054 * Set up HPT (hashed page table) and RMA (real-mode area).
   5055 * Must be called with kvm->arch.mmu_setup_lock held.
   5056 */
   5057static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
   5058{
   5059	int err = 0;
   5060	struct kvm *kvm = vcpu->kvm;
   5061	unsigned long hva;
   5062	struct kvm_memory_slot *memslot;
   5063	struct vm_area_struct *vma;
   5064	unsigned long lpcr = 0, senc;
   5065	unsigned long psize, porder;
   5066	int srcu_idx;
   5067
   5068	/* Allocate hashed page table (if not done already) and reset it */
   5069	if (!kvm->arch.hpt.virt) {
   5070		int order = KVM_DEFAULT_HPT_ORDER;
   5071		struct kvm_hpt_info info;
   5072
   5073		err = kvmppc_allocate_hpt(&info, order);
   5074		/* If we get here, it means userspace didn't specify a
   5075		 * size explicitly.  So, try successively smaller
   5076		 * sizes if the default failed. */
   5077		while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
   5078			err  = kvmppc_allocate_hpt(&info, order);
   5079
   5080		if (err < 0) {
   5081			pr_err("KVM: Couldn't alloc HPT\n");
   5082			goto out;
   5083		}
   5084
   5085		kvmppc_set_hpt(kvm, &info);
   5086	}
   5087
   5088	/* Look up the memslot for guest physical address 0 */
   5089	srcu_idx = srcu_read_lock(&kvm->srcu);
   5090	memslot = gfn_to_memslot(kvm, 0);
   5091
   5092	/* We must have some memory at 0 by now */
   5093	err = -EINVAL;
   5094	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
   5095		goto out_srcu;
   5096
   5097	/* Look up the VMA for the start of this memory slot */
   5098	hva = memslot->userspace_addr;
   5099	mmap_read_lock(kvm->mm);
   5100	vma = vma_lookup(kvm->mm, hva);
   5101	if (!vma || (vma->vm_flags & VM_IO))
   5102		goto up_out;
   5103
   5104	psize = vma_kernel_pagesize(vma);
   5105
   5106	mmap_read_unlock(kvm->mm);
   5107
   5108	/* We can handle 4k, 64k or 16M pages in the VRMA */
   5109	if (psize >= 0x1000000)
   5110		psize = 0x1000000;
   5111	else if (psize >= 0x10000)
   5112		psize = 0x10000;
   5113	else
   5114		psize = 0x1000;
   5115	porder = __ilog2(psize);
   5116
   5117	senc = slb_pgsize_encoding(psize);
   5118	kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
   5119		(VRMA_VSID << SLB_VSID_SHIFT_1T);
   5120	/* Create HPTEs in the hash page table for the VRMA */
   5121	kvmppc_map_vrma(vcpu, memslot, porder);
   5122
   5123	/* Update VRMASD field in the LPCR */
   5124	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
   5125		/* the -4 is to account for senc values starting at 0x10 */
   5126		lpcr = senc << (LPCR_VRMASD_SH - 4);
   5127		kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
   5128	}
   5129
   5130	/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
   5131	smp_wmb();
   5132	err = 0;
   5133 out_srcu:
   5134	srcu_read_unlock(&kvm->srcu, srcu_idx);
   5135 out:
   5136	return err;
   5137
   5138 up_out:
   5139	mmap_read_unlock(kvm->mm);
   5140	goto out_srcu;
   5141}
   5142
   5143/*
   5144 * Must be called with kvm->arch.mmu_setup_lock held and
   5145 * mmu_ready = 0 and no vcpus running.
   5146 */
   5147int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
   5148{
   5149	unsigned long lpcr, lpcr_mask;
   5150
   5151	if (nesting_enabled(kvm))
   5152		kvmhv_release_all_nested(kvm);
   5153	kvmppc_rmap_reset(kvm);
   5154	kvm->arch.process_table = 0;
   5155	/* Mutual exclusion with kvm_unmap_gfn_range etc. */
   5156	spin_lock(&kvm->mmu_lock);
   5157	kvm->arch.radix = 0;
   5158	spin_unlock(&kvm->mmu_lock);
   5159	kvmppc_free_radix(kvm);
   5160
   5161	lpcr = LPCR_VPM1;
   5162	lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
   5163	if (cpu_has_feature(CPU_FTR_ARCH_31))
   5164		lpcr_mask |= LPCR_HAIL;
   5165	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
   5166
   5167	return 0;
   5168}
   5169
   5170/*
   5171 * Must be called with kvm->arch.mmu_setup_lock held and
   5172 * mmu_ready = 0 and no vcpus running.
   5173 */
   5174int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
   5175{
   5176	unsigned long lpcr, lpcr_mask;
   5177	int err;
   5178
   5179	err = kvmppc_init_vm_radix(kvm);
   5180	if (err)
   5181		return err;
   5182	kvmppc_rmap_reset(kvm);
   5183	/* Mutual exclusion with kvm_unmap_gfn_range etc. */
   5184	spin_lock(&kvm->mmu_lock);
   5185	kvm->arch.radix = 1;
   5186	spin_unlock(&kvm->mmu_lock);
   5187	kvmppc_free_hpt(&kvm->arch.hpt);
   5188
   5189	lpcr = LPCR_UPRT | LPCR_GTSE | LPCR_HR;
   5190	lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
   5191	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
   5192		lpcr_mask |= LPCR_HAIL;
   5193		if (cpu_has_feature(CPU_FTR_HVMODE) &&
   5194				(kvm->arch.host_lpcr & LPCR_HAIL))
   5195			lpcr |= LPCR_HAIL;
   5196	}
   5197	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
   5198
   5199	return 0;
   5200}
   5201
   5202#ifdef CONFIG_KVM_XICS
   5203/*
   5204 * Allocate a per-core structure for managing state about which cores are
   5205 * running in the host versus the guest and for exchanging data between
    5206 * real-mode KVM and the CPUs running in the host.
    5207 * This is only done for the first VM.
    5208 * The allocated structure stays even if all VMs have stopped.
    5209 * It is only freed when the kvm-hv module is unloaded.
    5210 * It's OK for this routine to fail; we just won't support host
    5211 * core operations like redirecting H_IPI wakeups.
   5212 */
   5213void kvmppc_alloc_host_rm_ops(void)
   5214{
   5215	struct kvmppc_host_rm_ops *ops;
   5216	unsigned long l_ops;
   5217	int cpu, core;
   5218	int size;
   5219
   5220	if (cpu_has_feature(CPU_FTR_ARCH_300))
   5221		return;
   5222
    5223	/* Not the first time here? */
   5224	if (kvmppc_host_rm_ops_hv != NULL)
   5225		return;
   5226
   5227	ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
   5228	if (!ops)
   5229		return;
   5230
   5231	size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
   5232	ops->rm_core = kzalloc(size, GFP_KERNEL);
   5233
   5234	if (!ops->rm_core) {
   5235		kfree(ops);
   5236		return;
   5237	}
   5238
   5239	cpus_read_lock();
   5240
   5241	for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
   5242		if (!cpu_online(cpu))
   5243			continue;
   5244
   5245		core = cpu >> threads_shift;
   5246		ops->rm_core[core].rm_state.in_host = 1;
   5247	}
   5248
   5249	ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
   5250
   5251	/*
   5252	 * Make the contents of the kvmppc_host_rm_ops structure visible
   5253	 * to other CPUs before we assign it to the global variable.
   5254	 * Do an atomic assignment (no locks used here), but if someone
   5255	 * beats us to it, just free our copy and return.
   5256	 */
   5257	smp_wmb();
   5258	l_ops = (unsigned long) ops;
   5259
   5260	if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
   5261		cpus_read_unlock();
   5262		kfree(ops->rm_core);
   5263		kfree(ops);
   5264		return;
   5265	}
   5266
   5267	cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
   5268					     "ppc/kvm_book3s:prepare",
   5269					     kvmppc_set_host_core,
   5270					     kvmppc_clear_host_core);
   5271	cpus_read_unlock();
   5272}
   5273
   5274void kvmppc_free_host_rm_ops(void)
   5275{
   5276	if (kvmppc_host_rm_ops_hv) {
   5277		cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
   5278		kfree(kvmppc_host_rm_ops_hv->rm_core);
   5279		kfree(kvmppc_host_rm_ops_hv);
   5280		kvmppc_host_rm_ops_hv = NULL;
   5281	}
   5282}
   5283#endif
   5284
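/*
 * VM creation for HV KVM: allocate an LPID for the guest, work out the
 * initial LPCR and VRMA SLB value, and, if the host is radix, set up
 * the guest's radix page tables and partition-table entry right away.
 * Hash guests get their HPT set up later, when the first vcpu runs
 * (see kvmppc_hv_setup_htab_rma() above).
 */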
   5285static int kvmppc_core_init_vm_hv(struct kvm *kvm)
   5286{
   5287	unsigned long lpcr, lpid;
   5288	int ret;
   5289
   5290	mutex_init(&kvm->arch.uvmem_lock);
   5291	INIT_LIST_HEAD(&kvm->arch.uvmem_pfns);
   5292	mutex_init(&kvm->arch.mmu_setup_lock);
   5293
   5294	/* Allocate the guest's logical partition ID */
   5295
   5296	lpid = kvmppc_alloc_lpid();
   5297	if ((long)lpid < 0)
   5298		return -ENOMEM;
   5299	kvm->arch.lpid = lpid;
   5300
   5301	kvmppc_alloc_host_rm_ops();
   5302
   5303	kvmhv_vm_nested_init(kvm);
   5304
   5305	/*
   5306	 * Since we don't flush the TLB when tearing down a VM,
   5307	 * and this lpid might have previously been used,
   5308	 * make sure we flush on each core before running the new VM.
   5309	 * On POWER9, the tlbie in mmu_partition_table_set_entry()
   5310	 * does this flush for us.
   5311	 */
   5312	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   5313		cpumask_setall(&kvm->arch.need_tlb_flush);
   5314
   5315	/* Start out with the default set of hcalls enabled */
   5316	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
   5317	       sizeof(kvm->arch.enabled_hcalls));
   5318
   5319	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   5320		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
   5321
   5322	/* Init LPCR for virtual RMA mode */
   5323	if (cpu_has_feature(CPU_FTR_HVMODE)) {
   5324		kvm->arch.host_lpid = mfspr(SPRN_LPID);
   5325		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
   5326		lpcr &= LPCR_PECE | LPCR_LPES;
   5327	} else {
   5328		/*
   5329		 * The L2 LPES mode will be set by the L0 according to whether
   5330		 * or not it needs to take external interrupts in HV mode.
   5331		 */
   5332		lpcr = 0;
   5333	}
   5334	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
   5335		LPCR_VPM0 | LPCR_VPM1;
   5336	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
   5337		(VRMA_VSID << SLB_VSID_SHIFT_1T);
   5338	/* On POWER8 turn on online bit to enable PURR/SPURR */
   5339	if (cpu_has_feature(CPU_FTR_ARCH_207S))
   5340		lpcr |= LPCR_ONL;
   5341	/*
   5342	 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
   5343	 * Set HVICE bit to enable hypervisor virtualization interrupts.
    5344	 * Set HEIC to prevent OS interrupts from going to the hypervisor
    5345	 * (this should be unnecessary, but better safe than sorry in case
    5346	 * we re-enable EE in HV mode with this LPCR still set)
   5347	 */
   5348	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
   5349		lpcr &= ~LPCR_VPM0;
   5350		lpcr |= LPCR_HVICE | LPCR_HEIC;
   5351
   5352		/*
   5353		 * If xive is enabled, we route 0x500 interrupts directly
   5354		 * to the guest.
   5355		 */
   5356		if (xics_on_xive())
   5357			lpcr |= LPCR_LPES;
   5358	}
   5359
   5360	/*
   5361	 * If the host uses radix, the guest starts out as radix.
   5362	 */
   5363	if (radix_enabled()) {
   5364		kvm->arch.radix = 1;
   5365		kvm->arch.mmu_ready = 1;
   5366		lpcr &= ~LPCR_VPM1;
   5367		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
   5368		if (cpu_has_feature(CPU_FTR_HVMODE) &&
   5369		    cpu_has_feature(CPU_FTR_ARCH_31) &&
   5370		    (kvm->arch.host_lpcr & LPCR_HAIL))
   5371			lpcr |= LPCR_HAIL;
   5372		ret = kvmppc_init_vm_radix(kvm);
   5373		if (ret) {
   5374			kvmppc_free_lpid(kvm->arch.lpid);
   5375			return ret;
   5376		}
   5377		kvmppc_setup_partition_table(kvm);
   5378	}
   5379
   5380	verify_lpcr(kvm, lpcr);
   5381	kvm->arch.lpcr = lpcr;
   5382
   5383	/* Initialization for future HPT resizes */
   5384	kvm->arch.resize_hpt = NULL;
   5385
   5386	/*
   5387	 * Work out how many sets the TLB has, for the use of
   5388	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
   5389	 */
   5390	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
   5391		/*
    5392		 * P10 will flush all the congruence classes with a single tlbiel
   5393		 */
   5394		kvm->arch.tlb_sets = 1;
   5395	} else if (radix_enabled())
   5396		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
   5397	else if (cpu_has_feature(CPU_FTR_ARCH_300))
   5398		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
   5399	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
   5400		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
   5401	else
   5402		kvm->arch.tlb_sets = POWER7_TLB_SETS;		/* 128 */
   5403
   5404	/*
    5405	 * Track that we now have an HV mode VM active. This blocks secondary
   5406	 * CPU threads from coming online.
   5407	 */
   5408	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   5409		kvm_hv_vm_activated();
   5410
   5411	/*
   5412	 * Initialize smt_mode depending on processor.
   5413	 * POWER8 and earlier have to use "strict" threading, where
   5414	 * all vCPUs in a vcore have to run on the same (sub)core,
   5415	 * whereas on POWER9 the threads can each run a different
   5416	 * guest.
   5417	 */
   5418	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   5419		kvm->arch.smt_mode = threads_per_subcore;
   5420	else
   5421		kvm->arch.smt_mode = 1;
   5422	kvm->arch.emul_smt_mode = 1;
   5423
   5424	return 0;
   5425}
   5426
   5427static int kvmppc_arch_create_vm_debugfs_hv(struct kvm *kvm)
   5428{
   5429	kvmppc_mmu_debugfs_init(kvm);
   5430	if (radix_enabled())
   5431		kvmhv_radix_debugfs_init(kvm);
   5432	return 0;
   5433}
   5434
   5435static void kvmppc_free_vcores(struct kvm *kvm)
   5436{
   5437	long int i;
   5438
   5439	for (i = 0; i < KVM_MAX_VCORES; ++i)
   5440		kfree(kvm->arch.vcores[i]);
   5441	kvm->arch.online_vcores = 0;
   5442}
   5443
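/*
 * VM teardown: free the vcores and the guest's MMU structures (radix
 * page tables or HPT), release any nested guests and secure-VM state,
 * clear the partition-table entry, and give the LPID and pimap back.
 */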
   5444static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
   5445{
   5446	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   5447		kvm_hv_vm_deactivated();
   5448
   5449	kvmppc_free_vcores(kvm);
    5450
   5452	if (kvm_is_radix(kvm))
   5453		kvmppc_free_radix(kvm);
   5454	else
   5455		kvmppc_free_hpt(&kvm->arch.hpt);
   5456
   5457	/* Perform global invalidation and return lpid to the pool */
   5458	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
   5459		if (nesting_enabled(kvm))
   5460			kvmhv_release_all_nested(kvm);
   5461		kvm->arch.process_table = 0;
   5462		if (kvm->arch.secure_guest)
   5463			uv_svm_terminate(kvm->arch.lpid);
   5464		kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
   5465	}
   5466
   5467	kvmppc_free_lpid(kvm->arch.lpid);
   5468
   5469	kvmppc_free_pimap(kvm);
   5470}
   5471
   5472/* We don't need to emulate any privileged instructions or dcbz */
   5473static int kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu,
   5474				     unsigned int inst, int *advance)
   5475{
   5476	return EMULATE_FAIL;
   5477}
   5478
   5479static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
   5480					ulong spr_val)
   5481{
   5482	return EMULATE_FAIL;
   5483}
   5484
   5485static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
   5486					ulong *spr_val)
   5487{
   5488	return EMULATE_FAIL;
   5489}
   5490
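/*
 * HV KVM requires either hypervisor mode on a POWER7 or later CPU, or
 * a POWER9 (or later) host running with the radix MMU, in which case
 * we can run as a nested hypervisor.
 */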
   5491static int kvmppc_core_check_processor_compat_hv(void)
   5492{
   5493	if (cpu_has_feature(CPU_FTR_HVMODE) &&
   5494	    cpu_has_feature(CPU_FTR_ARCH_206))
   5495		return 0;
   5496
   5497	/* POWER9 in radix mode is capable of being a nested hypervisor. */
   5498	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
   5499		return 0;
   5500
   5501	return -EIO;
   5502}
   5503
   5504#ifdef CONFIG_KVM_XICS
   5505
   5506void kvmppc_free_pimap(struct kvm *kvm)
   5507{
   5508	kfree(kvm->arch.pimap);
   5509}
   5510
   5511static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
   5512{
   5513	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
   5514}
   5515
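/*
 * Map a host interrupt onto a guest interrupt number (GSI) for
 * passthrough: record the mapping in kvm->arch.pimap and hand it to
 * XICS or XIVE so the interrupt can be delivered to the guest more
 * directly.
 */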
   5516static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
   5517{
   5518	struct irq_desc *desc;
   5519	struct kvmppc_irq_map *irq_map;
   5520	struct kvmppc_passthru_irqmap *pimap;
   5521	struct irq_chip *chip;
   5522	int i, rc = 0;
   5523	struct irq_data *host_data;
   5524
   5525	if (!kvm_irq_bypass)
   5526		return 1;
   5527
   5528	desc = irq_to_desc(host_irq);
   5529	if (!desc)
   5530		return -EIO;
   5531
   5532	mutex_lock(&kvm->lock);
   5533
   5534	pimap = kvm->arch.pimap;
   5535	if (pimap == NULL) {
   5536		/* First call, allocate structure to hold IRQ map */
   5537		pimap = kvmppc_alloc_pimap();
   5538		if (pimap == NULL) {
   5539			mutex_unlock(&kvm->lock);
   5540			return -ENOMEM;
   5541		}
   5542		kvm->arch.pimap = pimap;
   5543	}
   5544
   5545	/*
    5546	 * For now, we only support interrupts whose EOI operation is either
    5547	 * an OPAL call followed by a write to XIRR (since that's what our
    5548	 * real-mode EOI code does), or a XIVE interrupt
   5549	 */
   5550	chip = irq_data_get_irq_chip(&desc->irq_data);
   5551	if (!chip || !is_pnv_opal_msi(chip)) {
   5552		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
   5553			host_irq, guest_gsi);
   5554		mutex_unlock(&kvm->lock);
   5555		return -ENOENT;
   5556	}
   5557
   5558	/*
   5559	 * See if we already have an entry for this guest IRQ number.
   5560	 * If it's mapped to a hardware IRQ number, that's an error,
   5561	 * otherwise re-use this entry.
   5562	 */
   5563	for (i = 0; i < pimap->n_mapped; i++) {
   5564		if (guest_gsi == pimap->mapped[i].v_hwirq) {
   5565			if (pimap->mapped[i].r_hwirq) {
   5566				mutex_unlock(&kvm->lock);
   5567				return -EINVAL;
   5568			}
   5569			break;
   5570		}
   5571	}
   5572
   5573	if (i == KVMPPC_PIRQ_MAPPED) {
   5574		mutex_unlock(&kvm->lock);
   5575		return -EAGAIN;		/* table is full */
   5576	}
   5577
   5578	irq_map = &pimap->mapped[i];
   5579
   5580	irq_map->v_hwirq = guest_gsi;
   5581	irq_map->desc = desc;
   5582
   5583	/*
   5584	 * Order the above two stores before the next to serialize with
   5585	 * the KVM real mode handler.
   5586	 */
   5587	smp_wmb();
   5588
   5589	/*
   5590	 * The 'host_irq' number is mapped in the PCI-MSI domain but
   5591	 * the underlying calls, which will EOI the interrupt in real
   5592	 * mode, need an HW IRQ number mapped in the XICS IRQ domain.
   5593	 */
   5594	host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq);
   5595	irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);
   5596
   5597	if (i == pimap->n_mapped)
   5598		pimap->n_mapped++;
   5599
   5600	if (xics_on_xive())
   5601		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
   5602	else
   5603		kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
   5604	if (rc)
   5605		irq_map->r_hwirq = 0;
   5606
   5607	mutex_unlock(&kvm->lock);
   5608
   5609	return 0;
   5610}
   5611
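/*
 * Undo a passthrough mapping set up by kvmppc_set_passthru_irq():
 * invalidate the pimap entry and tell XICS/XIVE that the mapping
 * is gone.
 */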
   5612static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
   5613{
   5614	struct irq_desc *desc;
   5615	struct kvmppc_passthru_irqmap *pimap;
   5616	int i, rc = 0;
   5617
   5618	if (!kvm_irq_bypass)
   5619		return 0;
   5620
   5621	desc = irq_to_desc(host_irq);
   5622	if (!desc)
   5623		return -EIO;
   5624
   5625	mutex_lock(&kvm->lock);
   5626	if (!kvm->arch.pimap)
   5627		goto unlock;
   5628
   5629	pimap = kvm->arch.pimap;
   5630
   5631	for (i = 0; i < pimap->n_mapped; i++) {
   5632		if (guest_gsi == pimap->mapped[i].v_hwirq)
   5633			break;
   5634	}
   5635
   5636	if (i == pimap->n_mapped) {
   5637		mutex_unlock(&kvm->lock);
   5638		return -ENODEV;
   5639	}
   5640
   5641	if (xics_on_xive())
   5642		rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
   5643	else
   5644		kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
   5645
    5646	/* invalidate the entry (what to do on error from the above?) */
   5647	pimap->mapped[i].r_hwirq = 0;
   5648
   5649	/*
   5650	 * We don't free this structure even when the count goes to
   5651	 * zero. The structure is freed when we destroy the VM.
   5652	 */
   5653 unlock:
   5654	mutex_unlock(&kvm->lock);
   5655	return rc;
   5656}
   5657
   5658static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
   5659					     struct irq_bypass_producer *prod)
   5660{
   5661	int ret = 0;
   5662	struct kvm_kernel_irqfd *irqfd =
   5663		container_of(cons, struct kvm_kernel_irqfd, consumer);
   5664
   5665	irqfd->producer = prod;
   5666
   5667	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
   5668	if (ret)
   5669		pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
   5670			prod->irq, irqfd->gsi, ret);
   5671
   5672	return ret;
   5673}
   5674
   5675static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
   5676					      struct irq_bypass_producer *prod)
   5677{
   5678	int ret;
   5679	struct kvm_kernel_irqfd *irqfd =
   5680		container_of(cons, struct kvm_kernel_irqfd, consumer);
   5681
   5682	irqfd->producer = NULL;
   5683
   5684	/*
    5685	 * When the producer is unregistered from the consumer, we change back
    5686	 * to the default external interrupt handling mode - KVM real mode
    5687	 * will switch the interrupt back to the host.
   5688	 */
   5689	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
   5690	if (ret)
   5691		pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
   5692			prod->irq, irqfd->gsi, ret);
   5693}
   5694#endif
   5695
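/*
 * HV-specific VM ioctls: HPT allocation/reset (KVM_PPC_ALLOCATE_HTAB),
 * the HTAB dump fd (KVM_PPC_GET_HTAB_FD) and the two-phase HPT resize
 * interface (KVM_PPC_RESIZE_HPT_PREPARE/COMMIT).
 */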
   5696static long kvm_arch_vm_ioctl_hv(struct file *filp,
   5697				 unsigned int ioctl, unsigned long arg)
   5698{
   5699	struct kvm *kvm __maybe_unused = filp->private_data;
   5700	void __user *argp = (void __user *)arg;
   5701	long r;
   5702
   5703	switch (ioctl) {
   5704
   5705	case KVM_PPC_ALLOCATE_HTAB: {
   5706		u32 htab_order;
   5707
   5708		/* If we're a nested hypervisor, we currently only support radix */
   5709		if (kvmhv_on_pseries()) {
   5710			r = -EOPNOTSUPP;
   5711			break;
   5712		}
   5713
   5714		r = -EFAULT;
   5715		if (get_user(htab_order, (u32 __user *)argp))
   5716			break;
   5717		r = kvmppc_alloc_reset_hpt(kvm, htab_order);
   5718		if (r)
   5719			break;
   5720		r = 0;
   5721		break;
   5722	}
   5723
   5724	case KVM_PPC_GET_HTAB_FD: {
   5725		struct kvm_get_htab_fd ghf;
   5726
   5727		r = -EFAULT;
   5728		if (copy_from_user(&ghf, argp, sizeof(ghf)))
   5729			break;
   5730		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
   5731		break;
   5732	}
   5733
   5734	case KVM_PPC_RESIZE_HPT_PREPARE: {
   5735		struct kvm_ppc_resize_hpt rhpt;
   5736
   5737		r = -EFAULT;
   5738		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
   5739			break;
   5740
   5741		r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
   5742		break;
   5743	}
   5744
   5745	case KVM_PPC_RESIZE_HPT_COMMIT: {
   5746		struct kvm_ppc_resize_hpt rhpt;
   5747
   5748		r = -EFAULT;
   5749		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
   5750			break;
   5751
   5752		r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
   5753		break;
   5754	}
   5755
   5756	default:
   5757		r = -ENOTTY;
   5758	}
   5759
   5760	return r;
   5761}
   5762
   5763/*
   5764 * List of hcall numbers to enable by default.
   5765 * For compatibility with old userspace, we enable by default
   5766 * all hcalls that were implemented before the hcall-enabling
   5767 * facility was added.  Note this list should not include H_RTAS.
   5768 */
   5769static unsigned int default_hcall_list[] = {
   5770	H_REMOVE,
   5771	H_ENTER,
   5772	H_READ,
   5773	H_PROTECT,
   5774	H_BULK_REMOVE,
   5775#ifdef CONFIG_SPAPR_TCE_IOMMU
   5776	H_GET_TCE,
   5777	H_PUT_TCE,
   5778#endif
   5779	H_SET_DABR,
   5780	H_SET_XDABR,
   5781	H_CEDE,
   5782	H_PROD,
   5783	H_CONFER,
   5784	H_REGISTER_VPA,
   5785#ifdef CONFIG_KVM_XICS
   5786	H_EOI,
   5787	H_CPPR,
   5788	H_IPI,
   5789	H_IPOLL,
   5790	H_XIRR,
   5791	H_XIRR_X,
   5792#endif
   5793	0
   5794};
   5795
   5796static void init_default_hcalls(void)
   5797{
   5798	int i;
   5799	unsigned int hcall;
   5800
   5801	for (i = 0; default_hcall_list[i]; ++i) {
   5802		hcall = default_hcall_list[i];
   5803		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
   5804		__set_bit(hcall / 4, default_enabled_hcalls);
   5805	}
   5806}
   5807
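/*
 * Handler for the KVM_PPC_CONFIGURE_V3_MMU ioctl (the configure_mmu
 * hook): validate the requested MMU mode, switch the guest between HPT
 * and radix if necessary, and install the new process table.
 */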
   5808static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
   5809{
   5810	unsigned long lpcr;
   5811	int radix;
   5812	int err;
   5813
    5814	/* If not on a POWER9 or later, reject it */
   5815	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   5816		return -ENODEV;
   5817
   5818	/* If any unknown flags set, reject it */
   5819	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
   5820		return -EINVAL;
   5821
   5822	/* GR (guest radix) bit in process_table field must match */
   5823	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
   5824	if (!!(cfg->process_table & PATB_GR) != radix)
   5825		return -EINVAL;
   5826
   5827	/* Process table size field must be reasonable, i.e. <= 24 */
   5828	if ((cfg->process_table & PRTS_MASK) > 24)
   5829		return -EINVAL;
   5830
   5831	/* We can change a guest to/from radix now, if the host is radix */
   5832	if (radix && !radix_enabled())
   5833		return -EINVAL;
   5834
   5835	/* If we're a nested hypervisor, we currently only support radix */
   5836	if (kvmhv_on_pseries() && !radix)
   5837		return -EINVAL;
   5838
   5839	mutex_lock(&kvm->arch.mmu_setup_lock);
   5840	if (radix != kvm_is_radix(kvm)) {
   5841		if (kvm->arch.mmu_ready) {
   5842			kvm->arch.mmu_ready = 0;
   5843			/* order mmu_ready vs. vcpus_running */
   5844			smp_mb();
   5845			if (atomic_read(&kvm->arch.vcpus_running)) {
   5846				kvm->arch.mmu_ready = 1;
   5847				err = -EBUSY;
   5848				goto out_unlock;
   5849			}
   5850		}
   5851		if (radix)
   5852			err = kvmppc_switch_mmu_to_radix(kvm);
   5853		else
   5854			err = kvmppc_switch_mmu_to_hpt(kvm);
   5855		if (err)
   5856			goto out_unlock;
   5857	}
   5858
   5859	kvm->arch.process_table = cfg->process_table;
   5860	kvmppc_setup_partition_table(kvm);
   5861
   5862	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
   5863	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
   5864	err = 0;
   5865
   5866 out_unlock:
   5867	mutex_unlock(&kvm->arch.mmu_setup_lock);
   5868	return err;
   5869}
   5870
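/*
 * Called when userspace tests for or enables KVM_CAP_PPC_NESTED_HV:
 * allow this guest to act as a nested hypervisor, provided the
 * "nested" module parameter is set and the host is radix on POWER9
 * or later.
 */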
   5871static int kvmhv_enable_nested(struct kvm *kvm)
   5872{
   5873	if (!nested)
   5874		return -EPERM;
   5875	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   5876		return -ENODEV;
   5877	if (!radix_enabled())
   5878		return -ENODEV;
   5879
   5880	/* kvm == NULL means the caller is testing if the capability exists */
   5881	if (kvm)
   5882		kvm->arch.nested_enable = true;
   5883	return 0;
   5884}
   5885
   5886static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
   5887				 int size)
   5888{
   5889	int rc = -EINVAL;
   5890
   5891	if (kvmhv_vcpu_is_radix(vcpu)) {
   5892		rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
   5893
   5894		if (rc > 0)
   5895			rc = -EINVAL;
   5896	}
   5897
   5898	/* For now quadrants are the only way to access nested guest memory */
   5899	if (rc && vcpu->arch.nested)
   5900		rc = -EAGAIN;
   5901
   5902	return rc;
   5903}
   5904
   5905static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
   5906				int size)
   5907{
   5908	int rc = -EINVAL;
   5909
   5910	if (kvmhv_vcpu_is_radix(vcpu)) {
   5911		rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
   5912
   5913		if (rc > 0)
   5914			rc = -EINVAL;
   5915	}
   5916
   5917	/* For now quadrants are the only way to access nested guest memory */
   5918	if (rc && vcpu->arch.nested)
   5919		rc = -EAGAIN;
   5920
   5921	return rc;
   5922}
   5923
   5924static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa)
   5925{
   5926	unpin_vpa(kvm, vpa);
   5927	vpa->gpa = 0;
   5928	vpa->pinned_addr = NULL;
   5929	vpa->dirty = false;
   5930	vpa->update_pending = 0;
   5931}
   5932
   5933/*
   5934 * Enable a guest to become a secure VM, or test whether
   5935 * that could be enabled.
   5936 * Called when the KVM_CAP_PPC_SECURE_GUEST capability is
   5937 * tested (kvm == NULL) or enabled (kvm != NULL).
   5938 */
   5939static int kvmhv_enable_svm(struct kvm *kvm)
   5940{
   5941	if (!kvmppc_uvmem_available())
   5942		return -EINVAL;
   5943	if (kvm)
   5944		kvm->arch.svm_enabled = 1;
   5945	return 0;
   5946}
   5947
   5948/*
    5949 * IOCTL handler to turn off secure mode of the guest:
    5950 *
    5951 * - Release all device pages
    5952 * - Issue ucall to terminate the guest on the UV side
    5953 * - Unpin the VPA pages
    5954 * - Reinit the partition scoped page tables
   5955 */
   5956static int kvmhv_svm_off(struct kvm *kvm)
   5957{
   5958	struct kvm_vcpu *vcpu;
   5959	int mmu_was_ready;
   5960	int srcu_idx;
   5961	int ret = 0;
   5962	unsigned long i;
   5963
   5964	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
   5965		return ret;
   5966
   5967	mutex_lock(&kvm->arch.mmu_setup_lock);
   5968	mmu_was_ready = kvm->arch.mmu_ready;
   5969	if (kvm->arch.mmu_ready) {
   5970		kvm->arch.mmu_ready = 0;
   5971		/* order mmu_ready vs. vcpus_running */
   5972		smp_mb();
   5973		if (atomic_read(&kvm->arch.vcpus_running)) {
   5974			kvm->arch.mmu_ready = 1;
   5975			ret = -EBUSY;
   5976			goto out;
   5977		}
   5978	}
   5979
   5980	srcu_idx = srcu_read_lock(&kvm->srcu);
   5981	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
   5982		struct kvm_memory_slot *memslot;
   5983		struct kvm_memslots *slots = __kvm_memslots(kvm, i);
   5984		int bkt;
   5985
   5986		if (!slots)
   5987			continue;
   5988
   5989		kvm_for_each_memslot(memslot, bkt, slots) {
   5990			kvmppc_uvmem_drop_pages(memslot, kvm, true);
   5991			uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
   5992		}
   5993	}
   5994	srcu_read_unlock(&kvm->srcu, srcu_idx);
   5995
   5996	ret = uv_svm_terminate(kvm->arch.lpid);
   5997	if (ret != U_SUCCESS) {
   5998		ret = -EINVAL;
   5999		goto out;
   6000	}
   6001
   6002	/*
    6003	 * When a secure guest is reset, all the guest pages are sent
   6004	 * to UV via UV_PAGE_IN before the non-boot vcpus get a
   6005	 * chance to run and unpin their VPA pages. Unpinning of all
   6006	 * VPA pages is done here explicitly so that VPA pages
   6007	 * can be migrated to the secure side.
   6008	 *
    6009	 * This is required for the secure SMP guest to reboot
   6010	 * correctly.
   6011	 */
   6012	kvm_for_each_vcpu(i, vcpu, kvm) {
   6013		spin_lock(&vcpu->arch.vpa_update_lock);
   6014		unpin_vpa_reset(kvm, &vcpu->arch.dtl);
   6015		unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow);
   6016		unpin_vpa_reset(kvm, &vcpu->arch.vpa);
   6017		spin_unlock(&vcpu->arch.vpa_update_lock);
   6018	}
   6019
   6020	kvmppc_setup_partition_table(kvm);
   6021	kvm->arch.secure_guest = 0;
   6022	kvm->arch.mmu_ready = mmu_was_ready;
   6023out:
   6024	mutex_unlock(&kvm->arch.mmu_setup_lock);
   6025	return ret;
   6026}
   6027
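/* Enable use of the second DAWR/DAWRX pair (KVM_CAP_PPC_DAWR1). */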
   6028static int kvmhv_enable_dawr1(struct kvm *kvm)
   6029{
   6030	if (!cpu_has_feature(CPU_FTR_DAWR1))
   6031		return -ENODEV;
   6032
   6033	/* kvm == NULL means the caller is testing if the capability exists */
   6034	if (kvm)
   6035		kvm->arch.dawr1_enabled = true;
   6036	return 0;
   6037}
   6038
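/*
 * Report whether hash (HPT) guests can be run on this ISA v3.0
 * (POWER9 or later) host; backs the KVM_CAP_PPC_MMU_HASH_V3 capability.
 */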
   6039static bool kvmppc_hash_v3_possible(void)
   6040{
   6041	if (!cpu_has_feature(CPU_FTR_ARCH_300))
   6042		return false;
   6043
   6044	if (!cpu_has_feature(CPU_FTR_HVMODE))
   6045		return false;
   6046
   6047	/*
   6048	 * POWER9 chips before version 2.02 can't have some threads in
   6049	 * HPT mode and some in radix mode on the same core.
   6050	 */
   6051	if (radix_enabled()) {
   6052		unsigned int pvr = mfspr(SPRN_PVR);
   6053		if ((pvr >> 16) == PVR_POWER9 &&
   6054		    (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
   6055		     ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
   6056			return false;
   6057	}
   6058
   6059	return true;
   6060}
   6061
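/*
 * The kvmppc_ops instance for HV-mode guests; registered as
 * kvmppc_hv_ops by kvmppc_book3s_init_hv() below.
 */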
   6062static struct kvmppc_ops kvm_ops_hv = {
   6063	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
   6064	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
   6065	.get_one_reg = kvmppc_get_one_reg_hv,
   6066	.set_one_reg = kvmppc_set_one_reg_hv,
   6067	.vcpu_load   = kvmppc_core_vcpu_load_hv,
   6068	.vcpu_put    = kvmppc_core_vcpu_put_hv,
   6069	.inject_interrupt = kvmppc_inject_interrupt_hv,
   6070	.set_msr     = kvmppc_set_msr_hv,
   6071	.vcpu_run    = kvmppc_vcpu_run_hv,
   6072	.vcpu_create = kvmppc_core_vcpu_create_hv,
   6073	.vcpu_free   = kvmppc_core_vcpu_free_hv,
   6074	.check_requests = kvmppc_core_check_requests_hv,
   6075	.get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv,
   6076	.flush_memslot  = kvmppc_core_flush_memslot_hv,
   6077	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
   6078	.commit_memory_region  = kvmppc_core_commit_memory_region_hv,
   6079	.unmap_gfn_range = kvm_unmap_gfn_range_hv,
   6080	.age_gfn = kvm_age_gfn_hv,
   6081	.test_age_gfn = kvm_test_age_gfn_hv,
   6082	.set_spte_gfn = kvm_set_spte_gfn_hv,
   6083	.free_memslot = kvmppc_core_free_memslot_hv,
   6084	.init_vm =  kvmppc_core_init_vm_hv,
   6085	.destroy_vm = kvmppc_core_destroy_vm_hv,
   6086	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
   6087	.emulate_op = kvmppc_core_emulate_op_hv,
   6088	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
   6089	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
   6090	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
   6091	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
   6092	.hcall_implemented = kvmppc_hcall_impl_hv,
   6093#ifdef CONFIG_KVM_XICS
   6094	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
   6095	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
   6096#endif
   6097	.configure_mmu = kvmhv_configure_mmu,
   6098	.get_rmmu_info = kvmhv_get_rmmu_info,
   6099	.set_smt_mode = kvmhv_set_smt_mode,
   6100	.enable_nested = kvmhv_enable_nested,
   6101	.load_from_eaddr = kvmhv_load_from_eaddr,
   6102	.store_to_eaddr = kvmhv_store_to_eaddr,
   6103	.enable_svm = kvmhv_enable_svm,
   6104	.svm_off = kvmhv_svm_off,
   6105	.enable_dawr1 = kvmhv_enable_dawr1,
   6106	.hash_v3_possible = kvmppc_hash_v3_possible,
   6107	.create_vcpu_debugfs = kvmppc_arch_create_vcpu_debugfs_hv,
   6108	.create_vm_debugfs = kvmppc_arch_create_vm_debugfs_hv,
   6109};
   6110
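/*
 * Allocate one sibling_subcore_state per core (if not already done)
 * and make every hardware thread's paca in that core point to it, so
 * that the threads of a core can coordinate in real mode.  Only done
 * on pre-POWER9 hosts.
 */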
   6111static int kvm_init_subcore_bitmap(void)
   6112{
   6113	int i, j;
   6114	int nr_cores = cpu_nr_cores();
   6115	struct sibling_subcore_state *sibling_subcore_state;
   6116
   6117	for (i = 0; i < nr_cores; i++) {
   6118		int first_cpu = i * threads_per_core;
   6119		int node = cpu_to_node(first_cpu);
   6120
   6121		/* Ignore if it is already allocated. */
   6122		if (paca_ptrs[first_cpu]->sibling_subcore_state)
   6123			continue;
   6124
   6125		sibling_subcore_state =
   6126			kzalloc_node(sizeof(struct sibling_subcore_state),
   6127							GFP_KERNEL, node);
   6128		if (!sibling_subcore_state)
   6129			return -ENOMEM;
    6130
   6132		for (j = 0; j < threads_per_core; j++) {
   6133			int cpu = first_cpu + j;
   6134
   6135			paca_ptrs[cpu]->sibling_subcore_state =
   6136						sibling_subcore_state;
   6137		}
   6138	}
   6139	return 0;
   6140}
   6141
   6142static int kvmppc_radix_possible(void)
   6143{
   6144	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
   6145}
   6146
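/*
 * Module init: check that this processor can run HV guests, initialise
 * nested-HV, HPT and (where possible) radix MMU support as well as
 * ultravisor memory handling, then publish kvm_ops_hv via kvmppc_hv_ops.
 */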
   6147static int kvmppc_book3s_init_hv(void)
   6148{
   6149	int r;
   6150
   6151	if (!tlbie_capable) {
   6152		pr_err("KVM-HV: Host does not support TLBIE\n");
   6153		return -ENODEV;
   6154	}
   6155
   6156	/*
    6157	 * FIXME!! Do we need to check on all CPUs?
   6158	 */
   6159	r = kvmppc_core_check_processor_compat_hv();
   6160	if (r < 0)
   6161		return -ENODEV;
   6162
   6163	r = kvmhv_nested_init();
   6164	if (r)
   6165		return r;
   6166
   6167	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
   6168		r = kvm_init_subcore_bitmap();
   6169		if (r)
   6170			goto err;
   6171	}
   6172
   6173	/*
   6174	 * We need a way of accessing the XICS interrupt controller,
   6175	 * either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or
   6176	 * indirectly, via OPAL.
   6177	 */
   6178#ifdef CONFIG_SMP
   6179	if (!xics_on_xive() && !kvmhv_on_pseries() &&
   6180	    !local_paca->kvm_hstate.xics_phys) {
   6181		struct device_node *np;
   6182
   6183		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
   6184		if (!np) {
   6185			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
   6186			r = -ENODEV;
   6187			goto err;
   6188		}
   6189		/* presence of intc confirmed - node can be dropped again */
   6190		of_node_put(np);
   6191	}
   6192#endif
   6193
   6194	init_default_hcalls();
   6195
   6196	init_vcore_lists();
   6197
   6198	r = kvmppc_mmu_hv_init();
   6199	if (r)
   6200		goto err;
   6201
   6202	if (kvmppc_radix_possible()) {
   6203		r = kvmppc_radix_init();
   6204		if (r)
   6205			goto err;
   6206	}
   6207
   6208	r = kvmppc_uvmem_init();
   6209	if (r < 0) {
   6210		pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
   6211		return r;
   6212	}
   6213
   6214	kvm_ops_hv.owner = THIS_MODULE;
   6215	kvmppc_hv_ops = &kvm_ops_hv;
   6216
   6217	return 0;
   6218
   6219err:
   6220	kvmhv_nested_exit();
   6221	kvmppc_radix_exit();
   6222
   6223	return r;
   6224}
   6225
   6226static void kvmppc_book3s_exit_hv(void)
   6227{
   6228	kvmppc_uvmem_free();
   6229	kvmppc_free_host_rm_ops();
   6230	if (kvmppc_radix_possible())
   6231		kvmppc_radix_exit();
   6232	kvmppc_hv_ops = NULL;
   6233	kvmhv_nested_exit();
   6234}
   6235
   6236module_init(kvmppc_book3s_init_hv);
   6237module_exit(kvmppc_book3s_exit_hv);
   6238MODULE_LICENSE("GPL");
   6239MODULE_ALIAS_MISCDEV(KVM_MINOR);
   6240MODULE_ALIAS("devname:kvm");