cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

vsie.c (42825B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * kvm nested virtualization support for s390x
      4 *
      5 * Copyright IBM Corp. 2016, 2018
      6 *
      7 *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
      8 */
      9#include <linux/vmalloc.h>
     10#include <linux/kvm_host.h>
     11#include <linux/bug.h>
     12#include <linux/list.h>
     13#include <linux/bitmap.h>
     14#include <linux/sched/signal.h>
     15
     16#include <asm/gmap.h>
     17#include <asm/mmu_context.h>
     18#include <asm/sclp.h>
     19#include <asm/nmi.h>
     20#include <asm/dis.h>
     21#include <asm/fpu/api.h>
     22#include "kvm-s390.h"
     23#include "gaccess.h"
     24
     25struct vsie_page {
     26	struct kvm_s390_sie_block scb_s;	/* 0x0000 */
     27	/*
     28	 * the backup info for machine check. ensure it's at
     29	 * the same offset as that in struct sie_page!
     30	 */
     31	struct mcck_volatile_info mcck_info;    /* 0x0200 */
     32	/*
     33	 * The pinned original scb. Be aware that other VCPUs can modify
     34	 * it while we read from it. Values that are used for conditions or
      35	 * are reused conditionally should be accessed via READ_ONCE.
     36	 */
     37	struct kvm_s390_sie_block *scb_o;	/* 0x0218 */
     38	/* the shadow gmap in use by the vsie_page */
     39	struct gmap *gmap;			/* 0x0220 */
     40	/* address of the last reported fault to guest2 */
     41	unsigned long fault_addr;		/* 0x0228 */
     42	/* calculated guest addresses of satellite control blocks */
     43	gpa_t sca_gpa;				/* 0x0230 */
     44	gpa_t itdba_gpa;			/* 0x0238 */
     45	gpa_t gvrd_gpa;				/* 0x0240 */
     46	gpa_t riccbd_gpa;			/* 0x0248 */
     47	gpa_t sdnx_gpa;				/* 0x0250 */
     48	__u8 reserved[0x0700 - 0x0258];		/* 0x0258 */
     49	struct kvm_s390_crypto_cb crycb;	/* 0x0700 */
     50	__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE];	/* 0x0800 */
     51};
     52
     53/* trigger a validity icpt for the given scb */
     54static int set_validity_icpt(struct kvm_s390_sie_block *scb,
     55			     __u16 reason_code)
     56{
     57	scb->ipa = 0x1000;
     58	scb->ipb = ((__u32) reason_code) << 16;
     59	scb->icptcode = ICPT_VALIDITY;
     60	return 1;
     61}
     62
     63/* mark the prefix as unmapped, this will block the VSIE */
     64static void prefix_unmapped(struct vsie_page *vsie_page)
     65{
     66	atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
     67}
     68
     69/* mark the prefix as unmapped and wait until the VSIE has been left */
     70static void prefix_unmapped_sync(struct vsie_page *vsie_page)
     71{
     72	prefix_unmapped(vsie_page);
     73	if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
     74		atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
     75	while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
     76		cpu_relax();
     77}
     78
     79/* mark the prefix as mapped, this will allow the VSIE to run */
     80static void prefix_mapped(struct vsie_page *vsie_page)
     81{
     82	atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
     83}
     84
     85/* test if the prefix is mapped into the gmap shadow */
     86static int prefix_is_mapped(struct vsie_page *vsie_page)
     87{
     88	return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
     89}
     90
     91/* copy the updated intervention request bits into the shadow scb */
     92static void update_intervention_requests(struct vsie_page *vsie_page)
     93{
     94	const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
     95	int cpuflags;
     96
     97	cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
     98	atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
     99	atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
    100}
    101
    102/* shadow (filter and validate) the cpuflags  */
    103static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    104{
    105	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    106	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
    107	int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
    108
    109	/* we don't allow ESA/390 guests */
    110	if (!(cpuflags & CPUSTAT_ZARCH))
    111		return set_validity_icpt(scb_s, 0x0001U);
    112
    113	if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
    114		return set_validity_icpt(scb_s, 0x0001U);
    115	else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
    116		return set_validity_icpt(scb_s, 0x0007U);
    117
    118	/* intervention requests will be set later */
    119	newflags = CPUSTAT_ZARCH;
    120	if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
    121		newflags |= CPUSTAT_GED;
    122	if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
    123		if (cpuflags & CPUSTAT_GED)
    124			return set_validity_icpt(scb_s, 0x0001U);
    125		newflags |= CPUSTAT_GED2;
    126	}
    127	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
    128		newflags |= cpuflags & CPUSTAT_P;
    129	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
    130		newflags |= cpuflags & CPUSTAT_SM;
    131	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
    132		newflags |= cpuflags & CPUSTAT_IBS;
    133	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_KSS))
    134		newflags |= cpuflags & CPUSTAT_KSS;
    135
    136	atomic_set(&scb_s->cpuflags, newflags);
    137	return 0;
    138}
    139/* Copy to APCB FORMAT1 from APCB FORMAT0 */
    140static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
    141			unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h)
    142{
    143	struct kvm_s390_apcb0 tmp;
    144
    145	if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0)))
    146		return -EFAULT;
    147
    148	apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0];
    149	apcb_s->aqm[0] = apcb_h->aqm[0] & tmp.aqm[0] & 0xffff000000000000UL;
    150	apcb_s->adm[0] = apcb_h->adm[0] & tmp.adm[0] & 0xffff000000000000UL;
    151
    152	return 0;
    153
    154}
    155
    156/**
    157 * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0
    158 * @vcpu: pointer to the virtual CPU
    159 * @apcb_s: pointer to start of apcb in the shadow crycb
    160 * @apcb_o: pointer to start of original apcb in the guest2
    161 * @apcb_h: pointer to start of apcb in the guest1
    162 *
     163 * Returns 0 on success and -EFAULT on error reading the guest apcb
    164 */
    165static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
    166			unsigned long apcb_o, unsigned long *apcb_h)
    167{
    168	if (read_guest_real(vcpu, apcb_o, apcb_s,
    169			    sizeof(struct kvm_s390_apcb0)))
    170		return -EFAULT;
    171
    172	bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0));
    173
    174	return 0;
    175}
    176
    177/**
    178 * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB
    179 * @vcpu: pointer to the virtual CPU
    180 * @apcb_s: pointer to start of apcb in the shadow crycb
    181 * @apcb_o: pointer to start of original guest apcb
    182 * @apcb_h: pointer to start of apcb in the host
    183 *
     184 * Returns 0 on success and -EFAULT on error reading the guest apcb
    185 */
    186static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
    187			unsigned long apcb_o,
    188			unsigned long *apcb_h)
    189{
    190	if (read_guest_real(vcpu, apcb_o, apcb_s,
    191			    sizeof(struct kvm_s390_apcb1)))
    192		return -EFAULT;
    193
    194	bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1));
    195
    196	return 0;
    197}
    198
    199/**
    200 * setup_apcb - Create a shadow copy of the apcb.
    201 * @vcpu: pointer to the virtual CPU
    202 * @crycb_s: pointer to shadow crycb
    203 * @crycb_o: pointer to original guest crycb
    204 * @crycb_h: pointer to the host crycb
    205 * @fmt_o: format of the original guest crycb.
    206 * @fmt_h: format of the host crycb.
    207 *
    208 * Checks the compatibility between the guest and host crycb and calls the
    209 * appropriate copy function.
    210 *
     211 * Returns 0 on success or an error number if the guest and host crycb are incompatible.
    212 */
    213static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s,
    214	       const u32 crycb_o,
    215	       struct kvm_s390_crypto_cb *crycb_h,
    216	       int fmt_o, int fmt_h)
    217{
    218	struct kvm_s390_crypto_cb *crycb;
    219
    220	crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o;
    221
    222	switch (fmt_o) {
    223	case CRYCB_FORMAT2:
    224		if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK))
    225			return -EACCES;
    226		if (fmt_h != CRYCB_FORMAT2)
    227			return -EINVAL;
    228		return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1,
    229				    (unsigned long) &crycb->apcb1,
    230				    (unsigned long *)&crycb_h->apcb1);
    231	case CRYCB_FORMAT1:
    232		switch (fmt_h) {
    233		case CRYCB_FORMAT2:
    234			return setup_apcb10(vcpu, &crycb_s->apcb1,
    235					    (unsigned long) &crycb->apcb0,
    236					    &crycb_h->apcb1);
    237		case CRYCB_FORMAT1:
    238			return setup_apcb00(vcpu,
    239					    (unsigned long *) &crycb_s->apcb0,
    240					    (unsigned long) &crycb->apcb0,
    241					    (unsigned long *) &crycb_h->apcb0);
    242		}
    243		break;
    244	case CRYCB_FORMAT0:
    245		if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK))
    246			return -EACCES;
    247
    248		switch (fmt_h) {
    249		case CRYCB_FORMAT2:
    250			return setup_apcb10(vcpu, &crycb_s->apcb1,
    251					    (unsigned long) &crycb->apcb0,
    252					    &crycb_h->apcb1);
    253		case CRYCB_FORMAT1:
    254		case CRYCB_FORMAT0:
    255			return setup_apcb00(vcpu,
    256					    (unsigned long *) &crycb_s->apcb0,
    257					    (unsigned long) &crycb->apcb0,
    258					    (unsigned long *) &crycb_h->apcb0);
    259		}
    260	}
    261	return -EINVAL;
    262}
    263
    264/**
    265 * shadow_crycb - Create a shadow copy of the crycb block
    266 * @vcpu: a pointer to the virtual CPU
     267 * @vsie_page: a pointer to internal data used for the vSIE
    268 *
    269 * Create a shadow copy of the crycb block and setup key wrapping, if
    270 * requested for guest 3 and enabled for guest 2.
    271 *
    272 * We accept format-1 or format-2, but we convert format-1 into format-2
    273 * in the shadow CRYCB.
    274 * Using format-2 enables the firmware to choose the right format when
    275 * scheduling the SIE.
    276 * There is nothing to do for format-0.
    277 *
     278 * This function centralizes the issuing of set_validity_icpt() for all
    279 * the subfunctions working on the crycb.
    280 *
    281 * Returns: - 0 if shadowed or nothing to do
    282 *          - > 0 if control has to be given to guest 2
    283 */
    284static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    285{
    286	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    287	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
    288	const uint32_t crycbd_o = READ_ONCE(scb_o->crycbd);
    289	const u32 crycb_addr = crycbd_o & 0x7ffffff8U;
    290	unsigned long *b1, *b2;
    291	u8 ecb3_flags;
    292	u32 ecd_flags;
    293	int apie_h;
    294	int apie_s;
    295	int key_msk = test_kvm_facility(vcpu->kvm, 76);
    296	int fmt_o = crycbd_o & CRYCB_FORMAT_MASK;
    297	int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK;
    298	int ret = 0;
    299
    300	scb_s->crycbd = 0;
    301
    302	apie_h = vcpu->arch.sie_block->eca & ECA_APIE;
    303	apie_s = apie_h & scb_o->eca;
    304	if (!apie_s && (!key_msk || (fmt_o == CRYCB_FORMAT0)))
    305		return 0;
    306
    307	if (!crycb_addr)
    308		return set_validity_icpt(scb_s, 0x0039U);
    309
    310	if (fmt_o == CRYCB_FORMAT1)
    311		if ((crycb_addr & PAGE_MASK) !=
    312		    ((crycb_addr + 128) & PAGE_MASK))
    313			return set_validity_icpt(scb_s, 0x003CU);
    314
    315	if (apie_s) {
    316		ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr,
    317				 vcpu->kvm->arch.crypto.crycb,
    318				 fmt_o, fmt_h);
    319		if (ret)
    320			goto end;
    321		scb_s->eca |= scb_o->eca & ECA_APIE;
    322	}
    323
    324	/* we may only allow it if enabled for guest 2 */
    325	ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
    326		     (ECB3_AES | ECB3_DEA);
    327	ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC;
    328	if (!ecb3_flags && !ecd_flags)
    329		goto end;
    330
    331	/* copy only the wrapping keys */
    332	if (read_guest_real(vcpu, crycb_addr + 72,
    333			    vsie_page->crycb.dea_wrapping_key_mask, 56))
    334		return set_validity_icpt(scb_s, 0x0035U);
    335
    336	scb_s->ecb3 |= ecb3_flags;
    337	scb_s->ecd |= ecd_flags;
    338
    339	/* xor both blocks in one run */
    340	b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
    341	b2 = (unsigned long *)
    342			    vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
    343	/* as 56%8 == 0, bitmap_xor won't overwrite any data */
    344	bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
    345end:
    346	switch (ret) {
    347	case -EINVAL:
    348		return set_validity_icpt(scb_s, 0x0022U);
    349	case -EFAULT:
    350		return set_validity_icpt(scb_s, 0x0035U);
    351	case -EACCES:
    352		return set_validity_icpt(scb_s, 0x003CU);
    353	}
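        	/* the vsie_page is allocated with GFP_DMA (below 2 GB on s390), so
        	 * truncating its crycb address to 32 bits below is lossless */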
    354	scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
    355	return 0;
    356}
    357
    358/* shadow (round up/down) the ibc to avoid validity icpt */
    359static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    360{
    361	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    362	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
    363	/* READ_ONCE does not work on bitfields - use a temporary variable */
    364	const uint32_t __new_ibc = scb_o->ibc;
    365	const uint32_t new_ibc = READ_ONCE(__new_ibc) & 0x0fffU;
    366	__u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
    367
    368	scb_s->ibc = 0;
    369	/* ibc installed in g2 and requested for g3 */
    370	if (vcpu->kvm->arch.model.ibc && new_ibc) {
    371		scb_s->ibc = new_ibc;
     372		/* take care of the minimum ibc level of the machine */
    373		if (scb_s->ibc < min_ibc)
    374			scb_s->ibc = min_ibc;
    375		/* take care of the maximum ibc level set for the guest */
    376		if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
    377			scb_s->ibc = vcpu->kvm->arch.model.ibc;
    378	}
    379}
    380
    381/* unshadow the scb, copying parameters back to the real scb */
    382static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    383{
    384	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    385	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
    386
    387	/* interception */
    388	scb_o->icptcode = scb_s->icptcode;
    389	scb_o->icptstatus = scb_s->icptstatus;
    390	scb_o->ipa = scb_s->ipa;
    391	scb_o->ipb = scb_s->ipb;
    392	scb_o->gbea = scb_s->gbea;
    393
    394	/* timer */
    395	scb_o->cputm = scb_s->cputm;
    396	scb_o->ckc = scb_s->ckc;
    397	scb_o->todpr = scb_s->todpr;
    398
    399	/* guest state */
    400	scb_o->gpsw = scb_s->gpsw;
    401	scb_o->gg14 = scb_s->gg14;
    402	scb_o->gg15 = scb_s->gg15;
    403	memcpy(scb_o->gcr, scb_s->gcr, 128);
    404	scb_o->pp = scb_s->pp;
    405
    406	/* branch prediction */
    407	if (test_kvm_facility(vcpu->kvm, 82)) {
    408		scb_o->fpf &= ~FPF_BPBC;
    409		scb_o->fpf |= scb_s->fpf & FPF_BPBC;
    410	}
    411
    412	/* interrupt intercept */
    413	switch (scb_s->icptcode) {
    414	case ICPT_PROGI:
    415	case ICPT_INSTPROGI:
    416	case ICPT_EXTINT:
    417		memcpy((void *)((u64)scb_o + 0xc0),
    418		       (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
    419		break;
    420	}
    421
    422	if (scb_s->ihcpu != 0xffffU)
    423		scb_o->ihcpu = scb_s->ihcpu;
    424}
    425
    426/*
    427 * Setup the shadow scb by copying and checking the relevant parts of the g2
    428 * provided scb.
    429 *
    430 * Returns: - 0 if the scb has been shadowed
    431 *          - > 0 if control has to be given to guest 2
    432 */
    433static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    434{
    435	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
    436	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    437	/* READ_ONCE does not work on bitfields - use a temporary variable */
    438	const uint32_t __new_prefix = scb_o->prefix;
    439	const uint32_t new_prefix = READ_ONCE(__new_prefix);
    440	const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE;
    441	bool had_tx = scb_s->ecb & ECB_TE;
    442	unsigned long new_mso = 0;
    443	int rc;
    444
    445	/* make sure we don't have any leftovers when reusing the scb */
    446	scb_s->icptcode = 0;
    447	scb_s->eca = 0;
    448	scb_s->ecb = 0;
    449	scb_s->ecb2 = 0;
    450	scb_s->ecb3 = 0;
    451	scb_s->ecd = 0;
    452	scb_s->fac = 0;
    453	scb_s->fpf = 0;
    454
    455	rc = prepare_cpuflags(vcpu, vsie_page);
    456	if (rc)
    457		goto out;
    458
    459	/* timer */
    460	scb_s->cputm = scb_o->cputm;
    461	scb_s->ckc = scb_o->ckc;
    462	scb_s->todpr = scb_o->todpr;
    463	scb_s->epoch = scb_o->epoch;
    464
    465	/* guest state */
    466	scb_s->gpsw = scb_o->gpsw;
    467	scb_s->gg14 = scb_o->gg14;
    468	scb_s->gg15 = scb_o->gg15;
    469	memcpy(scb_s->gcr, scb_o->gcr, 128);
    470	scb_s->pp = scb_o->pp;
    471
    472	/* interception / execution handling */
    473	scb_s->gbea = scb_o->gbea;
    474	scb_s->lctl = scb_o->lctl;
    475	scb_s->svcc = scb_o->svcc;
    476	scb_s->ictl = scb_o->ictl;
    477	/*
    478	 * SKEY handling functions can't deal with false setting of PTE invalid
    479	 * bits. Therefore we cannot provide interpretation and would later
     480	 * have to provide our own emulation handlers.
    481	 */
    482	if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_KSS))
    483		scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
    484
    485	scb_s->icpua = scb_o->icpua;
    486
    487	if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
    488		new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL;
    489	/* if the hva of the prefix changes, we have to remap the prefix */
    490	if (scb_s->mso != new_mso || scb_s->prefix != new_prefix)
    491		prefix_unmapped(vsie_page);
    492	 /* SIE will do mso/msl validity and exception checks for us */
    493	scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
    494	scb_s->mso = new_mso;
    495	scb_s->prefix = new_prefix;
    496
     497	/* We definitely have to flush the tlb if this scb never ran */
    498	if (scb_s->ihcpu != 0xffffU)
    499		scb_s->ihcpu = scb_o->ihcpu;
    500
    501	/* MVPG and Protection Exception Interpretation are always available */
    502	scb_s->eca |= scb_o->eca & (ECA_MVPGI | ECA_PROTEXCI);
    503	/* Host-protection-interruption introduced with ESOP */
    504	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
    505		scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
    506	/* transactional execution */
    507	if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
     508		/* remap the prefix if tx is toggled on */
    509		if (!had_tx)
    510			prefix_unmapped(vsie_page);
    511		scb_s->ecb |= ECB_TE;
    512	}
    513	/* specification exception interpretation */
    514	scb_s->ecb |= scb_o->ecb & ECB_SPECI;
    515	/* branch prediction */
    516	if (test_kvm_facility(vcpu->kvm, 82))
    517		scb_s->fpf |= scb_o->fpf & FPF_BPBC;
    518	/* SIMD */
    519	if (test_kvm_facility(vcpu->kvm, 129)) {
    520		scb_s->eca |= scb_o->eca & ECA_VX;
    521		scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT;
    522	}
    523	/* Run-time-Instrumentation */
    524	if (test_kvm_facility(vcpu->kvm, 64))
    525		scb_s->ecb3 |= scb_o->ecb3 & ECB3_RI;
    526	/* Instruction Execution Prevention */
    527	if (test_kvm_facility(vcpu->kvm, 130))
    528		scb_s->ecb2 |= scb_o->ecb2 & ECB2_IEP;
    529	/* Guarded Storage */
    530	if (test_kvm_facility(vcpu->kvm, 133)) {
    531		scb_s->ecb |= scb_o->ecb & ECB_GS;
    532		scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT;
    533	}
    534	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
    535		scb_s->eca |= scb_o->eca & ECA_SII;
    536	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
    537		scb_s->eca |= scb_o->eca & ECA_IB;
    538	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
    539		scb_s->eca |= scb_o->eca & ECA_CEI;
    540	/* Epoch Extension */
    541	if (test_kvm_facility(vcpu->kvm, 139))
    542		scb_s->ecd |= scb_o->ecd & ECD_MEF;
    543
    544	/* etoken */
    545	if (test_kvm_facility(vcpu->kvm, 156))
    546		scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
    547
    548	scb_s->hpid = HPID_VSIE;
    549	scb_s->cpnc = scb_o->cpnc;
    550
    551	prepare_ibc(vcpu, vsie_page);
    552	rc = shadow_crycb(vcpu, vsie_page);
    553out:
    554	if (rc)
    555		unshadow_scb(vcpu, vsie_page);
    556	return rc;
    557}
    558
    559void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
    560				 unsigned long end)
    561{
    562	struct kvm *kvm = gmap->private;
    563	struct vsie_page *cur;
    564	unsigned long prefix;
    565	struct page *page;
    566	int i;
    567
    568	if (!gmap_is_shadow(gmap))
    569		return;
    570	if (start >= 1UL << 31)
    571		/* We are only interested in prefix pages */
    572		return;
    573
    574	/*
    575	 * Only new shadow blocks are added to the list during runtime,
    576	 * therefore we can safely reference them all the time.
    577	 */
    578	for (i = 0; i < kvm->arch.vsie.page_count; i++) {
    579		page = READ_ONCE(kvm->arch.vsie.pages[i]);
    580		if (!page)
    581			continue;
    582		cur = page_to_virt(page);
    583		if (READ_ONCE(cur->gmap) != gmap)
    584			continue;
    585		prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
    586		/* with mso/msl, the prefix lies at an offset */
    587		prefix += cur->scb_s.mso;
    588		if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
    589			prefix_unmapped_sync(cur);
    590	}
    591}
    592
    593/*
    594 * Map the first prefix page and if tx is enabled also the second prefix page.
    595 *
    596 * The prefix will be protected, a gmap notifier will inform about unmaps.
    597 * The shadow scb must not be executed until the prefix is remapped, this is
    598 * guaranteed by properly handling PROG_REQUEST.
    599 *
     600 * Returns: - 0 if successfully mapped or already mapped
    601 *          - > 0 if control has to be given to guest 2
    602 *          - -EAGAIN if the caller can retry immediately
    603 *          - -ENOMEM if out of memory
    604 */
    605static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    606{
    607	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    608	u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
    609	int rc;
    610
    611	if (prefix_is_mapped(vsie_page))
    612		return 0;
    613
    614	/* mark it as mapped so we can catch any concurrent unmappers */
    615	prefix_mapped(vsie_page);
    616
    617	/* with mso/msl, the prefix lies at offset *mso* */
    618	prefix += scb_s->mso;
    619
    620	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
    621	if (!rc && (scb_s->ecb & ECB_TE))
    622		rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
    623					   prefix + PAGE_SIZE, NULL);
    624	/*
    625	 * We don't have to mprotect, we will be called for all unshadows.
    626	 * SIE will detect if protection applies and trigger a validity.
    627	 */
    628	if (rc)
    629		prefix_unmapped(vsie_page);
    630	if (rc > 0 || rc == -EFAULT)
    631		rc = set_validity_icpt(scb_s, 0x0037U);
    632	return rc;
    633}
    634
    635/*
    636 * Pin the guest page given by gpa and set hpa to the pinned host address.
    637 * Will always be pinned writable.
    638 *
    639 * Returns: - 0 on success
    640 *          - -EINVAL if the gpa is not valid guest storage
    641 */
    642static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
    643{
    644	struct page *page;
    645
    646	page = gfn_to_page(kvm, gpa_to_gfn(gpa));
    647	if (is_error_page(page))
    648		return -EINVAL;
    649	*hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
    650	return 0;
    651}
    652
    653/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
    654static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
    655{
    656	kvm_release_pfn_dirty(hpa >> PAGE_SHIFT);
    657	/* mark the page always as dirty for migration */
    658	mark_page_dirty(kvm, gpa_to_gfn(gpa));
    659}
    660
    661/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
    662static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    663{
    664	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    665	hpa_t hpa;
    666
    667	hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
    668	if (hpa) {
    669		unpin_guest_page(vcpu->kvm, vsie_page->sca_gpa, hpa);
    670		vsie_page->sca_gpa = 0;
    671		scb_s->scaol = 0;
    672		scb_s->scaoh = 0;
    673	}
    674
    675	hpa = scb_s->itdba;
    676	if (hpa) {
    677		unpin_guest_page(vcpu->kvm, vsie_page->itdba_gpa, hpa);
    678		vsie_page->itdba_gpa = 0;
    679		scb_s->itdba = 0;
    680	}
    681
    682	hpa = scb_s->gvrd;
    683	if (hpa) {
    684		unpin_guest_page(vcpu->kvm, vsie_page->gvrd_gpa, hpa);
    685		vsie_page->gvrd_gpa = 0;
    686		scb_s->gvrd = 0;
    687	}
    688
    689	hpa = scb_s->riccbd;
    690	if (hpa) {
    691		unpin_guest_page(vcpu->kvm, vsie_page->riccbd_gpa, hpa);
    692		vsie_page->riccbd_gpa = 0;
    693		scb_s->riccbd = 0;
    694	}
    695
    696	hpa = scb_s->sdnxo;
    697	if (hpa) {
    698		unpin_guest_page(vcpu->kvm, vsie_page->sdnx_gpa, hpa);
    699		vsie_page->sdnx_gpa = 0;
    700		scb_s->sdnxo = 0;
    701	}
    702}
    703
    704/*
    705 * Instead of shadowing some blocks, we can simply forward them because the
    706 * addresses in the scb are 64 bit long.
    707 *
    708 * This works as long as the data lies in one page. If blocks ever exceed one
    709 * page, we have to fall back to shadowing.
    710 *
    711 * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
    712 * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
    713 *
    714 * Returns: - 0 if all blocks were pinned.
    715 *          - > 0 if control has to be given to guest 2
    716 *          - -ENOMEM if out of memory
    717 */
    718static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    719{
    720	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
    721	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    722	hpa_t hpa;
    723	gpa_t gpa;
    724	int rc = 0;
    725
    726	gpa = READ_ONCE(scb_o->scaol) & ~0xfUL;
    727	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
    728		gpa |= (u64) READ_ONCE(scb_o->scaoh) << 32;
    729	if (gpa) {
    730		if (gpa < 2 * PAGE_SIZE)
    731			rc = set_validity_icpt(scb_s, 0x0038U);
    732		else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
    733			rc = set_validity_icpt(scb_s, 0x0011U);
    734		else if ((gpa & PAGE_MASK) !=
    735			 ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
    736			rc = set_validity_icpt(scb_s, 0x003bU);
    737		if (!rc) {
    738			rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
    739			if (rc)
    740				rc = set_validity_icpt(scb_s, 0x0034U);
    741		}
    742		if (rc)
    743			goto unpin;
    744		vsie_page->sca_gpa = gpa;
    745		scb_s->scaoh = (u32)((u64)hpa >> 32);
    746		scb_s->scaol = (u32)(u64)hpa;
    747	}
    748
    749	gpa = READ_ONCE(scb_o->itdba) & ~0xffUL;
    750	if (gpa && (scb_s->ecb & ECB_TE)) {
    751		if (gpa < 2 * PAGE_SIZE) {
    752			rc = set_validity_icpt(scb_s, 0x0080U);
    753			goto unpin;
    754		}
    755		/* 256 bytes cannot cross page boundaries */
    756		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
    757		if (rc) {
    758			rc = set_validity_icpt(scb_s, 0x0080U);
    759			goto unpin;
    760		}
    761		vsie_page->itdba_gpa = gpa;
    762		scb_s->itdba = hpa;
    763	}
    764
    765	gpa = READ_ONCE(scb_o->gvrd) & ~0x1ffUL;
    766	if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
    767		if (gpa < 2 * PAGE_SIZE) {
    768			rc = set_validity_icpt(scb_s, 0x1310U);
    769			goto unpin;
    770		}
    771		/*
     772		 * 512-byte vector registers cannot cross page boundaries;
    773		 * if this block gets bigger, we have to shadow it.
    774		 */
    775		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
    776		if (rc) {
    777			rc = set_validity_icpt(scb_s, 0x1310U);
    778			goto unpin;
    779		}
    780		vsie_page->gvrd_gpa = gpa;
    781		scb_s->gvrd = hpa;
    782	}
    783
    784	gpa = READ_ONCE(scb_o->riccbd) & ~0x3fUL;
    785	if (gpa && (scb_s->ecb3 & ECB3_RI)) {
    786		if (gpa < 2 * PAGE_SIZE) {
    787			rc = set_validity_icpt(scb_s, 0x0043U);
    788			goto unpin;
    789		}
    790		/* 64 bytes cannot cross page boundaries */
    791		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
    792		if (rc) {
    793			rc = set_validity_icpt(scb_s, 0x0043U);
    794			goto unpin;
    795		}
    796		/* Validity 0x0044 will be checked by SIE */
    797		vsie_page->riccbd_gpa = gpa;
    798		scb_s->riccbd = hpa;
    799	}
    800	if (((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) ||
    801	    (scb_s->ecd & ECD_ETOKENF)) {
    802		unsigned long sdnxc;
    803
    804		gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL;
    805		sdnxc = READ_ONCE(scb_o->sdnxo) & 0xfUL;
    806		if (!gpa || gpa < 2 * PAGE_SIZE) {
    807			rc = set_validity_icpt(scb_s, 0x10b0U);
    808			goto unpin;
    809		}
    810		if (sdnxc < 6 || sdnxc > 12) {
    811			rc = set_validity_icpt(scb_s, 0x10b1U);
    812			goto unpin;
    813		}
    814		if (gpa & ((1 << sdnxc) - 1)) {
    815			rc = set_validity_icpt(scb_s, 0x10b2U);
    816			goto unpin;
    817		}
    818		/* Due to alignment rules (checked above) this cannot
    819		 * cross page boundaries
    820		 */
    821		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
    822		if (rc) {
    823			rc = set_validity_icpt(scb_s, 0x10b0U);
    824			goto unpin;
    825		}
    826		vsie_page->sdnx_gpa = gpa;
    827		scb_s->sdnxo = hpa | sdnxc;
    828	}
    829	return 0;
    830unpin:
    831	unpin_blocks(vcpu, vsie_page);
    832	return rc;
    833}
    834
    835/* unpin the scb provided by guest 2, marking it as dirty */
    836static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
    837		      gpa_t gpa)
    838{
    839	hpa_t hpa = (hpa_t) vsie_page->scb_o;
    840
    841	if (hpa)
    842		unpin_guest_page(vcpu->kvm, gpa, hpa);
    843	vsie_page->scb_o = NULL;
    844}
    845
    846/*
    847 * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
    848 *
    849 * Returns: - 0 if the scb was pinned.
    850 *          - > 0 if control has to be given to guest 2
    851 */
    852static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
    853		   gpa_t gpa)
    854{
    855	hpa_t hpa;
    856	int rc;
    857
    858	rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
    859	if (rc) {
    860		rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
    861		WARN_ON_ONCE(rc);
    862		return 1;
    863	}
    864	vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
    865	return 0;
    866}
    867
    868/*
    869 * Inject a fault into guest 2.
    870 *
    871 * Returns: - > 0 if control has to be given to guest 2
    872 *            < 0 if an error occurred during injection.
    873 */
    874static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
    875			bool write_flag)
    876{
    877	struct kvm_s390_pgm_info pgm = {
    878		.code = code,
    879		.trans_exc_code =
    880			/* 0-51: virtual address */
    881			(vaddr & 0xfffffffffffff000UL) |
    882			/* 52-53: store / fetch */
    883			(((unsigned int) !write_flag) + 1) << 10,
     884			/* 62-63: asce id (always primary == 0) */
    885		.exc_access_id = 0, /* always primary */
    886		.op_access_id = 0, /* not MVPG */
    887	};
    888	int rc;
    889
    890	if (code == PGM_PROTECTION)
    891		pgm.trans_exc_code |= 0x4UL;
    892
    893	rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
    894	return rc ? rc : 1;
    895}
    896
    897/*
    898 * Handle a fault during vsie execution on a gmap shadow.
    899 *
    900 * Returns: - 0 if the fault was resolved
    901 *          - > 0 if control has to be given to guest 2
    902 *          - < 0 if an error occurred
    903 */
    904static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    905{
    906	int rc;
    907
    908	if (current->thread.gmap_int_code == PGM_PROTECTION)
    909		/* we can directly forward all protection exceptions */
    910		return inject_fault(vcpu, PGM_PROTECTION,
    911				    current->thread.gmap_addr, 1);
    912
    913	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
    914				   current->thread.gmap_addr, NULL);
    915	if (rc > 0) {
    916		rc = inject_fault(vcpu, rc,
    917				  current->thread.gmap_addr,
    918				  current->thread.gmap_write_flag);
    919		if (rc >= 0)
    920			vsie_page->fault_addr = current->thread.gmap_addr;
    921	}
    922	return rc;
    923}
    924
    925/*
    926 * Retry the previous fault that required guest 2 intervention. This avoids
    927 * one superfluous SIE re-entry and direct exit.
    928 *
    929 * Will ignore any errors. The next SIE fault will do proper fault handling.
    930 */
    931static void handle_last_fault(struct kvm_vcpu *vcpu,
    932			      struct vsie_page *vsie_page)
    933{
    934	if (vsie_page->fault_addr)
    935		kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
    936				      vsie_page->fault_addr, NULL);
    937	vsie_page->fault_addr = 0;
    938}
    939
    940static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
    941{
    942	vsie_page->scb_s.icptcode = 0;
    943}
    944
    945/* rewind the psw and clear the vsie icpt, so we can retry execution */
    946static void retry_vsie_icpt(struct vsie_page *vsie_page)
    947{
    948	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    949	int ilen = insn_length(scb_s->ipa >> 8);
    950
    951	/* take care of EXECUTE instructions */
    952	if (scb_s->icptstatus & 1) {
    953		ilen = (scb_s->icptstatus >> 4) & 0x6;
    954		if (!ilen)
    955			ilen = 4;
    956	}
    957	scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
    958	clear_vsie_icpt(vsie_page);
    959}
    960
    961/*
    962 * Try to shadow + enable the guest 2 provided facility list.
    963 * Retry instruction execution if enabled for and provided by guest 2.
    964 *
    965 * Returns: - 0 if handled (retry or guest 2 icpt)
    966 *          - > 0 if control has to be given to guest 2
    967 */
    968static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
    969{
    970	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
    971	__u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U;
    972
    973	if (fac && test_kvm_facility(vcpu->kvm, 7)) {
    974		retry_vsie_icpt(vsie_page);
    975		if (read_guest_real(vcpu, fac, &vsie_page->fac,
    976				    sizeof(vsie_page->fac)))
    977			return set_validity_icpt(scb_s, 0x1090U);
    978		scb_s->fac = (__u32)(__u64) &vsie_page->fac;
    979	}
    980	return 0;
    981}
    982
    983/*
    984 * Get a register for a nested guest.
    985 * @vcpu the vcpu of the guest
    986 * @vsie_page the vsie_page for the nested guest
    987 * @reg the register number, the upper 4 bits are ignored.
    988 * returns: the value of the register.
    989 */
    990static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
    991{
    992	/* no need to validate the parameter and/or perform error handling */
    993	reg &= 0xf;
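        	/* r14 and r15 are kept in the (shadow) SIE block while the vSIE runs;
        	 * the remaining GPRs are shared via vcpu->run->s.regs.gprs */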
    994	switch (reg) {
    995	case 15:
    996		return vsie_page->scb_s.gg15;
    997	case 14:
    998		return vsie_page->scb_s.gg14;
    999	default:
   1000		return vcpu->run->s.regs.gprs[reg];
   1001	}
   1002}
   1003
   1004static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
   1005{
   1006	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
   1007	unsigned long pei_dest, pei_src, src, dest, mask, prefix;
   1008	u64 *pei_block = &vsie_page->scb_o->mcic;
   1009	int edat, rc_dest, rc_src;
   1010	union ctlreg0 cr0;
   1011
   1012	cr0.val = vcpu->arch.sie_block->gcr[0];
   1013	edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
   1014	mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
   1015	prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
   1016
   1017	dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
   1018	dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
   1019	src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
   1020	src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
   1021
   1022	rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
   1023	rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
   1024	/*
   1025	 * Either everything went well, or something non-critical went wrong
   1026	 * e.g. because of a race. In either case, simply retry.
   1027	 */
   1028	if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
   1029		retry_vsie_icpt(vsie_page);
   1030		return -EAGAIN;
   1031	}
   1032	/* Something more serious went wrong, propagate the error */
   1033	if (rc_dest < 0)
   1034		return rc_dest;
   1035	if (rc_src < 0)
   1036		return rc_src;
   1037
   1038	/* The only possible suppressing exception: just deliver it */
   1039	if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
   1040		clear_vsie_icpt(vsie_page);
   1041		rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
   1042		WARN_ON_ONCE(rc_dest);
   1043		return 1;
   1044	}
   1045
   1046	/*
   1047	 * Forward the PEI intercept to the guest if it was a page fault, or
   1048	 * also for segment and region table faults if EDAT applies.
   1049	 */
   1050	if (edat) {
   1051		rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
   1052		rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
   1053	} else {
   1054		rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
   1055		rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
   1056	}
   1057	if (!rc_dest && !rc_src) {
   1058		pei_block[0] = pei_dest;
   1059		pei_block[1] = pei_src;
   1060		return 1;
   1061	}
   1062
   1063	retry_vsie_icpt(vsie_page);
   1064
   1065	/*
   1066	 * The host has edat, and the guest does not, or it was an ASCE type
   1067	 * exception. The host needs to inject the appropriate DAT interrupts
   1068	 * into the guest.
   1069	 */
   1070	if (rc_dest)
   1071		return inject_fault(vcpu, rc_dest, dest, 1);
   1072	return inject_fault(vcpu, rc_src, src, 0);
   1073}
   1074
   1075/*
   1076 * Run the vsie on a shadow scb and a shadow gmap, without any further
   1077 * sanity checks, handling SIE faults.
   1078 *
   1079 * Returns: - 0 everything went fine
   1080 *          - > 0 if control has to be given to guest 2
   1081 *          - < 0 if an error occurred
   1082 */
   1083static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
   1084	__releases(vcpu->kvm->srcu)
   1085	__acquires(vcpu->kvm->srcu)
   1086{
   1087	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
   1088	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
   1089	int guest_bp_isolation;
   1090	int rc = 0;
   1091
   1092	handle_last_fault(vcpu, vsie_page);
   1093
   1094	kvm_vcpu_srcu_read_unlock(vcpu);
   1095
   1096	/* save current guest state of bp isolation override */
   1097	guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST);
   1098
   1099	/*
   1100	 * The guest is running with BPBC, so we have to force it on for our
   1101	 * nested guest. This is done by enabling BPBC globally, so the BPBC
   1102	 * control in the SCB (which the nested guest can modify) is simply
   1103	 * ignored.
   1104	 */
   1105	if (test_kvm_facility(vcpu->kvm, 82) &&
   1106	    vcpu->arch.sie_block->fpf & FPF_BPBC)
   1107		set_thread_flag(TIF_ISOLATE_BP_GUEST);
   1108
   1109	local_irq_disable();
   1110	guest_enter_irqoff();
   1111	local_irq_enable();
   1112
   1113	/*
   1114	 * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
   1115	 * and VCPU requests also hinder the vSIE from running and lead
   1116	 * to an immediate exit. kvm_s390_vsie_kick() has to be used to
   1117	 * also kick the vSIE.
   1118	 */
   1119	vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
   1120	barrier();
   1121	if (test_cpu_flag(CIF_FPU))
   1122		load_fpu_regs();
   1123	if (!kvm_s390_vcpu_sie_inhibited(vcpu))
   1124		rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
   1125	barrier();
   1126	vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
   1127
   1128	local_irq_disable();
   1129	guest_exit_irqoff();
   1130	local_irq_enable();
   1131
   1132	/* restore guest state for bp isolation override */
   1133	if (!guest_bp_isolation)
   1134		clear_thread_flag(TIF_ISOLATE_BP_GUEST);
   1135
   1136	kvm_vcpu_srcu_read_lock(vcpu);
   1137
   1138	if (rc == -EINTR) {
   1139		VCPU_EVENT(vcpu, 3, "%s", "machine check");
   1140		kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info);
   1141		return 0;
   1142	}
   1143
   1144	if (rc > 0)
   1145		rc = 0; /* we could still have an icpt */
   1146	else if (rc == -EFAULT)
   1147		return handle_fault(vcpu, vsie_page);
   1148
   1149	switch (scb_s->icptcode) {
   1150	case ICPT_INST:
   1151		if (scb_s->ipa == 0xb2b0)
   1152			rc = handle_stfle(vcpu, vsie_page);
   1153		break;
   1154	case ICPT_STOP:
   1155		/* stop not requested by g2 - must have been a kick */
   1156		if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
   1157			clear_vsie_icpt(vsie_page);
   1158		break;
   1159	case ICPT_VALIDITY:
   1160		if ((scb_s->ipa & 0xf000) != 0xf000)
   1161			scb_s->ipa += 0x1000;
   1162		break;
   1163	case ICPT_PARTEXEC:
   1164		if (scb_s->ipa == 0xb254)
   1165			rc = vsie_handle_mvpg(vcpu, vsie_page);
   1166		break;
   1167	}
   1168	return rc;
   1169}
   1170
   1171static void release_gmap_shadow(struct vsie_page *vsie_page)
   1172{
   1173	if (vsie_page->gmap)
   1174		gmap_put(vsie_page->gmap);
   1175	WRITE_ONCE(vsie_page->gmap, NULL);
   1176	prefix_unmapped(vsie_page);
   1177}
   1178
   1179static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
   1180			       struct vsie_page *vsie_page)
   1181{
   1182	unsigned long asce;
   1183	union ctlreg0 cr0;
   1184	struct gmap *gmap;
   1185	int edat;
   1186
   1187	asce = vcpu->arch.sie_block->gcr[1];
   1188	cr0.val = vcpu->arch.sie_block->gcr[0];
   1189	edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
   1190	edat += edat && test_kvm_facility(vcpu->kvm, 78);
   1191
   1192	/*
   1193	 * ASCE or EDAT could have changed since last icpt, or the gmap
   1194	 * we're holding has been unshadowed. If the gmap is still valid,
   1195	 * we can safely reuse it.
   1196	 */
   1197	if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
   1198		return 0;
   1199
    1200	/* release the old shadow, if any, and mark the prefix as unmapped */
   1201	release_gmap_shadow(vsie_page);
   1202	gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
   1203	if (IS_ERR(gmap))
   1204		return PTR_ERR(gmap);
   1205	gmap->private = vcpu->kvm;
   1206	WRITE_ONCE(vsie_page->gmap, gmap);
   1207	return 0;
   1208}
   1209
   1210/*
   1211 * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
   1212 */
   1213static void register_shadow_scb(struct kvm_vcpu *vcpu,
   1214				struct vsie_page *vsie_page)
   1215{
   1216	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
   1217
   1218	WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
   1219	/*
   1220	 * External calls have to lead to a kick of the vcpu and
   1221	 * therefore the vsie -> Simulate Wait state.
   1222	 */
   1223	kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
   1224	/*
   1225	 * We have to adjust the g3 epoch by the g2 epoch. The epoch will
   1226	 * automatically be adjusted on tod clock changes via kvm_sync_clock.
   1227	 */
   1228	preempt_disable();
   1229	scb_s->epoch += vcpu->kvm->arch.epoch;
   1230
   1231	if (scb_s->ecd & ECD_MEF) {
   1232		scb_s->epdx += vcpu->kvm->arch.epdx;
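        		/* carry into the epoch index if the epoch addition above wrapped */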
   1233		if (scb_s->epoch < vcpu->kvm->arch.epoch)
   1234			scb_s->epdx += 1;
   1235	}
   1236
   1237	preempt_enable();
   1238}
   1239
   1240/*
   1241 * Unregister a shadow scb from a VCPU.
   1242 */
   1243static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
   1244{
   1245	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
   1246	WRITE_ONCE(vcpu->arch.vsie_block, NULL);
   1247}
   1248
   1249/*
   1250 * Run the vsie on a shadowed scb, managing the gmap shadow, handling
   1251 * prefix pages and faults.
   1252 *
   1253 * Returns: - 0 if no errors occurred
   1254 *          - > 0 if control has to be given to guest 2
   1255 *          - -ENOMEM if out of memory
   1256 */
   1257static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
   1258{
   1259	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
   1260	int rc = 0;
   1261
   1262	while (1) {
   1263		rc = acquire_gmap_shadow(vcpu, vsie_page);
   1264		if (!rc)
   1265			rc = map_prefix(vcpu, vsie_page);
   1266		if (!rc) {
   1267			gmap_enable(vsie_page->gmap);
   1268			update_intervention_requests(vsie_page);
   1269			rc = do_vsie_run(vcpu, vsie_page);
   1270			gmap_enable(vcpu->arch.gmap);
   1271		}
   1272		atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
   1273
   1274		if (rc == -EAGAIN)
   1275			rc = 0;
   1276		if (rc || scb_s->icptcode || signal_pending(current) ||
   1277		    kvm_s390_vcpu_has_irq(vcpu, 0) ||
   1278		    kvm_s390_vcpu_sie_inhibited(vcpu))
   1279			break;
   1280		cond_resched();
   1281	}
   1282
   1283	if (rc == -EFAULT) {
   1284		/*
    1285		 * Addressing exceptions are always presented as intercepts.
   1286		 * As addressing exceptions are suppressing and our guest 3 PSW
   1287		 * points at the responsible instruction, we have to
    1288		 * forward the PSW and set the ilc. If we can't read the guest 3
   1289		 * instruction, we can use an arbitrary ilc. Let's always use
   1290		 * ilen = 4 for now, so we can avoid reading in guest 3 virtual
   1291		 * memory. (we could also fake the shadow so the hardware
   1292		 * handles it).
   1293		 */
   1294		scb_s->icptcode = ICPT_PROGI;
   1295		scb_s->iprcc = PGM_ADDRESSING;
   1296		scb_s->pgmilc = 4;
   1297		scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
   1298		rc = 1;
   1299	}
   1300	return rc;
   1301}
   1302
   1303/*
   1304 * Get or create a vsie page for a scb address.
   1305 *
   1306 * Returns: - address of a vsie page (cached or new one)
   1307 *          - NULL if the same scb address is already used by another VCPU
   1308 *          - ERR_PTR(-ENOMEM) if out of memory
   1309 */
   1310static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
   1311{
   1312	struct vsie_page *vsie_page;
   1313	struct page *page;
   1314	int nr_vcpus;
   1315
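        	/* the scb origin is 512-byte aligned (checked in kvm_s390_handle_vsie),
        	 * so addr >> 9 gives a unique radix tree index per scb */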
   1316	rcu_read_lock();
   1317	page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
   1318	rcu_read_unlock();
   1319	if (page) {
   1320		if (page_ref_inc_return(page) == 2)
   1321			return page_to_virt(page);
   1322		page_ref_dec(page);
   1323	}
   1324
   1325	/*
   1326	 * We want at least #online_vcpus shadows, so every VCPU can execute
   1327	 * the VSIE in parallel.
   1328	 */
   1329	nr_vcpus = atomic_read(&kvm->online_vcpus);
   1330
   1331	mutex_lock(&kvm->arch.vsie.mutex);
   1332	if (kvm->arch.vsie.page_count < nr_vcpus) {
   1333		page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
   1334		if (!page) {
   1335			mutex_unlock(&kvm->arch.vsie.mutex);
   1336			return ERR_PTR(-ENOMEM);
   1337		}
   1338		page_ref_inc(page);
   1339		kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
   1340		kvm->arch.vsie.page_count++;
   1341	} else {
   1342		/* reuse an existing entry that belongs to nobody */
   1343		while (true) {
   1344			page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
   1345			if (page_ref_inc_return(page) == 2)
   1346				break;
   1347			page_ref_dec(page);
   1348			kvm->arch.vsie.next++;
   1349			kvm->arch.vsie.next %= nr_vcpus;
   1350		}
   1351		radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
   1352	}
   1353	page->index = addr;
   1354	/* double use of the same address */
   1355	if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
   1356		page_ref_dec(page);
   1357		mutex_unlock(&kvm->arch.vsie.mutex);
   1358		return NULL;
   1359	}
   1360	mutex_unlock(&kvm->arch.vsie.mutex);
   1361
   1362	vsie_page = page_to_virt(page);
   1363	memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
   1364	release_gmap_shadow(vsie_page);
   1365	vsie_page->fault_addr = 0;
   1366	vsie_page->scb_s.ihcpu = 0xffffU;
   1367	return vsie_page;
   1368}
   1369
   1370/* put a vsie page acquired via get_vsie_page */
   1371static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
   1372{
   1373	struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
   1374
   1375	page_ref_dec(page);
   1376}
   1377
   1378int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
   1379{
   1380	struct vsie_page *vsie_page;
   1381	unsigned long scb_addr;
   1382	int rc;
   1383
   1384	vcpu->stat.instruction_sie++;
   1385	if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
   1386		return -EOPNOTSUPP;
   1387	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
   1388		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
   1389
   1390	BUILD_BUG_ON(sizeof(struct vsie_page) != PAGE_SIZE);
   1391	scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
   1392
   1393	/* 512 byte alignment */
   1394	if (unlikely(scb_addr & 0x1ffUL))
   1395		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
   1396
   1397	if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
   1398	    kvm_s390_vcpu_sie_inhibited(vcpu))
   1399		return 0;
   1400
   1401	vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
   1402	if (IS_ERR(vsie_page))
   1403		return PTR_ERR(vsie_page);
   1404	else if (!vsie_page)
   1405		/* double use of sie control block - simply do nothing */
   1406		return 0;
   1407
   1408	rc = pin_scb(vcpu, vsie_page, scb_addr);
   1409	if (rc)
   1410		goto out_put;
   1411	rc = shadow_scb(vcpu, vsie_page);
   1412	if (rc)
   1413		goto out_unpin_scb;
   1414	rc = pin_blocks(vcpu, vsie_page);
   1415	if (rc)
   1416		goto out_unshadow;
   1417	register_shadow_scb(vcpu, vsie_page);
   1418	rc = vsie_run(vcpu, vsie_page);
   1419	unregister_shadow_scb(vcpu);
   1420	unpin_blocks(vcpu, vsie_page);
   1421out_unshadow:
   1422	unshadow_scb(vcpu, vsie_page);
   1423out_unpin_scb:
   1424	unpin_scb(vcpu, vsie_page, scb_addr);
   1425out_put:
   1426	put_vsie_page(vcpu->kvm, vsie_page);
   1427
   1428	return rc < 0 ? rc : 0;
   1429}
   1430
   1431/* Init the vsie data structures. To be called when a vm is initialized. */
   1432void kvm_s390_vsie_init(struct kvm *kvm)
   1433{
   1434	mutex_init(&kvm->arch.vsie.mutex);
   1435	INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT);
   1436}
   1437
   1438/* Destroy the vsie data structures. To be called when a vm is destroyed. */
   1439void kvm_s390_vsie_destroy(struct kvm *kvm)
   1440{
   1441	struct vsie_page *vsie_page;
   1442	struct page *page;
   1443	int i;
   1444
   1445	mutex_lock(&kvm->arch.vsie.mutex);
   1446	for (i = 0; i < kvm->arch.vsie.page_count; i++) {
   1447		page = kvm->arch.vsie.pages[i];
   1448		kvm->arch.vsie.pages[i] = NULL;
   1449		vsie_page = page_to_virt(page);
   1450		release_gmap_shadow(vsie_page);
   1451		/* free the radix tree entry */
   1452		radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
   1453		__free_page(page);
   1454	}
   1455	kvm->arch.vsie.page_count = 0;
   1456	mutex_unlock(&kvm->arch.vsie.mutex);
   1457}
   1458
   1459void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
   1460{
   1461	struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
   1462
   1463	/*
   1464	 * Even if the VCPU lets go of the shadow sie block reference, it is
   1465	 * still valid in the cache. So we can safely kick it.
   1466	 */
   1467	if (scb) {
   1468		atomic_or(PROG_BLOCK_SIE, &scb->prog20);
   1469		if (scb->prog0c & PROG_IN_SIE)
   1470			atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
   1471	}
   1472}