cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

fault.c (22212B)


// SPDX-License-Identifier: GPL-2.0
/*
 *  S390 version
 *    Copyright IBM Corp. 1999
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/extable.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/kfence.h>
#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include <asm/uv.h>
#include "../kernel/entry.h"

#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL

#define VM_FAULT_BADCONTEXT	((__force vm_fault_t) 0x010000)
#define VM_FAULT_BADMAP		((__force vm_fault_t) 0x020000)
#define VM_FAULT_BADACCESS	((__force vm_fault_t) 0x040000)
#define VM_FAULT_SIGNAL		((__force vm_fault_t) 0x080000)
#define VM_FAULT_PFAULT		((__force vm_fault_t) 0x100000)

enum fault_type {
	KERNEL_FAULT,
	USER_FAULT,
	GMAP_FAULT,
};

static unsigned long store_indication __read_mostly;

static int __init fault_init(void)
{
	if (test_facility(75))
		store_indication = 0xc00;
	return 0;
}
early_initcall(fault_init);
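
/*
 * With facility 75 installed, the translation-exception identification
 * (TEID) also reports whether the failing access was a fetch or a store.
 * store_indication selects those TEID bits (0xc00); do_exception()
 * compares them against 0x400 to derive is_write.
 */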

/*
 * Find out which address space caused the exception.
 */
static enum fault_type get_fault_type(struct pt_regs *regs)
{
	unsigned long trans_exc_code;

	trans_exc_code = regs->int_parm_long & 3;
	if (likely(trans_exc_code == 0)) {
		/* primary space exception */
		if (user_mode(regs))
			return USER_FAULT;
		if (!IS_ENABLED(CONFIG_PGSTE))
			return KERNEL_FAULT;
		if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
			return GMAP_FAULT;
		return KERNEL_FAULT;
	}
	if (trans_exc_code == 2)
		return USER_FAULT;
	if (trans_exc_code == 1) {
		/* access register mode, not used in the kernel */
		return USER_FAULT;
	}
	/* home space exception -> access via kernel ASCE */
	return KERNEL_FAULT;
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return get_kernel_nofault(dummy, (unsigned long *)p);
}

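/*
 * Walk the page table the given ASCE points to and print the entry found
 * at each level for the failing address; used by dump_fault_info() below.
 */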
static void dump_pagetable(unsigned long asce, unsigned long address)
{
	unsigned long *table = __va(asce & _ASCE_ORIGIN);

	pr_alert("AS:%016lx ", asce);
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_REGION1:
		table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R1:%016lx ", *table);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_REGION2:
		table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R2:%016lx ", *table);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_REGION3:
		table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R3:%016lx ", *table);
		if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
			goto out;
		table = __va(*table & _REGION_ENTRY_ORIGIN);
		fallthrough;
	case _ASCE_TYPE_SEGMENT:
		table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("S:%016lx ", *table);
		if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
			goto out;
		table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
	}
	table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
	if (bad_address(table))
		goto bad;
	pr_cont("P:%016lx ", *table);
out:
	pr_cont("\n");
	return;
bad:
	pr_cont("BAD\n");
}

static void dump_fault_info(struct pt_regs *regs)
{
	unsigned long asce;

	pr_alert("Failing address: %016lx TEID: %016lx\n",
		 regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
	pr_alert("Fault in ");
	switch (regs->int_parm_long & 3) {
	case 3:
		pr_cont("home space ");
		break;
	case 2:
		pr_cont("secondary space ");
		break;
	case 1:
		pr_cont("access register ");
		break;
	case 0:
		pr_cont("primary space ");
		break;
	}
	pr_cont("mode while using ");
	switch (get_fault_type(regs)) {
	case USER_FAULT:
		asce = S390_lowcore.user_asce;
		pr_cont("user ");
		break;
	case GMAP_FAULT:
		asce = ((struct gmap *) S390_lowcore.gmap)->asce;
		pr_cont("gmap ");
		break;
	case KERNEL_FAULT:
		asce = S390_lowcore.kernel_asce;
		pr_cont("kernel ");
		break;
	default:
		unreachable();
	}
	pr_cont("ASCE.\n");
	dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
}

int show_unhandled_signals = 1;

void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
	if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
		return;
	if (!unhandled_signal(current, signr))
		return;
	if (!printk_ratelimit())
		return;
	printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
	       regs->int_code & 0xffff, regs->int_code >> 17);
	print_vma_addr(KERN_CONT "in ", regs->psw.addr);
	printk(KERN_CONT "\n");
	if (is_mm_fault)
		dump_fault_info(regs);
	show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
	report_user_fault(regs, SIGSEGV, 1);
	force_sig_fault(SIGSEGV, si_code,
			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

static noinline void do_no_context(struct pt_regs *regs)
{
	if (fixup_exception(regs))
		return;
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	if (get_fault_type(regs) == KERNEL_FAULT)
		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
		       " in virtual kernel address space\n");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request"
		       " in virtual user address space\n");
	dump_fault_info(regs);
	die(regs, "Oops");
}

static noinline void do_low_address(struct pt_regs *regs)
{
	/* Low-address protection hit in kernel mode means
	   NULL pointer write access in kernel mode.  */
	if (regs->psw.mask & PSW_MASK_PSTATE) {
		/* Low-address protection hit in user mode 'cannot happen'. */
		die (regs, "Low-address protection");
	}

	do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	force_sig_fault(SIGBUS, BUS_ADRERR,
			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

static noinline void do_fault_error(struct pt_regs *regs, int access,
					vm_fault_t fault)
{
	int si_code;

	switch (fault) {
	case VM_FAULT_BADACCESS:
	case VM_FAULT_BADMAP:
		/* Bad memory access. Check if it is kernel or user space. */
		if (user_mode(regs)) {
			/* User mode accesses just cause a SIGSEGV */
			si_code = (fault == VM_FAULT_BADMAP) ?
				SEGV_MAPERR : SEGV_ACCERR;
			do_sigsegv(regs, si_code);
			break;
		}
		fallthrough;
	case VM_FAULT_BADCONTEXT:
	case VM_FAULT_PFAULT:
		do_no_context(regs);
		break;
	case VM_FAULT_SIGNAL:
		if (!user_mode(regs))
			do_no_context(regs);
		break;
	default: /* fault & VM_FAULT_ERROR */
		if (fault & VM_FAULT_OOM) {
			if (!user_mode(regs))
				do_no_context(regs);
			else
				pagefault_out_of_memory();
		} else if (fault & VM_FAULT_SIGSEGV) {
			/* Kernel mode? Handle exceptions or die */
			if (!user_mode(regs))
				do_no_context(regs);
			else
				do_sigsegv(regs, SEGV_MAPERR);
		} else if (fault & VM_FAULT_SIGBUS) {
			/* Kernel mode? Handle exceptions or die */
			if (!user_mode(regs))
				do_no_context(regs);
			else
				do_sigbus(regs);
		} else
			BUG();
		break;
	}
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
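/*
 * Common handler: classify the fault (kernel, user, or KVM guest via the
 * gmap), translate guest addresses where needed, look up the vma under
 * mmap_lock and let handle_mm_fault() resolve it.  Returns 0 on success or
 * a VM_FAULT_* code that do_fault_error() turns into a signal or an oops.
 */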
static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
	struct gmap *gmap;
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum fault_type type;
	unsigned long trans_exc_code;
	unsigned long address;
	unsigned int flags;
	vm_fault_t fault;
	bool is_write;

	tsk = current;
	/*
	 * The instruction that caused the program check has
	 * been nullified. Don't signal single step via SIGTRAP.
	 */
	clear_thread_flag(TIF_PER_TRAP);

	if (kprobe_page_fault(regs, 14))
		return 0;

	mm = tsk->mm;
	trans_exc_code = regs->int_parm_long;
	address = trans_exc_code & __FAIL_ADDR_MASK;
	is_write = (trans_exc_code & store_indication) == 0x400;

	/*
	 * Verify that the fault happened in user space, that
	 * we are not in an interrupt and that there is a
	 * user context.
	 */
	fault = VM_FAULT_BADCONTEXT;
	type = get_fault_type(regs);
	switch (type) {
	case KERNEL_FAULT:
		if (kfence_handle_page_fault(address, is_write, regs))
			return 0;
		goto out;
	case USER_FAULT:
	case GMAP_FAULT:
		if (faulthandler_disabled() || !mm)
			goto out;
		break;
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
	flags = FAULT_FLAG_DEFAULT;
	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;
	if (access == VM_WRITE || is_write)
		flags |= FAULT_FLAG_WRITE;
	mmap_read_lock(mm);

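	/*
	 * Faults taken while running a KVM guest carry a guest address:
	 * translate it through the guest mapping (gmap) to the backing host
	 * userspace address.  If pseudo page faults are enabled for this
	 * guest, use FAULT_FLAG_RETRY_NOWAIT so the vcpu is not blocked
	 * while the host pages the memory in.
	 */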
	gmap = NULL;
	if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
		gmap = (struct gmap *) S390_lowcore.gmap;
		current->thread.gmap_addr = address;
		current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
		current->thread.gmap_int_code = regs->int_code & 0xffff;
		address = __gmap_translate(gmap, address);
		if (address == -EFAULT) {
			fault = VM_FAULT_BADMAP;
			goto out_up;
		}
		if (gmap->pfault_enabled)
			flags |= FAULT_FLAG_RETRY_NOWAIT;
	}

retry:
	fault = VM_FAULT_BADMAP;
	vma = find_vma(mm, address);
	if (!vma)
		goto out_up;

	if (unlikely(vma->vm_start > address)) {
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out_up;
		if (expand_stack(vma, address))
			goto out_up;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
	fault = VM_FAULT_BADACCESS;
	if (unlikely(!(vma->vm_flags & access)))
		goto out_up;

	if (is_vm_hugetlb_page(vma))
		address &= HPAGE_MASK;
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(vma, address, flags, regs);
	if (fault_signal_pending(fault, regs)) {
		fault = VM_FAULT_SIGNAL;
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			goto out_up;
		goto out;
	}
	if (unlikely(fault & VM_FAULT_ERROR))
		goto out_up;

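	/*
	 * VM_FAULT_RETRY: with FAULT_FLAG_RETRY_NOWAIT set the mmap_lock was
	 * kept, so report a pseudo page fault to KVM instead of waiting.
	 * Otherwise handle_mm_fault() dropped the lock; take it again and
	 * retry exactly once with FAULT_FLAG_TRIED.
	 */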
	if (fault & VM_FAULT_RETRY) {
		if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
			(flags & FAULT_FLAG_RETRY_NOWAIT)) {
			/*
			 * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
			 * not been released
			 */
			current->thread.gmap_pfault = 1;
			fault = VM_FAULT_PFAULT;
			goto out_up;
		}
		flags &= ~FAULT_FLAG_RETRY_NOWAIT;
		flags |= FAULT_FLAG_TRIED;
		mmap_read_lock(mm);
		goto retry;
	}
	if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
		address = __gmap_link(gmap, current->thread.gmap_addr,
				      address);
		if (address == -EFAULT) {
			fault = VM_FAULT_BADMAP;
			goto out_up;
		}
		if (address == -ENOMEM) {
			fault = VM_FAULT_OOM;
			goto out_up;
		}
	}
	fault = 0;
out_up:
	mmap_read_unlock(mm);
out:
	return fault;
}

void do_protection_exception(struct pt_regs *regs)
{
	unsigned long trans_exc_code;
	int access;
	vm_fault_t fault;

	trans_exc_code = regs->int_parm_long;
	/*
	 * Protection exceptions are suppressing, decrement psw address.
	 * The exception to this rule is aborted transactions; for these
	 * the PSW already points to the correct location.
	 */
	if (!(regs->int_code & 0x200))
		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
	/*
	 * Check for low-address protection.  This needs to be treated
	 * as a special case because the translation exception code
	 * field is not guaranteed to contain valid data in this case.
	 */
	if (unlikely(!(trans_exc_code & 4))) {
		do_low_address(regs);
		return;
	}
	if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
		regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
					(regs->psw.addr & PAGE_MASK);
		access = VM_EXEC;
		fault = VM_FAULT_BADACCESS;
	} else {
		access = VM_WRITE;
		fault = do_exception(regs, access);
	}
	if (unlikely(fault))
		do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);

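/*
 * DAT exceptions (segment, page and region translation, interruption codes
 * 0x10, 0x11 and 0x3b above): the faulting access may have been a read,
 * write or instruction fetch, so any of VM_READ, VM_WRITE or VM_EXEC on
 * the vma (VM_ACCESS_FLAGS) is accepted.
 */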
void do_dat_exception(struct pt_regs *regs)
{
	int access;
	vm_fault_t fault;

	access = VM_ACCESS_FLAGS;
	fault = do_exception(regs, access);
	if (unlikely(fault))
		do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page fault routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}

__setup("nopfault", nopfault);

struct pfault_refbk {
	u16 refdiagc;
	u16 reffcode;
	u16 refdwlen;
	u16 refversn;
	u64 refgaddr;
	u64 refselmk;
	u64 refcmpmk;
	u64 reserved;
} __attribute__ ((packed, aligned(8)));

static struct pfault_refbk pfault_init_refbk = {
	.refdiagc = 0x258,
	.reffcode = 0,
	.refdwlen = 5,
	.refversn = 2,
	.refgaddr = __LC_LPP,
	.refselmk = 1ULL << 48,
	.refcmpmk = 1ULL << 48,
	.reserved = __PF_RES_FIELD
};

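/*
 * Register with the hypervisor for pseudo page faults via DIAG 0x258
 * (reffcode 0; pfault_fini() uses reffcode 1 to cancel).  The fault token
 * is taken from the lowcore LPP field, which holds the pid of the current
 * task; pfault_interrupt() recovers it from param64 with LPP_PID_MASK.
 */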
int pfault_init(void)
{
	int rc;

	if (pfault_disable)
		return -1;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%1,%0,0x258\n"
		"0:	j	2f\n"
		"1:	la	%0,8\n"
		"2:\n"
		EX_TABLE(0b,1b)
		: "=d" (rc)
		: "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
	return rc;
}

static struct pfault_refbk pfault_fini_refbk = {
	.refdiagc = 0x258,
	.reffcode = 1,
	.refdwlen = 5,
	.refversn = 2,
};

void pfault_fini(void)
{

	if (pfault_disable)
		return;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%0,0,0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b,0b)
		: : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

#define PF_COMPLETE	0x0080

/*
 * The mechanism of our pfault code: if Linux is running as a guest and a
 * user space process accesses a page that the host has paged out, we get
 * a pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt we set the state of the current task
 * to uninterruptible and also set the need_resched flag. Both happen within
 * interrupt context(!). If we later on want to return to user space we
 * recognize the need_resched flag and then call schedule().  It's not very
 * obvious how this works...
 *
 * Of course we have a lot of additional fun with the completion interrupt
 * (-> the host signals that a page of a process has been paged in and the
 * process can continue to run). This interrupt can arrive on any cpu and,
 * since we have virtual cpus, actually appear before the interrupt that
 * signals that a page is missing.
 */
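/*
 * tsk->thread.pfault_wait tracks the per-task state of this dance:
 *   0  no pseudo page fault outstanding
 *   1  initial interrupt seen, task is queued on pfault_list and sleeping
 *  -1  the completion interrupt overtook the initial interrupt
 */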
static void pfault_interrupt(struct ext_code ext_code,
			     unsigned int param32, unsigned long param64)
{
	struct task_struct *tsk;
	__u16 subcode;
	pid_t pid;

	/*
	 * Get the external interruption subcode & pfault initial/completion
	 * signal bit. VM stores this in the 'cpu address' field associated
	 * with the external interrupt.
	 */
	subcode = ext_code.subcode;
	if ((subcode & 0xff00) != __SUBCODE_MASK)
		return;
	inc_irq_stat(IRQEXT_PFL);
	/* Get the token (= pid of the affected task). */
	pid = param64 & LPP_PID_MASK;
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	if (tsk)
		get_task_struct(tsk);
	rcu_read_unlock();
	if (!tsk)
		return;
	spin_lock(&pfault_lock);
	if (subcode & PF_COMPLETE) {
		/* signal bit is set -> a page has been swapped in by VM */
		if (tsk->thread.pfault_wait == 1) {
			/* Initial interrupt was faster than the completion
			 * interrupt. pfault_wait is valid. Set pfault_wait
			 * back to zero and wake up the process. This can
			 * safely be done because the task is still sleeping
			 * and can't produce new pfaults. */
			tsk->thread.pfault_wait = 0;
			list_del(&tsk->thread.list);
			wake_up_process(tsk);
			put_task_struct(tsk);
		} else {
			/* Completion interrupt was faster than initial
			 * interrupt. Set pfault_wait to -1 so the initial
			 * interrupt doesn't put the task to sleep.
			 * If the task is not running, ignore the completion
			 * interrupt since it must be a leftover of a PFAULT
			 * CANCEL operation which didn't remove all pending
			 * completion interrupts. */
			if (task_is_running(tsk))
				tsk->thread.pfault_wait = -1;
		}
	} else {
		/* signal bit not set -> a real page is missing. */
		if (WARN_ON_ONCE(tsk != current))
			goto out;
		if (tsk->thread.pfault_wait == 1) {
			/* Already on the list with a reference: put to sleep */
			goto block;
		} else if (tsk->thread.pfault_wait == -1) {
			/* Completion interrupt was faster than the initial
			 * interrupt (pfault_wait == -1). Set pfault_wait
			 * back to zero and exit. */
			tsk->thread.pfault_wait = 0;
		} else {
			/* Initial interrupt arrived before completion
			 * interrupt. Let the task sleep.
			 * An extra task reference is needed since a different
			 * cpu may set the task state to TASK_RUNNING again
			 * before the scheduler is reached. */
			get_task_struct(tsk);
			tsk->thread.pfault_wait = 1;
			list_add(&tsk->thread.list, &pfault_list);
block:
			/* Since this must be a userspace fault, there
			 * is no kernel task state to trample. Rely on the
			 * return to userspace schedule() to block. */
			__set_current_state(TASK_UNINTERRUPTIBLE);
			set_tsk_need_resched(tsk);
			set_preempt_need_resched();
		}
	}
out:
	spin_unlock(&pfault_lock);
	put_task_struct(tsk);
}

static int pfault_cpu_dead(unsigned int cpu)
{
	struct thread_struct *thread, *next;
	struct task_struct *tsk;

	spin_lock_irq(&pfault_lock);
	list_for_each_entry_safe(thread, next, &pfault_list, list) {
		thread->pfault_wait = 0;
		list_del(&thread->list);
		tsk = container_of(thread, struct task_struct, thread);
		wake_up_process(tsk);
		put_task_struct(tsk);
	}
	spin_unlock_irq(&pfault_lock);
	return 0;
}

static int __init pfault_irq_init(void)
{
	int rc;

	rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
	if (rc)
		goto out_extint;
	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
	if (rc)
		goto out_pfault;
	irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
	cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
				  NULL, pfault_cpu_dead);
	return 0;

out_pfault:
	unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
	pfault_disable = 1;
	return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */

#if IS_ENABLED(CONFIG_PGSTE)

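/*
 * Program checks raised when memory belonging to a protected ("secure")
 * KVM guest is touched.  do_secure_storage_access() makes the page
 * accessible again via arch_make_page_accessible(), while
 * do_non_secure_storage_access() converts a guest page back to secure.
 */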
void do_secure_storage_access(struct pt_regs *regs)
{
	unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	struct page *page;
	int rc;

	/*
	 * bit 61 tells us if the address is valid; if it is not, we
	 * have a major problem and should stop the kernel or send a
	 * SIGSEGV to the process. Unfortunately bit 61 is not
	 * reliable without the misc UV feature, so we need to check
	 * for that as well.
	 */
	if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
	    !test_bit_inv(61, &regs->int_parm_long)) {
		/*
		 * When this happens, userspace did something that it
		 * was not supposed to do, e.g. branching into secure
		 * memory. Trigger a segmentation fault.
		 */
		if (user_mode(regs)) {
			send_sig(SIGSEGV, current, 0);
			return;
		}

		/*
		 * The kernel should never run into this case and we
		 * have no way out of this situation.
		 */
		panic("Unexpected PGM 0x3d with TEID bit 61=0");
	}

	switch (get_fault_type(regs)) {
	case USER_FAULT:
		mm = current->mm;
		mmap_read_lock(mm);
		vma = find_vma(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
			break;
		}
		page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
		if (IS_ERR_OR_NULL(page)) {
			mmap_read_unlock(mm);
			break;
		}
		if (arch_make_page_accessible(page))
			send_sig(SIGSEGV, current, 0);
		put_page(page);
		mmap_read_unlock(mm);
		break;
	case KERNEL_FAULT:
		page = phys_to_page(addr);
		if (unlikely(!try_get_page(page)))
			break;
		rc = arch_make_page_accessible(page);
		put_page(page);
		if (rc)
			BUG();
		break;
	case GMAP_FAULT:
	default:
		do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
		WARN_ON_ONCE(1);
	}
}
NOKPROBE_SYMBOL(do_secure_storage_access);

void do_non_secure_storage_access(struct pt_regs *regs)
{
	unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
	struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;

	if (get_fault_type(regs) != GMAP_FAULT) {
		do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
		WARN_ON_ONCE(1);
		return;
	}

	if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
		send_sig(SIGSEGV, current, 0);
}
NOKPROBE_SYMBOL(do_non_secure_storage_access);

void do_secure_storage_violation(struct pt_regs *regs)
{
	/*
	 * Either KVM messed up the secure guest mapping or the same
	 * page is mapped into multiple secure guests.
	 *
	 * This exception is only triggered when a guest 2 is running
	 * and can therefore never occur in kernel context.
	 */
	printk_ratelimited(KERN_WARNING
			   "Secure storage violation in task: %s, pid %d\n",
			   current->comm, current->pid);
	send_sig(SIGSEGV, current, 0);
}

#endif /* CONFIG_PGSTE */