cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

fault.c (27100B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Based on arch/arm/mm/fault.c
      4 *
      5 * Copyright (C) 1995  Linus Torvalds
      6 * Copyright (C) 1995-2004 Russell King
      7 * Copyright (C) 2012 ARM Ltd.
      8 */
      9
     10#include <linux/acpi.h>
     11#include <linux/bitfield.h>
     12#include <linux/extable.h>
     13#include <linux/kfence.h>
     14#include <linux/signal.h>
     15#include <linux/mm.h>
     16#include <linux/hardirq.h>
     17#include <linux/init.h>
     18#include <linux/kasan.h>
     19#include <linux/kprobes.h>
     20#include <linux/uaccess.h>
     21#include <linux/page-flags.h>
     22#include <linux/sched/signal.h>
     23#include <linux/sched/debug.h>
     24#include <linux/highmem.h>
     25#include <linux/perf_event.h>
     26#include <linux/preempt.h>
     27#include <linux/hugetlb.h>
     28
     29#include <asm/acpi.h>
     30#include <asm/bug.h>
     31#include <asm/cmpxchg.h>
     32#include <asm/cpufeature.h>
     33#include <asm/exception.h>
     34#include <asm/daifflags.h>
     35#include <asm/debug-monitors.h>
     36#include <asm/esr.h>
     37#include <asm/kprobes.h>
     38#include <asm/mte.h>
     39#include <asm/processor.h>
     40#include <asm/sysreg.h>
     41#include <asm/system_misc.h>
     42#include <asm/tlbflush.h>
     43#include <asm/traps.h>
     44
     45struct fault_info {
     46	int	(*fn)(unsigned long far, unsigned long esr,
     47		      struct pt_regs *regs);
     48	int	sig;
     49	int	code;
     50	const char *name;
     51};
     52
     53static const struct fault_info fault_info[];
     54static struct fault_info debug_fault_info[];
     55
     56static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
     57{
     58	return fault_info + (esr & ESR_ELx_FSC);
     59}
     60
     61static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
     62{
     63	return debug_fault_info + DBG_ESR_EVT(esr);
     64}
     65
     66static void data_abort_decode(unsigned long esr)
     67{
     68	pr_alert("Data abort info:\n");
     69
     70	if (esr & ESR_ELx_ISV) {
     71		pr_alert("  Access size = %u byte(s)\n",
     72			 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
     73		pr_alert("  SSE = %lu, SRT = %lu\n",
     74			 (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
     75			 (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
     76		pr_alert("  SF = %lu, AR = %lu\n",
     77			 (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
     78			 (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
     79	} else {
     80		pr_alert("  ISV = 0, ISS = 0x%08lx\n", esr & ESR_ELx_ISS_MASK);
     81	}
     82
     83	pr_alert("  CM = %lu, WnR = %lu\n",
     84		 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
     85		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT);
     86}
     87
     88static void mem_abort_decode(unsigned long esr)
     89{
     90	pr_alert("Mem abort info:\n");
     91
     92	pr_alert("  ESR = 0x%016lx\n", esr);
     93	pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
     94		 ESR_ELx_EC(esr), esr_get_class_string(esr),
     95		 (esr & ESR_ELx_IL) ? 32 : 16);
     96	pr_alert("  SET = %lu, FnV = %lu\n",
     97		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
     98		 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
     99	pr_alert("  EA = %lu, S1PTW = %lu\n",
    100		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
    101		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
    102	pr_alert("  FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
    103		 esr_to_fault_info(esr)->name);
    104
    105	if (esr_is_data_abort(esr))
    106		data_abort_decode(esr);
    107}
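/*
 * Worked example (illustrative value, assuming the usual ESR_ELx bit
 * layout): a user write that misses the page tables might report
 * ESR = 0x92000047. The decoders above would then print EC = 0x24
 * ("DABT (lower EL)"), IL = 32 bits, ISV = 0, WnR = 1 (a write) and
 * FSC = 0x07, which fault_info[] names "level 3 translation fault".
 */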
    108
    109static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
    110{
    111	/* Either init_pg_dir or swapper_pg_dir */
    112	if (mm == &init_mm)
    113		return __pa_symbol(mm->pgd);
    114
    115	return (unsigned long)virt_to_phys(mm->pgd);
    116}
    117
    118/*
    119 * Dump out the page tables associated with 'addr' in the currently active mm.
    120 */
    121static void show_pte(unsigned long addr)
    122{
    123	struct mm_struct *mm;
    124	pgd_t *pgdp;
    125	pgd_t pgd;
    126
    127	if (is_ttbr0_addr(addr)) {
    128		/* TTBR0 */
    129		mm = current->active_mm;
    130		if (mm == &init_mm) {
    131			pr_alert("[%016lx] user address but active_mm is swapper\n",
    132				 addr);
    133			return;
    134		}
    135	} else if (is_ttbr1_addr(addr)) {
    136		/* TTBR1 */
    137		mm = &init_mm;
    138	} else {
    139		pr_alert("[%016lx] address between user and kernel address ranges\n",
    140			 addr);
    141		return;
    142	}
    143
    144	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
    145		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
    146		 vabits_actual, mm_to_pgd_phys(mm));
    147	pgdp = pgd_offset(mm, addr);
    148	pgd = READ_ONCE(*pgdp);
    149	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
    150
    151	do {
    152		p4d_t *p4dp, p4d;
    153		pud_t *pudp, pud;
    154		pmd_t *pmdp, pmd;
    155		pte_t *ptep, pte;
    156
    157		if (pgd_none(pgd) || pgd_bad(pgd))
    158			break;
    159
    160		p4dp = p4d_offset(pgdp, addr);
    161		p4d = READ_ONCE(*p4dp);
    162		pr_cont(", p4d=%016llx", p4d_val(p4d));
    163		if (p4d_none(p4d) || p4d_bad(p4d))
    164			break;
    165
    166		pudp = pud_offset(p4dp, addr);
    167		pud = READ_ONCE(*pudp);
    168		pr_cont(", pud=%016llx", pud_val(pud));
    169		if (pud_none(pud) || pud_bad(pud))
    170			break;
    171
    172		pmdp = pmd_offset(pudp, addr);
    173		pmd = READ_ONCE(*pmdp);
    174		pr_cont(", pmd=%016llx", pmd_val(pmd));
    175		if (pmd_none(pmd) || pmd_bad(pmd))
    176			break;
    177
    178		ptep = pte_offset_map(pmdp, addr);
    179		pte = READ_ONCE(*ptep);
    180		pr_cont(", pte=%016llx", pte_val(pte));
    181		pte_unmap(ptep);
     182	} while (0);
    183
    184	pr_cont("\n");
    185}
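/*
 * Example output (hypothetical values): for a faulting user address with
 * 4k pages and 48-bit VAs, the walk above typically prints
 *
 *   user pgtable: 4k pages, 48-bit VAs, pgdp=0000000101ce0000
 *   [0000ffff8a234000] pgd=0800000101ce1003, p4d=0800000101ce1003,
 *   pud=0800000101ce2003, pmd=0800000101ce3003, pte=00e8000102345f43
 *
 * (the last entry is a single dmesg line, wrapped here) with the folded
 * p4d repeating the pgd value; the walk stops, and the line ends early,
 * at the first level that is none or bad.
 */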
    186
    187/*
    188 * This function sets the access flags (dirty, accessed), as well as write
    189 * permission, and only to a more permissive setting.
    190 *
    191 * It needs to cope with hardware update of the accessed/dirty state by other
    192 * agents in the system and can safely skip the __sync_icache_dcache() call as,
    193 * like set_pte_at(), the PTE is never changed from no-exec to exec here.
    194 *
    195 * Returns whether or not the PTE actually changed.
    196 */
    197int ptep_set_access_flags(struct vm_area_struct *vma,
    198			  unsigned long address, pte_t *ptep,
    199			  pte_t entry, int dirty)
    200{
    201	pteval_t old_pteval, pteval;
    202	pte_t pte = READ_ONCE(*ptep);
    203
    204	if (pte_same(pte, entry))
    205		return 0;
    206
    207	/* only preserve the access flags and write permission */
    208	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;
    209
    210	/*
    211	 * Setting the flags must be done atomically to avoid racing with the
    212	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
    213	 * be set to the most permissive (lowest value) of *ptep and entry
    214	 * (calculated as: a & b == ~(~a | ~b)).
    215	 */
    216	pte_val(entry) ^= PTE_RDONLY;
    217	pteval = pte_val(pte);
    218	do {
    219		old_pteval = pteval;
    220		pteval ^= PTE_RDONLY;
    221		pteval |= pte_val(entry);
    222		pteval ^= PTE_RDONLY;
    223		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
    224	} while (pteval != old_pteval);
    225
    226	/* Invalidate a stale read-only entry */
    227	if (dirty)
    228		flush_tlb_page(vma, address);
    229	return 1;
    230}
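/*
 * Sketch of the PTE_RDONLY handling above: PTE_RDONLY set means
 * read-only, so the most permissive result is the logical AND of the two
 * RDONLY bits, while all other preserved flags are simply OR-ed in.
 * Pre-inverting entry's RDONLY bit and bracketing the OR with two more
 * inversions computes a & b = ~(~a | ~b) for that one bit, e.g.:
 *
 *   old RDONLY = 1, new RDONLY = 0:  ~(~1 | ~0) = ~(0 | 1) = 0  (writable)
 *   old RDONLY = 1, new RDONLY = 1:  ~(~1 | ~1) = ~(0 | 0) = 1  (still RO)
 */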
    231
    232static bool is_el1_instruction_abort(unsigned long esr)
    233{
    234	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
    235}
    236
    237static bool is_el1_data_abort(unsigned long esr)
    238{
    239	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
    240}
    241
    242static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
    243					   struct pt_regs *regs)
    244{
    245	unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE;
    246
    247	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
    248		return false;
    249
    250	if (fsc_type == ESR_ELx_FSC_PERM)
    251		return true;
    252
    253	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
    254		return fsc_type == ESR_ELx_FSC_FAULT &&
    255			(regs->pstate & PSR_PAN_BIT);
    256
    257	return false;
    258}
    259
    260static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
    261							unsigned long esr,
    262							struct pt_regs *regs)
    263{
    264	unsigned long flags;
    265	u64 par, dfsc;
    266
    267	if (!is_el1_data_abort(esr) ||
    268	    (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
    269		return false;
    270
    271	local_irq_save(flags);
    272	asm volatile("at s1e1r, %0" :: "r" (addr));
    273	isb();
    274	par = read_sysreg_par();
    275	local_irq_restore(flags);
    276
    277	/*
    278	 * If we now have a valid translation, treat the translation fault as
    279	 * spurious.
    280	 */
    281	if (!(par & SYS_PAR_EL1_F))
    282		return true;
    283
    284	/*
    285	 * If we got a different type of fault from the AT instruction,
    286	 * treat the translation fault as spurious.
    287	 */
    288	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
    289	return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT;
    290}
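/*
 * Background (sketch, assuming the architectural PAR_EL1 layout): the
 * "at s1e1r" instruction above re-walks the stage-1 tables for addr as
 * an EL1 read and latches the outcome in PAR_EL1. A clear F bit (bit 0)
 * means the walk now succeeds, so the reported translation fault is
 * treated as spurious; otherwise the FST field is checked so that only
 * a non-translation fault from the re-walk counts as spurious.
 */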
    291
    292static void die_kernel_fault(const char *msg, unsigned long addr,
    293			     unsigned long esr, struct pt_regs *regs)
    294{
    295	bust_spinlocks(1);
    296
    297	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
    298		 addr);
    299
    300	kasan_non_canonical_hook(addr);
    301
    302	mem_abort_decode(esr);
    303
    304	show_pte(addr);
    305	die("Oops", regs, esr);
    306	bust_spinlocks(0);
    307	make_task_dead(SIGKILL);
    308}
    309
    310#ifdef CONFIG_KASAN_HW_TAGS
    311static void report_tag_fault(unsigned long addr, unsigned long esr,
    312			     struct pt_regs *regs)
    313{
    314	/*
    315	 * SAS bits aren't set for all faults reported in EL1, so we can't
     316	 * determine the access size.
    317	 */
    318	bool is_write = !!(esr & ESR_ELx_WNR);
    319	kasan_report(addr, 0, is_write, regs->pc);
    320}
    321#else
    322/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
    323static inline void report_tag_fault(unsigned long addr, unsigned long esr,
    324				    struct pt_regs *regs) { }
    325#endif
    326
    327static void do_tag_recovery(unsigned long addr, unsigned long esr,
    328			   struct pt_regs *regs)
    329{
    330
    331	report_tag_fault(addr, esr, regs);
    332
    333	/*
    334	 * Disable MTE Tag Checking on the local CPU for the current EL.
     335	 * It will be done lazily on the other CPUs when they hit a
    336	 * tag fault.
    337	 */
    338	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
    339			 SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
    340	isb();
    341}
    342
    343static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
    344{
    345	unsigned long fsc = esr & ESR_ELx_FSC;
    346
    347	if (!is_el1_data_abort(esr))
    348		return false;
    349
    350	if (fsc == ESR_ELx_FSC_MTE)
    351		return true;
    352
    353	return false;
    354}
    355
    356static void __do_kernel_fault(unsigned long addr, unsigned long esr,
    357			      struct pt_regs *regs)
    358{
    359	const char *msg;
    360
    361	/*
    362	 * Are we prepared to handle this kernel fault?
    363	 * We are almost certainly not prepared to handle instruction faults.
    364	 */
    365	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
    366		return;
    367
    368	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
    369	    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
    370		return;
    371
    372	if (is_el1_mte_sync_tag_check_fault(esr)) {
    373		do_tag_recovery(addr, esr, regs);
    374
    375		return;
    376	}
    377
    378	if (is_el1_permission_fault(addr, esr, regs)) {
    379		if (esr & ESR_ELx_WNR)
    380			msg = "write to read-only memory";
    381		else if (is_el1_instruction_abort(esr))
    382			msg = "execute from non-executable memory";
    383		else
    384			msg = "read from unreadable memory";
    385	} else if (addr < PAGE_SIZE) {
    386		msg = "NULL pointer dereference";
    387	} else {
    388		if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
    389			return;
    390
    391		msg = "paging request";
    392	}
    393
    394	die_kernel_fault(msg, addr, esr, regs);
    395}
    396
    397static void set_thread_esr(unsigned long address, unsigned long esr)
    398{
    399	current->thread.fault_address = address;
    400
    401	/*
    402	 * If the faulting address is in the kernel, we must sanitize the ESR.
    403	 * From userspace's point of view, kernel-only mappings don't exist
    404	 * at all, so we report them as level 0 translation faults.
    405	 * (This is not quite the way that "no mapping there at all" behaves:
    406	 * an alignment fault not caused by the memory type would take
    407	 * precedence over translation fault for a real access to empty
    408	 * space. Unfortunately we can't easily distinguish "alignment fault
    409	 * not caused by memory type" from "alignment fault caused by memory
    410	 * type", so we ignore this wrinkle and just return the translation
    411	 * fault.)
    412	 */
    413	if (!is_ttbr0_addr(current->thread.fault_address)) {
    414		switch (ESR_ELx_EC(esr)) {
    415		case ESR_ELx_EC_DABT_LOW:
    416			/*
    417			 * These bits provide only information about the
    418			 * faulting instruction, which userspace knows already.
    419			 * We explicitly clear bits which are architecturally
    420			 * RES0 in case they are given meanings in future.
    421			 * We always report the ESR as if the fault was taken
    422			 * to EL1 and so ISV and the bits in ISS[23:14] are
    423			 * clear. (In fact it always will be a fault to EL1.)
    424			 */
    425			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
    426				ESR_ELx_CM | ESR_ELx_WNR;
    427			esr |= ESR_ELx_FSC_FAULT;
    428			break;
    429		case ESR_ELx_EC_IABT_LOW:
    430			/*
    431			 * Claim a level 0 translation fault.
    432			 * All other bits are architecturally RES0 for faults
    433			 * reported with that DFSC value, so we clear them.
    434			 */
    435			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
    436			esr |= ESR_ELx_FSC_FAULT;
    437			break;
    438		default:
    439			/*
    440			 * This should never happen (entry.S only brings us
    441			 * into this code for insn and data aborts from a lower
    442			 * exception level). Fail safe by not providing an ESR
    443			 * context record at all.
    444			 */
    445			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
    446			esr = 0;
    447			break;
    448		}
    449	}
    450
    451	current->thread.fault_code = esr;
    452}
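/*
 * Worked example (hypothetical ESR, assuming the usual ESR_ELx_*
 * constants): a user write permission fault taken on a kernel address
 * might arrive here as 0x9200004d (EC = DABT lower EL, WnR = 1,
 * FSC = 0x0d, level 1 permission fault). The sanitization above keeps
 * only EC, IL, CM and WnR and substitutes FSC_FAULT, so userspace sees
 * 0x92000044: a level 0 translation fault, as if no mapping existed.
 */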
    453
    454static void do_bad_area(unsigned long far, unsigned long esr,
    455			struct pt_regs *regs)
    456{
    457	unsigned long addr = untagged_addr(far);
    458
    459	/*
    460	 * If we are in kernel mode at this point, we have no context to
    461	 * handle this fault with.
    462	 */
    463	if (user_mode(regs)) {
    464		const struct fault_info *inf = esr_to_fault_info(esr);
    465
    466		set_thread_esr(addr, esr);
    467		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
    468	} else {
    469		__do_kernel_fault(addr, esr, regs);
    470	}
    471}
    472
    473#define VM_FAULT_BADMAP		0x010000
    474#define VM_FAULT_BADACCESS	0x020000
    475
    476static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
    477				  unsigned int mm_flags, unsigned long vm_flags,
    478				  struct pt_regs *regs)
    479{
    480	struct vm_area_struct *vma = find_vma(mm, addr);
    481
    482	if (unlikely(!vma))
    483		return VM_FAULT_BADMAP;
    484
    485	/*
    486	 * Ok, we have a good vm_area for this memory access, so we can handle
    487	 * it.
    488	 */
    489	if (unlikely(vma->vm_start > addr)) {
    490		if (!(vma->vm_flags & VM_GROWSDOWN))
    491			return VM_FAULT_BADMAP;
    492		if (expand_stack(vma, addr))
    493			return VM_FAULT_BADMAP;
    494	}
    495
    496	/*
    497	 * Check that the permissions on the VMA allow for the fault which
    498	 * occurred.
    499	 */
    500	if (!(vma->vm_flags & vm_flags))
    501		return VM_FAULT_BADACCESS;
    502	return handle_mm_fault(vma, addr, mm_flags, regs);
    503}
    504
    505static bool is_el0_instruction_abort(unsigned long esr)
    506{
    507	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
    508}
    509
    510/*
     511 * Note: not valid for EL1 DC IVAC, but we never use that in a way that
    512 * should fault. EL0 cannot issue DC IVAC (undef).
    513 */
    514static bool is_write_abort(unsigned long esr)
    515{
    516	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
    517}
    518
    519static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
    520				   struct pt_regs *regs)
    521{
    522	const struct fault_info *inf;
    523	struct mm_struct *mm = current->mm;
    524	vm_fault_t fault;
    525	unsigned long vm_flags;
    526	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
    527	unsigned long addr = untagged_addr(far);
    528
    529	if (kprobe_page_fault(regs, esr))
    530		return 0;
    531
    532	/*
    533	 * If we're in an interrupt or have no user context, we must not take
    534	 * the fault.
    535	 */
    536	if (faulthandler_disabled() || !mm)
    537		goto no_context;
    538
    539	if (user_mode(regs))
    540		mm_flags |= FAULT_FLAG_USER;
    541
    542	/*
    543	 * vm_flags tells us what bits we must have in vma->vm_flags
    544	 * for the fault to be benign, __do_page_fault() would check
    545	 * vma->vm_flags & vm_flags and returns an error if the
    546	 * intersection is empty
    547	 */
    548	if (is_el0_instruction_abort(esr)) {
    549		/* It was exec fault */
    550		vm_flags = VM_EXEC;
    551		mm_flags |= FAULT_FLAG_INSTRUCTION;
    552	} else if (is_write_abort(esr)) {
    553		/* It was write fault */
    554		vm_flags = VM_WRITE;
    555		mm_flags |= FAULT_FLAG_WRITE;
    556	} else {
    557		/* It was read fault */
    558		vm_flags = VM_READ;
    559		/* Write implies read */
    560		vm_flags |= VM_WRITE;
    561		/* If EPAN is absent then exec implies read */
    562		if (!cpus_have_const_cap(ARM64_HAS_EPAN))
    563			vm_flags |= VM_EXEC;
    564	}
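	/*
	 * Summary of the selection above (sketch):
	 *   exec fault  -> vm_flags = VM_EXEC,   plus FAULT_FLAG_INSTRUCTION
	 *   write fault -> vm_flags = VM_WRITE,  plus FAULT_FLAG_WRITE
	 *   read fault  -> vm_flags = VM_READ | VM_WRITE (| VM_EXEC without
	 *                  EPAN, since exec then implies read permission)
	 */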
    565
    566	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
    567		if (is_el1_instruction_abort(esr))
    568			die_kernel_fault("execution of user memory",
    569					 addr, esr, regs);
    570
    571		if (!search_exception_tables(regs->pc))
    572			die_kernel_fault("access to user memory outside uaccess routines",
    573					 addr, esr, regs);
    574	}
    575
    576	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
    577
    578	/*
    579	 * As per x86, we may deadlock here. However, since the kernel only
    580	 * validly references user space from well defined areas of the code,
    581	 * we can bug out early if this is from code which shouldn't.
    582	 */
    583	if (!mmap_read_trylock(mm)) {
    584		if (!user_mode(regs) && !search_exception_tables(regs->pc))
    585			goto no_context;
    586retry:
    587		mmap_read_lock(mm);
    588	} else {
    589		/*
     590		 * The above mmap_read_trylock() might have succeeded, in which
     591		 * case we'll have missed the might_sleep() from down_read().
    592		 */
    593		might_sleep();
    594#ifdef CONFIG_DEBUG_VM
    595		if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
    596			mmap_read_unlock(mm);
    597			goto no_context;
    598		}
    599#endif
    600	}
    601
    602	fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);
    603
    604	/* Quick path to respond to signals */
    605	if (fault_signal_pending(fault, regs)) {
    606		if (!user_mode(regs))
    607			goto no_context;
    608		return 0;
    609	}
    610
    611	if (fault & VM_FAULT_RETRY) {
    612		mm_flags |= FAULT_FLAG_TRIED;
    613		goto retry;
    614	}
    615	mmap_read_unlock(mm);
    616
    617	/*
    618	 * Handle the "normal" (no error) case first.
    619	 */
    620	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
    621			      VM_FAULT_BADACCESS))))
    622		return 0;
    623
    624	/*
    625	 * If we are in kernel mode at this point, we have no context to
    626	 * handle this fault with.
    627	 */
    628	if (!user_mode(regs))
    629		goto no_context;
    630
    631	if (fault & VM_FAULT_OOM) {
    632		/*
    633		 * We ran out of memory, call the OOM killer, and return to
    634		 * userspace (which will retry the fault, or kill us if we got
    635		 * oom-killed).
    636		 */
    637		pagefault_out_of_memory();
    638		return 0;
    639	}
    640
    641	inf = esr_to_fault_info(esr);
    642	set_thread_esr(addr, esr);
    643	if (fault & VM_FAULT_SIGBUS) {
    644		/*
    645		 * We had some memory, but were unable to successfully fix up
    646		 * this page fault.
    647		 */
    648		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
    649	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
    650		unsigned int lsb;
    651
    652		lsb = PAGE_SHIFT;
    653		if (fault & VM_FAULT_HWPOISON_LARGE)
    654			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
    655
    656		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
    657	} else {
    658		/*
    659		 * Something tried to access memory that isn't in our memory
    660		 * map.
    661		 */
    662		arm64_force_sig_fault(SIGSEGV,
    663				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
    664				      far, inf->name);
    665	}
    666
    667	return 0;
    668
    669no_context:
    670	__do_kernel_fault(addr, esr, regs);
    671	return 0;
    672}
    673
    674static int __kprobes do_translation_fault(unsigned long far,
    675					  unsigned long esr,
    676					  struct pt_regs *regs)
    677{
    678	unsigned long addr = untagged_addr(far);
    679
    680	if (is_ttbr0_addr(addr))
    681		return do_page_fault(far, esr, regs);
    682
    683	do_bad_area(far, esr, regs);
    684	return 0;
    685}
    686
    687static int do_alignment_fault(unsigned long far, unsigned long esr,
    688			      struct pt_regs *regs)
    689{
    690	do_bad_area(far, esr, regs);
    691	return 0;
    692}
    693
    694static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
    695{
    696	return 1; /* "fault" */
    697}
    698
    699static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
    700{
    701	const struct fault_info *inf;
    702	unsigned long siaddr;
    703
    704	inf = esr_to_fault_info(esr);
    705
    706	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
    707		/*
    708		 * APEI claimed this as a firmware-first notification.
    709		 * Some processing deferred to task_work before ret_to_user().
    710		 */
    711		return 0;
    712	}
    713
    714	if (esr & ESR_ELx_FnV) {
    715		siaddr = 0;
    716	} else {
    717		/*
    718		 * The architecture specifies that the tag bits of FAR_EL1 are
    719		 * UNKNOWN for synchronous external aborts. Mask them out now
    720		 * so that userspace doesn't see them.
    721		 */
    722		siaddr  = untagged_addr(far);
    723	}
    724	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
    725
    726	return 0;
    727}
    728
    729static int do_tag_check_fault(unsigned long far, unsigned long esr,
    730			      struct pt_regs *regs)
    731{
    732	/*
    733	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
    734	 * for tag check faults. Set them to corresponding bits in the untagged
    735	 * address.
    736	 */
    737	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
    738	do_bad_area(far, esr, regs);
    739	return 0;
    740}
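/*
 * Sketch (hypothetical address, assuming the 4-bit logical tag sits in
 * address bits [59:56]): a tag check fault on a user address may arrive
 * with UNKNOWN junk in bits [63:60], e.g. far = 0x3a00ffff8a234000 with
 * junk 0x3 and tag 0xa. The fixup above takes everything outside
 * MTE_TAG_MASK from __untagged_addr(far) and keeps only the tag bits
 * from the original far, giving 0x0a00ffff8a234000: clean top bits, but
 * still carrying the tag that actually faulted.
 */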
    741
    742static const struct fault_info fault_info[] = {
    743	{ do_bad,		SIGKILL, SI_KERNEL,	"ttbr address size fault"	},
    744	{ do_bad,		SIGKILL, SI_KERNEL,	"level 1 address size fault"	},
    745	{ do_bad,		SIGKILL, SI_KERNEL,	"level 2 address size fault"	},
    746	{ do_bad,		SIGKILL, SI_KERNEL,	"level 3 address size fault"	},
    747	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 0 translation fault"	},
    748	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault"	},
    749	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault"	},
    750	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault"	},
    751	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 8"			},
    752	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 access flag fault"	},
    753	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault"	},
    754	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 access flag fault"	},
    755	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 12"			},
    756	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 permission fault"	},
    757	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 permission fault"	},
    758	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 permission fault"	},
    759	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous external abort"	},
    760	{ do_tag_check_fault,	SIGSEGV, SEGV_MTESERR,	"synchronous tag check fault"	},
    761	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 18"			},
    762	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 19"			},
    763	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 (translation table walk)"	},
    764	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 (translation table walk)"	},
    765	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 (translation table walk)"	},
    766	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 (translation table walk)"	},
    767	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous parity or ECC error" },	// Reserved when RAS is implemented
    768	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 25"			},
    769	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 26"			},
    770	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 27"			},
    771	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
    772	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
    773	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
    774	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
    775	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 32"			},
    776	{ do_alignment_fault,	SIGBUS,  BUS_ADRALN,	"alignment fault"		},
    777	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 34"			},
    778	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 35"			},
    779	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 36"			},
    780	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 37"			},
    781	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 38"			},
    782	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 39"			},
    783	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 40"			},
    784	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 41"			},
    785	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 42"			},
    786	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 43"			},
    787	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 44"			},
    788	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 45"			},
    789	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 46"			},
    790	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 47"			},
    791	{ do_bad,		SIGKILL, SI_KERNEL,	"TLB conflict abort"		},
    792	{ do_bad,		SIGKILL, SI_KERNEL,	"Unsupported atomic hardware update fault"	},
    793	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 50"			},
    794	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 51"			},
    795	{ do_bad,		SIGKILL, SI_KERNEL,	"implementation fault (lockdown abort)" },
    796	{ do_bad,		SIGBUS,  BUS_OBJERR,	"implementation fault (unsupported exclusive)" },
    797	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 54"			},
    798	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 55"			},
    799	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 56"			},
    800	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 57"			},
    801	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 58" 			},
    802	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 59"			},
    803	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 60"			},
    804	{ do_bad,		SIGKILL, SI_KERNEL,	"section domain fault"		},
    805	{ do_bad,		SIGKILL, SI_KERNEL,	"page domain fault"		},
    806	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
    807};
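/*
 * Note on indexing (sketch): the table above has one entry for each of
 * the 64 possible values of the 6-bit ESR_ELx.FSC field, so
 * esr_to_fault_info() can index it directly with (esr & ESR_ELx_FSC)
 * and needs no bounds check; any status code without a dedicated
 * handler falls through to do_bad().
 */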
    808
    809void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
    810{
    811	const struct fault_info *inf = esr_to_fault_info(esr);
    812	unsigned long addr = untagged_addr(far);
    813
    814	if (!inf->fn(far, esr, regs))
    815		return;
    816
    817	if (!user_mode(regs))
    818		die_kernel_fault(inf->name, addr, esr, regs);
    819
    820	/*
    821	 * At this point we have an unrecognized fault type whose tag bits may
    822	 * have been defined as UNKNOWN. Therefore we only expose the untagged
    823	 * address to the signal handler.
    824	 */
    825	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
    826}
    827NOKPROBE_SYMBOL(do_mem_abort);
    828
    829void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
    830{
    831	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
    832			 addr, esr);
    833}
    834NOKPROBE_SYMBOL(do_sp_pc_abort);
    835
    836int __init early_brk64(unsigned long addr, unsigned long esr,
    837		       struct pt_regs *regs);
    838
    839/*
    840 * __refdata because early_brk64 is __init, but the reference to it is
    841 * clobbered at arch_initcall time.
    842 * See traps.c and debug-monitors.c:debug_traps_init().
    843 */
    844static struct fault_info __refdata debug_fault_info[] = {
    845	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware breakpoint"	},
    846	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware single-step"	},
    847	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware watchpoint"	},
    848	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 3"		},
    849	{ do_bad,	SIGTRAP,	TRAP_BRKPT,	"aarch32 BKPT"		},
    850	{ do_bad,	SIGKILL,	SI_KERNEL,	"aarch32 vector catch"	},
    851	{ early_brk64,	SIGTRAP,	TRAP_BRKPT,	"aarch64 BRK"		},
    852	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 7"		},
    853};
    854
    855void __init hook_debug_fault_code(int nr,
    856				  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
    857				  int sig, int code, const char *name)
    858{
    859	BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
    860
    861	debug_fault_info[nr].fn		= fn;
    862	debug_fault_info[nr].sig	= sig;
    863	debug_fault_info[nr].code	= code;
    864	debug_fault_info[nr].name	= name;
    865}
    866
    867/*
    868 * In debug exception context, we explicitly disable preemption despite
    869 * having interrupts disabled.
    870 * This serves two purposes: it makes it much less likely that we would
    871 * accidentally schedule in exception context and it will force a warning
    872 * if we somehow manage to schedule by accident.
    873 */
    874static void debug_exception_enter(struct pt_regs *regs)
    875{
    876	preempt_disable();
    877
    878	/* This code is a bit fragile.  Test it. */
    879	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
    880}
    881NOKPROBE_SYMBOL(debug_exception_enter);
    882
    883static void debug_exception_exit(struct pt_regs *regs)
    884{
    885	preempt_enable_no_resched();
    886}
    887NOKPROBE_SYMBOL(debug_exception_exit);
    888
    889void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
    890			struct pt_regs *regs)
    891{
    892	const struct fault_info *inf = esr_to_debug_fault_info(esr);
    893	unsigned long pc = instruction_pointer(regs);
    894
    895	debug_exception_enter(regs);
    896
    897	if (user_mode(regs) && !is_ttbr0_addr(pc))
    898		arm64_apply_bp_hardening();
    899
    900	if (inf->fn(addr_if_watchpoint, esr, regs)) {
    901		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
    902	}
    903
    904	debug_exception_exit(regs);
    905}
    906NOKPROBE_SYMBOL(do_debug_exception);
    907
    908/*
    909 * Used during anonymous page fault handling.
    910 */
    911struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
    912						unsigned long vaddr)
    913{
    914	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;
    915
    916	/*
    917	 * If the page is mapped with PROT_MTE, initialise the tags at the
    918	 * point of allocation and page zeroing as this is usually faster than
    919	 * separate DC ZVA and STGM.
    920	 */
    921	if (vma->vm_flags & VM_MTE)
    922		flags |= __GFP_ZEROTAGS;
    923
    924	return alloc_page_vma(flags, vma, vaddr);
    925}
    926
    927void tag_clear_highpage(struct page *page)
    928{
    929	mte_zero_clear_page_tags(page_address(page));
    930	page_kasan_tag_reset(page);
    931	set_bit(PG_mte_tagged, &page->flags);
    932}