cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dumpstack.c (13270B)


/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 */
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/utsname.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/ftrace.h>
#include <linux/kexec.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/sysfs.h>
#include <linux/kasan.h>

#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>

int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
static int die_counter;

static struct pt_regs exec_summary_regs;

bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
			   struct stack_info *info)
{
	unsigned long *begin = task_stack_page(task);
	unsigned long *end   = task_stack_page(task) + THREAD_SIZE;

	if (stack < begin || stack >= end)
		return false;

	info->type	= STACK_TYPE_TASK;
	info->begin	= begin;
	info->end	= end;
	info->next_sp	= NULL;

	return true;
}

/* Called from get_stack_info_noinstr - so must be noinstr too */
bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)
{
	struct entry_stack *ss = cpu_entry_stack(smp_processor_id());

	void *begin = ss;
	void *end = ss + 1;
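	/*
	 * Pointer arithmetic: 'ss + 1' steps one whole struct entry_stack
	 * past 'begin', i.e. the exclusive end of this CPU's entry stack.
	 */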

	if ((void *)stack < begin || (void *)stack >= end)
		return false;

	info->type	= STACK_TYPE_ENTRY;
	info->begin	= begin;
	info->end	= end;
	info->next_sp	= NULL;

	return true;
}

static void printk_stack_address(unsigned long address, int reliable,
				 const char *log_lvl)
{
	touch_nmi_watchdog();
	printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? ", (void *)address);
}

static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
		     unsigned int nbytes)
{
	if (!user_mode(regs))
		return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);

	/* The user space code of other tasks cannot be accessed. */
	if (regs != task_pt_regs(current))
		return -EPERM;

	/*
	 * Even though it is named copy_from_user_nmi(), this can be invoked
	 * from other contexts as well. It will not try to resolve a page
	 * fault, which is the correct thing to do here because this code can
	 * be called from any context.
	 */
	return copy_from_user_nmi(buf, (void __user *)src, nbytes);
}

/*
 * There are a couple of reasons for the 2/3rds prologue, courtesy of Linus:
 *
 * In the case where we don't have the exact kernel image (if we did, we could
 * simply disassemble it and navigate to the RIP), the purpose of the bigger
 * prologue is to have more context and to be able to correlate the code from
 * the different toolchains better.
 *
 * In addition, it helps in recreating the register allocation of the failing
 * kernel and thus in making sense of the register dump.
 *
 * What is more, the additional complication of a variable-length insn arch
 * like x86 warrants having a longer byte sequence before rIP so that the
 * disassembler can "sync" up properly and find instruction boundaries when
 * decoding the opcode bytes.
 *
 * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random
 * guesstimate in an attempt to achieve all of the above.
 */
void show_opcodes(struct pt_regs *regs, const char *loglvl)
{
#define PROLOGUE_SIZE 42
#define EPILOGUE_SIZE 21
#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
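	/*
	 * Buffer arithmetic: 42 prologue bytes + 1 faulting byte + 21
	 * epilogue bytes = 64 bytes total, with the prologue taking 2/3rds
	 * of the 63 bytes of context (42 = 2 * 21), per the rationale above.
	 * In the printk below, "%42ph"/"%21ph" are the kernel's hex-dump
	 * format extension, and the byte at rIP is singled out between
	 * angle brackets via "<%02x>".
	 */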
	u8 opcodes[OPCODE_BUFSIZE];
	unsigned long prologue = regs->ip - PROLOGUE_SIZE;

	switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
	case 0:
		printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
		       __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
		       opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
		break;
	case -EPERM:
		/* No access to the user space stack of other tasks. Ignore. */
		break;
	default:
		printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
		       loglvl, prologue);
		break;
	}
}

void show_ip(struct pt_regs *regs, const char *loglvl)
{
#ifdef CONFIG_X86_32
	printk("%sEIP: %pS\n", loglvl, (void *)regs->ip);
#else
	printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
#endif
	show_opcodes(regs, loglvl);
}

void show_iret_regs(struct pt_regs *regs, const char *log_lvl)
{
	show_ip(regs, log_lvl);
	printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss,
		regs->sp, regs->flags);
}

static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
				  bool partial, const char *log_lvl)
{
	/*
	 * These on_stack() checks aren't strictly necessary: the unwind code
	 * has already validated the 'regs' pointer.  The checks are done for
	 * ordering reasons: if the registers are on the next stack, we don't
	 * want to print them out yet.  Otherwise they'll be shown as part of
	 * the wrong stack.  Later, when show_trace_log_lvl() switches to the
	 * next stack, this function will be called again with the same regs so
	 * they can be printed in the right context.
	 */
	if (!partial && on_stack(info, regs, sizeof(*regs))) {
		__show_regs(regs, SHOW_REGS_SHORT, log_lvl);

	} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
				       IRET_FRAME_SIZE)) {
		/*
		 * When an interrupt or exception occurs in entry code, the
		 * full pt_regs might not have been saved yet.  In that case
		 * just print the iret frame.
		 */
		show_iret_regs(regs, log_lvl);
	}
}
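/*
 * Note on the partial case above: IRET_FRAME_OFFSET/IRET_FRAME_SIZE delimit
 * the hardware iret frame (ip, cs, flags, sp, ss) at the tail of struct
 * pt_regs, which is all the entry code is guaranteed to have pushed at that
 * point, hence only that tail is range-checked and printed.
 */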

static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
			unsigned long *stack, const char *log_lvl)
{
	struct unwind_state state;
	struct stack_info stack_info = {0};
	unsigned long visit_mask = 0;
	int graph_idx = 0;
	bool partial = false;

	printk("%sCall Trace:\n", log_lvl);

	unwind_start(&state, task, regs, stack);
	stack = stack ? : get_stack_pointer(task, regs);
	regs = unwind_get_entry_regs(&state, &partial);

	/*
	 * Iterate through the stacks, starting with the current stack pointer.
	 * Each stack has a pointer to the next one.
	 *
	 * x86-64 can have several stacks:
	 * - task stack
	 * - interrupt stack
	 * - HW exception stacks (double fault, nmi, debug, mce)
	 * - entry stack
	 *
	 * x86-32 can have up to four stacks:
	 * - task stack
	 * - softirq stack
	 * - hardirq stack
	 * - entry stack
	 */
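	/*
	 * Each get_stack_info() call below fills stack_info.next_sp with a
	 * pointer into the next stack in the chain (or NULL for the last
	 * one), which the outer loop follows; visit_mask records stack types
	 * already seen so a corrupted chain cannot loop forever.
	 */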
	for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
		const char *stack_name;

		if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
			/*
			 * We weren't on a valid stack.  It's possible that
			 * we overflowed a valid stack into a guard page.
			 * See if the next page up is valid so that we can
			 * generate some kind of backtrace if this happens.
			 */
			stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
			if (get_stack_info(stack, task, &stack_info, &visit_mask))
				break;
		}

		stack_name = stack_type_name(stack_info.type);
		if (stack_name)
			printk("%s <%s>\n", log_lvl, stack_name);

		if (regs)
			show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);

		/*
		 * Scan the stack, printing any text addresses we find.  At the
		 * same time, follow proper stack frames with the unwinder.
		 *
		 * Addresses found during the scan which are not reported by
		 * the unwinder are considered to be additional clues which are
		 * sometimes useful for debugging and are prefixed with '?'.
		 * This also serves as a failsafe option in case the unwinder
		 * goes off in the weeds.
		 */
		for (; stack < stack_info.end; stack++) {
			unsigned long real_addr;
			int reliable = 0;
			unsigned long addr = READ_ONCE_NOCHECK(*stack);
			unsigned long *ret_addr_p =
				unwind_get_return_address_ptr(&state);

			if (!__kernel_text_address(addr))
				continue;

			/*
			 * Don't print regs->ip again if it was already printed
			 * by show_regs_if_on_stack().
			 */
			if (regs && stack == &regs->ip)
				goto next;

			if (stack == ret_addr_p)
				reliable = 1;

			/*
			 * When function graph tracing is enabled for a
			 * function, its return address on the stack is
			 * replaced with the address of an ftrace handler
			 * (return_to_handler).  In that case, before printing
			 * the "real" address, we want to print the handler
			 * address as an "unreliable" hint that function graph
			 * tracing was involved.
			 */
			real_addr = ftrace_graph_ret_addr(task, &graph_idx,
							  addr, stack);
			if (real_addr != addr)
				printk_stack_address(addr, 0, log_lvl);
			printk_stack_address(real_addr, reliable, log_lvl);

			if (!reliable)
				continue;

next:
			/*
			 * Get the next frame from the unwinder.  No need to
			 * check for an error: if anything goes wrong, the rest
			 * of the addresses will just be printed as unreliable.
			 */
			unwind_next_frame(&state);

			/* if the frame has entry regs, print them */
			regs = unwind_get_entry_regs(&state, &partial);
			if (regs)
				show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);
		}

		if (stack_name)
			printk("%s </%s>\n", log_lvl, stack_name);
	}
}

void show_stack(struct task_struct *task, unsigned long *sp,
		       const char *loglvl)
{
	task = task ? : current;

	/*
	 * Stack frames below this one aren't interesting.  Don't show them
	 * if we're printing for %current.
	 */
	if (!sp && task == current)
		sp = get_stack_pointer(current, NULL);

	show_trace_log_lvl(task, NULL, sp, loglvl);
}
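/*
 * Usage sketch (illustrative): both pointer arguments may be left NULL to
 * dump the current task's stack from the current stack pointer, e.g.
 *
 *	show_stack(NULL, NULL, KERN_INFO);
 *
 * A NULL task defaults to current and a NULL sp is resolved through
 * get_stack_pointer(), as above.
 */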

void show_stack_regs(struct pt_regs *regs)
{
	show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}

static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;

unsigned long oops_begin(void)
{
	int cpu;
	unsigned long flags;

	oops_enter();

	/* racy, but better than risking deadlock. */
	raw_local_irq_save(flags);
	cpu = smp_processor_id();
	if (!arch_spin_trylock(&die_lock)) {
		if (cpu == die_owner)
			/* nested oops. should stop eventually */;
		else
			arch_spin_lock(&die_lock);
	}
	die_nest_count++;
	die_owner = cpu;
	console_verbose();
	bust_spinlocks(1);
	return flags;
}
NOKPROBE_SYMBOL(oops_begin);
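/*
 * NOKPROBE_SYMBOL() puts these functions on the kprobes blacklist: the oops
 * path itself must not be probed, since a probe firing here could recurse
 * into another oops.
 */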

void __noreturn rewind_stack_and_make_dead(int signr);

void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
	if (regs && kexec_should_crash(current))
		crash_kexec(regs);

	bust_spinlocks(0);
	die_owner = -1;
	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
	die_nest_count--;
	if (!die_nest_count)
		/* Nest count reaches zero, release the lock. */
		arch_spin_unlock(&die_lock);
	raw_local_irq_restore(flags);
	oops_exit();

	/* Executive summary in case the oops scrolled away */
	__show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT);

	if (!signr)
		return;
	if (in_interrupt())
		panic("Fatal exception in interrupt");
	if (panic_on_oops)
		panic("Fatal exception");

	/*
	 * We're not going to return, but we might be on an IST stack or
	 * have very little stack space left.  Rewind the stack and kill
	 * the task.
	 * Before we rewind the stack, we have to tell KASAN that we're going to
	 * reuse the task stack and that existing poisons are invalid.
	 */
	kasan_unpoison_task_stack(current);
	rewind_stack_and_make_dead(signr);
}
NOKPROBE_SYMBOL(oops_end);

static void __die_header(const char *str, struct pt_regs *regs, long err)
{
	const char *pr = "";

	/* Save the regs of the first oops for the executive summary later. */
	if (!die_counter)
		exec_summary_regs = *regs;

	if (IS_ENABLED(CONFIG_PREEMPTION))
		pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";

	printk(KERN_DEFAULT
	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
	       pr,
	       IS_ENABLED(CONFIG_SMP)     ? " SMP"             : "",
	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "",
	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
}
NOKPROBE_SYMBOL(__die_header);

static int __die_body(const char *str, struct pt_regs *regs, long err)
{
	show_regs(regs);
	print_modules();

	if (notify_die(DIE_OOPS, str, regs, err,
			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
		return 1;

	return 0;
}
NOKPROBE_SYMBOL(__die_body);

int __die(const char *str, struct pt_regs *regs, long err)
{
	__die_header(str, regs, err);
	return __die_body(str, regs, err);
}
NOKPROBE_SYMBOL(__die);

/*
 * This is gone through when something in the kernel has done something bad
 * and is about to be terminated:
 */
void die(const char *str, struct pt_regs *regs, long err)
{
	unsigned long flags = oops_begin();
	int sig = SIGSEGV;

	if (__die(str, regs, err))
		sig = 0;
	oops_end(flags, regs, sig);
}
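/*
 * Illustrative only: a trap handler that has decided a kernel-mode fault is
 * unrecoverable would typically end up here via something like
 *
 *	die("some fatal trap", regs, error_code);
 *
 * which wraps the oops_begin()/__die()/oops_end() sequence above.
 */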

void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr)
{
	unsigned long flags = oops_begin();
	int sig = SIGSEGV;

	__die_header(str, regs, err);
	if (gp_addr)
		kasan_non_canonical_hook(gp_addr);
	if (__die_body(str, regs, err))
		sig = 0;
	oops_end(flags, regs, sig);
}

void show_regs(struct pt_regs *regs)
{
	enum show_regs_mode print_kernel_regs;

	show_regs_print_info(KERN_DEFAULT);

	print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL;
	__show_regs(regs, print_kernel_regs, KERN_DEFAULT);

	/*
	 * When in the kernel, we also print out the stack at the time of the
	 * fault.
	 */
	if (!user_mode(regs))
		show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}