cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

nmi.c (15360B)


// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 *  Copyright (C) 2011	Don Zickus Red Hat, Inc.
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/atomic.h>
#include <linux/sched/clock.h>

#include <asm/cpu_entry_area.h>
#include <asm/traps.h>
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
#include <asm/sev.h>

#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>

struct nmi_desc {
	raw_spinlock_t lock;
	struct list_head head;
};

static struct nmi_desc nmi_desc[NMI_MAX] =
{
	{
		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
		.head = LIST_HEAD_INIT(nmi_desc[0].head),
	},
	{
		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
		.head = LIST_HEAD_INIT(nmi_desc[1].head),
	},
	{
		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
		.head = LIST_HEAD_INIT(nmi_desc[2].head),
	},
	{
		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
		.head = LIST_HEAD_INIT(nmi_desc[3].head),
	},
};

struct nmi_stats {
	unsigned int normal;
	unsigned int unknown;
	unsigned int external;
	unsigned int swallow;
};

static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);

static int ignore_nmis __read_mostly;

int unknown_nmi_panic;
/*
 * Prevent the NMI reason port (0x61) from being accessed simultaneously;
 * may only be used in the NMI handler.
 */
static DEFINE_RAW_SPINLOCK(nmi_reason_lock);

static int __init setup_unknown_nmi_panic(char *str)
{
	unknown_nmi_panic = 1;
	return 1;
}
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
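
/*
 * Note: besides the "unknown_nmi_panic" boot parameter handled above,
 * unknown_nmi_panic is (on x86) also expected to be reachable at run time
 * through the kernel.unknown_nmi_panic sysctl,
 * i.e. /proc/sys/kernel/unknown_nmi_panic.
 */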

#define nmi_to_desc(type) (&nmi_desc[type])

static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;

static int __init nmi_warning_debugfs(void)
{
	debugfs_create_u64("nmi_longest_ns", 0644,
			arch_debugfs_dir, &nmi_longest_ns);
	return 0;
}
fs_initcall(nmi_warning_debugfs);
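
/*
 * With debugfs mounted at /sys/kernel/debug, the knob created above shows
 * up as /sys/kernel/debug/x86/nmi_longest_ns (arch_debugfs_dir being the
 * x86 debugfs directory); raising it at run time quiets the duration
 * warning below for handlers that are known to be slow.
 */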

static void nmi_check_duration(struct nmiaction *action, u64 duration)
{
	int remainder_ns, decimal_msecs;

	if (duration < nmi_longest_ns || duration < action->max_duration)
		return;

	action->max_duration = duration;

	remainder_ns = do_div(duration, (1000 * 1000));
	decimal_msecs = remainder_ns / 1000;

	printk_ratelimited(KERN_INFO
		"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
		action->handler, duration, decimal_msecs);
}
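
/*
 * Worked example for the formatting above: duration == 1234567 ns leaves
 * duration == 1 and remainder_ns == 234567 after do_div(), so
 * decimal_msecs == 234 and the warning reports "1.234 msecs".
 */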

static int nmi_handle(unsigned int type, struct pt_regs *regs)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *a;
	int handled = 0;

	rcu_read_lock();

	/*
	 * NMIs are edge-triggered, which means if you have enough
	 * of them concurrently, you can lose some because only one
	 * can be latched at any given time.  Walk the whole list
	 * to handle those situations.
	 */
	list_for_each_entry_rcu(a, &desc->head, list) {
		int thishandled;
		u64 delta;

		delta = sched_clock();
		thishandled = a->handler(type, regs);
		handled += thishandled;
		delta = sched_clock() - delta;
		trace_nmi_handler(a->handler, (int)delta, thishandled);

		nmi_check_duration(a, delta);
	}

	rcu_read_unlock();

	/* return total number of NMI events handled */
	return handled;
}
NOKPROBE_SYMBOL(nmi_handle);

int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	unsigned long flags;

	if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list)))
		return -EINVAL;

	raw_spin_lock_irqsave(&desc->lock, flags);

	/*
	 * Indicate if there are multiple registrations on the
	 * internal NMI handler call chains (SERR and IO_CHECK).
	 */
	WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));

	/*
	 * Some handlers need to be executed first, otherwise a fake
	 * event confuses some handlers (kdump uses this flag).
	 */
	if (action->flags & NMI_FLAG_FIRST)
		list_add_rcu(&action->list, &desc->head);
	else
		list_add_tail_rcu(&action->list, &desc->head);

	raw_spin_unlock_irqrestore(&desc->lock, flags);
	return 0;
}
EXPORT_SYMBOL(__register_nmi_handler);

void unregister_nmi_handler(unsigned int type, const char *name)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *n, *found = NULL;
	unsigned long flags;

	raw_spin_lock_irqsave(&desc->lock, flags);

	list_for_each_entry_rcu(n, &desc->head, list) {
		/*
		 * The name passed in to describe the NMI handler
		 * is used as the lookup key.
		 */
		if (!strcmp(n->name, name)) {
			WARN(in_nmi(),
				"Trying to free NMI (%s) from NMI context!\n", n->name);
			list_del_rcu(&n->list);
			found = n;
			break;
		}
	}

	raw_spin_unlock_irqrestore(&desc->lock, flags);
	if (found) {
		synchronize_rcu();
		INIT_LIST_HEAD(&found->list);
	}
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
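
/*
 * Illustrative sketch of typical usage (the example_* names below are made
 * up for illustration, not part of this file): a client returns NMI_HANDLED
 * from its handler when it consumed the NMI and NMI_DONE otherwise,
 * registers it on one of the call chains via the register_nmi_handler()
 * wrapper from <asm/nmi.h>, and later unregisters it by the same name.
 */
#if 0
static int example_nmi_handler(unsigned int type, struct pt_regs *regs)
{
	/*
	 * Check hardware/per-CPU state here; return NMI_DONE if this NMI
	 * was not ours so the remaining handlers on the chain still run.
	 */
	return NMI_HANDLED;
}

static int __init example_register(void)
{
	/* "example" is the lookup key later used by unregister_nmi_handler(). */
	return register_nmi_handler(NMI_LOCAL, example_nmi_handler, 0,
				    "example");
}

static void example_unregister(void)
{
	unregister_nmi_handler(NMI_LOCAL, "example");
}
#endif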

static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_SERR, regs))
		return;

	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	if (panic_on_unrecovered_nmi)
		nmi_panic(regs, "NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");

	/* Clear and disable the PCI SERR error line. */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
	outb(reason, NMI_REASON_PORT);
}
NOKPROBE_SYMBOL(pci_serr_error);

static void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
	unsigned long i;

	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_IO_CHECK, regs))
		return;

	pr_emerg(
	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());
	show_regs(regs);

	if (panic_on_io_nmi) {
		nmi_panic(regs, "NMI IOCK error: Not continuing");

		/*
		 * If we end up here, it means we have received an NMI while
		 * processing panic(). Simply return without delaying and
		 * re-enabling NMIs.
		 */
		return;
	}

	/* Re-enable the IOCK line, wait for a few seconds */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);

	i = 20000;
	while (--i) {
		touch_nmi_watchdog();
		udelay(100);
	}

	reason &= ~NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);
}
NOKPROBE_SYMBOL(io_check_error);

static void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
	int handled;

	/*
	 * Use 'false' as back-to-back NMIs are dealt with one level up.
	 * Of course this makes having multiple 'unknown' handlers useless
	 * as only the first one is ever run (unless it can actually determine
	 * if it caused the NMI).
	 */
	handled = nmi_handle(NMI_UNKNOWN, regs);
	if (handled) {
		__this_cpu_add(nmi_stats.unknown, handled);
		return;
	}

	__this_cpu_add(nmi_stats.unknown, 1);

	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
		nmi_panic(regs, "NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");
}
NOKPROBE_SYMBOL(unknown_nmi_error);

static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);

static noinstr void default_do_nmi(struct pt_regs *regs)
{
	unsigned char reason = 0;
	int handled;
	bool b2b = false;

	/*
	 * CPU-specific NMI must be processed before non-CPU-specific
	 * NMI, otherwise we may lose it, because the CPU-specific
	 * NMI can not be detected/processed on other CPUs.
	 */

	/*
	 * Back-to-back NMIs are interesting because they can either
	 * be two NMIs or more than two NMIs (anything over two is dropped
	 * due to NMIs being edge-triggered).  If this is the second half
	 * of the back-to-back NMI, assume we dropped things and process
	 * more handlers.  Otherwise, reset the 'swallow' NMI behaviour.
	 */
	if (regs->ip == __this_cpu_read(last_nmi_rip))
		b2b = true;
	else
		__this_cpu_write(swallow_nmi, false);

	__this_cpu_write(last_nmi_rip, regs->ip);

	instrumentation_begin();

	handled = nmi_handle(NMI_LOCAL, regs);
	__this_cpu_add(nmi_stats.normal, handled);
	if (handled) {
		/*
		 * There are cases when an NMI handler handles multiple
		 * events in the current NMI.  One of these events may
		 * be queued for the next NMI.  Because the event is
		 * already handled, the next NMI will result in an unknown
		 * NMI.  Instead, let's flag this for a potential NMI to
		 * swallow.
		 */
		if (handled > 1)
			__this_cpu_write(swallow_nmi, true);
		goto out;
	}

	/*
	 * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
	 *
	 * Another CPU may be processing panic routines while holding
	 * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
	 * and if so, call its callback directly.  If there is no CPU preparing
	 * crash dump, we simply loop here.
	 */
	while (!raw_spin_trylock(&nmi_reason_lock)) {
		run_crash_ipi_callback(regs);
		cpu_relax();
	}

	reason = x86_platform.get_nmi_reason();

	if (reason & NMI_REASON_MASK) {
		if (reason & NMI_REASON_SERR)
			pci_serr_error(reason, regs);
		else if (reason & NMI_REASON_IOCHK)
			io_check_error(reason, regs);
#ifdef CONFIG_X86_32
		/*
		 * Reassert NMI in case it became active
		 * meanwhile as it's edge-triggered:
		 */
		reassert_nmi();
#endif
		__this_cpu_add(nmi_stats.external, 1);
		raw_spin_unlock(&nmi_reason_lock);
		goto out;
	}
	raw_spin_unlock(&nmi_reason_lock);

	/*
	 * Only one NMI can be latched at a time.  To handle
	 * this we may process multiple nmi handlers at once to
	 * cover the case where an NMI is dropped.  The downside
	 * to this approach is we may process an NMI prematurely,
	 * while its real NMI is sitting latched.  This will cause
	 * an unknown NMI on the next run of the NMI processing.
	 *
	 * We tried to flag that condition above, by setting the
	 * swallow_nmi flag when we process more than one event.
	 * This condition is also only present on the second half
	 * of a back-to-back NMI, so we flag that condition too.
	 *
	 * If both are true, we assume we already processed this
	 * NMI previously and we swallow it.  Otherwise we reset
	 * the logic.
	 *
	 * There are scenarios where we may accidentally swallow
	 * a 'real' unknown NMI.  For example, while processing
	 * a perf NMI another perf NMI comes in along with a
	 * 'real' unknown NMI.  These two NMIs get combined into
	 * one (as described above).  When the next NMI gets
	 * processed, it will be flagged by perf as handled, but
	 * no one will know that there was a 'real' unknown NMI sent
	 * also.  As a result it gets swallowed.  Or if the first
	 * perf NMI returns two events handled then the second
	 * NMI will get eaten by the logic below, again losing a
	 * 'real' unknown NMI.  But this is the best we can do
	 * for now.
	 */
	if (b2b && __this_cpu_read(swallow_nmi))
		__this_cpu_add(nmi_stats.swallow, 1);
	else
		unknown_nmi_error(reason, regs);

out:
	instrumentation_end();
}

/*
 * NMIs can page fault or hit breakpoints, which will cause them to lose
 * their NMI context with the CPU when the breakpoint or page fault does an
 * IRET.
 *
 * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
 * NMI processing.  On x86_64, the asm glue protects us from nested NMIs
 * if the outer NMI came from kernel mode, but we can still nest if the
 * outer NMI came from user mode.
 *
 * To handle these nested NMIs, we have three states:
 *
 *  1) not running
 *  2) executing
 *  3) latched
 *
 * When no NMI is in progress, it is in the "not running" state.
 * When an NMI comes in, it goes into the "executing" state.
 * Normally, if another NMI is triggered, it does not interrupt
 * the running NMI and the HW will simply latch it so that when
 * the first NMI finishes, it will restart the second NMI.
 * (Note, the latch is binary, thus multiple NMIs triggering,
 *  when one is running, are ignored. Only one NMI is restarted.)
 *
 * If an NMI executes an iret, another NMI can preempt it. We do not
 * want to allow this new NMI to run, but we want to execute it when the
 * first one finishes.  We set the state to "latched", and the exit of
 * the first NMI will perform a dec_return, if the result is zero
 * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
 * dec_return would have set the state to NMI_EXECUTING (what we want it
 * to be when we are running). In this case, we simply jump back to
 * rerun the NMI handler again, and restart the 'latched' NMI.
 *
 * No trap (breakpoint or page fault) should be hit before nmi_restart,
 * thus there is no race between the first check of state for NOT_RUNNING
 * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
 * at this point.
 *
 * In case the NMI takes a page fault, we need to save off the CR2
 * because the NMI could have preempted another page fault and corrupt
 * the CR2 that is about to be read. As nested NMIs must be restarted
 * and they cannot take breakpoints or page faults, the update of the
 * CR2 must be done before converting the nmi state back to NOT_RUNNING.
 * Otherwise, there would be a race of another nested NMI coming in
 * after setting state to NOT_RUNNING but before updating the nmi_cr2.
 */
enum nmi_states {
	NMI_NOT_RUNNING = 0,
	NMI_EXECUTING,
	NMI_LATCHED,
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
static DEFINE_PER_CPU(unsigned long, nmi_dr7);

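/*
 * Illustrative walk through the state machine above: NMI #1 arrives with
 * nmi_state == NMI_NOT_RUNNING and sets it to NMI_EXECUTING.  If #1 hits a
 * breakpoint, the IRET from that breakpoint unmasks NMIs and NMI #2 can come
 * in; #2 sees NMI_EXECUTING, writes NMI_LATCHED and returns immediately.
 * When #1 finishes, this_cpu_dec_return() turns NMI_LATCHED back into
 * NMI_EXECUTING (non-zero), so #1 jumps to nmi_restart and runs the handlers
 * again on behalf of the latched NMI.
 */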
DEFINE_IDTENTRY_RAW(exc_nmi)
{
	irqentry_state_t irq_state;

	/*
	 * Re-enable NMIs right here when running as an SEV-ES guest. This might
	 * cause nested NMIs, but those can be handled safely.
	 */
	sev_es_nmi_complete();

	if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
		return;

	if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
		this_cpu_write(nmi_state, NMI_LATCHED);
		return;
	}
	this_cpu_write(nmi_state, NMI_EXECUTING);
	this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:

	/*
	 * Needs to happen before DR7 is accessed, because the hypervisor can
	 * intercept DR7 reads/writes, turning those into #VC exceptions.
	 */
	sev_es_ist_enter(regs);

	this_cpu_write(nmi_dr7, local_db_save());

	irq_state = irqentry_nmi_enter(regs);

	inc_irq_stat(__nmi_count);

	if (!ignore_nmis)
		default_do_nmi(regs);

	irqentry_nmi_exit(regs, irq_state);

	local_db_restore(this_cpu_read(nmi_dr7));

	sev_es_ist_exit();

	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
		write_cr2(this_cpu_read(nmi_cr2));
	if (this_cpu_dec_return(nmi_state))
		goto nmi_restart;

	if (user_mode(regs))
		mds_user_clear_cpu_buffers();
}

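/*
 * exc_nmi_noist below is intended for KVM (VMX): when an NMI causes a
 * VM-exit, KVM invokes the NMI handler through a direct call rather than a
 * real NMI gate, so it needs an entry point that does not switch to the
 * NMI IST stack.
 */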
#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
DEFINE_IDTENTRY_RAW(exc_nmi_noist)
{
	exc_nmi(regs);
}
#endif
#if IS_MODULE(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
#endif

void stop_nmi(void)
{
	ignore_nmis++;
}

void restart_nmi(void)
{
	ignore_nmis--;
}

/* reset the back-to-back NMI logic */
void local_touch_nmi(void)
{
	__this_cpu_write(last_nmi_rip, 0);
}
EXPORT_SYMBOL_GPL(local_touch_nmi);