cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

watchdog.c (16115B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Watchdog support on powerpc systems.
      4 *
      5 * Copyright 2017, IBM Corporation.
      6 *
      7 * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
      8 */
      9
     10#define pr_fmt(fmt) "watchdog: " fmt
     11
     12#include <linux/kernel.h>
     13#include <linux/param.h>
     14#include <linux/init.h>
     15#include <linux/percpu.h>
     16#include <linux/cpu.h>
     17#include <linux/nmi.h>
     18#include <linux/module.h>
     19#include <linux/export.h>
     20#include <linux/kprobes.h>
     21#include <linux/hardirq.h>
     22#include <linux/reboot.h>
     23#include <linux/slab.h>
     24#include <linux/kdebug.h>
     25#include <linux/sched/debug.h>
     26#include <linux/delay.h>
     27#include <linux/processor.h>
     28#include <linux/smp.h>
     29
     30#include <asm/interrupt.h>
     31#include <asm/paca.h>
     32#include <asm/nmi.h>
     33
     34/*
     35 * The powerpc watchdog ensures that each CPU is able to service timers.
     36 * The watchdog sets up a simple timer on each CPU to run once per timer
     37 * period, and updates a per-cpu timestamp and a "pending" cpumask. This is
     38 * the heartbeat.
     39 *
     40 * Then there are two systems to check that the heartbeat is still running.
     41 * The local soft-NMI, and the SMP checker.
     42 *
     43 * The soft-NMI checker can detect lockups on the local CPU. When interrupts
     44 * are disabled with local_irq_disable(), platforms that use soft-masking
     45 * can leave hardware interrupts enabled and handle them with a masked
     46 * interrupt handler. The masked handler can send the timer interrupt to the
     47 * watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
     48 * interrupt, and can be used to detect CPUs stuck with IRQs disabled.
     49 *
     50 * The soft-NMI checker will compare the heartbeat timestamp for this CPU
     51 * with the current time, and take action if the difference exceeds the
     52 * watchdog threshold.
     53 *
     54 * The limitation of the soft-NMI watchdog is that it does not work when
     55 * interrupts are hard disabled or otherwise not being serviced. This is
      56 * solved by also having an SMP watchdog where all CPUs check all other
      57 * CPUs' heartbeats.
     58 *
     59 * The SMP checker can detect lockups on other CPUs. A global "pending"
     60 * cpumask is kept, containing all CPUs which enable the watchdog. Each
     61 * CPU clears their pending bit in their heartbeat timer. When the bitmask
     62 * becomes empty, the last CPU to clear its pending bit updates a global
     63 * timestamp and refills the pending bitmask.
     64 *
     65 * In the heartbeat timer, if any CPU notices that the global timestamp has
     66 * not been updated for a period exceeding the watchdog threshold, then it
      67 * means the CPU(s) whose bits are still set in the pending mask have had
      68 * their heartbeats stop, and action is taken.
     69 *
     70 * Some platforms implement true NMI IPIs, which can be used by the SMP
     71 * watchdog to detect an unresponsive CPU and pull it out of its stuck
     72 * state with the NMI IPI, to get crash/debug data from it. This way the
      73 * SMP watchdog can detect lockups that occur with hardware interrupts off.
     74 */
     75
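/*
 * A worked example of the SMP checker described above (illustrative only),
 * assuming four CPUs with the watchdog enabled and CPU 3 hard-locked with
 * interrupts disabled:
 *
 *   - wd_smp_cpus_pending starts as {0,1,2,3}; wd_smp_last_reset_tb holds
 *     the timebase value of the last refill.
 *   - CPUs 0-2 keep taking their heartbeat timers and clear their bits.
 *     CPU 3 never clears its bit, so the mask never becomes empty and
 *     wd_smp_last_reset_tb stops advancing.
 *   - Once get_tb() - wd_smp_last_reset_tb reaches wd_smp_panic_timeout_tb,
 *     the first CPU to notice calls watchdog_smp_panic(), marks CPU 3 as
 *     stuck, and (where NMI IPIs are available) sends it an NMI IPI so
 *     wd_lockup_ipi() can dump its registers and stack.
 */
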
     76static cpumask_t wd_cpus_enabled __read_mostly;
     77
     78static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
     79static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */
     80
      81static u64 wd_timer_period_ms __read_mostly;  /* interval between heartbeats */
     82
     83static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer);
     84static DEFINE_PER_CPU(u64, wd_timer_tb);
     85
     86/* SMP checker bits */
     87static unsigned long __wd_smp_lock;
     88static unsigned long __wd_reporting;
     89static unsigned long __wd_nmi_output;
     90static cpumask_t wd_smp_cpus_pending;
     91static cpumask_t wd_smp_cpus_stuck;
     92static u64 wd_smp_last_reset_tb;
     93
     94/*
     95 * Try to take the exclusive watchdog action / NMI IPI / printing lock.
     96 * wd_smp_lock must be held. If this fails, we should return and wait
     97 * for the watchdog to kick in again (or another CPU to trigger it).
     98 *
     99 * Importantly, if hardlockup_panic is set, wd_try_report failure should
    100 * not delay the panic, because whichever other CPU is reporting will
    101 * call panic.
    102 */
    103static bool wd_try_report(void)
    104{
    105	if (__wd_reporting)
    106		return false;
    107	__wd_reporting = 1;
    108	return true;
    109}
    110
    111/* End printing after successful wd_try_report. wd_smp_lock not required. */
    112static void wd_end_reporting(void)
    113{
    114	smp_mb(); /* End printing "critical section" */
    115	WARN_ON_ONCE(__wd_reporting == 0);
    116	WRITE_ONCE(__wd_reporting, 0);
    117}
    118
    119static inline void wd_smp_lock(unsigned long *flags)
    120{
    121	/*
    122	 * Avoid locking layers if possible.
     123	 * This may be called from low-level interrupt handlers at some
     124	 * point in the future.
    125	 */
    126	raw_local_irq_save(*flags);
    127	hard_irq_disable(); /* Make it soft-NMI safe */
    128	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
    129		raw_local_irq_restore(*flags);
    130		spin_until_cond(!test_bit(0, &__wd_smp_lock));
    131		raw_local_irq_save(*flags);
    132		hard_irq_disable();
    133	}
    134}
    135
    136static inline void wd_smp_unlock(unsigned long *flags)
    137{
    138	clear_bit_unlock(0, &__wd_smp_lock);
    139	raw_local_irq_restore(*flags);
    140}
    141
    142static void wd_lockup_ipi(struct pt_regs *regs)
    143{
    144	int cpu = raw_smp_processor_id();
    145	u64 tb = get_tb();
    146
    147	pr_emerg("CPU %d Hard LOCKUP\n", cpu);
    148	pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
    149		 cpu, tb, per_cpu(wd_timer_tb, cpu),
    150		 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
    151	print_modules();
    152	print_irqtrace_events(current);
    153	if (regs)
    154		show_regs(regs);
    155	else
    156		dump_stack();
    157
    158	/*
    159	 * __wd_nmi_output must be set after we printk from NMI context.
    160	 *
    161	 * printk from NMI context defers printing to the console to irq_work.
    162	 * If that NMI was taken in some code that is hard-locked, then irqs
    163	 * are disabled so irq_work will never fire. That can result in the
    164	 * hard lockup messages being delayed (indefinitely, until something
    165	 * else kicks the console drivers).
    166	 *
    167	 * Setting __wd_nmi_output will cause another CPU to notice and kick
    168	 * the console drivers for us.
    169	 *
    170	 * xchg is not needed here (it could be a smp_mb and store), but xchg
    171	 * gives the memory ordering and atomicity required.
    172	 */
    173	xchg(&__wd_nmi_output, 1);
    174
    175	/* Do not panic from here because that can recurse into NMI IPI layer */
    176}
    177
    178static bool set_cpu_stuck(int cpu)
    179{
    180	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
    181	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
    182	/*
    183	 * See wd_smp_clear_cpu_pending()
    184	 */
    185	smp_mb();
    186	if (cpumask_empty(&wd_smp_cpus_pending)) {
    187		wd_smp_last_reset_tb = get_tb();
    188		cpumask_andnot(&wd_smp_cpus_pending,
    189				&wd_cpus_enabled,
    190				&wd_smp_cpus_stuck);
    191		return true;
    192	}
    193	return false;
    194}
    195
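/*
 * set_cpu_stuck() returns true when the CPU it just marked was the last one
 * left in wd_smp_cpus_pending, i.e. the pending mask has been refilled and
 * wd_smp_last_reset_tb reset. watchdog_smp_panic() stops its scan at that
 * point; otherwise CPUs freshly re-added to the pending mask could be
 * marked stuck as well.
 */
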
    196static void watchdog_smp_panic(int cpu)
    197{
    198	static cpumask_t wd_smp_cpus_ipi; // protected by reporting
    199	unsigned long flags;
    200	u64 tb, last_reset;
    201	int c;
    202
    203	wd_smp_lock(&flags);
    204	/* Double check some things under lock */
    205	tb = get_tb();
    206	last_reset = wd_smp_last_reset_tb;
    207	if ((s64)(tb - last_reset) < (s64)wd_smp_panic_timeout_tb)
    208		goto out;
    209	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
    210		goto out;
    211	if (!wd_try_report())
    212		goto out;
    213	for_each_online_cpu(c) {
    214		if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
    215			continue;
    216		if (c == cpu)
    217			continue; // should not happen
    218
    219		__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
    220		if (set_cpu_stuck(c))
    221			break;
    222	}
    223	if (cpumask_empty(&wd_smp_cpus_ipi)) {
    224		wd_end_reporting();
    225		goto out;
    226	}
    227	wd_smp_unlock(&flags);
    228
    229	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
    230		 cpu, cpumask_pr_args(&wd_smp_cpus_ipi));
    231	pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
    232		 cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000);
    233
    234	if (!sysctl_hardlockup_all_cpu_backtrace) {
    235		/*
    236		 * Try to trigger the stuck CPUs, unless we are going to
    237		 * get a backtrace on all of them anyway.
    238		 */
    239		for_each_cpu(c, &wd_smp_cpus_ipi) {
    240			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
    241			__cpumask_clear_cpu(c, &wd_smp_cpus_ipi);
    242		}
    243	} else {
    244		trigger_allbutself_cpu_backtrace();
    245		cpumask_clear(&wd_smp_cpus_ipi);
    246	}
    247
    248	if (hardlockup_panic)
    249		nmi_panic(NULL, "Hard LOCKUP");
    250
    251	wd_end_reporting();
    252
    253	return;
    254
    255out:
    256	wd_smp_unlock(&flags);
    257}
    258
    259static void wd_smp_clear_cpu_pending(int cpu)
    260{
    261	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
    262		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
    263			struct pt_regs *regs = get_irq_regs();
    264			unsigned long flags;
    265
    266			pr_emerg("CPU %d became unstuck TB:%lld\n",
    267				 cpu, get_tb());
    268			print_irqtrace_events(current);
    269			if (regs)
    270				show_regs(regs);
    271			else
    272				dump_stack();
    273
    274			wd_smp_lock(&flags);
    275			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
    276			wd_smp_unlock(&flags);
    277		} else {
    278			/*
    279			 * The last CPU to clear pending should have reset the
    280			 * watchdog so we generally should not find it empty
     281			 * here if our own bit was already clear. However it could happen
    282			 * due to a rare race with another CPU taking the
    283			 * last CPU out of the mask concurrently.
    284			 *
    285			 * We can't add a warning for it. But just in case
    286			 * there is a problem with the watchdog that is causing
    287			 * the mask to not be reset, try to kick it along here.
    288			 */
    289			if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
    290				goto none_pending;
    291		}
    292		return;
    293	}
    294
    295	/*
    296	 * All other updates to wd_smp_cpus_pending are performed under
    297	 * wd_smp_lock. All of them are atomic except the case where the
    298	 * mask becomes empty and is reset. This will not happen here because
    299	 * cpu was tested to be in the bitmap (above), and a CPU only clears
    300	 * its own bit. _Except_ in the case where another CPU has detected a
    301	 * hard lockup on our CPU and takes us out of the pending mask. So in
    302	 * normal operation there will be no race here, no problem.
    303	 *
    304	 * In the lockup case, this atomic clear-bit vs a store that refills
     305	 * other bits in the accessed word will not be a problem. The bit clear
    306	 * is atomic so it will not cause the store to get lost, and the store
    307	 * will never set this bit so it will not overwrite the bit clear. The
    308	 * only way for a stuck CPU to return to the pending bitmap is to
    309	 * become unstuck itself.
    310	 */
    311	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
    312
    313	/*
     314	 * Order the store to clear pending with the load(s) that check that all
     315	 * words in the pending mask are empty. This orders
    316	 * with the same barrier on another CPU. This prevents two CPUs
    317	 * clearing the last 2 pending bits, but neither seeing the other's
    318	 * store when checking if the mask is empty, and missing an empty
    319	 * mask, which ends with a false positive.
    320	 */
    321	smp_mb();
    322	if (cpumask_empty(&wd_smp_cpus_pending)) {
    323		unsigned long flags;
    324
    325none_pending:
    326		/*
    327		 * Double check under lock because more than one CPU could see
    328		 * a clear mask with the lockless check after clearing their
    329		 * pending bits.
    330		 */
    331		wd_smp_lock(&flags);
    332		if (cpumask_empty(&wd_smp_cpus_pending)) {
    333			wd_smp_last_reset_tb = get_tb();
    334			cpumask_andnot(&wd_smp_cpus_pending,
    335					&wd_cpus_enabled,
    336					&wd_smp_cpus_stuck);
    337		}
    338		wd_smp_unlock(&flags);
    339	}
    340}
    341
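/*
 * Concretely, the ordering in wd_smp_clear_cpu_pending() rules out this
 * interleaving: CPUs A and B are the last two pending, each clears its own
 * bit, then each checks cpumask_empty() but only observes its own store.
 * Both would conclude the mask is still non-empty, nobody would refill it
 * or update wd_smp_last_reset_tb, and a false lockup would be reported once
 * the SMP timeout expires. With the full barrier between the clear and the
 * check on both CPUs, at least one of them is guaranteed to see both bits
 * clear and take the none_pending path.
 */
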
    342static void watchdog_timer_interrupt(int cpu)
    343{
    344	u64 tb = get_tb();
    345
    346	per_cpu(wd_timer_tb, cpu) = tb;
    347
    348	wd_smp_clear_cpu_pending(cpu);
    349
    350	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
    351		watchdog_smp_panic(cpu);
    352
    353	if (__wd_nmi_output && xchg(&__wd_nmi_output, 0)) {
    354		/*
    355		 * Something has called printk from NMI context. It might be
     356	 * stuck, so this triggers a flush that will get that
    357		 * printk output to the console.
    358		 *
    359		 * See wd_lockup_ipi.
    360		 */
    361		printk_trigger_flush();
    362	}
    363}
    364
    365DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
    366{
    367	unsigned long flags;
    368	int cpu = raw_smp_processor_id();
    369	u64 tb;
    370
    371	/* should only arrive from kernel, with irqs disabled */
    372	WARN_ON_ONCE(!arch_irq_disabled_regs(regs));
    373
    374	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
    375		return 0;
    376
    377	__this_cpu_inc(irq_stat.soft_nmi_irqs);
    378
    379	tb = get_tb();
    380	if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
    381		/*
    382		 * Taking wd_smp_lock here means it is a soft-NMI lock, which
    383		 * means we can't take any regular or irqsafe spin locks while
    384		 * holding this lock. This is why timers can't printk while
    385		 * holding the lock.
    386		 */
    387		wd_smp_lock(&flags);
    388		if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
    389			wd_smp_unlock(&flags);
    390			return 0;
    391		}
    392		if (!wd_try_report()) {
    393			wd_smp_unlock(&flags);
    394			/* Couldn't report, try again in 100ms */
    395			mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
    396			return 0;
    397		}
    398
    399		set_cpu_stuck(cpu);
    400
    401		wd_smp_unlock(&flags);
    402
    403		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
    404			 cpu, (void *)regs->nip);
    405		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
    406			 cpu, tb, per_cpu(wd_timer_tb, cpu),
    407			 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
    408		print_modules();
    409		print_irqtrace_events(current);
    410		show_regs(regs);
    411
    412		xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi
    413
    414		if (sysctl_hardlockup_all_cpu_backtrace)
    415			trigger_allbutself_cpu_backtrace();
    416
    417		if (hardlockup_panic)
    418			nmi_panic(regs, "Hard LOCKUP");
    419
    420		wd_end_reporting();
    421	}
    422	/*
    423	 * We are okay to change DEC in soft_nmi_interrupt because the masked
    424	 * handler has marked a DEC as pending, so the timer interrupt will be
    425	 * replayed as soon as local irqs are enabled again.
    426	 */
    427	if (wd_panic_timeout_tb < 0x7fffffff)
    428		mtspr(SPRN_DEC, wd_panic_timeout_tb);
    429
    430	return 0;
    431}
    432
    433static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
    434{
    435	int cpu = smp_processor_id();
    436
    437	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
    438		return HRTIMER_NORESTART;
    439
    440	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
    441		return HRTIMER_NORESTART;
    442
    443	watchdog_timer_interrupt(cpu);
    444
    445	hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms));
    446
    447	return HRTIMER_RESTART;
    448}
    449
    450void arch_touch_nmi_watchdog(void)
    451{
    452	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
    453	int cpu = smp_processor_id();
    454	u64 tb;
    455
    456	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
    457		return;
    458
    459	tb = get_tb();
    460	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
    461		per_cpu(wd_timer_tb, cpu) = tb;
    462		wd_smp_clear_cpu_pending(cpu);
    463	}
    464}
    465EXPORT_SYMBOL(arch_touch_nmi_watchdog);
    466
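/*
 * A minimal usage sketch (the polling helper below is hypothetical): code
 * that deliberately keeps a CPU busy for longer than the watchdog threshold
 * can call touch_nmi_watchdog(), which ends up in arch_touch_nmi_watchdog()
 * and keeps the heartbeat timestamp fresh, so no lockup is reported:
 *
 *	while (!poll_done()) {		// poll_done() is a made-up example
 *		touch_nmi_watchdog();
 *		cpu_relax();
 *	}
 */
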
    467static void start_watchdog(void *arg)
    468{
    469	struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
    470	int cpu = smp_processor_id();
    471	unsigned long flags;
    472
    473	if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
    474		WARN_ON(1);
    475		return;
    476	}
    477
    478	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
    479		return;
    480
    481	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
    482		return;
    483
    484	wd_smp_lock(&flags);
    485	cpumask_set_cpu(cpu, &wd_cpus_enabled);
    486	if (cpumask_weight(&wd_cpus_enabled) == 1) {
    487		cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
    488		wd_smp_last_reset_tb = get_tb();
    489	}
    490	wd_smp_unlock(&flags);
    491
    492	*this_cpu_ptr(&wd_timer_tb) = get_tb();
    493
    494	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    495	hrtimer->function = watchdog_timer_fn;
    496	hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms),
    497		      HRTIMER_MODE_REL_PINNED);
    498}
    499
    500static int start_watchdog_on_cpu(unsigned int cpu)
    501{
    502	return smp_call_function_single(cpu, start_watchdog, NULL, true);
    503}
    504
    505static void stop_watchdog(void *arg)
    506{
    507	struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
    508	int cpu = smp_processor_id();
    509	unsigned long flags;
    510
    511	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
    512		return; /* Can happen in CPU unplug case */
    513
    514	hrtimer_cancel(hrtimer);
    515
    516	wd_smp_lock(&flags);
    517	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
    518	wd_smp_unlock(&flags);
    519
    520	wd_smp_clear_cpu_pending(cpu);
    521}
    522
    523static int stop_watchdog_on_cpu(unsigned int cpu)
    524{
    525	return smp_call_function_single(cpu, stop_watchdog, NULL, true);
    526}
    527
    528static void watchdog_calc_timeouts(void)
    529{
    530	wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;
    531
    532	/* Have the SMP detector trigger a bit later */
    533	wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;
    534
    535	/* 2/5 is the factor that the perf based detector uses */
    536	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
    537}
    538
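/*
 * For illustration, with the default watchdog_thresh of 10 seconds and a
 * 512 MHz timebase (a common value on recent IBM POWER systems), the
 * calculation above gives:
 *
 *   wd_panic_timeout_tb     = 10 * 512000000     = 5120000000 ticks (~10s)
 *   wd_smp_panic_timeout_tb = 5120000000 * 3 / 2 = 7680000000 ticks (~15s)
 *   wd_timer_period_ms      = 10 * 1000 * 2 / 5  = 4000 ms between heartbeats
 */
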
    539void watchdog_nmi_stop(void)
    540{
    541	int cpu;
    542
    543	for_each_cpu(cpu, &wd_cpus_enabled)
    544		stop_watchdog_on_cpu(cpu);
    545}
    546
    547void watchdog_nmi_start(void)
    548{
    549	int cpu;
    550
    551	watchdog_calc_timeouts();
    552	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
    553		start_watchdog_on_cpu(cpu);
    554}
    555
    556/*
    557 * Invoked from core watchdog init.
    558 */
    559int __init watchdog_nmi_probe(void)
    560{
    561	int err;
    562
    563	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
    564					"powerpc/watchdog:online",
    565					start_watchdog_on_cpu,
    566					stop_watchdog_on_cpu);
    567	if (err < 0) {
    568		pr_warn("could not be initialized");
    569		return err;
    570	}
    571	return 0;
    572}