cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tree_stall.h (32034B)


      1// SPDX-License-Identifier: GPL-2.0+
      2/*
      3 * RCU CPU stall warnings for normal RCU grace periods
      4 *
      5 * Copyright IBM Corporation, 2019
      6 *
      7 * Author: Paul E. McKenney <paulmck@linux.ibm.com>
      8 */
      9
     10#include <linux/kvm_para.h>
     11
     12//////////////////////////////////////////////////////////////////////////////
     13//
     14// Controlling CPU stall warnings, including delay calculation.
     15
     16/* panic() on RCU Stall sysctl. */
     17int sysctl_panic_on_rcu_stall __read_mostly;
     18int sysctl_max_rcu_stall_to_panic __read_mostly;
     19
     20#ifdef CONFIG_PROVE_RCU
     21#define RCU_STALL_DELAY_DELTA		(5 * HZ)
     22#else
     23#define RCU_STALL_DELAY_DELTA		0
     24#endif
     25#define RCU_STALL_MIGHT_DIV		8
     26#define RCU_STALL_MIGHT_MIN		(2 * HZ)
     27
     28int rcu_exp_jiffies_till_stall_check(void)
     29{
     30	int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout);
     31	int exp_stall_delay_delta = 0;
     32	int till_stall_check;
     33
     34	// Zero says to use rcu_cpu_stall_timeout, but in milliseconds.
     35	if (!cpu_stall_timeout)
     36		cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check());
     37
     38	// Limit check must be consistent with the Kconfig limits for
     39	// CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
     40	// The minimum clamped value is "2UL", because at least one full
     41	// tick has to be guaranteed.
     42	till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ);
     43
     44	if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
     45		WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
     46
     47#ifdef CONFIG_PROVE_RCU
      48	/* Add an extra ~25% of till_stall_check. */
     49	exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1;
     50#endif
     51
     52	return till_stall_check + exp_stall_delay_delta;
     53}
     54EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check);
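        /*
         * Worked example of the arithmetic above, assuming HZ=1000: with
         * rcu_exp_cpu_stall_timeout set to 20 ms, msecs_to_jiffies(20) is 20,
         * which already lies inside the clamp range [2, 21 * HZ], so
         * till_stall_check stays 20.  Under CONFIG_PROVE_RCU the extra delta
         * is (20 * 25) / 100 + 1 = 6, so the expedited stall check fires
         * after 26 jiffies.
         */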
     55
     56/* Limit-check stall timeouts specified at boottime and runtime. */
     57int rcu_jiffies_till_stall_check(void)
     58{
     59	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
     60
     61	/*
     62	 * Limit check must be consistent with the Kconfig limits
     63	 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
     64	 */
     65	if (till_stall_check < 3) {
     66		WRITE_ONCE(rcu_cpu_stall_timeout, 3);
     67		till_stall_check = 3;
     68	} else if (till_stall_check > 300) {
     69		WRITE_ONCE(rcu_cpu_stall_timeout, 300);
     70		till_stall_check = 300;
     71	}
     72	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
     73}
     74EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
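        /*
         * Unlike the expedited variant above, rcu_cpu_stall_timeout is
         * expressed in seconds and clamped to the Kconfig range [3, 300].
         * For example, with the usual default of 21 seconds and HZ=250, the
         * normal stall check fires after 21 * 250 = 5250 jiffies, plus
         * another RCU_STALL_DELAY_DELTA = 5 * HZ = 1250 jiffies when
         * CONFIG_PROVE_RCU is enabled.
         */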
     75
     76/**
     77 * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled?
     78 *
     79 * Returns @true if the current grace period is sufficiently old that
     80 * it is reasonable to assume that it might be stalled.  This can be
     81 * useful when deciding whether to allocate memory to enable RCU-mediated
     82 * freeing on the one hand or just invoking synchronize_rcu() on the other.
     83 * The latter is preferable when the grace period is stalled.
     84 *
     85 * Note that sampling of the .gp_start and .gp_seq fields must be done
     86 * carefully to avoid false positives at the beginnings and ends of
     87 * grace periods.
     88 */
     89bool rcu_gp_might_be_stalled(void)
     90{
     91	unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV;
     92	unsigned long j = jiffies;
     93
     94	if (d < RCU_STALL_MIGHT_MIN)
     95		d = RCU_STALL_MIGHT_MIN;
     96	smp_mb(); // jiffies before .gp_seq to avoid false positives.
     97	if (!rcu_gp_in_progress())
     98		return false;
      99	// Long delays at this point avoid false positives, but a delay
    100	// of ULONG_MAX/4 jiffies voids your no-false-positive warranty.
    101	smp_mb(); // .gp_seq before second .gp_start
    102	// And ditto here.
    103	return !time_before(j, READ_ONCE(rcu_state.gp_start) + d);
    104}
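        /*
         * Hypothetical usage sketch (names p and rh are illustrative only):
         * a caller that normally defers freeing across a grace period, but
         * waits directly when the grace period already looks stalled, since
         * queueing yet more memory behind a stalled grace period is unlikely
         * to pay off:
         *
         *	if (!rcu_gp_might_be_stalled())
         *		kfree_rcu(p, rh);	// rh is the rcu_head field in *p
         *	else {
         *		synchronize_rcu();	// wait out the (possibly stalled) GP
         *		kfree(p);		// p already unpublished by the caller
         *	}
         */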
    105
    106/* Don't do RCU CPU stall warnings during long sysrq printouts. */
    107void rcu_sysrq_start(void)
    108{
    109	if (!rcu_cpu_stall_suppress)
    110		rcu_cpu_stall_suppress = 2;
    111}
    112
    113void rcu_sysrq_end(void)
    114{
    115	if (rcu_cpu_stall_suppress == 2)
    116		rcu_cpu_stall_suppress = 0;
    117}
    118
    119/* Don't print RCU CPU stall warnings during a kernel panic. */
    120static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
    121{
    122	rcu_cpu_stall_suppress = 1;
    123	return NOTIFY_DONE;
    124}
    125
    126static struct notifier_block rcu_panic_block = {
    127	.notifier_call = rcu_panic,
    128};
    129
    130static int __init check_cpu_stall_init(void)
    131{
    132	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
    133	return 0;
    134}
    135early_initcall(check_cpu_stall_init);
    136
    137/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
    138static void panic_on_rcu_stall(void)
    139{
    140	static int cpu_stall;
    141
    142	if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
    143		return;
    144
    145	if (sysctl_panic_on_rcu_stall)
    146		panic("RCU Stall\n");
    147}
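        /*
         * Both knobs above are runtime sysctls (kernel.panic_on_rcu_stall and
         * kernel.max_rcu_stall_to_panic).  For example, setting
         * panic_on_rcu_stall=1 and max_rcu_stall_to_panic=3 lets the first
         * two stall reports print normally and panics the system on the
         * third, because ++cpu_stall only reaches the threshold then.
         */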
    148
    149/**
    150 * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
    151 *
    152 * The caller must disable hard irqs.
    153 */
    154void rcu_cpu_stall_reset(void)
    155{
    156	WRITE_ONCE(rcu_state.jiffies_stall,
    157		   jiffies + rcu_jiffies_till_stall_check());
    158}
    159
    160//////////////////////////////////////////////////////////////////////////////
    161//
    162// Interaction with RCU grace periods
    163
    164/* Start of new grace period, so record stall time (and forcing times). */
    165static void record_gp_stall_check_time(void)
    166{
    167	unsigned long j = jiffies;
    168	unsigned long j1;
    169
    170	WRITE_ONCE(rcu_state.gp_start, j);
    171	j1 = rcu_jiffies_till_stall_check();
    172	smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq.
    173	WRITE_ONCE(rcu_state.jiffies_stall, j + j1);
    174	rcu_state.jiffies_resched = j + j1 / 2;
    175	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
    176}
    177
    178/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
    179static void zero_cpu_stall_ticks(struct rcu_data *rdp)
    180{
    181	rdp->ticks_this_gp = 0;
    182	rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
    183	WRITE_ONCE(rdp->last_fqs_resched, jiffies);
    184}
    185
    186/*
    187 * If too much time has passed in the current grace period, and if
    188 * so configured, go kick the relevant kthreads.
    189 */
    190static void rcu_stall_kick_kthreads(void)
    191{
    192	unsigned long j;
    193
    194	if (!READ_ONCE(rcu_kick_kthreads))
    195		return;
    196	j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
    197	if (time_after(jiffies, j) && rcu_state.gp_kthread &&
    198	    (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
    199		WARN_ONCE(1, "Kicking %s grace-period kthread\n",
    200			  rcu_state.name);
    201		rcu_ftrace_dump(DUMP_ALL);
    202		wake_up_process(rcu_state.gp_kthread);
    203		WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
    204	}
    205}
    206
    207/*
    208 * Handler for the irq_work request posted about halfway into the RCU CPU
    209 * stall timeout, and used to detect excessive irq disabling.  Set state
    210 * appropriately, but just complain if there is unexpected state on entry.
    211 */
    212static void rcu_iw_handler(struct irq_work *iwp)
    213{
    214	struct rcu_data *rdp;
    215	struct rcu_node *rnp;
    216
    217	rdp = container_of(iwp, struct rcu_data, rcu_iw);
    218	rnp = rdp->mynode;
    219	raw_spin_lock_rcu_node(rnp);
    220	if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
    221		rdp->rcu_iw_gp_seq = rnp->gp_seq;
    222		rdp->rcu_iw_pending = false;
    223	}
    224	raw_spin_unlock_rcu_node(rnp);
    225}
    226
    227//////////////////////////////////////////////////////////////////////////////
    228//
    229// Printing RCU CPU stall warnings
    230
    231#ifdef CONFIG_PREEMPT_RCU
    232
    233/*
    234 * Dump detailed information for all tasks blocking the current RCU
    235 * grace period on the specified rcu_node structure.
    236 */
    237static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
    238{
    239	unsigned long flags;
    240	struct task_struct *t;
    241
    242	raw_spin_lock_irqsave_rcu_node(rnp, flags);
    243	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
    244		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    245		return;
    246	}
    247	t = list_entry(rnp->gp_tasks->prev,
    248		       struct task_struct, rcu_node_entry);
    249	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
    250		/*
    251		 * We could be printing a lot while holding a spinlock.
    252		 * Avoid triggering hard lockup.
    253		 */
    254		touch_nmi_watchdog();
    255		sched_show_task(t);
    256	}
    257	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    258}
    259
    260// Communicate task state back to the RCU CPU stall warning request.
    261struct rcu_stall_chk_rdr {
    262	int nesting;
    263	union rcu_special rs;
    264	bool on_blkd_list;
    265};
    266
    267/*
    268 * Report out the state of a not-running task that is stalling the
    269 * current RCU grace period.
    270 */
    271static int check_slow_task(struct task_struct *t, void *arg)
    272{
    273	struct rcu_stall_chk_rdr *rscrp = arg;
    274
    275	if (task_curr(t))
    276		return -EBUSY; // It is running, so decline to inspect it.
    277	rscrp->nesting = t->rcu_read_lock_nesting;
    278	rscrp->rs = t->rcu_read_unlock_special;
    279	rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
    280	return 0;
    281}
    282
    283/*
    284 * Scan the current list of tasks blocked within RCU read-side critical
    285 * sections, printing out the tid of each of the first few of them.
    286 */
    287static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
    288	__releases(rnp->lock)
    289{
    290	int i = 0;
    291	int ndetected = 0;
    292	struct rcu_stall_chk_rdr rscr;
    293	struct task_struct *t;
    294	struct task_struct *ts[8];
    295
    296	lockdep_assert_irqs_disabled();
    297	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
    298		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    299		return 0;
    300	}
    301	pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
    302	       rnp->level, rnp->grplo, rnp->grphi);
    303	t = list_entry(rnp->gp_tasks->prev,
    304		       struct task_struct, rcu_node_entry);
    305	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
    306		get_task_struct(t);
    307		ts[i++] = t;
    308		if (i >= ARRAY_SIZE(ts))
    309			break;
    310	}
    311	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    312	while (i) {
    313		t = ts[--i];
    314		if (task_call_func(t, check_slow_task, &rscr))
    315			pr_cont(" P%d", t->pid);
    316		else
    317			pr_cont(" P%d/%d:%c%c%c%c",
    318				t->pid, rscr.nesting,
    319				".b"[rscr.rs.b.blocked],
    320				".q"[rscr.rs.b.need_qs],
    321				".e"[rscr.rs.b.exp_hint],
    322				".l"[rscr.on_blkd_list]);
    323		lockdep_assert_irqs_disabled();
    324		put_task_struct(t);
    325		ndetected++;
    326	}
    327	pr_cont("\n");
    328	return ndetected;
    329}
    330
    331#else /* #ifdef CONFIG_PREEMPT_RCU */
    332
    333/*
    334 * Because preemptible RCU does not exist, we never have to check for
    335 * tasks blocked within RCU read-side critical sections.
    336 */
    337static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
    338{
    339}
    340
    341/*
    342 * Because preemptible RCU does not exist, we never have to check for
    343 * tasks blocked within RCU read-side critical sections.
    344 */
    345static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
    346	__releases(rnp->lock)
    347{
    348	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    349	return 0;
    350}
    351#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
    352
    353/*
    354 * Dump stacks of all tasks running on stalled CPUs.  First try using
    355 * NMIs, but fall back to manual remote stack tracing on architectures
    356 * that don't support NMI-based stack dumps.  The NMI-triggered stack
    357 * traces are more accurate because they are printed by the target CPU.
    358 */
    359static void rcu_dump_cpu_stacks(void)
    360{
    361	int cpu;
    362	unsigned long flags;
    363	struct rcu_node *rnp;
    364
    365	rcu_for_each_leaf_node(rnp) {
    366		raw_spin_lock_irqsave_rcu_node(rnp, flags);
    367		for_each_leaf_node_possible_cpu(rnp, cpu)
    368			if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
    369				if (cpu_is_offline(cpu))
    370					pr_err("Offline CPU %d blocking current GP.\n", cpu);
    371				else if (!trigger_single_cpu_backtrace(cpu))
    372					dump_cpu_task(cpu);
    373			}
    374		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    375	}
    376}
    377
    378static const char * const gp_state_names[] = {
    379	[RCU_GP_IDLE] = "RCU_GP_IDLE",
    380	[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
    381	[RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS",
    382	[RCU_GP_ONOFF] = "RCU_GP_ONOFF",
    383	[RCU_GP_INIT] = "RCU_GP_INIT",
    384	[RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS",
    385	[RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS",
    386	[RCU_GP_CLEANUP] = "RCU_GP_CLEANUP",
    387	[RCU_GP_CLEANED] = "RCU_GP_CLEANED",
    388};
    389
    390/*
    391 * Convert a ->gp_state value to a character string.
    392 */
    393static const char *gp_state_getname(short gs)
    394{
    395	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
    396		return "???";
    397	return gp_state_names[gs];
    398}
    399
    400/* Is the RCU grace-period kthread being starved of CPU time? */
    401static bool rcu_is_gp_kthread_starving(unsigned long *jp)
    402{
    403	unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity);
    404
    405	if (jp)
    406		*jp = j;
    407	return j > 2 * HZ;
    408}
    409
    410static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
    411{
    412	unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity);
    413
    414	if (jp)
    415		*jp = j;
    416	return j > 2 * HZ;
    417}
    418
    419/*
    420 * Print out diagnostic information for the specified stalled CPU.
    421 *
    422 * If the specified CPU is aware of the current RCU grace period, then
    423 * print the number of scheduling clock interrupts the CPU has taken
    424 * during the time that it has been aware.  Otherwise, print the number
    425 * of RCU grace periods that this CPU is ignorant of, for example, "1"
    426 * if the CPU was aware of the previous grace period.
    427 *
    428 * Also print out idle info.
    429 */
    430static void print_cpu_stall_info(int cpu)
    431{
    432	unsigned long delta;
    433	bool falsepositive;
    434	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    435	char *ticks_title;
    436	unsigned long ticks_value;
    437
    438	/*
    439	 * We could be printing a lot while holding a spinlock.  Avoid
    440	 * triggering hard lockup.
    441	 */
    442	touch_nmi_watchdog();
    443
    444	ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
    445	if (ticks_value) {
    446		ticks_title = "GPs behind";
    447	} else {
    448		ticks_title = "ticks this GP";
    449		ticks_value = rdp->ticks_this_gp;
    450	}
    451	delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
    452	falsepositive = rcu_is_gp_kthread_starving(NULL) &&
    453			rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
    454	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
    455	       cpu,
    456	       "O."[!!cpu_online(cpu)],
    457	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
    458	       "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
    459	       !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
    460			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
    461				"!."[!delta],
    462	       ticks_value, ticks_title,
    463	       rcu_dynticks_snap(rdp) & 0xfff,
    464	       rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
    465	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
    466	       data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
    467	       falsepositive ? " (false positive?)" : "");
    468}
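        /*
         * Decoding the four flag characters printed above: the first is 'O'
         * if the CPU is currently offline, the second 'o' if its bit is clear
         * in ->qsmaskinit, and the third 'N' if its bit is clear in
         * ->qsmaskinitnext ('.' in each position otherwise).  The fourth is
         * '?' without CONFIG_IRQ_WORK, a digit (capped at 9) counting grace
         * periods elapsed while rcu_iw is still pending, '!' if rcu_iw is
         * idle but its last completion lags the node's ->gp_seq, or '.' if
         * all is well.
         */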
    469
    470static void rcuc_kthread_dump(struct rcu_data *rdp)
    471{
    472	int cpu;
    473	unsigned long j;
    474	struct task_struct *rcuc;
    475
    476	rcuc = rdp->rcu_cpu_kthread_task;
    477	if (!rcuc)
    478		return;
    479
    480	cpu = task_cpu(rcuc);
    481	if (cpu_is_offline(cpu) || idle_cpu(cpu))
    482		return;
    483
    484	if (!rcu_is_rcuc_kthread_starving(rdp, &j))
    485		return;
    486
    487	pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
    488	sched_show_task(rcuc);
    489	if (!trigger_single_cpu_backtrace(cpu))
    490		dump_cpu_task(cpu);
    491}
    492
    493/* Complain about starvation of grace-period kthread.  */
    494static void rcu_check_gp_kthread_starvation(void)
    495{
    496	int cpu;
    497	struct task_struct *gpk = rcu_state.gp_kthread;
    498	unsigned long j;
    499
    500	if (rcu_is_gp_kthread_starving(&j)) {
    501		cpu = gpk ? task_cpu(gpk) : -1;
    502		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
    503		       rcu_state.name, j,
    504		       (long)rcu_seq_current(&rcu_state.gp_seq),
    505		       data_race(READ_ONCE(rcu_state.gp_flags)),
    506		       gp_state_getname(rcu_state.gp_state),
    507		       data_race(READ_ONCE(rcu_state.gp_state)),
    508		       gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
    509		if (gpk) {
    510			pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
    511			pr_err("RCU grace-period kthread stack dump:\n");
    512			sched_show_task(gpk);
    513			if (cpu >= 0) {
    514				if (cpu_is_offline(cpu)) {
    515					pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
    516				} else  {
    517					pr_err("Stack dump where RCU GP kthread last ran:\n");
    518					if (!trigger_single_cpu_backtrace(cpu))
    519						dump_cpu_task(cpu);
    520				}
    521			}
    522			wake_up_process(gpk);
    523		}
    524	}
    525}
    526
    527/* Complain about missing wakeups from expired fqs wait timer */
    528static void rcu_check_gp_kthread_expired_fqs_timer(void)
    529{
    530	struct task_struct *gpk = rcu_state.gp_kthread;
    531	short gp_state;
    532	unsigned long jiffies_fqs;
    533	int cpu;
    534
    535	/*
    536	 * Order reads of .gp_state and .jiffies_force_qs.
    537	 * Matching smp_wmb() is present in rcu_gp_fqs_loop().
    538	 */
    539	gp_state = smp_load_acquire(&rcu_state.gp_state);
    540	jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs);
    541
    542	if (gp_state == RCU_GP_WAIT_FQS &&
    543	    time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
    544	    gpk && !READ_ONCE(gpk->on_rq)) {
    545		cpu = task_cpu(gpk);
    546		pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
    547		       rcu_state.name, (jiffies - jiffies_fqs),
    548		       (long)rcu_seq_current(&rcu_state.gp_seq),
    549		       data_race(rcu_state.gp_flags),
    550		       gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
    551		       data_race(READ_ONCE(gpk->__state)));
    552		pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
    553		       cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
    554	}
    555}
    556
    557static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
    558{
    559	int cpu;
    560	unsigned long flags;
    561	unsigned long gpa;
    562	unsigned long j;
    563	int ndetected = 0;
    564	struct rcu_node *rnp;
    565	long totqlen = 0;
    566
    567	lockdep_assert_irqs_disabled();
    568
    569	/* Kick and suppress, if so configured. */
    570	rcu_stall_kick_kthreads();
    571	if (rcu_stall_is_suppressed())
    572		return;
    573
    574	/*
    575	 * OK, time to rat on our buddy...
    576	 * See Documentation/RCU/stallwarn.rst for info on how to debug
    577	 * RCU CPU stall warnings.
    578	 */
    579	trace_rcu_stall_warning(rcu_state.name, TPS("StallDetected"));
    580	pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
    581	rcu_for_each_leaf_node(rnp) {
    582		raw_spin_lock_irqsave_rcu_node(rnp, flags);
    583		if (rnp->qsmask != 0) {
    584			for_each_leaf_node_possible_cpu(rnp, cpu)
    585				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
    586					print_cpu_stall_info(cpu);
    587					ndetected++;
    588				}
    589		}
    590		ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.
    591		lockdep_assert_irqs_disabled();
    592	}
    593
    594	for_each_possible_cpu(cpu)
    595		totqlen += rcu_get_n_cbs_cpu(cpu);
    596	pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
    597	       smp_processor_id(), (long)(jiffies - gps),
    598	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
    599	if (ndetected) {
    600		rcu_dump_cpu_stacks();
    601
    602		/* Complain about tasks blocking the grace period. */
    603		rcu_for_each_leaf_node(rnp)
    604			rcu_print_detail_task_stall_rnp(rnp);
    605	} else {
    606		if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
    607			pr_err("INFO: Stall ended before state dump start\n");
    608		} else {
    609			j = jiffies;
    610			gpa = data_race(READ_ONCE(rcu_state.gp_activity));
    611			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
    612			       rcu_state.name, j - gpa, j, gpa,
    613			       data_race(READ_ONCE(jiffies_till_next_fqs)),
    614			       data_race(READ_ONCE(rcu_get_root()->qsmask)));
    615		}
    616	}
    617	/* Rewrite if needed in case of slow consoles. */
    618	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
    619		WRITE_ONCE(rcu_state.jiffies_stall,
    620			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
    621
    622	rcu_check_gp_kthread_expired_fqs_timer();
    623	rcu_check_gp_kthread_starvation();
    624
    625	panic_on_rcu_stall();
    626
    627	rcu_force_quiescent_state();  /* Kick them all. */
    628}
    629
    630static void print_cpu_stall(unsigned long gps)
    631{
    632	int cpu;
    633	unsigned long flags;
    634	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    635	struct rcu_node *rnp = rcu_get_root();
    636	long totqlen = 0;
    637
    638	lockdep_assert_irqs_disabled();
    639
    640	/* Kick and suppress, if so configured. */
    641	rcu_stall_kick_kthreads();
    642	if (rcu_stall_is_suppressed())
    643		return;
    644
    645	/*
    646	 * OK, time to rat on ourselves...
    647	 * See Documentation/RCU/stallwarn.rst for info on how to debug
    648	 * RCU CPU stall warnings.
    649	 */
    650	trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
    651	pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
    652	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
    653	print_cpu_stall_info(smp_processor_id());
    654	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
    655	for_each_possible_cpu(cpu)
    656		totqlen += rcu_get_n_cbs_cpu(cpu);
    657	pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
    658		jiffies - gps,
    659		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
    660
    661	rcu_check_gp_kthread_expired_fqs_timer();
    662	rcu_check_gp_kthread_starvation();
    663
    664	if (!use_softirq)
    665		rcuc_kthread_dump(rdp);
    666
    667	rcu_dump_cpu_stacks();
    668
    669	raw_spin_lock_irqsave_rcu_node(rnp, flags);
    670	/* Rewrite if needed in case of slow consoles. */
    671	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
    672		WRITE_ONCE(rcu_state.jiffies_stall,
    673			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
    674	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    675
    676	panic_on_rcu_stall();
    677
    678	/*
    679	 * Attempt to revive the RCU machinery by forcing a context switch.
    680	 *
    681	 * A context switch would normally allow the RCU state machine to make
    682	 * progress and it could be we're stuck in kernel space without context
    683	 * switches for an entirely unreasonable amount of time.
    684	 */
    685	set_tsk_need_resched(current);
    686	set_preempt_need_resched();
    687}
    688
    689static void check_cpu_stall(struct rcu_data *rdp)
    690{
    691	bool didstall = false;
    692	unsigned long gs1;
    693	unsigned long gs2;
    694	unsigned long gps;
    695	unsigned long j;
    696	unsigned long jn;
    697	unsigned long js;
    698	struct rcu_node *rnp;
    699
    700	lockdep_assert_irqs_disabled();
    701	if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) ||
    702	    !rcu_gp_in_progress())
    703		return;
    704	rcu_stall_kick_kthreads();
    705	j = jiffies;
    706
    707	/*
    708	 * Lots of memory barriers to reject false positives.
    709	 *
    710	 * The idea is to pick up rcu_state.gp_seq, then
    711	 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
    712	 * another copy of rcu_state.gp_seq.  These values are updated in
    713	 * the opposite order with memory barriers (or equivalent) during
    714	 * grace-period initialization and cleanup.  Now, a false positive
     715	 * can occur if we get a new value of rcu_state.gp_start and an old
    716	 * value of rcu_state.jiffies_stall.  But given the memory barriers,
    717	 * the only way that this can happen is if one grace period ends
    718	 * and another starts between these two fetches.  This is detected
    719	 * by comparing the second fetch of rcu_state.gp_seq with the
    720	 * previous fetch from rcu_state.gp_seq.
    721	 *
    722	 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
    723	 * and rcu_state.gp_start suffice to forestall false positives.
    724	 */
    725	gs1 = READ_ONCE(rcu_state.gp_seq);
    726	smp_rmb(); /* Pick up ->gp_seq first... */
    727	js = READ_ONCE(rcu_state.jiffies_stall);
    728	smp_rmb(); /* ...then ->jiffies_stall before the rest... */
    729	gps = READ_ONCE(rcu_state.gp_start);
    730	smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
    731	gs2 = READ_ONCE(rcu_state.gp_seq);
    732	if (gs1 != gs2 ||
    733	    ULONG_CMP_LT(j, js) ||
    734	    ULONG_CMP_GE(gps, js))
    735		return; /* No stall or GP completed since entering function. */
    736	rnp = rdp->mynode;
    737	jn = jiffies + ULONG_MAX / 2;
    738	if (rcu_gp_in_progress() &&
    739	    (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
    740	    cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
    741
    742		/*
    743		 * If a virtual machine is stopped by the host it can look to
    744		 * the watchdog like an RCU stall. Check to see if the host
    745		 * stopped the vm.
    746		 */
    747		if (kvm_check_and_clear_guest_paused())
    748			return;
    749
    750		/* We haven't checked in, so go dump stack. */
    751		print_cpu_stall(gps);
    752		if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
    753			rcu_ftrace_dump(DUMP_ALL);
    754		didstall = true;
    755
    756	} else if (rcu_gp_in_progress() &&
    757		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
    758		   cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
    759
    760		/*
    761		 * If a virtual machine is stopped by the host it can look to
    762		 * the watchdog like an RCU stall. Check to see if the host
    763		 * stopped the vm.
    764		 */
    765		if (kvm_check_and_clear_guest_paused())
    766			return;
    767
    768		/* They had a few time units to dump stack, so complain. */
    769		print_other_cpu_stall(gs2, gps);
    770		if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
    771			rcu_ftrace_dump(DUMP_ALL);
    772		didstall = true;
    773	}
    774	if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) {
    775		jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
    776		WRITE_ONCE(rcu_state.jiffies_stall, jn);
    777	}
    778}
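        /*
         * Rate-limiting note for the checks above: whichever CPU wins the
         * cmpxchg() pushes .jiffies_stall roughly ULONG_MAX/2 into the
         * future, so every other CPU bails out at the ULONG_CMP_LT(j, js)
         * test while the report is being printed.  The first branch lets a
         * CPU that is itself blocking the grace period report on itself;
         * after RCU_STALL_RAT_DELAY more jiffies, the second branch lets
         * some other CPU complain on its behalf.  Once a report completes,
         * .jiffies_stall is pulled back to about three stall periods in the
         * future, pacing any follow-up warnings.
         */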
    779
    780//////////////////////////////////////////////////////////////////////////////
    781//
    782// RCU forward-progress mechanisms, including of callback invocation.
    783
    784
    785/*
    786 * Check to see if a failure to end RCU priority inversion was due to
    787 * a CPU not passing through a quiescent state.  When this happens, there
    788 * is nothing that RCU priority boosting can do to help, so we shouldn't
    789 * count this as an RCU priority boosting failure.  A return of true says
    790 * RCU priority boosting is to blame, and false says otherwise.  If false
    791 * is returned, the first of the CPUs to blame is stored through cpup.
    792 * If there was no CPU blocking the current grace period, but also nothing
    793 * in need of being boosted, *cpup is set to -1.  This can happen in case
     794 * of vCPU preemption while the last CPU is reporting its quiescent state,
    795 * for example.
    796 *
    797 * If cpup is NULL, then a lockless quick check is carried out, suitable
    798 * for high-rate usage.  On the other hand, if cpup is non-NULL, each
    799 * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
    800 */
    801bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
    802{
    803	bool atb = false;
    804	int cpu;
    805	unsigned long flags;
    806	struct rcu_node *rnp;
    807
    808	rcu_for_each_leaf_node(rnp) {
    809		if (!cpup) {
    810			if (data_race(READ_ONCE(rnp->qsmask))) {
    811				return false;
    812			} else {
    813				if (READ_ONCE(rnp->gp_tasks))
    814					atb = true;
    815				continue;
    816			}
    817		}
    818		*cpup = -1;
    819		raw_spin_lock_irqsave_rcu_node(rnp, flags);
    820		if (rnp->gp_tasks)
    821			atb = true;
    822		if (!rnp->qsmask) {
    823			// No CPUs without quiescent states for this rnp.
    824			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    825			continue;
    826		}
    827		// Find the first holdout CPU.
    828		for_each_leaf_node_possible_cpu(rnp, cpu) {
    829			if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
    830				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    831				*cpup = cpu;
    832				return false;
    833			}
    834		}
    835		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    836	}
    837	// Can't blame CPUs, so must blame RCU priority boosting.
    838	return atb;
    839}
    840EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
    841
    842/*
    843 * Show the state of the grace-period kthreads.
    844 */
    845void show_rcu_gp_kthreads(void)
    846{
    847	unsigned long cbs = 0;
    848	int cpu;
    849	unsigned long j;
    850	unsigned long ja;
    851	unsigned long jr;
    852	unsigned long js;
    853	unsigned long jw;
    854	struct rcu_data *rdp;
    855	struct rcu_node *rnp;
    856	struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
    857
    858	j = jiffies;
    859	ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
    860	jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
    861	js = j - data_race(READ_ONCE(rcu_state.gp_start));
    862	jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
    863	pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
    864		rcu_state.name, gp_state_getname(rcu_state.gp_state),
    865		data_race(READ_ONCE(rcu_state.gp_state)),
    866		t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU,
    867		js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)),
    868		(long)data_race(READ_ONCE(rcu_state.gp_seq)),
    869		(long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)),
    870		data_race(READ_ONCE(rcu_state.gp_max)),
    871		data_race(READ_ONCE(rcu_state.gp_flags)));
    872	rcu_for_each_node_breadth_first(rnp) {
    873		if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
    874		    !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) &&
    875		    !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks)))
    876			continue;
    877		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
    878			rnp->grplo, rnp->grphi,
    879			(long)data_race(READ_ONCE(rnp->gp_seq)),
    880			(long)data_race(READ_ONCE(rnp->gp_seq_needed)),
    881			data_race(READ_ONCE(rnp->qsmask)),
    882			".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))],
    883			".B"[!!data_race(READ_ONCE(rnp->boost_tasks))],
    884			".E"[!!data_race(READ_ONCE(rnp->exp_tasks))],
    885			".G"[!!data_race(READ_ONCE(rnp->gp_tasks))],
    886			data_race(READ_ONCE(rnp->n_boosts)));
    887		if (!rcu_is_leaf_node(rnp))
    888			continue;
    889		for_each_leaf_node_possible_cpu(rnp, cpu) {
    890			rdp = per_cpu_ptr(&rcu_data, cpu);
    891			if (READ_ONCE(rdp->gpwrap) ||
    892			    ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
    893					 READ_ONCE(rdp->gp_seq_needed)))
    894				continue;
    895			pr_info("\tcpu %d ->gp_seq_needed %ld\n",
    896				cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed)));
    897		}
    898	}
    899	for_each_possible_cpu(cpu) {
    900		rdp = per_cpu_ptr(&rcu_data, cpu);
    901		cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
    902		if (rcu_segcblist_is_offloaded(&rdp->cblist))
    903			show_rcu_nocb_state(rdp);
    904	}
    905	pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
    906	show_rcu_tasks_gp_kthreads();
    907}
    908EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
    909
    910/*
    911 * This function checks for grace-period requests that fail to motivate
    912 * RCU to come out of its idle mode.
    913 */
    914static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
    915				     const unsigned long gpssdelay)
    916{
    917	unsigned long flags;
    918	unsigned long j;
    919	struct rcu_node *rnp_root = rcu_get_root();
    920	static atomic_t warned = ATOMIC_INIT(0);
    921
    922	if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
    923	    ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
    924			 READ_ONCE(rnp_root->gp_seq_needed)) ||
    925	    !smp_load_acquire(&rcu_state.gp_kthread)) // Get stable kthread.
    926		return;
    927	j = jiffies; /* Expensive access, and in common case don't get here. */
    928	if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
    929	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
    930	    atomic_read(&warned))
    931		return;
    932
    933	raw_spin_lock_irqsave_rcu_node(rnp, flags);
    934	j = jiffies;
    935	if (rcu_gp_in_progress() ||
    936	    ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
    937			 READ_ONCE(rnp_root->gp_seq_needed)) ||
    938	    time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
    939	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
    940	    atomic_read(&warned)) {
    941		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    942		return;
    943	}
    944	/* Hold onto the leaf lock to make others see warned==1. */
    945
    946	if (rnp_root != rnp)
    947		raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
    948	j = jiffies;
    949	if (rcu_gp_in_progress() ||
    950	    ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
    951			 READ_ONCE(rnp_root->gp_seq_needed)) ||
    952	    time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
    953	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
    954	    atomic_xchg(&warned, 1)) {
    955		if (rnp_root != rnp)
    956			/* irqs remain disabled. */
    957			raw_spin_unlock_rcu_node(rnp_root);
    958		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    959		return;
    960	}
    961	WARN_ON(1);
    962	if (rnp_root != rnp)
    963		raw_spin_unlock_rcu_node(rnp_root);
    964	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
    965	show_rcu_gp_kthreads();
    966}
    967
    968/*
    969 * Do a forward-progress check for rcutorture.  This is normally invoked
    970 * due to an OOM event.  The argument "j" gives the time period during
    971 * which rcutorture would like progress to have been made.
    972 */
    973void rcu_fwd_progress_check(unsigned long j)
    974{
    975	unsigned long cbs;
    976	int cpu;
    977	unsigned long max_cbs = 0;
    978	int max_cpu = -1;
    979	struct rcu_data *rdp;
    980
    981	if (rcu_gp_in_progress()) {
    982		pr_info("%s: GP age %lu jiffies\n",
    983			__func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
    984		show_rcu_gp_kthreads();
    985	} else {
    986		pr_info("%s: Last GP end %lu jiffies ago\n",
    987			__func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
    988		preempt_disable();
    989		rdp = this_cpu_ptr(&rcu_data);
    990		rcu_check_gp_start_stall(rdp->mynode, rdp, j);
    991		preempt_enable();
    992	}
    993	for_each_possible_cpu(cpu) {
    994		cbs = rcu_get_n_cbs_cpu(cpu);
    995		if (!cbs)
    996			continue;
    997		if (max_cpu < 0)
    998			pr_info("%s: callbacks", __func__);
    999		pr_cont(" %d: %lu", cpu, cbs);
   1000		if (cbs <= max_cbs)
   1001			continue;
   1002		max_cbs = cbs;
   1003		max_cpu = cpu;
   1004	}
   1005	if (max_cpu >= 0)
   1006		pr_cont("\n");
   1007}
   1008EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
   1009
   1010/* Commandeer a sysrq key to dump RCU's tree. */
   1011static bool sysrq_rcu;
   1012module_param(sysrq_rcu, bool, 0444);
   1013
   1014/* Dump grace-period-request information due to commandeered sysrq. */
   1015static void sysrq_show_rcu(int key)
   1016{
   1017	show_rcu_gp_kthreads();
   1018}
   1019
   1020static const struct sysrq_key_op sysrq_rcudump_op = {
   1021	.handler = sysrq_show_rcu,
   1022	.help_msg = "show-rcu(y)",
   1023	.action_msg = "Show RCU tree",
   1024	.enable_mask = SYSRQ_ENABLE_DUMP,
   1025};
   1026
   1027static int __init rcu_sysrq_init(void)
   1028{
   1029	if (sysrq_rcu)
   1030		return register_sysrq_key('y', &sysrq_rcudump_op);
   1031	return 0;
   1032}
   1033early_initcall(rcu_sysrq_init);
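        /*
         * With the sysrq_rcu parameter set at boot, the registration above
         * binds the 'y' sysrq key ("show-rcu") to show_rcu_gp_kthreads(), so
         * the RCU tree state can be dumped on demand, for example by writing
         * 'y' to /proc/sysrq-trigger on a system with sysrq enabled.
         */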