cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tree.c (162263B)


      1// SPDX-License-Identifier: GPL-2.0+
      2/*
      3 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
      4 *
      5 * Copyright IBM Corporation, 2008
      6 *
      7 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
      8 *	    Manfred Spraul <manfred@colorfullife.com>
      9 *	    Paul E. McKenney <paulmck@linux.ibm.com>
     10 *
     11 * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
     12 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
     13 *
     14 * For detailed explanation of Read-Copy Update mechanism see -
     15 *	Documentation/RCU
     16 */
     17
     18#define pr_fmt(fmt) "rcu: " fmt
     19
     20#include <linux/types.h>
     21#include <linux/kernel.h>
     22#include <linux/init.h>
     23#include <linux/spinlock.h>
     24#include <linux/smp.h>
     25#include <linux/rcupdate_wait.h>
     26#include <linux/interrupt.h>
     27#include <linux/sched.h>
     28#include <linux/sched/debug.h>
     29#include <linux/nmi.h>
     30#include <linux/atomic.h>
     31#include <linux/bitops.h>
     32#include <linux/export.h>
     33#include <linux/completion.h>
     34#include <linux/moduleparam.h>
     35#include <linux/panic.h>
     36#include <linux/panic_notifier.h>
     37#include <linux/percpu.h>
     38#include <linux/notifier.h>
     39#include <linux/cpu.h>
     40#include <linux/mutex.h>
     41#include <linux/time.h>
     42#include <linux/kernel_stat.h>
     43#include <linux/wait.h>
     44#include <linux/kthread.h>
     45#include <uapi/linux/sched/types.h>
     46#include <linux/prefetch.h>
     47#include <linux/delay.h>
     48#include <linux/random.h>
     49#include <linux/trace_events.h>
     50#include <linux/suspend.h>
     51#include <linux/ftrace.h>
     52#include <linux/tick.h>
     53#include <linux/sysrq.h>
     54#include <linux/kprobes.h>
     55#include <linux/gfp.h>
     56#include <linux/oom.h>
     57#include <linux/smpboot.h>
     58#include <linux/jiffies.h>
     59#include <linux/slab.h>
     60#include <linux/sched/isolation.h>
     61#include <linux/sched/clock.h>
     62#include <linux/vmalloc.h>
     63#include <linux/mm.h>
     64#include <linux/kasan.h>
     65#include "../time/tick-internal.h"
     66
     67#include "tree.h"
     68#include "rcu.h"
     69
     70#ifdef MODULE_PARAM_PREFIX
     71#undef MODULE_PARAM_PREFIX
     72#endif
     73#define MODULE_PARAM_PREFIX "rcutree."
     74
     75/* Data structures. */
     76
     77static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
     78	.dynticks_nesting = 1,
     79	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
     80	.dynticks = ATOMIC_INIT(1),
     81#ifdef CONFIG_RCU_NOCB_CPU
     82	.cblist.flags = SEGCBLIST_RCU_CORE,
     83#endif
     84};
     85static struct rcu_state rcu_state = {
     86	.level = { &rcu_state.node[0] },
     87	.gp_state = RCU_GP_IDLE,
     88	.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
     89	.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
     90	.barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
     91	.name = RCU_NAME,
     92	.abbr = RCU_ABBR,
     93	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
     94	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
     95	.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
     96};
     97
     98/* Dump rcu_node combining tree at boot to verify correct setup. */
     99static bool dump_tree;
    100module_param(dump_tree, bool, 0444);
    101/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
    102static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
    103#ifndef CONFIG_PREEMPT_RT
    104module_param(use_softirq, bool, 0444);
    105#endif
    106/* Control rcu_node-tree auto-balancing at boot time. */
    107static bool rcu_fanout_exact;
    108module_param(rcu_fanout_exact, bool, 0444);
    109/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
    110static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
    111module_param(rcu_fanout_leaf, int, 0444);
    112int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
    113/* Number of rcu_nodes at specified level. */
    114int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
    115int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
    116
    117/*
    118 * The rcu_scheduler_active variable is initialized to the value
     119 * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before the
    120 * first task is spawned.  So when this variable is RCU_SCHEDULER_INACTIVE,
    121 * RCU can assume that there is but one task, allowing RCU to (for example)
    122 * optimize synchronize_rcu() to a simple barrier().  When this variable
    123 * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
    124 * to detect real grace periods.  This variable is also used to suppress
    125 * boot-time false positives from lockdep-RCU error checking.  Finally, it
    126 * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
    127 * is fully initialized, including all of its kthreads having been spawned.
    128 */
    129int rcu_scheduler_active __read_mostly;
    130EXPORT_SYMBOL_GPL(rcu_scheduler_active);
    131
    132/*
    133 * The rcu_scheduler_fully_active variable transitions from zero to one
    134 * during the early_initcall() processing, which is after the scheduler
    135 * is capable of creating new tasks.  So RCU processing (for example,
    136 * creating tasks for RCU priority boosting) must be delayed until after
    137 * rcu_scheduler_fully_active transitions from zero to one.  We also
    138 * currently delay invocation of any RCU callbacks until after this point.
    139 *
    140 * It might later prove better for people registering RCU callbacks during
    141 * early boot to take responsibility for these callbacks, but one step at
    142 * a time.
    143 */
    144static int rcu_scheduler_fully_active __read_mostly;
    145
    146static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
    147			      unsigned long gps, unsigned long flags);
    148static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
    149static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
    150static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
    151static void invoke_rcu_core(void);
    152static void rcu_report_exp_rdp(struct rcu_data *rdp);
    153static void sync_sched_exp_online_cleanup(int cpu);
    154static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
    155static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
    156
    157/* rcuc/rcub/rcuop kthread realtime priority */
    158static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
    159module_param(kthread_prio, int, 0444);
    160
    161/* Delay in jiffies for grace-period initialization delays, debug only. */
    162
    163static int gp_preinit_delay;
    164module_param(gp_preinit_delay, int, 0444);
    165static int gp_init_delay;
    166module_param(gp_init_delay, int, 0444);
    167static int gp_cleanup_delay;
    168module_param(gp_cleanup_delay, int, 0444);
    169
    170// Add delay to rcu_read_unlock() for strict grace periods.
    171static int rcu_unlock_delay;
    172#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
    173module_param(rcu_unlock_delay, int, 0444);
    174#endif
    175
    176/*
    177 * This rcu parameter is runtime-read-only. It reflects
    178 * a minimum allowed number of objects which can be cached
    179 * per-CPU. Object size is equal to one page. This value
    180 * can be changed at boot time.
    181 */
    182static int rcu_min_cached_objs = 5;
    183module_param(rcu_min_cached_objs, int, 0444);
    184
    185// A page shrinker can ask for pages to be freed to make them
    186// available for other parts of the system. This usually happens
    187// under low memory conditions, and in that case we should also
    188// defer page-cache filling for a short time period.
    189//
    190// The default value is 5 seconds, which is long enough to reduce
    191// interference with the shrinker while it asks other systems to
    192// drain their caches.
    193static int rcu_delay_page_cache_fill_msec = 5000;
    194module_param(rcu_delay_page_cache_fill_msec, int, 0444);
    195
    196/* Retrieve RCU kthreads priority for rcutorture */
    197int rcu_get_gp_kthreads_prio(void)
    198{
    199	return kthread_prio;
    200}
    201EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
    202
    203/*
    204 * Number of grace periods between delays, normalized by the duration of
    205 * the delay.  The longer the delay, the more the grace periods between
    206 * each delay.  The reason for this normalization is that it means that,
    207 * for non-zero delays, the overall slowdown of grace periods is constant
    208 * regardless of the duration of the delay.  This arrangement balances
    209 * the need for long delays to increase some race probabilities with the
    210 * need for fast grace periods to increase other race probabilities.
    211 */
    212#define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays for debugging. */
    213
    214/*
    215 * Compute the mask of online CPUs for the specified rcu_node structure.
    216 * This will not be stable unless the rcu_node structure's ->lock is
    217 * held, but the bit corresponding to the current CPU will be stable
    218 * in most contexts.
    219 */
    220static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
    221{
    222	return READ_ONCE(rnp->qsmaskinitnext);
    223}
    224
    225/*
    226 * Is the CPU corresponding to the specified rcu_data structure online
    227 * from RCU's perspective?  This perspective is given by that structure's
    228 * ->qsmaskinitnext field rather than by the global cpu_online_mask.
    229 */
    230static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
    231{
    232	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
    233}
    234
    235/*
    236 * Return true if an RCU grace period is in progress.  The READ_ONCE()s
    237 * permit this function to be invoked without holding the root rcu_node
    238 * structure's ->lock, but of course results can be subject to change.
    239 */
    240static int rcu_gp_in_progress(void)
    241{
    242	return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
    243}
    244
    245/*
    246 * Return the number of callbacks queued on the specified CPU.
    247 * Handles both the nocbs and normal cases.
    248 */
    249static long rcu_get_n_cbs_cpu(int cpu)
    250{
    251	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    252
    253	if (rcu_segcblist_is_enabled(&rdp->cblist))
    254		return rcu_segcblist_n_cbs(&rdp->cblist);
    255	return 0;
    256}
    257
    258void rcu_softirq_qs(void)
    259{
    260	rcu_qs();
    261	rcu_preempt_deferred_qs(current);
    262	rcu_tasks_qs(current, false);
    263}
    264
    265/*
    266 * Increment the current CPU's rcu_data structure's ->dynticks field
    267 * with ordering.  Return the new value.
    268 */
    269static noinline noinstr unsigned long rcu_dynticks_inc(int incby)
    270{
    271	return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks));
    272}
    273
    274/*
    275 * Record entry into an extended quiescent state.  This is only to be
    276 * called when not already in an extended quiescent state, that is,
    277 * RCU is watching prior to the call to this function and is no longer
    278 * watching upon return.
    279 */
    280static noinstr void rcu_dynticks_eqs_enter(void)
    281{
    282	int seq;
    283
    284	/*
    285	 * CPUs seeing atomic_add_return() must see prior RCU read-side
    286	 * critical sections, and we also must force ordering with the
    287	 * next idle sojourn.
    288	 */
    289	rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
    290	seq = rcu_dynticks_inc(1);
    291	// RCU is no longer watching.  Better be in extended quiescent state!
    292	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
    293}
    294
    295/*
    296 * Record exit from an extended quiescent state.  This is only to be
    297 * called from an extended quiescent state, that is, RCU is not watching
    298 * prior to the call to this function and is watching upon return.
    299 */
    300static noinstr void rcu_dynticks_eqs_exit(void)
    301{
    302	int seq;
    303
    304	/*
    305	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
    306	 * and we also must force ordering with the next RCU read-side
    307	 * critical section.
    308	 */
    309	seq = rcu_dynticks_inc(1);
    310	// RCU is now watching.  Better not be in an extended quiescent state!
    311	rcu_dynticks_task_trace_exit();  // After ->dynticks update!
    312	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
    313}
    314
    315/*
    316 * Reset the current CPU's ->dynticks counter to indicate that the
    317 * newly onlined CPU is no longer in an extended quiescent state.
    318 * This will either leave the counter unchanged, or increment it
    319 * to the next non-quiescent value.
    320 *
    321 * The non-atomic test/increment sequence works because the upper bits
    322 * of the ->dynticks counter are manipulated only by the corresponding CPU,
    323 * or when the corresponding CPU is offline.
    324 */
    325static void rcu_dynticks_eqs_online(void)
    326{
    327	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    328
    329	if (atomic_read(&rdp->dynticks) & 0x1)
    330		return;
    331	rcu_dynticks_inc(1);
    332}
    333
    334/*
    335 * Is the current CPU in an extended quiescent state?
    336 *
    337 * No ordering, as we are sampling CPU-local information.
    338 */
    339static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
    340{
    341	return !(arch_atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
    342}
    343
    344/*
    345 * Snapshot the ->dynticks counter with full ordering so as to allow
    346 * stable comparison of this counter with past and future snapshots.
    347 */
    348static int rcu_dynticks_snap(struct rcu_data *rdp)
    349{
    350	smp_mb();  // Fundamental RCU ordering guarantee.
    351	return atomic_read_acquire(&rdp->dynticks);
    352}
    353
    354/*
    355 * Return true if the snapshot returned from rcu_dynticks_snap()
    356 * indicates that RCU is in an extended quiescent state.
    357 */
    358static bool rcu_dynticks_in_eqs(int snap)
    359{
    360	return !(snap & 0x1);
    361}
    362
    363/* Return true if the specified CPU is currently idle from an RCU viewpoint.  */
    364bool rcu_is_idle_cpu(int cpu)
    365{
    366	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    367
    368	return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
    369}
    370
    371/*
    372 * Return true if the CPU corresponding to the specified rcu_data
    373 * structure has spent some time in an extended quiescent state since
    374 * rcu_dynticks_snap() returned the specified snapshot.
    375 */
    376static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
    377{
    378	return snap != rcu_dynticks_snap(rdp);
    379}
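
The helpers above implement the even/odd convention for ->dynticks: the counter is odd while RCU is watching the CPU and even while the CPU is in an extended quiescent state (EQS), so a later snapshot mismatch proves that the CPU passed through an EQS. Below is a single-threaded userspace sketch of that convention; the kernel's atomics and ordering guarantees are deliberately omitted and all names are illustrative.

#include <stdbool.h>
#include <stdio.h>

static long dynticks = 1;			/* boot value: odd, RCU watching */

static bool in_eqs(long snap) { return !(snap & 0x1); }

int main(void)
{
	long snap = dynticks;			/* GP kthread snapshots the CPU */

	dynticks += 1;				/* CPU enters idle: even, in EQS */
	dynticks += 1;				/* CPU leaves idle: odd again */

	/* The CPU is watching again, but the changed counter proves it spent
	 * time in an EQS since the snapshot, so a QS can be reported for it. */
	printf("in EQS now?          %d\n", in_eqs(dynticks));
	printf("EQS since snapshot?  %d\n", snap != dynticks);
	return 0;
}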
    380
    381/*
    382 * Return true if the referenced integer is zero while the specified
    383 * CPU remains within a single extended quiescent state.
    384 */
    385bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
    386{
    387	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
    388	int snap;
    389
    390	// If not quiescent, force back to earlier extended quiescent state.
    391	snap = atomic_read(&rdp->dynticks) & ~0x1;
    392
    393	smp_rmb(); // Order ->dynticks and *vp reads.
    394	if (READ_ONCE(*vp))
    395		return false;  // Non-zero, so report failure;
    396	smp_rmb(); // Order *vp read and ->dynticks re-read.
    397
    398	// If still in the same extended quiescent state, we are good!
    399	return snap == atomic_read(&rdp->dynticks);
    400}
    401
    402/*
    403 * Let the RCU core know that this CPU has gone through the scheduler,
    404 * which is a quiescent state.  This is called when the need for a
    405 * quiescent state is urgent, so we burn an atomic operation and full
    406 * memory barriers to let the RCU core know about it, regardless of what
    407 * this CPU might (or might not) do in the near future.
    408 *
    409 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
    410 *
    411 * The caller must have disabled interrupts and must not be idle.
    412 */
    413notrace void rcu_momentary_dyntick_idle(void)
    414{
    415	int seq;
    416
    417	raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
    418	seq = rcu_dynticks_inc(2);
    419	/* It is illegal to call this from idle state. */
    420	WARN_ON_ONCE(!(seq & 0x1));
    421	rcu_preempt_deferred_qs(current);
    422}
    423EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
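
The increment by 2 above is the whole trick: it leaves ->dynticks odd, so RCU keeps watching the CPU, yet it invalidates any snapshot the force-quiescent-state machinery may be holding, which is exactly what a zero-duration dyntick-idle period would do. A tiny sketch under those assumptions:

#include <assert.h>

int main(void)
{
	long dynticks = 0x11;		/* odd: RCU watching this CPU */
	long snap = dynticks;		/* snapshot held by the FQS scan */

	dynticks += 2;			/* models rcu_momentary_dyntick_idle() */
	assert(dynticks & 0x1);		/* still odd: RCU never stopped watching */
	assert(snap != dynticks);	/* but the old snapshot is now stale */
	return 0;
}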
    424
    425/**
    426 * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
    427 *
    428 * If the current CPU is idle and running at a first-level (not nested)
     429 * interrupt from idle, or directly from idle, return true.
    430 *
    431 * The caller must have at least disabled IRQs.
    432 */
    433static int rcu_is_cpu_rrupt_from_idle(void)
    434{
    435	long nesting;
    436
    437	/*
    438	 * Usually called from the tick; but also used from smp_function_call()
    439	 * for expedited grace periods. This latter can result in running from
    440	 * the idle task, instead of an actual IPI.
    441	 */
    442	lockdep_assert_irqs_disabled();
    443
    444	/* Check for counter underflows */
    445	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
    446			 "RCU dynticks_nesting counter underflow!");
    447	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
    448			 "RCU dynticks_nmi_nesting counter underflow/zero!");
    449
    450	/* Are we at first interrupt nesting level? */
    451	nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
    452	if (nesting > 1)
    453		return false;
    454
    455	/*
    456	 * If we're not in an interrupt, we must be in the idle task!
    457	 */
    458	WARN_ON_ONCE(!nesting && !is_idle_task(current));
    459
    460	/* Does CPU appear to be idle from an RCU standpoint? */
    461	return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
    462}
    463
    464#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
    465				// Maximum callbacks per rcu_do_batch ...
    466#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
    467static long blimit = DEFAULT_RCU_BLIMIT;
    468#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
    469static long qhimark = DEFAULT_RCU_QHIMARK;
    470#define DEFAULT_RCU_QLOMARK 100   // Once only this many pending, use blimit.
    471static long qlowmark = DEFAULT_RCU_QLOMARK;
    472#define DEFAULT_RCU_QOVLD_MULT 2
    473#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
    474static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
    475static long qovld_calc = -1;	  // No pre-initialization lock acquisitions!
    476
    477module_param(blimit, long, 0444);
    478module_param(qhimark, long, 0444);
    479module_param(qlowmark, long, 0444);
    480module_param(qovld, long, 0444);
    481
    482static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
    483static ulong jiffies_till_next_fqs = ULONG_MAX;
    484static bool rcu_kick_kthreads;
    485static int rcu_divisor = 7;
    486module_param(rcu_divisor, int, 0644);
    487
    488/* Force an exit from rcu_do_batch() after 3 milliseconds. */
    489static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
    490module_param(rcu_resched_ns, long, 0644);
    491
    492/*
    493 * How long the grace period must be before we start recruiting
    494 * quiescent-state help from rcu_note_context_switch().
    495 */
    496static ulong jiffies_till_sched_qs = ULONG_MAX;
    497module_param(jiffies_till_sched_qs, ulong, 0444);
    498static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
    499module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
    500
    501/*
    502 * Make sure that we give the grace-period kthread time to detect any
    503 * idle CPUs before taking active measures to force quiescent states.
    504 * However, don't go below 100 milliseconds, adjusted upwards for really
    505 * large systems.
    506 */
    507static void adjust_jiffies_till_sched_qs(void)
    508{
    509	unsigned long j;
    510
    511	/* If jiffies_till_sched_qs was specified, respect the request. */
    512	if (jiffies_till_sched_qs != ULONG_MAX) {
    513		WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
    514		return;
    515	}
    516	/* Otherwise, set to third fqs scan, but bound below on large system. */
    517	j = READ_ONCE(jiffies_till_first_fqs) +
    518		      2 * READ_ONCE(jiffies_till_next_fqs);
    519	if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
    520		j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
    521	pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
    522	WRITE_ONCE(jiffies_to_sched_qs, j);
    523}
    524
    525static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp)
    526{
    527	ulong j;
    528	int ret = kstrtoul(val, 0, &j);
    529
    530	if (!ret) {
    531		WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j);
    532		adjust_jiffies_till_sched_qs();
    533	}
    534	return ret;
    535}
    536
    537static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp)
    538{
    539	ulong j;
    540	int ret = kstrtoul(val, 0, &j);
    541
    542	if (!ret) {
    543		WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
    544		adjust_jiffies_till_sched_qs();
    545	}
    546	return ret;
    547}
    548
    549static const struct kernel_param_ops first_fqs_jiffies_ops = {
    550	.set = param_set_first_fqs_jiffies,
    551	.get = param_get_ulong,
    552};
    553
    554static const struct kernel_param_ops next_fqs_jiffies_ops = {
    555	.set = param_set_next_fqs_jiffies,
    556	.get = param_get_ulong,
    557};
    558
    559module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644);
    560module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644);
    561module_param(rcu_kick_kthreads, bool, 0644);
    562
    563static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
    564static int rcu_pending(int user);
    565
    566/*
    567 * Return the number of RCU GPs completed thus far for debug & stats.
    568 */
    569unsigned long rcu_get_gp_seq(void)
    570{
    571	return READ_ONCE(rcu_state.gp_seq);
    572}
    573EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
    574
    575/*
    576 * Return the number of RCU expedited batches completed thus far for
    577 * debug & stats.  Odd numbers mean that a batch is in progress, even
    578 * numbers mean idle.  The value returned will thus be roughly double
    579 * the cumulative batches since boot.
    580 */
    581unsigned long rcu_exp_batches_completed(void)
    582{
    583	return rcu_state.expedited_sequence;
    584}
    585EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
    586
    587/*
    588 * Return the root node of the rcu_state structure.
    589 */
    590static struct rcu_node *rcu_get_root(void)
    591{
    592	return &rcu_state.node[0];
    593}
    594
    595/*
    596 * Send along grace-period-related data for rcutorture diagnostics.
    597 */
    598void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
    599			    unsigned long *gp_seq)
    600{
    601	switch (test_type) {
    602	case RCU_FLAVOR:
    603		*flags = READ_ONCE(rcu_state.gp_flags);
    604		*gp_seq = rcu_seq_current(&rcu_state.gp_seq);
    605		break;
    606	default:
    607		break;
    608	}
    609}
    610EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
    611
    612/*
    613 * Enter an RCU extended quiescent state, which can be either the
    614 * idle loop or adaptive-tickless usermode execution.
    615 *
    616 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
    617 * the possibility of usermode upcalls having messed up our count
    618 * of interrupt nesting level during the prior busy period.
    619 */
    620static noinstr void rcu_eqs_enter(bool user)
    621{
    622	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    623
    624	WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
    625	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
    626	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
    627		     rdp->dynticks_nesting == 0);
    628	if (rdp->dynticks_nesting != 1) {
    629		// RCU will still be watching, so just do accounting and leave.
    630		rdp->dynticks_nesting--;
    631		return;
    632	}
    633
    634	lockdep_assert_irqs_disabled();
    635	instrumentation_begin();
    636	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
    637	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
    638	rcu_preempt_deferred_qs(current);
    639
    640	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
    641	instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
    642
    643	instrumentation_end();
    644	WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
    645	// RCU is watching here ...
    646	rcu_dynticks_eqs_enter();
    647	// ... but is no longer watching here.
    648	rcu_dynticks_task_enter();
    649}
    650
    651/**
    652 * rcu_idle_enter - inform RCU that current CPU is entering idle
    653 *
    654 * Enter idle mode, in other words, -leave- the mode in which RCU
    655 * read-side critical sections can occur.  (Though RCU read-side
    656 * critical sections can occur in irq handlers in idle, a possibility
    657 * handled by irq_enter() and irq_exit().)
    658 *
    659 * If you add or remove a call to rcu_idle_enter(), be sure to test with
    660 * CONFIG_RCU_EQS_DEBUG=y.
    661 */
    662void rcu_idle_enter(void)
    663{
    664	lockdep_assert_irqs_disabled();
    665	rcu_eqs_enter(false);
    666}
    667EXPORT_SYMBOL_GPL(rcu_idle_enter);
    668
    669#ifdef CONFIG_NO_HZ_FULL
    670
    671#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
    672/*
    673 * An empty function that will trigger a reschedule on
    674 * IRQ tail once IRQs get re-enabled on userspace/guest resume.
    675 */
    676static void late_wakeup_func(struct irq_work *work)
    677{
    678}
    679
    680static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
    681	IRQ_WORK_INIT(late_wakeup_func);
    682
    683/*
    684 * If either:
    685 *
    686 * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
    687 * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
    688 *
    689 * In these cases the late RCU wake ups aren't supported in the resched loops and our
    690 * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
    691 * get re-enabled again.
    692 */
    693noinstr static void rcu_irq_work_resched(void)
    694{
    695	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    696
    697	if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
    698		return;
    699
    700	if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
    701		return;
    702
    703	instrumentation_begin();
    704	if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
    705		irq_work_queue(this_cpu_ptr(&late_wakeup_work));
    706	}
    707	instrumentation_end();
    708}
    709
    710#else
    711static inline void rcu_irq_work_resched(void) { }
    712#endif
    713
    714/**
    715 * rcu_user_enter - inform RCU that we are resuming userspace.
    716 *
    717 * Enter RCU idle mode right before resuming userspace.  No use of RCU
    718 * is permitted between this call and rcu_user_exit(). This way the
    719 * CPU doesn't need to maintain the tick for RCU maintenance purposes
    720 * when the CPU runs in userspace.
    721 *
    722 * If you add or remove a call to rcu_user_enter(), be sure to test with
    723 * CONFIG_RCU_EQS_DEBUG=y.
    724 */
    725noinstr void rcu_user_enter(void)
    726{
    727	lockdep_assert_irqs_disabled();
    728
    729	/*
    730	 * Other than generic entry implementation, we may be past the last
    731	 * rescheduling opportunity in the entry code. Trigger a self IPI
    732	 * that will fire and reschedule once we resume in user/guest mode.
    733	 */
    734	rcu_irq_work_resched();
    735	rcu_eqs_enter(true);
    736}
    737
    738#endif /* CONFIG_NO_HZ_FULL */
    739
    740/**
    741 * rcu_nmi_exit - inform RCU of exit from NMI context
    742 *
    743 * If we are returning from the outermost NMI handler that interrupted an
    744 * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
    745 * to let the RCU grace-period handling know that the CPU is back to
    746 * being RCU-idle.
    747 *
    748 * If you add or remove a call to rcu_nmi_exit(), be sure to test
    749 * with CONFIG_RCU_EQS_DEBUG=y.
    750 */
    751noinstr void rcu_nmi_exit(void)
    752{
    753	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    754
    755	instrumentation_begin();
    756	/*
    757	 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
    758	 * (We are exiting an NMI handler, so RCU better be paying attention
    759	 * to us!)
    760	 */
    761	WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
    762	WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
    763
    764	/*
    765	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
    766	 * leave it in non-RCU-idle state.
    767	 */
    768	if (rdp->dynticks_nmi_nesting != 1) {
    769		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
    770				  atomic_read(&rdp->dynticks));
    771		WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
    772			   rdp->dynticks_nmi_nesting - 2);
    773		instrumentation_end();
    774		return;
    775	}
    776
    777	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
    778	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
    779	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
    780
    781	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
    782	instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
    783	instrumentation_end();
    784
    785	// RCU is watching here ...
    786	rcu_dynticks_eqs_enter();
    787	// ... but is no longer watching here.
    788
    789	if (!in_nmi())
    790		rcu_dynticks_task_enter();
    791}
    792
    793/**
    794 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
    795 *
    796 * Exit from an interrupt handler, which might possibly result in entering
    797 * idle mode, in other words, leaving the mode in which read-side critical
    798 * sections can occur.  The caller must have disabled interrupts.
    799 *
    800 * This code assumes that the idle loop never does anything that might
    801 * result in unbalanced calls to irq_enter() and irq_exit().  If your
    802 * architecture's idle loop violates this assumption, RCU will give you what
    803 * you deserve, good and hard.  But very infrequently and irreproducibly.
    804 *
    805 * Use things like work queues to work around this limitation.
    806 *
    807 * You have been warned.
    808 *
    809 * If you add or remove a call to rcu_irq_exit(), be sure to test with
    810 * CONFIG_RCU_EQS_DEBUG=y.
    811 */
    812void noinstr rcu_irq_exit(void)
    813{
    814	lockdep_assert_irqs_disabled();
    815	rcu_nmi_exit();
    816}
    817
    818#ifdef CONFIG_PROVE_RCU
    819/**
    820 * rcu_irq_exit_check_preempt - Validate that scheduling is possible
    821 */
    822void rcu_irq_exit_check_preempt(void)
    823{
    824	lockdep_assert_irqs_disabled();
    825
    826	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
    827			 "RCU dynticks_nesting counter underflow/zero!");
    828	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
    829			 DYNTICK_IRQ_NONIDLE,
     830			 "Bad RCU dynticks_nmi_nesting counter\n");
    831	RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
    832			 "RCU in extended quiescent state!");
    833}
    834#endif /* #ifdef CONFIG_PROVE_RCU */
    835
    836/*
    837 * Wrapper for rcu_irq_exit() where interrupts are enabled.
    838 *
    839 * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
    840 * with CONFIG_RCU_EQS_DEBUG=y.
    841 */
    842void rcu_irq_exit_irqson(void)
    843{
    844	unsigned long flags;
    845
    846	local_irq_save(flags);
    847	rcu_irq_exit();
    848	local_irq_restore(flags);
    849}
    850
    851/*
    852 * Exit an RCU extended quiescent state, which can be either the
    853 * idle loop or adaptive-tickless usermode execution.
    854 *
    855 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
    856 * allow for the possibility of usermode upcalls messing up our count of
    857 * interrupt nesting level during the busy period that is just now starting.
    858 */
    859static void noinstr rcu_eqs_exit(bool user)
    860{
    861	struct rcu_data *rdp;
    862	long oldval;
    863
    864	lockdep_assert_irqs_disabled();
    865	rdp = this_cpu_ptr(&rcu_data);
    866	oldval = rdp->dynticks_nesting;
    867	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
    868	if (oldval) {
    869		// RCU was already watching, so just do accounting and leave.
    870		rdp->dynticks_nesting++;
    871		return;
    872	}
    873	rcu_dynticks_task_exit();
    874	// RCU is not watching here ...
    875	rcu_dynticks_eqs_exit();
    876	// ... but is watching here.
    877	instrumentation_begin();
    878
    879	// instrumentation for the noinstr rcu_dynticks_eqs_exit()
    880	instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
    881
    882	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
    883	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
    884	WRITE_ONCE(rdp->dynticks_nesting, 1);
    885	WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
    886	WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
    887	instrumentation_end();
    888}
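
rcu_eqs_enter() and rcu_eqs_exit() only toggle RCU's view of the CPU at the outermost nesting level; inner idle/usermode transitions merely adjust ->dynticks_nesting. A minimal sketch of that bookkeeping, with plain variables standing in for the per-CPU fields and the NMI/irq nesting, tracing, and ordering left out:

#include <stdbool.h>
#include <stdio.h>

static long nesting = 1;	/* boot value: one level, RCU watching */
static bool watching = true;

static void eqs_enter(void)
{
	if (nesting != 1) {	/* still nested: bookkeeping only */
		nesting--;
		return;
	}
	nesting = 0;
	watching = false;	/* real EQS entry */
}

static void eqs_exit(void)
{
	if (nesting) {		/* already watching: bookkeeping only */
		nesting++;
		return;
	}
	watching = true;	/* real EQS exit */
	nesting = 1;
}

int main(void)
{
	eqs_enter();
	printf("idle: watching=%d nesting=%ld\n", watching, nesting);
	eqs_exit();
	printf("back: watching=%d nesting=%ld\n", watching, nesting);
	return 0;
}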
    889
    890/**
    891 * rcu_idle_exit - inform RCU that current CPU is leaving idle
    892 *
    893 * Exit idle mode, in other words, -enter- the mode in which RCU
    894 * read-side critical sections can occur.
    895 *
    896 * If you add or remove a call to rcu_idle_exit(), be sure to test with
    897 * CONFIG_RCU_EQS_DEBUG=y.
    898 */
    899void rcu_idle_exit(void)
    900{
    901	unsigned long flags;
    902
    903	local_irq_save(flags);
    904	rcu_eqs_exit(false);
    905	local_irq_restore(flags);
    906}
    907EXPORT_SYMBOL_GPL(rcu_idle_exit);
    908
    909#ifdef CONFIG_NO_HZ_FULL
    910/**
    911 * rcu_user_exit - inform RCU that we are exiting userspace.
    912 *
    913 * Exit RCU idle mode while entering the kernel because it can
     914 * run an RCU read-side critical section anytime.
    915 *
    916 * If you add or remove a call to rcu_user_exit(), be sure to test with
    917 * CONFIG_RCU_EQS_DEBUG=y.
    918 */
    919void noinstr rcu_user_exit(void)
    920{
    921	rcu_eqs_exit(true);
    922}
    923
    924/**
    925 * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
    926 *
    927 * The scheduler tick is not normally enabled when CPUs enter the kernel
    928 * from nohz_full userspace execution.  After all, nohz_full userspace
    929 * execution is an RCU quiescent state and the time executing in the kernel
    930 * is quite short.  Except of course when it isn't.  And it is not hard to
    931 * cause a large system to spend tens of seconds or even minutes looping
     932 * in the kernel, which can cause a number of problems, including RCU CPU
    933 * stall warnings.
    934 *
    935 * Therefore, if a nohz_full CPU fails to report a quiescent state
    936 * in a timely manner, the RCU grace-period kthread sets that CPU's
    937 * ->rcu_urgent_qs flag with the expectation that the next interrupt or
    938 * exception will invoke this function, which will turn on the scheduler
    939 * tick, which will enable RCU to detect that CPU's quiescent states,
    940 * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels.
    941 * The tick will be disabled once a quiescent state is reported for
    942 * this CPU.
    943 *
    944 * Of course, in carefully tuned systems, there might never be an
    945 * interrupt or exception.  In that case, the RCU grace-period kthread
    946 * will eventually cause one to happen.  However, in less carefully
    947 * controlled environments, this function allows RCU to get what it
    948 * needs without creating otherwise useless interruptions.
    949 */
    950void __rcu_irq_enter_check_tick(void)
    951{
    952	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
    953
    954	// If we're here from NMI there's nothing to do.
    955	if (in_nmi())
    956		return;
    957
    958	RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
    959			 "Illegal rcu_irq_enter_check_tick() from extended quiescent state");
    960
    961	if (!tick_nohz_full_cpu(rdp->cpu) ||
    962	    !READ_ONCE(rdp->rcu_urgent_qs) ||
    963	    READ_ONCE(rdp->rcu_forced_tick)) {
    964		// RCU doesn't need nohz_full help from this CPU, or it is
    965		// already getting that help.
    966		return;
    967	}
    968
    969	// We get here only when not in an extended quiescent state and
    970	// from interrupts (as opposed to NMIs).  Therefore, (1) RCU is
    971	// already watching and (2) The fact that we are in an interrupt
    972	// handler and that the rcu_node lock is an irq-disabled lock
    973	// prevents self-deadlock.  So we can safely recheck under the lock.
    974	// Note that the nohz_full state currently cannot change.
    975	raw_spin_lock_rcu_node(rdp->mynode);
    976	if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
    977		// A nohz_full CPU is in the kernel and RCU needs a
    978		// quiescent state.  Turn on the tick!
    979		WRITE_ONCE(rdp->rcu_forced_tick, true);
    980		tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
    981	}
    982	raw_spin_unlock_rcu_node(rdp->mynode);
    983}
    984#endif /* CONFIG_NO_HZ_FULL */
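
__rcu_irq_enter_check_tick() above follows a classic check-then-recheck pattern: a cheap unlocked test filters out the common case, and the decision to force the tick on is committed only after re-reading the flags under the rcu_node lock. A small pthread-based sketch of the same pattern; the names and the printf are illustrative only.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static bool urgent_qs = true;		/* set by the grace-period kthread */
static bool forced_tick;		/* tick already forced on? */

static void irq_enter_check_tick(void)
{
	if (!urgent_qs || forced_tick)		/* unlocked fast path */
		return;

	pthread_mutex_lock(&node_lock);
	if (urgent_qs && !forced_tick) {	/* recheck under the lock */
		forced_tick = true;
		printf("tick turned on for this CPU\n");
	}
	pthread_mutex_unlock(&node_lock);
}

int main(void)
{
	irq_enter_check_tick();		/* turns the tick on once */
	irq_enter_check_tick();		/* fast path: already forced */
	return 0;
}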
    985
    986/**
    987 * rcu_nmi_enter - inform RCU of entry to NMI context
    988 *
    989 * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
    990 * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
    991 * that the CPU is active.  This implementation permits nested NMIs, as
    992 * long as the nesting level does not overflow an int.  (You will probably
    993 * run out of stack space first.)
    994 *
    995 * If you add or remove a call to rcu_nmi_enter(), be sure to test
    996 * with CONFIG_RCU_EQS_DEBUG=y.
    997 */
    998noinstr void rcu_nmi_enter(void)
    999{
   1000	long incby = 2;
   1001	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
   1002
   1003	/* Complain about underflow. */
   1004	WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
   1005
   1006	/*
   1007	 * If idle from RCU viewpoint, atomically increment ->dynticks
   1008	 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
   1009	 * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
   1010	 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
   1011	 * to be in the outermost NMI handler that interrupted an RCU-idle
   1012	 * period (observation due to Andy Lutomirski).
   1013	 */
   1014	if (rcu_dynticks_curr_cpu_in_eqs()) {
   1015
   1016		if (!in_nmi())
   1017			rcu_dynticks_task_exit();
   1018
   1019		// RCU is not watching here ...
   1020		rcu_dynticks_eqs_exit();
   1021		// ... but is watching here.
   1022
   1023		instrumentation_begin();
   1024		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
   1025		instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
   1026		// instrumentation for the noinstr rcu_dynticks_eqs_exit()
   1027		instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
   1028
   1029		incby = 1;
   1030	} else if (!in_nmi()) {
   1031		instrumentation_begin();
   1032		rcu_irq_enter_check_tick();
   1033	} else  {
   1034		instrumentation_begin();
   1035	}
   1036
   1037	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
   1038			  rdp->dynticks_nmi_nesting,
   1039			  rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
   1040	instrumentation_end();
   1041	WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
   1042		   rdp->dynticks_nmi_nesting + incby);
   1043	barrier();
   1044}
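
Together with rcu_nmi_exit() earlier in this file, the function above maintains ->dynticks_nmi_nesting so that a value of exactly 1 identifies the outermost handler that interrupted an RCU-idle CPU: entry adds 1 when it had to wake RCU up and 2 otherwise, and exit subtracts 2 unless it sees that sentinel value. A compact sketch of the protocol, with plain variables standing in for the per-CPU state:

#include <assert.h>
#include <stdbool.h>

static long nmi_nesting;
static bool rcu_watching;

static void nmi_enter(void)
{
	long incby = 2;

	if (!rcu_watching) {		/* interrupted an EQS */
		rcu_watching = true;	/* models rcu_dynticks_eqs_exit() */
		incby = 1;
	}
	nmi_nesting += incby;
}

static void nmi_exit(void)
{
	if (nmi_nesting != 1) {		/* not outermost-from-idle */
		nmi_nesting -= 2;
		return;
	}
	nmi_nesting = 0;
	rcu_watching = false;		/* models rcu_dynticks_eqs_enter() */
}

int main(void)
{
	/* An idle CPU takes an interrupt, then a nested NMI. */
	nmi_enter();			/* nesting: 0 -> 1 (came from EQS) */
	nmi_enter();			/* nesting: 1 -> 3 */
	nmi_exit();			/* nesting: 3 -> 1 */
	assert(rcu_watching && nmi_nesting == 1);
	nmi_exit();			/* outermost handler: back to EQS */
	assert(!rcu_watching && nmi_nesting == 0);
	return 0;
}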
   1045
   1046/**
   1047 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
   1048 *
   1049 * Enter an interrupt handler, which might possibly result in exiting
   1050 * idle mode, in other words, entering the mode in which read-side critical
   1051 * sections can occur.  The caller must have disabled interrupts.
   1052 *
   1053 * Note that the Linux kernel is fully capable of entering an interrupt
   1054 * handler that it never exits, for example when doing upcalls to user mode!
   1055 * This code assumes that the idle loop never does upcalls to user mode.
   1056 * If your architecture's idle loop does do upcalls to user mode (or does
   1057 * anything else that results in unbalanced calls to the irq_enter() and
   1058 * irq_exit() functions), RCU will give you what you deserve, good and hard.
   1059 * But very infrequently and irreproducibly.
   1060 *
   1061 * Use things like work queues to work around this limitation.
   1062 *
   1063 * You have been warned.
   1064 *
   1065 * If you add or remove a call to rcu_irq_enter(), be sure to test with
   1066 * CONFIG_RCU_EQS_DEBUG=y.
   1067 */
   1068noinstr void rcu_irq_enter(void)
   1069{
   1070	lockdep_assert_irqs_disabled();
   1071	rcu_nmi_enter();
   1072}
   1073
   1074/*
   1075 * Wrapper for rcu_irq_enter() where interrupts are enabled.
   1076 *
   1077 * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
   1078 * with CONFIG_RCU_EQS_DEBUG=y.
   1079 */
   1080void rcu_irq_enter_irqson(void)
   1081{
   1082	unsigned long flags;
   1083
   1084	local_irq_save(flags);
   1085	rcu_irq_enter();
   1086	local_irq_restore(flags);
   1087}
   1088
   1089/*
   1090 * Check to see if any future non-offloaded RCU-related work will need
   1091 * to be done by the current CPU, even if none need be done immediately,
   1092 * returning 1 if so.  This function is part of the RCU implementation;
   1093 * it is -not- an exported member of the RCU API.  This is used by
   1094 * the idle-entry code to figure out whether it is safe to disable the
   1095 * scheduler-clock interrupt.
   1096 *
   1097 * Just check whether or not this CPU has non-offloaded RCU callbacks
   1098 * queued.
   1099 */
   1100int rcu_needs_cpu(void)
   1101{
   1102	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
   1103		!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
   1104}
   1105
   1106/*
   1107 * If any sort of urgency was applied to the current CPU (for example,
   1108 * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
   1109 * to get to a quiescent state, disable it.
   1110 */
   1111static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
   1112{
   1113	raw_lockdep_assert_held_rcu_node(rdp->mynode);
   1114	WRITE_ONCE(rdp->rcu_urgent_qs, false);
   1115	WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
   1116	if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
   1117		tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
   1118		WRITE_ONCE(rdp->rcu_forced_tick, false);
   1119	}
   1120}
   1121
   1122/**
   1123 * rcu_is_watching - see if RCU thinks that the current CPU is not idle
   1124 *
   1125 * Return true if RCU is watching the running CPU, which means that this
   1126 * CPU can safely enter RCU read-side critical sections.  In other words,
   1127 * if the current CPU is not in its idle loop or is in an interrupt or
   1128 * NMI handler, return true.
   1129 *
   1130 * Make notrace because it can be called by the internal functions of
   1131 * ftrace, and making this notrace removes unnecessary recursion calls.
   1132 */
   1133notrace bool rcu_is_watching(void)
   1134{
   1135	bool ret;
   1136
   1137	preempt_disable_notrace();
   1138	ret = !rcu_dynticks_curr_cpu_in_eqs();
   1139	preempt_enable_notrace();
   1140	return ret;
   1141}
   1142EXPORT_SYMBOL_GPL(rcu_is_watching);
   1143
   1144/*
   1145 * If a holdout task is actually running, request an urgent quiescent
   1146 * state from its CPU.  This is unsynchronized, so migrations can cause
   1147 * the request to go to the wrong CPU.  Which is OK, all that will happen
   1148 * is that the CPU's next context switch will be a bit slower and next
   1149 * time around this task will generate another request.
   1150 */
   1151void rcu_request_urgent_qs_task(struct task_struct *t)
   1152{
   1153	int cpu;
   1154
   1155	barrier();
   1156	cpu = task_cpu(t);
   1157	if (!task_curr(t))
   1158		return; /* This task is not running on that CPU. */
   1159	smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
   1160}
   1161
   1162#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
   1163
   1164/*
   1165 * Is the current CPU online as far as RCU is concerned?
   1166 *
   1167 * Disable preemption to avoid false positives that could otherwise
   1168 * happen due to the current CPU number being sampled, this task being
   1169 * preempted, its old CPU being taken offline, resuming on some other CPU,
   1170 * then determining that its old CPU is now offline.
   1171 *
   1172 * Disable checking if in an NMI handler because we cannot safely
   1173 * report errors from NMI handlers anyway.  In addition, it is OK to use
   1174 * RCU on an offline processor during initial boot, hence the check for
   1175 * rcu_scheduler_fully_active.
   1176 */
   1177bool rcu_lockdep_current_cpu_online(void)
   1178{
   1179	struct rcu_data *rdp;
   1180	bool ret = false;
   1181
   1182	if (in_nmi() || !rcu_scheduler_fully_active)
   1183		return true;
   1184	preempt_disable_notrace();
   1185	rdp = this_cpu_ptr(&rcu_data);
   1186	/*
   1187	 * Strictly, we care here about the case where the current CPU is
   1188	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
   1189	 * not being up to date. So arch_spin_is_locked() might have a
   1190	 * false positive if it's held by some *other* CPU, but that's
   1191	 * OK because that just means a false *negative* on the warning.
   1192	 */
   1193	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
   1194		ret = true;
   1195	preempt_enable_notrace();
   1196	return ret;
   1197}
   1198EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
   1199
   1200#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
   1201
   1202/*
   1203 * When trying to report a quiescent state on behalf of some other CPU,
   1204 * it is our responsibility to check for and handle potential overflow
   1205 * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
   1206 * After all, the CPU might be in deep idle state, and thus executing no
   1207 * code whatsoever.
   1208 */
   1209static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
   1210{
   1211	raw_lockdep_assert_held_rcu_node(rnp);
   1212	if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
   1213			 rnp->gp_seq))
   1214		WRITE_ONCE(rdp->gpwrap, true);
   1215	if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
   1216		rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
   1217}
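
The ULONG_CMP_LT() comparisons above (defined elsewhere in the RCU headers) are written to keep working when the sequence counters wrap, and the ULONG_MAX / 4 offset leaves a generous guard band before ->gpwrap is set. A small sketch of the wrap-tolerant comparison; the CMP_LT macro below is a local restatement for illustration, not the kernel's definition.

#include <assert.h>
#include <limits.h>

#define CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long old = ULONG_MAX - 5;	/* counter just before wrapping */
	unsigned long new = old + 10;		/* wrapped past zero */

	assert(CMP_LT(old, new));		/* still ordered correctly */
	assert(!CMP_LT(new, old));
	return 0;
}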
   1218
   1219/*
   1220 * Snapshot the specified CPU's dynticks counter so that we can later
   1221 * credit them with an implicit quiescent state.  Return 1 if this CPU
   1222 * is in dynticks idle mode, which is an extended quiescent state.
   1223 */
   1224static int dyntick_save_progress_counter(struct rcu_data *rdp)
   1225{
   1226	rdp->dynticks_snap = rcu_dynticks_snap(rdp);
   1227	if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
   1228		trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
   1229		rcu_gpnum_ovf(rdp->mynode, rdp);
   1230		return 1;
   1231	}
   1232	return 0;
   1233}
   1234
   1235/*
   1236 * Return true if the specified CPU has passed through a quiescent
    1237 * state by virtue of being in or having passed through a dynticks
   1238 * idle state since the last call to dyntick_save_progress_counter()
   1239 * for this same CPU, or by virtue of having been offline.
   1240 */
   1241static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
   1242{
   1243	unsigned long jtsq;
   1244	struct rcu_node *rnp = rdp->mynode;
   1245
   1246	/*
   1247	 * If the CPU passed through or entered a dynticks idle phase with
   1248	 * no active irq/NMI handlers, then we can safely pretend that the CPU
   1249	 * already acknowledged the request to pass through a quiescent
   1250	 * state.  Either way, that CPU cannot possibly be in an RCU
   1251	 * read-side critical section that started before the beginning
   1252	 * of the current RCU grace period.
   1253	 */
   1254	if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) {
   1255		trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
   1256		rcu_gpnum_ovf(rnp, rdp);
   1257		return 1;
   1258	}
   1259
   1260	/*
   1261	 * Complain if a CPU that is considered to be offline from RCU's
   1262	 * perspective has not yet reported a quiescent state.  After all,
   1263	 * the offline CPU should have reported a quiescent state during
   1264	 * the CPU-offline process, or, failing that, by rcu_gp_init()
   1265	 * if it ran concurrently with either the CPU going offline or the
   1266	 * last task on a leaf rcu_node structure exiting its RCU read-side
   1267	 * critical section while all CPUs corresponding to that structure
   1268	 * are offline.  This added warning detects bugs in any of these
   1269	 * code paths.
   1270	 *
   1271	 * The rcu_node structure's ->lock is held here, which excludes
    1272	 * the relevant portions of the CPU-hotplug code, the grace-period
   1273	 * initialization code, and the rcu_read_unlock() code paths.
   1274	 *
   1275	 * For more detail, please refer to the "Hotplug CPU" section
   1276	 * of RCU's Requirements documentation.
   1277	 */
   1278	if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
   1279		struct rcu_node *rnp1;
   1280
   1281		pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
   1282			__func__, rnp->grplo, rnp->grphi, rnp->level,
   1283			(long)rnp->gp_seq, (long)rnp->completedqs);
   1284		for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
   1285			pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
   1286				__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
   1287		pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
   1288			__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
   1289			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
   1290			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
   1291		return 1; /* Break things loose after complaining. */
   1292	}
   1293
   1294	/*
   1295	 * A CPU running for an extended time within the kernel can
   1296	 * delay RCU grace periods: (1) At age jiffies_to_sched_qs,
   1297	 * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
   1298	 * both .rcu_need_heavy_qs and .rcu_urgent_qs.  Note that the
   1299	 * unsynchronized assignments to the per-CPU rcu_need_heavy_qs
   1300	 * variable are safe because the assignments are repeated if this
   1301	 * CPU failed to pass through a quiescent state.  This code
   1302	 * also checks .jiffies_resched in case jiffies_to_sched_qs
   1303	 * is set way high.
   1304	 */
   1305	jtsq = READ_ONCE(jiffies_to_sched_qs);
   1306	if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
   1307	    (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
   1308	     time_after(jiffies, rcu_state.jiffies_resched) ||
   1309	     rcu_state.cbovld)) {
   1310		WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
   1311		/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
   1312		smp_store_release(&rdp->rcu_urgent_qs, true);
   1313	} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
   1314		WRITE_ONCE(rdp->rcu_urgent_qs, true);
   1315	}
   1316
   1317	/*
   1318	 * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
   1319	 * The above code handles this, but only for straight cond_resched().
   1320	 * And some in-kernel loops check need_resched() before calling
   1321	 * cond_resched(), which defeats the above code for CPUs that are
   1322	 * running in-kernel with scheduling-clock interrupts disabled.
   1323	 * So hit them over the head with the resched_cpu() hammer!
   1324	 */
   1325	if (tick_nohz_full_cpu(rdp->cpu) &&
   1326	    (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
   1327	     rcu_state.cbovld)) {
   1328		WRITE_ONCE(rdp->rcu_urgent_qs, true);
   1329		resched_cpu(rdp->cpu);
   1330		WRITE_ONCE(rdp->last_fqs_resched, jiffies);
   1331	}
   1332
   1333	/*
   1334	 * If more than halfway to RCU CPU stall-warning time, invoke
   1335	 * resched_cpu() more frequently to try to loosen things up a bit.
   1336	 * Also check to see if the CPU is getting hammered with interrupts,
   1337	 * but only once per grace period, just to keep the IPIs down to
   1338	 * a dull roar.
   1339	 */
   1340	if (time_after(jiffies, rcu_state.jiffies_resched)) {
   1341		if (time_after(jiffies,
   1342			       READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
   1343			resched_cpu(rdp->cpu);
   1344			WRITE_ONCE(rdp->last_fqs_resched, jiffies);
   1345		}
   1346		if (IS_ENABLED(CONFIG_IRQ_WORK) &&
   1347		    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
   1348		    (rnp->ffmask & rdp->grpmask)) {
   1349			rdp->rcu_iw_pending = true;
   1350			rdp->rcu_iw_gp_seq = rnp->gp_seq;
   1351			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
   1352		}
   1353	}
   1354
   1355	return 0;
   1356}
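
The block comments above describe an escalation schedule for a CPU that has not yet reported a quiescent state: ask politely once the grace period is jiffies_to_sched_qs old, ask forcefully (heavy quiescent state plus urgent flag) at twice that age, and eventually start poking the CPU with resched_cpu() and irq_work. The rough sketch below maps an elapsed age onto those stages; the thresholds and strings are illustrative, not the kernel's exact decision logic.

#include <stdio.h>

/* jtsq stands in for jiffies_to_sched_qs; resched_after for jiffies_resched. */
static const char *urgency(unsigned long age, unsigned long jtsq,
			   unsigned long resched_after)
{
	if (age > resched_after)
		return "urgent + heavy + resched_cpu()/irq_work poke";
	if (age > 2 * jtsq)
		return "urgent + heavy (request momentary dyntick idle)";
	if (age > jtsq)
		return "urgent (recruit cond_resched()/softirq help)";
	return "nothing yet";
}

int main(void)
{
	unsigned long jtsq = 100, resched_after = 500;
	unsigned long ages[] = { 50, 150, 250, 600 };

	for (int i = 0; i < 4; i++)
		printf("age %4lu: %s\n", ages[i], urgency(ages[i], jtsq, resched_after));
	return 0;
}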
   1357
   1358/* Trace-event wrapper function for trace_rcu_future_grace_period.  */
   1359static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
   1360			      unsigned long gp_seq_req, const char *s)
   1361{
   1362	trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
   1363				      gp_seq_req, rnp->level,
   1364				      rnp->grplo, rnp->grphi, s);
   1365}
   1366
   1367/*
   1368 * rcu_start_this_gp - Request the start of a particular grace period
   1369 * @rnp_start: The leaf node of the CPU from which to start.
   1370 * @rdp: The rcu_data corresponding to the CPU from which to start.
   1371 * @gp_seq_req: The gp_seq of the grace period to start.
   1372 *
   1373 * Start the specified grace period, as needed to handle newly arrived
   1374 * callbacks.  The required future grace periods are recorded in each
   1375 * rcu_node structure's ->gp_seq_needed field.  Returns true if there
   1376 * is reason to awaken the grace-period kthread.
   1377 *
   1378 * The caller must hold the specified rcu_node structure's ->lock, which
   1379 * is why the caller is responsible for waking the grace-period kthread.
   1380 *
   1381 * Returns true if the GP thread needs to be awakened else false.
   1382 */
   1383static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
   1384			      unsigned long gp_seq_req)
   1385{
   1386	bool ret = false;
   1387	struct rcu_node *rnp;
   1388
   1389	/*
   1390	 * Use funnel locking to either acquire the root rcu_node
   1391	 * structure's lock or bail out if the need for this grace period
   1392	 * has already been recorded -- or if that grace period has in
   1393	 * fact already started.  If there is already a grace period in
   1394	 * progress in a non-leaf node, no recording is needed because the
   1395	 * end of the grace period will scan the leaf rcu_node structures.
   1396	 * Note that rnp_start->lock must not be released.
   1397	 */
   1398	raw_lockdep_assert_held_rcu_node(rnp_start);
   1399	trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
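       	/*
       	 * Funnel walk: rnp_start's ->lock is held across the entire walk,
       	 * while each ancestor's ->lock is held only long enough to record
       	 * the request or to notice that it is already recorded, which
       	 * limits contention on the upper levels of the rcu_node tree.
       	 */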
   1400	for (rnp = rnp_start; 1; rnp = rnp->parent) {
   1401		if (rnp != rnp_start)
   1402			raw_spin_lock_rcu_node(rnp);
   1403		if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
   1404		    rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
   1405		    (rnp != rnp_start &&
   1406		     rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
   1407			trace_rcu_this_gp(rnp, rdp, gp_seq_req,
   1408					  TPS("Prestarted"));
   1409			goto unlock_out;
   1410		}
   1411		WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
   1412		if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
   1413			/*
   1414			 * We just marked the leaf or internal node, and a
   1415			 * grace period is in progress, which means that
   1416			 * rcu_gp_cleanup() will see the marking.  Bail to
   1417			 * reduce contention.
   1418			 */
   1419			trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
   1420					  TPS("Startedleaf"));
   1421			goto unlock_out;
   1422		}
   1423		if (rnp != rnp_start && rnp->parent != NULL)
   1424			raw_spin_unlock_rcu_node(rnp);
   1425		if (!rnp->parent)
   1426			break;  /* At root, and perhaps also leaf. */
   1427	}
   1428
   1429	/* If GP already in progress, just leave, otherwise start one. */
   1430	if (rcu_gp_in_progress()) {
   1431		trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
   1432		goto unlock_out;
   1433	}
   1434	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
   1435	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
   1436	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
   1437	if (!READ_ONCE(rcu_state.gp_kthread)) {
   1438		trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
   1439		goto unlock_out;
   1440	}
   1441	trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq"));
   1442	ret = true;  /* Caller must wake GP kthread. */
   1443unlock_out:
   1444	/* Push furthest requested GP to leaf node and rcu_data structure. */
   1445	if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
   1446		WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
   1447		WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
   1448	}
   1449	if (rnp != rnp_start)
   1450		raw_spin_unlock_rcu_node(rnp);
   1451	return ret;
   1452}
   1453
   1454/*
   1455 * Clean up any old requests for the just-ended grace period.  Also return
   1456 * whether any additional grace periods have been requested.
   1457 */
   1458static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
   1459{
   1460	bool needmore;
   1461	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
   1462
   1463	needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
   1464	if (!needmore)
   1465		rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
   1466	trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
   1467			  needmore ? TPS("CleanupMore") : TPS("Cleanup"));
   1468	return needmore;
   1469}
   1470
   1471/*
   1472 * Awaken the grace-period kthread.  Don't do a self-awaken (unless in an
   1473 * interrupt or softirq handler, in which case we just might immediately
   1474 * sleep upon return, resulting in a grace-period hang), and don't bother
   1475 * awakening when there is nothing for the grace-period kthread to do
   1476 * (as when several CPUs raced to awaken it and this one lost), and finally don't try
   1477 * to awaken a kthread that has not yet been created.  If all those checks
   1478 * are passed, track some debug information and awaken.
   1479 *
   1480 * So why do the self-wakeup when in an interrupt or softirq handler
   1481 * in the grace-period kthread's context?  Because the kthread might have
   1482 * been interrupted just as it was going to sleep, and just after the final
   1483 * pre-sleep check of the awaken condition.  In this case, a wakeup really
   1484 * is required, and is therefore supplied.
   1485 */
   1486static void rcu_gp_kthread_wake(void)
   1487{
   1488	struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
   1489
   1490	if ((current == t && !in_hardirq() && !in_serving_softirq()) ||
   1491	    !READ_ONCE(rcu_state.gp_flags) || !t)
   1492		return;
   1493	WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
   1494	WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
   1495	swake_up_one(&rcu_state.gp_wq);
   1496}
   1497
   1498/*
   1499 * If there is room, assign a ->gp_seq number to any callbacks on this
   1500 * CPU that have not already been assigned.  Also accelerate any callbacks
   1501 * that were previously assigned a ->gp_seq number that has since proven
   1502 * to be too conservative, which can happen if callbacks get assigned a
   1503 * ->gp_seq number while RCU is idle, but with reference to a non-root
   1504 * rcu_node structure.  This function is idempotent, so it does not hurt
   1505 * to call it repeatedly.  Returns a flag saying that we should awaken
   1506 * the RCU grace-period kthread.
   1507 *
   1508 * The caller must hold rnp->lock with interrupts disabled.
   1509 */
   1510static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
   1511{
   1512	unsigned long gp_seq_req;
   1513	bool ret = false;
   1514
   1515	rcu_lockdep_assert_cblist_protected(rdp);
   1516	raw_lockdep_assert_held_rcu_node(rnp);
   1517
   1518	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
   1519	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
   1520		return false;
   1521
   1522	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
   1523
   1524	/*
   1525	 * Callbacks are often registered with incomplete grace-period
   1526	 * information.  Something about the fact that getting exact
   1527	 * information requires acquiring a global lock...  RCU therefore
   1528	 * makes a conservative estimate of the grace period number at which
   1529	 * a given callback will become ready to invoke.  The following
   1530	 * code checks this estimate and improves it when possible, thus
   1531	 * accelerating callback invocation to an earlier grace-period
   1532	 * number.
   1533	 */
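       	/*
       	 * Illustrative example, assuming the two low-order state bits used
       	 * by the rcu_seq_*() helpers in rcu.h: if ->gp_seq is 0x105 (a
       	 * grace period is in progress), rcu_seq_snap() returns 0x10c, the
       	 * value ->gp_seq will take once the grace period *after* the
       	 * current one has ended.  Callbacks arriving now cannot be
       	 * guaranteed to be covered by the already-started grace period.
       	 */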
   1534	gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
   1535	if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
   1536		ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);
   1537
   1538	/* Trace depending on how much we were able to accelerate. */
   1539	if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
   1540		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
   1541	else
   1542		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
   1543
   1544	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
   1545
   1546	return ret;
   1547}
   1548
   1549/*
   1550 * Similar to rcu_accelerate_cbs(), but does not require that the leaf
   1551 * rcu_node structure's ->lock be held.  It consults the cached value
   1552 * of ->gp_seq_needed in the rcu_data structure, and if that indicates
   1553 * that a new grace-period request be made, invokes rcu_accelerate_cbs()
   1554 * while holding the leaf rcu_node structure's ->lock.
   1555 */
   1556static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
   1557					struct rcu_data *rdp)
   1558{
   1559	unsigned long c;
   1560	bool needwake;
   1561
   1562	rcu_lockdep_assert_cblist_protected(rdp);
   1563	c = rcu_seq_snap(&rcu_state.gp_seq);
   1564	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
   1565		/* Old request still live, so mark recent callbacks. */
   1566		(void)rcu_segcblist_accelerate(&rdp->cblist, c);
   1567		return;
   1568	}
   1569	raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
   1570	needwake = rcu_accelerate_cbs(rnp, rdp);
   1571	raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
   1572	if (needwake)
   1573		rcu_gp_kthread_wake();
   1574}
   1575
   1576/*
   1577 * Move any callbacks whose grace period has completed to the
   1578 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
   1579 * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
   1580 * sublist.  This function is idempotent, so it does not hurt to
   1581 * invoke it repeatedly.  As long as it is not invoked -too- often...
   1582 * Returns true if the RCU grace-period kthread needs to be awakened.
   1583 *
   1584 * The caller must hold rnp->lock with interrupts disabled.
   1585 */
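       /*
        * For reference, ->cblist is segmented into RCU_DONE_TAIL (callbacks
        * whose grace period has ended and that are ready to invoke),
        * RCU_WAIT_TAIL (waiting for the current grace period),
        * RCU_NEXT_READY_TAIL (waiting for the next grace period), and
        * RCU_NEXT_TAIL (not yet associated with any grace period).
        */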
   1586static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
   1587{
   1588	rcu_lockdep_assert_cblist_protected(rdp);
   1589	raw_lockdep_assert_held_rcu_node(rnp);
   1590
   1591	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
   1592	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
   1593		return false;
   1594
   1595	/*
   1596	 * Find all callbacks whose ->gp_seq numbers indicate that they
   1597	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
   1598	 */
   1599	rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);
   1600
   1601	/* Classify any remaining callbacks. */
   1602	return rcu_accelerate_cbs(rnp, rdp);
   1603}
   1604
   1605/*
   1606 * Move and classify callbacks, but only if doing so won't require
   1607 * that the RCU grace-period kthread be awakened.
   1608 */
   1609static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
   1610						  struct rcu_data *rdp)
   1611{
   1612	rcu_lockdep_assert_cblist_protected(rdp);
   1613	if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
   1614		return;
   1615	// The grace period cannot end while we hold the rcu_node lock.
   1616	if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
   1617		WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
   1618	raw_spin_unlock_rcu_node(rnp);
   1619}
   1620
   1621/*
   1622 * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
   1623 * quiescent state.  This is intended to be invoked when the CPU notices
   1624 * a new grace period.
   1625 */
   1626static void rcu_strict_gp_check_qs(void)
   1627{
   1628	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
   1629		rcu_read_lock();
   1630		rcu_read_unlock();
   1631	}
   1632}
   1633
   1634/*
   1635 * Update CPU-local rcu_data state to record the beginnings and ends of
   1636 * grace periods.  The caller must hold the ->lock of the leaf rcu_node
   1637 * structure corresponding to the current CPU, and must have irqs disabled.
   1638 * Returns true if the grace-period kthread needs to be awakened.
   1639 */
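       /*
        * Note that rdp->gp_seq trails rnp->gp_seq whenever this CPU has yet
        * to process a grace-period beginning or end; the
        * rcu_seq_completed_gp() and rcu_seq_new_gp() checks below tell
        * those two cases apart.
        */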
   1640static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
   1641{
   1642	bool ret = false;
   1643	bool need_qs;
   1644	const bool offloaded = rcu_rdp_is_offloaded(rdp);
   1645
   1646	raw_lockdep_assert_held_rcu_node(rnp);
   1647
   1648	if (rdp->gp_seq == rnp->gp_seq)
   1649		return false; /* Nothing to do. */
   1650
   1651	/* Handle the ends of any preceding grace periods first. */
   1652	if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
   1653	    unlikely(READ_ONCE(rdp->gpwrap))) {
   1654		if (!offloaded)
   1655			ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
   1656		rdp->core_needs_qs = false;
   1657		trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
   1658	} else {
   1659		if (!offloaded)
   1660			ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
   1661		if (rdp->core_needs_qs)
   1662			rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
   1663	}
   1664
   1665	/* Now handle the beginnings of any new-to-this-CPU grace periods. */
   1666	if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
   1667	    unlikely(READ_ONCE(rdp->gpwrap))) {
   1668		/*
   1669		 * If the current grace period is waiting for this CPU,
   1670		 * set up to detect a quiescent state, otherwise don't
   1671		 * go looking for one.
   1672		 */
   1673		trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
   1674		need_qs = !!(rnp->qsmask & rdp->grpmask);
   1675		rdp->cpu_no_qs.b.norm = need_qs;
   1676		rdp->core_needs_qs = need_qs;
   1677		zero_cpu_stall_ticks(rdp);
   1678	}
   1679	rdp->gp_seq = rnp->gp_seq;  /* Remember new grace-period state. */
   1680	if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
   1681		WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
   1682	if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
   1683		WRITE_ONCE(rdp->last_sched_clock, jiffies);
   1684	WRITE_ONCE(rdp->gpwrap, false);
   1685	rcu_gpnum_ovf(rnp, rdp);
   1686	return ret;
   1687}
   1688
   1689static void note_gp_changes(struct rcu_data *rdp)
   1690{
   1691	unsigned long flags;
   1692	bool needwake;
   1693	struct rcu_node *rnp;
   1694
   1695	local_irq_save(flags);
   1696	rnp = rdp->mynode;
   1697	if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) &&
   1698	     !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
   1699	    !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
   1700		local_irq_restore(flags);
   1701		return;
   1702	}
   1703	needwake = __note_gp_changes(rnp, rdp);
   1704	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   1705	rcu_strict_gp_check_qs();
   1706	if (needwake)
   1707		rcu_gp_kthread_wake();
   1708}
   1709
   1710static atomic_t *rcu_gp_slow_suppress;
   1711
   1712/* Register a counter to suppress debugging grace-period delays. */
   1713void rcu_gp_slow_register(atomic_t *rgssp)
   1714{
   1715	WARN_ON_ONCE(rcu_gp_slow_suppress);
   1716
   1717	WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
   1718}
   1719EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
   1720
   1721/* Unregister a counter, with NULL for not caring which. */
   1722void rcu_gp_slow_unregister(atomic_t *rgssp)
   1723{
   1724	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress);
   1725
   1726	WRITE_ONCE(rcu_gp_slow_suppress, NULL);
   1727}
   1728EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
   1729
   1730static bool rcu_gp_slow_is_suppressed(void)
   1731{
   1732	atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
   1733
   1734	return rgssp && atomic_read(rgssp);
   1735}
   1736
   1737static void rcu_gp_slow(int delay)
   1738{
   1739	if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
   1740	    !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
   1741		schedule_timeout_idle(delay);
   1742}
   1743
   1744static unsigned long sleep_duration;
   1745
   1746/* Allow rcutorture to stall the grace-period kthread. */
   1747void rcu_gp_set_torture_wait(int duration)
   1748{
   1749	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0)
   1750		WRITE_ONCE(sleep_duration, duration);
   1751}
   1752EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait);
   1753
   1754/* Actually implement the aforementioned wait. */
   1755static void rcu_gp_torture_wait(void)
   1756{
   1757	unsigned long duration;
   1758
   1759	if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST))
   1760		return;
   1761	duration = xchg(&sleep_duration, 0UL);
   1762	if (duration > 0) {
   1763		pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
   1764		schedule_timeout_idle(duration);
   1765		pr_alert("%s: Wait complete\n", __func__);
   1766	}
   1767}
   1768
   1769/*
   1770 * Handler for on_each_cpu() to invoke the target CPU's RCU core
   1771 * processing.
   1772 */
   1773static void rcu_strict_gp_boundary(void *unused)
   1774{
   1775	invoke_rcu_core();
   1776}
   1777
   1778/*
   1779 * Initialize a new grace period.  Return false if no grace period required.
   1780 */
   1781static noinline_for_stack bool rcu_gp_init(void)
   1782{
   1783	unsigned long flags;
   1784	unsigned long oldmask;
   1785	unsigned long mask;
   1786	struct rcu_data *rdp;
   1787	struct rcu_node *rnp = rcu_get_root();
   1788
   1789	WRITE_ONCE(rcu_state.gp_activity, jiffies);
   1790	raw_spin_lock_irq_rcu_node(rnp);
   1791	if (!READ_ONCE(rcu_state.gp_flags)) {
   1792		/* Spurious wakeup, tell caller to go back to sleep.  */
   1793		raw_spin_unlock_irq_rcu_node(rnp);
   1794		return false;
   1795	}
   1796	WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */
   1797
   1798	if (WARN_ON_ONCE(rcu_gp_in_progress())) {
   1799		/*
   1800		 * Grace period already in progress, don't start another.
   1801		 * Not supposed to be able to happen.
   1802		 */
   1803		raw_spin_unlock_irq_rcu_node(rnp);
   1804		return false;
   1805	}
   1806
   1807	/* Advance to a new grace period and initialize state. */
   1808	record_gp_stall_check_time();
   1809	/* Record GP times before starting GP, hence rcu_seq_start(). */
   1810	rcu_seq_start(&rcu_state.gp_seq);
   1811	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
   1812	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
   1813	raw_spin_unlock_irq_rcu_node(rnp);
   1814
   1815	/*
   1816	 * Apply per-leaf buffered online and offline operations to
   1817	 * the rcu_node tree. Note that this new grace period need not
   1818	 * wait for subsequent online CPUs, and that RCU hooks in the CPU
   1819	 * offlining path, when combined with checks in this function,
   1820	 * will handle CPUs that are currently going offline or that will
   1821	 * go offline later.  Please also refer to "Hotplug CPU" section
   1822	 * of RCU's Requirements documentation.
   1823	 */
   1824	WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
   1825	/* Exclude CPU hotplug operations. */
   1826	rcu_for_each_leaf_node(rnp) {
   1827		local_irq_save(flags);
   1828		arch_spin_lock(&rcu_state.ofl_lock);
   1829		raw_spin_lock_rcu_node(rnp);
   1830		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
   1831		    !rnp->wait_blkd_tasks) {
   1832			/* Nothing to do on this leaf rcu_node structure. */
   1833			raw_spin_unlock_rcu_node(rnp);
   1834			arch_spin_unlock(&rcu_state.ofl_lock);
   1835			local_irq_restore(flags);
   1836			continue;
   1837		}
   1838
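       		/*
       		 * ->qsmaskinitnext is kept up to date by the CPU-hotplug
       		 * code as CPUs come and go; ->qsmaskinit is the stable copy
       		 * from which this grace period's ->qsmask will be
       		 * initialized.
       		 */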
   1839		/* Record old state, apply changes to ->qsmaskinit field. */
   1840		oldmask = rnp->qsmaskinit;
   1841		rnp->qsmaskinit = rnp->qsmaskinitnext;
   1842
   1843		/* If zero-ness of ->qsmaskinit changed, propagate up tree. */
   1844		if (!oldmask != !rnp->qsmaskinit) {
   1845			if (!oldmask) { /* First online CPU for rcu_node. */
   1846				if (!rnp->wait_blkd_tasks) /* Ever offline? */
   1847					rcu_init_new_rnp(rnp);
   1848			} else if (rcu_preempt_has_tasks(rnp)) {
   1849				rnp->wait_blkd_tasks = true; /* blocked tasks */
   1850			} else { /* Last offline CPU and can propagate. */
   1851				rcu_cleanup_dead_rnp(rnp);
   1852			}
   1853		}
   1854
   1855		/*
   1856		 * If all waited-on tasks from prior grace period are
   1857		 * done, and if all this rcu_node structure's CPUs are
   1858		 * still offline, propagate up the rcu_node tree and
   1859		 * clear ->wait_blkd_tasks.  Otherwise, if one of this
   1860		 * rcu_node structure's CPUs has since come back online,
   1861		 * simply clear ->wait_blkd_tasks.
   1862		 */
   1863		if (rnp->wait_blkd_tasks &&
   1864		    (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) {
   1865			rnp->wait_blkd_tasks = false;
   1866			if (!rnp->qsmaskinit)
   1867				rcu_cleanup_dead_rnp(rnp);
   1868		}
   1869
   1870		raw_spin_unlock_rcu_node(rnp);
   1871		arch_spin_unlock(&rcu_state.ofl_lock);
   1872		local_irq_restore(flags);
   1873	}
   1874	rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
   1875
   1876	/*
   1877	 * Set the quiescent-state-needed bits in all the rcu_node
   1878	 * structures for all currently online CPUs in breadth-first
   1879	 * order, starting from the root rcu_node structure, relying on the
   1880	 * layout of the tree within the rcu_state.node[] array.  Note that
   1881	 * other CPUs will access only the leaves of the hierarchy, thus
   1882	 * seeing that no grace period is in progress, at least until the
   1883	 * corresponding leaf node has been initialized.
   1884	 *
   1885	 * The grace period cannot complete until the initialization
   1886	 * process finishes, because this kthread handles both.
   1887	 */
   1888	WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
   1889	rcu_for_each_node_breadth_first(rnp) {
   1890		rcu_gp_slow(gp_init_delay);
   1891		raw_spin_lock_irqsave_rcu_node(rnp, flags);
   1892		rdp = this_cpu_ptr(&rcu_data);
   1893		rcu_preempt_check_blocked_tasks(rnp);
   1894		rnp->qsmask = rnp->qsmaskinit;
   1895		WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
   1896		if (rnp == rdp->mynode)
   1897			(void)__note_gp_changes(rnp, rdp);
   1898		rcu_preempt_boost_start_gp(rnp);
   1899		trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
   1900					    rnp->level, rnp->grplo,
   1901					    rnp->grphi, rnp->qsmask);
   1902		/* Quiescent states for tasks on any now-offline CPUs. */
   1903		mask = rnp->qsmask & ~rnp->qsmaskinitnext;
   1904		rnp->rcu_gp_init_mask = mask;
   1905		if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
   1906			rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
   1907		else
   1908			raw_spin_unlock_irq_rcu_node(rnp);
   1909		cond_resched_tasks_rcu_qs();
   1910		WRITE_ONCE(rcu_state.gp_activity, jiffies);
   1911	}
   1912
   1913	// If strict, make all CPUs aware of new grace period.
   1914	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
   1915		on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
   1916
   1917	return true;
   1918}
   1919
   1920/*
   1921 * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
   1922 * time.
   1923 */
   1924static bool rcu_gp_fqs_check_wake(int *gfp)
   1925{
   1926	struct rcu_node *rnp = rcu_get_root();
   1927
   1928	// If under overload conditions, force an immediate FQS scan.
   1929	if (*gfp & RCU_GP_FLAG_OVLD)
   1930		return true;
   1931
   1932	// Someone like call_rcu() requested a force-quiescent-state scan.
   1933	*gfp = READ_ONCE(rcu_state.gp_flags);
   1934	if (*gfp & RCU_GP_FLAG_FQS)
   1935		return true;
   1936
   1937	// The current grace period has completed.
   1938	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
   1939		return true;
   1940
   1941	return false;
   1942}
   1943
   1944/*
   1945 * Do one round of quiescent-state forcing.
   1946 */
   1947static void rcu_gp_fqs(bool first_time)
   1948{
   1949	struct rcu_node *rnp = rcu_get_root();
   1950
   1951	WRITE_ONCE(rcu_state.gp_activity, jiffies);
   1952	WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
   1953	if (first_time) {
   1954		/* Collect dyntick-idle snapshots. */
   1955		force_qs_rnp(dyntick_save_progress_counter);
   1956	} else {
   1957		/* Handle dyntick-idle and offline CPUs. */
   1958		force_qs_rnp(rcu_implicit_dynticks_qs);
   1959	}
   1960	/* Clear flag to prevent immediate re-entry. */
   1961	if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
   1962		raw_spin_lock_irq_rcu_node(rnp);
   1963		WRITE_ONCE(rcu_state.gp_flags,
   1964			   READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
   1965		raw_spin_unlock_irq_rcu_node(rnp);
   1966	}
   1967}
   1968
   1969/*
   1970 * Loop doing repeated quiescent-state forcing until the grace period ends.
   1971 */
   1972static noinline_for_stack void rcu_gp_fqs_loop(void)
   1973{
   1974	bool first_gp_fqs;
   1975	int gf = 0;
   1976	unsigned long j;
   1977	int ret;
   1978	struct rcu_node *rnp = rcu_get_root();
   1979
   1980	first_gp_fqs = true;
   1981	j = READ_ONCE(jiffies_till_first_fqs);
   1982	if (rcu_state.cbovld)
   1983		gf = RCU_GP_FLAG_OVLD;
   1984	ret = 0;
   1985	for (;;) {
   1986		if (!ret) {
   1987			WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
   1988			/*
   1989			 * jiffies_force_qs before RCU_GP_WAIT_FQS state
   1990			 * update; required for stall checks.
   1991			 */
   1992			smp_wmb();
   1993			WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
   1994				   jiffies + (j ? 3 * j : 2));
   1995		}
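       		/*
       		 * jiffies_force_qs above is when this loop will next force
       		 * quiescent states; jiffies_kick_kthreads is when the
       		 * stall-warning code may start kicking an apparently stuck
       		 * grace-period kthread.
       		 */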
   1996		trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
   1997				       TPS("fqswait"));
   1998		WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
   1999		(void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
   2000				 rcu_gp_fqs_check_wake(&gf), j);
   2001		rcu_gp_torture_wait();
   2002		WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
   2003		/* Locking provides needed memory barriers. */
   2004		/* If grace period done, leave loop. */
   2005		if (!READ_ONCE(rnp->qsmask) &&
   2006		    !rcu_preempt_blocked_readers_cgp(rnp))
   2007			break;
   2008		/* If time for quiescent-state forcing, do it. */
   2009		if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
   2010		    (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) {
   2011			trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
   2012					       TPS("fqsstart"));
   2013			rcu_gp_fqs(first_gp_fqs);
   2014			gf = 0;
   2015			if (first_gp_fqs) {
   2016				first_gp_fqs = false;
   2017				gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0;
   2018			}
   2019			trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
   2020					       TPS("fqsend"));
   2021			cond_resched_tasks_rcu_qs();
   2022			WRITE_ONCE(rcu_state.gp_activity, jiffies);
   2023			ret = 0; /* Force full wait till next FQS. */
   2024			j = READ_ONCE(jiffies_till_next_fqs);
   2025		} else {
   2026			/* Deal with stray signal. */
   2027			cond_resched_tasks_rcu_qs();
   2028			WRITE_ONCE(rcu_state.gp_activity, jiffies);
   2029			WARN_ON(signal_pending(current));
   2030			trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
   2031					       TPS("fqswaitsig"));
   2032			ret = 1; /* Keep old FQS timing. */
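       			/*
       			 * Recompute the wait so that it still expires at the
       			 * already-scheduled jiffies_force_qs, falling back to
       			 * a one-jiffy wait if that deadline has passed.
       			 */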
   2033			j = jiffies;
   2034			if (time_after(jiffies, rcu_state.jiffies_force_qs))
   2035				j = 1;
   2036			else
   2037				j = rcu_state.jiffies_force_qs - j;
   2038			gf = 0;
   2039		}
   2040	}
   2041}
   2042
   2043/*
   2044 * Clean up after the old grace period.
   2045 */
   2046static noinline void rcu_gp_cleanup(void)
   2047{
   2048	int cpu;
   2049	bool needgp = false;
   2050	unsigned long gp_duration;
   2051	unsigned long new_gp_seq;
   2052	bool offloaded;
   2053	struct rcu_data *rdp;
   2054	struct rcu_node *rnp = rcu_get_root();
   2055	struct swait_queue_head *sq;
   2056
   2057	WRITE_ONCE(rcu_state.gp_activity, jiffies);
   2058	raw_spin_lock_irq_rcu_node(rnp);
   2059	rcu_state.gp_end = jiffies;
   2060	gp_duration = rcu_state.gp_end - rcu_state.gp_start;
   2061	if (gp_duration > rcu_state.gp_max)
   2062		rcu_state.gp_max = gp_duration;
   2063
   2064	/*
   2065	 * We know the grace period is complete, but to everyone else
   2066	 * it appears to still be ongoing.  But it is also the case
   2067	 * that to everyone else it looks like there is nothing that
   2068	 * they can do to advance the grace period.  It is therefore
   2069	 * safe for us to drop the lock in order to mark the grace
   2070	 * period as completed in all of the rcu_node structures.
   2071	 */
   2072	raw_spin_unlock_irq_rcu_node(rnp);
   2073
   2074	/*
   2075	 * Propagate new ->gp_seq value to rcu_node structures so that
   2076	 * other CPUs don't have to wait until the start of the next grace
   2077	 * period to process their callbacks.  This also avoids some nasty
   2078	 * RCU grace-period initialization races by forcing the end of
   2079	 * the current grace period to be completely recorded in all of
   2080	 * the rcu_node structures before the beginning of the next grace
   2081	 * period is recorded in any of the rcu_node structures.
   2082	 */
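       	/*
       	 * rcu_seq_end() clears the low-order state bits and advances the
       	 * counter, for example (hypothetical value) from 0x105 to 0x108,
       	 * marking the grace period as idle.
       	 */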
   2083	new_gp_seq = rcu_state.gp_seq;
   2084	rcu_seq_end(&new_gp_seq);
   2085	rcu_for_each_node_breadth_first(rnp) {
   2086		raw_spin_lock_irq_rcu_node(rnp);
   2087		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
   2088			dump_blkd_tasks(rnp, 10);
   2089		WARN_ON_ONCE(rnp->qsmask);
   2090		WRITE_ONCE(rnp->gp_seq, new_gp_seq);
   2091		rdp = this_cpu_ptr(&rcu_data);
   2092		if (rnp == rdp->mynode)
   2093			needgp = __note_gp_changes(rnp, rdp) || needgp;
   2094		/* smp_mb() provided by prior unlock-lock pair. */
   2095		needgp = rcu_future_gp_cleanup(rnp) || needgp;
   2096		// Reset overload indication for CPUs no longer overloaded
   2097		if (rcu_is_leaf_node(rnp))
   2098			for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
   2099				rdp = per_cpu_ptr(&rcu_data, cpu);
   2100				check_cb_ovld_locked(rdp, rnp);
   2101			}
   2102		sq = rcu_nocb_gp_get(rnp);
   2103		raw_spin_unlock_irq_rcu_node(rnp);
   2104		rcu_nocb_gp_cleanup(sq);
   2105		cond_resched_tasks_rcu_qs();
   2106		WRITE_ONCE(rcu_state.gp_activity, jiffies);
   2107		rcu_gp_slow(gp_cleanup_delay);
   2108	}
   2109	rnp = rcu_get_root();
   2110	raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
   2111
   2112	/* Declare grace period done, trace first to use old GP number. */
   2113	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
   2114	rcu_seq_end(&rcu_state.gp_seq);
   2115	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
   2116	WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
   2117	/* Check for GP requests since above loop. */
   2118	rdp = this_cpu_ptr(&rcu_data);
   2119	if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
   2120		trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed,
   2121				  TPS("CleanupMore"));
   2122		needgp = true;
   2123	}
   2124	/* Advance CBs to reduce false positives below. */
   2125	offloaded = rcu_rdp_is_offloaded(rdp);
   2126	if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
   2127
   2128		// We get here if a grace period was needed (“needgp”)
   2129		// and the above call to rcu_accelerate_cbs() did not set
   2130		// the RCU_GP_FLAG_INIT bit in ->gp_flags (which records
   2131		// the need for another grace period).  The purpose
   2132		// of the “offloaded” check is to avoid invoking
   2133		// rcu_accelerate_cbs() on an offloaded CPU because we do not
   2134		// hold the ->nocb_lock needed to safely access an offloaded
   2135		// ->cblist.  We do not want to acquire that lock because
   2136		// it can be heavily contended during callback floods.
   2137
   2138		WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
   2139		WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
   2140		trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
   2141	} else {
   2142
   2143		// We get here either if there is no need for an
   2144		// additional grace period or if rcu_accelerate_cbs() has
   2145		// already set the RCU_GP_FLAG_INIT bit in ->gp_flags.
   2146		// So all we need to do is to clear all of the other
   2147		// ->gp_flags bits.
   2148
   2149		WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
   2150	}
   2151	raw_spin_unlock_irq_rcu_node(rnp);
   2152
   2153	// If strict, make all CPUs aware of the end of the old grace period.
   2154	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
   2155		on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
   2156}
   2157
   2158/*
   2159 * Body of kthread that handles grace periods.
   2160 */
   2161static int __noreturn rcu_gp_kthread(void *unused)
   2162{
   2163	rcu_bind_gp_kthread();
   2164	for (;;) {
   2165
   2166		/* Handle grace-period start. */
   2167		for (;;) {
   2168			trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
   2169					       TPS("reqwait"));
   2170			WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
   2171			swait_event_idle_exclusive(rcu_state.gp_wq,
   2172					 READ_ONCE(rcu_state.gp_flags) &
   2173					 RCU_GP_FLAG_INIT);
   2174			rcu_gp_torture_wait();
   2175			WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
   2176			/* Locking provides needed memory barrier. */
   2177			if (rcu_gp_init())
   2178				break;
   2179			cond_resched_tasks_rcu_qs();
   2180			WRITE_ONCE(rcu_state.gp_activity, jiffies);
   2181			WARN_ON(signal_pending(current));
   2182			trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
   2183					       TPS("reqwaitsig"));
   2184		}
   2185
   2186		/* Handle quiescent-state forcing. */
   2187		rcu_gp_fqs_loop();
   2188
   2189		/* Handle grace-period end. */
   2190		WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
   2191		rcu_gp_cleanup();
   2192		WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
   2193	}
   2194}
   2195
   2196/*
   2197 * Report a full set of quiescent states to the rcu_state data structure.
   2198 * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
   2199 * another grace period is required.  Whether we wake the grace-period
   2200 * kthread or it awakens itself for the next round of quiescent-state
   2201 * forcing, that kthread will clean up after the just-completed grace
   2202 * period.  Note that the caller must hold rnp->lock, which is released
   2203 * before return.
   2204 */
   2205static void rcu_report_qs_rsp(unsigned long flags)
   2206	__releases(rcu_get_root()->lock)
   2207{
   2208	raw_lockdep_assert_held_rcu_node(rcu_get_root());
   2209	WARN_ON_ONCE(!rcu_gp_in_progress());
   2210	WRITE_ONCE(rcu_state.gp_flags,
   2211		   READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
   2212	raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
   2213	rcu_gp_kthread_wake();
   2214}
   2215
   2216/*
   2217 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
   2218 * Allows quiescent states for a group of CPUs to be reported at one go
   2219 * to the specified rcu_node structure, though all the CPUs in the group
   2220 * must be represented by the same rcu_node structure (which need not be a
   2221 * leaf rcu_node structure, though it often will be).  The gps parameter
   2222 * is the grace-period snapshot, which means that the quiescent states
   2223 * are valid only if rnp->gp_seq is equal to gps.  That structure's lock
   2224 * must be held upon entry, and it is released before return.
   2225 *
   2226 * As a special case, if mask is zero, the bit-already-cleared check is
   2227 * disabled.  This allows propagating quiescent state due to resumed tasks
   2228 * during grace-period initialization.
   2229 */
   2230static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
   2231			      unsigned long gps, unsigned long flags)
   2232	__releases(rnp->lock)
   2233{
   2234	unsigned long oldmask = 0;
   2235	struct rcu_node *rnp_c;
   2236
   2237	raw_lockdep_assert_held_rcu_node(rnp);
   2238
   2239	/* Walk up the rcu_node hierarchy. */
   2240	for (;;) {
   2241		if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) {
   2242
   2243			/*
   2244			 * Our bit has already been cleared, or the
   2245			 * relevant grace period is already over, so done.
   2246			 */
   2247			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   2248			return;
   2249		}
   2250		WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
   2251		WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
   2252			     rcu_preempt_blocked_readers_cgp(rnp));
   2253		WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
   2254		trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
   2255						 mask, rnp->qsmask, rnp->level,
   2256						 rnp->grplo, rnp->grphi,
   2257						 !!rnp->gp_tasks);
   2258		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
   2259
   2260			/* Other bits still set at this level, so done. */
   2261			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   2262			return;
   2263		}
   2264		rnp->completedqs = rnp->gp_seq;
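       		/*
       		 * This level is now clear, so report upward: ->grpmask is
       		 * this rcu_node structure's own bit within its parent's
       		 * ->qsmask and related masks.
       		 */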
   2265		mask = rnp->grpmask;
   2266		if (rnp->parent == NULL) {
   2267
   2268			/* No more levels.  Exit loop holding root lock. */
   2269
   2270			break;
   2271		}
   2272		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   2273		rnp_c = rnp;
   2274		rnp = rnp->parent;
   2275		raw_spin_lock_irqsave_rcu_node(rnp, flags);
   2276		oldmask = READ_ONCE(rnp_c->qsmask);
   2277	}
   2278
   2279	/*
   2280	 * Get here if we are the last CPU to pass through a quiescent
   2281	 * state for this grace period.  Invoke rcu_report_qs_rsp()
   2282	 * to clean up and start the next grace period if one is needed.
   2283	 */
   2284	rcu_report_qs_rsp(flags); /* releases rnp->lock. */
   2285}
   2286
   2287/*
   2288 * Record a quiescent state for all tasks that were previously queued
   2289 * on the specified rcu_node structure and that were blocking the current
   2290 * RCU grace period.  The caller must hold the corresponding rnp->lock with
   2291 * irqs disabled, and this lock is released upon return, but irqs remain
   2292 * disabled.
   2293 */
   2294static void __maybe_unused
   2295rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
   2296	__releases(rnp->lock)
   2297{
   2298	unsigned long gps;
   2299	unsigned long mask;
   2300	struct rcu_node *rnp_p;
   2301
   2302	raw_lockdep_assert_held_rcu_node(rnp);
   2303	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
   2304	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
   2305	    rnp->qsmask != 0) {
   2306		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   2307		return;  /* Still need more quiescent states! */
   2308	}
   2309
   2310	rnp->completedqs = rnp->gp_seq;
   2311	rnp_p = rnp->parent;
   2312	if (rnp_p == NULL) {
   2313		/*
   2314		 * Only one rcu_node structure in the tree, so don't
   2315		 * try to report up to its nonexistent parent!
   2316		 */
   2317		rcu_report_qs_rsp(flags);
   2318		return;
   2319	}
   2320
   2321	/* Report up the rest of the hierarchy, tracking current ->gp_seq. */
   2322	gps = rnp->gp_seq;
   2323	mask = rnp->grpmask;
   2324	raw_spin_unlock_rcu_node(rnp);	/* irqs remain disabled. */
   2325	raw_spin_lock_rcu_node(rnp_p);	/* irqs already disabled. */
   2326	rcu_report_qs_rnp(mask, rnp_p, gps, flags);
   2327}
   2328
   2329/*
   2330 * Record a quiescent state for the specified CPU to that CPU's rcu_data
   2331 * structure.  This must be called from the specified CPU.
   2332 */
   2333static void
   2334rcu_report_qs_rdp(struct rcu_data *rdp)
   2335{
   2336	unsigned long flags;
   2337	unsigned long mask;
   2338	bool needwake = false;
   2339	bool needacc = false;
   2340	struct rcu_node *rnp;
   2341
   2342	WARN_ON_ONCE(rdp->cpu != smp_processor_id());
   2343	rnp = rdp->mynode;
   2344	raw_spin_lock_irqsave_rcu_node(rnp, flags);
   2345	if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
   2346	    rdp->gpwrap) {
   2347
   2348		/*
   2349		 * The grace period in which this quiescent state was
   2350		 * recorded has ended, so don't report it upwards.
   2351		 * We will instead need a new quiescent state that lies
   2352		 * within the current grace period.
   2353		 */
   2354		rdp->cpu_no_qs.b.norm = true;	/* need qs for new gp. */
   2355		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   2356		return;
   2357	}
   2358	mask = rdp->grpmask;
   2359	rdp->core_needs_qs = false;
   2360	if ((rnp->qsmask & mask) == 0) {
   2361		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   2362	} else {
   2363		/*
   2364		 * This GP can't end until cpu checks in, so all of our
   2365		 * This GP can't end until this CPU checks in, so all of our
   2366		 *
   2367		 * NOCB kthreads have their own way to deal with that...
   2368		 */
   2369		if (!rcu_rdp_is_offloaded(rdp)) {
   2370			needwake = rcu_accelerate_cbs(rnp, rdp);
   2371		} else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
   2372			/*
   2373			 * ...but NOCB kthreads may miss or delay callbacks acceleration
   2374			 * if in the middle of a (de-)offloading process.
   2375			 */
   2376			needacc = true;
   2377		}
   2378
   2379		rcu_disable_urgency_upon_qs(rdp);
   2380		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
   2381		/* ^^^ Released rnp->lock */
   2382		if (needwake)
   2383			rcu_gp_kthread_wake();
   2384
   2385		if (needacc) {
   2386			rcu_nocb_lock_irqsave(rdp, flags);
   2387			rcu_accelerate_cbs_unlocked(rnp, rdp);
   2388			rcu_nocb_unlock_irqrestore(rdp, flags);
   2389		}
   2390	}
   2391}
   2392
   2393/*
   2394 * Check to see if there is a new grace period of which this CPU
   2395 * is not yet aware, and if so, set up local rcu_data state for it.
   2396 * Otherwise, see if this CPU has just passed through its first
   2397 * quiescent state for this grace period, and record that fact if so.
   2398 */
   2399static void
   2400rcu_check_quiescent_state(struct rcu_data *rdp)
   2401{
   2402	/* Check for grace-period ends and beginnings. */
   2403	note_gp_changes(rdp);
   2404
   2405	/*
   2406	 * Does this CPU still need to do its part for current grace period?
   2407	 * If no, return and let the other CPUs do their part as well.
   2408	 */
   2409	if (!rdp->core_needs_qs)
   2410		return;
   2411
   2412	/*
   2413	 * Was there a quiescent state since the beginning of the grace
   2414	 * period? If no, then exit and wait for the next call.
   2415	 */
   2416	if (rdp->cpu_no_qs.b.norm)
   2417		return;
   2418
   2419	/*
   2420	 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
   2421	 * judge of that).
   2422	 */
   2423	rcu_report_qs_rdp(rdp);
   2424}
   2425
   2426/*
   2427 * Near the end of the offline process.  Trace the fact that this CPU
   2428 * is going offline.
   2429 */
   2430int rcutree_dying_cpu(unsigned int cpu)
   2431{
   2432	bool blkd;
   2433	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   2434	struct rcu_node *rnp = rdp->mynode;
   2435
   2436	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
   2437		return 0;
   2438
   2439	blkd = !!(rnp->qsmask & rdp->grpmask);
   2440	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
   2441			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
   2442	return 0;
   2443}
   2444
   2445/*
   2446 * All CPUs for the specified rcu_node structure have gone offline,
   2447 * and all tasks that were preempted within an RCU read-side critical
   2448 * section while running on one of those CPUs have since exited their RCU
   2449 * read-side critical section.  Some other CPU is reporting this fact with
   2450 * the specified rcu_node structure's ->lock held and interrupts disabled.
   2451 * This function therefore goes up the tree of rcu_node structures,
   2452 * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
   2453 * the leaf rcu_node structure's ->qsmaskinit field has already been
   2454 * updated.
   2455 *
   2456 * This function does check that the specified rcu_node structure has
   2457 * all CPUs offline and no blocked tasks, so it is OK to invoke it
   2458 * prematurely.  That said, invoking it after the fact will cost you
   2459 * a needless lock acquisition.  So once it has done its work, don't
   2460 * invoke it again.
   2461 */
   2462static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
   2463{
   2464	long mask;
   2465	struct rcu_node *rnp = rnp_leaf;
   2466
   2467	raw_lockdep_assert_held_rcu_node(rnp_leaf);
   2468	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
   2469	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
   2470	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
   2471		return;
   2472	for (;;) {
   2473		mask = rnp->grpmask;
   2474		rnp = rnp->parent;
   2475		if (!rnp)
   2476			break;
   2477		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
   2478		rnp->qsmaskinit &= ~mask;
   2479		/* Between grace periods, so better already be zero! */
   2480		WARN_ON_ONCE(rnp->qsmask);
   2481		if (rnp->qsmaskinit) {
   2482			raw_spin_unlock_rcu_node(rnp);
   2483			/* irqs remain disabled. */
   2484			return;
   2485		}
   2486		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
   2487	}
   2488}
   2489
   2490/*
   2491 * The CPU has been completely removed, and some other CPU is reporting
   2492 * this fact from process context.  Do the remainder of the cleanup.
   2493 * There can only be one CPU hotplug operation at a time, so no need for
   2494 * explicit locking.
   2495 */
   2496int rcutree_dead_cpu(unsigned int cpu)
   2497{
   2498	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   2499	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
   2500
   2501	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
   2502		return 0;
   2503
   2504	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
   2505	/* Adjust any no-longer-needed kthreads. */
   2506	rcu_boost_kthread_setaffinity(rnp, -1);
   2507	// Stop-machine done, so allow nohz_full to disable tick.
   2508	tick_dep_clear(TICK_DEP_BIT_RCU);
   2509	return 0;
   2510}
   2511
   2512/*
   2513 * Invoke any RCU callbacks that have made it to the end of their grace
   2514 * period.  Throttle as specified by rdp->blimit.
   2515 */
   2516static void rcu_do_batch(struct rcu_data *rdp)
   2517{
   2518	int div;
   2519	bool __maybe_unused empty;
   2520	unsigned long flags;
   2521	struct rcu_head *rhp;
   2522	struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
   2523	long bl, count = 0;
   2524	long pending, tlimit = 0;
   2525
   2526	/* If no callbacks are ready, just return. */
   2527	if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
   2528		trace_rcu_batch_start(rcu_state.name,
   2529				      rcu_segcblist_n_cbs(&rdp->cblist), 0);
   2530		trace_rcu_batch_end(rcu_state.name, 0,
   2531				    !rcu_segcblist_empty(&rdp->cblist),
   2532				    need_resched(), is_idle_task(current),
   2533				    rcu_is_callbacks_kthread());
   2534		return;
   2535	}
   2536
   2537	/*
   2538	 * Extract the list of ready callbacks, disabling IRQs to prevent
   2539	 * races with call_rcu() from interrupt handlers.  Leave the
   2540	 * callback counts, as rcu_barrier() needs to be conservative.
   2541	 */
   2542	rcu_nocb_lock_irqsave(rdp, flags);
   2543	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
   2544	pending = rcu_segcblist_n_cbs(&rdp->cblist);
   2545	div = READ_ONCE(rcu_divisor);
   2546	div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
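       	/*
       	 * Worked example with hypothetical numbers, assuming the usual
       	 * defaults of blimit = 10 and rcu_divisor = 7: with 25600 pending
       	 * callbacks, bl = max(10, 25600 >> 7) = 200 callbacks per batch.
       	 */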
   2547	bl = max(rdp->blimit, pending >> div);
   2548	if (in_serving_softirq() && unlikely(bl > 100)) {
   2549		long rrn = READ_ONCE(rcu_resched_ns);
   2550
   2551		rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
   2552		tlimit = local_clock() + rrn;
   2553	}
   2554	trace_rcu_batch_start(rcu_state.name,
   2555			      rcu_segcblist_n_cbs(&rdp->cblist), bl);
   2556	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
   2557	if (rcu_rdp_is_offloaded(rdp))
   2558		rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
   2559
   2560	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
   2561	rcu_nocb_unlock_irqrestore(rdp, flags);
   2562
   2563	/* Invoke callbacks. */
   2564	tick_dep_set_task(current, TICK_DEP_BIT_RCU);
   2565	rhp = rcu_cblist_dequeue(&rcl);
   2566
   2567	for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
   2568		rcu_callback_t f;
   2569
   2570		count++;
   2571		debug_rcu_head_unqueue(rhp);
   2572
   2573		rcu_lock_acquire(&rcu_callback_map);
   2574		trace_rcu_invoke_callback(rcu_state.name, rhp);
   2575
   2576		f = rhp->func;
   2577		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
   2578		f(rhp);
   2579
   2580		rcu_lock_release(&rcu_callback_map);
   2581
   2582		/*
   2583		 * Stop only if limit reached and CPU has something to do.
   2584		 */
   2585		if (in_serving_softirq()) {
   2586			if (count >= bl && (need_resched() || !is_idle_task(current)))
   2587				break;
   2588			/*
   2589			 * Make sure we don't spend too much time here and deprive other
   2590			 * softirq vectors of CPU cycles.
   2591			 */
   2592			if (unlikely(tlimit)) {
   2593				/* only call local_clock() every 32 callbacks */
   2594				if (likely((count & 31) || local_clock() < tlimit))
   2595					continue;
   2596				/* Exceeded the time limit, so leave. */
   2597				break;
   2598			}
   2599		} else {
   2600			local_bh_enable();
   2601			lockdep_assert_irqs_enabled();
   2602			cond_resched_tasks_rcu_qs();
   2603			lockdep_assert_irqs_enabled();
   2604			local_bh_disable();
   2605		}
   2606	}
   2607
   2608	rcu_nocb_lock_irqsave(rdp, flags);
   2609	rdp->n_cbs_invoked += count;
   2610	trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
   2611			    is_idle_task(current), rcu_is_callbacks_kthread());
   2612
   2613	/* Update counts and requeue any remaining callbacks. */
   2614	rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
   2615	rcu_segcblist_add_len(&rdp->cblist, -count);
   2616
   2617	/* Reinstate batch limit if we have worked down the excess. */
   2618	count = rcu_segcblist_n_cbs(&rdp->cblist);
   2619	if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
   2620		rdp->blimit = blimit;
   2621
   2622	/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
   2623	if (count == 0 && rdp->qlen_last_fqs_check != 0) {
   2624		rdp->qlen_last_fqs_check = 0;
   2625		rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
   2626	} else if (count < rdp->qlen_last_fqs_check - qhimark)
   2627		rdp->qlen_last_fqs_check = count;
   2628
   2629	/*
   2630	 * The following usually indicates a double call_rcu().  To track
   2631	 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
   2632	 */
   2633	empty = rcu_segcblist_empty(&rdp->cblist);
   2634	WARN_ON_ONCE(count == 0 && !empty);
   2635	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
   2636		     count != 0 && empty);
   2637	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
   2638	WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
   2639
   2640	rcu_nocb_unlock_irqrestore(rdp, flags);
   2641
   2642	tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
   2643}
   2644
   2645/*
   2646 * This function is invoked from each scheduling-clock interrupt,
   2647 * and checks to see if this CPU is in a non-context-switch quiescent
   2648 * state, for example, user mode or idle loop.  It also schedules RCU
   2649 * core processing.  If the current grace period has gone on too long,
   2650 * it will ask the scheduler to manufacture a context switch for the sole
   2651 * purpose of providing the needed quiescent state.
   2652 */
   2653void rcu_sched_clock_irq(int user)
   2654{
   2655	unsigned long j;
   2656
   2657	if (IS_ENABLED(CONFIG_PROVE_RCU)) {
   2658		j = jiffies;
   2659		WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
   2660		__this_cpu_write(rcu_data.last_sched_clock, j);
   2661	}
   2662	trace_rcu_utilization(TPS("Start scheduler-tick"));
   2663	lockdep_assert_irqs_disabled();
   2664	raw_cpu_inc(rcu_data.ticks_this_gp);
   2665	/* The load-acquire pairs with the store-release setting to true. */
   2666	if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
   2667		/* Idle and userspace execution already are quiescent states. */
   2668		if (!rcu_is_cpu_rrupt_from_idle() && !user) {
   2669			set_tsk_need_resched(current);
   2670			set_preempt_need_resched();
   2671		}
   2672		__this_cpu_write(rcu_data.rcu_urgent_qs, false);
   2673	}
   2674	rcu_flavor_sched_clock_irq(user);
   2675	if (rcu_pending(user))
   2676		invoke_rcu_core();
   2677	if (user)
   2678		rcu_tasks_classic_qs(current, false);
   2679	lockdep_assert_irqs_disabled();
   2680
   2681	trace_rcu_utilization(TPS("End scheduler-tick"));
   2682}
   2683
   2684/*
   2685 * Scan the leaf rcu_node structures.  For each structure on which all
   2686 * CPUs have reported a quiescent state and on which there are tasks
   2687 * blocking the current grace period, initiate RCU priority boosting.
   2688 * Otherwise, invoke the specified function to check dyntick state for
   2689 * each CPU that has not yet reported a quiescent state.
   2690 */
   2691static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
   2692{
   2693	int cpu;
   2694	unsigned long flags;
   2695	unsigned long mask;
   2696	struct rcu_data *rdp;
   2697	struct rcu_node *rnp;
   2698
   2699	rcu_state.cbovld = rcu_state.cbovldnext;
   2700	rcu_state.cbovldnext = false;
   2701	rcu_for_each_leaf_node(rnp) {
   2702		cond_resched_tasks_rcu_qs();
   2703		mask = 0;
   2704		raw_spin_lock_irqsave_rcu_node(rnp, flags);
   2705		rcu_state.cbovldnext |= !!rnp->cbovldmask;
   2706		if (rnp->qsmask == 0) {
   2707			if (rcu_preempt_blocked_readers_cgp(rnp)) {
   2708				/*
   2709				 * No point in scanning bits because they
   2710				 * are all zero.  But we might need to
   2711				 * priority-boost blocked readers.
   2712				 */
   2713				rcu_initiate_boost(rnp, flags);
   2714				/* rcu_initiate_boost() releases rnp->lock */
   2715				continue;
   2716			}
   2717			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   2718			continue;
   2719		}
   2720		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
   2721			rdp = per_cpu_ptr(&rcu_data, cpu);
   2722			if (f(rdp)) {
   2723				mask |= rdp->grpmask;
   2724				rcu_disable_urgency_upon_qs(rdp);
   2725			}
   2726		}
   2727		if (mask != 0) {
   2728			/* Idle/offline CPUs, report (releases rnp->lock). */
   2729			rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
   2730		} else {
   2731			/* Nothing to do here, so just drop the lock. */
   2732			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   2733		}
   2734	}
   2735}
   2736
   2737/*
   2738 * Force quiescent states on reluctant CPUs, and also detect which
   2739 * CPUs are in dyntick-idle mode.
   2740 */
   2741void rcu_force_quiescent_state(void)
   2742{
   2743	unsigned long flags;
   2744	bool ret;
   2745	struct rcu_node *rnp;
   2746	struct rcu_node *rnp_old = NULL;
   2747
   2748	/* Funnel through hierarchy to reduce memory contention. */
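       	/*
       	 * At each level, an already-set RCU_GP_FLAG_FQS or a failed
       	 * trylock of ->fqslock means that some other CPU is already
       	 * pushing a force-quiescent-state request toward the root, so
       	 * this CPU can safely give up.
       	 */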
   2749	rnp = __this_cpu_read(rcu_data.mynode);
   2750	for (; rnp != NULL; rnp = rnp->parent) {
   2751		ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
   2752		       !raw_spin_trylock(&rnp->fqslock);
   2753		if (rnp_old != NULL)
   2754			raw_spin_unlock(&rnp_old->fqslock);
   2755		if (ret)
   2756			return;
   2757		rnp_old = rnp;
   2758	}
   2759	/* rnp_old == rcu_get_root(), rnp == NULL. */
   2760
   2761	/* Reached the root of the rcu_node tree, acquire lock. */
   2762	raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
   2763	raw_spin_unlock(&rnp_old->fqslock);
   2764	if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
   2765		raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
   2766		return;  /* Someone beat us to it. */
   2767	}
   2768	WRITE_ONCE(rcu_state.gp_flags,
   2769		   READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
   2770	raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
   2771	rcu_gp_kthread_wake();
   2772}
   2773EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
   2774
   2775// Workqueue handler for an RCU reader for kernels enforcing strict RCU
   2776// grace periods.
   2777static void strict_work_handler(struct work_struct *work)
   2778{
   2779	rcu_read_lock();
   2780	rcu_read_unlock();
   2781}
   2782
   2783/* Perform RCU core processing work for the current CPU.  */
   2784static __latent_entropy void rcu_core(void)
   2785{
   2786	unsigned long flags;
   2787	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
   2788	struct rcu_node *rnp = rdp->mynode;
   2789	/*
   2790	 * On RT rcu_core() can be preempted when IRQs aren't disabled.
   2791	 * Therefore this function can race with concurrent NOCB (de-)offloading
   2792	 * on this CPU and the below condition must be considered volatile.
   2793	 * However if we race with:
   2794	 *
   2795	 * _ Offloading:   In the worst case we accelerate or process callbacks
   2796	 *                 concurrently with NOCB kthreads. We are guaranteed to
   2797	 *                 call rcu_nocb_lock() if that happens.
   2798	 *
   2799	 * _ Deoffloading: In the worst case we miss callbacks acceleration or
   2800	 *                 processing. This is fine because the early stage
   2801	 *                 of deoffloading invokes rcu_core() after setting
   2802	 *                 SEGCBLIST_RCU_CORE. So we guarantee that we'll process
   2803	 *                 what could have been dismissed without the need to wait
   2804	 *                 for the next rcu_pending() check in the next jiffy.
   2805	 */
   2806	const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
   2807
   2808	if (cpu_is_offline(smp_processor_id()))
   2809		return;
   2810	trace_rcu_utilization(TPS("Start RCU core"));
   2811	WARN_ON_ONCE(!rdp->beenonline);
   2812
   2813	/* Report any deferred quiescent states if preemption enabled. */
   2814	if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
   2815		rcu_preempt_deferred_qs(current);
   2816	} else if (rcu_preempt_need_deferred_qs(current)) {
   2817		set_tsk_need_resched(current);
   2818		set_preempt_need_resched();
   2819	}
   2820
   2821	/* Update RCU state based on any recent quiescent states. */
   2822	rcu_check_quiescent_state(rdp);
   2823
   2824	/* No grace period and unregistered callbacks? */
   2825	if (!rcu_gp_in_progress() &&
   2826	    rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
   2827		rcu_nocb_lock_irqsave(rdp, flags);
   2828		if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
   2829			rcu_accelerate_cbs_unlocked(rnp, rdp);
   2830		rcu_nocb_unlock_irqrestore(rdp, flags);
   2831	}
   2832
   2833	rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
   2834
   2835	/* If there are callbacks ready, invoke them. */
   2836	if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
   2837	    likely(READ_ONCE(rcu_scheduler_fully_active))) {
   2838		rcu_do_batch(rdp);
   2839		/* Re-invoke RCU core processing if there are callbacks remaining. */
   2840		if (rcu_segcblist_ready_cbs(&rdp->cblist))
   2841			invoke_rcu_core();
   2842	}
   2843
   2844	/* Do any needed deferred wakeups of rcuo kthreads. */
   2845	do_nocb_deferred_wakeup(rdp);
   2846	trace_rcu_utilization(TPS("End RCU core"));
   2847
   2848	// If strict GPs, schedule an RCU reader in a clean environment.
   2849	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
   2850		queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
   2851}
   2852
   2853static void rcu_core_si(struct softirq_action *h)
   2854{
   2855	rcu_core();
   2856}
   2857
   2858static void rcu_wake_cond(struct task_struct *t, int status)
   2859{
   2860	/*
   2861	 * If the thread is yielding, only wake it when this
   2862	 * is invoked from idle
   2863	 */
   2864	if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
   2865		wake_up_process(t);
   2866}
   2867
   2868static void invoke_rcu_core_kthread(void)
   2869{
   2870	struct task_struct *t;
   2871	unsigned long flags;
   2872
   2873	local_irq_save(flags);
   2874	__this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
   2875	t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
   2876	if (t != NULL && t != current)
   2877		rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
   2878	local_irq_restore(flags);
   2879}
   2880
   2881/*
   2882 * Wake up this CPU's rcuc kthread to do RCU core processing.
   2883 */
   2884static void invoke_rcu_core(void)
   2885{
   2886	if (!cpu_online(smp_processor_id()))
   2887		return;
   2888	if (use_softirq)
   2889		raise_softirq(RCU_SOFTIRQ);
   2890	else
   2891		invoke_rcu_core_kthread();
   2892}
   2893
   2894static void rcu_cpu_kthread_park(unsigned int cpu)
   2895{
   2896	per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
   2897}
   2898
   2899static int rcu_cpu_kthread_should_run(unsigned int cpu)
   2900{
   2901	return __this_cpu_read(rcu_data.rcu_cpu_has_work);
   2902}
   2903
   2904/*
   2905 * Per-CPU kernel thread that invokes RCU callbacks.  This replaces
   2906 * the RCU softirq used in configurations of RCU that do not support RCU
   2907 * priority boosting.
   2908 */
   2909static void rcu_cpu_kthread(unsigned int cpu)
   2910{
   2911	unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
   2912	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
   2913	unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
   2914	int spincnt;
   2915
   2916	trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
   2917	for (spincnt = 0; spincnt < 10; spincnt++) {
   2918		WRITE_ONCE(*j, jiffies);
   2919		local_bh_disable();
   2920		*statusp = RCU_KTHREAD_RUNNING;
   2921		local_irq_disable();
   2922		work = *workp;
   2923		*workp = 0;
   2924		local_irq_enable();
   2925		if (work)
   2926			rcu_core();
   2927		local_bh_enable();
   2928		if (*workp == 0) {
   2929			trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
   2930			*statusp = RCU_KTHREAD_WAITING;
   2931			return;
   2932		}
   2933	}
   2934	*statusp = RCU_KTHREAD_YIELDING;
   2935	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
   2936	schedule_timeout_idle(2);
   2937	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
   2938	*statusp = RCU_KTHREAD_WAITING;
   2939	WRITE_ONCE(*j, jiffies);
   2940}
   2941
   2942static struct smp_hotplug_thread rcu_cpu_thread_spec = {
   2943	.store			= &rcu_data.rcu_cpu_kthread_task,
   2944	.thread_should_run	= rcu_cpu_kthread_should_run,
   2945	.thread_fn		= rcu_cpu_kthread,
   2946	.thread_comm		= "rcuc/%u",
   2947	.setup			= rcu_cpu_kthread_setup,
   2948	.park			= rcu_cpu_kthread_park,
   2949};
   2950
   2951/*
   2952 * Spawn per-CPU RCU core processing kthreads.
   2953 */
   2954static int __init rcu_spawn_core_kthreads(void)
   2955{
   2956	int cpu;
   2957
   2958	for_each_possible_cpu(cpu)
   2959		per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
   2960	if (use_softirq)
   2961		return 0;
   2962	WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
   2963		  "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
   2964	return 0;
   2965}
   2966
   2967/*
   2968 * Handle any core-RCU processing required by a call_rcu() invocation.
   2969 */
   2970static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
   2971			    unsigned long flags)
   2972{
   2973	/*
   2974	 * If called from an extended quiescent state, invoke the RCU
   2975	 * core in order to force a re-evaluation of RCU's idleness.
   2976	 */
   2977	if (!rcu_is_watching())
   2978		invoke_rcu_core();
   2979
   2980	/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
   2981	if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
   2982		return;
   2983
   2984	/*
   2985	 * Force the grace period if too many callbacks or too long waiting.
   2986	 * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
   2987	 * if some other CPU has recently done so.  Also, don't bother
   2988	 * invoking rcu_force_quiescent_state() if the newly enqueued callback
   2989	 * is the only one waiting for a grace period to complete.
   2990	 */
   2991	if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
   2992		     rdp->qlen_last_fqs_check + qhimark)) {
   2993
   2994		/* Are we ignoring a completed grace period? */
   2995		note_gp_changes(rdp);
   2996
   2997		/* Start a new grace period if one not already started. */
   2998		if (!rcu_gp_in_progress()) {
   2999			rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
   3000		} else {
   3001			/* Give the grace period a kick. */
   3002			rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
   3003			if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
   3004			    rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
   3005				rcu_force_quiescent_state();
   3006			rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
   3007			rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
   3008		}
   3009	}
   3010}
   3011
   3012/*
   3013 * RCU callback function to leak a callback.
   3014 */
   3015static void rcu_leak_callback(struct rcu_head *rhp)
   3016{
   3017}
   3018
   3019/*
   3020 * Check and if necessary update the leaf rcu_node structure's
   3021 * ->cbovldmask bit corresponding to the current CPU based on that CPU's
   3022 * number of queued RCU callbacks.  The caller must hold the leaf rcu_node
   3023 * structure's ->lock.
   3024 */
   3025static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp)
   3026{
   3027	raw_lockdep_assert_held_rcu_node(rnp);
   3028	if (qovld_calc <= 0)
   3029		return; // Early boot and wildcard value set.
   3030	if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc)
   3031		WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask);
   3032	else
   3033		WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
   3034}
   3035
   3036/*
   3037 * Check and if necessary update the leaf rcu_node structure's
   3038 * ->cbovldmask bit corresponding to the current CPU based on that CPU's
   3039 * number of queued RCU callbacks.  No locks need be held, but the
   3040 * caller must have disabled interrupts.
   3041 *
   3042 * Note that this function ignores the possibility that there are a lot
   3043 * of callbacks all of which have already seen the end of their respective
   3044 * grace periods.  This omission is due to the need for no-CBs CPUs to
   3045 * be holding ->nocb_lock to do this check, which is too heavy for a
   3046 * common-case operation.
   3047 */
   3048static void check_cb_ovld(struct rcu_data *rdp)
   3049{
   3050	struct rcu_node *const rnp = rdp->mynode;
   3051
   3052	if (qovld_calc <= 0 ||
   3053	    ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) ==
   3054	     !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
   3055		return; // Early boot wildcard value or already set correctly.
   3056	raw_spin_lock_rcu_node(rnp);
   3057	check_cb_ovld_locked(rdp, rnp);
   3058	raw_spin_unlock_rcu_node(rnp);
   3059}
   3060
   3061/**
   3062 * call_rcu() - Queue an RCU callback for invocation after a grace period.
   3063 * @head: structure to be used for queueing the RCU updates.
   3064 * @func: actual callback function to be invoked after the grace period
   3065 *
   3066 * The callback function will be invoked some time after a full grace
   3067 * period elapses, in other words after all pre-existing RCU read-side
   3068 * critical sections have completed.  However, the callback function
   3069 * might well execute concurrently with RCU read-side critical sections
   3070 * that started after call_rcu() was invoked.
   3071 *
   3072 * RCU read-side critical sections are delimited by rcu_read_lock()
   3073 * and rcu_read_unlock(), and may be nested.  In addition, but only in
   3074 * v5.0 and later, regions of code across which interrupts, preemption,
   3075 * or softirqs have been disabled also serve as RCU read-side critical
   3076 * sections.  This includes hardware interrupt handlers, softirq handlers,
   3077 * and NMI handlers.
   3078 *
   3079 * Note that all CPUs must agree that the grace period extended beyond
   3080 * all pre-existing RCU read-side critical sections.  On systems with more
   3081 * than one CPU, this means that when "func()" is invoked, each CPU is
   3082 * guaranteed to have executed a full memory barrier since the end of its
   3083 * last RCU read-side critical section whose beginning preceded the call
   3084 * to call_rcu().  It also means that each CPU executing an RCU read-side
   3085 * critical section that continues beyond the start of "func()" must have
   3086 * executed a memory barrier after the call_rcu() but before the beginning
   3087 * of that RCU read-side critical section.  Note that these guarantees
   3088 * include CPUs that are offline, idle, or executing in user mode, as
   3089 * well as CPUs that are executing in the kernel.
   3090 *
   3091 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
   3092 * resulting RCU callback function "func()", then both CPU A and CPU B are
   3093 * guaranteed to execute a full memory barrier during the time interval
   3094 * between the call to call_rcu() and the invocation of "func()" -- even
   3095 * if CPU A and CPU B are the same CPU (but again only if the system has
   3096 * more than one CPU).
   3097 *
   3098 * Implementation of these memory-ordering guarantees is described here:
   3099 * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
   3100 */
   3101void call_rcu(struct rcu_head *head, rcu_callback_t func)
   3102{
   3103	static atomic_t doublefrees;
   3104	unsigned long flags;
   3105	struct rcu_data *rdp;
   3106	bool was_alldone;
   3107
   3108	/* Misaligned rcu_head! */
   3109	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
   3110
   3111	if (debug_rcu_head_queue(head)) {
   3112		/*
   3113		 * Probable double call_rcu(), so leak the callback.
   3114		 * Use rcu:rcu_callback trace event to find the previous
   3115		 * time callback was passed to call_rcu().
   3116		 */
   3117		if (atomic_inc_return(&doublefrees) < 4) {
   3118			pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
   3119			mem_dump_obj(head);
   3120		}
   3121		WRITE_ONCE(head->func, rcu_leak_callback);
   3122		return;
   3123	}
   3124	head->func = func;
   3125	head->next = NULL;
   3126	kasan_record_aux_stack_noalloc(head);
   3127	local_irq_save(flags);
   3128	rdp = this_cpu_ptr(&rcu_data);
   3129
   3130	/* Add the callback to our list. */
   3131	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
   3132		// This can trigger due to call_rcu() from offline CPU:
   3133		WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
   3134		WARN_ON_ONCE(!rcu_is_watching());
   3135		// Very early boot, before rcu_init().  Initialize if needed
   3136		// and then drop through to queue the callback.
   3137		if (rcu_segcblist_empty(&rdp->cblist))
   3138			rcu_segcblist_init(&rdp->cblist);
   3139	}
   3140
   3141	check_cb_ovld(rdp);
   3142	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
   3143		return; // Enqueued onto ->nocb_bypass, so just leave.
   3144	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
   3145	rcu_segcblist_enqueue(&rdp->cblist, head);
   3146	if (__is_kvfree_rcu_offset((unsigned long)func))
   3147		trace_rcu_kvfree_callback(rcu_state.name, head,
   3148					 (unsigned long)func,
   3149					 rcu_segcblist_n_cbs(&rdp->cblist));
   3150	else
   3151		trace_rcu_callback(rcu_state.name, head,
   3152				   rcu_segcblist_n_cbs(&rdp->cblist));
   3153
   3154	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
   3155
   3156	/* Go handle any RCU core processing required. */
   3157	if (unlikely(rcu_rdp_is_offloaded(rdp))) {
   3158		__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
   3159	} else {
   3160		__call_rcu_core(rdp, head, flags);
   3161		local_irq_restore(flags);
   3162	}
   3163}
   3164EXPORT_SYMBOL_GPL(call_rcu);
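/*
 * Illustrative usage sketch (not part of this file): a typical call_rcu()
 * user embeds an rcu_head in the structure being protected and frees that
 * structure from the callback once the grace period has elapsed.  The
 * "struct foo", foo_cb(), and foo_release() names below are hypothetical
 * and exist only for illustration.
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_cb(struct rcu_head *rhp)
 *	{
 *		struct foo *fp = container_of(rhp, struct foo, rcu);
 *
 *		kfree(fp);
 *	}
 *
 *	static void foo_release(struct foo *fp)
 *	{
 *		// Unlink fp from all RCU-protected structures first, then:
 *		call_rcu(&fp->rcu, foo_cb);
 *	}
 */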
   3165
   3166
   3167/* Maximum number of jiffies to wait before draining a batch. */
   3168#define KFREE_DRAIN_JIFFIES (HZ / 50)
   3169#define KFREE_N_BATCHES 2
   3170#define FREE_N_CHANNELS 2
   3171
   3172/**
   3173 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
   3174 * @nr_records: Number of active pointers in the array
   3175 * @next: Next bulk object in the block chain
   3176 * @records: Array of the kvfree_rcu() pointers
   3177 */
   3178struct kvfree_rcu_bulk_data {
   3179	unsigned long nr_records;
   3180	struct kvfree_rcu_bulk_data *next;
   3181	void *records[];
   3182};
   3183
   3184/*
   3185 * This macro defines how many entries the "records" array
   3186 * will contain. It is sized so that the kvfree_rcu_bulk_data
   3187 * structure, including its records, occupies exactly one page.
   3188 */
   3189#define KVFREE_BULK_MAX_ENTR \
   3190	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
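/*
 * Illustrative arithmetic (an assumption-based sketch, not a definition):
 * with a 4 KiB PAGE_SIZE on a 64-bit kernel, sizeof(struct
 * kvfree_rcu_bulk_data) is 16 bytes (one unsigned long plus one pointer),
 * so KVFREE_BULK_MAX_ENTR evaluates to (4096 - 16) / 8 = 510 pointers per
 * page.  The exact value depends on PAGE_SIZE and on structure layout.
 */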
   3191
   3192/**
   3193 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
   3194 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
   3195 * @head_free: List of kfree_rcu() objects waiting for a grace period
   3196 * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
   3197 * @krcp: Pointer to @kfree_rcu_cpu structure
   3198 */
   3199
   3200struct kfree_rcu_cpu_work {
   3201	struct rcu_work rcu_work;
   3202	struct rcu_head *head_free;
   3203	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
   3204	struct kfree_rcu_cpu *krcp;
   3205};
   3206
   3207/**
   3208 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
   3209 * @head: List of kfree_rcu() objects not yet waiting for a grace period
   3210 * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
   3211 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
   3212 * @lock: Synchronize access to this structure
   3213 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
   3214 * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
   3215 * @initialized: The @rcu_work fields have been initialized
   3216 * @count: Number of objects for which GP not started
   3217 * @bkvcache:
   3218 *	A simple cache list that contains objects for reuse.
   3219 *	To save some per-cpu space the list is singly linked.
   3220 *	Even though the list is lockless, accesses to it must be
   3221 *	protected by the per-cpu lock.
   3222 * @page_cache_work: A work to refill the cache when it is empty
   3223 * @backoff_page_cache_fill: Delay cache refills
   3224 * @work_in_progress: Indicates that page_cache_work is running
   3225 * @hrtimer: A hrtimer for scheduling a page_cache_work
   3226 * @nr_bkv_objs: number of allocated objects at @bkvcache.
   3227 *
   3228 * This is a per-CPU structure.  The reason that it is not included in
   3229 * the rcu_data structure is to permit this code to be extracted from
   3230 * the RCU files.  Such extraction could allow further optimization of
   3231 * the interactions with the slab allocators.
   3232 */
   3233struct kfree_rcu_cpu {
   3234	struct rcu_head *head;
   3235	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
   3236	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
   3237	raw_spinlock_t lock;
   3238	struct delayed_work monitor_work;
   3239	bool monitor_todo;
   3240	bool initialized;
   3241	int count;
   3242
   3243	struct delayed_work page_cache_work;
   3244	atomic_t backoff_page_cache_fill;
   3245	atomic_t work_in_progress;
   3246	struct hrtimer hrtimer;
   3247
   3248	struct llist_head bkvcache;
   3249	int nr_bkv_objs;
   3250};
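/*
 * Illustrative data flow (a sketch of the batching described above, not
 * additional machinery): pointers queued by kvfree_call_rcu() first land
 * in the per-CPU "not yet waiting" lists, are then detached into a batch
 * that waits for a grace period, and are finally freed from workqueue
 * context.
 *
 *	kvfree_call_rcu()         kfree_rcu_monitor()          kfree_rcu_work()
 *	        |                          |                           |
 *	        v                          v                           v
 *	krcp->bkvhead[]/head --> krwp->bkvhead_free[]/ --> kfree_bulk()/vfree()/
 *	(not yet waiting)        head_free (waiting for    kvfree() after the
 *	                         a grace period)           grace period ends
 */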
   3251
   3252static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
   3253	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
   3254};
   3255
   3256static __always_inline void
   3257debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
   3258{
   3259#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
   3260	int i;
   3261
   3262	for (i = 0; i < bhead->nr_records; i++)
   3263		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
   3264#endif
   3265}
   3266
   3267static inline struct kfree_rcu_cpu *
   3268krc_this_cpu_lock(unsigned long *flags)
   3269{
   3270	struct kfree_rcu_cpu *krcp;
   3271
   3272	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
   3273	krcp = this_cpu_ptr(&krc);
   3274	raw_spin_lock(&krcp->lock);
   3275
   3276	return krcp;
   3277}
   3278
   3279static inline void
   3280krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
   3281{
   3282	raw_spin_unlock_irqrestore(&krcp->lock, flags);
   3283}
   3284
   3285static inline struct kvfree_rcu_bulk_data *
   3286get_cached_bnode(struct kfree_rcu_cpu *krcp)
   3287{
   3288	if (!krcp->nr_bkv_objs)
   3289		return NULL;
   3290
   3291	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
   3292	return (struct kvfree_rcu_bulk_data *)
   3293		llist_del_first(&krcp->bkvcache);
   3294}
   3295
   3296static inline bool
   3297put_cached_bnode(struct kfree_rcu_cpu *krcp,
   3298	struct kvfree_rcu_bulk_data *bnode)
   3299{
   3300	// Check the limit.
   3301	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
   3302		return false;
   3303
   3304	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
   3305	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
   3306	return true;
   3307}
   3308
   3309static int
   3310drain_page_cache(struct kfree_rcu_cpu *krcp)
   3311{
   3312	unsigned long flags;
   3313	struct llist_node *page_list, *pos, *n;
   3314	int freed = 0;
   3315
   3316	raw_spin_lock_irqsave(&krcp->lock, flags);
   3317	page_list = llist_del_all(&krcp->bkvcache);
   3318	WRITE_ONCE(krcp->nr_bkv_objs, 0);
   3319	raw_spin_unlock_irqrestore(&krcp->lock, flags);
   3320
   3321	llist_for_each_safe(pos, n, page_list) {
   3322		free_page((unsigned long)pos);
   3323		freed++;
   3324	}
   3325
   3326	return freed;
   3327}
   3328
   3329/*
   3330 * This function is invoked in workqueue context after a grace period.
   3331 * It frees all the objects queued on ->bkvhead_free or ->head_free.
   3332 */
   3333static void kfree_rcu_work(struct work_struct *work)
   3334{
   3335	unsigned long flags;
   3336	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
   3337	struct rcu_head *head, *next;
   3338	struct kfree_rcu_cpu *krcp;
   3339	struct kfree_rcu_cpu_work *krwp;
   3340	int i, j;
   3341
   3342	krwp = container_of(to_rcu_work(work),
   3343			    struct kfree_rcu_cpu_work, rcu_work);
   3344	krcp = krwp->krcp;
   3345
   3346	raw_spin_lock_irqsave(&krcp->lock, flags);
   3347	// Channels 1 and 2.
   3348	for (i = 0; i < FREE_N_CHANNELS; i++) {
   3349		bkvhead[i] = krwp->bkvhead_free[i];
   3350		krwp->bkvhead_free[i] = NULL;
   3351	}
   3352
   3353	// Channel 3.
   3354	head = krwp->head_free;
   3355	krwp->head_free = NULL;
   3356	raw_spin_unlock_irqrestore(&krcp->lock, flags);
   3357
   3358	// Handle the first two channels.
   3359	for (i = 0; i < FREE_N_CHANNELS; i++) {
   3360		for (; bkvhead[i]; bkvhead[i] = bnext) {
   3361			bnext = bkvhead[i]->next;
   3362			debug_rcu_bhead_unqueue(bkvhead[i]);
   3363
   3364			rcu_lock_acquire(&rcu_callback_map);
   3365			if (i == 0) { // kmalloc() / kfree().
   3366				trace_rcu_invoke_kfree_bulk_callback(
   3367					rcu_state.name, bkvhead[i]->nr_records,
   3368					bkvhead[i]->records);
   3369
   3370				kfree_bulk(bkvhead[i]->nr_records,
   3371					bkvhead[i]->records);
   3372			} else { // vmalloc() / vfree().
   3373				for (j = 0; j < bkvhead[i]->nr_records; j++) {
   3374					trace_rcu_invoke_kvfree_callback(
   3375						rcu_state.name,
   3376						bkvhead[i]->records[j], 0);
   3377
   3378					vfree(bkvhead[i]->records[j]);
   3379				}
   3380			}
   3381			rcu_lock_release(&rcu_callback_map);
   3382
   3383			raw_spin_lock_irqsave(&krcp->lock, flags);
   3384			if (put_cached_bnode(krcp, bkvhead[i]))
   3385				bkvhead[i] = NULL;
   3386			raw_spin_unlock_irqrestore(&krcp->lock, flags);
   3387
   3388			if (bkvhead[i])
   3389				free_page((unsigned long) bkvhead[i]);
   3390
   3391			cond_resched_tasks_rcu_qs();
   3392		}
   3393	}
   3394
   3395	/*
   3396	 * This is used when the "bulk" path cannot be used for the
   3397	 * double-argument variant of kvfree_rcu().  This happens when the
   3398	 * page-cache is empty, which means that objects are instead
   3399	 * queued on a linked list through their rcu_head structures.
   3400	 * This list is named "Channel 3".
   3401	 */
   3402	for (; head; head = next) {
   3403		unsigned long offset = (unsigned long)head->func;
   3404		void *ptr = (void *)head - offset;
   3405
   3406		next = head->next;
   3407		debug_rcu_head_unqueue((struct rcu_head *)ptr);
   3408		rcu_lock_acquire(&rcu_callback_map);
   3409		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
   3410
   3411		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
   3412			kvfree(ptr);
   3413
   3414		rcu_lock_release(&rcu_callback_map);
   3415		cond_resched_tasks_rcu_qs();
   3416	}
   3417}
   3418
   3419/*
   3420 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
   3421 */
   3422static void kfree_rcu_monitor(struct work_struct *work)
   3423{
   3424	struct kfree_rcu_cpu *krcp = container_of(work,
   3425		struct kfree_rcu_cpu, monitor_work.work);
   3426	unsigned long flags;
   3427	int i, j;
   3428
   3429	raw_spin_lock_irqsave(&krcp->lock, flags);
   3430
   3431	// Attempt to start a new batch.
   3432	for (i = 0; i < KFREE_N_BATCHES; i++) {
   3433		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
   3434
   3435		// Try to detach bkvhead or head and attach it to the
   3436		// corresponding free channel, if that channel is available.
   3437		// A previous RCU batch may still be in progress, in which
   3438		// case another batch cannot be queued immediately, so the
   3439		// monitor work is rearmed instead.
   3440		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
   3441			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
   3442				(krcp->head && !krwp->head_free)) {
   3443			// Channel 1 corresponds to the SLAB-pointer bulk path.
   3444			// Channel 2 corresponds to vmalloc-pointer bulk path.
   3445			for (j = 0; j < FREE_N_CHANNELS; j++) {
   3446				if (!krwp->bkvhead_free[j]) {
   3447					krwp->bkvhead_free[j] = krcp->bkvhead[j];
   3448					krcp->bkvhead[j] = NULL;
   3449				}
   3450			}
   3451
   3452			// Channel 3 corresponds to both SLAB and vmalloc
   3453			// objects queued on the linked list.
   3454			if (!krwp->head_free) {
   3455				krwp->head_free = krcp->head;
   3456				krcp->head = NULL;
   3457			}
   3458
   3459			WRITE_ONCE(krcp->count, 0);
   3460
   3461		// One work item handles one batch, and each batch
   3462		// can cover all three "free channels".  Note that
   3463		// the work may already be in the pending state,
   3464		// for example if the channels were detached one
   3465		// after the other.
   3466			queue_rcu_work(system_wq, &krwp->rcu_work);
   3467		}
   3468	}
   3469
   3470	// If there is nothing left to detach, then our job here is
   3471	// successfully done.  However, if at least one of the channels
   3472	// is still busy, rearm the work so that another attempt is
   3473	// made later, because the previous batches are still in
   3474	// progress.
   3475	if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
   3476		krcp->monitor_todo = false;
   3477	else
   3478		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
   3479
   3480	raw_spin_unlock_irqrestore(&krcp->lock, flags);
   3481}
   3482
   3483static enum hrtimer_restart
   3484schedule_page_work_fn(struct hrtimer *t)
   3485{
   3486	struct kfree_rcu_cpu *krcp =
   3487		container_of(t, struct kfree_rcu_cpu, hrtimer);
   3488
   3489	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
   3490	return HRTIMER_NORESTART;
   3491}
   3492
   3493static void fill_page_cache_func(struct work_struct *work)
   3494{
   3495	struct kvfree_rcu_bulk_data *bnode;
   3496	struct kfree_rcu_cpu *krcp =
   3497		container_of(work, struct kfree_rcu_cpu,
   3498			page_cache_work.work);
   3499	unsigned long flags;
   3500	int nr_pages;
   3501	bool pushed;
   3502	int i;
   3503
   3504	nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
   3505		1 : rcu_min_cached_objs;
   3506
   3507	for (i = 0; i < nr_pages; i++) {
   3508		bnode = (struct kvfree_rcu_bulk_data *)
   3509			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
   3510
   3511		if (bnode) {
   3512			raw_spin_lock_irqsave(&krcp->lock, flags);
   3513			pushed = put_cached_bnode(krcp, bnode);
   3514			raw_spin_unlock_irqrestore(&krcp->lock, flags);
   3515
   3516			if (!pushed) {
   3517				free_page((unsigned long) bnode);
   3518				break;
   3519			}
   3520		}
   3521	}
   3522
   3523	atomic_set(&krcp->work_in_progress, 0);
   3524	atomic_set(&krcp->backoff_page_cache_fill, 0);
   3525}
   3526
   3527static void
   3528run_page_cache_worker(struct kfree_rcu_cpu *krcp)
   3529{
   3530	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
   3531			!atomic_xchg(&krcp->work_in_progress, 1)) {
   3532		if (atomic_read(&krcp->backoff_page_cache_fill)) {
   3533			queue_delayed_work(system_wq,
   3534				&krcp->page_cache_work,
   3535					msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
   3536		} else {
   3537			hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
   3538			krcp->hrtimer.function = schedule_page_work_fn;
   3539			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
   3540		}
   3541	}
   3542}
   3543
   3544// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
   3545// state specified by flags.  If can_alloc is true, the caller must
   3546// be schedulable and not be holding any locks or mutexes that might be
   3547// acquired by the memory allocator or anything that it might invoke.
   3548// Returns true if ptr was successfully recorded, else the caller must
   3549// use a fallback.
   3550static inline bool
   3551add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
   3552	unsigned long *flags, void *ptr, bool can_alloc)
   3553{
   3554	struct kvfree_rcu_bulk_data *bnode;
   3555	int idx;
   3556
   3557	*krcp = krc_this_cpu_lock(flags);
   3558	if (unlikely(!(*krcp)->initialized))
   3559		return false;
   3560
   3561	idx = !!is_vmalloc_addr(ptr);
   3562
   3563	/* Check if a new block is required. */
   3564	if (!(*krcp)->bkvhead[idx] ||
   3565			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
   3566		bnode = get_cached_bnode(*krcp);
   3567		if (!bnode && can_alloc) {
   3568			krc_this_cpu_unlock(*krcp, *flags);
   3569
   3570			// __GFP_NORETRY - allows a light-weight direct reclaim,
   3571			// which is acceptable because it minimizes how often the
   3572			// fallback path is hit.  It also forbids invoking the OOM
   3573			// killer, which is beneficial since memory is about to be
   3574			// released anyway.
   3575			//
   3576			// __GFP_NOMEMALLOC - prevents consuming all of the memory
   3577			// reserves.  Please note that there is a fallback path.
   3578			//
   3579			// __GFP_NOWARN - the allocation is expected to fail under
   3580			// low-memory or high memory-pressure scenarios.
   3581			bnode = (struct kvfree_rcu_bulk_data *)
   3582				__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
   3583			*krcp = krc_this_cpu_lock(flags);
   3584		}
   3585
   3586		if (!bnode)
   3587			return false;
   3588
   3589		/* Initialize the new block. */
   3590		bnode->nr_records = 0;
   3591		bnode->next = (*krcp)->bkvhead[idx];
   3592
   3593		/* Attach it to the head. */
   3594		(*krcp)->bkvhead[idx] = bnode;
   3595	}
   3596
   3597	/* Finally insert. */
   3598	(*krcp)->bkvhead[idx]->records
   3599		[(*krcp)->bkvhead[idx]->nr_records++] = ptr;
   3600
   3601	return true;
   3602}
   3603
   3604/*
   3605 * Queue a request for lazy invocation of the appropriate free routine
   3606 * after a grace period.  Please note that three paths are maintained,
   3607 * two for the common case using arrays of pointers and a third one that
   3608 * is used only when the main paths cannot be used, for example, due to
   3609 * memory pressure.
   3610 *
   3611 * Each kvfree_call_rcu() request is added to a batch. The batch will be
   3612 * drained every KFREE_DRAIN_JIFFIES.  All the objects in the batch will be
   3613 * freed in workqueue context.  This allows requests to be batched together,
   3614 * reducing the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
   3615 */
   3616void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
   3617{
   3618	unsigned long flags;
   3619	struct kfree_rcu_cpu *krcp;
   3620	bool success;
   3621	void *ptr;
   3622
   3623	if (head) {
   3624		ptr = (void *) head - (unsigned long) func;
   3625	} else {
   3626		/*
   3627		 * Please note that the head-less variant has a
   3628		 * limitation, which is why there is a clear rule for
   3629		 * such objects: they may be used only from a context
   3630		 * in which might_sleep() is legal.  For other places,
   3631		 * please embed an rcu_head into your data.
   3632		 */
   3633		might_sleep();
   3634		ptr = (unsigned long *) func;
   3635	}
   3636
   3637	// Queue the object but don't yet schedule the batch.
   3638	if (debug_rcu_head_queue(ptr)) {
   3639		// Probable double kfree_rcu(), just leak.
   3640		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
   3641			  __func__, head);
   3642
   3643		// Mark as success and leave.
   3644		return;
   3645	}
   3646
   3647	kasan_record_aux_stack_noalloc(ptr);
   3648	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
   3649	if (!success) {
   3650		run_page_cache_worker(krcp);
   3651
   3652		if (head == NULL)
   3653			// Inline if kvfree_rcu(one_arg) call.
   3654			goto unlock_return;
   3655
   3656		head->func = func;
   3657		head->next = krcp->head;
   3658		krcp->head = head;
   3659		success = true;
   3660	}
   3661
   3662	WRITE_ONCE(krcp->count, krcp->count + 1);
   3663
   3664	// Set timer to drain after KFREE_DRAIN_JIFFIES.
   3665	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
   3666	    !krcp->monitor_todo) {
   3667		krcp->monitor_todo = true;
   3668		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
   3669	}
   3670
   3671unlock_return:
   3672	krc_this_cpu_unlock(krcp, flags);
   3673
   3674	/*
   3675	 * Inline kvfree() after synchronize_rcu().  This can be done
   3676	 * only from a might_sleep() context, in which the current
   3677	 * CPU is able to pass through a quiescent state.
   3678	 */
   3679	if (!success) {
   3680		debug_rcu_head_unqueue((struct rcu_head *) ptr);
   3681		synchronize_rcu();
   3682		kvfree(ptr);
   3683	}
   3684}
   3685EXPORT_SYMBOL_GPL(kvfree_call_rcu);
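/*
 * Illustrative usage sketch (not part of this file): kvfree_call_rcu() is
 * normally reached via the kfree_rcu()/kvfree_rcu() macros rather than
 * called directly.  The "struct foo" type and the fp/gp pointers below are
 * hypothetical names used only for illustration.
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	// Double-argument form: usable from any context in which call_rcu()
 *	// is legal, and requires an embedded rcu_head:
 *	kfree_rcu(fp, rcu);
 *
 *	// Single-argument (head-less) form: no rcu_head needed, but it can
 *	// fall back to synchronize_rcu() and therefore may only be used
 *	// from a might_sleep() context:
 *	kvfree_rcu(gp);
 */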
   3686
   3687static unsigned long
   3688kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
   3689{
   3690	int cpu;
   3691	unsigned long count = 0;
   3692
   3693	/* Snapshot count of all CPUs */
   3694	for_each_possible_cpu(cpu) {
   3695		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
   3696
   3697		count += READ_ONCE(krcp->count);
   3698		count += READ_ONCE(krcp->nr_bkv_objs);
   3699		atomic_set(&krcp->backoff_page_cache_fill, 1);
   3700	}
   3701
   3702	return count;
   3703}
   3704
   3705static unsigned long
   3706kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
   3707{
   3708	int cpu, freed = 0;
   3709
   3710	for_each_possible_cpu(cpu) {
   3711		int count;
   3712		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
   3713
   3714		count = krcp->count;
   3715		count += drain_page_cache(krcp);
   3716		kfree_rcu_monitor(&krcp->monitor_work.work);
   3717
   3718		sc->nr_to_scan -= count;
   3719		freed += count;
   3720
   3721		if (sc->nr_to_scan <= 0)
   3722			break;
   3723	}
   3724
   3725	return freed == 0 ? SHRINK_STOP : freed;
   3726}
   3727
   3728static struct shrinker kfree_rcu_shrinker = {
   3729	.count_objects = kfree_rcu_shrink_count,
   3730	.scan_objects = kfree_rcu_shrink_scan,
   3731	.batch = 0,
   3732	.seeks = DEFAULT_SEEKS,
   3733};
   3734
   3735void __init kfree_rcu_scheduler_running(void)
   3736{
   3737	int cpu;
   3738	unsigned long flags;
   3739
   3740	for_each_possible_cpu(cpu) {
   3741		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
   3742
   3743		raw_spin_lock_irqsave(&krcp->lock, flags);
   3744		if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
   3745				krcp->monitor_todo) {
   3746			raw_spin_unlock_irqrestore(&krcp->lock, flags);
   3747			continue;
   3748		}
   3749		krcp->monitor_todo = true;
   3750		schedule_delayed_work_on(cpu, &krcp->monitor_work,
   3751					 KFREE_DRAIN_JIFFIES);
   3752		raw_spin_unlock_irqrestore(&krcp->lock, flags);
   3753	}
   3754}
   3755
   3756/*
   3757 * During early boot, any blocking grace-period wait automatically
   3758 * implies a grace period.  Later on, this is never the case for PREEMPTION.
   3759 *
   3760 * However, because a context switch is a grace period for !PREEMPTION, any
   3761 * blocking grace-period wait automatically implies a grace period if
   3762 * there is only one CPU online at any point in time during execution of
   3763 * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to
   3764 * occasionally incorrectly indicate that there are multiple CPUs online
   3765 * when there was in fact only one the whole time, as this just adds some
   3766 * overhead: RCU still operates correctly.
   3767 */
   3768static int rcu_blocking_is_gp(void)
   3769{
   3770	int ret;
   3771
   3772	// Invoking preempt_model_*() too early gets a splat.
   3773	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE ||
   3774	    preempt_model_full() || preempt_model_rt())
   3775		return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
   3776	might_sleep();  /* Check for RCU read-side critical section. */
   3777	preempt_disable();
   3778	/*
   3779	 * If the rcu_state.n_online_cpus counter is equal to one,
   3780	 * there is only one CPU, and that CPU sees all prior accesses
   3781	 * made by any CPU that was online at the time of its access.
   3782	 * Furthermore, if this counter is equal to one, its value cannot
   3783	 * change until after the preempt_enable() below.
   3784	 *
   3785	 * Furthermore, if rcu_state.n_online_cpus is equal to one here,
   3786	 * all later CPUs (both this one and any that come online later
   3787	 * on) are guaranteed to see all accesses prior to this point
   3788	 * in the code, without the need for additional memory barriers.
   3789	 * Those memory barriers are provided by CPU-hotplug code.
   3790	 */
   3791	ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
   3792	preempt_enable();
   3793	return ret;
   3794}
   3795
   3796/**
   3797 * synchronize_rcu - wait until a grace period has elapsed.
   3798 *
   3799 * Control will return to the caller some time after a full grace
   3800 * period has elapsed, in other words after all currently executing RCU
   3801 * read-side critical sections have completed.  Note, however, that
   3802 * upon return from synchronize_rcu(), the caller might well be executing
   3803 * concurrently with new RCU read-side critical sections that began while
   3804 * synchronize_rcu() was waiting.
   3805 *
   3806 * RCU read-side critical sections are delimited by rcu_read_lock()
   3807 * and rcu_read_unlock(), and may be nested.  In addition, but only in
   3808 * v5.0 and later, regions of code across which interrupts, preemption,
   3809 * or softirqs have been disabled also serve as RCU read-side critical
   3810 * sections.  This includes hardware interrupt handlers, softirq handlers,
   3811 * and NMI handlers.
   3812 *
   3813 * Note that this guarantee implies further memory-ordering guarantees.
   3814 * On systems with more than one CPU, when synchronize_rcu() returns,
   3815 * each CPU is guaranteed to have executed a full memory barrier since
   3816 * the end of its last RCU read-side critical section whose beginning
   3817 * preceded the call to synchronize_rcu().  In addition, each CPU having
   3818 * an RCU read-side critical section that extends beyond the return from
   3819 * synchronize_rcu() is guaranteed to have executed a full memory barrier
   3820 * after the beginning of synchronize_rcu() and before the beginning of
   3821 * that RCU read-side critical section.  Note that these guarantees include
   3822 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
   3823 * that are executing in the kernel.
   3824 *
   3825 * Furthermore, if CPU A invoked synchronize_rcu(), which returned
   3826 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
   3827 * to have executed a full memory barrier during the execution of
   3828 * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
   3829 * again only if the system has more than one CPU).
   3830 *
   3831 * Implementation of these memory-ordering guarantees is described here:
   3832 * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
   3833 */
   3834void synchronize_rcu(void)
   3835{
   3836	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
   3837			 lock_is_held(&rcu_lock_map) ||
   3838			 lock_is_held(&rcu_sched_lock_map),
   3839			 "Illegal synchronize_rcu() in RCU read-side critical section");
   3840	if (rcu_blocking_is_gp())
   3841		return;  // Context allows vacuous grace periods.
   3842	if (rcu_gp_is_expedited())
   3843		synchronize_rcu_expedited();
   3844	else
   3845		wait_rcu_gp(call_rcu);
   3846}
   3847EXPORT_SYMBOL_GPL(synchronize_rcu);
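/*
 * Illustrative usage sketch (not part of this file): a typical updater
 * unlinks an element from an RCU-protected list, waits for a grace period,
 * and only then frees the element.  The "struct foo", foo_lock, and
 * foo_del() names below are hypothetical.
 *
 *	static void foo_del(struct foo *fp)
 *	{
 *		spin_lock(&foo_lock);
 *		list_del_rcu(&fp->list);
 *		spin_unlock(&foo_lock);
 *
 *		synchronize_rcu();	// Wait for pre-existing readers to finish.
 *		kfree(fp);		// No reader can still hold a reference.
 *	}
 */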
   3848
   3849/**
   3850 * get_state_synchronize_rcu - Snapshot current RCU state
   3851 *
   3852 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
   3853 * or poll_state_synchronize_rcu() to determine whether or not a full
   3854 * grace period has elapsed in the meantime.
   3855 */
   3856unsigned long get_state_synchronize_rcu(void)
   3857{
   3858	/*
   3859	 * Any prior manipulation of RCU-protected data must happen
   3860	 * before the load from ->gp_seq.
   3861	 */
   3862	smp_mb();  /* ^^^ */
   3863	return rcu_seq_snap(&rcu_state.gp_seq);
   3864}
   3865EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
   3866
   3867/**
   3868 * start_poll_synchronize_rcu - Snapshot and start RCU grace period
   3869 *
   3870 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
   3871 * or poll_state_synchronize_rcu() to determine whether or not a full
   3872 * grace period has elapsed in the meantime.  If the needed grace period
   3873 * is not already slated to start, notifies RCU core of the need for that
   3874 * grace period.
   3875 *
   3876 * Interrupts must be enabled for the case where it is necessary to awaken
   3877 * the grace-period kthread.
   3878 */
   3879unsigned long start_poll_synchronize_rcu(void)
   3880{
   3881	unsigned long flags;
   3882	unsigned long gp_seq = get_state_synchronize_rcu();
   3883	bool needwake;
   3884	struct rcu_data *rdp;
   3885	struct rcu_node *rnp;
   3886
   3887	lockdep_assert_irqs_enabled();
   3888	local_irq_save(flags);
   3889	rdp = this_cpu_ptr(&rcu_data);
   3890	rnp = rdp->mynode;
   3891	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
   3892	needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
   3893	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   3894	if (needwake)
   3895		rcu_gp_kthread_wake();
   3896	return gp_seq;
   3897}
   3898EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
   3899
   3900/**
   3901 * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
   3902 *
   3903 * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
   3904 *
   3905 * If a full RCU grace period has elapsed since the earlier call from
   3906 * which oldstate was obtained, return @true, otherwise return @false.
   3907 * If @false is returned, it is the caller's responsibility to invoke this
   3908 * function later on until it does return @true.  Alternatively, the caller
   3909 * can explicitly wait for a grace period, for example, by passing @oldstate
   3910 * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
   3911 *
   3912 * Yes, this function does not take counter wrap into account.
   3913 * But counter wrap is harmless.  If the counter wraps, we have waited for
   3914 * more than 2 billion grace periods (and way more on a 64-bit system!).
   3915 * Those needing to keep oldstate values for very long time periods
   3916 * (many hours even on 32-bit systems) should check them occasionally
   3917 * and either refresh them or set a flag indicating that the grace period
   3918 * has completed.
   3919 *
   3920 * This function provides the same memory-ordering guarantees that
   3921 * would be provided by a synchronize_rcu() that was invoked at the call
   3922 * to the function that provided @oldstate, and that returned at the end
   3923 * of this function.
   3924 */
   3925bool poll_state_synchronize_rcu(unsigned long oldstate)
   3926{
   3927	if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
   3928		smp_mb(); /* Ensure GP ends before subsequent accesses. */
   3929		return true;
   3930	}
   3931	return false;
   3932}
   3933EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
   3934
   3935/**
   3936 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
   3937 *
   3938 * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
   3939 *
   3940 * If a full RCU grace period has elapsed since the earlier call to
   3941 * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
   3942 * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
   3943 *
   3944 * Yes, this function does not take counter wrap into account.  But
   3945 * counter wrap is harmless.  If the counter wraps, we have waited for
   3946 * more than 2 billion grace periods (and way more on a 64-bit system!),
   3947 * so waiting for one additional grace period should be just fine.
   3948 *
   3949 * This function provides the same memory-ordering guarantees that
   3950 * would be provided by a synchronize_rcu() that was invoked at the call
   3951 * to the function that provided @oldstate, and that returned at the end
   3952 * of this function.
   3953 */
   3954void cond_synchronize_rcu(unsigned long oldstate)
   3955{
   3956	if (!poll_state_synchronize_rcu(oldstate))
   3957		synchronize_rcu();
   3958}
   3959EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
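/*
 * Illustrative usage sketch (not part of this file) of the polled
 * grace-period API implemented above: snapshot the RCU state, do other
 * work, then either check or wait for a full grace period relative to
 * that snapshot.
 *
 *	unsigned long cookie;
 *
 *	cookie = start_poll_synchronize_rcu();	// Or get_state_synchronize_rcu().
 *
 *	// ... do other work while the grace period progresses ...
 *
 *	// Non-blocking check:
 *	if (poll_state_synchronize_rcu(cookie))
 *		pr_info("grace period has already elapsed\n");
 *
 *	// Or block, but only if a grace period is still needed:
 *	cond_synchronize_rcu(cookie);
 */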
   3960
   3961/*
   3962 * Check to see if there is any immediate RCU-related work to be done by
   3963 * the current CPU, returning 1 if so and zero otherwise.  The checks are
   3964 * in order of increasing expense: checks that can be carried out against
   3965 * CPU-local state are performed first.  However, we must check for CPU
   3966 * stalls first, else we might not get a chance.
   3967 */
   3968static int rcu_pending(int user)
   3969{
   3970	bool gp_in_progress;
   3971	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
   3972	struct rcu_node *rnp = rdp->mynode;
   3973
   3974	lockdep_assert_irqs_disabled();
   3975
   3976	/* Check for CPU stalls, if enabled. */
   3977	check_cpu_stall(rdp);
   3978
   3979	/* Does this CPU need a deferred NOCB wakeup? */
   3980	if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
   3981		return 1;
   3982
   3983	/* Is this a nohz_full CPU in userspace or idle?  (Ignore RCU if so.) */
   3984	if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
   3985		return 0;
   3986
   3987	/* Is the RCU core waiting for a quiescent state from this CPU? */
   3988	gp_in_progress = rcu_gp_in_progress();
   3989	if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
   3990		return 1;
   3991
   3992	/* Does this CPU have callbacks ready to invoke? */
   3993	if (!rcu_rdp_is_offloaded(rdp) &&
   3994	    rcu_segcblist_ready_cbs(&rdp->cblist))
   3995		return 1;
   3996
   3997	/* Has RCU gone idle with this CPU needing another grace period? */
   3998	if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
   3999	    !rcu_rdp_is_offloaded(rdp) &&
   4000	    !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
   4001		return 1;
   4002
   4003	/* Have RCU grace period completed or started?  */
   4004	if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq ||
   4005	    unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
   4006		return 1;
   4007
   4008	/* nothing to do */
   4009	return 0;
   4010}
   4011
   4012/*
   4013 * Helper function for rcu_barrier() tracing.  If tracing is disabled,
   4014 * the compiler is expected to optimize this away.
   4015 */
   4016static void rcu_barrier_trace(const char *s, int cpu, unsigned long done)
   4017{
   4018	trace_rcu_barrier(rcu_state.name, s, cpu,
   4019			  atomic_read(&rcu_state.barrier_cpu_count), done);
   4020}
   4021
   4022/*
   4023 * RCU callback function for rcu_barrier().  If we are last, wake
   4024 * up the task executing rcu_barrier().
   4025 *
   4026 * Note that the value of rcu_state.barrier_sequence must be captured
   4027 * before the atomic_dec_and_test().  Otherwise, if this CPU is not last,
   4028 * other CPUs might count the value down to zero before this CPU gets
   4029 * around to invoking rcu_barrier_trace(), which might result in bogus
   4030 * data from the next instance of rcu_barrier().
   4031 */
   4032static void rcu_barrier_callback(struct rcu_head *rhp)
   4033{
   4034	unsigned long __maybe_unused s = rcu_state.barrier_sequence;
   4035
   4036	if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
   4037		rcu_barrier_trace(TPS("LastCB"), -1, s);
   4038		complete(&rcu_state.barrier_completion);
   4039	} else {
   4040		rcu_barrier_trace(TPS("CB"), -1, s);
   4041	}
   4042}
   4043
   4044/*
   4045 * If needed, entrain an rcu_barrier() callback on rdp->cblist.
   4046 */
   4047static void rcu_barrier_entrain(struct rcu_data *rdp)
   4048{
   4049	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
   4050	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
   4051
   4052	lockdep_assert_held(&rcu_state.barrier_lock);
   4053	if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
   4054		return;
   4055	rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
   4056	rdp->barrier_head.func = rcu_barrier_callback;
   4057	debug_rcu_head_queue(&rdp->barrier_head);
   4058	rcu_nocb_lock(rdp);
   4059	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
   4060	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
   4061		atomic_inc(&rcu_state.barrier_cpu_count);
   4062	} else {
   4063		debug_rcu_head_unqueue(&rdp->barrier_head);
   4064		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
   4065	}
   4066	rcu_nocb_unlock(rdp);
   4067	smp_store_release(&rdp->barrier_seq_snap, gseq);
   4068}
   4069
   4070/*
   4071 * Called with preemption disabled, and from cross-cpu IRQ context.
   4072 */
   4073static void rcu_barrier_handler(void *cpu_in)
   4074{
   4075	uintptr_t cpu = (uintptr_t)cpu_in;
   4076	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   4077
   4078	lockdep_assert_irqs_disabled();
   4079	WARN_ON_ONCE(cpu != rdp->cpu);
   4080	WARN_ON_ONCE(cpu != smp_processor_id());
   4081	raw_spin_lock(&rcu_state.barrier_lock);
   4082	rcu_barrier_entrain(rdp);
   4083	raw_spin_unlock(&rcu_state.barrier_lock);
   4084}
   4085
   4086/**
   4087 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
   4088 *
   4089 * Note that this primitive does not necessarily wait for an RCU grace period
   4090 * to complete.  For example, if there are no RCU callbacks queued anywhere
   4091 * in the system, then rcu_barrier() is within its rights to return
   4092 * immediately, without waiting for anything, much less an RCU grace period.
   4093 */
   4094void rcu_barrier(void)
   4095{
   4096	uintptr_t cpu;
   4097	unsigned long flags;
   4098	unsigned long gseq;
   4099	struct rcu_data *rdp;
   4100	unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
   4101
   4102	rcu_barrier_trace(TPS("Begin"), -1, s);
   4103
   4104	/* Take mutex to serialize concurrent rcu_barrier() requests. */
   4105	mutex_lock(&rcu_state.barrier_mutex);
   4106
   4107	/* Did someone else do our work for us? */
   4108	if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
   4109		rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
   4110		smp_mb(); /* caller's subsequent code after above check. */
   4111		mutex_unlock(&rcu_state.barrier_mutex);
   4112		return;
   4113	}
   4114
   4115	/* Mark the start of the barrier operation. */
   4116	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
   4117	rcu_seq_start(&rcu_state.barrier_sequence);
   4118	gseq = rcu_state.barrier_sequence;
   4119	rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
   4120
   4121	/*
   4122	 * Initialize the count to two rather than to zero in order
   4123	 * to avoid a too-soon return to zero in case of an immediate
   4124	 * invocation of the just-enqueued callback (or preemption of
   4125	 * this task).  Exclude CPU-hotplug operations to ensure that no
   4126	 * offline non-offloaded CPU has callbacks queued.
   4127	 */
   4128	init_completion(&rcu_state.barrier_completion);
   4129	atomic_set(&rcu_state.barrier_cpu_count, 2);
   4130	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
   4131
   4132	/*
   4133	 * Force each CPU with callbacks to register a new callback.
   4134	 * When that callback is invoked, we will know that all of the
   4135	 * corresponding CPU's preceding callbacks have been invoked.
   4136	 */
   4137	for_each_possible_cpu(cpu) {
   4138		rdp = per_cpu_ptr(&rcu_data, cpu);
   4139retry:
   4140		if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
   4141			continue;
   4142		raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
   4143		if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
   4144			WRITE_ONCE(rdp->barrier_seq_snap, gseq);
   4145			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
   4146			rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
   4147			continue;
   4148		}
   4149		if (!rcu_rdp_cpu_online(rdp)) {
   4150			rcu_barrier_entrain(rdp);
   4151			WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
   4152			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
   4153			rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
   4154			continue;
   4155		}
   4156		raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
   4157		if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
   4158			schedule_timeout_uninterruptible(1);
   4159			goto retry;
   4160		}
   4161		WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
   4162		rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
   4163	}
   4164
   4165	/*
   4166	 * Now that we have an rcu_barrier_callback() callback on each
   4167 * CPU, and thus each has been counted, remove the initial count.
   4168	 */
   4169	if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count))
   4170		complete(&rcu_state.barrier_completion);
   4171
   4172	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
   4173	wait_for_completion(&rcu_state.barrier_completion);
   4174
   4175	/* Mark the end of the barrier operation. */
   4176	rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
   4177	rcu_seq_end(&rcu_state.barrier_sequence);
   4178	gseq = rcu_state.barrier_sequence;
   4179	for_each_possible_cpu(cpu) {
   4180		rdp = per_cpu_ptr(&rcu_data, cpu);
   4181
   4182		WRITE_ONCE(rdp->barrier_seq_snap, gseq);
   4183	}
   4184
   4185	/* Other rcu_barrier() invocations can now safely proceed. */
   4186	mutex_unlock(&rcu_state.barrier_mutex);
   4187}
   4188EXPORT_SYMBOL_GPL(rcu_barrier);
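/*
 * Illustrative usage sketch (not part of this file): a module that posts
 * call_rcu() callbacks pointing at functions in its own text must wait for
 * those callbacks to finish before it can be safely unloaded.  The
 * foo_exit() name below is hypothetical.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		// Stop posting new callbacks first, then:
 *		rcu_barrier();	// Wait for already-posted callbacks to finish.
 *		// Now module data may be freed and the module unloaded.
 *	}
 */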
   4189
   4190/*
   4191 * Propagate ->qsinitmask bits up the rcu_node tree to account for the
   4192 * first CPU in a given leaf rcu_node structure coming online.  The caller
   4193 * must hold the corresponding leaf rcu_node ->lock with interrupts
   4194 * disabled.
   4195 */
   4196static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
   4197{
   4198	long mask;
   4199	long oldmask;
   4200	struct rcu_node *rnp = rnp_leaf;
   4201
   4202	raw_lockdep_assert_held_rcu_node(rnp_leaf);
   4203	WARN_ON_ONCE(rnp->wait_blkd_tasks);
   4204	for (;;) {
   4205		mask = rnp->grpmask;
   4206		rnp = rnp->parent;
   4207		if (rnp == NULL)
   4208			return;
   4209		raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
   4210		oldmask = rnp->qsmaskinit;
   4211		rnp->qsmaskinit |= mask;
   4212		raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
   4213		if (oldmask)
   4214			return;
   4215	}
   4216}
   4217
   4218/*
   4219 * Do boot-time initialization of a CPU's per-CPU RCU data.
   4220 */
   4221static void __init
   4222rcu_boot_init_percpu_data(int cpu)
   4223{
   4224	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   4225
   4226	/* Set up local state, ensuring consistent view of global state. */
   4227	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
   4228	INIT_WORK(&rdp->strict_work, strict_work_handler);
   4229	WARN_ON_ONCE(rdp->dynticks_nesting != 1);
   4230	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
   4231	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
   4232	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
   4233	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
   4234	rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
   4235	rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
   4236	rdp->last_sched_clock = jiffies;
   4237	rdp->cpu = cpu;
   4238	rcu_boot_init_nocb_percpu_data(rdp);
   4239}
   4240
   4241/*
   4242 * Invoked early in the CPU-online process, when pretty much all services
   4243 * are available.  The incoming CPU is not present.
   4244 *
   4245 * Initializes a CPU's per-CPU RCU data.  Note that only one online or
   4246 * offline event can be happening at a given time.  Note also that we can
    4247 * accept some slop in the rcu_state.gp_seq access due to the fact that this
   4248 * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
   4249 * And any offloaded callbacks are being numbered elsewhere.
   4250 */
   4251int rcutree_prepare_cpu(unsigned int cpu)
   4252{
   4253	unsigned long flags;
   4254	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   4255	struct rcu_node *rnp = rcu_get_root();
   4256
   4257	/* Set up local state, ensuring consistent view of global state. */
   4258	raw_spin_lock_irqsave_rcu_node(rnp, flags);
   4259	rdp->qlen_last_fqs_check = 0;
   4260	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
   4261	rdp->blimit = blimit;
   4262	rdp->dynticks_nesting = 1;	/* CPU not up, no tearing. */
   4263	raw_spin_unlock_rcu_node(rnp);		/* irqs remain disabled. */
   4264
   4265	/*
   4266	 * Only non-NOCB CPUs that didn't have early-boot callbacks need to be
   4267	 * (re-)initialized.
   4268	 */
   4269	if (!rcu_segcblist_is_enabled(&rdp->cblist))
   4270		rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
   4271
   4272	/*
   4273	 * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
   4274	 * propagation up the rcu_node tree will happen at the beginning
   4275	 * of the next grace period.
   4276	 */
   4277	rnp = rdp->mynode;
   4278	raw_spin_lock_rcu_node(rnp);		/* irqs already disabled. */
   4279	rdp->beenonline = true;	 /* We have now been online. */
   4280	rdp->gp_seq = READ_ONCE(rnp->gp_seq);
   4281	rdp->gp_seq_needed = rdp->gp_seq;
   4282	rdp->cpu_no_qs.b.norm = true;
   4283	rdp->core_needs_qs = false;
   4284	rdp->rcu_iw_pending = false;
   4285	rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
   4286	rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
   4287	trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
   4288	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   4289	rcu_spawn_one_boost_kthread(rnp);
   4290	rcu_spawn_cpu_nocb_kthread(cpu);
   4291	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
   4292
   4293	return 0;
   4294}
   4295
   4296/*
    4297 * Update RCU priority boost kthread affinity for CPU-hotplug changes.
   4298 */
   4299static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
   4300{
   4301	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   4302
   4303	rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
   4304}
   4305
   4306/*
   4307 * Near the end of the CPU-online process.  Pretty much all services
   4308 * enabled, and the CPU is now very much alive.
   4309 */
   4310int rcutree_online_cpu(unsigned int cpu)
   4311{
   4312	unsigned long flags;
   4313	struct rcu_data *rdp;
   4314	struct rcu_node *rnp;
   4315
   4316	rdp = per_cpu_ptr(&rcu_data, cpu);
   4317	rnp = rdp->mynode;
   4318	raw_spin_lock_irqsave_rcu_node(rnp, flags);
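        	/* Allow RCU's stall-warning irq_work (rcu_iw) to target this CPU. */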
   4319	rnp->ffmask |= rdp->grpmask;
   4320	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   4321	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
   4322		return 0; /* Too early in boot for scheduler work. */
   4323	sync_sched_exp_online_cleanup(cpu);
   4324	rcutree_affinity_setting(cpu, -1);
   4325
   4326	// Stop-machine done, so allow nohz_full to disable tick.
   4327	tick_dep_clear(TICK_DEP_BIT_RCU);
   4328	return 0;
   4329}
   4330
   4331/*
    4332 * Near the beginning of the CPU-offline process.  The CPU is still very
    4333 * much alive with pretty much all services enabled.
   4334 */
   4335int rcutree_offline_cpu(unsigned int cpu)
   4336{
   4337	unsigned long flags;
   4338	struct rcu_data *rdp;
   4339	struct rcu_node *rnp;
   4340
   4341	rdp = per_cpu_ptr(&rcu_data, cpu);
   4342	rnp = rdp->mynode;
   4343	raw_spin_lock_irqsave_rcu_node(rnp, flags);
   4344	rnp->ffmask &= ~rdp->grpmask;
   4345	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   4346
   4347	rcutree_affinity_setting(cpu, cpu);
   4348
   4349	// nohz_full CPUs need the tick for stop-machine to work quickly
   4350	tick_dep_set(TICK_DEP_BIT_RCU);
   4351	return 0;
   4352}
   4353
   4354/*
   4355 * Mark the specified CPU as being online so that subsequent grace periods
   4356 * (both expedited and normal) will wait on it.  Note that this means that
   4357 * incoming CPUs are not allowed to use RCU read-side critical sections
   4358 * until this function is called.  Failing to observe this restriction
   4359 * will result in lockdep splats.
   4360 *
   4361 * Note that this function is special in that it is invoked directly
   4362 * from the incoming CPU rather than from the cpuhp_step mechanism.
   4363 * This is because this function must be invoked at a precise location.
   4364 */
   4365void rcu_cpu_starting(unsigned int cpu)
   4366{
   4367	unsigned long flags;
   4368	unsigned long mask;
   4369	struct rcu_data *rdp;
   4370	struct rcu_node *rnp;
   4371	bool newcpu;
   4372
   4373	rdp = per_cpu_ptr(&rcu_data, cpu);
   4374	if (rdp->cpu_started)
   4375		return;
   4376	rdp->cpu_started = true;
   4377
   4378	rnp = rdp->mynode;
   4379	mask = rdp->grpmask;
   4380	local_irq_save(flags);
   4381	arch_spin_lock(&rcu_state.ofl_lock);
   4382	rcu_dynticks_eqs_online();
   4383	raw_spin_lock(&rcu_state.barrier_lock);
   4384	raw_spin_lock_rcu_node(rnp);
   4385	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
   4386	raw_spin_unlock(&rcu_state.barrier_lock);
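        	/* Is this the first-ever onlining of this CPU? */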
   4387	newcpu = !(rnp->expmaskinitnext & mask);
   4388	rnp->expmaskinitnext |= mask;
   4389	/* Allow lockless access for expedited grace periods. */
   4390	smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
   4391	ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
   4392	rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
   4393	rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
   4394	rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
   4395
   4396	/* An incoming CPU should never be blocking a grace period. */
   4397	if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
   4398		/* rcu_report_qs_rnp() *really* wants some flags to restore */
   4399		unsigned long flags2;
   4400
   4401		local_irq_save(flags2);
   4402		rcu_disable_urgency_upon_qs(rdp);
   4403		/* Report QS -after- changing ->qsmaskinitnext! */
   4404		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2);
   4405	} else {
   4406		raw_spin_unlock_rcu_node(rnp);
   4407	}
   4408	arch_spin_unlock(&rcu_state.ofl_lock);
   4409	local_irq_restore(flags);
   4410	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
   4411}
   4412
   4413/*
    4414 * The outgoing CPU has no further need of RCU, so remove it from
   4415 * the rcu_node tree's ->qsmaskinitnext bit masks.
   4416 *
   4417 * Note that this function is special in that it is invoked directly
   4418 * from the outgoing CPU rather than from the cpuhp_step mechanism.
   4419 * This is because this function must be invoked at a precise location.
   4420 */
   4421void rcu_report_dead(unsigned int cpu)
   4422{
   4423	unsigned long flags, seq_flags;
   4424	unsigned long mask;
   4425	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   4426	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
   4427
   4428	// Do any dangling deferred wakeups.
   4429	do_nocb_deferred_wakeup(rdp);
   4430
   4431	/* QS for any half-done expedited grace period. */
   4432	rcu_report_exp_rdp(rdp);
   4433	rcu_preempt_deferred_qs(current);
   4434
   4435	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
   4436	mask = rdp->grpmask;
   4437	local_irq_save(seq_flags);
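        	/* The ->ofl_lock excludes concurrent grace-period initialization. */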
   4438	arch_spin_lock(&rcu_state.ofl_lock);
   4439	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
   4440	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
   4441	rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
   4442	if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
   4443		/* Report quiescent state -before- changing ->qsmaskinitnext! */
   4444		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
   4445		raw_spin_lock_irqsave_rcu_node(rnp, flags);
   4446	}
   4447	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
   4448	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   4449	arch_spin_unlock(&rcu_state.ofl_lock);
   4450	local_irq_restore(seq_flags);
   4451
   4452	rdp->cpu_started = false;
   4453}
   4454
   4455#ifdef CONFIG_HOTPLUG_CPU
   4456/*
   4457 * The outgoing CPU has just passed through the dying-idle state, and we
   4458 * are being invoked from the CPU that was IPIed to continue the offline
   4459 * operation.  Migrate the outgoing CPU's callbacks to the current CPU.
   4460 */
   4461void rcutree_migrate_callbacks(int cpu)
   4462{
   4463	unsigned long flags;
   4464	struct rcu_data *my_rdp;
   4465	struct rcu_node *my_rnp;
   4466	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
   4467	bool needwake;
   4468
   4469	if (rcu_rdp_is_offloaded(rdp) ||
   4470	    rcu_segcblist_empty(&rdp->cblist))
   4471		return;  /* No callbacks to migrate. */
   4472
   4473	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
   4474	WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
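        	/* Make any in-flight rcu_barrier() account for the callbacks being migrated. */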
   4475	rcu_barrier_entrain(rdp);
   4476	my_rdp = this_cpu_ptr(&rcu_data);
   4477	my_rnp = my_rdp->mynode;
   4478	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
   4479	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
   4480	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
   4481	/* Leverage recent GPs and set GP for new callbacks. */
   4482	needwake = rcu_advance_cbs(my_rnp, rdp) ||
   4483		   rcu_advance_cbs(my_rnp, my_rdp);
   4484	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
   4485	raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
   4486	needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
   4487	rcu_segcblist_disable(&rdp->cblist);
   4488	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
   4489	if (rcu_rdp_is_offloaded(my_rdp)) {
   4490		raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
   4491		__call_rcu_nocb_wake(my_rdp, true, flags);
   4492	} else {
   4493		rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
   4494		raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
   4495	}
   4496	if (needwake)
   4497		rcu_gp_kthread_wake();
   4498	lockdep_assert_irqs_enabled();
   4499	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
   4500		  !rcu_segcblist_empty(&rdp->cblist),
    4501		  "rcutree_migrate_callbacks: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
   4502		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
   4503		  rcu_segcblist_first_cb(&rdp->cblist));
   4504}
   4505#endif
   4506
   4507/*
   4508 * On non-huge systems, use expedited RCU grace periods to make suspend
   4509 * and hibernation run faster.
   4510 */
   4511static int rcu_pm_notify(struct notifier_block *self,
   4512			 unsigned long action, void *hcpu)
   4513{
   4514	switch (action) {
   4515	case PM_HIBERNATION_PREPARE:
   4516	case PM_SUSPEND_PREPARE:
   4517		rcu_expedite_gp();
   4518		break;
   4519	case PM_POST_HIBERNATION:
   4520	case PM_POST_SUSPEND:
   4521		rcu_unexpedite_gp();
   4522		break;
   4523	default:
   4524		break;
   4525	}
   4526	return NOTIFY_OK;
   4527}
   4528
   4529#ifdef CONFIG_RCU_EXP_KTHREAD
   4530struct kthread_worker *rcu_exp_gp_kworker;
   4531struct kthread_worker *rcu_exp_par_gp_kworker;
   4532
   4533static void __init rcu_start_exp_gp_kworkers(void)
   4534{
   4535	const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
   4536	const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
   4537	struct sched_param param = { .sched_priority = kthread_prio };
   4538
   4539	rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
   4540	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
   4541		pr_err("Failed to create %s!\n", gp_kworker_name);
   4542		return;
   4543	}
   4544
   4545	rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
   4546	if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
   4547		pr_err("Failed to create %s!\n", par_gp_kworker_name);
   4548		kthread_destroy_worker(rcu_exp_gp_kworker);
   4549		return;
   4550	}
   4551
   4552	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
   4553	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
   4554				   &param);
   4555}
   4556
   4557static inline void rcu_alloc_par_gp_wq(void)
   4558{
   4559}
   4560#else /* !CONFIG_RCU_EXP_KTHREAD */
   4561struct workqueue_struct *rcu_par_gp_wq;
   4562
   4563static void __init rcu_start_exp_gp_kworkers(void)
   4564{
   4565}
   4566
   4567static inline void rcu_alloc_par_gp_wq(void)
   4568{
   4569	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
   4570	WARN_ON(!rcu_par_gp_wq);
   4571}
   4572#endif /* CONFIG_RCU_EXP_KTHREAD */
   4573
   4574/*
   4575 * Spawn the kthreads that handle RCU's grace periods.
   4576 */
   4577static int __init rcu_spawn_gp_kthread(void)
   4578{
   4579	unsigned long flags;
   4580	struct rcu_node *rnp;
   4581	struct sched_param sp;
   4582	struct task_struct *t;
   4583	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
   4584
   4585	rcu_scheduler_fully_active = 1;
   4586	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
   4587	if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
   4588		return 0;
   4589	if (kthread_prio) {
   4590		sp.sched_priority = kthread_prio;
   4591		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
   4592	}
   4593	rnp = rcu_get_root();
   4594	raw_spin_lock_irqsave_rcu_node(rnp, flags);
   4595	WRITE_ONCE(rcu_state.gp_activity, jiffies);
   4596	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
   4597	// Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
   4598	smp_store_release(&rcu_state.gp_kthread, t);  /* ^^^ */
   4599	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
   4600	wake_up_process(t);
   4601	/* This is a pre-SMP initcall, we expect a single CPU */
   4602	WARN_ON(num_online_cpus() > 1);
   4603	/*
    4604	 * These kthreads could not be created in rcu_init() -> rcutree_prepare_cpu()
    4605	 * because rcu_scheduler_fully_active was not yet set at that point.
   4606	 */
   4607	rcu_spawn_cpu_nocb_kthread(smp_processor_id());
   4608	rcu_spawn_one_boost_kthread(rdp->mynode);
   4609	rcu_spawn_core_kthreads();
   4610	/* Create kthread worker for expedited GPs */
   4611	rcu_start_exp_gp_kworkers();
   4612	return 0;
   4613}
   4614early_initcall(rcu_spawn_gp_kthread);
   4615
   4616/*
   4617 * This function is invoked towards the end of the scheduler's
   4618 * initialization process.  Before this is called, the idle task might
    4619 * invoke synchronous grace-period primitives (during which time, this idle
   4620 * task is booting the system, and such primitives are no-ops).  After this
   4621 * function is called, any synchronous grace-period primitives are run as
   4622 * expedited, with the requesting task driving the grace period forward.
   4623 * A later core_initcall() rcu_set_runtime_mode() will switch to full
   4624 * runtime RCU functionality.
   4625 */
   4626void rcu_scheduler_starting(void)
   4627{
   4628	WARN_ON(num_online_cpus() != 1);
   4629	WARN_ON(nr_context_switches() > 0);
   4630	rcu_test_sync_prims();
   4631	rcu_scheduler_active = RCU_SCHEDULER_INIT;
   4632	rcu_test_sync_prims();
   4633}
   4634
   4635/*
   4636 * Helper function for rcu_init() that initializes the rcu_state structure.
   4637 */
   4638static void __init rcu_init_one(void)
   4639{
   4640	static const char * const buf[] = RCU_NODE_NAME_INIT;
   4641	static const char * const fqs[] = RCU_FQS_NAME_INIT;
   4642	static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
   4643	static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
   4644
   4645	int levelspread[RCU_NUM_LVLS];		/* kids/node in each level. */
   4646	int cpustride = 1;
   4647	int i;
   4648	int j;
   4649	struct rcu_node *rnp;
   4650
   4651	BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
   4652
   4653	/* Silence gcc 4.8 false positive about array index out of range. */
   4654	if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
   4655		panic("rcu_init_one: rcu_num_lvls out of range");
   4656
   4657	/* Initialize the level-tracking arrays. */
   4658
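        	/* Each level's first rcu_node immediately follows the previous level's nodes in ->node[]. */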
   4659	for (i = 1; i < rcu_num_lvls; i++)
   4660		rcu_state.level[i] =
   4661			rcu_state.level[i - 1] + num_rcu_lvl[i - 1];
   4662	rcu_init_levelspread(levelspread, num_rcu_lvl);
   4663
   4664	/* Initialize the elements themselves, starting from the leaves. */
   4665
   4666	for (i = rcu_num_lvls - 1; i >= 0; i--) {
   4667		cpustride *= levelspread[i];
   4668		rnp = rcu_state.level[i];
   4669		for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
   4670			raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
   4671			lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
   4672						   &rcu_node_class[i], buf[i]);
   4673			raw_spin_lock_init(&rnp->fqslock);
   4674			lockdep_set_class_and_name(&rnp->fqslock,
   4675						   &rcu_fqs_class[i], fqs[i]);
   4676			rnp->gp_seq = rcu_state.gp_seq;
   4677			rnp->gp_seq_needed = rcu_state.gp_seq;
   4678			rnp->completedqs = rcu_state.gp_seq;
   4679			rnp->qsmask = 0;
   4680			rnp->qsmaskinit = 0;
   4681			rnp->grplo = j * cpustride;
   4682			rnp->grphi = (j + 1) * cpustride - 1;
   4683			if (rnp->grphi >= nr_cpu_ids)
   4684				rnp->grphi = nr_cpu_ids - 1;
   4685			if (i == 0) {
   4686				rnp->grpnum = 0;
   4687				rnp->grpmask = 0;
   4688				rnp->parent = NULL;
   4689			} else {
   4690				rnp->grpnum = j % levelspread[i - 1];
   4691				rnp->grpmask = BIT(rnp->grpnum);
   4692				rnp->parent = rcu_state.level[i - 1] +
   4693					      j / levelspread[i - 1];
   4694			}
   4695			rnp->level = i;
   4696			INIT_LIST_HEAD(&rnp->blkd_tasks);
   4697			rcu_init_one_nocb(rnp);
   4698			init_waitqueue_head(&rnp->exp_wq[0]);
   4699			init_waitqueue_head(&rnp->exp_wq[1]);
   4700			init_waitqueue_head(&rnp->exp_wq[2]);
   4701			init_waitqueue_head(&rnp->exp_wq[3]);
   4702			spin_lock_init(&rnp->exp_lock);
   4703			mutex_init(&rnp->boost_kthread_mutex);
   4704		}
   4705	}
   4706
   4707	init_swait_queue_head(&rcu_state.gp_wq);
   4708	init_swait_queue_head(&rcu_state.expedited_wq);
   4709	rnp = rcu_first_leaf_node();
   4710	for_each_possible_cpu(i) {
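        		/* Advance to the leaf rcu_node whose CPU range covers CPU i. */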
   4711		while (i > rnp->grphi)
   4712			rnp++;
   4713		per_cpu_ptr(&rcu_data, i)->mynode = rnp;
   4714		rcu_boot_init_percpu_data(i);
   4715	}
   4716}
   4717
   4718/*
   4719 * Force priority from the kernel command-line into range.
   4720 */
   4721static void __init sanitize_kthread_prio(void)
   4722{
   4723	int kthread_prio_in = kthread_prio;
   4724
   4725	if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
   4726	    && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
   4727		kthread_prio = 2;
   4728	else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
   4729		kthread_prio = 1;
   4730	else if (kthread_prio < 0)
   4731		kthread_prio = 0;
   4732	else if (kthread_prio > 99)
   4733		kthread_prio = 99;
   4734
   4735	if (kthread_prio != kthread_prio_in)
   4736		pr_alert("%s: Limited prio to %d from %d\n",
   4737			 __func__, kthread_prio, kthread_prio_in);
   4738}
   4739
   4740/*
   4741 * Compute the rcu_node tree geometry from kernel parameters.  This cannot
   4742 * replace the definitions in tree.h because those are needed to size
   4743 * the ->node array in the rcu_state structure.
   4744 */
   4745void rcu_init_geometry(void)
   4746{
   4747	ulong d;
   4748	int i;
   4749	static unsigned long old_nr_cpu_ids;
   4750	int rcu_capacity[RCU_NUM_LVLS];
   4751	static bool initialized;
   4752
   4753	if (initialized) {
   4754		/*
   4755		 * Warn if setup_nr_cpu_ids() had not yet been invoked,
    4756		 * unless nr_cpu_ids == NR_CPUS, in which case who cares?
   4757		 */
   4758		WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
   4759		return;
   4760	}
   4761
   4762	old_nr_cpu_ids = nr_cpu_ids;
   4763	initialized = true;
   4764
   4765	/*
   4766	 * Initialize any unspecified boot parameters.
   4767	 * The default values of jiffies_till_first_fqs and
   4768	 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
    4769	 * value, which is a function of HZ, plus one for each
   4770	 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
   4771	 */
   4772	d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
   4773	if (jiffies_till_first_fqs == ULONG_MAX)
   4774		jiffies_till_first_fqs = d;
   4775	if (jiffies_till_next_fqs == ULONG_MAX)
   4776		jiffies_till_next_fqs = d;
   4777	adjust_jiffies_till_sched_qs();
   4778
   4779	/* If the compile-time values are accurate, just leave. */
   4780	if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
   4781	    nr_cpu_ids == NR_CPUS)
   4782		return;
   4783	pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
   4784		rcu_fanout_leaf, nr_cpu_ids);
   4785
   4786	/*
   4787	 * The boot-time rcu_fanout_leaf parameter must be at least two
   4788	 * and cannot exceed the number of bits in the rcu_node masks.
   4789	 * Complain and fall back to the compile-time values if this
   4790	 * limit is exceeded.
   4791	 */
   4792	if (rcu_fanout_leaf < 2 ||
   4793	    rcu_fanout_leaf > sizeof(unsigned long) * 8) {
   4794		rcu_fanout_leaf = RCU_FANOUT_LEAF;
   4795		WARN_ON(1);
   4796		return;
   4797	}
   4798
   4799	/*
    4800	 * Compute the number of CPUs that can be handled by an rcu_node tree
   4801	 * with the given number of levels.
   4802	 */
   4803	rcu_capacity[0] = rcu_fanout_leaf;
   4804	for (i = 1; i < RCU_NUM_LVLS; i++)
   4805		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
   4806
   4807	/*
   4808	 * The tree must be able to accommodate the configured number of CPUs.
   4809	 * If this limit is exceeded, fall back to the compile-time values.
   4810	 */
   4811	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
   4812		rcu_fanout_leaf = RCU_FANOUT_LEAF;
   4813		WARN_ON(1);
   4814		return;
   4815	}
   4816
   4817	/* Calculate the number of levels in the tree. */
   4818	for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
   4819	}
   4820	rcu_num_lvls = i + 1;
   4821
   4822	/* Calculate the number of rcu_nodes at each level of the tree. */
   4823	for (i = 0; i < rcu_num_lvls; i++) {
   4824		int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
   4825		num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
   4826	}
   4827
   4828	/* Calculate the total number of rcu_node structures. */
   4829	rcu_num_nodes = 0;
   4830	for (i = 0; i < rcu_num_lvls; i++)
   4831		rcu_num_nodes += num_rcu_lvl[i];
   4832}
   4833
   4834/*
   4835 * Dump out the structure of the rcu_node combining tree associated
   4836 * with the rcu_state structure.
   4837 */
   4838static void __init rcu_dump_rcu_node_tree(void)
   4839{
   4840	int level = 0;
   4841	struct rcu_node *rnp;
   4842
   4843	pr_info("rcu_node tree layout dump\n");
   4844	pr_info(" ");
   4845	rcu_for_each_node_breadth_first(rnp) {
   4846		if (rnp->level != level) {
   4847			pr_cont("\n");
   4848			pr_info(" ");
   4849			level = rnp->level;
   4850		}
   4851		pr_cont("%d:%d ^%d  ", rnp->grplo, rnp->grphi, rnp->grpnum);
   4852	}
   4853	pr_cont("\n");
   4854}
   4855
   4856struct workqueue_struct *rcu_gp_wq;
   4857
   4858static void __init kfree_rcu_batch_init(void)
   4859{
   4860	int cpu;
   4861	int i;
   4862
    4863	/* Clamp it to the [0:100] seconds interval. */
   4864	if (rcu_delay_page_cache_fill_msec < 0 ||
   4865		rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
   4866
   4867		rcu_delay_page_cache_fill_msec =
   4868			clamp(rcu_delay_page_cache_fill_msec, 0,
   4869				(int) (100 * MSEC_PER_SEC));
   4870
   4871		pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
   4872			rcu_delay_page_cache_fill_msec);
   4873	}
   4874
   4875	for_each_possible_cpu(cpu) {
   4876		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
   4877
   4878		for (i = 0; i < KFREE_N_BATCHES; i++) {
   4879			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
   4880			krcp->krw_arr[i].krcp = krcp;
   4881		}
   4882
   4883		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
   4884		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
   4885		krcp->initialized = true;
   4886	}
   4887	if (register_shrinker(&kfree_rcu_shrinker))
   4888		pr_err("Failed to register kfree_rcu() shrinker!\n");
   4889}
   4890
   4891void __init rcu_init(void)
   4892{
   4893	int cpu = smp_processor_id();
   4894
   4895	rcu_early_boot_tests();
   4896
   4897	kfree_rcu_batch_init();
   4898	rcu_bootup_announce();
   4899	sanitize_kthread_prio();
   4900	rcu_init_geometry();
   4901	rcu_init_one();
   4902	if (dump_tree)
   4903		rcu_dump_rcu_node_tree();
   4904	if (use_softirq)
   4905		open_softirq(RCU_SOFTIRQ, rcu_core_si);
   4906
   4907	/*
   4908	 * We don't need protection against CPU-hotplug here because
   4909	 * this is called early in boot, before either interrupts
   4910	 * or the scheduler are operational.
   4911	 */
   4912	pm_notifier(rcu_pm_notify, 0);
   4913	WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
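        	/* Bring the boot CPU online in RCU's eyes, mirroring the CPU-hotplug sequence used for later CPUs. */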
   4914	rcutree_prepare_cpu(cpu);
   4915	rcu_cpu_starting(cpu);
   4916	rcutree_online_cpu(cpu);
   4917
   4918	/* Create workqueue for Tree SRCU and for expedited GPs. */
   4919	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
   4920	WARN_ON(!rcu_gp_wq);
   4921	rcu_alloc_par_gp_wq();
   4922
   4923	/* Fill in default value for rcutree.qovld boot parameter. */
   4924	/* -After- the rcu_node ->lock fields are initialized! */
   4925	if (qovld < 0)
   4926		qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
   4927	else
   4928		qovld_calc = qovld;
   4929}
   4930
   4931#include "tree_stall.h"
   4932#include "tree_exp.h"
   4933#include "tree_nocb.h"
   4934#include "tree_plugin.h"