From ddc465936643108d5ba61f88594a2868d6a156ab Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Thu, 5 Mar 2020 23:22:55 +0100 Subject: Revert "rculist: Describe variadic macro argument in a Sphinx-compatible way" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f452ee096d95482892b101bde4fd037fa025d3cc. The workaround became unnecessary with commit 43756e347f21 ("scripts/kernel-doc: Add support for named variable macro arguments"). Signed-off-by: Jonathan Neuschäfer Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8214cdc715f2..7375bb3da140 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -371,7 +371,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. - * @cond...: optional lockdep expression if called from non-RCU protection. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() @@ -646,7 +646,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. - * @cond...: optional lockdep expression if called from non-RCU protection. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() -- cgit v1.2.3-71-gd317 From 6be7436d2245d3dd8b9a8f949367c13841c23308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2020 13:47:41 -0700 Subject: rcu: Add rcu_gp_might_be_stalled() This commit adds rcu_gp_might_be_stalled(), which returns true if there is some reason to believe that the RCU grace period is stalled. The use case is where an RCU free-memory path needs to allocate memory in order to free it, a situation that should be avoided where possible. But where it is necessary, there is always the alternative of using synchronize_rcu() to wait for a grace period in order to avoid the allocation. And if the grace period is stalled, allocating memory to asynchronously wait for it is a bad idea of epic proportions: Far better to let others use the memory, because these others might actually be able to free that memory before the grace period ends. Thus, rcu_gp_might_be_stalled() can be used to help decide whether allocating memory on an RCU free path is a semi-reasonable course of action. Cc: Joel Fernandes Cc: Uladzislau Rezki Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree_stall.h | 40 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 38 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 045c28b71f4f..dbf5ac439594 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,6 +87,7 @@ static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } +static inline bool rcu_gp_might_be_stalled(void) { return false; } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 45f3f66bb04d..fbc26274af4d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -39,6 +39,7 @@ void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); +bool rcu_gp_might_be_stalled(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..4dede00e2936 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. This can be + * useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoking synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoids false positive, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. + smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } -- cgit v1.2.3-71-gd317 From f0bdf6d473cf12a488a78422e15aafdfe77cf853 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2020 14:52:32 -0800 Subject: rcu: Remove unused ->rcu_read_unlock_special.b.deferred_qs field The ->rcu_read_unlock_special.b.deferred_qs field is set to true in rcu_read_unlock_special() but never set to false. This is not particularly useful, so this commit removes this field. The only possible justification for this field is to ease debugging of RCU deferred quiscent states, but the combination of the other ->rcu_read_unlock_special fields plus ->rcu_blocked_node and of course ->rcu_read_lock_nesting should cover debugging needs. And if this last proves incorrect, this patch can always be reverted, along with the required setting of ->rcu_read_unlock_special.b.deferred_qs to false in rcu_preempt_deferred_qs_irqrestore(). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 2 +- kernel/rcu/tree_plugin.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..a4b727f57095 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -613,7 +613,7 @@ union rcu_special { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ - u8 deferred_qs; + u8 pad; /* No garbage from compiler! */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 263c766b9dc1..f31c5992f842 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -634,7 +634,6 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } - t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } -- cgit v1.2.3-71-gd317 From 2beaf3280e57bb891f8012dca49c87ed0f01e2f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Mar 2020 14:23:21 -0700 Subject: sched/core: Add function to sample state of locked-down task A running task's state can be sampled in a consistent manner (for example, for diagnostic purposes) simply by invoking smp_call_function_single() on its CPU, which may be obtained using task_cpu(), then having the IPI handler verify that the desired task is in fact still running. However, if the task is not running, this sampling can in theory be done immediately and directly. In practice, the task might start running at any time, including during the sampling period. Gaining a consistent sample of a not-running task therefore requires that something be done to lock down the target task's state. This commit therefore adds a try_invoke_on_locked_down_task() function that invokes a specified function if the specified task can be locked down, returning true if successful and if the specified function returns true. Otherwise this function simply returns false. Given that the function passed to try_invoke_on_nonrunning_task() might be invoked with a runqueue lock held, that function had better be quite lightweight. The function is passed the target task's task_struct pointer and the argument passed to try_invoke_on_locked_down_task(), allowing easy access to task state and to a location for further variables to be passed in and out. Note that the specified function will be called even if the specified task is currently running. The function can use ->on_rq and task_curr() to quickly and easily determine the task's state, and can return false if this state is not to the function's liking. The caller of the try_invoke_on_locked_down_task() would then see the false return value, and could take appropriate action, for example, trying again later or sending an IPI if matters are more urgent. It is expected that use cases such as the RCU CPU stall warning code will simply return false if the task is currently running. However, there are use cases involving nohz_full CPUs where the specified function might instead fall back to an alternative sampling scheme that relies on heavier synchronization (such as memory barriers) in the target task. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Ben Segall Cc: Mel Gorman [ paulmck: Apply feedback from Peter Zijlstra and Steven Rostedt. ] [ paulmck: Invoke if running to handle feedback from Mathieu Desnoyers. ] Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/wait.h | 2 ++ kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index feeb6be5cad6..898c890fc153 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1149,4 +1149,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i (wait)->flags = 0; \ } while (0) +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg); + #endif /* _LINUX_WAIT_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5ca567adfcb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2566,6 +2566,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2639,6 +2641,52 @@ out: return success; } +/** + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked. + * @func: Function to invoke. + * @arg: Argument to function. + * + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. + */ +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +{ + bool ret = false; + struct rq_flags rf; + struct rq *rq; + + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq(&p->pi_lock); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). + if (!p->on_rq) + ret = func(p, arg); + } + } + raw_spin_unlock_irq(&p->pi_lock); + return ret; +} + /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. -- cgit v1.2.3-71-gd317 From b3d73156b075014ce5b2609f4f47723d6c0c23d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Mar 2020 13:58:27 -0800 Subject: rcu: Reinstate synchronize_rcu_mult() With the advent and likely usage of synchronize_rcu_rude(), there is again a need to wait on multiple types of RCU grace periods, for example, call_rcu_tasks() and call_rcu_tasks_rude(). This commit therefore reinstates synchronize_rcu_mult() in order to allow these grace periods to be straightforwardly waited on concurrently. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_wait.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index c0578ba23c1a..699b938358bf 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -31,4 +31,23 @@ do { \ #define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) +/** + * synchronize_rcu_mult - Wait concurrently for multiple grace periods + * @...: List of call_rcu() functions for different grace periods to wait on + * + * This macro waits concurrently for multiple types of RCU grace periods. + * For example, synchronize_rcu_mult(call_rcu, call_rcu_tasks) would wait + * on concurrent RCU and RCU-tasks grace periods. Waiting on a given SRCU + * domain requires you to write a wrapper function for that SRCU domain's + * call_srcu() function, with this wrapper supplying the pointer to the + * corresponding srcu_struct. + * + * The first argument tells Tiny RCU's _wait_rcu_gp() not to + * bother waiting for RCU. The reason for this is because anywhere + * synchronize_rcu_mult() can be called is automatically already a full + * grace period. + */ +#define synchronize_rcu_mult(...) \ + _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ -- cgit v1.2.3-71-gd317 From 5873b8a94e5dae04b8e11fc798df512614e6d1e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Mar 2020 11:49:21 -0800 Subject: rcu-tasks: Refactor RCU-tasks to allow variants to be added This commit splits out generic processing from RCU-tasks-specific processing in order to allow additional flavors to be added. It also adds a def_bool TASKS_RCU_GENERIC to enable the common RCU-tasks infrastructure code. This is primarily, but not entirely, a code-movement commit. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 6 +- kernel/rcu/Kconfig | 10 +- kernel/rcu/tasks.h | 491 +++++++++++++++++++++++++---------------------- kernel/rcu/update.c | 4 + 4 files changed, 272 insertions(+), 239 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2678a37c3169..5523145e0a78 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -129,7 +129,7 @@ static inline void rcu_init_nohz(void) { } * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ -#ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_RCU_GENERIC #define rcu_tasks_qs(t) \ do { \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ @@ -140,14 +140,14 @@ void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); -#else /* #ifdef CONFIG_TASKS_RCU */ +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_qs(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } -#endif /* #else #ifdef CONFIG_TASKS_RCU */ +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1cc940fef17c..38475d0bc634 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -70,13 +70,19 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config TASKS_RCU_GENERIC + def_bool TASKS_RCU + select SRCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + config TASKS_RCU def_bool PREEMPTION - select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + user-mode execution as quiescent states. Not for manual selection. config RCU_STALL_COMMON def_bool TREE_RCU diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5ccfe0d64e6a..d77921ee5a6e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -5,7 +5,13 @@ * Copyright (C) 2020 Paul E. McKenney */ -#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. + +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); /** * Definition for a Tasks-RCU-like mechanism. @@ -14,6 +20,8 @@ * @cbs_wq: Wait queue allowning new callback to get kthread's attention. * @cbs_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @gp_func: This flavor's grace-period-wait function. + * @call_func: This flavor's call_rcu()-equivalent function. */ struct rcu_tasks { struct rcu_head *cbs_head; @@ -21,29 +29,20 @@ struct rcu_tasks { struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; struct task_struct *kthread_ptr; + rcu_tasks_gp_func_t gp_func; + call_rcu_func_t call_func; }; -#define DEFINE_RCU_TASKS(name) \ +#define DEFINE_RCU_TASKS(name, gp, call) \ static struct rcu_tasks name = \ { \ .cbs_tail = &name.cbs_head, \ .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(name.cbs_wq), \ .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(name.cbs_lock), \ + .gp_func = gp, \ + .call_func = call, \ } -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ -DEFINE_RCU_TASKS(rcu_tasks); - /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); @@ -52,29 +51,16 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +// Enqueue a callback for the specified flavor of Tasks RCU. +static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) { unsigned long flags; bool needwake; - struct rcu_tasks *rtp = &rcu_tasks; rhp->next = NULL; rhp->func = func; @@ -87,108 +73,25 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) if (needwake && READ_ONCE(rtp->kthread_ptr)) wake_up(&rtp->cbs_wq); } -EXPORT_SYMBOL_GPL(call_rcu_tasks); -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started. */ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); + wait_rcu_gp(rtp->call_func); } /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ static int __noreturn rcu_tasks_kthread(void *arg) { unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; struct rcu_head *list; struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); struct rcu_tasks *rtp = arg; - int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); @@ -220,111 +123,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) continue; } - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * t->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); + // Wait for one grace period. + rtp->gp_func(rtp); /* Invoke the callbacks. */ while (list) { @@ -340,18 +140,16 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) +/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; - t = kthread_run(rcu_tasks_kthread, &rcu_tasks, "rcu_tasks_kthread"); + t = kthread_run(rcu_tasks_kthread, rtp, "rcu_tasks_kthread"); if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; + return; smp_mb(); /* Ensure others see full kthread. */ - return 0; } -core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -369,8 +167,6 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) preempt_enable(); } -#endif /* #ifdef CONFIG_TASKS_RCU */ - #ifndef CONFIG_TINY_RCU /* @@ -387,3 +183,230 @@ static void __init rcu_tasks_bootup_oddness(void) } #endif /* #ifndef CONFIG_TINY_RCU */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(rcu_tasks_holdouts); + int fract; + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + struct task_struct *t1; + + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, &firstreport); + cond_resched(); + } + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have a full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks); + +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +#endif /* #ifdef CONFIG_TASKS_RCU */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c5799349ff31..30dce20e1644 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -584,7 +584,11 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#ifdef CONFIG_TASKS_RCU_GENERIC #include "tasks.h" +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3-71-gd317 From c84aad765406c4c7573ce449e8a9977ebb8f4cb9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Mar 2020 21:06:43 -0800 Subject: rcu-tasks: Add an RCU-tasks rude variant This commit adds a "rude" variant of RCU-tasks that has as quiescent states schedule(), cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, anyway) cond_resched(). In other words, RCU-tasks rude readers are regions of code with preemption disabled, but excluding code early in the CPU-online sequence and late in the CPU-offline sequence. Updates make use of IPIs and force an IPI and a context switch on each online CPU. This variant is useful in some situations in tracing. Suggested-by: Steven Rostedt [ paulmck: Apply EXPORT_SYMBOL_GPL() feedback from Qiujun Huang. ] Signed-off-by: Paul E. McKenney [ paulmck: Apply review feedback from Steve Rostedt. ] --- include/linux/rcupdate.h | 3 ++ kernel/rcu/Kconfig | 11 +++++- kernel/rcu/tasks.h | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5523145e0a78..2be97a83f266 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -37,6 +37,7 @@ /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); +void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); #ifdef CONFIG_PREEMPT_RCU @@ -138,6 +139,8 @@ static inline void rcu_init_nohz(void) { } #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); +void synchronize_rcu_tasks_rude(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 38475d0bc634..6ee6372a4459 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -84,6 +84,15 @@ config TASKS_RCU only voluntary context switch (not preemption!), idle, and user-mode execution as quiescent states. Not for manual selection. +config TASKS_RUDE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + only context switch (including preemption) and user-mode + execution as quiescent states. It forces IPIs and context + switches on all online CPUs, including idle ones, so use + with caution. + config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d77921ee5a6e..7f9ed20c26c7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -180,6 +180,9 @@ static void __init rcu_tasks_bootup_oddness(void) else pr_info("\tTasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -410,3 +413,98 @@ static int __init rcu_spawn_tasks_kthread(void) core_initcall(rcu_spawn_tasks_kthread); #endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_RUDE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of +// passing an empty function to schedule_on_each_cpu(). This approach +// provides an asynchronous call_rcu_tasks_rude() API and batching +// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// This sends IPIs far and wide and induces otherwise unnecessary context +// switches on all online CPUs, whether idle or not. + +// Empty function to allow workqueues to force a context switch. +static void rcu_tasks_be_rude(struct work_struct *work) +{ +} + +// Wait for one rude RCU-tasks grace period. +static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) +{ + schedule_on_each_cpu(rcu_tasks_be_rude); +} + +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude); + +/** + * call_rcu_tasks_rude() - Queue a callback rude task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_rude() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); + +/** + * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period + * + * Control will return to the caller some time after a rude rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +/** + * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_rude(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_rude(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} +core_initcall(rcu_spawn_tasks_rude_kthread); + +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ -- cgit v1.2.3-71-gd317 From d5f177d35c24429c87db2567d20563fc16f7e8f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Mar 2020 19:56:53 -0700 Subject: rcu-tasks: Add an RCU Tasks Trace to simplify protection of tracing hooks Because RCU does not watch exception early-entry/late-exit, idle-loop, or CPU-hotplug execution, protection of tracing and BPF operations is needlessly complicated. This commit therefore adds a variant of Tasks RCU that: o Has explicit read-side markers to allow finite grace periods in the face of in-kernel loops for PREEMPT=n builds. These markers are rcu_read_lock_trace() and rcu_read_unlock_trace(). o Protects code in the idle loop, exception entry/exit, and CPU-hotplug code paths. In this respect, RCU-tasks trace is similar to SRCU, but with lighter-weight readers. o Avoids expensive read-side instruction, having overhead similar to that of Preemptible RCU. There are of course downsides: o The grace-period code can send IPIs to CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. This is mitigated by later commits. o It is necessary to scan the full tasklist, much as for Tasks RCU. o There is a single callback queue guarded by a single lock, again, much as for Tasks RCU. However, those early use cases that request multiple grace periods in quick succession are expected to do so from a single task, which makes the single lock almost irrelevant. If needed, multiple callback queues can be provided using any number of schemes. Perhaps most important, this variant of RCU does not affect the vanilla flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace readers can operate from idle, offline, and exception entry/exit in no way enables rcu_preempt and rcu_sched readers to do so. The memory ordering was outlined here: https://lore.kernel.org/lkml/20200319034030.GX3199@paulmck-ThinkPad-P72/ This effort benefited greatly from off-list discussions of BPF requirements with Alexei Starovoitov and Andrii Nakryiko. At least some of the on-list discussions are captured in the Link: tags below. In addition, KCSAN was quite helpful in finding some early bugs. Link: https://lore.kernel.org/lkml/20200219150744.428764577@infradead.org/ Link: https://lore.kernel.org/lkml/87mu8p797b.fsf@nanos.tec.linutronix.de/ Link: https://lore.kernel.org/lkml/20200225221305.605144982@linutronix.de/ Cc: Alexei Starovoitov Cc: Andrii Nakryiko [ paulmck: Apply feedback from Steve Rostedt and Joel Fernandes. ] [ paulmck: Decrement trc_n_readers_need_end upon IPI failure. ] [ paulmck: Fix locking issue reported by rcutorture. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 84 ++++++++++ include/linux/sched.h | 8 + init/init_task.c | 4 + kernel/fork.c | 4 + kernel/rcu/Kconfig | 11 +- kernel/rcu/tasks.h | 361 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 467 insertions(+), 5 deletions(-) create mode 100644 include/linux/rcupdate_trace.h (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h new file mode 100644 index 000000000000..ed97e10817bd --- /dev/null +++ b/include/linux/rcupdate_trace.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update mechanism for mutual exclusion, adapted for tracing. + * + * Copyright (C) 2020 Paul E. McKenney. + */ + +#ifndef __LINUX_RCUPDATE_TRACE_H +#define __LINUX_RCUPDATE_TRACE_H + +#include +#include + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +extern struct lockdep_map rcu_trace_lock_map; + +static inline int rcu_read_lock_trace_held(void) +{ + return lock_is_held(&rcu_trace_lock_map); +} + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +static inline int rcu_read_lock_trace_held(void) +{ + return 1; +} + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +void rcu_read_unlock_trace_special(struct task_struct *t); + +/** + * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section + * + * When synchronize_rcu_trace() is invoked by one task, then that task + * is guaranteed to block until all other tasks exit their read-side + * critical sections. Similarly, if call_rcu_trace() is invoked on one + * task while other tasks are within RCU read-side critical sections, + * invocation of the corresponding RCU callback is deferred until after + * the all the other tasks exit their critical sections. + * + * For more details, please see the documentation for rcu_read_lock(). + */ +static inline void rcu_read_lock_trace(void) +{ + struct task_struct *t = current; + + WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + rcu_lock_acquire(&rcu_trace_lock_map); +} + +/** + * rcu_read_unlock_trace - mark end of RCU-trace read-side critical section + * + * Pairs with a preceding call to rcu_read_lock_trace(), and nesting is + * allowed. Invoking a rcu_read_unlock_trace() when there is no matching + * rcu_read_lock_trace() is verboten, and will result in lockdep complaints. + * + * For more details, please see the documentation for rcu_read_unlock(). + */ +static inline void rcu_read_unlock_trace(void) +{ + int nesting; + struct task_struct *t = current; + + rcu_lock_release(&rcu_trace_lock_map); + nesting = READ_ONCE(t->trc_reader_nesting) - 1; + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (likely(!READ_ONCE(t->trc_reader_need_end)) || nesting) + return; // We assume shallow reader nesting. + rcu_read_unlock_trace_special(t); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +void synchronize_rcu_tasks_trace(void); +void rcu_barrier_tasks_trace(void); + +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + +#endif /* __LINUX_RCUPDATE_TRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a4b727f57095..864f60e51c41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -724,6 +724,14 @@ struct task_struct { struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + int trc_reader_nesting; + int trc_ipi_to_cpu; + bool trc_reader_need_end; + bool trc_reader_checked; + struct list_head trc_holdout_list; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + struct sched_info sched_info; struct list_head tasks; diff --git a/init/init_task.c b/init/init_task.c index bd403ed3e418..e8b3740ee598 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -141,6 +141,10 @@ struct task_struct init_task .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, #endif +#ifdef CONFIG_TASKS_TRACE_RCU + .trc_reader_nesting = 0, + .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), +#endif #ifdef CONFIG_CPUSETS .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), #endif diff --git a/kernel/fork.c b/kernel/fork.c index 8c700f881d92..72e9396235b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1683,6 +1683,10 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 6ee6372a4459..cb1d18ef343c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -71,7 +71,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU_GENERIC - def_bool TASKS_RCU || TASKS_RUDE_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU select SRCU help This option enables generic infrastructure code supporting @@ -93,6 +93,15 @@ config TASKS_RUDE_RCU switches on all online CPUs, including idle ones, so use with caution. +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. + config RCU_STALL_COMMON def_bool TREE_RCU help diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d8b09d5d1db1..fd34fd673a8c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -181,12 +181,17 @@ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) preempt_enable(); } +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + /* Do the srcu_read_unlock() for the above synchronize_srcu(). */ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) { + struct task_struct *t = current; + preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); preempt_enable(); + exit_tasks_rcu_finish_trace(t); } #ifndef CONFIG_TINY_RCU @@ -196,15 +201,19 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) */ static void __init rcu_tasks_bootup_oddness(void) { -#ifdef CONFIG_TASKS_RCU +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU pr_info("\tRude variant of Tasks RCU enabled.\n"); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -569,3 +578,347 @@ static int __init rcu_spawn_tasks_rude_kthread(void) core_initcall(rcu_spawn_tasks_rude_kthread); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instruction, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +atomic_t trc_n_readers_need_end; // Number of waited-for readers. +DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +/* If we are the last reader, wake up the grace-period kthread. */ +void rcu_read_unlock_trace_special(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + if (atomic_dec_and_test(&trc_n_readers_need_end)) + wake_up(&trc_wait); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + // If the task is in a read-side critical section, set up its + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + if (unlikely(t->trc_reader_nesting)) { + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_need_end); + WRITE_ONCE(t->trc_reader_need_end, true); + } + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. + if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection. + get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If currently running, send an IPI, either way, add to list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Wait for CPU-hotplug paths to complete. + cpus_read_lock(); + cpus_read_unlock(); + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_need_end, false); + t->trc_reader_checked = false; + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* Do intermediate processing between task and holdout scans. */ +static void rcu_tasks_trace_postscan(void) +{ + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool ndrpt, bool *frptp) +{ + struct task_struct *g, *t; + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(void) +{ + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + wait_event_idle_exclusive(trc_wait, + atomic_read(&trc_n_readers_need_end) == 0); + + smp_mb(); // Caller's code must be ordered after wakeup. +} + +/* Report any needed quiescent state for this exiting task. */ +void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) + rcu_read_unlock_trace_special(t); +} + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/** + * call_rcu_tasks_trace() - Queue a callback trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_trace() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ -- cgit v1.2.3-71-gd317 From 43766c3eadcf6033c92eb953f88801aebac0f785 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Mar 2020 20:38:29 -0700 Subject: rcu-tasks: Make RCU Tasks Trace make use of RCU scheduler hooks This commit makes the calls to rcu_tasks_qs() detect and report quiescent states for RCU tasks trace. If the task is in a quiescent state and if ->trc_reader_checked is not yet set, the task sets its own ->trc_reader_checked. This will cause the grace-period kthread to remove it from the holdout list if it still remains there. [ paulmck: Fix conditional compilation per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 44 +++++++++++++++++++++++++++++++++++++------- include/linux/rcutiny.h | 2 +- kernel/rcu/tasks.h | 5 +++-- kernel/rcu/tree_plugin.h | 6 ++---- 4 files changed, 43 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2be97a83f266..659cbfa7581a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -131,20 +131,50 @@ static inline void rcu_init_nohz(void) { } * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC -#define rcu_tasks_qs(t) \ - do { \ - if (READ_ONCE((t)->rcu_tasks_holdout)) \ - WRITE_ONCE((t)->rcu_tasks_holdout, false); \ + +# ifdef CONFIG_TASKS_RCU +# define rcu_tasks_classic_qs(t, preempt) \ + do { \ + if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ + WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +# else +# define rcu_tasks_classic_qs(t, preempt) do { } while (0) +# define call_rcu_tasks call_rcu +# define synchronize_rcu_tasks synchronize_rcu +# endif + +# ifdef CONFIG_TASKS_RCU_TRACE +# define rcu_tasks_trace_qs(t) \ + do { \ + if (!likely(READ_ONCE((t)->trc_reader_checked)) && \ + !unlikely(READ_ONCE((t)->trc_reader_nesting))) { \ + smp_store_release(&(t)->trc_reader_checked, true); \ + smp_mb(); /* Readers partitioned by store. */ \ + } \ + } while (0) +# else +# define rcu_tasks_trace_qs(t) do { } while (0) +# endif + +#define rcu_tasks_qs(t, preempt) \ +do { \ + rcu_tasks_classic_qs((t), (preempt)); \ + rcu_tasks_trace_qs((t)); \ +} while (0) + +# ifdef CONFIG_TASKS_RUDE_RCU void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); +# endif + +#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ -#define rcu_tasks_qs(t) do { } while (0) +#define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu @@ -161,7 +191,7 @@ static inline void exit_tasks_rcu_finish(void) { } */ #define cond_resched_tasks_rcu_qs() \ do { \ - rcu_tasks_qs(current); \ + rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 045c28b71f4f..d77e11186afd 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -49,7 +49,7 @@ static inline void rcu_softirq_qs(void) #define rcu_note_context_switch(preempt) \ do { \ rcu_qs(); \ - rcu_tasks_qs(current); \ + rcu_tasks_qs(current, (preempt)); \ } while (0) static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c93fb29b460c..6f8a4040fbdd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -180,7 +180,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Pick up any new callbacks. */ raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - smp_mb__after_unlock_lock(); // Order updates vs. GP. + smp_mb__after_spinlock(); // Order updates vs. GP. list = rtp->cbs_head; rtp->cbs_head = NULL; rtp->cbs_tail = &rtp->cbs_head; @@ -874,7 +874,7 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { WRITE_ONCE(t->trc_reader_need_end, false); - t->trc_reader_checked = false; + WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); } @@ -983,6 +983,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); } smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. } /* Report any needed quiescent state for this exiting task. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4f34c325dd90..37e02812d18f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -331,8 +331,7 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -841,8 +840,7 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); out: trace_rcu_utilization(TPS("End context switch")); } -- cgit v1.2.3-71-gd317 From 276c410448dbca357a2bc3539acfe04862e5f172 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Mar 2020 16:02:06 -0700 Subject: rcu-tasks: Split ->trc_reader_need_end This commit splits ->trc_reader_need_end by using the rcu_special union. This change permits readers to check to see if a memory barrier is required without any added overhead in the common case where no such barrier is required. This commit also adds the read-side checking. Later commits will add the machinery to properly set the new ->trc_reader_special.b.need_mb field. This commit also makes rcu_read_unlock_trace_special() tolerate nested read-side critical sections within interrupt and NMI handlers. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 11 +++++++---- include/linux/sched.h | 4 ++-- init/init_task.c | 1 + kernel/fork.c | 1 + kernel/rcu/tasks.h | 33 ++++++++++++++++++++------------- 5 files changed, 31 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index ed97e10817bd..c42b365ca176 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -31,7 +31,7 @@ static inline int rcu_read_lock_trace_held(void) #ifdef CONFIG_TASKS_TRACE_RCU -void rcu_read_unlock_trace_special(struct task_struct *t); +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section @@ -50,6 +50,8 @@ static inline void rcu_read_lock_trace(void) struct task_struct *t = current; WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + if (t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers rcu_lock_acquire(&rcu_trace_lock_map); } @@ -69,10 +71,11 @@ static inline void rcu_read_unlock_trace(void) rcu_lock_release(&rcu_trace_lock_map); nesting = READ_ONCE(t->trc_reader_nesting) - 1; - WRITE_ONCE(t->trc_reader_nesting, nesting); - if (likely(!READ_ONCE(t->trc_reader_need_end)) || nesting) + if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) { + WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. - rcu_read_unlock_trace_special(t); + } + rcu_read_unlock_trace_special(t, nesting); } void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); diff --git a/include/linux/sched.h b/include/linux/sched.h index 864f60e51c41..9437b53cc603 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -613,7 +613,7 @@ union rcu_special { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ - u8 pad; /* No garbage from compiler! */ + u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; @@ -727,7 +727,7 @@ struct task_struct { #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; - bool trc_reader_need_end; + union rcu_special trc_reader_special; bool trc_reader_checked; struct list_head trc_holdout_list; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/init/init_task.c b/init/init_task.c index e8b3740ee598..825972daec32 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -143,6 +143,7 @@ struct task_struct init_task #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, + .trc_reader_special.s = 0, .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), #endif #ifdef CONFIG_CPUSETS diff --git a/kernel/fork.c b/kernel/fork.c index 72e9396235b4..96eb4b535ced 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1685,6 +1685,7 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index eeac4a122234..17b1b9a31071 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -723,10 +723,17 @@ DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); /* If we are the last reader, wake up the grace-period kthread. */ -void rcu_read_unlock_trace_special(struct task_struct *t) +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - WRITE_ONCE(t->trc_reader_need_end, false); - if (atomic_dec_and_test(&trc_n_readers_need_end)) + int nq = t->trc_reader_special.b.need_qs; + + if (t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) wake_up(&trc_wait); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -777,8 +784,8 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -804,8 +811,8 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // exit from that critical section. if (unlikely(t->trc_reader_nesting)) { atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(t->trc_reader_need_end); - WRITE_ONCE(t->trc_reader_need_end, true); + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); } return true; } @@ -884,7 +891,7 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { - WRITE_ONCE(t->trc_reader_need_end, false); + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); @@ -916,7 +923,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], t->trc_reader_nesting, - " N"[!!t->trc_reader_need_end], + " N"[!!t->trc_reader_special.b.need_qs], cpu); sched_show_task(t); } @@ -980,11 +987,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) break; // Count reached zero. // Stall warning time, so make a list of the offenders. for_each_process_thread(g, t) - if (READ_ONCE(t->trc_reader_need_end)) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); firstreport = true; list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) - if (READ_ONCE(t->trc_reader_need_end)) { + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { show_stalled_task_trace(t, &firstreport); trc_del_holdout(t); } @@ -1003,8 +1010,8 @@ void exit_tasks_rcu_finish_trace(struct task_struct *t) WRITE_ONCE(t->trc_reader_checked, true); WARN_ON_ONCE(t->trc_reader_nesting); WRITE_ONCE(t->trc_reader_nesting, 0); - if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_need_end))) - rcu_read_unlock_trace_special(t); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); } /** -- cgit v1.2.3-71-gd317 From 9ae58d7bd11f1fc4c96389df11751f8593d8bd33 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Mar 2020 17:16:37 -0700 Subject: rcu-tasks: Add Kconfig option to mediate smp_mb() vs. IPI This commit provides a new TASKS_TRACE_RCU_READ_MB Kconfig option that enables use of read-side memory barriers by both rcu_read_lock_trace() and rcu_read_unlock_trace() when the are executed with the current->trc_reader_special.b.need_mb flag set. This flag is currently never set. Doing that is the subject of a later commit. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 3 ++- kernel/rcu/Kconfig | 18 ++++++++++++++++++ kernel/rcu/tasks.h | 3 ++- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index c42b365ca176..4c25a41f8b27 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -50,7 +50,8 @@ static inline void rcu_read_lock_trace(void) struct task_struct *t = current; WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); - if (t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers rcu_lock_acquire(&rcu_trace_lock_map); } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index cb1d18ef343c..0ebe15a84985 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -234,4 +234,22 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. +config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. + + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4857450c0430..4147857007d7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -734,7 +734,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { int nq = t->trc_reader_special.b.need_qs; - if (t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. if (nq) -- cgit v1.2.3-71-gd317 From be44ae62431196ac2a55198c0855028fff3ccfb4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 25 Feb 2020 21:09:19 -0800 Subject: locktorture.c: Fix if-statement empty body warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using -Wextra, gcc complains about torture_preempt_schedule() when its definition is empty (i.e., when CONFIG_PREEMPTION is not set/enabled). Fix these warnings by adding an empty do-while block for that macro when CONFIG_PREEMPTION is not set. Fixes these build warnings: ../kernel/locking/locktorture.c:119:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:166:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:337:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:490:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:528:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] ../kernel/locking/locktorture.c:553:29: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] I have verified that there is no object code change (with gcc 7.5.0). Signed-off-by: Randy Dunlap Cc: Davidlohr Bueso Cc: Josh Triplett Signed-off-by: "Paul E. McKenney" --- include/linux/torture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 6241f59e2d6f..629b66e6c161 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -89,7 +89,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #ifdef CONFIG_PREEMPTION #define torture_preempt_schedule() preempt_schedule() #else -#define torture_preempt_schedule() +#define torture_preempt_schedule() do { } while (0) #endif #endif /* __LINUX_TORTURE_H */ -- cgit v1.2.3-71-gd317 From 6553896666433e7efec589838b400a2a652b3ffa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Mar 2020 22:47:17 +0100 Subject: vmlinux.lds.h: Create section for protection against instrumentation Some code pathes, especially the low level entry code, must be protected against instrumentation for various reasons: - Low level entry code can be a fragile beast, especially on x86. - With NO_HZ_FULL RCU state needs to be established before using it. Having a dedicated section for such code allows to validate with tooling that no unsafe functions are invoked. Add the .noinstr.text section and the noinstr attribute to mark functions. noinstr implies notrace. Kprobes will gain a section check later. Provide also a set of markers: instrumentation_begin()/end() These are used to mark code inside a noinstr function which calls into regular instrumentable text section as safe. The instrumentation markers are only active when CONFIG_DEBUG_ENTRY is enabled as the end marker emits a NOP to prevent the compiler from merging the annotation points. This means the objtool verification requires a kernel compiled with this option. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.075416272@linutronix.de --- arch/powerpc/kernel/vmlinux.lds.S | 1 + include/asm-generic/sections.h | 3 +++ include/asm-generic/vmlinux.lds.h | 10 ++++++++ include/linux/compiler.h | 53 +++++++++++++++++++++++++++++++++++++++ include/linux/compiler_types.h | 4 +++ scripts/mod/modpost.c | 2 +- 6 files changed, 72 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 31a0f201fb6f..a1706b63b82d 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -90,6 +90,7 @@ SECTIONS #ifdef CONFIG_PPC64 *(.tramp.ftrace.text); #endif + NOINSTR_TEXT SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index d1779d442aa5..66397ed10acb 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -53,6 +53,9 @@ extern char __ctors_start[], __ctors_end[]; /* Start and end of .opd section - used for function descriptors. */ extern char __start_opd[], __end_opd[]; +/* Start and end of instrumentation protected text section */ +extern char __noinstr_text_start[], __noinstr_text_end[]; + extern __visible const void __nosave_begin, __nosave_end; /* Function descriptor handling (if any). Override in asm/sections.h */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 71e387a5fe90..db600ef218d7 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -540,6 +540,15 @@ . = ALIGN((align)); \ __end_rodata = .; +/* + * Non-instrumentable text section + */ +#define NOINSTR_TEXT \ + ALIGN_FUNCTION(); \ + __noinstr_text_start = .; \ + *(.noinstr.text) \ + __noinstr_text_end = .; + /* * .text section. Map to function alignment to avoid address changes * during second ld run in second ld pass when generating System.map @@ -551,6 +560,7 @@ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ *(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \ + NOINSTR_TEXT \ *(.text..refcount) \ *(.ref.text) \ MEM_KEEP(init.text*) \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 034b0a644efc..e9ead0505671 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -120,12 +120,65 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(.rodata..c_jump_table) +#ifdef CONFIG_DEBUG_ENTRY +/* Begin/end of an instrumentation safe region */ +#define instrumentation_begin() ({ \ + asm volatile("%c0:\n\t" \ + ".pushsection .discard.instr_begin\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ +}) + +/* + * Because instrumentation_{begin,end}() can nest, objtool validation considers + * _begin() a +1 and _end() a -1 and computes a sum over the instructions. + * When the value is greater than 0, we consider instrumentation allowed. + * + * There is a problem with code like: + * + * noinstr void foo() + * { + * instrumentation_begin(); + * ... + * if (cond) { + * instrumentation_begin(); + * ... + * instrumentation_end(); + * } + * bar(); + * instrumentation_end(); + * } + * + * If instrumentation_end() would be an empty label, like all the other + * annotations, the inner _end(), which is at the end of a conditional block, + * would land on the instruction after the block. + * + * If we then consider the sum of the !cond path, we'll see that the call to + * bar() is with a 0-value, even though, we meant it to happen with a positive + * value. + * + * To avoid this, have _end() be a NOP instruction, this ensures it will be + * part of the condition block and does not escape. + */ +#define instrumentation_end() ({ \ + asm volatile("%c0: nop\n\t" \ + ".pushsection .discard.instr_end\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__COUNTER__)); \ +}) +#endif /* CONFIG_DEBUG_ENTRY */ + #else #define annotate_reachable() #define annotate_unreachable() #define __annotate_jump_table #endif +#ifndef instrumentation_begin +#define instrumentation_begin() do { } while(0) +#define instrumentation_end() do { } while(0) +#endif + #ifndef ASM_UNREACHABLE # define ASM_UNREACHABLE #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e970f97a7fcb..5da257cbebf1 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -118,6 +118,10 @@ struct ftrace_likely_data { #define notrace __attribute__((__no_instrument_function__)) #endif +/* Section for code which can't be instrumented at all */ +#define noinstr \ + noinline notrace __attribute((__section__(".noinstr.text"))) + /* * it doesn't make sense on ARM (currently the only user of __naked) * to trace naked functions because then mcount is called without diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5c3c50c5ec52..0053d4fea847 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -948,7 +948,7 @@ static void check_section(const char *modname, struct elf_info *elf, #define DATA_SECTIONS ".data", ".data.rel" #define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \ - ".kprobes.text", ".cpuidle.text" + ".kprobes.text", ".cpuidle.text", ".noinstr.text" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", ".text.*", \ ".coldtext" -- cgit v1.2.3-71-gd317 From 69ea03b56ed2c7189ccd0b5910ad39f3cad1df21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Feb 2020 09:46:47 +0100 Subject: hardirq/nmi: Allow nested nmi_enter() Since there are already a number of sites (ARM64, PowerPC) that effectively nest nmi_enter(), make the primitive support this before adding even more. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Acked-by: Will Deacon Cc: Michael Ellerman Link: https://lkml.kernel.org/r/20200505134100.864179229@linutronix.de --- arch/arm64/kernel/sdei.c | 14 ++------------ arch/arm64/kernel/traps.c | 8 ++------ arch/powerpc/kernel/traps.c | 22 ++++++---------------- include/linux/hardirq.h | 5 ++++- include/linux/preempt.h | 4 ++-- 5 files changed, 16 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index d6259dac62b6..e396e69e33a1 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -251,22 +251,12 @@ asmlinkage __kprobes notrace unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { unsigned long ret; - bool do_nmi_exit = false; - /* - * nmi_enter() deals with printk() re-entrance and use of RCU when - * RCU believed this CPU was idle. Because critical events can - * interrupt normal events, we may already be in_nmi(). - */ - if (!in_nmi()) { - nmi_enter(); - do_nmi_exit = true; - } + nmi_enter(); ret = _sdei_handler(regs, arg); - if (do_nmi_exit) - nmi_exit(); + nmi_exit(); return ret; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cf402be5c573..c728f163f329 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -906,17 +906,13 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) { - const bool was_in_nmi = in_nmi(); - - if (!was_in_nmi) - nmi_enter(); + nmi_enter(); /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); - if (!was_in_nmi) - nmi_exit(); + nmi_exit(); } asmlinkage void enter_from_user_mode(void) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 3fca22276bb1..b44dd75de517 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -441,15 +441,9 @@ nonrecoverable: void system_reset_exception(struct pt_regs *regs) { unsigned long hsrr0, hsrr1; - bool nested = in_nmi(); bool saved_hsrrs = false; - /* - * Avoid crashes in case of nested NMI exceptions. Recoverability - * is determined by RI and in_nmi - */ - if (!nested) - nmi_enter(); + nmi_enter(); /* * System reset can interrupt code where HSRRs are live and MSR[RI]=1. @@ -521,8 +515,7 @@ out: mtspr(SPRN_HSRR1, hsrr1); } - if (!nested) - nmi_exit(); + nmi_exit(); /* What should we do here? We could issue a shutdown or hard reset. */ } @@ -823,9 +816,8 @@ int machine_check_generic(struct pt_regs *regs) void machine_check_exception(struct pt_regs *regs) { int recover = 0; - bool nested = in_nmi(); - if (!nested) - nmi_enter(); + + nmi_enter(); __this_cpu_inc(irq_stat.mce_exceptions); @@ -851,8 +843,7 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - if (!nested) - nmi_exit(); + nmi_exit(); die("Machine check", regs, SIGBUS); @@ -863,8 +854,7 @@ void machine_check_exception(struct pt_regs *regs) return; bail: - if (!nested) - nmi_exit(); + nmi_exit(); } void SMIException(struct pt_regs *regs) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 7c8b82f69288..a043ad826c67 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -65,13 +65,16 @@ extern void irq_exit(void); #define arch_nmi_exit() do { } while (0) #endif +/* + * nmi_enter() can nest up to 15 times; see NMI_BITS. + */ #define nmi_enter() \ do { \ arch_nmi_enter(); \ printk_nmi_enter(); \ lockdep_off(); \ ftrace_nmi_enter(); \ - BUG_ON(in_nmi()); \ + BUG_ON(in_nmi() == NMI_MASK); \ preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ lockdep_hardirq_enter(); \ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index bc3f1aecaa19..7d9c1c0e149c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -26,13 +26,13 @@ * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 + * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 -#define NMI_BITS 1 +#define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) -- cgit v1.2.3-71-gd317 From e616cb8daadf637175af4fe53138a94c190c4816 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:14:51 +0100 Subject: lockdep: Always inline lockdep_{off,on}() These functions are called {early,late} in nmi_{enter,exit} and should not be traced or probed. They are also puny, so 'inline' them. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.048523500@linutronix.de --- include/linux/lockdep.h | 23 +++++++++++++++++++++-- kernel/locking/lockdep.c | 19 ------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 206774ac6946..8fce5c98a4b0 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -308,8 +308,27 @@ extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); -extern void lockdep_off(void); -extern void lockdep_on(void); +/* + * Split the recrursion counter in two to readily detect 'off' vs recursion. + */ +#define LOCKDEP_RECURSION_BITS 16 +#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) +#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) + +/* + * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due + * to header dependencies. + */ + +#define lockdep_off() \ +do { \ + current->lockdep_recursion += LOCKDEP_OFF; \ +} while (0) + +#define lockdep_on() \ +do { \ + current->lockdep_recursion -= LOCKDEP_OFF; \ +} while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..6f1c8cba09c6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,25 +393,6 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -/* - * Split the recrursion counter in two to readily detect 'off' vs recursion. - */ -#define LOCKDEP_RECURSION_BITS 16 -#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) -#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) - -void lockdep_off(void) -{ - current->lockdep_recursion += LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion -= LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_on); - static inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) -- cgit v1.2.3-71-gd317 From 178ba00c354eb15cec6806a812771e60a5ae3ea1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2020 22:26:21 +0100 Subject: sh/ftrace: Move arch_ftrace_nmi_{enter,exit} into nmi exception SuperH is the last remaining user of arch_ftrace_nmi_{enter,exit}(), remove it from the generic code and into the SuperH code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Steven Rostedt (VMware) Cc: Rich Felker Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200505134101.248881738@linutronix.de --- Documentation/trace/ftrace-design.rst | 8 -------- arch/sh/Kconfig | 1 - arch/sh/kernel/traps.c | 12 ++++++++++++ include/linux/ftrace_irq.h | 11 ----------- kernel/trace/Kconfig | 10 ---------- 5 files changed, 12 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/Documentation/trace/ftrace-design.rst b/Documentation/trace/ftrace-design.rst index a8e22e0db63c..6893399157f0 100644 --- a/Documentation/trace/ftrace-design.rst +++ b/Documentation/trace/ftrace-design.rst @@ -229,14 +229,6 @@ Adding support for it is easy: just define the macro in asm/ftrace.h and pass the return address pointer as the 'retp' argument to ftrace_push_return_trace(). -HAVE_FTRACE_NMI_ENTER ---------------------- - -If you can't trace NMI functions, then skip this option. - -
- - HAVE_SYSCALL_TRACEPOINTS ------------------------ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b4f0e37b83eb..97656d20b9ea 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -71,7 +71,6 @@ config SUPERH32 select HAVE_FUNCTION_TRACER select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select ARCH_WANT_IPC_PARSE_VERSION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_ARCH_KGDB diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 63cf17bc760d..2130381c9d57 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -170,11 +170,21 @@ BUILD_TRAP_HANDLER(bug) force_sig(SIGTRAP); } +#ifdef CONFIG_DYNAMIC_FTRACE +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + BUILD_TRAP_HANDLER(nmi) { unsigned int cpu = smp_processor_id(); TRAP_HANDLER_DECL; + arch_ftrace_nmi_enter(); + nmi_enter(); nmi_count(cpu)++; @@ -190,4 +200,6 @@ BUILD_TRAP_HANDLER(nmi) } nmi_exit(); + + arch_ftrace_nmi_exit(); } diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index ccda97dc7f8b..0abd9a1d2852 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,15 +2,6 @@ #ifndef _LINUX_FTRACE_IRQ_H #define _LINUX_FTRACE_IRQ_H - -#ifdef CONFIG_FTRACE_NMI_ENTER -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_HWLAT_TRACER extern bool trace_hwlat_callback_enabled; extern void trace_hwlat_callback(bool enter); @@ -22,12 +13,10 @@ static inline void ftrace_nmi_enter(void) if (trace_hwlat_callback_enabled) trace_hwlat_callback(true); #endif - arch_ftrace_nmi_enter(); } static inline void ftrace_nmi_exit(void) { - arch_ftrace_nmi_exit(); #ifdef CONFIG_HWLAT_TRACER if (trace_hwlat_callback_enabled) trace_hwlat_callback(false); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1e9a8f9a3459..24876faac753 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool -config HAVE_FTRACE_NMI_ENTER - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_FUNCTION_TRACER bool help @@ -72,11 +67,6 @@ config RING_BUFFER select TRACE_CLOCK select IRQ_WORK -config FTRACE_NMI_ENTER - bool - depends on HAVE_FTRACE_NMI_ENTER - default y - config EVENT_TRACING select CONTEXT_SWITCH_TRACER select GLOB -- cgit v1.2.3-71-gd317 From f93524eb9c54f49be150167918f6546b0a2e09b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Feb 2020 21:01:16 +0100 Subject: sched,rcu,tracing: Avoid tracing before in_nmi() is correct If a tracer is invoked before in_nmi() becomes true, the tracer can no longer detect it is called from NMI context and behave correctly. Therefore change nmi_{enter,exit}() to use __preempt_count_{add,sub}() as the normal preempt_count_{add,sub}() have a (desired) function trace entry. This fixes a potential issue with the current code; when the function-tracer has stack-tracing enabled __trace_stack() will malfunction when it hits the preempt_count_add() function entry from NMI context. Suggested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.434193525@linutronix.de --- include/linux/hardirq.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index a043ad826c67..621556efe45f 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -65,6 +65,15 @@ extern void irq_exit(void); #define arch_nmi_exit() do { } while (0) #endif +/* + * NMI vs Tracing + * -------------- + * + * We must not land in a tracer until (or after) we've changed preempt_count + * such that in_nmi() becomes true. To that effect all NMI C entry points must + * be marked 'notrace' and call nmi_enter() as soon as possible. + */ + /* * nmi_enter() can nest up to 15 times; see NMI_BITS. */ @@ -75,7 +84,7 @@ extern void irq_exit(void); lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi() == NMI_MASK); \ - preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ + __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ lockdep_hardirq_enter(); \ } while (0) @@ -85,7 +94,7 @@ extern void irq_exit(void); lockdep_hardirq_exit(); \ rcu_nmi_exit(); \ BUG_ON(!in_nmi()); \ - preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ + __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ printk_nmi_exit(); \ -- cgit v1.2.3-71-gd317 From 5567d11c21a1d508a91a8cb64a819783a0835d9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Feb 2020 10:22:06 +0100 Subject: x86/mce: Send #MC singal from task work Convert #MC over to using task_work_add(); it will run the same code slightly later, on the return to user path of the same exception. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.957390899@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 56 +++++++++++++++++++++++------------------- include/linux/sched.h | 6 +++++ 2 files changed, 37 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 98bf91cd7d5d..2f0ef95795f3 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1086,23 +1087,6 @@ static void mce_clear_state(unsigned long *toclear) } } -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - /* * Cases where we avoid rendezvous handler timeout: * 1) If this CPU is offline. @@ -1204,6 +1188,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, *m = *final; } +static void kill_me_now(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + if (!(p->mce_status & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + return; + } + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1222,7 +1229,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. */ -void notrace do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs, long error_code) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1354,13 +1361,13 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) if ((m.cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - local_irq_enable(); - preempt_enable(); - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS); - preempt_disable(); - local_irq_disable(); + current->mce_addr = m.addr; + current->mce_status = m.mcgstatus; + current->mce_kill_me.func = kill_me_maybe; + if (kill_it) + current->mce_kill_me.func = kill_me_now; + task_work_add(current, ¤t->mce_kill_me, true); } else { if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); @@ -1370,7 +1377,6 @@ out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); -NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9437b53cc603..57d0ed061ae4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1297,6 +1297,12 @@ struct task_struct { unsigned long prev_lowest_stack; #endif +#ifdef CONFIG_X86_MCE + u64 mce_addr; + u64 mce_status; + struct callback_head mce_kill_me; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. -- cgit v1.2.3-71-gd317 From 8ae0ae6737ad449c8ae21e2bb01d9736f360a933 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 3 May 2020 15:08:52 +0200 Subject: rcu: Provide rcu_irq_exit_preempt() Interrupts and exceptions invoke rcu_irq_enter() on entry and need to invoke rcu_irq_exit() before they either return to the interrupted code or invoke the scheduler due to preemption. The general assumption is that RCU idle code has to have preemption disabled so that a return from interrupt cannot schedule. So the return from interrupt code invokes rcu_irq_exit() and preempt_schedule_irq(). If there is any imbalance in the rcu_irq/nmi* invocations or RCU idle code had preemption enabled then this goes unnoticed until the CPU goes idle or some other RCU check is executed. Provide rcu_irq_exit_preempt() which can be invoked from the interrupt/exception return code in case that preemption is enabled. It invokes rcu_irq_exit() and contains a few sanity checks in case that CONFIG_PROVE_RCU is enabled to catch such issues directly. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.364456424@linutronix.de --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 3465ba704a11..980eb78751d9 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -71,6 +71,7 @@ static inline void rcu_irq_enter(void) { } static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } +static inline void rcu_irq_exit_preempt(void) { } static inline void exit_rcu(void) { } static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index fbc26274af4d..02016e0aa8eb 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -47,6 +47,7 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +void rcu_irq_exit_preempt(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 945401674b9d..62ee01299386 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -743,6 +743,28 @@ void noinstr rcu_irq_exit(void) rcu_nmi_exit(); } +/** + * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq + * towards in kernel preemption + * + * Same as rcu_irq_exit() but has a sanity check that scheduling is safe + * from RCU point of view. Invoked from return from interrupt before kernel + * preemption. + */ +void rcu_irq_exit_preempt(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nmi_exit(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * -- cgit v1.2.3-71-gd317 From b1fcf9b83c4149c63d1e0c699e85f93cbe28e211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 May 2020 09:44:43 +0200 Subject: rcu: Provide __rcu_is_watching() Same as rcu_is_watching() but without the preempt_disable/enable() pair inside the function. It is merked noinstr so it ends up in the non-instrumentable text section. This is useful for non-preemptible code especially in the low level entry section. Using rcu_is_watching() there results in a call to the preempt_schedule_notrace() thunk which triggers noinstr section warnings in objtool. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200512213810.518709291@linutronix.de --- include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 5 +++++ 3 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 980eb78751d9..c869fb20cc51 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -86,6 +86,7 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } +static inline bool __rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } static inline bool rcu_gp_might_be_stalled(void) { return false; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 02016e0aa8eb..9366fa4d0717 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -58,6 +58,7 @@ extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); +bool __rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62ee01299386..90c8be22d57a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -985,6 +985,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } +noinstr bool __rcu_is_watching(void) +{ + return !rcu_dynticks_curr_cpu_in_eqs(); +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * -- cgit v1.2.3-71-gd317