From 459b09b5a3254008b63382bf41a9b36d0b590f57 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 18 May 2021 14:07:25 +0100 Subject: sched/debug: Don't update sched_domain debug directories before sched_debug_init() Since CPU capacity asymmetry can stem purely from maximum frequency differences (e.g. Pixel 1), a rebuild of the scheduler topology can be issued upon loading cpufreq, see: arch_topology.c::init_cpu_capacity_callback() Turns out that if this rebuild happens *before* sched_debug_init() is run (which is a late initcall), we end up messing up the sched_domain debug directory: passing a NULL parent to debugfs_create_dir() ends up creating the directory at the debugfs root, which in this case creates /sys/kernel/debug/domains (instead of /sys/kernel/debug/sched/domains). This currently doesn't happen on asymmetric systems which use cpufreq-scpi or cpufreq-dt drivers, as those are loaded via deferred_probe_initcall() (it is also a late initcall, but appears to be ordered *after* sched_debug_init()). Ionela has been working on detecting maximum frequency asymmetry via ACPI, and that actually happens via a *device* initcall, thus before sched_debug_init(), and causes the aforementionned debugfs mayhem. One option would be to punt sched_debug_init() down to fs_initcall_sync(). Preventing update_sched_domain_debugfs() from running before sched_debug_init() appears to be the safer option. Fixes: 3b87f136f8fc ("sched,debug: Convert sysctl sched_domains to debugfs") Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: http://lore.kernel.org/r/20210514095339.12979-1-ionela.voinescu@arm.com --- kernel/sched/debug.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0c5ec2776ddf..7e08e3d947c2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void) { int cpu, i; + /* + * This can unfortunately be invoked before sched_debug_init() creates + * the debug directory. Don't touch sd_sysctl_cpus until then. + */ + if (!debugfs_sched) + return; + if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; -- cgit v1.2.3-71-gd317 From 18765447c3b7867b3f8cccde52dc9d822852e71b Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Sun, 6 Jun 2021 19:54:51 +0800 Subject: sched/sysctl: Move extern sysctl declarations to sched.h Since commit '8a99b6833c88(sched: Move SCHED_DEBUG sysctl to debugfs)', SCHED_DEBUG sysctls are moved to debugfs, so these extern sysctls in include/linux/sched/sysctl.h are no longer needed for sysctl.c, even some are no longer needed. So move those extern sysctls that needed by kernel/sched/debug.c to kernel/sched/sched.h, and remove others that are no longer needed. Signed-off-by: Hailong Liu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210606115451.26745-1-liuhailongg6@163.com --- include/linux/sched/sysctl.h | 18 ------------------ kernel/sched/sched.h | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index db2c0f34aaaf..304f431178fd 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -28,30 +28,12 @@ enum { sysctl_hung_task_timeout_secs = 0 }; extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; - enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, SCHED_TUNABLESCALING_LINEAR, SCHED_TUNABLESCALING_END, }; -extern unsigned int sysctl_sched_tunable_scaling; - -extern unsigned int sysctl_numa_balancing_scan_delay; -extern unsigned int sysctl_numa_balancing_scan_period_min; -extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_size; - -#ifdef CONFIG_SCHED_DEBUG -extern __read_mostly unsigned int sysctl_sched_migration_cost; -extern __read_mostly unsigned int sysctl_sched_nr_migrate; - -extern int sysctl_resched_latency_warn_ms; -extern int sysctl_resched_latency_warn_once; -#endif /* * control realtime throttling: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c80d42e9589b..9a1c6aeb9165 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2385,6 +2385,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; + +extern unsigned int sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; +#endif + #ifdef CONFIG_SCHED_HRTICK /* -- cgit v1.2.3-71-gd317 From 031e3bd8986fffe31e1ddbf5264cccfe30c9abd7 Mon Sep 17 00:00:00 2001 From: Yuan ZhaoXiong Date: Sun, 6 Jun 2021 21:11:55 +0800 Subject: sched: Optimize housekeeping_cpumask() in for_each_cpu_and() On a 128 cores AMD machine, there are 8 cores in nohz_full mode, and the others are used for housekeeping. When many housekeeping cpus are in idle state, we can observe huge time burn in the loop for searching nearest busy housekeeper cpu by ftrace. 9) | get_nohz_timer_target() { 9) | housekeeping_test_cpu() { 9) 0.390 us | housekeeping_get_mask.part.1(); 9) 0.561 us | } 9) 0.090 us | __rcu_read_lock(); 9) 0.090 us | housekeeping_cpumask(); 9) 0.521 us | housekeeping_cpumask(); 9) 0.140 us | housekeeping_cpumask(); ... 9) 0.500 us | housekeeping_cpumask(); 9) | housekeeping_any_cpu() { 9) 0.090 us | housekeeping_get_mask.part.1(); 9) 0.100 us | sched_numa_find_closest(); 9) 0.491 us | } 9) 0.100 us | __rcu_read_unlock(); 9) + 76.163 us | } for_each_cpu_and() is a micro function, so in get_nohz_timer_target() function the for_each_cpu_and(i, sched_domain_span(sd), housekeeping_cpumask(HK_FLAG_TIMER)) equals to below: for (i = -1; i = cpumask_next_and(i, sched_domain_span(sd), housekeeping_cpumask(HK_FLAG_TIMER)), i < nr_cpu_ids;) That will cause that housekeeping_cpumask() will be invoked many times. The housekeeping_cpumask() function returns a const value, so it is unnecessary to invoke it every time. This patch can minimize the worst searching time from ~76us to ~16us in my testing. Similarly, the find_new_ilb() function has the same problem. Co-developed-by: Li RongQing Signed-off-by: Li RongQing Signed-off-by: Yuan ZhaoXiong Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1622985115-51007-1-git-send-email-yuanzhaoxiong@baidu.com --- kernel/sched/core.c | 6 ++++-- kernel/sched/fair.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2883c22eef10..0c22cd026440 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -993,6 +993,7 @@ int get_nohz_timer_target(void) { int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; + const struct cpumask *hk_mask; if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { if (!idle_cpu(cpu)) @@ -1000,10 +1001,11 @@ int get_nohz_timer_target(void) default_cpu = cpu; } + hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu_and(i, sched_domain_span(sd), - housekeeping_cpumask(HK_FLAG_TIMER)) { + for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 45edf61eed73..11d22943753f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10188,9 +10188,11 @@ static inline int on_null_domain(struct rq *rq) static inline int find_new_ilb(void) { int ilb; + const struct cpumask *hk_mask; - for_each_cpu_and(ilb, nohz.idle_cpus_mask, - housekeeping_cpumask(HK_FLAG_MISC)) { + hk_mask = housekeeping_cpumask(HK_FLAG_MISC); + + for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { if (ilb == smp_processor_id()) continue; -- cgit v1.2.3-71-gd317 From 0fc4dcc13f090c941abfab453a24945a4005b350 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 29 Jun 2021 11:39:07 +0200 Subject: bpf, devmap: Convert remaining READ_ONCE() to rcu_dereference_check() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There were a couple of READ_ONCE()-invocations left-over by the devmap RCU conversion. Convert these to rcu_dereference_check() as well to avoid complaints from sparse. Fixes: 782347b6bcad ("xdp: Add proper __rcu annotations to redirect map entries") Reported-by: kernel test robot Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Paul E. McKenney Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210629093907.573598-1-toke@redhat.com --- kernel/bpf/devmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2546dafd6672..fdc20892837c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -558,7 +558,8 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { - dst = READ_ONCE(dtab->netdev_map[i]); + dst = rcu_dereference_check(dtab->netdev_map[i], + rcu_read_lock_bh_held()); if (!is_valid_dst(dst, xdp, exclude_ifindex)) continue; @@ -654,7 +655,8 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { - dst = READ_ONCE(dtab->netdev_map[i]); + dst = rcu_dereference_check(dtab->netdev_map[i], + rcu_read_lock_bh_held()); if (!dst || dst->dev->ifindex == exclude_ifindex) continue; -- cgit v1.2.3-71-gd317 From 22b6d14992b733e9421a475f4d43df24629737ab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Jun 2021 12:37:44 -0700 Subject: scftorture: Avoid false-positive warnings in scftorture_invoker() If the call to set_cpus_allowed_ptr() in scftorture_invoker() fails, a later WARN_ONCE() complains. But with the advent of 570a752b7a9b ("lib/smp_processor_id: Use is_percpu_thread() instead of nr_cpus_allowed"), this complaint can be drowned out by complaints from smp_processor_id(). The rationale for this change is that scftorture's kthreads are not marked with PF_NO_SETAFFINITY, which means that a system administrator could change affinity at any time. However, scftorture is a torture test, and the system administrator might well have a valid test-the-test reason for changing affinity. This commit therefore changes to raw_smp_processor_id() in order to avoid the noise, and also adds a WARN_ON_ONCE() to the call to set_cpus_allowed_ptr() in order to directly detect immediate failure. There is no WARN_ON_ONCE() within the test loop, allowing human-reflex-based affinity resetting, if desired. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 2377cbb32474..29e8fc5d91a7 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -405,15 +405,15 @@ static int scftorture_invoker(void *arg) VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); cpu = scfp->cpu % nr_cpu_ids; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); + WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(cpu))); set_user_nice(current, MAX_NICE); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); - VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); + VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, raw_smp_processor_id()); // Make sure that the CPU is affinitized appropriately during testing. - curcpu = smp_processor_id(); + curcpu = raw_smp_processor_id(); WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids, "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n", __func__, scfp->cpu, curcpu, nr_cpu_ids); -- cgit v1.2.3-71-gd317 From 05bc276cf243d90b9f1eb6ae2962f41eeb53a741 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Jun 2021 09:24:43 -0700 Subject: refscale: Avoid false-positive warnings in ref_scale_reader() If the call to set_cpus_allowed_ptr() in ref_scale_reader() fails, a later WARN_ONCE() complains. But with the advent of 570a752b7a9b ("lib/smp_processor_id: Use is_percpu_thread() instead of nr_cpus_allowed"), this complaint can be drowned out by complaints from smp_processor_id(). The rationale for this change is that refscale's kthreads are not marked with PF_NO_SETAFFINITY, which means that a system administrator could change affinity at any time. However, refscale is a performance/stress test, and the system administrator might well have a valid test-the-test reason for changing affinity. This commit therefore changes to raw_smp_processor_id() in order to avoid the noise, and also adds a WARN_ON_ONCE() to the call to set_cpus_allowed_ptr() in order to directly detect immediate failure. There is no WARN_ON_ONCE() within the test loop, allowing human-reflex-based affinity resetting, if desired. Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 313d4547cbc7..d998a76fb542 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -487,13 +487,13 @@ ref_scale_reader(void *arg) s64 duration; VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids))); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); repeat: - VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id()); // Wait for signal that this reader can start. wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || @@ -503,7 +503,7 @@ repeat: goto end; // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(smp_processor_id() != me); + WARN_ON_ONCE(raw_smp_processor_id() != me); WRITE_ONCE(rt->start_reader, 0); if (!atomic_dec_return(&n_started)) -- cgit v1.2.3-71-gd317 From 1d10bf55d85d34eb73dd8263635f43fd72135d2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 May 2021 10:12:45 -0700 Subject: rcu-tasks: Don't delete holdouts within trc_inspect_reader() As Yanfei pointed out, although invoking trc_del_holdout() is safe from the viewpoint of the integrity of the holdout list itself, the put_task_struct() invoked by trc_del_holdout() can result in use-after-free errors due to later accesses to this task_struct structure by the RCU Tasks Trace grace-period kthread. This commit therefore removes this call to trc_del_holdout() from trc_inspect_reader() in favor of the grace-period thread's existing call to trc_del_holdout(), thus eliminating that particular class of use-after-free errors. Reported-by: "Xu, Yanfei" Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 03a118d1c003..3d5cb6cb8a6d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -953,10 +953,9 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) in_qs = likely(!t->trc_reader_nesting); } - // Mark as checked. Because this is called from the grace-period - // kthread, also remove the task from the holdout list. + // Mark as checked so that the grace-period kthread will + // remove it from the holdout list. t->trc_reader_checked = true; - trc_del_holdout(t); if (in_qs) return true; // Already in quiescent state, done!!! -- cgit v1.2.3-71-gd317 From a9ab9cce9367a2cc02a3c7eb57a004dc0b8f380d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 May 2021 11:28:40 -0700 Subject: rcu-tasks: Don't delete holdouts within trc_wait_for_one_reader() Invoking trc_del_holdout() from within trc_wait_for_one_reader() is only a performance optimization because the RCU Tasks Trace grace-period kthread will eventually do this within check_all_holdout_tasks_trace(). But it is not a particularly important performance optimization because it only applies to the grace-period kthread, of which there is but one. This commit therefore removes this invocation of trc_del_holdout() in favor of the one in check_all_holdout_tasks_trace() in the grace-period kthread. Reported-by: "Xu, Yanfei" Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 3d5cb6cb8a6d..8536c55df514 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -982,7 +982,6 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { t->trc_reader_checked = true; - trc_del_holdout(t); WARN_ON_ONCE(t->trc_reader_nesting); return; } -- cgit v1.2.3-71-gd317 From 2a2ed5618a0e8a890d948b88b368c0459f35136c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jul 2021 13:59:35 -0700 Subject: rcu: Fix pr_info() formats and values in show_rcu_gp_kthreads() This commit changes from "%lx" to "%x" and from "0x1ffffL" to "0x1ffff" to match the change in type between the old field ->state (unsigned long) and the new field ->__state (unsigned int). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 3f937b20814f..6c76988cc019 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -795,9 +795,9 @@ void show_rcu_gp_kthreads(void) jr = j - data_race(rcu_state.gp_req_activity); js = j - data_race(rcu_state.gp_start); jw = j - data_race(rcu_state.gp_wake_time); - pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n", + pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, t ? t->__state : 0x1ffffL, t ? t->rt_priority : 0xffU, + rcu_state.gp_state, t ? t->__state : 0x1ffff, t ? t->rt_priority : 0xffU, js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), (long)data_race(rcu_state.gp_seq), (long)data_race(rcu_get_root()->gp_seq_needed), -- cgit v1.2.3-71-gd317 From ab4e4d9f79b2c95ef268985d2a9625a03a73c49a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Jun 2021 17:35:17 +0200 Subject: locking/mutex: Use try_cmpxchg() For simpler and better code. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Reviewed-by: Yanfei Xu Link: https://lore.kernel.org/r/20210630154114.834438545@infradead.org --- kernel/locking/mutex.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cb6b112ce155..cab7163f9731 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -100,7 +100,7 @@ static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ - unsigned long old, flags = __owner_flags(owner); + unsigned long flags = __owner_flags(owner); unsigned long task = owner & ~MUTEX_FLAGS; if (task) { @@ -124,11 +124,8 @@ static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) */ flags &= ~MUTEX_FLAG_HANDOFF; - old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); - if (old == owner) + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, curr | flags)) return NULL; - - owner = old; } return __owner_task(owner); @@ -168,10 +165,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; - if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) - return true; - - return false; + return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } #endif @@ -216,7 +210,7 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) unsigned long owner = atomic_long_read(&lock->owner); for (;;) { - unsigned long old, new; + unsigned long new; #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); @@ -228,11 +222,8 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) if (task) new |= MUTEX_FLAG_PICKUP; - old = atomic_long_cmpxchg_release(&lock->owner, owner, new); - if (old == owner) + if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new)) break; - - owner = old; } } @@ -1229,8 +1220,6 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne */ owner = atomic_long_read(&lock->owner); for (;;) { - unsigned long old; - #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); @@ -1239,16 +1228,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) break; - old = atomic_long_cmpxchg_release(&lock->owner, owner, - __owner_flags(owner)); - if (old == owner) { + if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) { if (owner & MUTEX_FLAG_WAITERS) break; return; } - - owner = old; } spin_lock(&lock->wait_lock); -- cgit v1.2.3-71-gd317 From 048661a1f963e9517630f080687d48af79ed784c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Jun 2021 17:35:18 +0200 Subject: locking/mutex: Fix HANDOFF condition Yanfei reported that setting HANDOFF should not depend on recomputing @first, only on @first state. Which would then give: if (ww_ctx || !first) first = __mutex_waiter_is_first(lock, &waiter); if (first) __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); But because 'ww_ctx || !first' is basically 'always' and the test for first is relatively cheap, omit that first branch entirely. Reported-by: Yanfei Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Reviewed-by: Yanfei Xu Link: https://lore.kernel.org/r/20210630154114.896786297@infradead.org --- kernel/locking/mutex.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cab7163f9731..8c3d4993b1e0 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -909,7 +909,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct mutex_waiter waiter; - bool first = false; struct ww_mutex *ww; int ret; @@ -988,6 +987,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, set_current_state(state); for (;;) { + bool first; + /* * Once we hold wait_lock, we're serialized against * mutex_unlock() handing the lock off to us, do a trylock @@ -1016,15 +1017,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); - /* - * ww_mutex needs to always recheck its position since its waiter - * list is not FIFO ordered. - */ - if (ww_ctx || !first) { - first = __mutex_waiter_is_first(lock, &waiter); - if (first) - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); - } + first = __mutex_waiter_is_first(lock, &waiter); + if (first) + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); set_current_state(state); /* -- cgit v1.2.3-71-gd317 From ad90880dc9625682a58897cba2ecff657a2aa60b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Jun 2021 17:35:19 +0200 Subject: locking/mutex: Introduce __mutex_trylock_or_handoff() Yanfei reported that it is possible to loose HANDOFF when we race with mutex_unlock() and end up setting HANDOFF on an unlocked mutex. At that point anybody can steal it, losing HANDOFF in the process. If this happens often enough, we can in fact starve the top waiter. Solve this by folding the 'set HANDOFF' operation into the trylock operation, such that either we acquire the lock, or it gets HANDOFF set. This avoids having HANDOFF set on an unlocked mutex. Reported-by: Yanfei Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Reviewed-by: Yanfei Xu Link: https://lore.kernel.org/r/20210630154114.958507900@infradead.org --- kernel/locking/mutex.c | 60 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 8c3d4993b1e0..b81ec975f124 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -91,10 +91,7 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } -/* - * Trylock variant that returns the owning task on failure. - */ -static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) +static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff) { unsigned long owner, curr = (unsigned long)current; @@ -104,39 +101,48 @@ static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) unsigned long task = owner & ~MUTEX_FLAGS; if (task) { - if (likely(task != curr)) + if (flags & MUTEX_FLAG_PICKUP) { + if (task != curr) + break; + flags &= ~MUTEX_FLAG_PICKUP; + } else if (handoff) { + if (flags & MUTEX_FLAG_HANDOFF) + break; + flags |= MUTEX_FLAG_HANDOFF; + } else { break; - - if (likely(!(flags & MUTEX_FLAG_PICKUP))) - break; - - flags &= ~MUTEX_FLAG_PICKUP; + } } else { #ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP); + DEBUG_LOCKS_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP)); #endif + task = curr; } - /* - * We set the HANDOFF bit, we must make sure it doesn't live - * past the point where we acquire it. This would be possible - * if we (accidentally) set the bit on an unlocked mutex. - */ - flags &= ~MUTEX_FLAG_HANDOFF; - - if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, curr | flags)) - return NULL; + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) { + if (task == curr) + return NULL; + break; + } } return __owner_task(owner); } +/* + * Trylock or set HANDOFF + */ +static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff) +{ + return !__mutex_trylock_common(lock, handoff); +} + /* * Actual trylock that will work on any unlocked state. */ static inline bool __mutex_trylock(struct mutex *lock) { - return !__mutex_trylock_or_owner(lock); + return !__mutex_trylock_common(lock, false); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -479,6 +485,14 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * Trylock variant that returns the owning task on failure. + */ +static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) +{ + return __mutex_trylock_common(lock, false); +} + static inline bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter) @@ -1018,8 +1032,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, schedule_preempt_disabled(); first = __mutex_waiter_is_first(lock, &waiter); - if (first) - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); set_current_state(state); /* @@ -1027,7 +1039,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock(lock) || + if (__mutex_trylock_or_handoff(lock, first) || (first && mutex_optimistic_spin(lock, ww_ctx, &waiter))) break; -- cgit v1.2.3-71-gd317 From e6b4457b05f36bb9e371f29ab1dd2d97272a1540 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Jun 2021 17:35:20 +0200 Subject: locking/mutex: Add MUTEX_WARN_ON Cleanup some #ifdef'fery. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Reviewed-by: Yanfei Xu Link: https://lore.kernel.org/r/20210630154115.020298650@infradead.org --- kernel/locking/mutex.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b81ec975f124..633bf0dce3f8 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -32,8 +32,10 @@ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" +# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond) #else # include "mutex.h" +# define MUTEX_WARN_ON(cond) #endif void @@ -113,9 +115,7 @@ static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, boo break; } } else { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP)); -#endif + MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP)); task = curr; } @@ -218,10 +218,8 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) for (;;) { unsigned long new; -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); - DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(__owner_task(owner) != current); + MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); new = (owner & MUTEX_FLAG_WAITERS); new |= (unsigned long)task; @@ -754,9 +752,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) * into 'unlocked' state: */ if (lock->ctx) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); -#endif + MUTEX_WARN_ON(!lock->ctx->acquired); if (lock->ctx->acquired > 0) lock->ctx->acquired--; lock->ctx = NULL; @@ -931,9 +927,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, might_sleep(); -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(lock->magic != lock); -#endif + MUTEX_WARN_ON(lock->magic != lock); ww = container_of(lock, struct ww_mutex, base); if (ww_ctx) { @@ -1227,10 +1221,8 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne */ owner = atomic_long_read(&lock->owner); for (;;) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); - DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(__owner_task(owner) != current); + MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); if (owner & MUTEX_FLAG_HANDOFF) break; @@ -1396,9 +1388,7 @@ int __sched mutex_trylock(struct mutex *lock) { bool locked; -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(lock->magic != lock); -#endif + MUTEX_WARN_ON(lock->magic != lock); locked = __mutex_trylock(lock); if (locked) -- cgit v1.2.3-71-gd317 From 11941f8a85362f612df61f4aaab0e41b64d2111d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 2 Jul 2021 16:48:23 +0530 Subject: bpf: cpumap: Implement generic cpumap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements CPUMAP redirect support for generic XDP programs. The idea is to reuse the cpu map entry's queue that is used to push native xdp frames for redirecting skb to a different CPU. This will match native XDP behavior (in that RPS is invoked again for packet reinjected into networking stack). To be able to determine whether the incoming skb is from the driver or cpumap, we reuse skb->redirected bit that skips generic XDP processing when it is set. To always make use of this, CONFIG_NET_REDIRECT guard on it has been lifted and it is always available. >From the redirect side, we add the skb to ptr_ring with its lowest bit set to 1. This should be safe as skb is not 1-byte aligned. This allows kthread to discern between xdp_frames and sk_buff. On consumption of the ptr_ring item, the lowest bit is unset. In the end, the skb is simply added to the list that kthread is anyway going to maintain for xdp_frames converted to skb, and then received again by using netif_receive_skb_list. Bulking optimization for generic cpumap is left as an exercise for a future patch for now. Since cpumap entry progs are now supported, also remove check in generic_xdp_install for the cpumap. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Reviewed-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210702111825.491065-4-memxor@gmail.com --- include/linux/bpf.h | 9 +++- include/linux/skbuff.h | 10 +---- kernel/bpf/cpumap.c | 116 +++++++++++++++++++++++++++++++++++++++++-------- net/core/dev.c | 3 +- net/core/filter.c | 6 ++- 5 files changed, 114 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f309fc1509f2..095aaa104c56 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1513,7 +1513,8 @@ bool dev_map_can_have_prog(struct bpf_map *map); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); -bool cpu_map_prog_allowed(struct bpf_map *map); +int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, + struct sk_buff *skb); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1710,6 +1711,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, + struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + static inline bool cpu_map_prog_allowed(struct bpf_map *map) { return false; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b2db9cd9a73f..f19190820e63 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -863,8 +863,8 @@ struct sk_buff { __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; #endif -#ifdef CONFIG_NET_REDIRECT __u8 redirected:1; +#ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_TLS_DEVICE @@ -4664,17 +4664,13 @@ static inline __wsum lco_csum(struct sk_buff *skb) static inline bool skb_is_redirected(const struct sk_buff *skb) { -#ifdef CONFIG_NET_REDIRECT return skb->redirected; -#else - return false; -#endif } static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) { -#ifdef CONFIG_NET_REDIRECT skb->redirected = 1; +#ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) skb->tstamp = 0; @@ -4683,9 +4679,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) static inline void skb_reset_redirect(struct sk_buff *skb) { -#ifdef CONFIG_NET_REDIRECT skb->redirected = 0; -#endif } static inline bool skb_csum_is_sctp(struct sk_buff *skb) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 480e936c54d0..585b2b77ccc4 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -16,6 +16,7 @@ * netstack, and assigning dedicated CPUs for this stage. This * basically allows for 10G wirespeed pre-filtering via bpf. */ +#include #include #include #include @@ -168,6 +169,46 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, + struct list_head *listp, + struct xdp_cpumap_stats *stats) +{ + struct sk_buff *skb, *tmp; + struct xdp_buff xdp; + u32 act; + int err; + + list_for_each_entry_safe(skb, tmp, listp, list) { + act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); + switch (act) { + case XDP_PASS: + break; + case XDP_REDIRECT: + skb_list_del_init(skb); + err = xdp_do_generic_redirect(skb->dev, skb, &xdp, + rcpu->prog); + if (unlikely(err)) { + kfree_skb(skb); + stats->drop++; + } else { + stats->redirect++; + } + return; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(skb->dev, rcpu->prog, act); + fallthrough; + case XDP_DROP: + skb_list_del_init(skb); + kfree_skb(skb); + stats->drop++; + return; + } + } +} + static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, void **frames, int n, struct xdp_cpumap_stats *stats) @@ -176,11 +217,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - if (!rcpu->prog) - return n; - - rcu_read_lock_bh(); - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; @@ -227,17 +263,37 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } + xdp_clear_return_frame_no_direct(); + + return nframes; +} + +#define CPUMAP_BATCH 8 + +static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, + int xdp_n, struct xdp_cpumap_stats *stats, + struct list_head *list) +{ + int nframes; + + if (!rcpu->prog) + return xdp_n; + + rcu_read_lock_bh(); + + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); + if (stats->redirect) - xdp_do_flush_map(); + xdp_do_flush(); - xdp_clear_return_frame_no_direct(); + if (unlikely(!list_empty(list))) + cpu_map_bpf_prog_run_skb(rcpu, list, stats); rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ return nframes; } -#define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) { @@ -254,9 +310,9 @@ static int cpu_map_kthread_run(void *data) struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n, m, nframes, xdp_n; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - int i, n, m, nframes; LIST_HEAD(list); /* Release CPU reschedule checks */ @@ -280,9 +336,20 @@ static int cpu_map_kthread_run(void *data) */ n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - for (i = 0; i < n; i++) { + for (i = 0, xdp_n = 0; i < n; i++) { void *f = frames[i]; - struct page *page = virt_to_page(f); + struct page *page; + + if (unlikely(__ptr_test_bit(0, &f))) { + struct sk_buff *skb = f; + + __ptr_clear_bit(0, &skb); + list_add_tail(&skb->list, &list); + continue; + } + + frames[xdp_n++] = f; + page = virt_to_page(f); /* Bring struct page memory area to curr CPU. Read by * build_skb_around via page_is_pfmemalloc(), and when @@ -292,7 +359,7 @@ static int cpu_map_kthread_run(void *data) } /* Support running another XDP prog on this CPU */ - nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); if (nframes) { m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); if (unlikely(m == 0)) { @@ -330,12 +397,6 @@ static int cpu_map_kthread_run(void *data) return 0; } -bool cpu_map_prog_allowed(struct bpf_map *map) -{ - return map->map_type == BPF_MAP_TYPE_CPUMAP && - map->value_size != offsetofend(struct bpf_cpumap_val, qsize); -} - static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) { struct bpf_prog *prog; @@ -701,6 +762,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } +int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, + struct sk_buff *skb) +{ + int ret; + + __skb_pull(skb, skb->mac_len); + skb_set_redirected(skb, false); + __ptr_set_bit(0, &skb); + + ret = ptr_ring_produce(rcpu->queue, skb); + if (ret < 0) + goto trace; + + wake_up_process(rcpu->kthread); +trace: + trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu); + return ret; +} + void __cpu_map_flush(void) { struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); diff --git a/net/core/dev.c b/net/core/dev.c index 93e80c36cc97..4c51d1f81633 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5669,8 +5669,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) * have a bpf_prog installed on an entry */ for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i]) || - cpu_map_prog_allowed(new->aux->used_maps[i])) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) { mutex_unlock(&new->aux->used_maps_mutex); return -EINVAL; } diff --git a/net/core/filter.c b/net/core/filter.c index f2c15b2a057a..3b4986e96e9c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4040,8 +4040,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, goto err; consume_skb(skb); break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_generic_redirect(fwd, skb); + if (unlikely(err)) + goto err; + break; default: - /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } -- cgit v1.2.3-71-gd317 From 2ea5eabaf04a1829383aefe98ac38a2e5ae2d698 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 2 Jul 2021 16:48:24 +0530 Subject: bpf: devmap: Implement devmap prog execution for generic XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This lifts the restriction on running devmap BPF progs in generic redirect mode. To match native XDP behavior, it is invoked right before generic_xdp_tx is called, and only supports XDP_PASS/XDP_ABORTED/ XDP_DROP actions. We also return 0 even if devmap program drops the packet, as semantically redirect has already succeeded and the devmap prog is the last point before TX of the packet to device where it can deliver a verdict on the packet. This also means it must take care of freeing the skb, as xdp_do_generic_redirect callers only do that in case an error is returned. Since devmap entry prog is supported, remove the check in generic_xdp_install entirely. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210702111825.491065-5-memxor@gmail.com --- include/linux/bpf.h | 1 - kernel/bpf/devmap.c | 49 +++++++++++++++++++++++++++++++++++++++---------- net/core/dev.c | 18 ------------------ 3 files changed, 39 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 095aaa104c56..4afbff308ca3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1508,7 +1508,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); -bool dev_map_can_have_prog(struct bpf_map *map); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2546dafd6672..fa26eac5e4b6 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -322,16 +322,6 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -bool dev_map_can_have_prog(struct bpf_map *map) -{ - if ((map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && - map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) - return true; - - return false; -} - static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *dev) @@ -499,6 +489,37 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return 0; } +static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) +{ + struct xdp_txq_info txq = { .dev = dst->dev }; + struct xdp_buff xdp; + u32 act; + + if (!dst->xdp_prog) + return XDP_PASS; + + __skb_pull(skb, skb->mac_len); + xdp.txq = &txq; + + act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); + switch (act) { + case XDP_PASS: + __skb_push(skb, skb->mac_len); + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dst->dev, dst->xdp_prog, act); + fallthrough; + case XDP_DROP: + kfree_skb(skb); + break; + } + + return act; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -614,6 +635,14 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; + + /* Redirect has already succeeded semantically at this point, so we just + * return 0 even if packet is dropped. Helper below takes care of + * freeing skb. + */ + if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) + return 0; + skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); diff --git a/net/core/dev.c b/net/core/dev.c index 4c51d1f81633..71f7175cad9a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5660,24 +5660,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; - if (new) { - u32 i; - - mutex_lock(&new->aux->used_maps_mutex); - - /* generic XDP does not work with DEVMAPs that can - * have a bpf_prog installed on an entry - */ - for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i])) { - mutex_unlock(&new->aux->used_maps_mutex); - return -EINVAL; - } - } - - mutex_unlock(&new->aux->used_maps_mutex); - } - switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); -- cgit v1.2.3-71-gd317 From 11e4b63abbe23872b45f325a7c6c8b7f9ff42cad Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 2 Jul 2021 17:06:57 +0200 Subject: printk/console: Check consistent sequence number when handling race in console_unlock() The standard printk() tries to flush the message to the console immediately. It tries to take the console lock. If the lock is already taken then the current owner is responsible for flushing even the new message. There is a small race window between checking whether a new message is available and releasing the console lock. It is solved by re-checking the state after releasing the console lock. If the check is positive then console_unlock() tries to take the lock again and process the new message as well. The commit 996e966640ddea7b535c ("printk: remove logbuf_lock") causes that console_seq is not longer read atomically. As a result, the re-check might be done with an inconsistent 64-bit index. Solve it by using the last sequence number that has been checked under the console lock. In the worst case, it will take the lock again only to realized that the new message has already been proceed. But it was possible even before. The variable next_seq is marked as __maybe_unused to call down compiler warning when CONFIG_PRINTK is not defined. Fixes: commit 996e966640ddea7b535c ("printk: remove logbuf_lock") Reported-by: kernel test robot # unused next_seq warning Cc: stable@vger.kernel.org # 5.13 Signed-off-by: Petr Mladek Acked-by: Sergey Senozhatsky Reviewed-by: John Ogness Link: https://lore.kernel.org/r/20210702150657.26760-1-pmladek@suse.com --- kernel/printk/printk.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 142a58d124d9..6dad7da8f383 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2545,6 +2545,7 @@ void console_unlock(void) bool do_cond_resched, retry; struct printk_info info; struct printk_record r; + u64 __maybe_unused next_seq; if (console_suspended) { up_console_sem(); @@ -2654,8 +2655,10 @@ skip: cond_resched(); } - console_locked = 0; + /* Get consistent value of the next-to-be-used sequence number. */ + next_seq = console_seq; + console_locked = 0; up_console_sem(); /* @@ -2664,7 +2667,7 @@ skip: * there's a new owner and the console_unlock() from them will do the * flush, no worries. */ - retry = prb_read_valid(prb, console_seq, NULL); + retry = prb_read_valid(prb, next_seq, NULL); printk_safe_exit_irqrestore(flags); if (retry && console_trylock()) -- cgit v1.2.3-71-gd317 From f263a81451c12da5a342d90572e317e611846f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 7 Jul 2021 15:38:47 -0700 Subject: bpf: Track subprog poke descriptors correctly and fix use-after-free Subprograms are calling map_poke_track(), but on program release there is no hook to call map_poke_untrack(). However, on program release, the aux memory (and poke descriptor table) is freed even though we still have a reference to it in the element list of the map aux data. When we run map_poke_run(), we then end up accessing free'd memory, triggering KASAN in prog_array_map_poke_run(): [...] [ 402.824689] BUG: KASAN: use-after-free in prog_array_map_poke_run+0xc2/0x34e [ 402.824698] Read of size 4 at addr ffff8881905a7940 by task hubble-fgs/4337 [ 402.824705] CPU: 1 PID: 4337 Comm: hubble-fgs Tainted: G I 5.12.0+ #399 [ 402.824715] Call Trace: [ 402.824719] dump_stack+0x93/0xc2 [ 402.824727] print_address_description.constprop.0+0x1a/0x140 [ 402.824736] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824740] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824744] kasan_report.cold+0x7c/0xd8 [ 402.824752] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824757] prog_array_map_poke_run+0xc2/0x34e [ 402.824765] bpf_fd_array_map_update_elem+0x124/0x1a0 [...] The elements concerned are walked as follows: for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; [...] The access to size_poke_tab is a 4 byte read, verified by checking offsets in the KASAN dump: [ 402.825004] The buggy address belongs to the object at ffff8881905a7800 which belongs to the cache kmalloc-1k of size 1024 [ 402.825008] The buggy address is located 320 bytes inside of 1024-byte region [ffff8881905a7800, ffff8881905a7c00) The pahole output of bpf_prog_aux: struct bpf_prog_aux { [...] /* --- cacheline 5 boundary (320 bytes) --- */ u32 size_poke_tab; /* 320 4 */ [...] In general, subprograms do not necessarily manage their own data structures. For example, BTF func_info and linfo are just pointers to the main program structure. This allows reference counting and cleanup to be done on the latter which simplifies their management a bit. The aux->poke_tab struct, however, did not follow this logic. The initial proposed fix for this use-after-free bug further embedded poke data tracking into the subprogram with proper reference counting. However, Daniel and Alexei questioned why we were treating these objects special; I agree, its unnecessary. The fix here removes the per subprogram poke table allocation and map tracking and instead simply points the aux->poke_tab pointer at the main programs poke table. This way, map tracking is simplified to the main program and we do not need to manage them per subprogram. This also means, bpf_prog_free_deferred(), which unwinds the program reference counting and kfrees objects, needs to ensure that we don't try to double free the poke_tab when free'ing the subprog structures. This is easily solved by NULL'ing the poke_tab pointer. The second detail is to ensure that per subprogram JIT logic only does fixups on poke_tab[] entries it owns. To do this, we add a pointer in the poke structure to point at the subprogram value so JITs can easily check while walking the poke_tab structure if the current entry belongs to the current program. The aux pointer is stable and therefore suitable for such comparison. On the jit_subprogs() error path, we omit cleaning up the poke->aux field because these are only ever referenced from the JIT side, but on error we will never make it to the JIT, so its fine to leave them dangling. Removing these pointers would complicate the error path for no reason. However, we do need to untrack all poke descriptors from the main program as otherwise they could race with the freeing of JIT memory from the subprograms. Lastly, a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") had an off-by-one on the subprogram instruction index range check as it was testing 'insn_idx >= subprog_start && insn_idx <= subprog_end'. However, subprog_end is the next subprogram's start instruction. Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210707223848.14580-2-john.fastabend@gmail.com --- arch/x86/net/bpf_jit_comp.c | 3 +++ include/linux/bpf.h | 1 + kernel/bpf/core.c | 8 +++++- kernel/bpf/verifier.c | 60 ++++++++++++++++----------------------------- 4 files changed, 32 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e835164189f1..4b951458c9fc 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -570,6 +570,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) for (i = 0; i < prog->aux->size_poke_tab; i++) { poke = &prog->aux->poke_tab[i]; + if (poke->aux && poke->aux != prog->aux) + continue; + WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable)); if (poke->reason != BPF_POKE_REASON_TAIL_CALL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f309fc1509f2..e8e2b0393ca9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -780,6 +780,7 @@ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; + void *aux; union { struct { struct bpf_map *map; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 034ad93a1ad7..9b1577498373 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2236,8 +2236,14 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); - for (i = 0; i < aux->func_cnt; i++) + for (i = 0; i < aux->func_cnt; i++) { + /* We can just unlink the subprog poke descriptor table as + * it was originally linked to the main program and is also + * released along with it. + */ + aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); + } if (aux->func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index be38bb930bf1..42a4063de7cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12121,33 +12121,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) goto out_free; func[i]->is_func = 1; func[i]->aux->func_idx = i; - /* the btf and func_info will be freed only at prog->aux */ + /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + func[i]->aux->poke_tab = prog->aux->poke_tab; + func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; for (j = 0; j < prog->aux->size_poke_tab; j++) { - u32 insn_idx = prog->aux->poke_tab[j].insn_idx; - int ret; + struct bpf_jit_poke_descriptor *poke; - if (!(insn_idx >= subprog_start && - insn_idx <= subprog_end)) - continue; - - ret = bpf_jit_add_poke_descriptor(func[i], - &prog->aux->poke_tab[j]); - if (ret < 0) { - verbose(env, "adding tail call poke descriptor failed\n"); - goto out_free; - } - - func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; - - map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; - ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); - if (ret < 0) { - verbose(env, "tracking tail call prog failed\n"); - goto out_free; - } + poke = &prog->aux->poke_tab[j]; + if (poke->insn_idx < subprog_end && + poke->insn_idx >= subprog_start) + poke->aux = func[i]->aux; } /* Use bpf_prog_F_tag to indicate functions in stack traces. @@ -12178,18 +12164,6 @@ static int jit_subprogs(struct bpf_verifier_env *env) cond_resched(); } - /* Untrack main program's aux structs so that during map_poke_run() - * we will not stumble upon the unfilled poke descriptors; each - * of the main program's poke descs got distributed across subprogs - * and got tracked onto map, so we are sure that none of them will - * be missed after the operation below - */ - for (i = 0; i < prog->aux->size_poke_tab; i++) { - map_ptr = prog->aux->poke_tab[i].tail_call.map; - - map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); - } - /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -12267,14 +12241,22 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_jit_attempt_done(prog); return 0; out_free: + /* We failed JIT'ing, so at this point we need to unregister poke + * descriptors from subprogs, so that kernel is not attempting to + * patch it anymore as we're freeing the subprog JIT memory. + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* At this point we're guaranteed that poke descriptors are not + * live anymore. We can just unlink its descriptor table as it's + * released with the main prog. + */ for (i = 0; i < env->subprog_cnt; i++) { if (!func[i]) continue; - - for (j = 0; j < func[i]->aux->size_poke_tab; j++) { - map_ptr = func[i]->aux->poke_tab[j].tail_call.map; - map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); - } + func[i]->aux->poke_tab = NULL; bpf_jit_free(func[i]); } kfree(func); -- cgit v1.2.3-71-gd317 From 1adee589cd6da2ead7f1b5dd82419eac59a2e2b0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 12 Jul 2021 11:03:35 -0500 Subject: kernel: debug: Fix unreachable code in gdb_serial_stub() Fix the following warning: kernel/debug/gdbstub.c:1049:4: warning: fallthrough annotation in unreachable code [-Wimplicit-fallthrough] fallthrough; ^ include/linux/compiler_attributes.h:210:41: note: expanded from macro 'fallthrough' # define fallthrough __attribute__((__fallthrough__) by placing the fallthrough; statement inside ifdeffery. Reported-by: kernel test robot Signed-off-by: Gustavo A. R. Silva --- kernel/debug/gdbstub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 8372897402f4..b6f28fad4307 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1045,8 +1045,8 @@ int gdb_serial_stub(struct kgdb_state *ks) gdb_cmd_detachkill(ks); return DBG_PASS_EVENT; } -#endif fallthrough; +#endif case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) -- cgit v1.2.3-71-gd317 From b48c7236b13cb5ef1b5fdf744aa8841df0f7b43a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 29 Jun 2021 15:11:44 -0500 Subject: exit/bdflush: Remove the deprecated bdflush system call The bdflush system call has been deprecated for a very long time. Recently Michael Schmitz tested[1] and found that the last known caller of of the bdflush system call is unaffected by it's removal. Since the code is not needed delete it. [1] https://lkml.kernel.org/r/36123b5d-daa0-6c2b-f2d4-a942f069fd54@gmail.com Link: https://lkml.kernel.org/r/87sg10quue.fsf_-_@disp2133 Tested-by: Michael Schmitz Acked-by: Geert Uytterhoeven Reviewed-by: Arnd Bergmann Acked-by: Cyril Hrubis Signed-off-by: "Eric W. Biederman" --- arch/alpha/kernel/syscalls/syscall.tbl | 2 +- arch/arm/tools/syscall.tbl | 2 +- arch/arm64/include/asm/unistd32.h | 2 +- arch/ia64/kernel/syscalls/syscall.tbl | 2 +- arch/m68k/kernel/syscalls/syscall.tbl | 2 +- arch/microblaze/kernel/syscalls/syscall.tbl | 2 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- arch/parisc/kernel/syscalls/syscall.tbl | 2 +- arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- arch/s390/kernel/syscalls/syscall.tbl | 2 +- arch/sh/kernel/syscalls/syscall.tbl | 2 +- arch/sparc/kernel/syscalls/syscall.tbl | 2 +- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/xtensa/kernel/syscalls/syscall.tbl | 2 +- fs/buffer.c | 27 ---------------------- include/linux/syscalls.h | 1 - include/uapi/linux/capability.h | 1 - kernel/sys_ni.c | 1 - tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 2 +- 20 files changed, 16 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a17687ed4b51..7ac22e007d52 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -230,7 +230,7 @@ 259 common osf_swapctl sys_ni_syscall 260 common osf_memcntl sys_ni_syscall 261 common osf_fdatasync sys_ni_syscall -300 common bdflush sys_bdflush +300 common bdflush sys_ni_syscall 301 common sethae sys_sethae 302 common mount sys_mount 303 common old_adjtimex sys_old_adjtimex diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c5df1179fc5d..f8a2d5aa17b7 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -147,7 +147,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 common personality sys_personality # 137 was sys_afs_syscall diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 99ffcafc736c..03d4ca47d253 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -279,7 +279,7 @@ __SYSCALL(__NR_getpgid, sys_getpgid) #define __NR_fchdir 133 __SYSCALL(__NR_fchdir, sys_fchdir) #define __NR_bdflush 134 -__SYSCALL(__NR_bdflush, sys_bdflush) +__SYSCALL(__NR_bdflush, sys_ni_syscall) #define __NR_sysfs 135 __SYSCALL(__NR_sysfs, sys_sysfs) #define __NR_personality 136 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 6d07742c57b8..4b20224b14d9 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -123,7 +123,7 @@ # 1135 was get_kernel_syms # 1136 was query_module 113 common quotactl sys_quotactl -114 common bdflush sys_bdflush +114 common bdflush sys_ni_syscall 115 common sysfs sys_sysfs 116 common personality sys_personality 117 common afs_syscall sys_ni_syscall diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 541bc1b3a8f9..3ec1291c268d 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -141,7 +141,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 common personality sys_personality # 137 was afs_syscall diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a176faca2927..9be3ace12938 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -141,7 +141,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 common personality sys_personality 137 common afs_syscall sys_ni_syscall diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 253f2cd70b6b..fae35882a165 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -145,7 +145,7 @@ 131 o32 quotactl sys_quotactl 132 o32 getpgid sys_getpgid 133 o32 fchdir sys_fchdir -134 o32 bdflush sys_bdflush +134 o32 bdflush sys_ni_syscall 135 o32 sysfs sys_sysfs 136 o32 personality sys_personality sys_32_personality 137 o32 afs_syscall sys_ni_syscall diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index e26187b9ab87..eaf0603ae781 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -147,7 +147,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 32 personality parisc_personality 136 64 personality sys_personality diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index aef2a290e71a..6f3953f2a0d5 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -176,7 +176,7 @@ 131 nospu quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 32 personality sys_personality ppc64_personality 136 64 personality ppc64_personality diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 64d51ab5a8b4..aa705e1bd0dc 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -122,7 +122,7 @@ 131 common quotactl sys_quotactl sys_quotactl 132 common getpgid sys_getpgid sys_getpgid 133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_bdflush sys_bdflush +134 common bdflush sys_ni_syscall sys_ni_syscall 135 common sysfs sys_sysfs sys_sysfs 136 common personality sys_s390_personality sys_s390_personality 137 common afs_syscall - - diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index e0a70be77d84..7bbd6700ae4b 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -141,7 +141,7 @@ 131 common quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 common personality sys_personality # 137 was afs_syscall diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 603f5a821502..f520e9cd2c78 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -270,7 +270,7 @@ 222 common delete_module sys_delete_module 223 common get_kernel_syms sys_ni_syscall 224 common getpgid sys_getpgid -225 common bdflush sys_bdflush +225 common bdflush sys_ni_syscall 226 common sysfs sys_sysfs 227 common afs_syscall sys_nis_syscall 228 common setfsuid sys_setfsuid16 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ce763a12311c..a5beae6daf20 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -145,7 +145,7 @@ 131 i386 quotactl sys_quotactl 132 i386 getpgid sys_getpgid 133 i386 fchdir sys_fchdir -134 i386 bdflush sys_bdflush +134 i386 bdflush sys_ni_syscall 135 i386 sysfs sys_sysfs 136 i386 personality sys_personality 137 i386 afs_syscall diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 235d67d6ceb4..b3d1bc8a9095 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -223,7 +223,7 @@ # 205 was old nfsservctl 205 common nfsservctl sys_ni_syscall 206 common _sysctl sys_ni_syscall -207 common bdflush sys_bdflush +207 common bdflush sys_ni_syscall 208 common uname sys_newuname 209 common sysinfo sys_sysinfo 210 common init_module sys_init_module diff --git a/fs/buffer.c b/fs/buffer.c index 6290c3afdba4..32718ee13b08 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3267,33 +3267,6 @@ out: } EXPORT_SYMBOL(try_to_free_buffers); -/* - * There are no bdflush tunables left. But distributions are - * still running obsolete flush daemons, so we terminate them here. - * - * Use of bdflush() is deprecated and will be removed in a future kernel. - * The `flush-X' kernel threads fully replace bdflush daemons and this call. - */ -SYSCALL_DEFINE2(bdflush, int, func, long, data) -{ - static int msg_count; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the obsolete bdflush" - " system call\n", current->comm); - printk(KERN_INFO "Fix your initscripts?\n"); - } - - if (func == 1) - do_exit(0); - return 0; -} - /* * Buffer-head allocation */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..2b47584eb843 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1158,7 +1158,6 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf); asmlinkage long sys_vfork(void); asmlinkage long sys_recv(int, void __user *, size_t, unsigned); asmlinkage long sys_send(int, void __user *, size_t, unsigned); -asmlinkage long sys_bdflush(int func, long data); asmlinkage long sys_oldumount(char __user *name); asmlinkage long sys_uselib(const char __user *library); asmlinkage long sys_sysfs(int option, diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 2ddb4226cd23..463d1ba2232a 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -243,7 +243,6 @@ struct vfs_ns_cap_data { /* Allow examination and configuration of disk quotas */ /* Allow setting the domainname */ /* Allow setting the hostname */ -/* Allow calling bdflush() */ /* Allow mount() and umount(), setting up new smb connection */ /* Allow some autofs root ioctls */ /* Allow nfsservctl */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a9..cb6f98f5c97a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -416,7 +416,6 @@ COND_SYSCALL(epoll_wait); COND_SYSCALL(recv); COND_SYSCALL_COMPAT(recv); COND_SYSCALL(send); -COND_SYSCALL(bdflush); COND_SYSCALL(uselib); /* optional: time32 */ diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index aef2a290e71a..6f3953f2a0d5 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -176,7 +176,7 @@ 131 nospu quotactl sys_quotactl 132 common getpgid sys_getpgid 133 common fchdir sys_fchdir -134 common bdflush sys_bdflush +134 common bdflush sys_ni_syscall 135 common sysfs sys_sysfs 136 32 personality sys_personality ppc64_personality 136 64 personality ppc64_personality diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 64d51ab5a8b4..8d619ec86dcc 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -122,7 +122,7 @@ 131 common quotactl sys_quotactl sys_quotactl 132 common getpgid sys_getpgid sys_getpgid 133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_bdflush sys_bdflush +134 common bdflush - - 135 common sysfs sys_sysfs sys_sysfs 136 common personality sys_s390_personality sys_s390_personality 137 common afs_syscall - - -- cgit v1.2.3-71-gd317 From e9ba16e68cce2f85e9f5d2eba5c0453f1a741fd2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 11 Jul 2021 08:26:45 +0200 Subject: smpboot: Mark idle_init() as __always_inlined to work around aggressive compiler un-inlining While this function is a static inline, and is only used once in local scope, certain Kconfig variations may cause it to be compiled as a standalone function: 89231bf0 : 89231bf0: 83 05 60 d9 45 89 01 addl $0x1,0x8945d960 89231bf7: 55 push %ebp Resulting in this build failure: WARNING: modpost: vmlinux.o(.text.unlikely+0x7fd5): Section mismatch in reference from the function idle_init() to the function .init.text:fork_idle() The function idle_init() references the function __init fork_idle(). This is often because idle_init lacks a __init annotation or the annotation of fork_idle is wrong. ERROR: modpost: Section mismatches detected. Certain USBSAN options x86-32 builds with CONFIG_CC_OPTIMIZE_FOR_SIZE=y seem to be causing this. So mark idle_init() as __always_inline to work around this compiler bug/feature. Signed-off-by: Ingo Molnar --- kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e4163042c4d6..21b7953f8242 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -47,7 +47,7 @@ void __init idle_thread_set_boot_cpu(void) * * Creates the thread if it does not exist. */ -static inline void idle_init(unsigned int cpu) +static inline void __always_inline idle_init(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); -- cgit v1.2.3-71-gd317 From 5dd0a6b8582ffbfa88351949d50eccd5b6694ade Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 12 Jul 2021 22:57:35 +0200 Subject: bpf: Fix tail_call_reachable rejection for interpreter when jit failed During testing of f263a81451c1 ("bpf: Track subprog poke descriptors correctly and fix use-after-free") under various failure conditions, for example, when jit_subprogs() fails and tries to clean up the program to be run under the interpreter, we ran into the following freeze: [...] #127/8 tailcall_bpf2bpf_3:FAIL [...] [ 92.041251] BUG: KASAN: slab-out-of-bounds in ___bpf_prog_run+0x1b9d/0x2e20 [ 92.042408] Read of size 8 at addr ffff88800da67f68 by task test_progs/682 [ 92.043707] [ 92.044030] CPU: 1 PID: 682 Comm: test_progs Tainted: G O 5.13.0-53301-ge6c08cb33a30-dirty #87 [ 92.045542] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 92.046785] Call Trace: [ 92.047171] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.047773] ? __bpf_prog_run_args32+0x8b/0xb0 [ 92.048389] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.049019] ? ktime_get+0x117/0x130 [...] // few hundred [similar] lines more [ 92.659025] ? ktime_get+0x117/0x130 [ 92.659845] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.660738] ? __bpf_prog_run_args32+0x8b/0xb0 [ 92.661528] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.662378] ? print_usage_bug+0x50/0x50 [ 92.663221] ? print_usage_bug+0x50/0x50 [ 92.664077] ? bpf_ksym_find+0x9c/0xe0 [ 92.664887] ? ktime_get+0x117/0x130 [ 92.665624] ? kernel_text_address+0xf5/0x100 [ 92.666529] ? __kernel_text_address+0xe/0x30 [ 92.667725] ? unwind_get_return_address+0x2f/0x50 [ 92.668854] ? ___bpf_prog_run+0x15d4/0x2e20 [ 92.670185] ? ktime_get+0x117/0x130 [ 92.671130] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.672020] ? __bpf_prog_run_args32+0x8b/0xb0 [ 92.672860] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.675159] ? ktime_get+0x117/0x130 [ 92.677074] ? lock_is_held_type+0xd5/0x130 [ 92.678662] ? ___bpf_prog_run+0x15d4/0x2e20 [ 92.680046] ? ktime_get+0x117/0x130 [ 92.681285] ? __bpf_prog_run32+0x6b/0x90 [ 92.682601] ? __bpf_prog_run64+0x90/0x90 [ 92.683636] ? lock_downgrade+0x370/0x370 [ 92.684647] ? mark_held_locks+0x44/0x90 [ 92.685652] ? ktime_get+0x117/0x130 [ 92.686752] ? lockdep_hardirqs_on+0x79/0x100 [ 92.688004] ? ktime_get+0x117/0x130 [ 92.688573] ? __cant_migrate+0x2b/0x80 [ 92.689192] ? bpf_test_run+0x2f4/0x510 [ 92.689869] ? bpf_test_timer_continue+0x1c0/0x1c0 [ 92.690856] ? rcu_read_lock_bh_held+0x90/0x90 [ 92.691506] ? __kasan_slab_alloc+0x61/0x80 [ 92.692128] ? eth_type_trans+0x128/0x240 [ 92.692737] ? __build_skb+0x46/0x50 [ 92.693252] ? bpf_prog_test_run_skb+0x65e/0xc50 [ 92.693954] ? bpf_prog_test_run_raw_tp+0x2d0/0x2d0 [ 92.694639] ? __fget_light+0xa1/0x100 [ 92.695162] ? bpf_prog_inc+0x23/0x30 [ 92.695685] ? __sys_bpf+0xb40/0x2c80 [ 92.696324] ? bpf_link_get_from_fd+0x90/0x90 [ 92.697150] ? mark_held_locks+0x24/0x90 [ 92.698007] ? lockdep_hardirqs_on_prepare+0x124/0x220 [ 92.699045] ? finish_task_switch+0xe6/0x370 [ 92.700072] ? lockdep_hardirqs_on+0x79/0x100 [ 92.701233] ? finish_task_switch+0x11d/0x370 [ 92.702264] ? __switch_to+0x2c0/0x740 [ 92.703148] ? mark_held_locks+0x24/0x90 [ 92.704155] ? __x64_sys_bpf+0x45/0x50 [ 92.705146] ? do_syscall_64+0x35/0x80 [ 92.706953] ? entry_SYSCALL_64_after_hwframe+0x44/0xae [...] Turns out that the program rejection from e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT") is buggy since env->prog->aux->tail_call_reachable is never true. Commit ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") added a tracker into check_max_stack_depth() which propagates the tail_call_reachable condition throughout the subprograms. This info is then assigned to the subprogram's func[i]->aux->tail_call_reachable. However, in the case of the rejection check upon JIT failure, env->prog->aux->tail_call_reachable is used. func[0]->aux->tail_call_reachable which represents the main program's information did not propagate this to the outer env->prog->aux, though. Add this propagation into check_max_stack_depth() where it needs to belong so that the check can be done reliably. Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT") Co-developed-by: John Fastabend Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/618c34e3163ad1a36b1e82377576a6081e182f25.1626123173.git.daniel@iogearbox.net --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 42a4063de7cd..9de3c9c3267c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3677,6 +3677,8 @@ continue_func: if (tail_call_reachable) for (j = 0; j < frame; j++) subprog[ret_prog[j]].tail_call_reachable = true; + if (subprog[0].tail_call_reachable) + env->prog->aux->tail_call_reachable = true; /* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT -- cgit v1.2.3-71-gd317 From 0a65579cdd28bed3e84e1b4929c3080da4f06d79 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:32 +0800 Subject: swiotlb: Refactor swiotlb init functions Add a new function, swiotlb_init_io_tlb_mem, for the io_tlb_mem struct initialization to make the code reusable. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Acked-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index e50df8d8f87e..414db5fc8de9 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -168,9 +168,28 @@ void __init swiotlb_update_mem_attributes(void) memset(vaddr, 0, bytes); } -int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, + unsigned long nslabs, bool late_alloc) { + void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + + mem->nslabs = nslabs; + mem->start = start; + mem->end = mem->start + bytes; + mem->index = 0; + mem->late_alloc = late_alloc; + spin_lock_init(&mem->lock); + for (i = 0; i < mem->nslabs; i++) { + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; + } + memset(vaddr, 0, bytes); +} + +int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +{ struct io_tlb_mem *mem; size_t alloc_size; @@ -186,16 +205,8 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) if (!mem) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - mem->nslabs = nslabs; - mem->start = __pa(tlb); - mem->end = mem->start + bytes; - mem->index = 0; - spin_lock_init(&mem->lock); - for (i = 0; i < mem->nslabs; i++) { - mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); - mem->slots[i].orig_addr = INVALID_PHYS_ADDR; - mem->slots[i].alloc_size = 0; - } + + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); io_tlb_default_mem = mem; if (verbose) @@ -282,8 +293,8 @@ swiotlb_late_init_with_default_size(size_t default_size) int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - unsigned long bytes = nslabs << IO_TLB_SHIFT, i; struct io_tlb_mem *mem; + unsigned long bytes = nslabs << IO_TLB_SHIFT; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; @@ -297,20 +308,9 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!mem) return -ENOMEM; - mem->nslabs = nslabs; - mem->start = virt_to_phys(tlb); - mem->end = mem->start + bytes; - mem->index = 0; - mem->late_alloc = 1; - spin_lock_init(&mem->lock); - for (i = 0; i < mem->nslabs; i++) { - mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); - mem->slots[i].orig_addr = INVALID_PHYS_ADDR; - mem->slots[i].alloc_size = 0; - } - + memset(mem, 0, sizeof(*mem)); set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - memset(tlb, 0, bytes); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); io_tlb_default_mem = mem; swiotlb_print_info(); -- cgit v1.2.3-71-gd317 From 6e675a1c455ea7579c7eaf1a38fe64267039d6fe Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:33 +0800 Subject: swiotlb: Refactor swiotlb_create_debugfs Split the debugfs creation to make the code reusable for supporting different bounce buffer pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 414db5fc8de9..ae6a151d0a41 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -669,19 +669,26 @@ bool is_swiotlb_active(void) EXPORT_SYMBOL_GPL(is_swiotlb_active); #ifdef CONFIG_DEBUG_FS +static struct dentry *debugfs_dir; -static int __init swiotlb_create_debugfs(void) +static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) { - struct io_tlb_mem *mem = io_tlb_default_mem; - - if (!mem) - return 0; - mem->debugfs = debugfs_create_dir("swiotlb", NULL); debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); +} + +static int __init swiotlb_create_default_debugfs(void) +{ + struct io_tlb_mem *mem = io_tlb_default_mem; + + debugfs_dir = debugfs_create_dir("swiotlb", NULL); + if (mem) { + mem->debugfs = debugfs_dir; + swiotlb_create_debugfs_files(mem); + } return 0; } -late_initcall(swiotlb_create_debugfs); +late_initcall(swiotlb_create_default_debugfs); #endif -- cgit v1.2.3-71-gd317 From 69031f500865ee3eee19566a1b9c40a189817eaa Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:34 +0800 Subject: swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used Always have the pointer to the swiotlb pool used in struct device. This could help simplify the code for other pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/base/core.c | 4 ++++ include/linux/device.h | 4 ++++ kernel/dma/swiotlb.c | 8 ++++---- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/base/core.c b/drivers/base/core.c index cadcade65825..ea5b85354526 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include /* for dma_default_coherent */ @@ -2846,6 +2847,9 @@ void device_initialize(struct device *dev) defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) dev->dma_coherent = dma_default_coherent; #endif +#ifdef CONFIG_SWIOTLB + dev->dma_io_tlb_mem = io_tlb_default_mem; +#endif } EXPORT_SYMBOL_GPL(device_initialize); diff --git a/include/linux/device.h b/include/linux/device.h index 59940f1744c1..2a22875238a6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -423,6 +423,7 @@ struct dev_links_info { * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations + * @dma_io_tlb_mem: Pointer to the swiotlb pool used. Not for driver use. * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. @@ -531,6 +532,9 @@ struct device { #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ +#endif +#ifdef CONFIG_SWIOTLB + struct io_tlb_mem *dma_io_tlb_mem; #endif /* arch specific additions */ struct dev_archdata archdata; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ae6a151d0a41..33d413beddd4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -348,7 +348,7 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; @@ -429,7 +429,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) static int find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; @@ -506,7 +506,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int i; int index; @@ -557,7 +557,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem; unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; -- cgit v1.2.3-71-gd317 From 7fd856aa7f4261ddac62ea59d8383fef22a0690e Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:35 +0800 Subject: swiotlb: Update is_swiotlb_buffer to add a struct device argument Update is_swiotlb_buffer to add a struct device argument. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/iommu/dma-iommu.c | 12 ++++++------ drivers/xen/swiotlb-xen.c | 2 +- include/linux/swiotlb.h | 7 ++++--- kernel/dma/direct.c | 6 +++--- kernel/dma/direct.h | 6 +++--- 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 98ba927aee1a..4e34e8b26579 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -506,7 +506,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, __iommu_dma_unmap(dev, dma_addr, size); - if (unlikely(is_swiotlb_buffer(phys))) + if (unlikely(is_swiotlb_buffer(dev, phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } @@ -577,7 +577,7 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, } iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask); - if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys)) + if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys)) swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs); return iova; } @@ -783,7 +783,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(phys, size, dir); - if (is_swiotlb_buffer(phys)) + if (is_swiotlb_buffer(dev, phys)) swiotlb_sync_single_for_cpu(dev, phys, size, dir); } @@ -796,7 +796,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - if (is_swiotlb_buffer(phys)) + if (is_swiotlb_buffer(dev, phys)) swiotlb_sync_single_for_device(dev, phys, size, dir); if (!dev_is_dma_coherent(dev)) @@ -817,7 +817,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); - if (is_swiotlb_buffer(sg_phys(sg))) + if (is_swiotlb_buffer(dev, sg_phys(sg))) swiotlb_sync_single_for_cpu(dev, sg_phys(sg), sg->length, dir); } @@ -834,7 +834,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, return; for_each_sg(sgl, sg, nelems, i) { - if (is_swiotlb_buffer(sg_phys(sg))) + if (is_swiotlb_buffer(dev, sg_phys(sg))) swiotlb_sync_single_for_device(dev, sg_phys(sg), sg->length, dir); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 24d11861ac7d..0c4fb34f11ab 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -100,7 +100,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr) * in our domain. Therefore _only_ check address within our domain. */ if (pfn_valid(PFN_DOWN(paddr))) - return is_swiotlb_buffer(paddr); + return is_swiotlb_buffer(dev, paddr); return 0; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 216854a5e513..d1f3d95881cd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -2,6 +2,7 @@ #ifndef __LINUX_SWIOTLB_H #define __LINUX_SWIOTLB_H +#include #include #include #include @@ -101,9 +102,9 @@ struct io_tlb_mem { }; extern struct io_tlb_mem *io_tlb_default_mem; -static inline bool is_swiotlb_buffer(phys_addr_t paddr) +static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && paddr >= mem->start && paddr < mem->end; } @@ -115,7 +116,7 @@ bool is_swiotlb_active(void); void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE -static inline bool is_swiotlb_buffer(phys_addr_t paddr) +static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f737e3347059..84c9feb5474a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -343,7 +343,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, for_each_sg(sgl, sg, nents, i) { phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_device(dev, paddr, sg->length, dir); @@ -369,7 +369,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(paddr, sg->length, dir); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); @@ -504,7 +504,7 @@ size_t dma_direct_max_mapping_size(struct device *dev) bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) { return !dev_is_dma_coherent(dev) || - is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); + is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr)); } /** diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 50afc05b6f1d..13e9e7158d94 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -56,7 +56,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, { phys_addr_t paddr = dma_to_phys(dev, addr); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) @@ -73,7 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_cpu(dev, paddr, size, dir); if (dir == DMA_FROM_DEVICE) @@ -113,7 +113,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); - if (unlikely(is_swiotlb_buffer(phys))) + if (unlikely(is_swiotlb_buffer(dev, phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } #endif /* _KERNEL_DMA_DIRECT_H */ -- cgit v1.2.3-71-gd317 From 6f2beb268a5d35504a636c4a3b7aaa76ec32d96c Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:36 +0800 Subject: swiotlb: Update is_swiotlb_active to add a struct device argument Update is_swiotlb_active to add a struct device argument. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/nouveau/nouveau_ttm.c | 2 +- drivers/pci/xen-pcifront.c | 2 +- include/linux/swiotlb.h | 4 ++-- kernel/dma/direct.c | 2 +- kernel/dma/swiotlb.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index ce6b664b10aa..89a894354263 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj) max_order = MAX_ORDER; #ifdef CONFIG_SWIOTLB - if (is_swiotlb_active()) { + if (is_swiotlb_active(obj->base.dev->dev)) { unsigned int max_segment; max_segment = swiotlb_max_segment(); diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c index f4c2e46b6fe1..2ca9d9a9e5d5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_ttm.c +++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c @@ -276,7 +276,7 @@ nouveau_ttm_init(struct nouveau_drm *drm) } #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86) - need_swiotlb = is_swiotlb_active(); + need_swiotlb = is_swiotlb_active(dev->dev); #endif ret = ttm_device_init(&drm->ttm.bdev, &nouveau_bo_driver, drm->dev->dev, diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index b7a8f3a1921f..0d56985bfe81 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -693,7 +693,7 @@ static int pcifront_connect_and_init_dma(struct pcifront_device *pdev) spin_unlock(&pcifront_dev_lock); - if (!err && !is_swiotlb_active()) { + if (!err && !is_swiotlb_active(&pdev->xdev->dev)) { err = pci_xen_swiotlb_init_late(); if (err) dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n"); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d1f3d95881cd..dd1c30a83058 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -112,7 +112,7 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); -bool is_swiotlb_active(void); +bool is_swiotlb_active(struct device *dev); void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE @@ -132,7 +132,7 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev) return SIZE_MAX; } -static inline bool is_swiotlb_active(void) +static inline bool is_swiotlb_active(struct device *dev) { return false; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 84c9feb5474a..7a88c34d0867 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -495,7 +495,7 @@ int dma_direct_supported(struct device *dev, u64 mask) size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ - if (is_swiotlb_active() && + if (is_swiotlb_active(dev) && (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE)) return swiotlb_max_mapping_size(dev); return SIZE_MAX; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 33d413beddd4..d8677d6637dd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -662,9 +662,9 @@ size_t swiotlb_max_mapping_size(struct device *dev) return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } -bool is_swiotlb_active(void) +bool is_swiotlb_active(struct device *dev) { - return io_tlb_default_mem != NULL; + return dev->dma_io_tlb_mem != NULL; } EXPORT_SYMBOL_GPL(is_swiotlb_active); -- cgit v1.2.3-71-gd317 From 903cd0f315fe426c6a64c54ed389de0becb663dc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 24 Jun 2021 23:55:20 +0800 Subject: swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing Propagate the swiotlb_force into io_tlb_default_mem->force_bounce and use it to determine whether to bounce the data or not. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk [v2: Includes Will's fix] --- drivers/xen/swiotlb-xen.c | 2 +- include/linux/swiotlb.h | 13 +++++++++++++ kernel/dma/direct.c | 2 +- kernel/dma/direct.h | 2 +- kernel/dma/swiotlb.c | 4 ++++ 5 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 0c4fb34f11ab..785ec7e8be01 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -374,7 +374,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, if (dma_capable(dev, dev_addr, size, true) && !range_straddles_page_boundary(phys, size) && !xen_arch_need_swiotlb(dev, phys, dev_addr) && - swiotlb_force != SWIOTLB_FORCE) + !is_swiotlb_force_bounce(dev)) goto done; /* diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index dd1c30a83058..da348671b0d5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -84,6 +84,7 @@ extern enum swiotlb_force swiotlb_force; * unmap calls. * @debugfs: The dentry to debugfs. * @late_alloc: %true if allocated using the page allocator + * @force_bounce: %true if swiotlb bouncing is forced */ struct io_tlb_mem { phys_addr_t start; @@ -94,6 +95,7 @@ struct io_tlb_mem { spinlock_t lock; struct dentry *debugfs; bool late_alloc; + bool force_bounce; struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -109,6 +111,13 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) return mem && paddr >= mem->start && paddr < mem->end; } +static inline bool is_swiotlb_force_bounce(struct device *dev) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + + return mem && mem->force_bounce; +} + void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); @@ -120,6 +129,10 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; } +static inline bool is_swiotlb_force_bounce(struct device *dev) +{ + return false; +} static inline void swiotlb_exit(void) { } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 7a88c34d0867..a92465b4eb12 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -496,7 +496,7 @@ size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ if (is_swiotlb_active(dev) && - (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE)) + (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev))) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 13e9e7158d94..4632b0f4f72e 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -87,7 +87,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + if (is_swiotlb_force_bounce(dev)) return swiotlb_map(dev, phys, size, dir, attrs); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d8677d6637dd..04319dd22d28 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -179,6 +179,10 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->end = mem->start + bytes; mem->index = 0; mem->late_alloc = late_alloc; + + if (swiotlb_force == SWIOTLB_FORCE) + mem->force_bounce = true; + spin_lock_init(&mem->lock); for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); -- cgit v1.2.3-71-gd317 From 36f7b2f3ca5f0bee00abf9ea52748ce0644a21c6 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 24 Jun 2021 23:55:21 +0800 Subject: swiotlb: Move alloc_size to swiotlb_find_slots Rename find_slots to swiotlb_find_slots and move the maintenance of alloc_size to it for better code reusability later. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 04319dd22d28..0116a630dc13 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -430,8 +430,8 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int find_slots(struct device *dev, phys_addr_t orig_addr, - size_t alloc_size) +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); @@ -442,6 +442,7 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int index, wrap, count = 0, i; + unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned long flags; BUG_ON(!nslots); @@ -486,8 +487,11 @@ not_found: return -1; found: - for (i = index; i < index + nslots; i++) + for (i = index; i < index + nslots; i++) { mem->slots[i].list = 0; + mem->slots[i].alloc_size = + alloc_size - (offset + ((i - index) << IO_TLB_SHIFT)); + } for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) @@ -528,7 +532,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } - index = find_slots(dev, orig_addr, alloc_size + offset); + index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -542,11 +546,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size + offset); i++) { + for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); - mem->slots[index + i].alloc_size = - alloc_size - (i << IO_TLB_SHIFT); - } tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) -- cgit v1.2.3-71-gd317 From 70347877231eeb505a27abe80af7ae3d3a281519 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:39 +0800 Subject: swiotlb: Refactor swiotlb_tbl_unmap_single Add a new function, swiotlb_release_slots, to make the code reusable for supporting different bounce buffer pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0116a630dc13..23b6df3a6ab7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -555,27 +555,15 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return tlb_addr; } -/* - * tlb_addr is the physical address of the bounce buffer to unmap. - */ -void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t mapping_size, enum dma_data_direction dir, - unsigned long attrs) +static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { - struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long flags; - unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; int nslots = nr_slots(mem->slots[index].alloc_size + offset); int count, i; - /* - * First, sync the memory before unmapping the entry - */ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE); - /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. @@ -610,6 +598,23 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, spin_unlock_irqrestore(&mem->lock, flags); } +/* + * tlb_addr is the physical address of the bounce buffer to unmap. + */ +void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs) +{ + /* + * First, sync the memory before unmapping the entry + */ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE); + + swiotlb_release_slots(dev, tlb_addr); +} + void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { -- cgit v1.2.3-71-gd317 From f4111e39a52aa5d5136d890bbd1aa87c1c8fe3bc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:40 +0800 Subject: swiotlb: Add restricted DMA alloc/free support Add the functions, swiotlb_{alloc,free} and is_swiotlb_for_alloc to support the memory allocation from restricted DMA pool. The restricted DMA pool is preferred if available. Note that since coherent allocation needs remapping, one must set up another device coherent pool by shared-dma-pool and use dma_alloc_from_dev_coherent instead for atomic coherent allocation. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 26 ++++++++++++++++++++++++++ kernel/dma/direct.c | 49 +++++++++++++++++++++++++++++++++++++------------ kernel/dma/swiotlb.c | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index da348671b0d5..3b9454d1e498 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -85,6 +85,7 @@ extern enum swiotlb_force swiotlb_force; * @debugfs: The dentry to debugfs. * @late_alloc: %true if allocated using the page allocator * @force_bounce: %true if swiotlb bouncing is forced + * @for_alloc: %true if the pool is used for memory allocation */ struct io_tlb_mem { phys_addr_t start; @@ -96,6 +97,7 @@ struct io_tlb_mem { struct dentry *debugfs; bool late_alloc; bool force_bounce; + bool for_alloc; struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -158,4 +160,28 @@ static inline void swiotlb_adjust_size(unsigned long size) extern void swiotlb_print_info(void); extern void swiotlb_set_max_segment(unsigned int); +#ifdef CONFIG_DMA_RESTRICTED_POOL +struct page *swiotlb_alloc(struct device *dev, size_t size); +bool swiotlb_free(struct device *dev, struct page *page, size_t size); + +static inline bool is_swiotlb_for_alloc(struct device *dev) +{ + return dev->dma_io_tlb_mem->for_alloc; +} +#else +static inline struct page *swiotlb_alloc(struct device *dev, size_t size) +{ + return NULL; +} +static inline bool swiotlb_free(struct device *dev, struct page *page, + size_t size) +{ + return false; +} +static inline bool is_swiotlb_for_alloc(struct device *dev) +{ + return false; +} +#endif /* CONFIG_DMA_RESTRICTED_POOL */ + #endif /* __LINUX_SWIOTLB_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a92465b4eb12..2de33e5d302b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -75,6 +75,15 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +static void __dma_direct_free_pages(struct device *dev, struct page *page, + size_t size) +{ + if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && + swiotlb_free(dev, page, size)) + return; + dma_free_contiguous(dev, page, size); +} + static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp) { @@ -86,6 +95,16 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); + if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && + is_swiotlb_for_alloc(dev)) { + page = swiotlb_alloc(dev, size); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + __dma_direct_free_pages(dev, page, size); + return NULL; + } + return page; + } + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); @@ -142,7 +161,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) { + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); if (!page) return NULL; @@ -155,18 +174,23 @@ void *dma_direct_alloc(struct device *dev, size_t size, } if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) && + !is_swiotlb_for_alloc(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); /* * Remapping or decrypting memory may block. If either is required and * we can't block, allocate the memory from the atomic pools. + * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must + * set up another device coherent pool by shared-dma-pool and use + * dma_alloc_from_dev_coherent instead. */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && !gfpflags_allow_blocking(gfp) && (force_dma_unencrypted(dev) || - (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev)))) + (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev))) && + !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ @@ -237,7 +261,7 @@ out_encrypt_pages: return NULL; } out_free_pages: - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); return NULL; } @@ -247,15 +271,15 @@ void dma_direct_free(struct device *dev, size_t size, unsigned int page_order = get_order(size); if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) { + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ dma_free_contiguous(dev, cpu_addr, size); return; } if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) { + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) && + !is_swiotlb_for_alloc(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } @@ -273,7 +297,7 @@ void dma_direct_free(struct device *dev, size_t size, else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); - dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); + __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } struct page *dma_direct_alloc_pages(struct device *dev, size_t size, @@ -283,7 +307,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, void *ret; if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && + !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); @@ -310,7 +335,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; out_free_pages: - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); return NULL; } @@ -329,7 +354,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)vaddr, 1 << page_order); - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); } #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 23b6df3a6ab7..44fc3d10f017 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -462,8 +462,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); do { - if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != - (orig_addr & iotlb_align_mask)) { + if (orig_addr && + (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { index = wrap_index(mem, index + 1); continue; } @@ -702,3 +703,36 @@ static int __init swiotlb_create_default_debugfs(void) late_initcall(swiotlb_create_default_debugfs); #endif + +#ifdef CONFIG_DMA_RESTRICTED_POOL +struct page *swiotlb_alloc(struct device *dev, size_t size) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + phys_addr_t tlb_addr; + int index; + + if (!mem) + return NULL; + + index = swiotlb_find_slots(dev, 0, size); + if (index == -1) + return NULL; + + tlb_addr = slot_addr(mem->start, index); + + return pfn_to_page(PFN_DOWN(tlb_addr)); +} + +bool swiotlb_free(struct device *dev, struct page *page, size_t size) +{ + phys_addr_t tlb_addr = page_to_phys(page); + + if (!is_swiotlb_buffer(dev, tlb_addr)) + return false; + + swiotlb_release_slots(dev, tlb_addr); + + return true; +} + +#endif /* CONFIG_DMA_RESTRICTED_POOL */ -- cgit v1.2.3-71-gd317 From 0b84e4f8b793eb4045fd64f6f514165a7974cd16 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:41 +0800 Subject: swiotlb: Add restricted DMA pool initialization Add the initialization function to create restricted DMA pools from matching reserved-memory nodes. Regardless of swiotlb setting, the restricted DMA pool is preferred if available. The restricted DMA pools provide a basic level of protection against the DMA overwriting buffer contents at unexpected times. However, to protect against general data leakage and system memory corruption, the system needs to provide a way to lock down the memory access, e.g., MPU. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 3 +- kernel/dma/Kconfig | 14 +++++++++ kernel/dma/swiotlb.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 3b9454d1e498..39284ff2a6cd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -73,7 +73,8 @@ extern enum swiotlb_force swiotlb_force; * range check to see if the memory was in fact allocated by this * API. * @nslabs: The number of IO TLB blocks (in groups of 64) between @start and - * @end. This is command line adjustable via setup_io_tlb_npages. + * @end. For default swiotlb, this is command line adjustable via + * setup_io_tlb_npages. * @used: The number of used IO TLB block. * @list: The free list describing the number of free entries available * from each index. diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 77b405508743..3e961dc39634 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -80,6 +80,20 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +config DMA_RESTRICTED_POOL + bool "DMA Restricted Pool" + depends on OF && OF_RESERVED_MEM + select SWIOTLB + help + This enables support for restricted DMA pools which provide a level of + DMA memory protection on systems with limited hardware protection + capabilities, such as those lacking an IOMMU. + + For more information see + + and . + If unsure, say "n". + # # Should be selected if we can mmap non-coherent mappings to userspace. # The only thing that is really required is a way to set an uncached bit diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 44fc3d10f017..0ffbaae9fba2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -39,6 +39,13 @@ #ifdef CONFIG_DEBUG_FS #include #endif +#ifdef CONFIG_DMA_RESTRICTED_POOL +#include +#include +#include +#include +#include +#endif #include #include @@ -735,4 +742,73 @@ bool swiotlb_free(struct device *dev, struct page *page, size_t size) return true; } +static int rmem_swiotlb_device_init(struct reserved_mem *rmem, + struct device *dev) +{ + struct io_tlb_mem *mem = rmem->priv; + unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; + + /* + * Since multiple devices can share the same pool, the private data, + * io_tlb_mem struct, will be initialized by the first device attached + * to it. + */ + if (!mem) { + mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL); + if (!mem) + return -ENOMEM; + + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), + rmem->size >> PAGE_SHIFT); + swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); + mem->force_bounce = true; + mem->for_alloc = true; + + rmem->priv = mem; + + if (IS_ENABLED(CONFIG_DEBUG_FS)) { + mem->debugfs = + debugfs_create_dir(rmem->name, debugfs_dir); + swiotlb_create_debugfs_files(mem); + } + } + + dev->dma_io_tlb_mem = mem; + + return 0; +} + +static void rmem_swiotlb_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + dev->dma_io_tlb_mem = io_tlb_default_mem; +} + +static const struct reserved_mem_ops rmem_swiotlb_ops = { + .device_init = rmem_swiotlb_device_init, + .device_release = rmem_swiotlb_device_release, +}; + +static int __init rmem_swiotlb_setup(struct reserved_mem *rmem) +{ + unsigned long node = rmem->fdt_node; + + if (of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "linux,cma-default", NULL) || + of_get_flat_dt_prop(node, "linux,dma-default", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -EINVAL; + + if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { + pr_err("Restricted DMA pool must be accessible within the linear mapping."); + return -EINVAL; + } + + rmem->ops = &rmem_swiotlb_ops; + pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return 0; +} + +RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); #endif /* CONFIG_DMA_RESTRICTED_POOL */ -- cgit v1.2.3-71-gd317 From 09a4a79d42ced8ca7fc250b05e45ac1cb23a9c52 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 1 Jul 2021 11:31:30 +0800 Subject: swiotlb: fix implicit debugfs declarations Factor out the debugfs bits from rmem_swiotlb_device_init() into a separate rmem_swiotlb_debugfs_init() to fix the implicit debugfs declarations. Fixes: 461021875c50 ("swiotlb: Add restricted DMA pool initialization") Reported-by: kernel test robot Signed-off-by: Claire Chang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0ffbaae9fba2..b7f76bca89bf 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -712,6 +712,21 @@ late_initcall(swiotlb_create_default_debugfs); #endif #ifdef CONFIG_DMA_RESTRICTED_POOL + +#ifdef CONFIG_DEBUG_FS +static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) +{ + struct io_tlb_mem *mem = rmem->priv; + + mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir); + swiotlb_create_debugfs_files(mem); +} +#else +static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) +{ +} +#endif + struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; @@ -766,11 +781,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, rmem->priv = mem; - if (IS_ENABLED(CONFIG_DEBUG_FS)) { - mem->debugfs = - debugfs_create_dir(rmem->name, debugfs_dir); - swiotlb_create_debugfs_files(mem); - } + rmem_swiotlb_debugfs_init(rmem); } dev->dma_io_tlb_mem = mem; -- cgit v1.2.3-71-gd317 From 868c9ddc182bc6728bb380cbfb3170734f72c599 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Wed, 7 Jul 2021 14:12:54 +0900 Subject: swiotlb: add overflow checks to swiotlb_bounce This is a follow-up on 5f89468e2f06 ("swiotlb: manipulate orig_addr when tlb_addr has offset") which fixed unaligned dma mappings, making sure the following overflows are caught: - offset of the start of the slot within the device bigger than requested address' offset, in other words if the base address given in swiotlb_tbl_map_single to create the mapping (orig_addr) was after the requested address for the sync (tlb_offset) in the same block: |------------------------------------------| block <----------------------------> mapped part of the block ^ orig_addr ^ invalid tlb_addr for sync - if the resulting offset was bigger than the allocation size this one could happen if the mapping was not until the end. e.g. |------------------------------------------| block <---------------------> mapped part of the block ^ ^ orig_addr invalid tlb_addr Both should never happen so print a warning and bail out without trying to adjust the sizes/offsets: the first one could try to sync from orig_addr to whatever is left of the requested size, but the later really has nothing to sync there... Signed-off-by: Dominique Martinet Cc: Konrad Rzeszutek Wilk Reviewed-by: Bumyong Lee Cc: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b7f76bca89bf..f1a9ae7fad8f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -365,13 +365,27 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); - unsigned int tlb_offset; + unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) return; - tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - - swiotlb_align_offset(dev, orig_addr); + tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); + orig_addr_offset = swiotlb_align_offset(dev, orig_addr); + if (tlb_offset < orig_addr_offset) { + dev_WARN_ONCE(dev, 1, + "Access before mapping start detected. orig offset %u, requested offset %u.\n", + orig_addr_offset, tlb_offset); + return; + } + + tlb_offset -= orig_addr_offset; + if (tlb_offset > alloc_size) { + dev_WARN_ONCE(dev, 1, + "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n", + alloc_size, size, tlb_offset); + return; + } orig_addr += tlb_offset; alloc_size -= tlb_offset; -- cgit v1.2.3-71-gd317 From 3b0462726e7ef281c35a7a4ae33e93ee2bc9975b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 14 Jul 2021 15:47:49 +0200 Subject: cgroup: verify that source is a string The following sequence can be used to trigger a UAF: int fscontext_fd = fsopen("cgroup"); int fd_null = open("/dev/null, O_RDONLY); int fsconfig(fscontext_fd, FSCONFIG_SET_FD, "source", fd_null); close_range(3, ~0U, 0); The cgroup v1 specific fs parser expects a string for the "source" parameter. However, it is perfectly legitimate to e.g. specify a file descriptor for the "source" parameter. The fs parser doesn't know what a filesystem allows there. So it's a bug to assume that "source" is always of type fs_value_is_string when it can reasonably also be fs_value_is_file. This assumption in the cgroup code causes a UAF because struct fs_parameter uses a union for the actual value. Access to that union is guarded by the param->type member. Since the cgroup paramter parser didn't check param->type but unconditionally moved param->string into fc->source a close on the fscontext_fd would trigger a UAF during put_fs_context() which frees fc->source thereby freeing the file stashed in param->file causing a UAF during a close of the fd_null. Fix this by verifying that param->type is actually a string and report an error if not. In follow up patches I'll add a new generic helper that can be used here and by other filesystems instead of this error-prone copy-pasta fix. But fixing it in here first makes backporting a it to stable a lot easier. Fixes: 8d2451f4994f ("cgroup1: switch to option-by-option parsing") Reported-by: syzbot+283ce5a46486d6acdbaf@syzkaller.appspotmail.com Cc: Christoph Hellwig Cc: Alexander Viro Cc: Dmitry Vyukov Cc: Cc: syzkaller-bugs Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup-v1.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index ee93b6e89587..527917c0b30b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -912,6 +912,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "Non-string source"); if (fc->source) return invalf(fc, "Multiple sources not supported"); fc->source = param->string; -- cgit v1.2.3-71-gd317 From d1d488d813703618f0dd93f0e4c4a05928114aa8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 14 Jul 2021 15:47:50 +0200 Subject: fs: add vfs_parse_fs_param_source() helper Add a simple helper that filesystems can use in their parameter parser to parse the "source" parameter. A few places open-coded this function and that already caused a bug in the cgroup v1 parser that we fixed. Let's make it harder to get this wrong by introducing a helper which performs all necessary checks. Link: https://syzkaller.appspot.com/bug?id=6312526aba5beae046fdae8f00399f87aab48b12 Cc: Christoph Hellwig Cc: Alexander Viro Cc: Dmitry Vyukov Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- fs/fs_context.c | 54 ++++++++++++++++++++++++++++++---------------- include/linux/fs_context.h | 2 ++ kernel/cgroup/cgroup-v1.c | 14 +++++------- 3 files changed, 43 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/fs/fs_context.c b/fs/fs_context.c index 2834d1afa6e8..de1985eae535 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -79,6 +79,35 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) return -ENOPARAM; } +/** + * vfs_parse_fs_param_source - Handle setting "source" via parameter + * @fc: The filesystem context to modify + * @param: The parameter + * + * This is a simple helper for filesystems to verify that the "source" they + * accept is sane. + * + * Returns 0 on success, -ENOPARAM if this is not "source" parameter, and + * -EINVAL otherwise. In the event of failure, supplementary error information + * is logged. + */ +int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param) +{ + if (strcmp(param->key, "source") != 0) + return -ENOPARAM; + + if (param->type != fs_value_is_string) + return invalf(fc, "Non-string source"); + + if (fc->source) + return invalf(fc, "Multiple sources"); + + fc->source = param->string; + param->string = NULL; + return 0; +} +EXPORT_SYMBOL(vfs_parse_fs_param_source); + /** * vfs_parse_fs_param - Add a single parameter to a superblock config * @fc: The filesystem context to modify @@ -122,15 +151,9 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) /* If the filesystem doesn't take any arguments, give it the * default handling of source. */ - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Non-string source"); - if (fc->source) - return invalf(fc, "VFS: Multiple sources"); - fc->source = param->string; - param->string = NULL; - return 0; - } + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; return invalf(fc, "%s: Unknown parameter '%s'", fc->fs_type->name, param->key); @@ -504,16 +527,11 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) struct legacy_fs_context *ctx = fc->fs_private; unsigned int size = ctx->data_size; size_t len = 0; + int ret; - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string source"); - if (fc->source) - return invalf(fc, "VFS: Legacy: Multiple sources"); - fc->source = param->string; - param->string = NULL; - return 0; - } + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 37e1e8f7f08d..e2bc16300c82 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -139,6 +139,8 @@ extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, extern int generic_parse_monolithic(struct fs_context *fc, void *data); extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); +extern int vfs_parse_fs_param_source(struct fs_context *fc, + struct fs_parameter *param); /* * sget() wrappers to be called from the ->get_tree() op. diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 527917c0b30b..8d6bf56ed77a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -911,15 +911,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "Non-string source"); - if (fc->source) - return invalf(fc, "Multiple sources not supported"); - fc->source = param->string; - param->string = NULL; - return 0; - } + int ret; + + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; -- cgit v1.2.3-71-gd317 From 1a3402d93c73bf6bb4df6d7c2aac35abfc3c50e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 3 Jun 2021 01:15:59 +0200 Subject: posix-cpu-timers: Fix rearm racing against process tick Since the process wide cputime counter is started locklessly from posix_cpu_timer_rearm(), it can be concurrently stopped by operations on other timers from the same thread group, such as in the following unlucky scenario: CPU 0 CPU 1 ----- ----- timer_settime(TIMER B) posix_cpu_timer_rearm(TIMER A) cpu_clock_sample_group() (pct->timers_active already true) handle_posix_cpu_timers() check_process_timers() stop_process_timers() pct->timers_active = false arm_timer(TIMER A) tick -> run_posix_cpu_timers() // sees !pct->timers_active, ignore // our TIMER A Fix this with simply locking process wide cputime counting start and timer arm in the same block. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Fixes: 60f2ceaa8111 ("posix-cpu-timers: Remove unnecessary locking around cpu_clock_sample_group") Cc: stable@vger.kernel.org Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Eric W. Biederman --- kernel/time/posix-cpu-timers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 29a5e54e6e10..517be7fd175e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -991,6 +991,11 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) if (!p) goto out; + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) + goto out; + /* * Fetch the current sample and update the timer's expiry time. */ @@ -1001,11 +1006,6 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) bump_cpu_timer(timer, now); - /* Protect timer list r/w in arm_timer() */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) - goto out; - /* * Now re-arm for the new expiry time. */ -- cgit v1.2.3-71-gd317 From aebacb7f6ca1926918734faae14d1f0b6fae5cb7 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 9 Jul 2021 16:13:25 +0200 Subject: timers: Fix get_next_timer_interrupt() with no timers pending 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary") subtly altered get_next_timer_interrupt()'s behaviour. The function no longer consistently returns KTIME_MAX with no timers pending. In order to decide if there are any timers pending we check whether the next expiry will happen NEXT_TIMER_MAX_DELTA jiffies from now. Unfortunately, the next expiry time and the timer base clock are no longer updated in unison. The former changes upon certain timer operations (enqueue, expire, detach), whereas the latter keeps track of jiffies as they move forward. Ultimately breaking the logic above. A simplified example: - Upon entering get_next_timer_interrupt() with: jiffies = 1 base->clk = 0; base->next_expiry = NEXT_TIMER_MAX_DELTA; 'base->next_expiry == base->clk + NEXT_TIMER_MAX_DELTA', the function returns KTIME_MAX. - 'base->clk' is updated to the jiffies value. - The next time we enter get_next_timer_interrupt(), taking into account no timer operations happened: base->clk = 1; base->next_expiry = NEXT_TIMER_MAX_DELTA; 'base->next_expiry != base->clk + NEXT_TIMER_MAX_DELTA', the function returns a valid expire time, which is incorrect. This ultimately might unnecessarily rearm sched's timer on nohz_full setups, and add latency to the system[1]. So, introduce 'base->timers_pending'[2], update it every time 'base->next_expiry' changes, and use it in get_next_timer_interrupt(). [1] See tick_nohz_stop_tick(). [2] A quick pahole check on x86_64 and arm64 shows it doesn't make 'struct timer_base' any bigger. Fixes: 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary") Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Frederic Weisbecker --- kernel/time/timer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3fadb58fc9d7..9eb11c2209e5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -207,6 +207,7 @@ struct timer_base { unsigned int cpu; bool next_expiry_recalc; bool is_idle; + bool timers_pending; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -595,6 +596,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, * can reevaluate the wheel: */ base->next_expiry = bucket_expiry; + base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } @@ -1582,6 +1584,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) } base->next_expiry_recalc = false; + base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); return next; } @@ -1633,7 +1636,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; - bool is_max_delta; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1646,7 +1648,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (base->next_expiry_recalc) base->next_expiry = __next_timer_interrupt(base); nextevt = base->next_expiry; - is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); /* * We have a fresh next event. Check whether we can forward the @@ -1664,7 +1665,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) expires = basem; base->is_idle = false; } else { - if (!is_max_delta) + if (base->timers_pending) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * If we expect to sleep more than a tick, mark the base idle. @@ -1947,6 +1948,7 @@ int timers_prepare_cpu(unsigned int cpu) base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->timers_pending = false; base->is_idle = false; } return 0; -- cgit v1.2.3-71-gd317 From 75f0fc7b48ad45a2e5736bcf8de26c8872fe8695 Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Wed, 14 Jul 2021 10:18:15 +0000 Subject: bpf: Fix potential memleak and UAF in the verifier. In bpf_patch_insn_data(), we first use the bpf_patch_insn_single() to insert new instructions, then use adjust_insn_aux_data() to adjust insn_aux_data. If the old env->prog have no enough room for new inserted instructions, we use bpf_prog_realloc to construct new_prog and free the old env->prog. There have two errors here. First, if adjust_insn_aux_data() return ENOMEM, we should free the new_prog. Second, if adjust_insn_aux_data() return ENOMEM, bpf_patch_insn_data() will return NULL, and env->prog has been freed in bpf_prog_realloc, but we will use it in bpf_check(). So in this patch, we make the adjust_insn_aux_data() never fails. In bpf_patch_insn_data(), we first pre-malloc memory for the new insn_aux_data, then call bpf_patch_insn_single() to insert new instructions, at last call adjust_insn_aux_data() to adjust insn_aux_data. Fixes: 8041902dae52 ("bpf: adjust insn_aux_data when patching insns") Signed-off-by: He Fengqing Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210714101815.164322-1-hefengqing@huawei.com --- kernel/bpf/verifier.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index be38bb930bf1..3dbb3b40b754 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11425,10 +11425,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ -static int adjust_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_prog *new_prog, u32 off, u32 cnt) +static void adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *new_data, + struct bpf_prog *new_prog, u32 off, u32 cnt) { - struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + struct bpf_insn_aux_data *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; u32 old_seen = old_data[off].seen; u32 prog_len; @@ -11441,12 +11442,9 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); if (cnt == 1) - return 0; + return; prog_len = new_prog->len; - new_data = vzalloc(array_size(prog_len, - sizeof(struct bpf_insn_aux_data))); - if (!new_data) - return -ENOMEM; + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); @@ -11457,7 +11455,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, } env->insn_aux_data = new_data; vfree(old_data); - return 0; } static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) @@ -11492,6 +11489,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of const struct bpf_insn *patch, u32 len) { struct bpf_prog *new_prog; + struct bpf_insn_aux_data *new_data = NULL; + + if (len > 1) { + new_data = vzalloc(array_size(env->prog->len + len - 1, + sizeof(struct bpf_insn_aux_data))); + if (!new_data) + return NULL; + } new_prog = bpf_patch_insn_single(env->prog, off, patch, len); if (IS_ERR(new_prog)) { @@ -11499,10 +11504,10 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of verbose(env, "insn %d cannot be patched due to 16-bit range\n", env->insn_aux_data[off].orig_idx); + vfree(new_data); return NULL; } - if (adjust_insn_aux_data(env, new_prog, off, len)) - return NULL; + adjust_insn_aux_data(env, new_data, new_prog, off, len); adjust_subprog_starts(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; -- cgit v1.2.3-71-gd317 From d809e134be7a1fdd9f5b99ab3291c6da5c0b8240 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:07 -0700 Subject: bpf: Prepare bpf_prog_put() to be called from irq context. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently bpf_prog_put() is called from the task context only. With addition of bpf timers the timer related helpers will start calling bpf_prog_put() from irq-saved region and in rare cases might drop the refcnt to zero. To address this case, first, convert bpf_prog_free_id() to be irq-save (this is similar to bpf_map_free_id), and, second, defer non irq appropriate calls into work queue. For example: bpf_audit_prog() is calling kmalloc and wake_up_interruptible, bpf_prog_kallsyms_del_all()->bpf_ksym_del()->spin_unlock_bh(). They are not safe with irqs disabled. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-2-alexei.starovoitov@gmail.com --- kernel/bpf/syscall.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e343f158e556..5d1fee634be8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1699,6 +1699,8 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { + unsigned long flags; + /* cBPF to eBPF migrations are currently not in the idr store. * Offloaded programs are removed from the store when their device * disappears - even if someone grabs an fd to them they are unusable, @@ -1708,7 +1710,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) return; if (do_idr_lock) - spin_lock_bh(&prog_idr_lock); + spin_lock_irqsave(&prog_idr_lock, flags); else __acquire(&prog_idr_lock); @@ -1716,7 +1718,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) prog->aux->id = 0; if (do_idr_lock) - spin_unlock_bh(&prog_idr_lock); + spin_unlock_irqrestore(&prog_idr_lock, flags); else __release(&prog_idr_lock); } @@ -1752,14 +1754,32 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) } } +static void bpf_prog_put_deferred(struct work_struct *work) +{ + struct bpf_prog_aux *aux; + struct bpf_prog *prog; + + aux = container_of(work, struct bpf_prog_aux, work); + prog = aux->prog; + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); + __bpf_prog_put_noref(prog, true); +} + static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { - if (atomic64_dec_and_test(&prog->aux->refcnt)) { - perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); - bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); + struct bpf_prog_aux *aux = prog->aux; + + if (atomic64_dec_and_test(&aux->refcnt)) { /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - __bpf_prog_put_noref(prog, true); + + if (in_irq() || irqs_disabled()) { + INIT_WORK(&aux->work, bpf_prog_put_deferred); + schedule_work(&aux->work); + } else { + bpf_prog_put_deferred(&aux->work); + } } } -- cgit v1.2.3-71-gd317 From c1b3fed319d32a721d4b9c17afaeb430444ff773 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:08 -0700 Subject: bpf: Factor out bpf_spin_lock into helpers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move ____bpf_spin_lock/unlock into helpers to make it more clear that quadruple underscore bpf_spin_lock/unlock are irqsave/restore variants. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-3-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 62cf00383910..38be3cfc2f58 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -289,13 +289,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) static DEFINE_PER_CPU(unsigned long, irqsave_flags); -notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) { unsigned long flags; local_irq_save(flags); __bpf_spin_lock(lock); __this_cpu_write(irqsave_flags, flags); +} + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ + __bpf_spin_lock_irqsave(lock); return 0; } @@ -306,13 +311,18 @@ const struct bpf_func_proto bpf_spin_lock_proto = { .arg1_type = ARG_PTR_TO_SPIN_LOCK, }; -notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) { unsigned long flags; flags = __this_cpu_read(irqsave_flags); __bpf_spin_unlock(lock); local_irq_restore(flags); +} + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ + __bpf_spin_unlock_irqrestore(lock); return 0; } @@ -333,9 +343,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, else lock = dst + map->spin_lock_off; preempt_disable(); - ____bpf_spin_lock(lock); + __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); - ____bpf_spin_unlock(lock); + __bpf_spin_unlock_irqrestore(lock); preempt_enable(); } -- cgit v1.2.3-71-gd317 From b00628b1c7d595ae5b544e059c27b1f5828314b4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:09 -0700 Subject: bpf: Introduce bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded in hash/array/lru maps as a regular field and helpers to operate on it: // Initialize the timer. // First 4 bits of 'flags' specify clockid. // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, int flags); // Configure the timer to call 'callback_fn' static function. long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); // Arm the timer to expire 'nsec' nanoseconds from the current time. long bpf_timer_start(struct bpf_timer *timer, u64 nsec, u64 flags); // Cancel the timer and wait for callback_fn to finish if it was running. long bpf_timer_cancel(struct bpf_timer *timer); Here is how BPF program might look like: struct map_elem { int counter; struct bpf_timer timer; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1000); __type(key, int); __type(value, struct map_elem); } hmap SEC(".maps"); static int timer_cb(void *map, int *key, struct map_elem *val); /* val points to particular map element that contains bpf_timer. */ SEC("fentry/bpf_fentry_test1") int BPF_PROG(test1, int a) { struct map_elem *val; int key = 0; val = bpf_map_lookup_elem(&hmap, &key); if (val) { bpf_timer_init(&val->timer, &hmap, CLOCK_REALTIME); bpf_timer_set_callback(&val->timer, timer_cb); bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */, 0); } } This patch adds helper implementations that rely on hrtimers to call bpf functions as timers expire. The following patches add necessary safety checks. Only programs with CAP_BPF are allowed to use bpf_timer. The amount of timers used by the program is constrained by the memcg recorded at map creation time. The bpf_timer_init() helper needs explicit 'map' argument because inner maps are dynamic and not known at load time. While the bpf_timer_set_callback() is receiving hidden 'aux->prog' argument supplied by the verifier. The prog pointer is needed to do refcnting of bpf program to make sure that program doesn't get freed while the timer is armed. This approach relies on "user refcnt" scheme used in prog_array that stores bpf programs for bpf_tail_call. The bpf_timer_set_callback() will increment the prog refcnt which is paired with bpf_timer_cancel() that will drop the prog refcnt. The ops->map_release_uref is responsible for cancelling the timers and dropping prog refcnt when user space reference to a map reaches zero. This uref approach is done to make sure that Ctrl-C of user space process will not leave timers running forever unless the user space explicitly pinned a map that contained timers in bpffs. bpf_timer_init() and bpf_timer_set_callback() will return -EPERM if map doesn't have user references (is not held by open file descriptor from user space and not pinned in bpffs). The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel and free the timer if given map element had it allocated. "bpftool map update" command can be used to cancel timers. The 'struct bpf_timer' is explicitly __attribute__((aligned(8))) because '__u64 :64' has 1 byte alignment of 8 byte padding. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 + include/uapi/linux/bpf.h | 73 ++++++++++ kernel/bpf/helpers.c | 324 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 109 ++++++++++++++ kernel/trace/bpf_trace.c | 2 +- scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 73 ++++++++++ 7 files changed, 585 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4afbff308ca3..125240b7cefb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -168,6 +168,7 @@ struct bpf_map { u32 max_entries; u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ + int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; @@ -221,6 +222,7 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); +void bpf_timer_cancel_and_free(void *timer); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -314,6 +316,7 @@ enum bpf_arg_type { ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ + ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bafb6282032b..3544ec5234f0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4777,6 +4777,70 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4948,6 +5012,10 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6074,6 +6142,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 38be3cfc2f58..74b16593983d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -999,6 +999,322 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +/* BPF map elements can contain 'struct bpf_timer'. + * Such map owns all of its BPF timers. + * 'struct bpf_timer' is allocated as part of map element allocation + * and it's zero initialized. + * That space is used to keep 'struct bpf_timer_kern'. + * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and + * remembers 'struct bpf_map *' pointer it's part of. + * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. + * bpf_timer_start() arms the timer. + * If user space reference to a map goes to zero at this point + * ops->map_release_uref callback is responsible for cancelling the timers, + * freeing their memory, and decrementing prog's refcnts. + * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. + * Inner maps can contain bpf timers as well. ops->map_release_uref is + * freeing the timers when inner map is replaced or deleted by user space. + */ +struct bpf_hrtimer { + struct hrtimer timer; + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; +}; + +/* the actual struct hidden inside uapi struct bpf_timer */ +struct bpf_timer_kern { + struct bpf_hrtimer *timer; + /* bpf_spin_lock is used here instead of spinlock_t to make + * sure that it always fits into space resereved by struct bpf_timer + * regardless of LOCKDEP and spinlock debug flags. + */ + struct bpf_spin_lock lock; +} __attribute__((aligned(8))); + +static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); + +static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) +{ + struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); + struct bpf_map *map = t->map; + void *value = t->value; + void *callback_fn; + void *key; + u32 idx; + int ret; + + callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + if (!callback_fn) + goto out; + + /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and + * cannot be preempted by another bpf_timer_cb() on the same cpu. + * Remember the timer this callback is servicing to prevent + * deadlock if callback_fn() calls bpf_timer_cancel() or + * bpf_map_delete_elem() on the same timer. + */ + this_cpu_write(hrtimer_running, t); + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, + (u64)(long)value, 0, 0); + WARN_ON(ret != 0); /* Next patch moves this check into the verifier */ + + this_cpu_write(hrtimer_running, NULL); +out: + return HRTIMER_NORESTART; +} + +BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_hrtimer *t; + int ret = 0; + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); + + if (in_nmi()) + return -EOPNOTSUPP; + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (t) { + ret = -EBUSY; + goto out; + } + if (!atomic64_read(&map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. + */ + ret = -EPERM; + goto out; + } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ + t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); + if (!t) { + ret = -ENOMEM; + goto out; + } + t->value = (void *)timer - map->timer_off; + t->map = map; + t->prog = NULL; + rcu_assign_pointer(t->callback_fn, NULL); + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + timer->timer = t; +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_init_proto = { + .func = bpf_timer_init, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + struct bpf_prog *prev, *prog = aux->prog; + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t) { + ret = -EINVAL; + goto out; + } + if (!atomic64_read(&t->map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. Otherwise timer might still be + * running even when bpf prog is detached and user space + * is gone, since map_release_uref won't ever be called. + */ + ret = -EPERM; + goto out; + } + prev = t->prog; + if (prev != prog) { + /* Bump prog refcnt once. Every bpf_timer_set_callback() + * can pick different callback_fn-s within the same prog. + */ + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto out; + } + if (prev) + /* Drop prev prog refcnt when swapping with new prog */ + bpf_prog_put(prev); + t->prog = prog; + } + rcu_assign_pointer(t->callback_fn, callback_fn); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_set_callback_proto = { + .func = bpf_timer_set_callback, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_PTR_TO_FUNC, +}; + +BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +{ + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t || !t->prog) { + ret = -EINVAL; + goto out; + } + hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_start_proto = { + .func = bpf_timer_start, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static void drop_prog_refcnt(struct bpf_hrtimer *t) +{ + struct bpf_prog *prog = t->prog; + + if (prog) { + bpf_prog_put(prog); + t->prog = NULL; + rcu_assign_pointer(t->callback_fn, NULL); + } +} + +BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +{ + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t) { + ret = -EINVAL; + goto out; + } + if (this_cpu_read(hrtimer_running) == t) { + /* If bpf callback_fn is trying to bpf_timer_cancel() + * its own timer the hrtimer_cancel() will deadlock + * since it waits for callback_fn to finish + */ + ret = -EDEADLK; + goto out; + } + drop_prog_refcnt(t); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + /* Cancel the timer and wait for associated callback to finish + * if it was running. + */ + ret = ret ?: hrtimer_cancel(&t->timer); + return ret; +} + +static const struct bpf_func_proto bpf_timer_cancel_proto = { + .func = bpf_timer_cancel, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, +}; + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_timer_kern *timer = val; + struct bpf_hrtimer *t; + + /* Performance optimization: read timer->timer without lock first. */ + if (!READ_ONCE(timer->timer)) + return; + + __bpf_spin_lock_irqsave(&timer->lock); + /* re-read it under lock */ + t = timer->timer; + if (!t) + goto out; + drop_prog_refcnt(t); + /* The subsequent bpf_timer_start/cancel() helpers won't be able to use + * this timer, since it won't be initialized. + */ + timer->timer = NULL; +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + if (!t) + return; + /* Cancel the timer and wait for callback to complete if it was running. + * If hrtimer_cancel() can be safely called it's safe to call kfree(t) + * right after for both preallocated and non-preallocated maps. + * The timer->timer = NULL was already done and no code path can + * see address 't' anymore. + * + * Check that bpf_map_delete/update_elem() wasn't called from timer + * callback_fn. In such case don't call hrtimer_cancel() (since it will + * deadlock) and don't call hrtimer_try_to_cancel() (since it will just + * return -1). Though callback_fn is still running on this cpu it's + * safe to do kfree(t) because bpf_timer_cb() read everything it needed + * from 't'. The bpf subprog callback_fn won't be able to access 't', + * since timer->timer = NULL was already done. The timer will be + * effectively cancelled because bpf_timer_cb() will return + * HRTIMER_NORESTART. + */ + if (this_cpu_read(hrtimer_running) != t) + hrtimer_cancel(&t->timer); + kfree(t); +} + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -1065,6 +1381,14 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_timer_init: + return &bpf_timer_init_proto; + case BPF_FUNC_timer_set_callback: + return &bpf_timer_set_callback_proto; + case BPF_FUNC_timer_start: + return &bpf_timer_start_proto; + case BPF_FUNC_timer_cancel: + return &bpf_timer_cancel_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3dbb3b40b754..e8645c819803 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4656,6 +4656,38 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return 0; } +static int process_timer_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", + map->name); + return -EINVAL; + } + if (val) { + /* This restriction will be removed in the next patch */ + verbose(env, "bpf_timer field can only be first in the map value element\n"); + return -EINVAL; + } + if (meta->map_ptr) { + verbose(env, "verifier bug. Two map pointers in a timer helper\n"); + return -EFAULT; + } + meta->map_ptr = map; + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -4788,6 +4820,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4819,6 +4852,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, + [ARG_PTR_TO_TIMER] = &timer_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4948,6 +4982,10 @@ skip_type_check: if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + if (meta->map_ptr && meta->map_ptr != reg->map_ptr) { + verbose(env, "Map pointer doesn't match bpf_timer.\n"); + return -EINVAL; + } meta->map_ptr = reg->map_ptr; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: @@ -5000,6 +5038,9 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_TIMER) { + if (process_timer_func(env, regno, meta)) + return -EACCES; } else if (arg_type == ARG_PTR_TO_FUNC) { meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { @@ -5742,6 +5783,34 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_timer_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -6069,6 +6138,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_timer_set_callback) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_timer_callback_state); + if (err < 0) + return -EINVAL; + } + if (func_id == BPF_FUNC_snprintf) { err = check_bpf_snprintf_call(env, regs); if (err < 0) @@ -12591,6 +12667,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env) continue; } + if (insn->imm == BPF_FUNC_timer_set_callback) { + /* The verifier will process callback_fn as many times as necessary + * with different maps and the register states prepared by + * set_timer_callback_state will be accurate. + * + * The following use case is valid: + * map1 is shared by prog1, prog2, prog3. + * prog1 calls bpf_timer_init for some map1 elements + * prog2 calls bpf_timer_set_callback for some map1 elements. + * Those that were not bpf_timer_init-ed will return -EINVAL. + * prog3 calls bpf_timer_start for some map1 elements. + * Those that were not both bpf_timer_init-ed and + * bpf_timer_set_callback-ed will return -EINVAL. + */ + struct bpf_insn ld_addrs[2] = { + BPF_LD_IMM64(BPF_REG_3, (long)prog->aux), + }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 64bd2d84367f..6c77d25137e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1059,7 +1059,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_snprintf: return &bpf_snprintf_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 2d94025b38e9..00ac7b79cddb 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -547,6 +547,7 @@ class PrinterHelpers(Printer): 'struct inode', 'struct socket', 'struct file', + 'struct bpf_timer', ] known_types = { '...', @@ -594,6 +595,7 @@ class PrinterHelpers(Printer): 'struct inode', 'struct socket', 'struct file', + 'struct bpf_timer', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bafb6282032b..3544ec5234f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4777,6 +4777,70 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4948,6 +5012,10 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6074,6 +6142,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3-71-gd317 From 68134668c17f31f51930478f75495b552a411550 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:10 -0700 Subject: bpf: Add map side support for bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restrict bpf timers to array, hash (both preallocated and kmalloced), and lru map types. The per-cpu maps with timers don't make sense, since 'struct bpf_timer' is a part of map value. bpf timers in per-cpu maps would mean that the number of timers depends on number of possible cpus and timers would not be accessible from all cpus. lpm map support can be added in the future. The timers in inner maps are supported. The bpf_map_update/delete_elem() helpers and sys_bpf commands cancel and free bpf_timer in a given map element. Similar to 'struct bpf_spin_lock' BTF is required and it is used to validate that map element indeed contains 'struct bpf_timer'. Make check_and_init_map_value() init both bpf_spin_lock and bpf_timer when map element data is reused in preallocated htab and lru maps. Teach copy_map_value() to support both bpf_spin_lock and bpf_timer in a single map element. There could be one of each, but not more than one. Due to 'one bpf_timer in one element' restriction do not support timers in global data, since global data is a map of single element, but from bpf program side it's seen as many global variables and restriction of single global timer would be odd. The sys_bpf map_freeze and sys_mmap syscalls are not allowed on maps with timers, since user space could have corrupted mmap element and crashed the kernel. The maps with timers cannot be readonly. Due to these restrictions search for bpf_timer in datasec BTF in case it was placed in the global data to report clear error. The previous patch allowed 'struct bpf_timer' as a first field in a map element only. Relax this restriction. Refactor lru map to s/bpf_lru_push_free/htab_lru_push_free/ to cancel and free the timer when lru map deletes an element as a part of it eviction algorithm. Make sure that bpf program cannot access 'struct bpf_timer' via direct load/store. The timer operation are done through helpers only. This is similar to 'struct bpf_spin_lock'. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-5-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 44 ++++++++++++++----- include/linux/btf.h | 1 + kernel/bpf/arraymap.c | 21 +++++++++ kernel/bpf/btf.c | 77 +++++++++++++++++++++++++++------ kernel/bpf/hashtab.c | 105 +++++++++++++++++++++++++++++++++++++++------ kernel/bpf/local_storage.c | 4 +- kernel/bpf/map_in_map.c | 2 + kernel/bpf/syscall.c | 21 +++++++-- kernel/bpf/verifier.c | 30 +++++++++++-- 9 files changed, 259 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 125240b7cefb..a9a4a480a6d0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -198,24 +198,46 @@ static inline bool map_value_has_spin_lock(const struct bpf_map *map) return map->spin_lock_off >= 0; } -static inline void check_and_init_map_lock(struct bpf_map *map, void *dst) +static inline bool map_value_has_timer(const struct bpf_map *map) { - if (likely(!map_value_has_spin_lock(map))) - return; - *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = - (struct bpf_spin_lock){}; + return map->timer_off >= 0; } -/* copy everything but bpf_spin_lock */ +static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +{ + if (unlikely(map_value_has_spin_lock(map))) + *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = + (struct bpf_spin_lock){}; + if (unlikely(map_value_has_timer(map))) + *(struct bpf_timer *)(dst + map->timer_off) = + (struct bpf_timer){}; +} + +/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { + u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0; + if (unlikely(map_value_has_spin_lock(map))) { - u32 off = map->spin_lock_off; + s_off = map->spin_lock_off; + s_sz = sizeof(struct bpf_spin_lock); + } else if (unlikely(map_value_has_timer(map))) { + t_off = map->timer_off; + t_sz = sizeof(struct bpf_timer); + } - memcpy(dst, src, off); - memcpy(dst + off + sizeof(struct bpf_spin_lock), - src + off + sizeof(struct bpf_spin_lock), - map->value_size - off - sizeof(struct bpf_spin_lock)); + if (unlikely(s_sz || t_sz)) { + if (s_off < t_off || !s_sz) { + swap(s_off, t_off); + swap(s_sz, t_sz); + } + memcpy(dst, src, t_off); + memcpy(dst + t_off + t_sz, + src + t_off + t_sz, + s_off - t_off - t_sz); + memcpy(dst + s_off + s_sz, + src + s_off + s_sz, + map->value_size - s_off - s_sz); } else { memcpy(dst, src, map->value_size); } diff --git a/include/linux/btf.h b/include/linux/btf.h index 94a0c976c90f..214fde93214b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -99,6 +99,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); +int btf_find_timer(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3c4105603f9d..cebd4fb06d19 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -287,6 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } +static void check_and_free_timer_in_array(struct bpf_array *arr, void *val) +{ + if (unlikely(map_value_has_timer(&arr->map))) + bpf_timer_cancel_and_free(val + arr->map.timer_off); +} + /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) @@ -321,6 +327,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); + check_and_free_timer_in_array(array, val); } return 0; } @@ -374,6 +381,19 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } +static void array_map_free_timers(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + if (likely(!map_value_has_timer(map))) + return; + + for (i = 0; i < array->map.max_entries; i++) + bpf_timer_cancel_and_free(array->value + array->elem_size * i + + map->timer_off); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { @@ -668,6 +688,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, + .map_release_uref = array_map_free_timers, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cb4b72997d9b..7780131f710e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3046,43 +3046,92 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -/* find 'struct bpf_spin_lock' in map value. - * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, + const char *name, int sz, int align) { const struct btf_member *member; u32 i, off = -ENOENT; - if (!__btf_type_is_struct(t)) - return -EINVAL; - for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); if (!__btf_type_is_struct(member_type)) continue; - if (member_type->size != sizeof(struct bpf_spin_lock)) + if (member_type->size != sz) continue; - if (strcmp(__btf_name_by_offset(btf, member_type->name_off), - "bpf_spin_lock")) + if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) continue; if (off != -ENOENT) - /* only one 'struct bpf_spin_lock' is allowed */ + /* only one such field is allowed */ return -E2BIG; off = btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; off /= 8; - if (off % __alignof__(struct bpf_spin_lock)) - /* valid struct bpf_spin_lock will be 4 byte aligned */ + if (off % align) + return -EINVAL; + } + return off; +} + +static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, + const char *name, int sz, int align) +{ + const struct btf_var_secinfo *vsi; + u32 i, off = -ENOENT; + + for_each_vsi(i, t, vsi) { + const struct btf_type *var = btf_type_by_id(btf, vsi->type); + const struct btf_type *var_type = btf_type_by_id(btf, var->type); + + if (!__btf_type_is_struct(var_type)) + continue; + if (var_type->size != sz) + continue; + if (vsi->size != sz) + continue; + if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + continue; + if (off != -ENOENT) + /* only one such field is allowed */ + return -E2BIG; + off = vsi->offset; + if (off % align) return -EINVAL; } return off; } +static int btf_find_field(const struct btf *btf, const struct btf_type *t, + const char *name, int sz, int align) +{ + + if (__btf_type_is_struct(t)) + return btf_find_struct_field(btf, t, name, sz, align); + else if (btf_type_is_datasec(t)) + return btf_find_datasec_var(btf, t, name, sz, align); + return -EINVAL; +} + +/* find 'struct bpf_spin_lock' in map value. + * return >= 0 offset if found + * and < 0 in case of error + */ +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +{ + return btf_find_field(btf, t, "bpf_spin_lock", + sizeof(struct bpf_spin_lock), + __alignof__(struct bpf_spin_lock)); +} + +int btf_find_timer(const struct btf *btf, const struct btf_type *t) +{ + return btf_find_field(btf, t, "bpf_timer", + sizeof(struct bpf_timer), + __alignof__(struct bpf_timer)); +} + static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 72c58cc516a3..6dc3fae46a56 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -228,6 +228,32 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +static bool htab_has_extra_elems(struct bpf_htab *htab) +{ + return !htab_is_percpu(htab) && !htab_is_lru(htab); +} + +static void htab_free_prealloced_timers(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (likely(!map_value_has_timer(&htab->map))) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_timer_cancel_and_free(elem->key + + round_up(htab->map.key_size, 8) + + htab->map.timer_off); + cond_resched(); + } +} + static void htab_free_elems(struct bpf_htab *htab) { int i; @@ -265,8 +291,12 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, struct htab_elem *l; if (node) { + u32 key_size = htab->map.key_size; + l = container_of(node, struct htab_elem, lru_node); - memcpy(l->key, key, htab->map.key_size); + memcpy(l->key, key, key_size); + check_and_init_map_value(&htab->map, + l->key + round_up(key_size, 8)); return l; } @@ -278,7 +308,7 @@ static int prealloc_init(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; - if (!htab_is_percpu(htab) && !htab_is_lru(htab)) + if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, @@ -695,6 +725,14 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } +static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) +{ + if (unlikely(map_value_has_timer(&htab->map))) + bpf_timer_cancel_and_free(elem->key + + round_up(htab->map.key_size, 8) + + htab->map.timer_off); +} + /* It is called from the bpf_lru_list when the LRU needs to delete * older elements from the htab. */ @@ -719,6 +757,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); + check_and_free_timer(htab, l); break; } @@ -790,6 +829,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + check_and_free_timer(htab, l); kfree(l); } @@ -817,6 +857,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { + check_and_free_timer(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -920,8 +961,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } - check_and_init_map_lock(&htab->map, - l_new->key + round_up(key_size, 8)); + check_and_init_map_value(&htab->map, + l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); @@ -1062,6 +1103,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); + else + check_and_free_timer(htab, l_old); } ret = 0; err: @@ -1069,6 +1112,12 @@ err: return ret; } +static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) +{ + check_and_free_timer(htab, elem); + bpf_lru_push_free(&htab->lru, &elem->lru_node); +} + static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { @@ -1102,7 +1151,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + copy_map_value(&htab->map, + l_new->key + round_up(map->key_size, 8), value); ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) @@ -1128,9 +1178,9 @@ err: htab_unlock_bucket(htab, b, hash, flags); if (ret) - bpf_lru_push_free(&htab->lru, &l_new->lru_node); + htab_lru_push_free(htab, l_new); else if (l_old) - bpf_lru_push_free(&htab->lru, &l_old->lru_node); + htab_lru_push_free(htab, l_old); return ret; } @@ -1339,7 +1389,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) htab_unlock_bucket(htab, b, hash, flags); if (l) - bpf_lru_push_free(&htab->lru, &l->lru_node); + htab_lru_push_free(htab, l); return ret; } @@ -1359,6 +1409,35 @@ static void delete_all_elements(struct bpf_htab *htab) } } +static void htab_free_malloced_timers(struct bpf_htab *htab) +{ + int i; + + rcu_read_lock(); + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_nulls_head *head = select_bucket(htab, i); + struct hlist_nulls_node *n; + struct htab_elem *l; + + hlist_nulls_for_each_entry(l, n, head, hash_node) + check_and_free_timer(htab, l); + cond_resched_rcu(); + } + rcu_read_unlock(); +} + +static void htab_map_free_timers(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + if (likely(!map_value_has_timer(&htab->map))) + return; + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers(htab); + else + htab_free_prealloced_timers(htab); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1456,7 +1535,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, else copy_map_value(map, value, l->key + roundup_key_size); - check_and_init_map_lock(map, value); + check_and_init_map_value(map, value); } hlist_nulls_del_rcu(&l->hash_node); @@ -1467,7 +1546,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, htab_unlock_bucket(htab, b, hash, bflags); if (is_lru_map && l) - bpf_lru_push_free(&htab->lru, &l->lru_node); + htab_lru_push_free(htab, l); return ret; } @@ -1645,7 +1724,7 @@ again_nocopy: true); else copy_map_value(map, dst_val, value); - check_and_init_map_lock(map, dst_val); + check_and_init_map_value(map, dst_val); } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); @@ -1672,7 +1751,7 @@ again_nocopy: while (node_to_free) { l = node_to_free; node_to_free = node_to_free->batch_flink; - bpf_lru_push_free(&htab->lru, &l->lru_node); + htab_lru_push_free(htab, l); } next_batch: @@ -2034,6 +2113,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, + .map_release_uref = htab_map_free_timers, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2055,6 +2135,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, + .map_release_uref = htab_map_free_timers, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index bd11db9774c3..95d70a08325d 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -173,7 +173,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return -ENOMEM; memcpy(&new->data[0], value, map->value_size); - check_and_init_map_lock(map, new->data); + check_and_init_map_value(map, new->data); new = xchg(&storage->buf, new); kfree_rcu(new, rcu); @@ -509,7 +509,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, map->numa_node); if (!storage->buf) goto enomem; - check_and_init_map_lock(map, storage->buf->data); + check_and_init_map_value(map, storage->buf->data); } else { storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); if (!storage->percpu_buf) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 39ab0b68cade..890dfe14e731 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -50,6 +50,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; + inner_map_meta->timer_off = inner_map->timer_off; /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; @@ -75,6 +76,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && + meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5d1fee634be8..9a2068e39d23 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -260,8 +260,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, value, ptr, true); else copy_map_value(map, value, ptr); - /* mask lock, since value wasn't zero inited */ - check_and_init_map_lock(map, value); + /* mask lock and timer, since value wasn't zero inited */ + check_and_init_map_value(map, value); } rcu_read_unlock(); } @@ -623,7 +623,8 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) struct bpf_map *map = filp->private_data; int err; - if (!map->ops->map_mmap || map_value_has_spin_lock(map)) + if (!map->ops->map_mmap || map_value_has_spin_lock(map) || + map_value_has_timer(map)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -793,6 +794,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, } } + map->timer_off = btf_find_timer(btf, value_type); + if (map_value_has_timer(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) + return -EOPNOTSUPP; + } + if (map->ops->map_check_btf) ret = map->ops->map_check_btf(map, btf, key_type, value_type); @@ -844,6 +855,7 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); map->spin_lock_off = -EINVAL; + map->timer_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1591,7 +1603,8 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || + map_value_has_timer(map)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e8645c819803..12b50f46a7c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } + if (map_value_has_timer(map)) { + u32 t = map->timer_off; + + if (reg->smin_value + off < t + sizeof(struct bpf_timer) && + t < reg->umax_value + off + size) { + verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); + return -EACCES; + } + } return err; } @@ -4675,9 +4684,24 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (val) { - /* This restriction will be removed in the next patch */ - verbose(env, "bpf_timer field can only be first in the map value element\n"); + if (!map_value_has_timer(map)) { + if (map->timer_off == -E2BIG) + verbose(env, + "map '%s' has more than one 'struct bpf_timer'\n", + map->name); + else if (map->timer_off == -ENOENT) + verbose(env, + "map '%s' doesn't have 'struct bpf_timer'\n", + map->name); + else + verbose(env, + "map '%s' is not a struct type or bpf_timer is mangled\n", + map->name); + return -EINVAL; + } + if (map->timer_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", + val + reg->off, map->timer_off); return -EINVAL; } if (meta->map_ptr) { -- cgit v1.2.3-71-gd317 From 3e8ce29850f1839d0603f925b30be9d8a4329917 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:11 -0700 Subject: bpf: Prevent pointer mismatch in bpf_timer_init. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf_timer_init() arguments are: 1. pointer to a timer (which is embedded in map element). 2. pointer to a map. Make sure that pointer to a timer actually belongs to that map. Use map_uid (which is unique id of inner map) to reject: inner_map1 = bpf_map_lookup_elem(outer_map, key1) inner_map2 = bpf_map_lookup_elem(outer_map, key2) if (inner_map1 && inner_map2) { timer = bpf_map_lookup_elem(inner_map1); if (timer) // mismatch would have been allowed bpf_timer_init(timer, inner_map2); } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-6-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 9 ++++++++- kernel/bpf/verifier.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e774ecc1cd1f..5d3169b57e6e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -53,7 +53,14 @@ struct bpf_reg_state { /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ - struct bpf_map *map_ptr; + struct { + struct bpf_map *map_ptr; + /* To distinguish map lookups from outer map + * the map_uid is non-zero for registers + * pointing to inner maps. + */ + u32 map_uid; + }; /* for PTR_TO_BTF_ID */ struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 12b50f46a7c1..8df2671c3d33 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -255,6 +255,7 @@ struct bpf_call_arg_meta { int mem_size; u64 msize_max_value; int ref_obj_id; + int map_uid; int func_id; struct btf *btf; u32 btf_id; @@ -1135,6 +1136,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = map->inner_map_meta; + /* transfer reg's id which is unique for every map_lookup_elem + * as UID of the inner map. + */ + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -4708,6 +4713,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verbose(env, "verifier bug. Two map pointers in a timer helper\n"); return -EFAULT; } + meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; } @@ -5006,11 +5012,29 @@ skip_type_check: if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - if (meta->map_ptr && meta->map_ptr != reg->map_ptr) { - verbose(env, "Map pointer doesn't match bpf_timer.\n"); - return -EINVAL; + if (meta->map_ptr) { + /* Use map_uid (which is unique id of inner map) to reject: + * inner_map1 = bpf_map_lookup_elem(outer_map, key1) + * inner_map2 = bpf_map_lookup_elem(outer_map, key2) + * if (inner_map1 && inner_map2) { + * timer = bpf_map_lookup_elem(inner_map1); + * if (timer) + * // mismatch would have been allowed + * bpf_timer_init(timer, inner_map2); + * } + * + * Comparing map_ptr is enough to distinguish normal and outer maps. + */ + if (meta->map_ptr != reg->map_ptr || + meta->map_uid != reg->map_uid) { + verbose(env, + "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", + meta->map_uid, reg->map_uid); + return -EINVAL; + } } meta->map_ptr = reg->map_ptr; + meta->map_uid = reg->map_uid; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within @@ -6204,6 +6228,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; + regs[BPF_REG_0].map_uid = meta.map_uid; if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; if (map_value_has_spin_lock(meta.map_ptr)) -- cgit v1.2.3-71-gd317 From 40ec00abf1cc92268e3e3320b36bbb33b2224808 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:12 -0700 Subject: bpf: Remember BTF of inner maps. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BTF is required for 'struct bpf_timer' to be recognized inside map value. The bpf timers are supported inside inner maps. Remember 'struct btf *' in inner_map_meta to make it available to the verifier in the sequence: struct bpf_map *inner_map = bpf_map_lookup_elem(&outer_map, ...); if (inner_map) timer = bpf_map_lookup_elem(&inner_map, ...); Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-7-alexei.starovoitov@gmail.com --- kernel/bpf/map_in_map.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 890dfe14e731..5cd8f5277279 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -3,6 +3,7 @@ */ #include #include +#include #include "map_in_map.h" @@ -51,6 +52,10 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; + if (inner_map->btf) { + btf_get(inner_map->btf); + inner_map_meta->btf = inner_map->btf; + } /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; @@ -66,6 +71,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { + btf_put(map_meta->btf); kfree(map_meta); } -- cgit v1.2.3-71-gd317 From 86fc6ee6e246438d394e41bb7cc210b0fe724872 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:13 -0700 Subject: bpf: Relax verifier recursion check. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the following bpf subprogram: static int timer_cb(void *map, void *key, void *value) { bpf_timer_set_callback(.., timer_cb); } the 'timer_cb' is a pointer to a function. ld_imm64 insn is used to carry this pointer. bpf_pseudo_func() returns true for such ld_imm64 insn. Unlike bpf_for_each_map_elem() the bpf_timer_set_callback() is asynchronous. Relax control flow check to allow such "recursion" that is seen as an infinite loop by check_cfg(). The distinction between bpf_for_each_map_elem() the bpf_timer_set_callback() is done in the follow up patch. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-8-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8df2671c3d33..1cb1b35e69b7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9463,8 +9463,12 @@ static int visit_func_call_insn(int t, int insn_cnt, init_explored_state(env, t + 1); if (visit_callee) { init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, + /* It's ok to allow recursion from CFG point of + * view. __check_func_call() will do the actual + * check. + */ + bpf_pseudo_func(insns + t)); } return ret; } -- cgit v1.2.3-71-gd317 From bfc6bb74e4f16ab264fa73398a7a79d7d2afac2e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:14 -0700 Subject: bpf: Implement verifier support for validation of async callbacks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf_for_each_map_elem() and bpf_timer_set_callback() helpers are relying on PTR_TO_FUNC infra in the verifier to validate addresses to subprograms and pass them into the helpers as function callbacks. In case of bpf_for_each_map_elem() the callback is invoked synchronously and the verifier treats it as a normal subprogram call by adding another bpf_func_state and new frame in __check_func_call(). bpf_timer_set_callback() doesn't invoke the callback directly. The subprogram will be called asynchronously from bpf_timer_cb(). Teach the verifier to validate such async callbacks as special kind of jump by pushing verifier state into stack and let pop_stack() process it. Special care needs to be taken during state pruning. The call insn doing bpf_timer_set_callback has to be a prune_point. Otherwise short timer callbacks might not have prune points in front of bpf_timer_set_callback() which means is_state_visited() will be called after this call insn is processed in __check_func_call(). Which means that another async_cb state will be pushed to be walked later and the verifier will eventually hit BPF_COMPLEXITY_LIMIT_JMP_SEQ limit. Since push_async_cb() looks like another push_stack() branch the infinite loop detection will trigger false positive. To recognize this case mark such states as in_async_callback_fn. To distinguish infinite loop in async callback vs the same callback called with different arguments for different map and timer add async_entry_cnt to bpf_func_state. Enforce return zero from async callbacks. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-9-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 9 +++- kernel/bpf/helpers.c | 8 ++- kernel/bpf/verifier.c | 123 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 131 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5d3169b57e6e..242d0b1a0772 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -208,12 +208,19 @@ struct bpf_func_state { * zero == main subprog */ u32 subprogno; + /* Every bpf_timer_start will increment async_entry_cnt. + * It's used to distinguish: + * void foo(void) { for(;;); } + * void foo(void) { bpf_timer_set_callback(,foo); } + */ + u32 async_entry_cnt; + bool in_callback_fn; + bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; - bool in_callback_fn; struct bpf_stack_state *stack; }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 74b16593983d..9fe846ec6bd1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1043,7 +1043,6 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) void *callback_fn; void *key; u32 idx; - int ret; callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) @@ -1066,10 +1065,9 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) key = value - round_up(map->key_size, 8); } - ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, - (u64)(long)key, - (u64)(long)value, 0, 0); - WARN_ON(ret != 0); /* Next patch moves this check into the verifier */ + BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key, + (u64)(long)value, 0, 0); + /* The verifier checked that return value is zero. */ this_cpu_write(hrtimer_running, NULL); out: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1cb1b35e69b7..ab06256bf6c8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -735,6 +735,10 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (state->refs[i].id) verbose(env, ",%d", state->refs[i].id); } + if (state->in_callback_fn) + verbose(env, " cb"); + if (state->in_async_callback_fn) + verbose(env, " async_cb"); verbose(env, "\n"); } @@ -1527,6 +1531,54 @@ static void init_func_state(struct bpf_verifier_env *env, init_reg_state(env, state); } +/* Similar to push_stack(), but for async callbacks */ +static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx, + int subprog) +{ + struct bpf_verifier_stack_elem *elem; + struct bpf_func_state *frame; + + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + if (!elem) + goto err; + + elem->insn_idx = insn_idx; + elem->prev_insn_idx = prev_insn_idx; + elem->next = env->head; + elem->log_pos = env->log.len_used; + env->head = elem; + env->stack_size++; + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { + verbose(env, + "The sequence of %d jumps is too complex for async cb.\n", + env->stack_size); + goto err; + } + /* Unlike push_stack() do not copy_verifier_state(). + * The caller state doesn't matter. + * This is async callback. It starts in a fresh stack. + * Initialize it similar to do_check_common(). + */ + elem->st.branches = 1; + frame = kzalloc(sizeof(*frame), GFP_KERNEL); + if (!frame) + goto err; + init_func_state(env, frame, + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno within this callchain */, + subprog /* subprog number within this prog */); + elem->st.frame[0] = frame; + return &elem->st; +err: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + /* pop all elements and return */ + while (!pop_stack(env, NULL, NULL, false)); + return NULL; +} + + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -5704,6 +5756,30 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } + if (insn->code == (BPF_JMP | BPF_CALL) && + insn->imm == BPF_FUNC_timer_set_callback) { + struct bpf_verifier_state *async_cb; + + /* there is no real recursion here. timer callbacks are async */ + async_cb = push_async_cb(env, env->subprog_info[subprog].start, + *insn_idx, subprog); + if (!async_cb) + return -EFAULT; + callee = async_cb->frame[0]; + callee->async_entry_cnt = caller->async_entry_cnt + 1; + + /* Convert bpf_timer_set_callback() args into timer callback args */ + err = set_callee_state_cb(env, caller, callee, *insn_idx); + if (err) + return err; + + clear_caller_saved_regs(env, caller->regs); + mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* continue with next insn after call */ + return 0; + } + callee = kzalloc(sizeof(*callee), GFP_KERNEL); if (!callee) return -ENOMEM; @@ -5856,6 +5932,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, /* unused */ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; return 0; } @@ -9224,7 +9301,8 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; - const bool is_subprog = env->cur_state->frame[0]->subprogno; + struct bpf_func_state *frame = env->cur_state->frame[0]; + const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog && @@ -9249,6 +9327,22 @@ static int check_return_code(struct bpf_verifier_env *env) } reg = cur_regs(env) + BPF_REG_0; + + if (frame->in_async_callback_fn) { + /* enforce return zero from async callbacks like timer */ + if (reg->type != SCALAR_VALUE) { + verbose(env, "In async callback the register R0 is not a known value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + + if (!tnum_in(tnum_const(0), reg->var_off)) { + verbose_invalid_scalar(env, reg, &range, "async callback", "R0"); + return -EINVAL; + } + return 0; + } + if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", @@ -9496,6 +9590,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: + if (insns[t].imm == BPF_FUNC_timer_set_callback) + /* Mark this call insn to trigger is_state_visited() check + * before call itself is processed by __check_func_call(). + * Otherwise new async state will be pushed for further + * exploration. + */ + init_explored_state(env, t); return visit_func_call_insn(t, insn_cnt, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); @@ -10503,9 +10604,25 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; if (sl->state.insn_idx != insn_idx) goto next; + if (sl->state.branches) { - if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur)) { + struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; + + if (frame->in_async_callback_fn && + frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) { + /* Different async_entry_cnt means that the verifier is + * processing another entry into async callback. + * Seeing the same state is not an indication of infinite + * loop or infinite recursion. + * But finding the same state doesn't mean that it's safe + * to stop processing the current state. The previous state + * hasn't yet reached bpf_exit, since state.branches > 0. + * Checking in_async_callback_fn alone is not enough either. + * Since the verifier still needs to catch infinite loops + * inside async callbacks. + */ + } else if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur)) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); return -EINVAL; -- cgit v1.2.3-71-gd317 From 7ddc80a476c2d599246028af5808d15f9e24c109 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:15 -0700 Subject: bpf: Teach stack depth check about async callbacks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Teach max stack depth checking algorithm about async callbacks that don't increase bpf program stack size. Also add sanity check that bpf_tail_call didn't sneak into async cb. It's impossible, since PTR_TO_CTX is not available in async cb, hence the program cannot contain bpf_tail_call(ctx,...); Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-10-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 242d0b1a0772..b847e1ccd10f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -406,6 +406,7 @@ struct bpf_subprog_info { bool has_tail_call; bool tail_call_reachable; bool has_ld_abs; + bool is_async_cb; }; /* single container for all structs diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ab06256bf6c8..344ee67265cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3709,6 +3709,8 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { + int next_insn; + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ @@ -3716,13 +3718,22 @@ continue_func: ret_prog[frame] = idx; /* find the callee */ - i = i + insn[i].imm + 1; - idx = find_subprog(env, i); + next_insn = i + insn[i].imm + 1; + idx = find_subprog(env, next_insn); if (idx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - i); + next_insn); return -EFAULT; } + if (subprog[idx].is_async_cb) { + if (subprog[idx].has_tail_call) { + verbose(env, "verifier bug. subprog has tail_call and async cb\n"); + return -EFAULT; + } + /* async callbacks don't increase bpf prog stack size */ + continue; + } + i = next_insn; if (subprog[idx].has_tail_call) tail_call_reachable = true; @@ -5761,6 +5772,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn struct bpf_verifier_state *async_cb; /* there is no real recursion here. timer callbacks are async */ + env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, *insn_idx, subprog); if (!async_cb) -- cgit v1.2.3-71-gd317 From 704adfb5a9978462cd861f170201ae2b5e3d3a80 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Jul 2021 00:02:06 -0400 Subject: tracing: Do not reference char * as a string in histograms The histogram logic was allowing events with char * pointers to be used as normal strings. But it was easy to crash the kernel with: # echo 'hist:keys=filename' > events/syscalls/sys_enter_openat/trigger And open some files, and boom! BUG: unable to handle page fault for address: 00007f2ced0c3280 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1173fa067 P4D 1173fa067 PUD 1171b6067 PMD 1171dd067 PTE 0 Oops: 0000 [#1] PREEMPT SMP CPU: 6 PID: 1810 Comm: cat Not tainted 5.13.0-rc5-test+ #61 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:strlen+0x0/0x20 Code: f6 82 80 2a 0b a9 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2a 0b a9 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74 10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 RSP: 0018:ffffbdbf81567b50 EFLAGS: 00010246 RAX: 0000000000000003 RBX: ffff93815cdb3800 RCX: ffff9382401a22d0 RDX: 0000000000000100 RSI: 0000000000000000 RDI: 00007f2ced0c3280 RBP: 0000000000000100 R08: ffff9382409ff074 R09: ffffbdbf81567c98 R10: ffff9382409ff074 R11: 0000000000000000 R12: ffff9382409ff074 R13: 0000000000000001 R14: ffff93815a744f00 R15: 00007f2ced0c3280 FS: 00007f2ced0f8580(0000) GS:ffff93825a800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2ced0c3280 CR3: 0000000107069005 CR4: 00000000001706e0 Call Trace: event_hist_trigger+0x463/0x5f0 ? find_held_lock+0x32/0x90 ? sched_clock_cpu+0xe/0xd0 ? lock_release+0x155/0x440 ? kernel_init_free_pages+0x6d/0x90 ? preempt_count_sub+0x9b/0xd0 ? kernel_init_free_pages+0x6d/0x90 ? get_page_from_freelist+0x12c4/0x1680 ? __rb_reserve_next+0xe5/0x460 ? ring_buffer_lock_reserve+0x12a/0x3f0 event_triggers_call+0x52/0xe0 ftrace_syscall_enter+0x264/0x2c0 syscall_trace_enter.constprop.0+0x1ee/0x210 do_syscall_64+0x1c/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Where it triggered a fault on strlen(key) where key was the filename. The reason is that filename is a char * to user space, and the histogram code just blindly dereferenced it, with obvious bad results. I originally tried to use strncpy_from_user/kernel_nofault() but found that there's other places that its dereferenced and not worth the effort. Just do not allow "char *" to act like strings. Link: https://lkml.kernel.org/r/20210715000206.025df9d2@rorschach.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tzvetomir Stoyanov Cc: stable@vger.kernel.org Acked-by: Namhyung Kim Acked-by: Tom Zanussi Fixes: 79e577cbce4c4 ("tracing: Support string type key properly") Fixes: 5967bd5c4239 ("tracing: Let filter_assign_type() detect FILTER_PTR_STRING") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0207aeed31e6..16a9dfc9fffc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1689,7 +1689,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (WARN_ON_ONCE(!field)) goto out; - if (is_string_field(field)) { + /* Pointers to strings are just pointers and dangerous to dereference */ + if (is_string_field(field) && + (field->filter_type != FILTER_PTR_STRING)) { flags |= HIST_FIELD_FL_STRING; hist_field->size = MAX_FILTER_STR_VAL; @@ -4495,8 +4497,6 @@ static inline void add_to_key(char *compound_key, void *key, field = key_field->field; if (field->filter_type == FILTER_DYN_STRING) size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_PTR_STRING) - size = strlen(key); else if (field->filter_type == FILTER_STATIC_STRING) size = field->size; -- cgit v1.2.3-71-gd317 From 1e37392cccdea94da635e3c6d16b21865806f619 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:54 +0200 Subject: bpf: Enable BPF_TRAMP_F_IP_ARG for trampolines with call_get_func_ip Enabling BPF_TRAMP_F_IP_ARG for trampolines that actually need it. The BPF_TRAMP_F_IP_ARG adds extra 3 instructions to trampoline code and is used only by programs with bpf_get_func_ip helper, which is added in following patch and sets call_get_func_ip bit. This patch ensures that BPF_TRAMP_F_IP_ARG flag is used only for trampolines that have programs with call_get_func_ip set. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-3-jolsa@kernel.org --- include/linux/filter.h | 3 ++- kernel/bpf/trampoline.c | 12 +++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 472f97074da0..ba36989f711a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -559,7 +559,8 @@ struct bpf_prog { kprobe_override:1, /* Do we override a kprobe? */ has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ - call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_func_ip:1; /* Do we call get_func_ip() */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 28a3630c48ee..b2535acfe9db 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -172,7 +172,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } static struct bpf_tramp_progs * -bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) +bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { const struct bpf_prog_aux *aux; struct bpf_tramp_progs *tprogs; @@ -189,8 +189,10 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) *total += tr->progs_cnt[kind]; progs = tprogs[kind].progs; - hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) + hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= aux->prog->call_get_func_ip; *progs++ = aux->prog; + } } return tprogs; } @@ -333,9 +335,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) struct bpf_tramp_image *im; struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; + bool ip_arg = false; int err, total; - tprogs = bpf_trampoline_get_progs(tr, &total); + tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg); if (IS_ERR(tprogs)) return PTR_ERR(tprogs); @@ -357,6 +360,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + if (ip_arg) + flags |= BPF_TRAMP_F_IP_ARG; + err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, &tr->func.model, flags, tprogs, tr->func.addr); -- cgit v1.2.3-71-gd317 From 9b99edcae5c80c8fb9f8e7149bae528c9e610a72 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:55 +0200 Subject: bpf: Add bpf_get_func_ip helper for tracing programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_TRACING programs, specifically for all trampoline attach types. The trampoline's caller IP address is stored in (ctx - 8) address. so there's no reason to actually call the helper, but rather fixup the call instruction and return [ctx - 8] value directly. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/verifier.c | 43 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 72 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3544ec5234f0..89688f16ad60 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4841,6 +4841,12 @@ union bpf_attr { * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5016,6 +5022,7 @@ union bpf_attr { FN(timer_set_callback), \ FN(timer_start), \ FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 344ee67265cc..ceef190514e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6161,6 +6161,27 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, return err; } +static int check_get_func_ip(struct bpf_verifier_env *env) +{ + enum bpf_attach_type eatype = env->prog->expected_attach_type; + enum bpf_prog_type type = resolve_prog_type(env->prog); + int func_id = BPF_FUNC_get_func_ip; + + if (type == BPF_PROG_TYPE_TRACING) { + if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT && + eatype != BPF_MODIFY_RETURN) { + verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", + func_id_name(func_id), func_id); + return -ENOTSUPP; + } + return 0; + } + + verbose(env, "func %s#%d not supported for program type %d\n", + func_id_name(func_id), func_id, type); + return -ENOTSUPP; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -6439,6 +6460,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) env->prog->call_get_stack = true; + if (func_id == BPF_FUNC_get_func_ip) { + if (check_get_func_ip(env)) + return -ENOTSUPP; + env->prog->call_get_func_ip = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -12632,6 +12659,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; bool expect_blinding = bpf_jit_blinding_enabled(prog); + enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; @@ -12998,6 +13026,21 @@ patch_map_ops_generic: continue; } + /* Implement bpf_get_func_ip inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ip) { + /* Load IP address from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + if (!new_prog) + return -ENOMEM; + + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c77d25137e0..3e71503eeb23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -948,6 +948,19 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) +{ + /* This helper call is inlined by verifier. */ + return ((u64 *)ctx)[-1]; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { + .func = bpf_get_func_ip_tracing, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1058,6 +1071,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_for_each_map_elem_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_tracing; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3544ec5234f0..89688f16ad60 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4841,6 +4841,12 @@ union bpf_attr { * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5016,6 +5022,7 @@ union bpf_attr { FN(timer_set_callback), \ FN(timer_start), \ FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-71-gd317 From 9ffd9f3ff7193933dae171740ab70a103d460065 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:56 +0200 Subject: bpf: Add bpf_get_func_ip helper for kprobe programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_KPROBE programs, so it's now possible to call bpf_get_func_ip from both kprobe and kretprobe programs. Taking the caller's address from 'struct kprobe::addr', which is defined for both kprobe and kretprobe. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210714094400.396467-5-jolsa@kernel.org --- include/uapi/linux/bpf.h | 2 +- kernel/bpf/verifier.c | 2 ++ kernel/trace/bpf_trace.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 +- 4 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 89688f16ad60..2db6925e04f4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4844,7 +4844,7 @@ union bpf_attr { * * u64 bpf_get_func_ip(void *ctx) * Description - * Get address of the traced function (for tracing programs). + * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ceef190514e4..97216f799ba8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6175,6 +6175,8 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } return 0; + } else if (type == BPF_PROG_TYPE_KPROBE) { + return 0; } verbose(env, "func %s#%d not supported for program type %d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e71503eeb23..0b113716bc7a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -961,6 +961,20 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) +{ + struct kprobe *kp = kprobe_running(); + + return kp ? (u64) kp->addr : 0; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { + .func = bpf_get_func_ip_kprobe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1092,6 +1106,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_kprobe; default: return bpf_tracing_func_proto(func_id, prog); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 89688f16ad60..2db6925e04f4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4844,7 +4844,7 @@ union bpf_attr { * * u64 bpf_get_func_ip(void *ctx) * Description - * Get address of the traced function (for tracing programs). + * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. */ -- cgit v1.2.3-71-gd317 From 17edea21b38d047a10c189296c58aea9875d0d0a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 4 Jul 2021 12:02:42 -0700 Subject: sock_map: Relax config dependency to CONFIG_NET Currently sock_map still has Kconfig dependency on CONFIG_INET, but there is no actual functional dependency on it after we introduce ->psock_update_sk_prot(). We have to extend it to CONFIG_NET now as we are going to support AF_UNIX. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210704190252.11866-2-xiyou.wangcong@gmail.com --- include/linux/bpf.h | 38 ++++++++++++++++++++------------------ kernel/bpf/Kconfig | 2 +- net/core/Makefile | 2 -- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 94d77dc7ce35..d25c16c365e5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1887,6 +1887,12 @@ void bpf_map_offload_map_free(struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); +void sock_map_unhash(struct sock *sk); +void sock_map_close(struct sock *sk, long timeout); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1919,24 +1925,6 @@ static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, { return -ENOTSUPP; } -#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ - -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); -int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); -void sock_map_unhash(struct sock *sk); -void sock_map_close(struct sock *sk, long timeout); - -void bpf_sk_reuseport_detach(struct sock *sk); -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value); -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags); -#else -static inline void bpf_sk_reuseport_detach(struct sock *sk) -{ -} #ifdef CONFIG_BPF_SYSCALL static inline int sock_map_get_from_fd(const union bpf_attr *attr, @@ -1956,7 +1944,21 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bd04f4a44c01..a82d6de86522 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -29,7 +29,7 @@ config BPF_SYSCALL select IRQ_WORK select TASKS_TRACE_RCU select BINARY_PRINTF - select NET_SOCK_MSG if INET + select NET_SOCK_MSG if NET default n help Enable the bpf() system call that allows to manipulate BPF programs diff --git a/net/core/Makefile b/net/core/Makefile index f7f16650fe9e..35ced6201814 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,8 +33,6 @@ obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o -ifeq ($(CONFIG_INET),y) obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o -endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o -- cgit v1.2.3-71-gd317 From 40ac971eab89330d6153e7721e88acd2d98833f9 Mon Sep 17 00:00:00 2001 From: Roman Skakun Date: Fri, 16 Jul 2021 11:39:34 +0300 Subject: dma-mapping: handle vmalloc addresses in dma_common_{mmap,get_sgtable} xen-swiotlb can use vmalloc backed addresses for dma coherent allocations and uses the common helpers. Properly handle them to unbreak Xen on ARM platforms. Fixes: 1b65c4e5a9af ("swiotlb-xen: use xen_alloc/free_coherent_pages") Signed-off-by: Roman Skakun Reviewed-by: Andrii Anisov [hch: split the patch, renamed the helpers] Signed-off-by: Christoph Hellwig --- kernel/dma/ops_helpers.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 910ae69cae77..af4a6ef48ce0 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -5,6 +5,13 @@ */ #include +static struct page *dma_common_vaddr_to_page(void *cpu_addr) +{ + if (is_vmalloc_addr(cpu_addr)) + return vmalloc_to_page(cpu_addr); + return virt_to_page(cpu_addr); +} + /* * Create scatter-list for the already allocated DMA buffer. */ @@ -12,7 +19,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - struct page *page = virt_to_page(cpu_addr); + struct page *page = dma_common_vaddr_to_page(cpu_addr); int ret; ret = sg_alloc_table(sgt, 1, GFP_KERNEL); @@ -32,6 +39,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; + struct page *page = dma_common_vaddr_to_page(cpu_addr); int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); @@ -43,7 +51,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; return remap_pfn_range(vma, vma->vm_start, - page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, + page_to_pfn(page) + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; -- cgit v1.2.3-71-gd317 From 59089a189e3adde4cf85f2ce479738d1ae4c514d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 29 Jun 2021 09:39:15 +0000 Subject: bpf: Remove superfluous aux sanitation on subprog rejection Follow-up to fe9a5ca7e370 ("bpf: Do not mark insn as seen under speculative path verification"). The sanitize_insn_aux_data() helper does not serve a particular purpose in today's code. The original intention for the helper was that if function-by-function verification fails, a given program would be cleared from temporary insn_aux_data[], and then its verification would be re-attempted in the context of the main program a second time. However, a failure in do_check_subprogs() will skip do_check_main() and propagate the error to the user instead, thus such situation can never occur. Given its interaction is not compatible to the Spectre v1 mitigation (due to comparing aux->seen with env->pass_cnt), just remove sanitize_insn_aux_data() to avoid future bugs in this area. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9de3c9c3267c..8a7a28b4cfb9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12752,37 +12752,6 @@ static void free_states(struct bpf_verifier_env *env) } } -/* The verifier is using insn_aux_data[] to store temporary data during - * verification and to store information for passes that run after the - * verification like dead code sanitization. do_check_common() for subprogram N - * may analyze many other subprograms. sanitize_insn_aux_data() clears all - * temporary data after do_check_common() finds that subprogram N cannot be - * verified independently. pass_cnt counts the number of times - * do_check_common() was run and insn->aux->seen tells the pass number - * insn_aux_data was touched. These variables are compared to clear temporary - * data from failed pass. For testing and experiments do_check_common() can be - * run multiple times even when prior attempt to verify is unsuccessful. - * - * Note that special handling is needed on !env->bypass_spec_v1 if this is - * ever called outside of error path with subsequent program rejection. - */ -static void sanitize_insn_aux_data(struct bpf_verifier_env *env) -{ - struct bpf_insn *insn = env->prog->insnsi; - struct bpf_insn_aux_data *aux; - int i, class; - - for (i = 0; i < env->prog->len; i++) { - class = BPF_CLASS(insn[i].code); - if (class != BPF_LDX && class != BPF_STX) - continue; - aux = &env->insn_aux_data[i]; - if (aux->seen != env->pass_cnt) - continue; - memset(aux, 0, offsetof(typeof(*aux), orig_idx)); - } -} - static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); @@ -12859,9 +12828,6 @@ out: if (!ret && pop_log) bpf_vlog_reset(&env->log, 0); free_states(env); - if (ret) - /* clean aux data in case subprog was rejected */ - sanitize_insn_aux_data(env); return ret; } -- cgit v1.2.3-71-gd317 From e042aa532c84d18ff13291d00620502ce7a38dda Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 16 Jul 2021 09:18:21 +0000 Subject: bpf: Fix pointer arithmetic mask tightening under state pruning In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we narrowed the offset mask for unprivileged pointer arithmetic in order to mitigate a corner case where in the speculative domain it is possible to advance, for example, the map value pointer by up to value_size-1 out-of- bounds in order to leak kernel memory via side-channel to user space. The verifier's state pruning for scalars leaves one corner case open where in the first verification path R_x holds an unknown scalar with an aux->alu_limit of e.g. 7, and in a second verification path that same register R_x, here denoted as R_x', holds an unknown scalar which has tighter bounds and would thus satisfy range_within(R_x, R_x') as well as tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3: Given the second path fits the register constraints for pruning, the final generated mask from aux->alu_limit will remain at 7. While technically not wrong for the non-speculative domain, it would however be possible to craft similar cases where the mask would be too wide as in 7fedb63a8307. One way to fix it is to detect the presence of unknown scalar map pointer arithmetic and force a deeper search on unknown scalars to ensure that we do not run into a masking mismatch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e774ecc1cd1f..7ba7e800d472 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -414,6 +414,7 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ + bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; bool allow_ptr_to_map_access; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8a7a28b4cfb9..657062cb4d85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6561,6 +6561,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + /* Limit pruning on unknown scalars to enable deep search for + * potential masking differences from other program paths. + */ + if (!off_is_imm) + env->explore_alu_limits = true; } err = update_alu_sanitation_state(aux, alu_state, alu_limit); @@ -9936,8 +9942,8 @@ next: } /* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) +static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) { bool equal; @@ -9963,6 +9969,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; switch (rold->type) { case SCALAR_VALUE: + if (env->explore_alu_limits) + return false; if (rcur->type == SCALAR_VALUE) { if (!rold->precise && !rcur->precise) return true; @@ -10053,9 +10061,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_func_state *old, - struct bpf_func_state *cur, - struct bpf_id_pair *idmap) +static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_id_pair *idmap) { int i, spi; @@ -10100,9 +10107,8 @@ static bool stacksafe(struct bpf_func_state *old, continue; if (old->stack[spi].slot_type[0] != STACK_SPILL) continue; - if (!regsafe(&old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, - idmap)) + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -10159,10 +10165,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + if (!regsafe(env, &old->regs[i], &cur->regs[i], + env->idmap_scratch)) return false; - if (!stacksafe(old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, env->idmap_scratch)) return false; if (!refsafe(old, cur)) -- cgit v1.2.3-71-gd317 From 1f8c543f142942a2325e729fc4c64b6898749c3a Mon Sep 17 00:00:00 2001 From: zhaoxiaoqiang11 Date: Fri, 16 Jul 2021 17:18:53 +0800 Subject: cgroup: remove cgroup_mount from comments Git rid of an outdated comment. Since cgroup was fully switched to fs_context, cgroup_mount() is gone and it's confusing to mention in comments of cgroup_kill_sb(). Delete it. Signed-off-by: zhaoxiaoqiang11 Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3a0161c21b6b..d0725c1a8db5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2169,7 +2169,6 @@ static void cgroup_kill_sb(struct super_block *sb) /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). - * cgroup_mount() may wait for @root's release. * * And don't kill the default root. */ -- cgit v1.2.3-71-gd317 From 9d7a6c95f62bc335b62aaf9d50590122bd03a796 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 5 Jul 2021 10:44:52 +0200 Subject: perf: Fix required permissions if sigtrap is requested If perf_event_open() is called with another task as target and perf_event_attr::sigtrap is set, and the target task's user does not match the calling user, also require the CAP_KILL capability or PTRACE_MODE_ATTACH permissions. Otherwise, with the CAP_PERFMON capability alone it would be possible for a user to send SIGTRAP signals via perf events to another user's tasks. This could potentially result in those tasks being terminated if they cannot handle SIGTRAP signals. Note: The check complements the existing capability check, but is not supposed to supersede the ptrace_may_access() check. At a high level we now have: capable of CAP_PERFMON and (CAP_KILL if sigtrap) OR ptrace_may_access(...) // also checks for same thread-group and uid Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Cc: # 5.13+ Link: https://lore.kernel.org/r/20210705084453.2151729-1-elver@google.com --- kernel/events/core.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 464917096e73..c13730b7ac01 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12158,10 +12158,33 @@ SYSCALL_DEFINE5(perf_event_open, } if (task) { + unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; + bool is_capable; + err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_file; + is_capable = perfmon_capable(); + if (attr.sigtrap) { + /* + * perf_event_attr::sigtrap sends signals to the other + * task. Require the current task to also have + * CAP_KILL. + */ + rcu_read_lock(); + is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); + rcu_read_unlock(); + + /* + * If the required capabilities aren't available, checks + * for ptrace permissions: upgrade to ATTACH, since + * sending signals can effectively change the target + * task. + */ + ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; + } + /* * Preserve ptrace permission check for backwards compatibility. * @@ -12171,7 +12194,7 @@ SYSCALL_DEFINE5(perf_event_open, * perf_event_exit_task() that could imply). */ err = -EACCES; - if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + if (!is_capable && !ptrace_may_access(task, ptrace_mode)) goto err_cred; } -- cgit v1.2.3-71-gd317 From b068fc04de10fff8974f6ef32b861ad134d94ba4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 5 Jul 2021 10:44:53 +0200 Subject: perf: Refactor permissions check into perf_check_permission() Refactor the permission check in perf_event_open() into a helper perf_check_permission(). This makes the permission check logic more readable (because we no longer have a negated disjunction). Add a comment mentioning the ptrace check also checks the uid. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20210705084453.2151729-2-elver@google.com --- kernel/events/core.c | 58 +++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c13730b7ac01..1cb1f9b8392e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11917,6 +11917,37 @@ again: return gctx; } +static bool +perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) +{ + unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; + bool is_capable = perfmon_capable(); + + if (attr->sigtrap) { + /* + * perf_event_attr::sigtrap sends signals to the other task. + * Require the current task to also have CAP_KILL. + */ + rcu_read_lock(); + is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); + rcu_read_unlock(); + + /* + * If the required capabilities aren't available, checks for + * ptrace permissions: upgrade to ATTACH, since sending signals + * can effectively change the target task. + */ + ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; + } + + /* + * Preserve ptrace permission check for backwards compatibility. The + * ptrace check also includes checks that the current task and other + * task have matching uids, and is therefore not done here explicitly. + */ + return is_capable || ptrace_may_access(task, ptrace_mode); +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -12158,43 +12189,18 @@ SYSCALL_DEFINE5(perf_event_open, } if (task) { - unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; - bool is_capable; - err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_file; - is_capable = perfmon_capable(); - if (attr.sigtrap) { - /* - * perf_event_attr::sigtrap sends signals to the other - * task. Require the current task to also have - * CAP_KILL. - */ - rcu_read_lock(); - is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); - rcu_read_unlock(); - - /* - * If the required capabilities aren't available, checks - * for ptrace permissions: upgrade to ATTACH, since - * sending signals can effectively change the target - * task. - */ - ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; - } - /* - * Preserve ptrace permission check for backwards compatibility. - * * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). */ err = -EACCES; - if (!is_capable && !ptrace_may_access(task, ptrace_mode)) + if (!perf_check_permission(&attr, task)) goto err_cred; } -- cgit v1.2.3-71-gd317 From d4e5076c3522658996dbb050aa6c708bd2c1a3c1 Mon Sep 17 00:00:00 2001 From: xuyehan Date: Tue, 6 Jul 2021 12:50:43 +0800 Subject: locking/rwsem: Remove an unused parameter of rwsem_wake() The 2nd parameter 'count' is not used in this function. The places where the function is called are also modified. Signed-off-by: xuyehan Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/1625547043-28103-1-git-send-email-yehanxu1@gmail.com --- kernel/locking/rwsem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 809b0016d344..2cad15df296f 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1165,7 +1165,7 @@ out_nolock: * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -1297,7 +1297,7 @@ static inline void __up_read(struct rw_semaphore *sem) if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { clear_nonspinnable(sem); - rwsem_wake(sem, tmp); + rwsem_wake(sem); } } @@ -1319,7 +1319,7 @@ static inline void __up_write(struct rw_semaphore *sem) rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) - rwsem_wake(sem, tmp); + rwsem_wake(sem); } /* -- cgit v1.2.3-71-gd317 From c7603cfa04e7c3a435b31d065f7cbdc829428f6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Jul 2021 16:06:15 -0700 Subject: bpf: Add ambient BPF runtime context stored in current b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions. While this seems to work good in practice, it introduces new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely it is. It might also not be so unlikely when/if we allow sleepable cgroup BPF programs in the future. Further, the way that cgroup storage is implemented as ambiently-available property during entire BPF program execution is a convenient way to pass extra information to BPF program and helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that can allow implementing this without arbitrary restrictions. Ideally, such solution would work for both preemptable and sleepable BPF programs in exactly the same way. This patch introduces such solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering previous current->bpf_ctx value locally while executing nested BPF program and restoring old value after nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively. Restoring old value of the pointer handles preemption, while bpf_run_ctx pointer being a property of current task_struct naturally solves this problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of CPU. It would even allow CPU migration of BPF programs, even though it's not currently allowed by BPF infra. This patch cleans up cgroup local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow up patches are planned that will expand this mechanism for other uses within tracing BPF programs. To verify that this change doesn't revert the fix to the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work). [0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/ Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org --- include/linux/bpf-cgroup.h | 54 ---------------------------------------------- include/linux/bpf.h | 54 +++++++++++++++++++++++++++++----------------- include/linux/sched.h | 3 +++ kernel/bpf/helpers.c | 16 +++++--------- kernel/bpf/local_storage.c | 3 --- kernel/fork.c | 1 + net/bpf/test_run.c | 23 ++++++++++---------- 7 files changed, 54 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8b77d08d4b47..a74cd1c3bd87 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -27,19 +27,6 @@ struct task_struct; extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) -#define BPF_CGROUP_STORAGE_NEST_MAX 8 - -struct bpf_cgroup_storage_info { - struct task_struct *task; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; -}; - -/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks - * to use bpf cgroup storage simultaneously. - */ -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -172,44 +159,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( return BPF_CGROUP_STORAGE_SHARED; } -static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - enum bpf_cgroup_storage_type stype; - int i, err = 0; - - preempt_disable(); - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, current); - for_each_cgroup_storage_type(stype) - this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], - storage[stype]); - goto out; - } - err = -EBUSY; - WARN_ON_ONCE(1); - -out: - preempt_enable(); - return err; -} - -static inline void bpf_cgroup_storage_unset(void) -{ - int i; - - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); - return; - } -} - struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); @@ -487,9 +436,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } -static inline int bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } -static inline void bpf_cgroup_storage_unset(void) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0edff8f5177e..978ebd16ae60 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1142,38 +1142,40 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +struct bpf_run_ctx {}; + +struct bpf_cg_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_prog_array_item *prog_item; +}; + /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) -/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY, - * if bpf_cgroup_storage_set() failed, the rest of programs - * will not execute. This should be a really rare scenario - * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of - * preemptions all between bpf_cgroup_storage_set() and - * bpf_cgroup_storage_unset() on the same cpu. - */ #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ u32 func_ret; \ migrate_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ while ((_prog = READ_ONCE(_item->prog))) { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ + run_ctx.prog_item = _item; \ func_ret = func(_prog, ctx); \ _ret &= (func_ret & 1); \ - *(ret_flags) |= (func_ret >> 1); \ - bpf_cgroup_storage_unset(); \ + *(ret_flags) |= (func_ret >> 1); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ rcu_read_unlock(); \ migrate_enable(); \ _ret; \ @@ -1184,6 +1186,8 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ migrate_disable(); \ rcu_read_lock(); \ @@ -1191,17 +1195,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, if (unlikely(check_non_null && !_array))\ goto _out; \ _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - if (!set_cg_storage) { \ - _ret &= func(_prog, ctx); \ - } else { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ - _ret &= func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ - } \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ + while ((_prog = READ_ONCE(_item->prog))) { \ + run_ctx.prog_item = _item; \ + _ret &= func(_prog, ctx); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ _out: \ rcu_read_unlock(); \ migrate_enable(); \ @@ -1284,6 +1284,20 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx; + + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ + current->bpf_ctx = old_ctx; +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8d07d88641..c64119aa2e60 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,6 +42,7 @@ struct backing_dev_info; struct bio_list; struct blk_plug; struct bpf_local_storage; +struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1379,6 +1380,8 @@ struct task_struct { #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; #endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9fe846ec6bd1..15746f779fe1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -393,8 +393,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -403,17 +401,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; void *ptr; - int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); - break; - } + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 95d70a08325d..362e81481594 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -11,9 +11,6 @@ #ifdef CONFIG_CGROUP_BPF -DEFINE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/kernel/fork.c b/kernel/fork.c index bc94b2cc5995..e8b41e212110 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2083,6 +2083,7 @@ static __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_BPF_SYSCALL RCU_INIT_POINTER(p->bpf_storage, NULL); + p->bpf_ctx = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index cda8375bbbaf..8d46e2962786 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -88,17 +88,19 @@ reset: static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_prog_array_item item = {.prog = prog}; + struct bpf_run_ctx *old_ctx; + struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; + item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(item.cgroup_storage[stype])) { + item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } @@ -107,22 +109,19 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, repeat = 1; bpf_test_timer_enter(&t); + old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { - ret = bpf_cgroup_storage_set(storage); - if (ret) - break; - + run_ctx.prog_item = &item; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); - - bpf_cgroup_storage_unset(); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } -- cgit v1.2.3-71-gd317 From a1ad4b8a19566b11e0306f8b197f2fd4567340e5 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 15 Jun 2021 17:52:48 +0100 Subject: printk: Straighten out log_flags into printk_info_flags In the past, `enum log_flags` was part of `struct log`, hence the name. `struct log` has since been reworked and now this struct is stored inside `struct printk_info`. However, the name was never updated, which is somewhat confusing -- especially since these flags operate at the record level rather than at the level of an abstract log. printk_info_flags also joins its other metadata struct friends in printk_ringbuffer.h. Signed-off-by: Chris Down Reviewed-by: Petr Mladek Acked-by: Andy Shevchenko Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/3dd801982f02603e6e3aa4f8bc4f5ebb830a4949.1623775748.git.chris@chrisdown.name --- kernel/printk/internal.h | 6 ++++++ kernel/printk/printk.c | 43 ++++++++++++++++++++----------------------- 2 files changed, 26 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 51615c909b2f..1075e60fcd98 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -12,6 +12,12 @@ #define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 +/* Flags for a single printk record. */ +enum printk_info_flags { + LOG_NEWLINE = 2, /* text ended with a newline */ + LOG_CONT = 8, /* text is a fragment of a continuation line */ +}; + __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 142a58d124d9..39f1ec22a6a6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -350,11 +350,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * non-prinatable characters are escaped in the "\xff" notation. */ -enum log_flags { - LOG_NEWLINE = 2, /* text ended with a newline */ - LOG_CONT = 8, /* text is a fragment of a continuation line */ -}; - /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_RAW_SPINLOCK(syslog_lock); @@ -1965,19 +1960,20 @@ static inline u32 printk_caller_id(void) * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. - * @lflags: A pointer to the current log flags, will be updated. + * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * - * @lflags may be NULL if the caller is not interested in the parsed value. - * Otherwise the variable pointed to by @lflags will be OR'd with the parsed + * @flags may be NULL if the caller is not interested in the parsed value. + * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ -static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) +static u16 parse_prefix(char *text, int *level, + enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; @@ -1993,8 +1989,8 @@ static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ - if (lflags) - *lflags |= LOG_CONT; + if (flags) + *flags |= LOG_CONT; } prefix_len += 2; @@ -2004,8 +2000,9 @@ static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) return prefix_len; } -static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags, - const char *fmt, va_list args) +static u16 printk_sprint(char *text, u16 size, int facility, + enum printk_info_flags *flags, const char *fmt, + va_list args) { u16 text_len; @@ -2014,7 +2011,7 @@ static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lfl /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; - *lflags |= LOG_NEWLINE; + *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ @@ -2038,7 +2035,7 @@ int vprintk_store(int facility, int level, { const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; - enum log_flags lflags = 0; + enum printk_info_flags flags = 0; struct printk_record r; u16 trunc_msg_len = 0; char prefix_buf[8]; @@ -2070,22 +2067,22 @@ int vprintk_store(int facility, int level, /* Extract log level or control flags. */ if (facility == 0) - parse_prefix(&prefix_buf[0], &level, &lflags); + parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) - lflags |= LOG_NEWLINE; + flags |= LOG_NEWLINE; - if (lflags & LOG_CONT) { + if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, - facility, &lflags, fmt, args); + facility, &flags, fmt, args); r.info->text_len += text_len; - if (lflags & LOG_NEWLINE) { + if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { @@ -2112,20 +2109,20 @@ int vprintk_store(int facility, int level, } /* fill message */ - text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args); + text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; - r.info->flags = lflags & 0x1f; + r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ - if (!(lflags & LOG_NEWLINE)) + if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); -- cgit v1.2.3-71-gd317 From f3d75cf537db57f7918a17a75527951de850e5ec Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 15 Jun 2021 17:52:51 +0100 Subject: printk: Rework parse_prefix into printk_parse_prefix parse_prefix is needed externally by later patches, so move it into a context where it can be used as such. Also give it the printk_ prefix to reduce the chance of collisions. Signed-off-by: Chris Down Cc: Petr Mladek Reviewed-by: Petr Mladek Acked-by: Andy Shevchenko Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/b22ba314a860e5c7f887958f1eab2649f9bd1d06.1623775748.git.chris@chrisdown.name --- kernel/printk/internal.h | 2 ++ kernel/printk/printk.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 1075e60fcd98..1596e2837318 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -57,6 +57,8 @@ bool printk_percpu_data_ready(void); void defer_console_output(void); +u16 printk_parse_prefix(const char *text, int *level, + enum printk_info_flags *flags); #else /* diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 39f1ec22a6a6..03956c3eb745 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1956,7 +1956,7 @@ static inline u32 printk_caller_id(void) } /** - * parse_prefix - Parse level and control flags. + * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. @@ -1972,7 +1972,7 @@ static inline u32 printk_caller_id(void) * * Return: The length of the parsed level and control flags. */ -static u16 parse_prefix(char *text, int *level, +u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; @@ -2018,7 +2018,7 @@ static u16 printk_sprint(char *text, u16 size, int facility, if (facility == 0) { u16 prefix_len; - prefix_len = parse_prefix(text, NULL, NULL); + prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); @@ -2067,7 +2067,7 @@ int vprintk_store(int facility, int level, /* Extract log level or control flags. */ if (facility == 0) - parse_prefix(&prefix_buf[0], &level, &flags); + printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; -- cgit v1.2.3-71-gd317 From 337015573718b161891a3473d25f59273f2e626b Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 15 Jun 2021 17:52:53 +0100 Subject: printk: Userspace format indexing support We have a number of systems industry-wide that have a subset of their functionality that works as follows: 1. Receive a message from local kmsg, serial console, or netconsole; 2. Apply a set of rules to classify the message; 3. Do something based on this classification (like scheduling a remediation for the machine), rinse, and repeat. As a couple of examples of places we have this implemented just inside Facebook, although this isn't a Facebook-specific problem, we have this inside our netconsole processing (for alarm classification), and as part of our machine health checking. We use these messages to determine fairly important metrics around production health, and it's important that we get them right. While for some kinds of issues we have counters, tracepoints, or metrics with a stable interface which can reliably indicate the issue, in order to react to production issues quickly we need to work with the interface which most kernel developers naturally use when developing: printk. Most production issues come from unexpected phenomena, and as such usually the code in question doesn't have easily usable tracepoints or other counters available for the specific problem being mitigated. We have a number of lines of monitoring defence against problems in production (host metrics, process metrics, service metrics, etc), and where it's not feasible to reliably monitor at another level, this kind of pragmatic netconsole monitoring is essential. As one would expect, monitoring using printk is rather brittle for a number of reasons -- most notably that the message might disappear entirely in a new version of the kernel, or that the message may change in some way that the regex or other classification methods start to silently fail. One factor that makes this even harder is that, under normal operation, many of these messages are never expected to be hit. For example, there may be a rare hardware bug which one wants to detect if it was to ever happen again, but its recurrence is not likely or anticipated. This precludes using something like checking whether the printk in question was printed somewhere fleetwide recently to determine whether the message in question is still present or not, since we don't anticipate that it should be printed anywhere, but still need to monitor for its future presence in the long-term. This class of issue has happened on a number of occasions, causing unhealthy machines with hardware issues to remain in production for longer than ideal. As a recent example, some monitoring around blk_update_request fell out of date and caused semi-broken machines to remain in production for longer than would be desirable. Searching through the codebase to find the message is also extremely fragile, because many of the messages are further constructed beyond their callsite (eg. btrfs_printk and other module-specific wrappers, each with their own functionality). Even if they aren't, guessing the format and formulation of the underlying message based on the aesthetics of the message emitted is not a recipe for success at scale, and our previous issues with fleetwide machine health checking demonstrate as much. This provides a solution to the issue of silently changed or deleted printks: we record pointers to all printk format strings known at compile time into a new .printk_index section, both in vmlinux and modules. At runtime, this can then be iterated by looking at /printk/index/, which emits the following format, both readable by humans and able to be parsed by machines: $ head -1 vmlinux; shuf -n 5 vmlinux # filename:line function "format" <5> block/blk-settings.c:661 disk_stack_limits "%s: Warning: Device %s is misaligned\n" <4> kernel/trace/trace.c:8296 trace_create_file "Could not create tracefs '%s' entry\n" <6> arch/x86/kernel/hpet.c:144 _hpet_print_config "hpet: %s(%d):\n" <6> init/do_mounts.c:605 prepare_namespace "Waiting for root device %s...\n" <6> drivers/acpi/osl.c:1410 acpi_no_auto_serialize_setup "ACPI: auto-serialization disabled\n" This mitigates the majority of cases where we have a highly-specific printk which we want to match on, as we can now enumerate and check whether the format changed or the printk callsite disappeared entirely in userspace. This allows us to catch changes to printks we monitor earlier and decide what to do about it before it becomes problematic. There is no additional runtime cost for printk callers or printk itself, and the assembly generated is exactly the same. Signed-off-by: Chris Down Cc: Petr Mladek Cc: Jessica Yu Cc: Sergey Senozhatsky Cc: John Ogness Cc: Steven Rostedt Cc: Greg Kroah-Hartman Cc: Johannes Weiner Cc: Kees Cook Reviewed-by: Petr Mladek Tested-by: Petr Mladek Reported-by: kernel test robot Acked-by: Andy Shevchenko Acked-by: Jessica Yu # for module.{c,h} Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/e42070983637ac5e384f17fbdbe86d19c7b212a5.1623775748.git.chris@chrisdown.name --- MAINTAINERS | 5 + arch/arm/kernel/entry-v7m.S | 2 +- arch/arm/lib/backtrace-clang.S | 2 +- arch/arm/lib/backtrace.S | 2 +- arch/arm/mach-rpc/io-acorn.S | 2 +- arch/arm/vfp/vfphw.S | 6 +- arch/ia64/include/uapi/asm/cmpxchg.h | 4 +- arch/openrisc/kernel/entry.S | 6 +- arch/powerpc/kernel/head_fsl_booke.S | 2 +- arch/um/include/shared/user.h | 3 +- arch/x86/kernel/head_32.S | 2 +- include/asm-generic/vmlinux.lds.h | 13 +++ include/linux/module.h | 5 + include/linux/printk.h | 95 ++++++++++++++++- init/Kconfig | 14 +++ kernel/module.c | 5 + kernel/printk/Makefile | 1 + kernel/printk/index.c | 195 +++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 13 ++- 19 files changed, 353 insertions(+), 24 deletions(-) create mode 100644 kernel/printk/index.c (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index a61f4f3b78a9..a19a104e0cc4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14918,6 +14918,11 @@ S: Maintained F: include/linux/printk.h F: kernel/printk/ +PRINTK INDEXING +R: Chris Down +S: Maintained +F: kernel/printk/index.c + PRISM54 WIRELESS DRIVER M: Luis Chamberlain L: linux-wireless@vger.kernel.org diff --git a/arch/arm/kernel/entry-v7m.S b/arch/arm/kernel/entry-v7m.S index d0e898608d30..7bde93c10962 100644 --- a/arch/arm/kernel/entry-v7m.S +++ b/arch/arm/kernel/entry-v7m.S @@ -23,7 +23,7 @@ __invalid_entry: adr r0, strerr mrs r1, ipsr mov r2, lr - bl printk + bl _printk #endif mov r0, sp bl show_regs diff --git a/arch/arm/lib/backtrace-clang.S b/arch/arm/lib/backtrace-clang.S index 6174c45f53a5..5b2cdb1003e3 100644 --- a/arch/arm/lib/backtrace-clang.S +++ b/arch/arm/lib/backtrace-clang.S @@ -202,7 +202,7 @@ finished_setup: 1006: adr r0, .Lbad mov r1, loglvl mov r2, frame - bl printk + bl _printk no_frame: ldmfd sp!, {r4 - r9, fp, pc} ENDPROC(c_backtrace) .pushsection __ex_table,"a" diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S index 872f658638d9..e8408f22d4dc 100644 --- a/arch/arm/lib/backtrace.S +++ b/arch/arm/lib/backtrace.S @@ -103,7 +103,7 @@ for_each_frame: tst frame, mask @ Check for address exceptions 1006: adr r0, .Lbad mov r1, loglvl mov r2, frame - bl printk + bl _printk no_frame: ldmfd sp!, {r4 - r9, pc} ENDPROC(c_backtrace) diff --git a/arch/arm/mach-rpc/io-acorn.S b/arch/arm/mach-rpc/io-acorn.S index b9082a2a2a01..aa9bf0d771c0 100644 --- a/arch/arm/mach-rpc/io-acorn.S +++ b/arch/arm/mach-rpc/io-acorn.S @@ -25,4 +25,4 @@ ENTRY(insl) ENTRY(outsl) adr r0, .Liosl_warning mov r1, lr - b printk + b _printk diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S index d5837bf05a9a..6f7926c9c179 100644 --- a/arch/arm/vfp/vfphw.S +++ b/arch/arm/vfp/vfphw.S @@ -23,7 +23,7 @@ #ifdef DEBUG stmfd sp!, {r0-r3, ip, lr} ldr r0, =1f - bl printk + bl _printk ldmfd sp!, {r0-r3, ip, lr} .pushsection .rodata, "a" @@ -38,7 +38,7 @@ stmfd sp!, {r0-r3, ip, lr} mov r1, \arg ldr r0, =1f - bl printk + bl _printk ldmfd sp!, {r0-r3, ip, lr} .pushsection .rodata, "a" @@ -55,7 +55,7 @@ mov r2, \arg2 mov r1, \arg1 ldr r0, =1f - bl printk + bl _printk ldmfd sp!, {r0-r3, ip, lr} .pushsection .rodata, "a" diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h index 926c6cb1e029..2c2f3cfeaa77 100644 --- a/arch/ia64/include/uapi/asm/cmpxchg.h +++ b/arch/ia64/include/uapi/asm/cmpxchg.h @@ -143,9 +143,9 @@ extern long ia64_cmpxchg_called_with_bad_pointer(void); do { \ if (_cmpxchg_bugcheck_count-- <= 0) { \ void *ip; \ - extern int printk(const char *fmt, ...); \ + extern int _printk(const char *fmt, ...); \ ip = (void *) ia64_getreg(_IA64_REG_IP); \ - printk("CMPXCHG_BUGCHECK: stuck at %p on word %p\n", ip, (v));\ + _printk("CMPXCHG_BUGCHECK: stuck at %p on word %p\n", ip, (v));\ break; \ } \ } while (0) diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index bc657e55c15f..947613f61d4a 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -551,7 +551,7 @@ EXCEPTION_ENTRY(_external_irq_handler) l.movhi r3,hi(42f) l.ori r3,r3,lo(42f) l.sw 0x0(r1),r3 - l.jal printk + l.jal _printk l.sw 0x4(r1),r4 l.addi r1,r1,0x8 @@ -681,8 +681,8 @@ _syscall_debug: l.sw -4(r1),r27 l.sw -8(r1),r11 l.addi r1,r1,-8 - l.movhi r27,hi(printk) - l.ori r27,r27,lo(printk) + l.movhi r27,hi(_printk) + l.ori r27,r27,lo(_printk) l.jalr r27 l.nop l.addi r1,r1,8 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 0f9642f36b49..9a2f4265e6d2 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -858,7 +858,7 @@ KernelSPE: ori r3,r3,87f@l mr r4,r2 /* current */ lwz r5,_NIP(r1) - bl printk + bl _printk #endif b interrupt_return #ifdef CONFIG_PRINTK diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index e793e4212f0a..dd4badffdeb3 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -38,7 +38,8 @@ extern void panic(const char *fmt, ...) #define UM_KERN_CONT KERN_CONT #ifdef UML_CONFIG_PRINTK -extern int printk(const char *fmt, ...) +#define printk(...) _printk(__VA_ARGS__) +extern int _printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); #else static inline int printk(const char *fmt, ...) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 67f590425d90..d8c64dab0efe 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -432,7 +432,7 @@ SYM_FUNC_START(early_ignore_irq) pushl 32(%esp) pushl 40(%esp) pushl $int_msg - call printk + call _printk call dump_stack diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 17325416e2de..ddb2ff158321 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -483,6 +483,8 @@ \ TRACEDATA \ \ + PRINTK_INDEX \ + \ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ __start___ksymtab = .; \ @@ -893,6 +895,17 @@ #define TRACEDATA #endif +#ifdef CONFIG_PRINTK_INDEX +#define PRINTK_INDEX \ + .printk_index : AT(ADDR(.printk_index) - LOAD_OFFSET) { \ + __start_printk_index = .; \ + *(.printk_index) \ + __stop_printk_index = .; \ + } +#else +#define PRINTK_INDEX +#endif + #define NOTES \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ __start_notes = .; \ diff --git a/include/linux/module.h b/include/linux/module.h index 8a298d820dbc..c9f1200b2312 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -511,6 +511,11 @@ struct module { struct klp_modinfo *klp_info; #endif +#ifdef CONFIG_PRINTK_INDEX + unsigned int printk_index_size; + struct pi_entry **printk_index_start; +#endif + #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head source_list; diff --git a/include/linux/printk.h b/include/linux/printk.h index e834d78f0478..2651b82ed352 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -174,12 +174,12 @@ asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold -int printk(const char *fmt, ...); +int _printk(const char *fmt, ...); /* * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ -__printf(1, 2) __cold int printk_deferred(const char *fmt, ...); +__printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -218,12 +218,12 @@ int vprintk(const char *s, va_list args) return 0; } static inline __printf(1, 2) __cold -int printk(const char *s, ...) +int _printk(const char *s, ...) { return 0; } static inline __printf(1, 2) __cold -int printk_deferred(const char *s, ...) +int _printk_deferred(const char *s, ...) { return 0; } @@ -348,6 +348,93 @@ extern int kptr_restrict; #define pr_fmt(fmt) fmt #endif +struct module; + +#ifdef CONFIG_PRINTK_INDEX +struct pi_entry { + const char *fmt; + const char *func; + const char *file; + unsigned int line; + + /* + * While printk and pr_* have the level stored in the string at compile + * time, some subsystems dynamically add it at runtime through the + * format string. For these dynamic cases, we allow the subsystem to + * tell us the level at compile time. + * + * NULL indicates that the level, if any, is stored in fmt. + */ + const char *level; + + /* + * The format string used by various subsystem specific printk() + * wrappers to prefix the message. + * + * Note that the static prefix defined by the pr_fmt() macro is stored + * directly in the message format (@fmt), not here. + */ + const char *subsys_fmt_prefix; +} __packed; + +#define __printk_index_emit(_fmt, _level, _subsys_fmt_prefix) \ + do { \ + if (__builtin_constant_p(_fmt) && __builtin_constant_p(_level)) { \ + /* + * We check __builtin_constant_p multiple times here + * for the same input because GCC will produce an error + * if we try to assign a static variable to fmt if it + * is not a constant, even with the outer if statement. + */ \ + static const struct pi_entry _entry \ + __used = { \ + .fmt = __builtin_constant_p(_fmt) ? (_fmt) : NULL, \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + .level = __builtin_constant_p(_level) ? (_level) : NULL, \ + .subsys_fmt_prefix = _subsys_fmt_prefix,\ + }; \ + static const struct pi_entry *_entry_ptr \ + __used __section(".printk_index") = &_entry; \ + } \ + } while (0) + +#else /* !CONFIG_PRINTK_INDEX */ +#define __printk_index_emit(...) do {} while (0) +#endif /* CONFIG_PRINTK_INDEX */ + +/* + * Some subsystems have their own custom printk that applies a va_format to a + * generic format, for example, to include a device number or other metadata + * alongside the format supplied by the caller. + * + * In order to store these in the way they would be emitted by the printk + * infrastructure, the subsystem provides us with the start, fixed string, and + * any subsequent text in the format string. + * + * We take a variable argument list as pr_fmt/dev_fmt/etc are sometimes passed + * as multiple arguments (eg: `"%s: ", "blah"`), and we must only take the + * first one. + * + * subsys_fmt_prefix must be known at compile time, or compilation will fail + * (since this is a mistake). If fmt or level is not known at compile time, no + * index entry will be made (since this can legitimately happen). + */ +#define printk_index_subsys_emit(subsys_fmt_prefix, level, fmt, ...) \ + __printk_index_emit(fmt, level, subsys_fmt_prefix) + +#define printk_index_wrap(_p_func, _fmt, ...) \ + ({ \ + __printk_index_emit(_fmt, NULL, NULL); \ + _p_func(_fmt, ##__VA_ARGS__); \ + }) + + +#define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__) +#define printk_deferred(fmt, ...) \ + printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__) + /** * pr_emerg - Print an emergency-level message * @fmt: format string diff --git a/init/Kconfig b/init/Kconfig index bb0d6e6262b1..ccffa7ae5ccc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -775,6 +775,20 @@ config PRINTK_SAFE_LOG_BUF_SHIFT 13 => 8 KB for each CPU 12 => 4 KB for each CPU +config PRINTK_INDEX + bool "Printk indexing debugfs interface" + depends on PRINTK && DEBUG_FS + help + Add support for indexing of all printk formats known at compile time + at /printk/index/. + + This can be used as part of maintaining daemons which monitor + /dev/kmsg, as it permits auditing the printk formats present in a + kernel, allowing detection of cases where monitored printks are + changed or no longer present. + + There is no additional runtime cost to printk with this enabled. + # # Architectures with an unreliable sched_clock() should select this: # diff --git a/kernel/module.c b/kernel/module.c index ed13917ea5f3..40ec9a030eec 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3355,6 +3355,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(unsigned long), &mod->num_kprobe_blacklist); #endif +#ifdef CONFIG_PRINTK_INDEX + mod->printk_index_start = section_objs(info, ".printk_index", + sizeof(*mod->printk_index_start), + &mod->printk_index_size); +#endif #ifdef CONFIG_HAVE_STATIC_CALL_INLINE mod->static_call_sites = section_objs(info, ".static_call_sites", sizeof(*mod->static_call_sites), diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index eee3dc9b60a9..d118739874c0 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -3,3 +3,4 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK) += printk_ringbuffer.o +obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/index.c b/kernel/printk/index.c new file mode 100644 index 000000000000..ca062f5e1779 --- /dev/null +++ b/kernel/printk/index.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Userspace indexing of printk formats + */ + +#include +#include +#include +#include +#include + +#include "internal.h" + +extern struct pi_entry *__start_printk_index[]; +extern struct pi_entry *__stop_printk_index[]; + +/* The base dir for module formats, typically debugfs/printk/index/ */ +static struct dentry *dfs_index; + +static struct pi_entry *pi_get_entry(const struct module *mod, loff_t pos) +{ + struct pi_entry **entries; + unsigned int nr_entries; + +#ifdef CONFIG_MODULES + if (mod) { + entries = mod->printk_index_start; + nr_entries = mod->printk_index_size; + } +#endif + + if (!mod) { + /* vmlinux, comes from linker symbols */ + entries = __start_printk_index; + nr_entries = __stop_printk_index - __start_printk_index; + } + + if (pos >= nr_entries) + return NULL; + + return entries[pos]; +} + +static void *pi_next(struct seq_file *s, void *v, loff_t *pos) +{ + const struct module *mod = s->file->f_inode->i_private; + struct pi_entry *entry = pi_get_entry(mod, *pos); + + (*pos)++; + + return entry; +} + +static void *pi_start(struct seq_file *s, loff_t *pos) +{ + /* + * Make show() print the header line. Do not update *pos because + * pi_next() still has to return the entry at index 0 later. + */ + if (*pos == 0) + return SEQ_START_TOKEN; + + return pi_next(s, NULL, pos); +} + +/* + * We need both ESCAPE_ANY and explicit characters from ESCAPE_SPECIAL in @only + * because otherwise ESCAPE_NAP will cause double quotes and backslashes to be + * ignored for quoting. + */ +#define seq_escape_printf_format(s, src) \ + seq_escape_str(s, src, ESCAPE_ANY | ESCAPE_NAP | ESCAPE_APPEND, "\"\\") + +static int pi_show(struct seq_file *s, void *v) +{ + const struct pi_entry *entry = v; + int level = LOGLEVEL_DEFAULT; + enum printk_info_flags flags = 0; + u16 prefix_len = 0; + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "# filename:line function \"format\"\n"); + return 0; + } + + if (!entry->fmt) + return 0; + + if (entry->level) + printk_parse_prefix(entry->level, &level, &flags); + else + prefix_len = printk_parse_prefix(entry->fmt, &level, &flags); + + + if (flags & LOG_CONT) { + /* + * LOGLEVEL_DEFAULT here means "use the same level as the + * message we're continuing from", not the default message + * loglevel, so don't display it as such. + */ + if (level == LOGLEVEL_DEFAULT) + seq_puts(s, ""); + else + seq_printf(s, "<%d,c>", level); + } else + seq_printf(s, "<%d>", level); + + seq_printf(s, " %s:%d %s \"", entry->file, entry->line, entry->func); + if (entry->subsys_fmt_prefix) + seq_escape_printf_format(s, entry->subsys_fmt_prefix); + seq_escape_printf_format(s, entry->fmt + prefix_len); + seq_puts(s, "\"\n"); + + return 0; +} + +static void pi_stop(struct seq_file *p, void *v) { } + +static const struct seq_operations dfs_index_sops = { + .start = pi_start, + .next = pi_next, + .show = pi_show, + .stop = pi_stop, +}; + +DEFINE_SEQ_ATTRIBUTE(dfs_index); + +#ifdef CONFIG_MODULES +static const char *pi_get_module_name(struct module *mod) +{ + return mod ? mod->name : "vmlinux"; +} +#else +static const char *pi_get_module_name(struct module *mod) +{ + return "vmlinux"; +} +#endif + +void pi_create_file(struct module *mod) +{ + debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index, + mod, &dfs_index_fops); +} + +void pi_remove_file(struct module *mod) +{ + debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index)); +} + +#ifdef CONFIG_MODULES +static int pi_module_notify(struct notifier_block *nb, unsigned long op, + void *data) +{ + struct module *mod = data; + + switch (op) { + case MODULE_STATE_COMING: + pi_create_file(mod); + break; + case MODULE_STATE_GOING: + pi_remove_file(mod); + break; + default: /* we don't care about other module states */ + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block module_printk_fmts_nb = { + .notifier_call = pi_module_notify, +}; + +static void __init pi_setup_module_notifier(void) +{ + register_module_notifier(&module_printk_fmts_nb); +} +#else +static inline void __init pi_setup_module_notifier(void) { } +#endif + +static int __init pi_init(void) +{ + struct dentry *dfs_root = debugfs_create_dir("printk", NULL); + + dfs_index = debugfs_create_dir("index", dfs_root); + pi_setup_module_notifier(); + pi_create_file(NULL); + + return 0; +} + +/* debugfs comes up on core and must be initialised first */ +postcore_initcall(pi_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 03956c3eb745..765f7af6ce56 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2184,10 +2184,13 @@ int vprintk_default(const char *fmt, va_list args) EXPORT_SYMBOL_GPL(vprintk_default); /** - * printk - print a kernel message + * _printk - print a kernel message * @fmt: format string * - * This is printk(). It can be called from any context. We want it to work. + * This is _printk(). It can be called from any context. We want it to work. + * + * If printk indexing is enabled, _printk() is called from printk_index_wrap. + * Otherwise, printk is simply #defined to _printk. * * We try to grab the console_lock. If we succeed, it's easy - we log the * output and call the console drivers. If we fail to get the semaphore, we @@ -2204,7 +2207,7 @@ EXPORT_SYMBOL_GPL(vprintk_default); * * See the vsnprintf() documentation for format string extensions over C99. */ -asmlinkage __visible int printk(const char *fmt, ...) +asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; @@ -2215,7 +2218,7 @@ asmlinkage __visible int printk(const char *fmt, ...) return r; } -EXPORT_SYMBOL(printk); +EXPORT_SYMBOL(_printk); #else /* CONFIG_PRINTK */ @@ -3200,7 +3203,7 @@ int vprintk_deferred(const char *fmt, va_list args) return r; } -int printk_deferred(const char *fmt, ...) +int _printk_deferred(const char *fmt, ...) { va_list args; int r; -- cgit v1.2.3-71-gd317 From d97e99386ad0dcae08cb0f0c70efa806a2d4811c Mon Sep 17 00:00:00 2001 From: MaYuming Date: Sun, 11 Jul 2021 10:57:47 +0800 Subject: audit: add header protection to kernel/audit.h Protect kernel/audit.h against multiple #include's. Signed-off-by: MaYuming [PM: rewrite subj/description] Signed-off-by: Paul Moore --- kernel/audit.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index b565ea16c0a5..d6a2c899a8db 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -6,6 +6,9 @@ * Copyright 2005 IBM Corporation */ +#ifndef _KERNEL_AUDIT_H_ +#define _KERNEL_AUDIT_H_ + #include #include #include @@ -331,3 +334,5 @@ extern int audit_filter(int msgtype, unsigned int listtype); extern void audit_ctl_lock(void); extern void audit_ctl_unlock(void); + +#endif -- cgit v1.2.3-71-gd317 From dfcb27540213e8061ecffacd4bd8ed54a310a7b0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2021 02:09:28 +0200 Subject: rcu/nocb: Start moving nocb code to its own plugin file The kernel/rcu/tree_plugin.h file contains not only the plugins for preemptible RCU, but also many other features including rcu_nocbs callback offloading. This offloading has become large and complex, so it is time to put it in its own file. This commit starts that process. Suggested-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker [ paulmck: Rename to tree_nocb.h, add Frederic as author. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree_nocb.h | 1496 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 1487 --------------------------------------------- 3 files changed, 1497 insertions(+), 1487 deletions(-) create mode 100644 kernel/rcu/tree_nocb.h (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51f24ecd94b2..b6eb480402e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4784,4 +4784,5 @@ void __init rcu_init(void) #include "tree_stall.h" #include "tree_exp.h" +#include "tree_nocb.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h new file mode 100644 index 000000000000..8fdf44f8523f --- /dev/null +++ b/kernel/rcu/tree_nocb.h @@ -0,0 +1,1496 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptible semantics. + * + * Copyright Red Hat, 2009 + * Copyright IBM Corporation, 2009 + * Copyright SUSE, 2021 + * + * Author: Ingo Molnar + * Paul E. McKenney + * Frederic Weisbecker + */ + +#ifdef CONFIG_RCU_NOCB_CPU +static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ +static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return lockdep_is_held(&rdp->nocb_lock); +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + /* Race on early boot between thread creation and assignment */ + if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) + return true; + + if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) + if (in_task()) + return true; + return false; +} + +/* + * Offload callback processing from the boot-time-specified set of CPUs + * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks. These kthreads + * are organized into GP kthreads, which manage incoming callbacks, wait for + * grace periods, and awaken CB kthreads, and the CB kthreads, which only + * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs + * do a wake_up() on their GP kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU. (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.) + * + * This is intended to be used in conjunction with Frederic Weisbecker's + * adaptive-idle work, which would seriously reduce OS jitter on CPUs + * running CPU-bound user-mode computations. + * + * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode. + */ + + +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * If the list is invalid, a warning is emitted and all CPUs are offloaded. + */ +static int __init rcu_nocb_setup(char *str) +{ + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } + return 1; +} +__setup("rcu_nocbs=", rcu_nocb_setup); + +static int __init parse_rcu_nocb_poll(char *arg) +{ + rcu_nocb_poll = true; + return 0; +} +early_param("rcu_nocb_poll", parse_rcu_nocb_poll); + +/* + * Don't bother bypassing ->cblist if the call_rcu() rate is low. + * After all, the main point of bypassing is to avoid lock contention + * on ->nocb_lock, which only can happen at high call_rcu() rates. + */ +static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +module_param(nocb_nobypass_lim_per_jiffy, int, 0); + +/* + * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the + * lock isn't immediately available, increment ->nocb_lock_contended to + * flag the contention. + */ +static void rcu_nocb_bypass_lock(struct rcu_data *rdp) + __acquires(&rdp->nocb_bypass_lock) +{ + lockdep_assert_irqs_disabled(); + if (raw_spin_trylock(&rdp->nocb_bypass_lock)) + return; + atomic_inc(&rdp->nocb_lock_contended); + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + smp_mb__after_atomic(); /* atomic_inc() before lock. */ + raw_spin_lock(&rdp->nocb_bypass_lock); + smp_mb__before_atomic(); /* atomic_dec() after lock. */ + atomic_dec(&rdp->nocb_lock_contended); +} + +/* + * Spinwait until the specified rcu_data structure's ->nocb_lock is + * not contended. Please note that this is extremely special-purpose, + * relying on the fact that at most two kthreads and one CPU contend for + * this lock, and also that the two kthreads are guaranteed to have frequent + * grace-period-duration time intervals between successive acquisitions + * of the lock. This allows us to use an extremely simple throttling + * mechanism, and further to apply it only to the CPU doing floods of + * call_rcu() invocations. Don't try this at home! + */ +static void rcu_nocb_wait_contended(struct rcu_data *rdp) +{ + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) + cpu_relax(); +} + +/* + * Conditionally acquire the specified rcu_data structure's + * ->nocb_bypass_lock. + */ +static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + return raw_spin_trylock(&rdp->nocb_bypass_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_bypass_lock. + */ +static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) + __releases(&rdp->nocb_bypass_lock) +{ + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_bypass_lock); +} + +/* + * Acquire the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (!rcu_rdp_is_offloaded(rdp)) + return; + raw_spin_lock(&rdp->nocb_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ + if (rcu_rdp_is_offloaded(rdp)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_lock); + } +} + +/* + * Release the specified rcu_data structure's ->nocb_lock and restore + * interrupts, but only if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + if (rcu_rdp_is_offloaded(rdp)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + } else { + local_irq_restore(flags); + } +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (rcu_rdp_is_offloaded(rdp)) + lockdep_assert_held(&rdp->nocb_lock); +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) +{ + swake_up_all(sq); +} + +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); +} + +/* Is the specified CPU a no-CBs CPU? */ +bool rcu_is_nocb_cpu(int cpu) +{ + if (cpumask_available(rcu_nocb_mask)) + return cpumask_test_cpu(cpu, rcu_nocb_mask); + return false; +} + +static bool __wake_nocb_gp(struct rcu_data *rdp_gp, + struct rcu_data *rdp, + bool force, unsigned long flags) + __releases(rdp_gp->nocb_gp_lock) +{ + bool needwake = false; + + if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("AlreadyAwake")); + return false; + } + + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&rdp_gp->nocb_timer); + } + + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { + WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); + needwake = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + if (needwake) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); + wake_up_process(rdp_gp->nocb_gp_kthread); + } + + return needwake; +} + +/* + * Kick the GP kthread for this NOCB group. + */ +static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + return __wake_nocb_gp(rdp_gp, rdp, force, flags); +} + +/* + * Arrange to wake the GP kthread for this NOCB group at some future + * time when it is safe to do so. + */ +static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + + /* + * Bypass wakeup overrides previous deferments. In case + * of callback storm, no need to wake up too early. + */ + if (waketype == RCU_NOCB_WAKE_BYPASS) { + mod_timer(&rdp_gp->nocb_timer, jiffies + 2); + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } else { + if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) + mod_timer(&rdp_gp->nocb_timer, jiffies + 1); + if (rdp_gp->nocb_defer_wakeup < waketype) + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } + + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + struct rcu_cblist rcl; + + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); + rcu_lockdep_assert_cblist_protected(rdp); + lockdep_assert_held(&rdp->nocb_bypass_lock); + if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { + raw_spin_unlock(&rdp->nocb_bypass_lock); + return false; + } + /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ + if (rhp) + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); + WRITE_ONCE(rdp->nocb_bypass_first, j); + rcu_nocb_bypass_unlock(rdp); + return true; +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + if (!rcu_rdp_is_offloaded(rdp)) + return true; + rcu_lockdep_assert_cblist_protected(rdp); + rcu_nocb_bypass_lock(rdp); + return rcu_nocb_do_flush_bypass(rdp, rhp, j); +} + +/* + * If the ->nocb_bypass_lock is immediately available, flush the + * ->nocb_bypass queue into ->cblist. + */ +static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) +{ + rcu_lockdep_assert_cblist_protected(rdp); + if (!rcu_rdp_is_offloaded(rdp) || + !rcu_nocb_bypass_trylock(rdp)) + return; + WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); +} + +/* + * See whether it is appropriate to use the ->nocb_bypass list in order + * to control contention on ->nocb_lock. A limited number of direct + * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass + * is non-empty, further callbacks must be placed into ->nocb_bypass, + * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch + * back to direct use of ->cblist. However, ->nocb_bypass should not be + * used if ->cblist is empty, because otherwise callbacks can be stranded + * on ->nocb_bypass because we cannot count on the current CPU ever again + * invoking call_rcu(). The general rule is that if ->nocb_bypass is + * non-empty, the corresponding no-CBs grace-period kthread must not be + * in an indefinite sleep state. + * + * Finally, it is not permitted to use the bypass during early boot, + * as doing so would confuse the auto-initialization code. Besides + * which, there is no point in worrying about lock contention while + * there is only one CPU in operation. + */ +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) +{ + unsigned long c; + unsigned long cur_gp_seq; + unsigned long j = jiffies; + long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + + lockdep_assert_irqs_disabled(); + + // Pure softirq/rcuc based processing: no bypassing, no + // locking. + if (!rcu_rdp_is_offloaded(rdp)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // In the process of (de-)offloading: no bypassing, but + // locking. + if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; /* Not offloaded, no bypassing. */ + } + + // Don't use ->nocb_bypass during early boot. + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + rcu_nocb_lock(rdp); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // If we have advanced to a new jiffy, reset counts to allow + // moving back from ->nocb_bypass to ->cblist. + if (j == rdp->nocb_nobypass_last) { + c = rdp->nocb_nobypass_count + 1; + } else { + WRITE_ONCE(rdp->nocb_nobypass_last, j); + c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; + if (ULONG_CMP_LT(rdp->nocb_nobypass_count, + nocb_nobypass_lim_per_jiffy)) + c = 0; + else if (c > nocb_nobypass_lim_per_jiffy) + c = nocb_nobypass_lim_per_jiffy; + } + WRITE_ONCE(rdp->nocb_nobypass_count, c); + + // If there hasn't yet been all that many ->cblist enqueues + // this jiffy, tell the caller to enqueue onto ->cblist. But flush + // ->nocb_bypass first. + if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + + // If ->nocb_bypass has been used too long or is too full, + // flush ->nocb_bypass to ->cblist. + if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || + ncbs >= qhimark) { + rcu_nocb_lock(rdp); + if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + return true; // Callback already enqueued. + } + + // We need to use the bypass. + rcu_nocb_wait_contended(rdp); + rcu_nocb_bypass_lock(rdp); + ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); + if (!ncbs) { + WRITE_ONCE(rdp->nocb_bypass_first, j); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); + } + rcu_nocb_bypass_unlock(rdp); + smp_mb(); /* Order enqueue before wake. */ + if (ncbs) { + local_irq_restore(flags); + } else { + // No-CBs GP kthread might be indefinitely asleep, if so, wake. + rcu_nocb_lock(rdp); // Rare during call_rcu() flood. + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQwake")); + __call_rcu_nocb_wake(rdp, true, flags); + } else { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQnoWake")); + rcu_nocb_unlock_irqrestore(rdp, flags); + } + } + return true; // Callback already enqueued. +} + +/* + * Awaken the no-CBs grace-period kthread if needed, either due to it + * legitimately being asleep or due to overload conditions. + * + * If warranted, also wake up the kthread servicing this CPUs queues. + */ +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, + unsigned long flags) + __releases(rdp->nocb_lock) +{ + unsigned long cur_gp_seq; + unsigned long j; + long len; + struct task_struct *t; + + // If we are being polled or there is no kthread, just leave. + t = READ_ONCE(rdp->nocb_gp_kthread); + if (rcu_nocb_poll || !t) { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("WakeNotPoll")); + return; + } + // Need to actually to a wakeup. + len = rcu_segcblist_n_cbs(&rdp->cblist); + if (was_alldone) { + rdp->qlen_last_fqs_check = len; + if (!irqs_disabled_flags(flags)) { + /* ... if queue was empty ... */ + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp(rdp, false); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("WakeEmpty")); + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); + } + } else if (len > rdp->qlen_last_fqs_check + qhimark) { + /* ... or if many callbacks queued. */ + rdp->qlen_last_fqs_check = len; + j = jiffies; + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + smp_mb(); /* Enqueue before timer_pending(). */ + if ((rdp->nocb_cb_sleep || + !rcu_segcblist_ready_cbs(&rdp->cblist)) && + !timer_pending(&rdp->nocb_timer)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } + return; +} + +/* + * Check if we ignore this rdp. + * + * We check that without holding the nocb lock but + * we make sure not to miss a freshly offloaded rdp + * with the current ordering: + * + * rdp_offload_toggle() nocb_gp_enabled_cb() + * ------------------------- ---------------------------- + * WRITE flags LOCK nocb_gp_lock + * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep + * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock + * UNLOCK nocb_gp_lock READ flags + */ +static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; + + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, + bool *needwake_state) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + } + return false; + } + + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + return true; +} + + +/* + * No-CBs GP kthreads come here to wait for additional callbacks to show up + * or for grace periods to end. + */ +static void nocb_gp_wait(struct rcu_data *my_rdp) +{ + bool bypass = false; + long bypass_ncbs; + int __maybe_unused cpu = my_rdp->cpu; + unsigned long cur_gp_seq; + unsigned long flags; + bool gotcbs = false; + unsigned long j = jiffies; + bool needwait_gp = false; // This prevents actual uninitialized use. + bool needwake; + bool needwake_gp; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. + bool wasempty = false; + + /* + * Each pass through the following loop checks for CBs and for the + * nearest grace period (if any) to wait for next. The CB kthreads + * and the global grace-period kthread are awakened if needed. + */ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + bool needwake_state = false; + + if (!nocb_gp_enabled_cb(rdp)) + continue; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); + rcu_nocb_lock_irqsave(rdp, flags); + if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; + } + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + if (bypass_ncbs && + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || + bypass_ncbs > 2 * qhimark)) { + // Bypass full or old, so flush it. + (void)rcu_nocb_try_flush_bypass(rdp, j); + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; /* No callbacks here, try next. */ + } + if (bypass_ncbs) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("Bypass")); + bypass = true; + } + rnp = rdp->mynode; + + // Advance callbacks if helpful and low contention. + needwake_gp = false; + if (!rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL) || + (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake_gp = rcu_advance_cbs(rnp, rdp); + wasempty = rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL); + raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ + } + // Need to wait on some grace period? + WARN_ON_ONCE(wasempty && + !rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL)); + if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { + if (!needwait_gp || + ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) + wait_gp_seq = cur_gp_seq; + needwait_gp = true; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("NeedWaitGP")); + } + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { + needwake = rdp->nocb_cb_sleep; + WRITE_ONCE(rdp->nocb_cb_sleep, false); + smp_mb(); /* CB invocation -after- GP end. */ + } else { + needwake = false; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake) { + swake_up_one(&rdp->nocb_cb_wq); + gotcbs = true; + } + if (needwake_gp) + rcu_gp_kthread_wake(); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + } + + my_rdp->nocb_gp_bypass = bypass; + my_rdp->nocb_gp_gp = needwait_gp; + my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; + + if (bypass && !rcu_nocb_poll) { + // At least one child with non-empty ->nocb_bypass, so set + // timer in order to avoid stranding its callbacks. + wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, + TPS("WakeBypassIsDeferred")); + } + if (rcu_nocb_poll) { + /* Polling, so trace if first poll in the series. */ + if (gotcbs) + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); + schedule_timeout_idle(1); + } else if (!needwait_gp) { + /* Wait for callbacks to appear. */ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); + } else { + rnp = my_rdp->mynode; + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); + swait_event_interruptible_exclusive( + rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], + rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); + } + if (!rcu_nocb_poll) { + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + } + WRITE_ONCE(my_rdp->nocb_gp_sleep, true); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } + my_rdp->nocb_gp_seq = -1; + WARN_ON(signal_pending(current)); +} + +/* + * No-CBs grace-period-wait kthread. There is one of these per group + * of CPUs, but only once at least one CPU in that group has come online + * at least once since boot. This kthread checks for newly posted + * callbacks from any of the CPUs it is responsible for, waits for a + * grace period, then awakens all of the rcu_nocb_cb_kthread() instances + * that then have callback-invocation work to do. + */ +static int rcu_nocb_gp_kthread(void *arg) +{ + struct rcu_data *rdp = arg; + + for (;;) { + WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); + nocb_gp_wait(rdp); + cond_resched_tasks_rcu_qs(); + } + return 0; +} + +static inline bool nocb_cb_can_run(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) +{ + return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); +} + +/* + * Invoke any ready callbacks from the corresponding no-CBs CPU, + * then, if there are no more, wait for more to appear. + */ +static void nocb_cb_wait(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long cur_gp_seq; + unsigned long flags; + bool needwake_state = false; + bool needwake_gp = false; + bool can_sleep = true; + struct rcu_node *rnp = rdp->mynode; + + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + /* + * Disable BH to provide the expected environment. Also, when + * transitioning to/from NOCB mode, a self-requeuing callback might + * be invoked from softirq. A short grace period could cause both + * instances of this callback would execute concurrently. + */ + local_bh_disable(); + rcu_do_batch(rdp); + local_bh_enable(); + lockdep_assert_irqs_enabled(); + rcu_nocb_lock_irqsave(rdp, flags); + if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && + raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ + needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + } + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + if (rcu_segcblist_ready_cbs(cblist)) + can_sleep = false; + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We won't touch the callbacks and keep sleeping until we ever + * get re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + + WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); + + if (rdp->nocb_cb_sleep) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); + + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_gp) + rcu_gp_kthread_wake(); + + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + + // VVV Ensure CB invocation follows _sleep test. + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); +} + +/* + * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke + * nocb_cb_wait() to do the dirty work. + */ +static int rcu_nocb_cb_kthread(void *arg) +{ + struct rcu_data *rdp = arg; + + // Each pass through this loop does one callback batch, and, + // if there are no more ready callbacks, waits for them. + for (;;) { + nocb_cb_wait(rdp); + cond_resched_tasks_rcu_qs(); + } + return 0; +} + +/* Is a deferred wakeup of rcu_nocb_kthread() required? */ +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) +{ + return READ_ONCE(rdp->nocb_defer_wakeup) >= level; +} + +/* Do a deferred wakeup of rcu_nocb_kthread(). */ +static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, + struct rcu_data *rdp, int level, + unsigned long flags) + __releases(rdp_gp->nocb_gp_lock) +{ + int ndw; + int ret; + + if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + return false; + } + + ndw = rdp_gp->nocb_defer_wakeup; + ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); + + return ret; +} + +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(struct timer_list *t) +{ + unsigned long flags; + struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + + WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); + + raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); + smp_mb__after_spinlock(); /* Timer expire before wakeup. */ + do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check. Note that if + * we miss, ->nocb_timer will eventually clean things up. + */ +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)) + return false; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags); +} + +void rcu_nocb_flush_deferred_wakeup(void) +{ + do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); +} +EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); + +static int rdp_offload_toggle(struct rcu_data *rdp, + bool offload, unsigned long flags) + __releases(rdp->nocb_lock) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + bool wake_gp = false; + + rcu_segcblist_offload(cblist, offload); + + if (rdp->nocb_cb_sleep) + rdp->nocb_cb_sleep = false; + rcu_nocb_unlock_irqrestore(rdp, flags); + + /* + * Ignore former value of nocb_cb_sleep and force wake up as it could + * have been spuriously set to false already. + */ + swake_up_one(&rdp->nocb_cb_wq); + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (rdp_gp->nocb_gp_sleep) { + rdp_gp->nocb_gp_sleep = false; + wake_gp = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); + + return 0; +} + +static long rcu_nocb_rdp_deoffload(void *arg) +{ + struct rcu_data *rdp = arg; + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + + pr_info("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + /* + * Flush once and for all now. This suffices because we are + * running on the target CPU holding ->nocb_lock (thus having + * interrupts disabled), and because rdp_offload_toggle() + * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. + * Thus future calls to rcu_segcblist_completely_offloaded() will + * return false, which means that future calls to rcu_nocb_try_bypass() + * will refuse to put anything into the bypass. + */ + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); + ret = rdp_offload_toggle(rdp, false, flags); + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | + SEGCBLIST_KTHREAD_GP)); + /* + * Lock one last time to acquire latest callback updates from kthreads + * so we can later handle callbacks locally without locking. + */ + rcu_nocb_lock_irqsave(rdp, flags); + /* + * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb + * lock is released but how about being paranoid for once? + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); + /* + * With SEGCBLIST_SOFTIRQ_ONLY, we can't use + * rcu_nocb_unlock_irqrestore() anymore. + */ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + /* Sanity check */ + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + + + return ret; +} + +int rcu_nocb_cpu_deoffload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (rcu_rdp_is_offloaded(rdp)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); + ret = -EINVAL; + } + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); + +static long rcu_nocb_rdp_offload(void *arg) +{ + struct rcu_data *rdp = arg; + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + /* + * For now we only support re-offload, ie: the rdp must have been + * offloaded on boot first. + */ + if (!rdp->nocb_gp_rdp) + return -EINVAL; + + pr_info("Offloading %d\n", rdp->cpu); + /* + * Can't use rcu_nocb_lock_irqsave() while we are in + * SEGCBLIST_SOFTIRQ_ONLY mode. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + + /* + * We didn't take the nocb lock while working on the + * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * Every modifications that have been done previously on + * rdp->cblist must be visible remotely by the nocb kthreads + * upon wake up after reading the cblist flags. + * + * The layout against nocb_lock enforces that ordering: + * + * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() + * ------------------------- ---------------------------- + * WRITE callbacks rcu_nocb_lock() + * rcu_nocb_lock() READ flags + * WRITE flags READ callbacks + * rcu_nocb_unlock() rcu_nocb_unlock() + */ + ret = rdp_offload_toggle(rdp, true, flags); + swait_event_exclusive(rdp->nocb_state_wq, + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + + return ret; +} + +int rcu_nocb_cpu_offload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (!rcu_rdp_is_offloaded(rdp)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Can't CB-offload an offline CPU\n"); + ret = -EINVAL; + } + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); + +void __init rcu_init_nohz(void) +{ + int cpu; + bool need_rcu_nocb_mask = false; + struct rcu_data *rdp; + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + need_rcu_nocb_mask = true; +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } + } + if (!cpumask_available(rcu_nocb_mask)) + return; + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running) + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } + if (cpumask_empty(rcu_nocb_mask)) + pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); + else + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); + if (rcu_nocb_poll) + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); + + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist, true); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); + } + rcu_organize_nocb_kthreads(); +} + +/* Initialize per-rcu_data variables for no-CBs CPUs. */ +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ + init_swait_queue_head(&rdp->nocb_cb_wq); + init_swait_queue_head(&rdp->nocb_gp_wq); + init_swait_queue_head(&rdp->nocb_state_wq); + raw_spin_lock_init(&rdp->nocb_lock); + raw_spin_lock_init(&rdp->nocb_bypass_lock); + raw_spin_lock_init(&rdp->nocb_gp_lock); + timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); + rcu_cblist_init(&rdp->nocb_bypass); +} + +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread + * for this CPU's group has not yet been created, spawn it as well. + */ +static void rcu_spawn_one_nocb_kthread(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp_gp; + struct task_struct *t; + + /* + * If this isn't a no-CBs CPU or if it already has an rcuo kthread, + * then nothing to do. + */ + if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) + return; + + /* If we didn't spawn the GP kthread first, reorganize! */ + rdp_gp = rdp->nocb_gp_rdp; + if (!rdp_gp->nocb_gp_kthread) { + t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, + "rcuog/%d", rdp_gp->cpu); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) + return; + WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); + } + + /* Spawn the kthread for this CPU. */ + t = kthread_run(rcu_nocb_cb_kthread, rdp, + "rcuo%c/%d", rcu_state.abbr, cpu); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) + return; + WRITE_ONCE(rdp->nocb_cb_kthread, t); + WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); +} + +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo kthread, spawn it. + */ +static void rcu_spawn_cpu_nocb_kthread(int cpu) +{ + if (rcu_scheduler_fully_active) + rcu_spawn_one_nocb_kthread(cpu); +} + +/* + * Once the scheduler is running, spawn rcuo kthreads for all online + * no-CBs CPUs. This assumes that the early_initcall()s happen before + * non-boot CPUs come online -- if this changes, we will need to add + * some mutual exclusion. + */ +static void __init rcu_spawn_nocb_kthreads(void) +{ + int cpu; + + for_each_online_cpu(cpu) + rcu_spawn_cpu_nocb_kthread(cpu); +} + +/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_gp_stride = -1; +module_param(rcu_nocb_gp_stride, int, 0444); + +/* + * Initialize GP-CB relationships for all no-CBs CPU. + */ +static void __init rcu_organize_nocb_kthreads(void) +{ + int cpu; + bool firsttime = true; + bool gotnocbs = false; + bool gotnocbscbs = true; + int ls = rcu_nocb_gp_stride; + int nl = 0; /* Next GP kthread. */ + struct rcu_data *rdp; + struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ + struct rcu_data *rdp_prev = NULL; + + if (!cpumask_available(rcu_nocb_mask)) + return; + if (ls == -1) { + ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); + rcu_nocb_gp_stride = ls; + } + + /* + * Each pass through this loop sets up one rcu_data structure. + * Should the corresponding CPU come online in the future, then + * we will spawn the needed set of rcu_nocb_kthread() kthreads. + */ + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu >= nl) { + /* New GP kthread, set up for CBs & next GP. */ + gotnocbs = true; + nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; + rdp->nocb_gp_rdp = rdp; + rdp_gp = rdp; + if (dump_tree) { + if (!firsttime) + pr_cont("%s\n", gotnocbscbs + ? "" : " (self only)"); + gotnocbscbs = false; + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", + __func__, cpu); + } + } else { + /* Another CB kthread, link to previous GP kthread. */ + gotnocbscbs = true; + rdp->nocb_gp_rdp = rdp_gp; + rdp_prev->nocb_next_cb_rdp = rdp; + if (dump_tree) + pr_cont(" %d", cpu); + } + rdp_prev = rdp; + } + if (gotnocbs && dump_tree) + pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); +} + +/* + * Bind the current task to the offloaded CPUs. If there are no offloaded + * CPUs, leave the task unbound. Splat if the bind attempt fails. + */ +void rcu_bind_current_to_nocb(void) +{ + if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) + WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); +} +EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); + +// The ->on_cpu field is available only in CONFIG_SMP=y, so... +#ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; +} +#else // #ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return ""; +} +#endif // #else #ifdef CONFIG_SMP + +/* + * Dump out nocb grace-period kthread state for the specified rcu_data + * structure. + */ +static void show_rcu_nocb_gp_state(struct rcu_data *rdp) +{ + struct rcu_node *rnp = rdp->mynode; + + pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", + rdp->cpu, + "kK"[!!rdp->nocb_gp_kthread], + "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], + "dD"[!!rdp->nocb_defer_wakeup], + "tT"[timer_pending(&rdp->nocb_timer)], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[swait_active(&rdp->nocb_gp_wq)], + ".W"[swait_active(&rnp->nocb_gp_wq[0])], + ".W"[swait_active(&rnp->nocb_gp_wq[1])], + ".B"[!!rdp->nocb_gp_bypass], + ".G"[!!rdp->nocb_gp_gp], + (long)rdp->nocb_gp_seq, + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), + rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); +} + +/* Dump out nocb kthread state for the specified rcu_data structure. */ +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ + char bufw[20]; + char bufr[20]; + struct rcu_segcblist *rsclp = &rdp->cblist; + bool waslocked; + bool wassleep; + + if (rdp->nocb_gp_rdp == rdp) + show_rcu_nocb_gp_state(rdp); + + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + rdp->cpu, rdp->nocb_gp_rdp->cpu, + rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, + "kK"[!!rdp->nocb_cb_kthread], + "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], + "cC"[!!atomic_read(&rdp->nocb_lock_contended)], + "lL"[raw_spin_is_locked(&rdp->nocb_lock)], + "sS"[!!rdp->nocb_cb_sleep], + ".W"[swait_active(&rdp->nocb_cb_wq)], + jiffies - rdp->nocb_bypass_first, + jiffies - rdp->nocb_nobypass_last, + rdp->nocb_nobypass_count, + ".D"[rcu_segcblist_ready_cbs(rsclp)], + ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, + ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, + ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + rcu_segcblist_n_cbs(&rdp->cblist), + rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); + + /* It is OK for GP kthreads to have GP state. */ + if (rdp->nocb_gp_rdp == rdp) + return; + + waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); + wassleep = swait_active(&rdp->nocb_gp_wq); + if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) + return; /* Nothing untoward. */ + + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n", + "lL"[waslocked], + "dD"[!!rdp->nocb_defer_wakeup], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[wassleep]); +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return 0; +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + return false; +} + +/* No ->nocb_lock to acquire. */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + local_irq_restore(flags); +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); +} + +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) +{ +} + +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +} + +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + return true; +} + +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) +{ + return false; +} + +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags) +{ + WARN_ON_ONCE(1); /* Should be dead code! */ +} + +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ +} + +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) +{ + return false; +} + +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + return false; +} + +static void rcu_spawn_cpu_nocb_kthread(int cpu) +{ +} + +static void __init rcu_spawn_nocb_kthreads(void) +{ +} + +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index de1dc3bb7f70..aec5075c60b3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -13,39 +13,6 @@ #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_NOCB_CPU -static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ -static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return lockdep_is_held(&rdp->nocb_lock); -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - /* Race on early boot between thread creation and assignment */ - if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) - return true; - - if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) - if (in_task()) - return true; - return false; -} - -#else -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return 0; -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - return false; -} - -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) { /* @@ -1479,1460 +1446,6 @@ static void rcu_cleanup_after_idle(void) #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_NOCB_CPU - -/* - * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads - * created that pull the callbacks from the corresponding CPU, wait for - * a grace period to elapse, and invoke the callbacks. These kthreads - * are organized into GP kthreads, which manage incoming callbacks, wait for - * grace periods, and awaken CB kthreads, and the CB kthreads, which only - * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs - * do a wake_up() on their GP kthread when they insert a callback into any - * empty list, unless the rcu_nocb_poll boot parameter has been specified, - * in which case each kthread actively polls its CPU. (Which isn't so great - * for energy efficiency, but which does reduce RCU's overhead on that CPU.) - * - * This is intended to be used in conjunction with Frederic Weisbecker's - * adaptive-idle work, which would seriously reduce OS jitter on CPUs - * running CPU-bound user-mode computations. - * - * Offloading of callbacks can also be used as an energy-efficiency - * measure because CPUs with no RCU callbacks queued are more aggressive - * about entering dyntick-idle mode. - */ - - -/* - * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. - * If the list is invalid, a warning is emitted and all CPUs are offloaded. - */ -static int __init rcu_nocb_setup(char *str) -{ - alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (cpulist_parse(str, rcu_nocb_mask)) { - pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); - cpumask_setall(rcu_nocb_mask); - } - return 1; -} -__setup("rcu_nocbs=", rcu_nocb_setup); - -static int __init parse_rcu_nocb_poll(char *arg) -{ - rcu_nocb_poll = true; - return 0; -} -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); - -/* - * Don't bother bypassing ->cblist if the call_rcu() rate is low. - * After all, the main point of bypassing is to avoid lock contention - * on ->nocb_lock, which only can happen at high call_rcu() rates. - */ -static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; -module_param(nocb_nobypass_lim_per_jiffy, int, 0); - -/* - * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the - * lock isn't immediately available, increment ->nocb_lock_contended to - * flag the contention. - */ -static void rcu_nocb_bypass_lock(struct rcu_data *rdp) - __acquires(&rdp->nocb_bypass_lock) -{ - lockdep_assert_irqs_disabled(); - if (raw_spin_trylock(&rdp->nocb_bypass_lock)) - return; - atomic_inc(&rdp->nocb_lock_contended); - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - smp_mb__after_atomic(); /* atomic_inc() before lock. */ - raw_spin_lock(&rdp->nocb_bypass_lock); - smp_mb__before_atomic(); /* atomic_dec() after lock. */ - atomic_dec(&rdp->nocb_lock_contended); -} - -/* - * Spinwait until the specified rcu_data structure's ->nocb_lock is - * not contended. Please note that this is extremely special-purpose, - * relying on the fact that at most two kthreads and one CPU contend for - * this lock, and also that the two kthreads are guaranteed to have frequent - * grace-period-duration time intervals between successive acquisitions - * of the lock. This allows us to use an extremely simple throttling - * mechanism, and further to apply it only to the CPU doing floods of - * call_rcu() invocations. Don't try this at home! - */ -static void rcu_nocb_wait_contended(struct rcu_data *rdp) -{ - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) - cpu_relax(); -} - -/* - * Conditionally acquire the specified rcu_data structure's - * ->nocb_bypass_lock. - */ -static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - return raw_spin_trylock(&rdp->nocb_bypass_lock); -} - -/* - * Release the specified rcu_data structure's ->nocb_bypass_lock. - */ -static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) - __releases(&rdp->nocb_bypass_lock) -{ - lockdep_assert_irqs_disabled(); - raw_spin_unlock(&rdp->nocb_bypass_lock); -} - -/* - * Acquire the specified rcu_data structure's ->nocb_lock, but only - * if it corresponds to a no-CBs CPU. - */ -static void rcu_nocb_lock(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (!rcu_rdp_is_offloaded(rdp)) - return; - raw_spin_lock(&rdp->nocb_lock); -} - -/* - * Release the specified rcu_data structure's ->nocb_lock, but only - * if it corresponds to a no-CBs CPU. - */ -static void rcu_nocb_unlock(struct rcu_data *rdp) -{ - if (rcu_rdp_is_offloaded(rdp)) { - lockdep_assert_irqs_disabled(); - raw_spin_unlock(&rdp->nocb_lock); - } -} - -/* - * Release the specified rcu_data structure's ->nocb_lock and restore - * interrupts, but only if it corresponds to a no-CBs CPU. - */ -static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, - unsigned long flags) -{ - if (rcu_rdp_is_offloaded(rdp)) { - lockdep_assert_irqs_disabled(); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - } else { - local_irq_restore(flags); - } -} - -/* Lockdep check that ->cblist may be safely accessed. */ -static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - lockdep_assert_held(&rdp->nocb_lock); -} - -/* - * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended - * grace period. - */ -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) -{ - swake_up_all(sq); -} - -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -{ - return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ - init_swait_queue_head(&rnp->nocb_gp_wq[0]); - init_swait_queue_head(&rnp->nocb_gp_wq[1]); -} - -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - -static bool __wake_nocb_gp(struct rcu_data *rdp_gp, - struct rcu_data *rdp, - bool force, unsigned long flags) - __releases(rdp_gp->nocb_gp_lock) -{ - bool needwake = false; - - if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("AlreadyAwake")); - return false; - } - - if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp_gp->nocb_timer); - } - - if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { - WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); - needwake = true; - } - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - if (needwake) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - wake_up_process(rdp_gp->nocb_gp_kthread); - } - - return needwake; -} - -/* - * Kick the GP kthread for this NOCB group. - */ -static bool wake_nocb_gp(struct rcu_data *rdp, bool force) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - return __wake_nocb_gp(rdp_gp, rdp, force, flags); -} - -/* - * Arrange to wake the GP kthread for this NOCB group at some future - * time when it is safe to do so. - */ -static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, - const char *reason) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - - /* - * Bypass wakeup overrides previous deferments. In case - * of callback storm, no need to wake up too early. - */ - if (waketype == RCU_NOCB_WAKE_BYPASS) { - mod_timer(&rdp_gp->nocb_timer, jiffies + 2); - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); - } else { - if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) - mod_timer(&rdp_gp->nocb_timer, jiffies + 1); - if (rdp_gp->nocb_defer_wakeup < waketype) - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); - } - - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); -} - -/* - * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. - * However, if there is a callback to be enqueued and if ->nocb_bypass - * proves to be initially empty, just return false because the no-CB GP - * kthread may need to be awakened in this case. - * - * Note that this function always returns true if rhp is NULL. - */ -static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - struct rcu_cblist rcl; - - WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); - rcu_lockdep_assert_cblist_protected(rdp); - lockdep_assert_held(&rdp->nocb_bypass_lock); - if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { - raw_spin_unlock(&rdp->nocb_bypass_lock); - return false; - } - /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ - if (rhp) - rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ - rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); - WRITE_ONCE(rdp->nocb_bypass_first, j); - rcu_nocb_bypass_unlock(rdp); - return true; -} - -/* - * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. - * However, if there is a callback to be enqueued and if ->nocb_bypass - * proves to be initially empty, just return false because the no-CB GP - * kthread may need to be awakened in this case. - * - * Note that this function always returns true if rhp is NULL. - */ -static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - if (!rcu_rdp_is_offloaded(rdp)) - return true; - rcu_lockdep_assert_cblist_protected(rdp); - rcu_nocb_bypass_lock(rdp); - return rcu_nocb_do_flush_bypass(rdp, rhp, j); -} - -/* - * If the ->nocb_bypass_lock is immediately available, flush the - * ->nocb_bypass queue into ->cblist. - */ -static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) -{ - rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_rdp_is_offloaded(rdp) || - !rcu_nocb_bypass_trylock(rdp)) - return; - WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); -} - -/* - * See whether it is appropriate to use the ->nocb_bypass list in order - * to control contention on ->nocb_lock. A limited number of direct - * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass - * is non-empty, further callbacks must be placed into ->nocb_bypass, - * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch - * back to direct use of ->cblist. However, ->nocb_bypass should not be - * used if ->cblist is empty, because otherwise callbacks can be stranded - * on ->nocb_bypass because we cannot count on the current CPU ever again - * invoking call_rcu(). The general rule is that if ->nocb_bypass is - * non-empty, the corresponding no-CBs grace-period kthread must not be - * in an indefinite sleep state. - * - * Finally, it is not permitted to use the bypass during early boot, - * as doing so would confuse the auto-initialization code. Besides - * which, there is no point in worrying about lock contention while - * there is only one CPU in operation. - */ -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags) -{ - unsigned long c; - unsigned long cur_gp_seq; - unsigned long j = jiffies; - long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - - lockdep_assert_irqs_disabled(); - - // Pure softirq/rcuc based processing: no bypassing, no - // locking. - if (!rcu_rdp_is_offloaded(rdp)) { - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; - } - - // In the process of (de-)offloading: no bypassing, but - // locking. - if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; /* Not offloaded, no bypassing. */ - } - - // Don't use ->nocb_bypass during early boot. - if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { - rcu_nocb_lock(rdp); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; - } - - // If we have advanced to a new jiffy, reset counts to allow - // moving back from ->nocb_bypass to ->cblist. - if (j == rdp->nocb_nobypass_last) { - c = rdp->nocb_nobypass_count + 1; - } else { - WRITE_ONCE(rdp->nocb_nobypass_last, j); - c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; - if (ULONG_CMP_LT(rdp->nocb_nobypass_count, - nocb_nobypass_lim_per_jiffy)) - c = 0; - else if (c > nocb_nobypass_lim_per_jiffy) - c = nocb_nobypass_lim_per_jiffy; - } - WRITE_ONCE(rdp->nocb_nobypass_count, c); - - // If there hasn't yet been all that many ->cblist enqueues - // this jiffy, tell the caller to enqueue onto ->cblist. But flush - // ->nocb_bypass first. - if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return false; // Caller must enqueue the callback. - } - - // If ->nocb_bypass has been used too long or is too full, - // flush ->nocb_bypass to ->cblist. - if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || - ncbs >= qhimark) { - rcu_nocb_lock(rdp); - if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return false; // Caller must enqueue the callback. - } - if (j != rdp->nocb_gp_adv_time && - rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { - rcu_advance_cbs_nowake(rdp->mynode, rdp); - rdp->nocb_gp_adv_time = j; - } - rcu_nocb_unlock_irqrestore(rdp, flags); - return true; // Callback already enqueued. - } - - // We need to use the bypass. - rcu_nocb_wait_contended(rdp); - rcu_nocb_bypass_lock(rdp); - ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ - rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); - if (!ncbs) { - WRITE_ONCE(rdp->nocb_bypass_first, j); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); - } - rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ - if (ncbs) { - local_irq_restore(flags); - } else { - // No-CBs GP kthread might be indefinitely asleep, if so, wake. - rcu_nocb_lock(rdp); // Rare during call_rcu() flood. - if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstBQwake")); - __call_rcu_nocb_wake(rdp, true, flags); - } else { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstBQnoWake")); - rcu_nocb_unlock_irqrestore(rdp, flags); - } - } - return true; // Callback already enqueued. -} - -/* - * Awaken the no-CBs grace-period kthread if needed, either due to it - * legitimately being asleep or due to overload conditions. - * - * If warranted, also wake up the kthread servicing this CPUs queues. - */ -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, - unsigned long flags) - __releases(rdp->nocb_lock) -{ - unsigned long cur_gp_seq; - unsigned long j; - long len; - struct task_struct *t; - - // If we are being polled or there is no kthread, just leave. - t = READ_ONCE(rdp->nocb_gp_kthread); - if (rcu_nocb_poll || !t) { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeNotPoll")); - return; - } - // Need to actually to a wakeup. - len = rcu_segcblist_n_cbs(&rdp->cblist); - if (was_alldone) { - rdp->qlen_last_fqs_check = len; - if (!irqs_disabled_flags(flags)) { - /* ... if queue was empty ... */ - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp(rdp, false); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeEmpty")); - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, - TPS("WakeEmptyIsDeferred")); - } - } else if (len > rdp->qlen_last_fqs_check + qhimark) { - /* ... or if many callbacks queued. */ - rdp->qlen_last_fqs_check = len; - j = jiffies; - if (j != rdp->nocb_gp_adv_time && - rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { - rcu_advance_cbs_nowake(rdp->mynode, rdp); - rdp->nocb_gp_adv_time = j; - } - smp_mb(); /* Enqueue before timer_pending(). */ - if ((rdp->nocb_cb_sleep || - !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_timer)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); - } - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); - } - return; -} - -/* - * Check if we ignore this rdp. - * - * We check that without holding the nocb lock but - * we make sure not to miss a freshly offloaded rdp - * with the current ordering: - * - * rdp_offload_toggle() nocb_gp_enabled_cb() - * ------------------------- ---------------------------- - * WRITE flags LOCK nocb_gp_lock - * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep - * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock - * UNLOCK nocb_gp_lock READ flags - */ -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, - bool *needwake_state) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - } - return false; - } - - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return true; -} - - -/* - * No-CBs GP kthreads come here to wait for additional callbacks to show up - * or for grace periods to end. - */ -static void nocb_gp_wait(struct rcu_data *my_rdp) -{ - bool bypass = false; - long bypass_ncbs; - int __maybe_unused cpu = my_rdp->cpu; - unsigned long cur_gp_seq; - unsigned long flags; - bool gotcbs = false; - unsigned long j = jiffies; - bool needwait_gp = false; // This prevents actual uninitialized use. - bool needwake; - bool needwake_gp; - struct rcu_data *rdp; - struct rcu_node *rnp; - unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. - bool wasempty = false; - - /* - * Each pass through the following loop checks for CBs and for the - * nearest grace period (if any) to wait for next. The CB kthreads - * and the global grace-period kthread are awakened if needed. - */ - WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { - bool needwake_state = false; - - if (!nocb_gp_enabled_cb(rdp)) - continue; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); - rcu_nocb_lock_irqsave(rdp, flags); - if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; - } - bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - if (bypass_ncbs && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || - bypass_ncbs > 2 * qhimark)) { - // Bypass full or old, so flush it. - (void)rcu_nocb_try_flush_bypass(rdp, j); - bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; /* No callbacks here, try next. */ - } - if (bypass_ncbs) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("Bypass")); - bypass = true; - } - rnp = rdp->mynode; - - // Advance callbacks if helpful and low contention. - needwake_gp = false; - if (!rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL) || - (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { - raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ - needwake_gp = rcu_advance_cbs(rnp, rdp); - wasempty = rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL); - raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ - } - // Need to wait on some grace period? - WARN_ON_ONCE(wasempty && - !rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL)); - if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { - if (!needwait_gp || - ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) - wait_gp_seq = cur_gp_seq; - needwait_gp = true; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("NeedWaitGP")); - } - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - needwake = rdp->nocb_cb_sleep; - WRITE_ONCE(rdp->nocb_cb_sleep, false); - smp_mb(); /* CB invocation -after- GP end. */ - } else { - needwake = false; - } - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake) { - swake_up_one(&rdp->nocb_cb_wq); - gotcbs = true; - } - if (needwake_gp) - rcu_gp_kthread_wake(); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - } - - my_rdp->nocb_gp_bypass = bypass; - my_rdp->nocb_gp_gp = needwait_gp; - my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; - - if (bypass && !rcu_nocb_poll) { - // At least one child with non-empty ->nocb_bypass, so set - // timer in order to avoid stranding its callbacks. - wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, - TPS("WakeBypassIsDeferred")); - } - if (rcu_nocb_poll) { - /* Polling, so trace if first poll in the series. */ - if (gotcbs) - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_idle(1); - } else if (!needwait_gp) { - /* Wait for callbacks to appear. */ - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); - } else { - rnp = my_rdp->mynode; - trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); - swait_event_interruptible_exclusive( - rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], - rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); - } - if (!rcu_nocb_poll) { - raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); - if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - } - WRITE_ONCE(my_rdp->nocb_gp_sleep, true); - raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); - } - my_rdp->nocb_gp_seq = -1; - WARN_ON(signal_pending(current)); -} - -/* - * No-CBs grace-period-wait kthread. There is one of these per group - * of CPUs, but only once at least one CPU in that group has come online - * at least once since boot. This kthread checks for newly posted - * callbacks from any of the CPUs it is responsible for, waits for a - * grace period, then awakens all of the rcu_nocb_cb_kthread() instances - * that then have callback-invocation work to do. - */ -static int rcu_nocb_gp_kthread(void *arg) -{ - struct rcu_data *rdp = arg; - - for (;;) { - WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); - nocb_gp_wait(rdp); - cond_resched_tasks_rcu_qs(); - } - return 0; -} - -static inline bool nocb_cb_can_run(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) -{ - return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); -} - -/* - * Invoke any ready callbacks from the corresponding no-CBs CPU, - * then, if there are no more, wait for more to appear. - */ -static void nocb_cb_wait(struct rcu_data *rdp) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long cur_gp_seq; - unsigned long flags; - bool needwake_state = false; - bool needwake_gp = false; - bool can_sleep = true; - struct rcu_node *rnp = rdp->mynode; - - local_irq_save(flags); - rcu_momentary_dyntick_idle(); - local_irq_restore(flags); - /* - * Disable BH to provide the expected environment. Also, when - * transitioning to/from NOCB mode, a self-requeuing callback might - * be invoked from softirq. A short grace period could cause both - * instances of this callback would execute concurrently. - */ - local_bh_disable(); - rcu_do_batch(rdp); - local_bh_enable(); - lockdep_assert_irqs_enabled(); - rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && - rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && - raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ - needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - } - - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - if (rcu_segcblist_ready_cbs(cblist)) - can_sleep = false; - } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We won't touch the callbacks and keep sleeping until we ever - * get re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - - WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); - - if (rdp->nocb_cb_sleep) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); - - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_gp) - rcu_gp_kthread_wake(); - - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); -} - -/* - * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke - * nocb_cb_wait() to do the dirty work. - */ -static int rcu_nocb_cb_kthread(void *arg) -{ - struct rcu_data *rdp = arg; - - // Each pass through this loop does one callback batch, and, - // if there are no more ready callbacks, waits for them. - for (;;) { - nocb_cb_wait(rdp); - cond_resched_tasks_rcu_qs(); - } - return 0; -} - -/* Is a deferred wakeup of rcu_nocb_kthread() required? */ -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) -{ - return READ_ONCE(rdp->nocb_defer_wakeup) >= level; -} - -/* Do a deferred wakeup of rcu_nocb_kthread(). */ -static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, - struct rcu_data *rdp, int level, - unsigned long flags) - __releases(rdp_gp->nocb_gp_lock) -{ - int ndw; - int ret; - - if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - return false; - } - - ndw = rdp_gp->nocb_defer_wakeup; - ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); - - return ret; -} - -/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ -static void do_nocb_deferred_wakeup_timer(struct timer_list *t) -{ - unsigned long flags; - struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); - - WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); - - raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); - smp_mb__after_spinlock(); /* Timer expire before wakeup. */ - do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); -} - -/* - * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. - * This means we do an inexact common-case check. Note that if - * we miss, ->nocb_timer will eventually clean things up. - */ -static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)) - return false; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags); -} - -void rcu_nocb_flush_deferred_wakeup(void) -{ - do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); -} -EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); - -static int rdp_offload_toggle(struct rcu_data *rdp, - bool offload, unsigned long flags) - __releases(rdp->nocb_lock) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - bool wake_gp = false; - - rcu_segcblist_offload(cblist, offload); - - if (rdp->nocb_cb_sleep) - rdp->nocb_cb_sleep = false; - rcu_nocb_unlock_irqrestore(rdp, flags); - - /* - * Ignore former value of nocb_cb_sleep and force wake up as it could - * have been spuriously set to false already. - */ - swake_up_one(&rdp->nocb_cb_wq); - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - if (rdp_gp->nocb_gp_sleep) { - rdp_gp->nocb_gp_sleep = false; - wake_gp = true; - } - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - - if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); - - return 0; -} - -static long rcu_nocb_rdp_deoffload(void *arg) -{ - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long flags; - int ret; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - - pr_info("De-offloading %d\n", rdp->cpu); - - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Flush once and for all now. This suffices because we are - * running on the target CPU holding ->nocb_lock (thus having - * interrupts disabled), and because rdp_offload_toggle() - * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. - * Thus future calls to rcu_segcblist_completely_offloaded() will - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. - */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - ret = rdp_offload_toggle(rdp, false, flags); - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | - SEGCBLIST_KTHREAD_GP)); - /* - * Lock one last time to acquire latest callback updates from kthreads - * so we can later handle callbacks locally without locking. - */ - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb - * lock is released but how about being paranoid for once? - */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); - /* - * With SEGCBLIST_SOFTIRQ_ONLY, we can't use - * rcu_nocb_unlock_irqrestore() anymore. - */ - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - - /* Sanity check */ - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - - - return ret; -} - -int rcu_nocb_cpu_deoffload(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int ret = 0; - - mutex_lock(&rcu_state.barrier_mutex); - cpus_read_lock(); - if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); - if (!ret) - cpumask_clear_cpu(cpu, rcu_nocb_mask); - } else { - pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); - ret = -EINVAL; - } - } - cpus_read_unlock(); - mutex_unlock(&rcu_state.barrier_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); - -static long rcu_nocb_rdp_offload(void *arg) -{ - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long flags; - int ret; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - /* - * For now we only support re-offload, ie: the rdp must have been - * offloaded on boot first. - */ - if (!rdp->nocb_gp_rdp) - return -EINVAL; - - pr_info("Offloading %d\n", rdp->cpu); - /* - * Can't use rcu_nocb_lock_irqsave() while we are in - * SEGCBLIST_SOFTIRQ_ONLY mode. - */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - - /* - * We didn't take the nocb lock while working on the - * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. - * Every modifications that have been done previously on - * rdp->cblist must be visible remotely by the nocb kthreads - * upon wake up after reading the cblist flags. - * - * The layout against nocb_lock enforces that ordering: - * - * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() - * ------------------------- ---------------------------- - * WRITE callbacks rcu_nocb_lock() - * rcu_nocb_lock() READ flags - * WRITE flags READ callbacks - * rcu_nocb_unlock() rcu_nocb_unlock() - */ - ret = rdp_offload_toggle(rdp, true, flags); - swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - - return ret; -} - -int rcu_nocb_cpu_offload(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int ret = 0; - - mutex_lock(&rcu_state.barrier_mutex); - cpus_read_lock(); - if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); - if (!ret) - cpumask_set_cpu(cpu, rcu_nocb_mask); - } else { - pr_info("NOCB: Can't CB-offload an offline CPU\n"); - ret = -EINVAL; - } - } - cpus_read_unlock(); - mutex_unlock(&rcu_state.barrier_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); - -void __init rcu_init_nohz(void) -{ - int cpu; - bool need_rcu_nocb_mask = false; - struct rcu_data *rdp; - -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) - need_rcu_nocb_mask = true; -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { - if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { - pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); - return; - } - } - if (!cpumask_available(rcu_nocb_mask)) - return; - -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running) - cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } - if (cpumask_empty(rcu_nocb_mask)) - pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); - else - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); - if (rcu_nocb_poll) - pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - - for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_segcblist_empty(&rdp->cblist)) - rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - } - rcu_organize_nocb_kthreads(); -} - -/* Initialize per-rcu_data variables for no-CBs CPUs. */ -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ - init_swait_queue_head(&rdp->nocb_cb_wq); - init_swait_queue_head(&rdp->nocb_gp_wq); - init_swait_queue_head(&rdp->nocb_state_wq); - raw_spin_lock_init(&rdp->nocb_lock); - raw_spin_lock_init(&rdp->nocb_bypass_lock); - raw_spin_lock_init(&rdp->nocb_gp_lock); - timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); - rcu_cblist_init(&rdp->nocb_bypass); -} - -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread - * for this CPU's group has not yet been created, spawn it as well. - */ -static void rcu_spawn_one_nocb_kthread(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_data *rdp_gp; - struct task_struct *t; - - /* - * If this isn't a no-CBs CPU or if it already has an rcuo kthread, - * then nothing to do. - */ - if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) - return; - - /* If we didn't spawn the GP kthread first, reorganize! */ - rdp_gp = rdp->nocb_gp_rdp; - if (!rdp_gp->nocb_gp_kthread) { - t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, - "rcuog/%d", rdp_gp->cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) - return; - WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); - } - - /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_cb_kthread, rdp, - "rcuo%c/%d", rcu_state.abbr, cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - return; - WRITE_ONCE(rdp->nocb_cb_kthread, t); - WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); -} - -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. - */ -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ - if (rcu_scheduler_fully_active) - rcu_spawn_one_nocb_kthread(cpu); -} - -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); -} - -/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ -static int rcu_nocb_gp_stride = -1; -module_param(rcu_nocb_gp_stride, int, 0444); - -/* - * Initialize GP-CB relationships for all no-CBs CPU. - */ -static void __init rcu_organize_nocb_kthreads(void) -{ - int cpu; - bool firsttime = true; - bool gotnocbs = false; - bool gotnocbscbs = true; - int ls = rcu_nocb_gp_stride; - int nl = 0; /* Next GP kthread. */ - struct rcu_data *rdp; - struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ - struct rcu_data *rdp_prev = NULL; - - if (!cpumask_available(rcu_nocb_mask)) - return; - if (ls == -1) { - ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); - rcu_nocb_gp_stride = ls; - } - - /* - * Each pass through this loop sets up one rcu_data structure. - * Should the corresponding CPU come online in the future, then - * we will spawn the needed set of rcu_nocb_kthread() kthreads. - */ - for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->cpu >= nl) { - /* New GP kthread, set up for CBs & next GP. */ - gotnocbs = true; - nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_gp_rdp = rdp; - rdp_gp = rdp; - if (dump_tree) { - if (!firsttime) - pr_cont("%s\n", gotnocbscbs - ? "" : " (self only)"); - gotnocbscbs = false; - firsttime = false; - pr_alert("%s: No-CB GP kthread CPU %d:", - __func__, cpu); - } - } else { - /* Another CB kthread, link to previous GP kthread. */ - gotnocbscbs = true; - rdp->nocb_gp_rdp = rdp_gp; - rdp_prev->nocb_next_cb_rdp = rdp; - if (dump_tree) - pr_cont(" %d", cpu); - } - rdp_prev = rdp; - } - if (gotnocbs && dump_tree) - pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); -} - -/* - * Bind the current task to the offloaded CPUs. If there are no offloaded - * CPUs, leave the task unbound. Splat if the bind attempt fails. - */ -void rcu_bind_current_to_nocb(void) -{ - if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) - WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); -} -EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); - -// The ->on_cpu field is available only in CONFIG_SMP=y, so... -#ifdef CONFIG_SMP -static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) -{ - return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; -} -#else // #ifdef CONFIG_SMP -static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) -{ - return ""; -} -#endif // #else #ifdef CONFIG_SMP - -/* - * Dump out nocb grace-period kthread state for the specified rcu_data - * structure. - */ -static void show_rcu_nocb_gp_state(struct rcu_data *rdp) -{ - struct rcu_node *rnp = rdp->mynode; - - pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", - rdp->cpu, - "kK"[!!rdp->nocb_gp_kthread], - "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], - "dD"[!!rdp->nocb_defer_wakeup], - "tT"[timer_pending(&rdp->nocb_timer)], - "sS"[!!rdp->nocb_gp_sleep], - ".W"[swait_active(&rdp->nocb_gp_wq)], - ".W"[swait_active(&rnp->nocb_gp_wq[0])], - ".W"[swait_active(&rnp->nocb_gp_wq[1])], - ".B"[!!rdp->nocb_gp_bypass], - ".G"[!!rdp->nocb_gp_gp], - (long)rdp->nocb_gp_seq, - rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), - rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); -} - -/* Dump out nocb kthread state for the specified rcu_data structure. */ -static void show_rcu_nocb_state(struct rcu_data *rdp) -{ - char bufw[20]; - char bufr[20]; - struct rcu_segcblist *rsclp = &rdp->cblist; - bool waslocked; - bool wassleep; - - if (rdp->nocb_gp_rdp == rdp) - show_rcu_nocb_gp_state(rdp); - - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", - rdp->cpu, rdp->nocb_gp_rdp->cpu, - rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, - "kK"[!!rdp->nocb_cb_kthread], - "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], - "cC"[!!atomic_read(&rdp->nocb_lock_contended)], - "lL"[raw_spin_is_locked(&rdp->nocb_lock)], - "sS"[!!rdp->nocb_cb_sleep], - ".W"[swait_active(&rdp->nocb_cb_wq)], - jiffies - rdp->nocb_bypass_first, - jiffies - rdp->nocb_nobypass_last, - rdp->nocb_nobypass_count, - ".D"[rcu_segcblist_ready_cbs(rsclp)], - ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], - rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, - ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], - rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, - ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], - ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], - rcu_segcblist_n_cbs(&rdp->cblist), - rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); - - /* It is OK for GP kthreads to have GP state. */ - if (rdp->nocb_gp_rdp == rdp) - return; - - waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) - return; /* Nothing untoward. */ - - pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n", - "lL"[waslocked], - "dD"[!!rdp->nocb_defer_wakeup], - "sS"[!!rdp->nocb_gp_sleep], - ".W"[wassleep]); -} - -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - -/* No ->nocb_lock to acquire. */ -static void rcu_nocb_lock(struct rcu_data *rdp) -{ -} - -/* No ->nocb_lock to release. */ -static void rcu_nocb_unlock(struct rcu_data *rdp) -{ -} - -/* No ->nocb_lock to release. */ -static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, - unsigned long flags) -{ - local_irq_restore(flags); -} - -/* Lockdep check that ->cblist may be safely accessed. */ -static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); -} - -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) -{ -} - -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -{ - return NULL; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ -} - -static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - return true; -} - -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags) -{ - return false; -} - -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, - unsigned long flags) -{ - WARN_ON_ONCE(1); /* Should be dead code! */ -} - -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ -} - -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) -{ - return false; -} - -static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) -{ - return false; -} - -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ -} - -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - -static void show_rcu_nocb_state(struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? -- cgit v1.2.3-71-gd317 From cba712beebf32b27fea71241aa3cdd2ab0fc31a3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2021 02:09:29 +0200 Subject: rcu/nocb: Remove NOCB deferred wakeup from rcutree_dead_cpu() At CPU offline time, we must handle any pending wakeup for the nocb_gp kthread linked to the outgoing CPU. Now we are making sure of that twice: 1) From rcu_report_dead() when the outgoing CPU makes the very last local cleanups by itself before switching offline. 2) From rcutree_dead_cpu(). Here the offlining CPU has gone and is truly now offline. Another CPU takes care of post-portem cleaning up and check if the offline CPU had pending wakeup. Both ways are fine but we have to choose one or the other because we don't need to repeat that action. Simply benefit from cache locality and keep only the first solution. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b6eb480402e0..0fda98a0d12e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2471,9 +2471,6 @@ int rcutree_dead_cpu(unsigned int cpu) WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Do any needed no-CB deferred wakeups from this CPU. */ - do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); - // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); return 0; -- cgit v1.2.3-71-gd317 From 45f4b4a202c03de14e315aaae3d305820cd12221 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 May 2021 11:26:53 -0700 Subject: rcu-tasks: Add comments explaining task_struct strategy Accesses to task_struct structures must be either protected by RCU or by get_task_struct(). Tasks trace RCU uses these in a non-obvious combination, in conjunction with an IPI handler. This commit therefore adds comments explaining this usage. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8536c55df514..6a117375a62a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -785,7 +785,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // set that task's .need_qs flag so that task's next outermost // rcu_read_unlock_trace() will report the quiescent state (in which // case the count of readers is incremented). If both attempts fail, -// the task is added to a "holdout" list. +// the task is added to a "holdout" list. Note that IPIs are used +// to invoke trc_read_check_handler() in the context of running tasks +// in order to avoid ordering overhead on common-case shared-variable +// accessses. // rcu_tasks_trace_postscan(): // Initialize state and attempt to identify an immediate quiescent // state as above (but only for idle tasks), unblock CPU-hotplug @@ -994,6 +997,12 @@ static void trc_wait_for_one_reader(struct task_struct *t, } put_task_struct(t); + // If this task is not yet on the holdout list, then we are in + // an RCU read-side critical section. Otherwise, the invocation of + // rcu_add_holdout() that added it to the list did the necessary + // get_task_struct(). Either way, the task cannot be freed out + // from under this code. + // If currently running, send an IPI, either way, add to list. trc_add_holdout(t, bhp); if (task_curr(t) && -- cgit v1.2.3-71-gd317 From bdb0cca0d11060fce8a8a44588ac1470c25d62bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 May 2021 12:48:18 -0700 Subject: rcu-tasks: Mark ->trc_reader_nesting data races There are several ->trc_reader_nesting data races that are too low-probability for KCSAN to notice, but which will happen sooner or later. This commit therefore marks these accesses, and comments one that cannot race. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6a117375a62a..f6b5320c4484 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -897,7 +897,7 @@ static void trc_read_check_handler(void *t_in) // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. - if (likely(!t->trc_reader_nesting)) { + if (likely(!READ_ONCE(t->trc_reader_nesting))) { if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) wake_up(&trc_wait); // Mark as checked after decrement to avoid false @@ -906,7 +906,7 @@ static void trc_read_check_handler(void *t_in) goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. - if (unlikely(t->trc_reader_nesting < 0)) { + if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) { if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) wake_up(&trc_wait); goto reset_ipi; @@ -953,6 +953,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) n_heavy_reader_ofl_updates++; in_qs = true; } else { + // The task is not running, so C-language access is safe. in_qs = likely(!t->trc_reader_nesting); } @@ -985,7 +986,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { t->trc_reader_checked = true; - WARN_ON_ONCE(t->trc_reader_nesting); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); return; } @@ -1101,7 +1102,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], - t->trc_reader_nesting, + READ_ONCE(t->trc_reader_nesting), " N"[!!t->trc_reader_special.b.need_qs], cpu); sched_show_task(t); @@ -1196,7 +1197,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) static void exit_tasks_rcu_finish_trace(struct task_struct *t) { WRITE_ONCE(t->trc_reader_checked, true); - WARN_ON_ONCE(t->trc_reader_nesting); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); WRITE_ONCE(t->trc_reader_nesting, 0); if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) rcu_read_unlock_trace_special(t, 0); -- cgit v1.2.3-71-gd317 From f8ab3fad80dddf3f2cecb53983063c4431058ca1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 May 2021 15:36:37 -0700 Subject: rcu-tasks: Mark ->trc_reader_special.b.need_qs data races There are several ->trc_reader_special.b.need_qs data races that are too low-probability for KCSAN to notice, but which will happen sooner or later. This commit therefore marks these accesses. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index f6b5320c4484..1cece5e9be9a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -850,7 +850,7 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - int nq = t->trc_reader_special.b.need_qs; + int nq = READ_ONCE(t->trc_reader_special.b.need_qs); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) @@ -916,7 +916,7 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: @@ -968,7 +968,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // state so that it will awaken the grace-period kthread upon exit // from that critical section. atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); return true; } @@ -1103,7 +1103,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], READ_ONCE(t->trc_reader_nesting), - " N"[!!t->trc_reader_special.b.need_qs], + " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)], cpu); sched_show_task(t); } -- cgit v1.2.3-71-gd317 From e4be1f44b6f8d36a8607a598d41c766044b74be3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Jun 2021 11:57:15 -0700 Subject: rcu-tasks: Fix synchronize_rcu_rude() typo in comment This commit replaces the fictitious synchronize_rcu_rude() function with its real-world synchronize_rcu_tasks_rude() counterpart. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1cece5e9be9a..f9f52daacd1c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -643,8 +643,8 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } // // "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of // passing an empty function to schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching -// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// provides an asynchronous call_rcu_tasks_rude() API and batching of +// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. // This invokes schedule_on_each_cpu() in order to send IPIs far and wide // and induces otherwise unnecessary context switches on all online CPUs, // whether idle or not. -- cgit v1.2.3-71-gd317 From fed31a4dd3adb5455df7c704de2abb639a1dc1c0 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Tue, 13 Jul 2021 08:56:45 +0800 Subject: rcu: Fix macro name CONFIG_TASKS_RCU_TRACE This commit fixes several typos where CONFIG_TASKS_RCU_TRACE should instead be CONFIG_TASKS_TRACE_RCU. Among other things, these typos could cause CONFIG_TASKS_TRACE_RCU_READ_MB=y kernels to suffer from memory-ordering bugs that could result in false-positive quiescent states and too-short grace periods. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- kernel/rcu/tree_plugin.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d9680b798b21..955c82b4737c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -167,7 +167,7 @@ void synchronize_rcu_tasks(void); # define synchronize_rcu_tasks synchronize_rcu # endif -# ifdef CONFIG_TASKS_RCU_TRACE +# ifdef CONFIG_TASKS_TRACE_RCU # define rcu_tasks_trace_qs(t) \ do { \ if (!likely(READ_ONCE((t)->trc_reader_checked)) && \ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index de1dc3bb7f70..6ce104242b23 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2982,17 +2982,17 @@ static void noinstr rcu_dynticks_task_exit(void) /* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ static void rcu_dynticks_task_trace_enter(void) { -#ifdef CONFIG_TASKS_RCU_TRACE +#ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) current->trc_reader_special.b.need_mb = true; -#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } /* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ static void rcu_dynticks_task_trace_exit(void) { -#ifdef CONFIG_TASKS_RCU_TRACE +#ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) current->trc_reader_special.b.need_mb = false; -#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -- cgit v1.2.3-71-gd317 From a7a73697360ea81244eea550138b8f614348860c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:48 +0200 Subject: kcsan: Remove CONFIG_KCSAN_DEBUG By this point CONFIG_KCSAN_DEBUG is pretty useless, as the system just isn't usable with it due to spamming console (I imagine a randconfig test robot will run into this sooner or later). Remove it. Back in 2019 I used it occasionally to record traces of watchpoints and verify the encoding is correct, but these days we have proper tests. If something similar is needed in future, just add it back ad-hoc. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 9 --------- lib/Kconfig.kcsan | 3 --- 2 files changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 26709ea65c71..d92977ede7e1 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -479,15 +479,6 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) break; /* ignore; we do not diff the values */ } - if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { - kcsan_disable_current(); - pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", - is_write ? "write" : "read", size, ptr, - watchpoint_slot((unsigned long)ptr), - encode_watchpoint((unsigned long)ptr, size, is_write)); - kcsan_enable_current(); - } - /* * Delay this thread, to increase probability of observing a racy * conflicting access. diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 6152fbd5cbb4..5304f211f81f 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -62,9 +62,6 @@ config KCSAN_VERBOSE generated from any one of them, system stability may suffer due to deadlocks or recursion. If in doubt, say N. -config KCSAN_DEBUG - bool "Debugging of KCSAN internals" - config KCSAN_SELFTEST bool "Perform short selftests on boot" default y -- cgit v1.2.3-71-gd317 From 08cac6049412061e571fadadc3e23464dc46d0f2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:50 +0200 Subject: kcsan: Reduce get_ctx() uses in kcsan_found_watchpoint() There are a number get_ctx() calls that are close to each other, which results in poor codegen (repeated preempt_count loads). Specifically in kcsan_found_watchpoint() (even though it's a slow-path) it is beneficial to keep the race-window small until the watchpoint has actually been consumed to avoid missed opportunities to report a race. Let's clean it up a bit before we add more code in kcsan_found_watchpoint(). Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index d92977ede7e1..906100923b88 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -301,9 +301,9 @@ static inline void reset_kcsan_skip(void) this_cpu_write(kcsan_skip, skip_count); } -static __always_inline bool kcsan_is_enabled(void) +static __always_inline bool kcsan_is_enabled(struct kcsan_ctx *ctx) { - return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; + return READ_ONCE(kcsan_enabled) && !ctx->disable_count; } /* Introduce delay depending on context and configuration. */ @@ -353,10 +353,17 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, atomic_long_t *watchpoint, long encoded_watchpoint) { + struct kcsan_ctx *ctx = get_ctx(); unsigned long flags; bool consumed; - if (!kcsan_is_enabled()) + /* + * We know a watchpoint exists. Let's try to keep the race-window + * between here and finally consuming the watchpoint below as small as + * possible -- avoid unneccessarily complex code until consumed. + */ + + if (!kcsan_is_enabled(ctx)) return; /* @@ -364,14 +371,12 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * reporting a race where e.g. the writer set up the watchpoint, but the * reader has access_mask!=0, we have to ignore the found watchpoint. */ - if (get_ctx()->access_mask != 0) + if (ctx->access_mask) return; /* - * Consume the watchpoint as soon as possible, to minimize the chances - * of !consumed. Consuming the watchpoint must always be guarded by - * kcsan_is_enabled() check, as otherwise we might erroneously - * triggering reports when disabled. + * Consuming the watchpoint must be guarded by kcsan_is_enabled() to + * avoid erroneously triggering reports if the context is disabled. */ consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint); @@ -409,6 +414,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); + struct kcsan_ctx *ctx = get_ctx(); unsigned long irq_flags = 0; /* @@ -417,7 +423,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) */ reset_kcsan_skip(); - if (!kcsan_is_enabled()) + if (!kcsan_is_enabled(ctx)) goto out; /* @@ -489,7 +495,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Re-read value, and check if it is as expected; if not, we infer a * racy access. */ - access_mask = get_ctx()->access_mask; + access_mask = ctx->access_mask; new = 0; switch (size) { case 1: -- cgit v1.2.3-71-gd317 From 49f72d5358dd3c0d28bcd2232c513000b15480f0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:51 +0200 Subject: kcsan: Rework atomic.h into permissive.h Rework atomic.h into permissive.h to better reflect its purpose, and introduce kcsan_ignore_address() and kcsan_ignore_data_race(). Introduce CONFIG_KCSAN_PERMISSIVE and update the stub functions in preparation for subsequent changes. As before, developers who choose to use KCSAN in "strict" mode will see all data races and are not affected. Furthermore, by relying on the value-change filter logic for kcsan_ignore_data_race(), even if the permissive rules are enabled, the opt-outs in report.c:skip_report() override them (such as for RCU-related functions by default). The option CONFIG_KCSAN_PERMISSIVE is disabled by default, so that the documented default behaviour of KCSAN does not change. Instead, like CONFIG_KCSAN_IGNORE_ATOMICS, the option needs to be explicitly opted in. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- Documentation/dev-tools/kcsan.rst | 8 +++++++ kernel/kcsan/atomic.h | 23 ------------------- kernel/kcsan/core.c | 33 +++++++++++++++++++-------- kernel/kcsan/permissive.h | 47 +++++++++++++++++++++++++++++++++++++++ lib/Kconfig.kcsan | 10 +++++++++ 5 files changed, 89 insertions(+), 32 deletions(-) delete mode 100644 kernel/kcsan/atomic.h create mode 100644 kernel/kcsan/permissive.h (limited to 'kernel') diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index 69dc9c502ccc..7db43c7c09b8 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -127,6 +127,14 @@ Kconfig options: causes KCSAN to not report data races due to conflicts where the only plain accesses are aligned writes up to word size. +* ``CONFIG_KCSAN_PERMISSIVE``: Enable additional permissive rules to ignore + certain classes of common data races. Unlike the above, the rules are more + complex involving value-change patterns, access type, and address. This + option depends on ``CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=y``. For details + please see the ``kernel/kcsan/permissive.h``. Testers and maintainers that + only focus on reports from specific subsystems and not the whole kernel are + recommended to disable this option. + To use the strictest possible rules, select ``CONFIG_KCSAN_STRICT=y``, which configures KCSAN to follow the Linux-kernel memory consistency model (LKMM) as closely as possible. diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h deleted file mode 100644 index 530ae1bda8e7..000000000000 --- a/kernel/kcsan/atomic.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Rules for implicitly atomic memory accesses. - * - * Copyright (C) 2019, Google LLC. - */ - -#ifndef _KERNEL_KCSAN_ATOMIC_H -#define _KERNEL_KCSAN_ATOMIC_H - -#include - -/* - * Special rules for certain memory where concurrent conflicting accesses are - * common, however, the current convention is to not mark them; returns true if - * access to @ptr should be considered atomic. Called from slow-path. - */ -static bool kcsan_is_atomic_special(const volatile void *ptr) -{ - return false; -} - -#endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 906100923b88..439edb9dcbb1 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -20,9 +20,9 @@ #include #include -#include "atomic.h" #include "encoding.h" #include "kcsan.h" +#include "permissive.h" static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; @@ -353,6 +353,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, atomic_long_t *watchpoint, long encoded_watchpoint) { + const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; struct kcsan_ctx *ctx = get_ctx(); unsigned long flags; bool consumed; @@ -374,6 +375,16 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, if (ctx->access_mask) return; + /* + * If the other thread does not want to ignore the access, and there was + * a value change as a result of this thread's operation, we will still + * generate a report of unknown origin. + * + * Use CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=n to filter. + */ + if (!is_assert && kcsan_ignore_address(ptr)) + return; + /* * Consuming the watchpoint must be guarded by kcsan_is_enabled() to * avoid erroneously triggering reports if the context is disabled. @@ -396,7 +407,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]); } - if ((type & KCSAN_ACCESS_ASSERT) != 0) + if (is_assert) atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); else atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]); @@ -427,12 +438,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; /* - * Special atomic rules: unlikely to be true, so we check them here in - * the slow-path, and not in the fast-path in is_atomic(). Call after - * kcsan_is_enabled(), as we may access memory that is not yet - * initialized during early boot. + * Check to-ignore addresses after kcsan_is_enabled(), as we may access + * memory that is not yet initialized during early boot. */ - if (!is_assert && kcsan_is_atomic_special(ptr)) + if (!is_assert && kcsan_ignore_address(ptr)) goto out; if (!check_encodable((unsigned long)ptr, size)) { @@ -518,8 +527,14 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (access_mask) diff &= access_mask; - /* Were we able to observe a value-change? */ - if (diff != 0) + /* + * Check if we observed a value change. + * + * Also check if the data race should be ignored (the rules depend on + * non-zero diff); if it is to be ignored, the below rules for + * KCSAN_VALUE_CHANGE_MAYBE apply. + */ + if (diff && !kcsan_ignore_data_race(size, type, old, new, diff)) value_change = KCSAN_VALUE_CHANGE_TRUE; /* Check if this access raced with another. */ diff --git a/kernel/kcsan/permissive.h b/kernel/kcsan/permissive.h new file mode 100644 index 000000000000..f90e30800c11 --- /dev/null +++ b/kernel/kcsan/permissive.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Special rules for ignoring entire classes of data-racy memory accesses. None + * of the rules here imply that such data races are generally safe! + * + * All rules in this file can be configured via CONFIG_KCSAN_PERMISSIVE. Keep + * them separate from core code to make it easier to audit. + * + * Copyright (C) 2019, Google LLC. + */ + +#ifndef _KERNEL_KCSAN_PERMISSIVE_H +#define _KERNEL_KCSAN_PERMISSIVE_H + +#include + +/* + * Access ignore rules based on address. + */ +static __always_inline bool kcsan_ignore_address(const volatile void *ptr) +{ + if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)) + return false; + + return false; +} + +/* + * Data race ignore rules based on access type and value change patterns. + */ +static bool +kcsan_ignore_data_race(size_t size, int type, u64 old, u64 new, u64 diff) +{ + if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)) + return false; + + /* + * Rules here are only for plain read accesses, so that we still report + * data races between plain read-write accesses. + */ + if (type || size > sizeof(long)) + return false; + + return false; +} + +#endif /* _KERNEL_KCSAN_PERMISSIVE_H */ diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index c76fbb3ee09e..26f03c754d39 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -231,4 +231,14 @@ config KCSAN_IGNORE_ATOMICS due to two conflicting plain writes will be reported (aligned and unaligned, if CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n). +config KCSAN_PERMISSIVE + bool "Enable all additional permissive rules" + depends on KCSAN_REPORT_VALUE_CHANGE_ONLY + help + Enable additional permissive rules to ignore certain classes of data + races (also see kernel/kcsan/permissive.h). None of the permissive + rules imply that such data races are generally safe, but can be used + to further reduce reported data races due to data-racy patterns + common across the kernel. + endif # KCSAN -- cgit v1.2.3-71-gd317 From 9c827cd1fcdf54bb50f874f91af0d5de2aceb035 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:52 +0200 Subject: kcsan: Print if strict or non-strict during init Show a brief message if KCSAN is strict or non-strict, and if non-strict also say that CONFIG_KCSAN_STRICT=y can be used to see all data races. This is to hint to users of KCSAN who blindly use the default config that their configuration might miss data races of interest. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 439edb9dcbb1..76e67d1e02d4 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -656,6 +656,15 @@ void __init kcsan_init(void) pr_info("enabled early\n"); WRITE_ONCE(kcsan_enabled, true); } + + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) || + IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) || + IS_ENABLED(CONFIG_KCSAN_PERMISSIVE) || + IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { + pr_warn("non-strict mode configured - use CONFIG_KCSAN_STRICT=y to see all data races\n"); + } else { + pr_info("strict mode configured\n"); + } } /* === Exported interface =================================================== */ -- cgit v1.2.3-71-gd317 From d8fd74d35a8d3c602232e3238e916bda9d03d520 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:53 +0200 Subject: kcsan: permissive: Ignore data-racy 1-bit value changes Add rules to ignore data-racy reads with only 1-bit value changes. Details about the rules are captured in comments in kernel/kcsan/permissive.h. More background follows. While investigating a number of data races, we've encountered data-racy accesses on flags variables to be very common. The typical pattern is a reader masking all but one bit, and/or the writer setting/clearing only 1 bit (current->flags being a frequently encountered case; more examples in mm/sl[au]b.c, which disable KCSAN for this reason). Since these types of data-racy accesses are common (with the assumption they are intentional and hard to miscompile) having the option (with CONFIG_KCSAN_PERMISSIVE=y) to filter them will avoid forcing everyone to mark them, and deliberately left to preference at this time. One important motivation for having this option built-in is to move closer to being able to enable KCSAN on CI systems or for testers wishing to test the whole kernel, while more easily filtering less interesting data races with higher probability. For the implementation, we considered several alternatives, but had one major requirement: that the rules be kept together with the Linux-kernel tree. Adding them to the compiler would preclude us from making changes quickly; if the rules require tweaks, having them part of the compiler requires waiting another ~1 year for the next release -- that's not realistic. We are left with the following options: 1. Maintain compiler plugins as part of the kernel-tree that removes instrumentation for some accesses (e.g. plain-& with 1-bit mask). The analysis would be reader-side focused, as no assumption can be made about racing writers. Because it seems unrealistic to maintain 2 plugins, one for LLVM and GCC, we would likely pick LLVM. Furthermore, no kernel infrastructure exists to maintain LLVM plugins, and the build-system implications and maintenance overheads do not look great (historically, plugins written against old LLVM APIs are not guaranteed to work with newer LLVM APIs). 2. Find a set of rules that can be expressed in terms of observed value changes, and make it part of the KCSAN runtime. The analysis is writer-side focused, given we rely on observed value changes. The approach taken here is (2). While a complete approach requires both (1) and (2), experiments show that the majority of data races involving trivial bit operations on flags variables can be removed with (2) alone. It goes without saying that the filtering of data races using (1) or (2) does _not_ guarantee they are safe! Therefore, limiting ourselves to (2) for now is the conservative choice for setups that wish to enable CONFIG_KCSAN_PERMISSIVE=y. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan_test.c | 32 +++++++++++++++++++++++++++++++ kernel/kcsan/permissive.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 8bcffbdef3d3..dc55fd5a36fc 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -414,6 +414,14 @@ static noinline void test_kernel_atomic_builtins(void) __atomic_load_n(&test_var, __ATOMIC_RELAXED); } +static noinline void test_kernel_xor_1bit(void) +{ + /* Do not report data races between the read-writes. */ + kcsan_nestable_atomic_begin(); + test_var ^= 0x10000; + kcsan_nestable_atomic_end(); +} + /* ===== Test cases ===== */ /* Simple test with normal data race. */ @@ -952,6 +960,29 @@ static void test_atomic_builtins(struct kunit *test) KUNIT_EXPECT_FALSE(test, match_never); } +__no_kcsan +static void test_1bit_value_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + }, + }; + bool match = false; + + begin_test_checks(test_kernel_read, test_kernel_xor_1bit); + do { + match = IS_ENABLED(CONFIG_KCSAN_PERMISSIVE) + ? report_available() + : report_matches(&expect); + } while (!end_test_checks(match)); + if (IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)) + KUNIT_EXPECT_FALSE(test, match); + else + KUNIT_EXPECT_TRUE(test, match); +} + /* * Generate thread counts for all test cases. Values generated are in interval * [2, 5] followed by exponentially increasing thread counts from 8 to 32. @@ -1024,6 +1055,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), KCSAN_KUNIT_CASE(test_atomic_builtins), + KCSAN_KUNIT_CASE(test_1bit_value_change), {}, }; diff --git a/kernel/kcsan/permissive.h b/kernel/kcsan/permissive.h index f90e30800c11..2c01fe4a59ee 100644 --- a/kernel/kcsan/permissive.h +++ b/kernel/kcsan/permissive.h @@ -12,6 +12,8 @@ #ifndef _KERNEL_KCSAN_PERMISSIVE_H #define _KERNEL_KCSAN_PERMISSIVE_H +#include +#include #include /* @@ -22,7 +24,11 @@ static __always_inline bool kcsan_ignore_address(const volatile void *ptr) if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)) return false; - return false; + /* + * Data-racy bitops on current->flags are too common, ignore completely + * for now. + */ + return ptr == ¤t->flags; } /* @@ -41,6 +47,47 @@ kcsan_ignore_data_race(size_t size, int type, u64 old, u64 new, u64 diff) if (type || size > sizeof(long)) return false; + /* + * A common pattern is checking/setting just 1 bit in a variable; for + * example: + * + * if (flags & SOME_FLAG) { ... } + * + * and elsewhere flags is updated concurrently: + * + * flags |= SOME_OTHER_FLAG; // just 1 bit + * + * While it is still recommended that such accesses be marked + * appropriately, in many cases these types of data races are so common + * that marking them all is often unrealistic and left to maintainer + * preference. + * + * The assumption in all cases is that with all known compiler + * optimizations (including those that tear accesses), because no more + * than 1 bit changed, the plain accesses are safe despite the presence + * of data races. + * + * The rules here will ignore the data races if we observe no more than + * 1 bit changed. + * + * Of course many operations can effecively change just 1 bit, but the + * general assuption that data races involving 1-bit changes can be + * tolerated still applies. + * + * And in case a true bug is missed, the bug likely manifests as a + * reportable data race elsewhere. + */ + if (hweight64(diff) == 1) { + /* + * Exception: Report data races where the values look like + * ordinary booleans (one of them was 0 and the 0th bit was + * changed) More often than not, they come with interesting + * memory ordering requirements, so let's report them. + */ + if (!((!old || !new) && diff == 1)) + return true; + } + return false; } -- cgit v1.2.3-71-gd317 From 1e7107c5ef44431bc1ebbd4c353f1d7c22e5f2ec Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 16 Jun 2021 08:51:57 -0400 Subject: cgroup1: fix leaked context root causing sporadic NULL deref in LTP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard reported sporadic (roughly one in 10 or so) null dereferences and other strange behaviour for a set of automated LTP tests. Things like: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1516 Comm: umount Not tainted 5.10.0-yocto-standard #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014 RIP: 0010:kernfs_sop_show_path+0x1b/0x60 ...or these others: RIP: 0010:do_mkdirat+0x6a/0xf0 RIP: 0010:d_alloc_parallel+0x98/0x510 RIP: 0010:do_readlinkat+0x86/0x120 There were other less common instances of some kind of a general scribble but the common theme was mount and cgroup and a dubious dentry triggering the NULL dereference. I was only able to reproduce it under qemu by replicating Richard's setup as closely as possible - I never did get it to happen on bare metal, even while keeping everything else the same. In commit 71d883c37e8d ("cgroup_do_mount(): massage calling conventions") we see this as a part of the overall change: -------------- struct cgroup_subsys *ss; - struct dentry *dentry; [...] - dentry = cgroup_do_mount(&cgroup_fs_type, fc->sb_flags, root, - CGROUP_SUPER_MAGIC, ns); [...] - if (percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - struct super_block *sb = dentry->d_sb; - dput(dentry); + ret = cgroup_do_mount(fc, CGROUP_SUPER_MAGIC, ns); + if (!ret && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); deactivate_locked_super(sb); msleep(10); return restart_syscall(); } -------------- In changing from the local "*dentry" variable to using fc->root, we now export/leave that dentry pointer in the file context after doing the dput() in the unlikely "is_dying" case. With LTP doing a crazy amount of back to back mount/unmount [testcases/bin/cgroup_regression_5_1.sh] the unlikely becomes slightly likely and then bad things happen. A fix would be to not leave the stale reference in fc->root as follows: --------------                 dput(fc->root); + fc->root = NULL;                 deactivate_locked_super(sb); -------------- ...but then we are just open-coding a duplicate of fc_drop_locked() so we simply use that instead. Cc: Al Viro Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: stable@vger.kernel.org # v5.1+ Reported-by: Richard Purdie Fixes: 71d883c37e8d ("cgroup_do_mount(): massage calling conventions") Signed-off-by: Paul Gortmaker Signed-off-by: Tejun Heo --- fs/internal.h | 1 - include/linux/fs_context.h | 1 + kernel/cgroup/cgroup-v1.c | 4 +--- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/internal.h b/fs/internal.h index 3ce8edbaa3ca..82e8eb32ff3d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -61,7 +61,6 @@ extern void __init chrdev_init(void); */ extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); -extern void fc_drop_locked(struct fs_context *); extern void vfs_clean_context(struct fs_context *fc); extern int finish_clean_context(struct fs_context *fc); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index e2bc16300c82..6b54982fc5f3 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -141,6 +141,7 @@ extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); +extern void fc_drop_locked(struct fs_context *fc); /* * sget() wrappers to be called from the ->get_tree() op. diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 8d6bf56ed77a..de2c432dee20 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1221,9 +1221,7 @@ int cgroup1_get_tree(struct fs_context *fc) ret = cgroup_do_get_tree(fc); if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { - struct super_block *sb = fc->root->d_sb; - dput(fc->root); - deactivate_locked_super(sb); + fc_drop_locked(fc); ret = 1; } -- cgit v1.2.3-71-gd317 From b42b0bddcbc87b4c66f6497f66fc72d52b712aa7 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 14 Jul 2021 17:19:33 +0800 Subject: workqueue: fix UAF in pwq_unbound_release_workfn() I got a UAF report when doing fuzz test: [ 152.880091][ T8030] ================================================================== [ 152.881240][ T8030] BUG: KASAN: use-after-free in pwq_unbound_release_workfn+0x50/0x190 [ 152.882442][ T8030] Read of size 4 at addr ffff88810d31bd00 by task kworker/3:2/8030 [ 152.883578][ T8030] [ 152.883932][ T8030] CPU: 3 PID: 8030 Comm: kworker/3:2 Not tainted 5.13.0+ #249 [ 152.885014][ T8030] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 152.886442][ T8030] Workqueue: events pwq_unbound_release_workfn [ 152.887358][ T8030] Call Trace: [ 152.887837][ T8030] dump_stack_lvl+0x75/0x9b [ 152.888525][ T8030] ? pwq_unbound_release_workfn+0x50/0x190 [ 152.889371][ T8030] print_address_description.constprop.10+0x48/0x70 [ 152.890326][ T8030] ? pwq_unbound_release_workfn+0x50/0x190 [ 152.891163][ T8030] ? pwq_unbound_release_workfn+0x50/0x190 [ 152.891999][ T8030] kasan_report.cold.15+0x82/0xdb [ 152.892740][ T8030] ? pwq_unbound_release_workfn+0x50/0x190 [ 152.893594][ T8030] __asan_load4+0x69/0x90 [ 152.894243][ T8030] pwq_unbound_release_workfn+0x50/0x190 [ 152.895057][ T8030] process_one_work+0x47b/0x890 [ 152.895778][ T8030] worker_thread+0x5c/0x790 [ 152.896439][ T8030] ? process_one_work+0x890/0x890 [ 152.897163][ T8030] kthread+0x223/0x250 [ 152.897747][ T8030] ? set_kthread_struct+0xb0/0xb0 [ 152.898471][ T8030] ret_from_fork+0x1f/0x30 [ 152.899114][ T8030] [ 152.899446][ T8030] Allocated by task 8884: [ 152.900084][ T8030] kasan_save_stack+0x21/0x50 [ 152.900769][ T8030] __kasan_kmalloc+0x88/0xb0 [ 152.901416][ T8030] __kmalloc+0x29c/0x460 [ 152.902014][ T8030] alloc_workqueue+0x111/0x8e0 [ 152.902690][ T8030] __btrfs_alloc_workqueue+0x11e/0x2a0 [ 152.903459][ T8030] btrfs_alloc_workqueue+0x6d/0x1d0 [ 152.904198][ T8030] scrub_workers_get+0x1e8/0x490 [ 152.904929][ T8030] btrfs_scrub_dev+0x1b9/0x9c0 [ 152.905599][ T8030] btrfs_ioctl+0x122c/0x4e50 [ 152.906247][ T8030] __x64_sys_ioctl+0x137/0x190 [ 152.906916][ T8030] do_syscall_64+0x34/0xb0 [ 152.907535][ T8030] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 152.908365][ T8030] [ 152.908688][ T8030] Freed by task 8884: [ 152.909243][ T8030] kasan_save_stack+0x21/0x50 [ 152.909893][ T8030] kasan_set_track+0x20/0x30 [ 152.910541][ T8030] kasan_set_free_info+0x24/0x40 [ 152.911265][ T8030] __kasan_slab_free+0xf7/0x140 [ 152.911964][ T8030] kfree+0x9e/0x3d0 [ 152.912501][ T8030] alloc_workqueue+0x7d7/0x8e0 [ 152.913182][ T8030] __btrfs_alloc_workqueue+0x11e/0x2a0 [ 152.913949][ T8030] btrfs_alloc_workqueue+0x6d/0x1d0 [ 152.914703][ T8030] scrub_workers_get+0x1e8/0x490 [ 152.915402][ T8030] btrfs_scrub_dev+0x1b9/0x9c0 [ 152.916077][ T8030] btrfs_ioctl+0x122c/0x4e50 [ 152.916729][ T8030] __x64_sys_ioctl+0x137/0x190 [ 152.917414][ T8030] do_syscall_64+0x34/0xb0 [ 152.918034][ T8030] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 152.918872][ T8030] [ 152.919203][ T8030] The buggy address belongs to the object at ffff88810d31bc00 [ 152.919203][ T8030] which belongs to the cache kmalloc-512 of size 512 [ 152.921155][ T8030] The buggy address is located 256 bytes inside of [ 152.921155][ T8030] 512-byte region [ffff88810d31bc00, ffff88810d31be00) [ 152.922993][ T8030] The buggy address belongs to the page: [ 152.923800][ T8030] page:ffffea000434c600 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10d318 [ 152.925249][ T8030] head:ffffea000434c600 order:2 compound_mapcount:0 compound_pincount:0 [ 152.926399][ T8030] flags: 0x57ff00000010200(slab|head|node=1|zone=2|lastcpupid=0x7ff) [ 152.927515][ T8030] raw: 057ff00000010200 dead000000000100 dead000000000122 ffff888009c42c80 [ 152.928716][ T8030] raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 [ 152.929890][ T8030] page dumped because: kasan: bad access detected [ 152.930759][ T8030] [ 152.931076][ T8030] Memory state around the buggy address: [ 152.931851][ T8030] ffff88810d31bc00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 152.932967][ T8030] ffff88810d31bc80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 152.934068][ T8030] >ffff88810d31bd00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 152.935189][ T8030] ^ [ 152.935763][ T8030] ffff88810d31bd80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 152.936847][ T8030] ffff88810d31be00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 152.937940][ T8030] ================================================================== If apply_wqattrs_prepare() fails in alloc_workqueue(), it will call put_pwq() which invoke a work queue to call pwq_unbound_release_workfn() and use the 'wq'. The 'wq' allocated in alloc_workqueue() will be freed in error path when apply_wqattrs_prepare() fails. So it will lead a UAF. CPU0 CPU1 alloc_workqueue() alloc_and_link_pwqs() apply_wqattrs_prepare() fails apply_wqattrs_cleanup() schedule_work(&pwq->unbound_release_work) kfree(wq) worker_thread() pwq_unbound_release_workfn() <- trigger uaf here If apply_wqattrs_prepare() fails, the new pwq are not linked, it doesn't hold any reference to the 'wq', 'wq' is invalid to access in the worker, so add check pwq if linked to fix this. Fixes: 2d5f0764b526 ("workqueue: split apply_workqueue_attrs() into 3 stages") Cc: stable@vger.kernel.org # v4.2+ Reported-by: Hulk Robot Suggested-by: Lai Jiangshan Signed-off-by: Yang Yingliang Reviewed-by: Lai Jiangshan Tested-by: Pavel Skripkin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 50142fc08902..f148eacda55a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3676,15 +3676,21 @@ static void pwq_unbound_release_workfn(struct work_struct *work) unbound_release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; - bool is_last; + bool is_last = false; - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; + /* + * when @pwq is not linked, it doesn't hold any reference to the + * @wq, and @wq is invalid to access. + */ + if (!list_empty(&pwq->pwqs_node)) { + if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) + return; - mutex_lock(&wq->mutex); - list_del_rcu(&pwq->pwqs_node); - is_last = list_empty(&wq->pwqs); - mutex_unlock(&wq->mutex); + mutex_lock(&wq->mutex); + list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); + mutex_unlock(&wq->mutex); + } mutex_lock(&wq_pool_mutex); put_unbound_pool(pool); -- cgit v1.2.3-71-gd317 From 67f0d6d9883c13174669f88adac4f0ee656cc16a Mon Sep 17 00:00:00 2001 From: Haoran Luo Date: Wed, 21 Jul 2021 14:12:07 +0000 Subject: tracing: Fix bug in rb_per_cpu_empty() that might cause deadloop. The "rb_per_cpu_empty()" misinterpret the condition (as not-empty) when "head_page" and "commit_page" of "struct ring_buffer_per_cpu" points to the same buffer page, whose "buffer_data_page" is empty and "read" field is non-zero. An error scenario could be constructed as followed (kernel perspective): 1. All pages in the buffer has been accessed by reader(s) so that all of them will have non-zero "read" field. 2. Read and clear all buffer pages so that "rb_num_of_entries()" will return 0 rendering there's no more data to read. It is also required that the "read_page", "commit_page" and "tail_page" points to the same page, while "head_page" is the next page of them. 3. Invoke "ring_buffer_lock_reserve()" with large enough "length" so that it shot pass the end of current tail buffer page. Now the "head_page", "commit_page" and "tail_page" points to the same page. 4. Discard current event with "ring_buffer_discard_commit()", so that "head_page", "commit_page" and "tail_page" points to a page whose buffer data page is now empty. When the error scenario has been constructed, "tracing_read_pipe" will be trapped inside a deadloop: "trace_empty()" returns 0 since "rb_per_cpu_empty()" returns 0 when it hits the CPU containing such constructed ring buffer. Then "trace_find_next_entry_inc()" always return NULL since "rb_num_of_entries()" reports there's no more entry to read. Finally "trace_seq_to_user()" returns "-EBUSY" spanking "tracing_read_pipe" back to the start of the "waitagain" loop. I've also written a proof-of-concept script to construct the scenario and trigger the bug automatically, you can use it to trace and validate my reasoning above: https://github.com/aegistudio/RingBufferDetonator.git Tests has been carried out on linux kernel 5.14-rc2 (2734d6c1b1a089fb593ef6a23d4b70903526fe0c), my fixed version of kernel (for testing whether my update fixes the bug) and some older kernels (for range of affected kernels). Test result is also attached to the proof-of-concept repository. Link: https://lore.kernel.org/linux-trace-devel/YPaNxsIlb2yjSi5Y@aegistudio/ Link: https://lore.kernel.org/linux-trace-devel/YPgrN85WL9VyrZ55@aegistudio Cc: stable@vger.kernel.org Fixes: bf41a158cacba ("ring-buffer: make reentrant") Suggested-by: Linus Torvalds Signed-off-by: Haoran Luo Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d1463eac11a3..e592d1df6f88 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3880,10 +3880,30 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) if (unlikely(!head)) return true; - return reader->read == rb_page_commit(reader) && - (commit == reader || - (commit == head && - head->read == rb_page_commit(commit))); + /* Reader should exhaust content in reader page */ + if (reader->read != rb_page_commit(reader)) + return false; + + /* + * If writers are committing on the reader page, knowing all + * committed content has been read, the ring buffer is empty. + */ + if (commit == reader) + return true; + + /* + * If writers are committing on a page other than reader page + * and head page, there should always be content to read. + */ + if (commit != head) + return false; + + /* + * Writers are committing on the head page, we just need + * to care about there're committed data, and the reader will + * swap reader page with head page when it is to read data. + */ + return rb_page_commit(commit) == 0; } /** -- cgit v1.2.3-71-gd317 From 16c5900ba776c5acd6568abd60c40f948a96e496 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 Jul 2021 23:19:45 +0200 Subject: bpf: Fix pointer cast warning kp->addr is a pointer, so it cannot be cast directly to a 'u64' when it gets interpreted as an integer value: kernel/trace/bpf_trace.c: In function '____bpf_get_func_ip_kprobe': kernel/trace/bpf_trace.c:968:21: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 968 | return kp ? (u64) kp->addr : 0; Use the uintptr_t type instead. Fixes: 9ffd9f3ff719 ("bpf: Add bpf_get_func_ip helper for kprobe programs") Signed-off-by: Arnd Bergmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721212007.3876595-1-arnd@kernel.org --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08906007306d..1f22ce1fa971 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -965,7 +965,7 @@ BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); - return kp ? (u64) kp->addr : 0; + return kp ? (uintptr_t)kp->addr : 0; } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { -- cgit v1.2.3-71-gd317 From 724f17b7d45d62c71e92471666647a823cb9baa9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Jul 2021 12:56:30 +0100 Subject: bpf: Remove redundant intiialization of variable stype The variable stype is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721115630.109279-1-colin.king@canonical.com --- kernel/bpf/local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 362e81481594..7ed2a14dc0de 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -406,7 +406,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + enum bpf_cgroup_storage_type stype; struct bpf_cgroup_storage *storage; int cpu; -- cgit v1.2.3-71-gd317 From 3b13911a2fd0dd0146c9777a254840c5466cf120 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 21 Jul 2021 19:10:08 -0400 Subject: tracing: Synthetic event field_pos is an index not a boolean Performing the following: ># echo 'wakeup_lat s32 pid; u64 delta; char wake_comm[]' > synthetic_events ># echo 'hist:keys=pid:__arg__1=common_timestamp.usecs' > events/sched/sched_waking/trigger ># echo 'hist:keys=next_pid:pid=next_pid,delta=common_timestamp.usecs-$__arg__1:onmatch(sched.sched_waking).trace(wakeup_lat,$pid,$delta,prev_comm)'\ > events/sched/sched_switch/trigger ># echo 1 > events/synthetic/enable Crashed the kernel: BUG: kernel NULL pointer dereference, address: 000000000000001b #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 7 PID: 0 Comm: swapper/7 Not tainted 5.13.0-rc5-test+ #104 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:strlen+0x0/0x20 Code: f6 82 80 2b 0b bc 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2b 0b bc 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74 10 48 89 f8 48 83 c0 01 80 38 9 f8 c3 31 RSP: 0018:ffffaa75000d79d0 EFLAGS: 00010046 RAX: 0000000000000002 RBX: ffff9cdb55575270 RCX: 0000000000000000 RDX: ffff9cdb58c7a320 RSI: ffffaa75000d7b40 RDI: 000000000000001b RBP: ffffaa75000d7b40 R08: ffff9cdb40a4f010 R09: ffffaa75000d7ab8 R10: ffff9cdb4398c700 R11: 0000000000000008 R12: ffff9cdb58c7a320 R13: ffff9cdb55575270 R14: ffff9cdb58c7a000 R15: 0000000000000018 FS: 0000000000000000(0000) GS:ffff9cdb5aa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000001b CR3: 00000000c0612006 CR4: 00000000001706e0 Call Trace: trace_event_raw_event_synth+0x90/0x1d0 action_trace+0x5b/0x70 event_hist_trigger+0x4bd/0x4e0 ? cpumask_next_and+0x20/0x30 ? update_sd_lb_stats.constprop.0+0xf6/0x840 ? __lock_acquire.constprop.0+0x125/0x550 ? find_held_lock+0x32/0x90 ? sched_clock_cpu+0xe/0xd0 ? lock_release+0x155/0x440 ? update_load_avg+0x8c/0x6f0 ? enqueue_entity+0x18a/0x920 ? __rb_reserve_next+0xe5/0x460 ? ring_buffer_lock_reserve+0x12a/0x3f0 event_triggers_call+0x52/0xe0 trace_event_buffer_commit+0x1ae/0x240 trace_event_raw_event_sched_switch+0x114/0x170 __traceiter_sched_switch+0x39/0x50 __schedule+0x431/0xb00 schedule_idle+0x28/0x40 do_idle+0x198/0x2e0 cpu_startup_entry+0x19/0x20 secondary_startup_64_no_verify+0xc2/0xcb The reason is that the dynamic events array keeps track of the field position of the fields array, via the field_pos variable in the synth_field structure. Unfortunately, that field is a boolean for some reason, which means any field_pos greater than 1 will be a bug (in this case it was 2). Link: https://lkml.kernel.org/r/20210721191008.638bce34@oasis.local.home Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Ingo Molnar Cc: Andrew Morton Cc: stable@vger.kernel.org Fixes: bd82631d7ccdc ("tracing: Add support for dynamic strings to synthetic events") Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_synth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index 6e146b959dcd..4007fe95cf42 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -14,10 +14,10 @@ struct synth_field { char *name; size_t size; unsigned int offset; + unsigned int field_pos; bool is_signed; bool is_string; bool is_dynamic; - bool field_pos; }; struct synth_event { -- cgit v1.2.3-71-gd317 From 1e3bac71c5053c99d438771fc9fa5082ae5d90aa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 21 Jul 2021 11:00:53 -0400 Subject: tracing/histogram: Rename "cpu" to "common_cpu" Currently the histogram logic allows the user to write "cpu" in as an event field, and it will record the CPU that the event happened on. The problem with this is that there's a lot of events that have "cpu" as a real field, and using "cpu" as the CPU it ran on, makes it impossible to run histograms on the "cpu" field of events. For example, if I want to have a histogram on the count of the workqueue_queue_work event on its cpu field, running: ># echo 'hist:keys=cpu' > events/workqueue/workqueue_queue_work/trigger Gives a misleading and wrong result. Change the command to "common_cpu" as no event should have "common_*" fields as that's a reserved name for fields used by all events. And this makes sense here as common_cpu would be a field used by all events. Now we can even do: ># echo 'hist:keys=common_cpu,cpu if cpu < 100' > events/workqueue/workqueue_queue_work/trigger ># cat events/workqueue/workqueue_queue_work/hist # event histogram # # trigger info: hist:keys=common_cpu,cpu:vals=hitcount:sort=hitcount:size=2048 if cpu < 100 [active] # { common_cpu: 0, cpu: 2 } hitcount: 1 { common_cpu: 0, cpu: 4 } hitcount: 1 { common_cpu: 7, cpu: 7 } hitcount: 1 { common_cpu: 0, cpu: 7 } hitcount: 1 { common_cpu: 0, cpu: 1 } hitcount: 1 { common_cpu: 0, cpu: 6 } hitcount: 2 { common_cpu: 0, cpu: 5 } hitcount: 2 { common_cpu: 1, cpu: 1 } hitcount: 4 { common_cpu: 6, cpu: 6 } hitcount: 4 { common_cpu: 5, cpu: 5 } hitcount: 14 { common_cpu: 4, cpu: 4 } hitcount: 26 { common_cpu: 0, cpu: 0 } hitcount: 39 { common_cpu: 2, cpu: 2 } hitcount: 184 Now for backward compatibility, I added a trick. If "cpu" is used, and the field is not found, it will fall back to "common_cpu" and work as it did before. This way, it will still work for old programs that use "cpu" to get the actual CPU, but if the event has a "cpu" as a field, it will get that event's "cpu" field, which is probably what it wants anyway. I updated the tracefs/README to include documentation about both the common_timestamp and the common_cpu. This way, if that text is present in the README, then an application can know that common_cpu is supported over just plain "cpu". Link: https://lkml.kernel.org/r/20210721110053.26b4f641@oasis.local.home Cc: Namhyung Kim Cc: Ingo Molnar Cc: Andrew Morton Cc: stable@vger.kernel.org Fixes: 8b7622bf94a44 ("tracing: Add cpu field for hist triggers") Reviewed-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.rst | 2 +- kernel/trace/trace.c | 4 ++++ kernel/trace/trace_events_hist.c | 22 ++++++++++++++++------ 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index b71e09f745c3..f99be8062bc8 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -191,7 +191,7 @@ Documentation written by Tom Zanussi with the event, in nanoseconds. May be modified by .usecs to have timestamps interpreted as microseconds. - cpu int the cpu on which the event occurred. + common_cpu int the cpu on which the event occurred. ====================== ==== ======================================= Extended error information diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8b80b5bab71..c59dd35a6da5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5609,6 +5609,10 @@ static const char readme_msg[] = "\t [:name=histname1]\n" "\t [:.]\n" "\t [if ]\n\n" + "\t Note, special fields can be used as well:\n" + "\t common_timestamp - to record current timestamp\n" + "\t common_cpu - to record the CPU the event happened on\n" + "\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 16a9dfc9fffc..34325f41ebc0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1111,7 +1111,7 @@ static const char *hist_field_name(struct hist_field *field, field->flags & HIST_FIELD_FL_ALIAS) field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) - field_name = "cpu"; + field_name = "common_cpu"; else if (field->flags & HIST_FIELD_FL_EXPR || field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { @@ -1991,14 +1991,24 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; - } else if (strcmp(field_name, "cpu") == 0) + } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); - field = ERR_PTR(-EINVAL); - goto out; + /* + * For backward compatibility, if field_name + * was "cpu", then we treat this the same as + * common_cpu. + */ + if (strcmp(field_name, "cpu") == 0) { + *flags |= HIST_FIELD_FL_CPU; + } else { + hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, + errpos(field_name)); + field = ERR_PTR(-EINVAL); + goto out; + } } } out: @@ -5085,7 +5095,7 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, "%s=", hist_field->var.name); if (hist_field->flags & HIST_FIELD_FL_CPU) - seq_puts(m, "cpu"); + seq_puts(m, "common_cpu"); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) -- cgit v1.2.3-71-gd317 From 9528c19507dc9bc3d6cd96f4611d7cb80c5afcde Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 21 Jul 2021 19:53:41 -0400 Subject: tracing: Clean up alloc_synth_event() alloc_synth_event() currently has the following code to initialize the event fields and dynamic_fields: for (i = 0, j = 0; i < n_fields; i++) { event->fields[i] = fields[i]; if (fields[i]->is_dynamic) { event->dynamic_fields[j] = fields[i]; event->dynamic_fields[j]->field_pos = i; event->dynamic_fields[j++] = fields[i]; event->n_dynamic_fields++; } } 1) It would make more sense to have all fields keep track of their field_pos. 2) event->dynmaic_fields[j] is assigned twice for no reason. 3) We can move updating event->n_dynamic_fields outside the loop, and just assign it to j. This combination makes the code much cleaner. Link: https://lkml.kernel.org/r/20210721195341.29bb0f77@oasis.local.home Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 2ac75eb6aa86..9315fc03e303 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -893,15 +893,13 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, dyn_event_init(&event->devent, &synth_event_ops); for (i = 0, j = 0; i < n_fields; i++) { + fields[i]->field_pos = i; event->fields[i] = fields[i]; - if (fields[i]->is_dynamic) { - event->dynamic_fields[j] = fields[i]; - event->dynamic_fields[j]->field_pos = i; + if (fields[i]->is_dynamic) event->dynamic_fields[j++] = fields[i]; - event->n_dynamic_fields++; - } } + event->n_dynamic_fields = j; event->n_fields = n_fields; out: return event; -- cgit v1.2.3-71-gd317 From 68e83498cb4fad31963b5c76a71e80b824bc316e Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 21 Jul 2021 13:47:26 +0200 Subject: ftrace: Avoid synchronize_rcu_tasks_rude() call when not necessary synchronize_rcu_tasks_rude() triggers IPIs and forces rescheduling on all CPUs. It is a costly operation and, when targeting nohz_full CPUs, very disrupting (hence the name). So avoid calling it when 'old_hash' doesn't need to be freed. Link: https://lkml.kernel.org/r/20210721114726.1545103-1-nsaenzju@redhat.com Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e6fb3e6e1ffc..4fbcf560dd03 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5985,7 +5985,8 @@ ftrace_graph_release(struct inode *inode, struct file *file) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - synchronize_rcu_tasks_rude(); + if (old_hash != EMPTY_HASH) + synchronize_rcu_tasks_rude(); free_ftrace_hash(old_hash); } -- cgit v1.2.3-71-gd317 From 3b1a8f457fcf105924c72e99f1191834837c978d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Jul 2021 13:09:15 +0100 Subject: ftrace: Remove redundant initialization of variable ret The variable ret is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Link: https://lkml.kernel.org/r/20210721120915.122278-1-colin.king@canonical.com Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4fbcf560dd03..7b180f61e6d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7545,7 +7545,7 @@ int ftrace_is_dead(void) */ int register_ftrace_function(struct ftrace_ops *ops) { - int ret = -1; + int ret; ftrace_ops_init(ops); -- cgit v1.2.3-71-gd317 From 352384d5c84ebe40fa77098cc234fe173247d8ef Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jul 2021 21:52:18 -0400 Subject: tracepoints: Update static_call before tp_funcs when adding a tracepoint Because of the significant overhead that retpolines pose on indirect calls, the tracepoint code was updated to use the new "static_calls" that can modify the running code to directly call a function instead of using an indirect caller, and this function can be changed at runtime. In the tracepoint code that calls all the registered callbacks that are attached to a tracepoint, the following is done: it_func_ptr = rcu_dereference_raw((&__tracepoint_##name)->funcs); if (it_func_ptr) { __data = (it_func_ptr)->data; static_call(tp_func_##name)(__data, args); } If there's just a single callback, the static_call is updated to just call that callback directly. Once another handler is added, then the static caller is updated to call the iterator, that simply loops over all the funcs in the array and calls each of the callbacks like the old method using indirect calling. The issue was discovered with a race between updating the funcs array and updating the static_call. The funcs array was updated first and then the static_call was updated. This is not an issue as long as the first element in the old array is the same as the first element in the new array. But that assumption is incorrect, because callbacks also have a priority field, and if there's a callback added that has a higher priority than the callback on the old array, then it will become the first callback in the new array. This means that it is possible to call the old callback with the new callback data element, which can cause a kernel panic. static_call = callback1() funcs[] = {callback1,data1}; callback2 has higher priority than callback1 CPU 1 CPU 2 ----- ----- new_funcs = {callback2,data2}, {callback1,data1} rcu_assign_pointer(tp->funcs, new_funcs); /* * Now tp->funcs has the new array * but the static_call still calls callback1 */ it_func_ptr = tp->funcs [ new_funcs ] data = it_func_ptr->data [ data2 ] static_call(callback1, data); /* Now callback1 is called with * callback2's data */ [ KERNEL PANIC ] update_static_call(iterator); To prevent this from happening, always switch the static_call to the iterator before assigning the tp->funcs to the new array. The iterator will always properly match the callback with its data. To trigger this bug: In one terminal: while :; do hackbench 50; done In another terminal echo 1 > /sys/kernel/tracing/events/sched/sched_waking/enable while :; do echo 1 > /sys/kernel/tracing/set_event_pid; sleep 0.5 echo 0 > /sys/kernel/tracing/set_event_pid; sleep 0.5 done And it doesn't take long to crash. This is because the set_event_pid adds a callback to the sched_waking tracepoint with a high priority, which will be called before the sched_waking trace event callback is called. Note, the removal to a single callback updates the array first, before changing the static_call to single callback, which is the proper order as the first element in the array is the same as what the static_call is being changed to. Link: https://lore.kernel.org/io-uring/4ebea8f0-58c9-e571-fd30-0ce4f6f09c70@samba.org/ Cc: stable@vger.kernel.org Fixes: d25e37d89dd2f ("tracepoint: Optimize using static_call()") Reported-by: Stefan Metzmacher tested-by: Stefan Metzmacher Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 976bf8ce8039..fc32821f8240 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -299,8 +299,8 @@ static int tracepoint_add_func(struct tracepoint *tp, * a pointer to it. This array is referenced by __DO_TRACE from * include/linux/tracepoint.h using rcu_dereference_sched(). */ - rcu_assign_pointer(tp->funcs, tp_funcs); tracepoint_update_call(tp, tp_funcs, false); + rcu_assign_pointer(tp->funcs, tp_funcs); static_key_enable(&tp->key); release_probes(old); -- cgit v1.2.3-71-gd317 From 2c9f7eaf08659fa23d25b93a446f74306b3abea8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 May 2021 13:38:19 -0500 Subject: signal/sparc: si_trapno is only used with SIGILL ILL_ILLTRP While reviewing the signal handlers on sparc it became clear that si_trapno is only set to a non-zero value when sending SIGILL with si_code ILL_ILLTRP. Add force_sig_fault_trapno and send SIGILL ILL_ILLTRP with it. Remove the define of __ARCH_SI_TRAPNO and remove the always zero si_trapno parameter from send_sig_fault and force_sig_fault. v1: https://lkml.kernel.org/r/m1eeers7q7.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-7-ebiederm@xmission.com Link: https://lkml.kernel.org/r/87mtqnxx89.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- arch/sparc/include/uapi/asm/siginfo.h | 3 --- arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/sys_sparc_32.c | 2 +- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/sparc/kernel/traps_32.c | 22 +++++++++--------- arch/sparc/kernel/traps_64.c | 44 +++++++++++++++-------------------- arch/sparc/kernel/unaligned_32.c | 2 +- arch/sparc/mm/fault_32.c | 2 +- arch/sparc/mm/fault_64.c | 2 +- include/linux/sched/signal.h | 1 + kernel/signal.c | 19 +++++++++++++++ 11 files changed, 56 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/arch/sparc/include/uapi/asm/siginfo.h b/arch/sparc/include/uapi/asm/siginfo.h index 68bdde4c2a2e..0e7c27522aed 100644 --- a/arch/sparc/include/uapi/asm/siginfo.h +++ b/arch/sparc/include/uapi/asm/siginfo.h @@ -8,9 +8,6 @@ #endif /* defined(__sparc__) && defined(__arch64__) */ - -#define __ARCH_SI_TRAPNO - #include diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index d33c58a58d4f..547b06b49ce3 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -518,7 +518,7 @@ void synchronize_user_stack(void) static void stack_unaligned(unsigned long sp) { - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) sp, 0); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) sp); } static const char uwfault32[] = KERN_INFO \ diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index be77538bc038..082a551897ed 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -151,7 +151,7 @@ sparc_breakpoint (struct pt_regs *regs) #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Entering kernel PC=%x, nPC=%x\n", regs->pc, regs->npc); #endif - force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0); + force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc); #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Returning to space: PC=%x nPC=%x\n", regs->pc, regs->npc); diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 6b92fadb6ec7..1e9a9e016237 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -514,7 +514,7 @@ asmlinkage void sparc_breakpoint(struct pt_regs *regs) #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Entering kernel PC=%lx, nPC=%lx\n", regs->tpc, regs->tnpc); #endif - force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->tpc, 0); + force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->tpc); #ifdef DEBUG_SPARC_BREAKPOINT printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc); #endif diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index 247a0d9683b2..5630e5a395e0 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -102,8 +102,8 @@ void do_hw_interrupt(struct pt_regs *regs, unsigned long type) if(regs->psr & PSR_PS) die_if_kernel("Kernel bad trap", regs); - force_sig_fault(SIGILL, ILL_ILLTRP, - (void __user *)regs->pc, type - 0x80); + force_sig_fault_trapno(SIGILL, ILL_ILLTRP, + (void __user *)regs->pc, type - 0x80); } void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc, @@ -116,7 +116,7 @@ void do_illegal_instruction(struct pt_regs *regs, unsigned long pc, unsigned lon regs->pc, *(unsigned long *)regs->pc); #endif - send_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0, current); + send_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, current); } void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long npc, @@ -124,7 +124,7 @@ void do_priv_instruction(struct pt_regs *regs, unsigned long pc, unsigned long n { if(psr & PSR_PS) die_if_kernel("Penguin instruction from Penguin mode??!?!", regs); - send_sig_fault(SIGILL, ILL_PRVOPC, (void __user *)pc, 0, current); + send_sig_fault(SIGILL, ILL_PRVOPC, (void __user *)pc, current); } /* XXX User may want to be allowed to do this. XXX */ @@ -145,7 +145,7 @@ void do_memaccess_unaligned(struct pt_regs *regs, unsigned long pc, unsigned lon #endif send_sig_fault(SIGBUS, BUS_ADRALN, /* FIXME: Should dig out mna address */ (void *)0, - 0, current); + current); } static unsigned long init_fsr = 0x0UL; @@ -291,7 +291,7 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc, else if (fsr & 0x01) code = FPE_FLTRES; } - send_sig_fault(SIGFPE, code, (void __user *)pc, 0, fpt); + send_sig_fault(SIGFPE, code, (void __user *)pc, fpt); #ifndef CONFIG_SMP last_task_used_math = NULL; #endif @@ -305,7 +305,7 @@ void handle_tag_overflow(struct pt_regs *regs, unsigned long pc, unsigned long n { if(psr & PSR_PS) die_if_kernel("Penguin overflow trap from kernel mode", regs); - send_sig_fault(SIGEMT, EMT_TAGOVF, (void __user *)pc, 0, current); + send_sig_fault(SIGEMT, EMT_TAGOVF, (void __user *)pc, current); } void handle_watchpoint(struct pt_regs *regs, unsigned long pc, unsigned long npc, @@ -327,13 +327,13 @@ void handle_reg_access(struct pt_regs *regs, unsigned long pc, unsigned long npc printk("Register Access Exception at PC %08lx NPC %08lx PSR %08lx\n", pc, npc, psr); #endif - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)pc, 0); + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)pc); } void handle_cp_disabled(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { - send_sig_fault(SIGILL, ILL_COPROC, (void __user *)pc, 0, current); + send_sig_fault(SIGILL, ILL_COPROC, (void __user *)pc, current); } void handle_cp_exception(struct pt_regs *regs, unsigned long pc, unsigned long npc, @@ -343,13 +343,13 @@ void handle_cp_exception(struct pt_regs *regs, unsigned long pc, unsigned long n printk("Co-Processor Exception at PC %08lx NPC %08lx PSR %08lx\n", pc, npc, psr); #endif - send_sig_fault(SIGILL, ILL_COPROC, (void __user *)pc, 0, current); + send_sig_fault(SIGILL, ILL_COPROC, (void __user *)pc, current); } void handle_hw_divzero(struct pt_regs *regs, unsigned long pc, unsigned long npc, unsigned long psr) { - send_sig_fault(SIGFPE, FPE_INTDIV, (void __user *)pc, 0, current); + send_sig_fault(SIGFPE, FPE_INTDIV, (void __user *)pc, current); } #ifdef CONFIG_DEBUG_BUGVERBOSE diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index a850dccd78ea..6863025ed56d 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -107,8 +107,8 @@ void bad_trap(struct pt_regs *regs, long lvl) regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGILL, ILL_ILLTRP, - (void __user *)regs->tpc, lvl); + force_sig_fault_trapno(SIGILL, ILL_ILLTRP, + (void __user *)regs->tpc, lvl); } void bad_trap_tl1(struct pt_regs *regs, long lvl) @@ -201,8 +201,7 @@ void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, un regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGSEGV, SEGV_MAPERR, - (void __user *)regs->tpc, 0); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)regs->tpc); out: exception_exit(prev_state); } @@ -237,7 +236,7 @@ void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsig regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *) addr, 0); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *) addr); } void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx) @@ -321,7 +320,7 @@ void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, un if (is_no_fault_exception(regs)) return; - force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)sfar, 0); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)sfar); out: exception_exit(prev_state); } @@ -385,13 +384,13 @@ void sun4v_data_access_exception(struct pt_regs *regs, unsigned long addr, unsig */ switch (type) { case HV_FAULT_TYPE_INV_ASI: - force_sig_fault(SIGILL, ILL_ILLADR, (void __user *)addr, 0); + force_sig_fault(SIGILL, ILL_ILLADR, (void __user *)addr); break; case HV_FAULT_TYPE_MCD_DIS: - force_sig_fault(SIGSEGV, SEGV_ACCADI, (void __user *)addr, 0); + force_sig_fault(SIGSEGV, SEGV_ACCADI, (void __user *)addr); break; default: - force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)addr, 0); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)addr); break; } } @@ -568,7 +567,7 @@ static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned lon regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGBUS, BUS_OBJERR, (void *)0, 0); + force_sig_fault(SIGBUS, BUS_OBJERR, (void *)0); } void spitfire_access_error(struct pt_regs *regs, unsigned long status_encoded, unsigned long afar) @@ -2069,8 +2068,7 @@ void do_mcd_err(struct pt_regs *regs, struct sun4v_error_entry ent) /* Send SIGSEGV to the userspace process with the right signal * code */ - force_sig_fault(SIGSEGV, SEGV_ADIDERR, (void __user *)ent.err_raddr, - 0); + force_sig_fault(SIGSEGV, SEGV_ADIDERR, (void __user *)ent.err_raddr); } /* We run with %pil set to PIL_NORMAL_MAX and PSTATE_IE enabled in %pstate. @@ -2184,7 +2182,7 @@ bool sun4v_nonresum_error_user_handled(struct pt_regs *regs, } if (attrs & SUN4V_ERR_ATTRS_PIO) { force_sig_fault(SIGBUS, BUS_ADRERR, - (void __user *)sun4v_get_vaddr(regs), 0); + (void __user *)sun4v_get_vaddr(regs)); return true; } @@ -2340,8 +2338,7 @@ static void do_fpe_common(struct pt_regs *regs) else if (fsr & 0x01) code = FPE_FLTRES; } - force_sig_fault(SIGFPE, code, - (void __user *)regs->tpc, 0); + force_sig_fault(SIGFPE, code, (void __user *)regs->tpc); } } @@ -2395,8 +2392,7 @@ void do_tof(struct pt_regs *regs) regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGEMT, EMT_TAGOVF, - (void __user *)regs->tpc, 0); + force_sig_fault(SIGEMT, EMT_TAGOVF, (void __user *)regs->tpc); out: exception_exit(prev_state); } @@ -2415,8 +2411,7 @@ void do_div0(struct pt_regs *regs) regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGFPE, FPE_INTDIV, - (void __user *)regs->tpc, 0); + force_sig_fault(SIGFPE, FPE_INTDIV, (void __user *)regs->tpc); out: exception_exit(prev_state); } @@ -2612,7 +2607,7 @@ void do_illegal_instruction(struct pt_regs *regs) } } } - force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc, 0); + force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)pc); out: exception_exit(prev_state); } @@ -2632,7 +2627,7 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo if (is_no_fault_exception(regs)) return; - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)sfar, 0); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)sfar); out: exception_exit(prev_state); } @@ -2650,7 +2645,7 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c if (is_no_fault_exception(regs)) return; - force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) addr, 0); + force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) addr); } /* sun4v_mem_corrupt_detect_precise() - Handle precise exception on an ADI @@ -2697,7 +2692,7 @@ void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned long addr, regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGSEGV, SEGV_ADIPERR, (void __user *)addr, 0); + force_sig_fault(SIGSEGV, SEGV_ADIPERR, (void __user *)addr); } void do_privop(struct pt_regs *regs) @@ -2712,8 +2707,7 @@ void do_privop(struct pt_regs *regs) regs->tpc &= 0xffffffff; regs->tnpc &= 0xffffffff; } - force_sig_fault(SIGILL, ILL_PRVOPC, - (void __user *)regs->tpc, 0); + force_sig_fault(SIGILL, ILL_PRVOPC, (void __user *)regs->tpc); out: exception_exit(prev_state); } diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index ef5c5207c9ff..455f0258c745 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -278,5 +278,5 @@ asmlinkage void user_unaligned_trap(struct pt_regs *regs, unsigned int insn) { send_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)safe_compute_effective_address(regs, insn), - 0, current); + current); } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index de2031c2b2d7..fa858626b85b 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -83,7 +83,7 @@ static void __do_fault_siginfo(int code, int sig, struct pt_regs *regs, show_signal_msg(regs, sig, code, addr, current); - force_sig_fault(sig, code, (void __user *) addr, 0); + force_sig_fault(sig, code, (void __user *) addr); } static unsigned long compute_si_addr(struct pt_regs *regs, int text_fault) diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 0a6bcc85fba7..9a9652a15fed 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -176,7 +176,7 @@ static void do_fault_siginfo(int code, int sig, struct pt_regs *regs, if (unlikely(show_unhandled_signals)) show_signal_msg(regs, sig, code, addr, current); - force_sig_fault(sig, code, (void __user *) addr, 0); + force_sig_fault(sig, code, (void __user *) addr); } static unsigned int get_fault_insn(struct pt_regs *regs, unsigned int insn) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b9126fe06c3f..99a9ab2b169a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -329,6 +329,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey); int force_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); +int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..87a374225277 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1808,6 +1808,22 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr) return force_sig_info(&info); } +/* For the rare architectures that include trap information using + * si_trapno. + */ +int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_trapno = trapno; + return force_sig_info(&info); +} + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; @@ -3243,6 +3259,9 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_PERF_EVENT; + else if (IS_ENABLED(CONFIG_SPARC) && + (sig == SIGILL) && (si_code == ILL_ILLTRP)) + layout = SIL_FAULT_TRAPNO; #ifdef __ARCH_SI_TRAPNO else if (layout == SIL_FAULT) layout = SIL_FAULT_TRAPNO; -- cgit v1.2.3-71-gd317 From 7de5f68d497cbc700c4a28cc037dd61f00e452e8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 May 2021 14:15:51 -0500 Subject: signal/alpha: si_trapno is only used with SIGFPE and SIGTRAP TRAP_UNK While reviewing the signal handlers on alpha it became clear that si_trapno is only set to a non-zero value when sending SIGFPE and when sending SITGRAP with si_code TRAP_UNK. Add send_sig_fault_trapno and send SIGTRAP TRAP_UNK, and SIGFPE with it. Remove the define of __ARCH_SI_TRAPNO and remove the always zero si_trapno parameter from send_sig_fault and force_sig_fault. v1: https://lkml.kernel.org/r/m1eeers7q7.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-7-ebiederm@xmission.com Link: https://lkml.kernel.org/r/87h7gvxx7l.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- arch/alpha/include/uapi/asm/siginfo.h | 2 -- arch/alpha/kernel/osf_sys.c | 2 +- arch/alpha/kernel/signal.c | 4 ++-- arch/alpha/kernel/traps.c | 26 +++++++++++++------------- arch/alpha/mm/fault.c | 4 ++-- include/linux/sched/signal.h | 2 ++ kernel/signal.c | 21 +++++++++++++++++++++ 7 files changed, 41 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/uapi/asm/siginfo.h b/arch/alpha/include/uapi/asm/siginfo.h index 6e1a2af2f962..e08eae88182b 100644 --- a/arch/alpha/include/uapi/asm/siginfo.h +++ b/arch/alpha/include/uapi/asm/siginfo.h @@ -2,8 +2,6 @@ #ifndef _ALPHA_SIGINFO_H #define _ALPHA_SIGINFO_H -#define __ARCH_SI_TRAPNO - #include #endif diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index d5367a1c6300..bbdb1a9a5fd8 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -876,7 +876,7 @@ SYSCALL_DEFINE5(osf_setsysinfo, unsigned long, op, void __user *, buffer, if (fex & IEEE_TRAP_ENABLE_DZE) si_code = FPE_FLTDIV; if (fex & IEEE_TRAP_ENABLE_INV) si_code = FPE_FLTINV; - send_sig_fault(SIGFPE, si_code, + send_sig_fault_trapno(SIGFPE, si_code, (void __user *)NULL, /* FIXME */ 0, current); } diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 948b89789da8..bc077babafab 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -219,7 +219,7 @@ do_sigreturn(struct sigcontext __user *sc) /* Send SIGTRAP if we're single-stepping: */ if (ptrace_cancel_bpt (current)) { - send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *) regs->pc, 0, + send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *) regs->pc, current); } return; @@ -247,7 +247,7 @@ do_rt_sigreturn(struct rt_sigframe __user *frame) /* Send SIGTRAP if we're single-stepping: */ if (ptrace_cancel_bpt (current)) { - send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *) regs->pc, 0, + send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *) regs->pc, current); } return; diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 921d4b6e4d95..e9e3de18793b 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -227,7 +227,7 @@ do_entArith(unsigned long summary, unsigned long write_mask, } die_if_kernel("Arithmetic fault", regs, 0, NULL); - send_sig_fault(SIGFPE, si_code, (void __user *) regs->pc, 0, current); + send_sig_fault_trapno(SIGFPE, si_code, (void __user *) regs->pc, 0, current); } asmlinkage void @@ -268,13 +268,13 @@ do_entIF(unsigned long type, struct pt_regs *regs) regs->pc -= 4; /* make pc point to former bpt */ } - send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, 0, + send_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc, current); return; case 1: /* bugcheck */ - send_sig_fault(SIGTRAP, TRAP_UNK, (void __user *) regs->pc, 0, - current); + send_sig_fault_trapno(SIGTRAP, TRAP_UNK, + (void __user *) regs->pc, 0, current); return; case 2: /* gentrap */ @@ -335,8 +335,8 @@ do_entIF(unsigned long type, struct pt_regs *regs) break; } - send_sig_fault(signo, code, (void __user *) regs->pc, regs->r16, - current); + send_sig_fault_trapno(signo, code, (void __user *) regs->pc, + regs->r16, current); return; case 4: /* opDEC */ @@ -360,9 +360,9 @@ do_entIF(unsigned long type, struct pt_regs *regs) if (si_code == 0) return; if (si_code > 0) { - send_sig_fault(SIGFPE, si_code, - (void __user *) regs->pc, 0, - current); + send_sig_fault_trapno(SIGFPE, si_code, + (void __user *) regs->pc, + 0, current); return; } } @@ -387,7 +387,7 @@ do_entIF(unsigned long type, struct pt_regs *regs) ; } - send_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0, current); + send_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, current); } /* There is an ifdef in the PALcode in MILO that enables a @@ -402,7 +402,7 @@ do_entDbg(struct pt_regs *regs) { die_if_kernel("Instruction fault", regs, 0, NULL); - force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc, 0); + force_sig_fault(SIGILL, ILL_ILLOPC, (void __user *)regs->pc); } @@ -964,12 +964,12 @@ give_sigsegv: si_code = SEGV_MAPERR; mmap_read_unlock(mm); } - send_sig_fault(SIGSEGV, si_code, va, 0, current); + send_sig_fault(SIGSEGV, si_code, va, current); return; give_sigbus: regs->pc -= 4; - send_sig_fault(SIGBUS, BUS_ADRALN, va, 0, current); + send_sig_fault(SIGBUS, BUS_ADRALN, va, current); return; } diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 09172f017efc..eee5102c3d88 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -219,13 +219,13 @@ retry: mmap_read_unlock(mm); /* Send a sigbus, regardless of whether we were in kernel or user mode. */ - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address); if (!user_mode(regs)) goto no_context; return; do_sigsegv: - force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0); + force_sig_fault(SIGSEGV, si_code, (void __user *) address); return; #ifdef CONFIG_ALPHA_LARGE_VMALLOC diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 99a9ab2b169a..6657184cef07 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -330,6 +330,8 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); +int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, + struct task_struct *t); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); diff --git a/kernel/signal.c b/kernel/signal.c index 87a374225277..ae06a424aa72 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1824,6 +1824,23 @@ int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) return force_sig_info(&info); } +/* For the rare architectures that include trap information using + * si_trapno. + */ +int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, + struct task_struct *t) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_trapno = trapno; + return send_sig_info(info.si_signo, &info, t); +} + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; @@ -3262,6 +3279,10 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; + else if (IS_ENABLED(CONFIG_ALPHA) && + ((sig == SIGFPE) || + ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) + layout = SIL_FAULT_TRAPNO; #ifdef __ARCH_SI_TRAPNO else if (layout == SIL_FAULT) layout = SIL_FAULT_TRAPNO; -- cgit v1.2.3-71-gd317 From c7fff9288dce1ee5fa9de8d656e09cc8e0e3281b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Apr 2021 17:53:38 -0500 Subject: signal: Remove the generic __ARCH_SI_TRAPNO support Now that __ARCH_SI_TRAPNO is no longer set by any architecture remove all of the code it enabled from the kernel. On alpha and sparc a more explict approach of using send_sig_fault_trapno or force_sig_fault_trapno in the very limited circumstances where si_trapno was set to a non-zero value. The generic support that is being removed always set si_trapno on all fault signals. With only SIGILL ILL_ILLTRAP on sparc and SIGFPE and SIGTRAP TRAP_UNK on alpla providing si_trapno values asking all senders of fault signals to provide an si_trapno value does not make sense. Making si_trapno an ordinary extension of the fault siginfo layout has enabled the architecture generic implementation of SIGTRAP TRAP_PERF, and enables other faulting signals to grow architecture generic senders as well. v1: https://lkml.kernel.org/r/m18s4zs7nu.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-8-ebiederm@xmission.com Link: https://lkml.kernel.org/r/87bl73xx6x.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- arch/mips/include/uapi/asm/siginfo.h | 2 -- include/linux/sched/signal.h | 8 -------- kernel/signal.c | 14 -------------- 3 files changed, 24 deletions(-) (limited to 'kernel') diff --git a/arch/mips/include/uapi/asm/siginfo.h b/arch/mips/include/uapi/asm/siginfo.h index c34c7eef0a1c..8cb8bd061a68 100644 --- a/arch/mips/include/uapi/asm/siginfo.h +++ b/arch/mips/include/uapi/asm/siginfo.h @@ -10,9 +10,7 @@ #ifndef _UAPI_ASM_SIGINFO_H #define _UAPI_ASM_SIGINFO_H - #define __ARCH_SIGEV_PREAMBLE_SIZE (sizeof(long) + 2*sizeof(int)) -#undef __ARCH_SI_TRAPNO /* exception code needs to fill this ... */ #define __ARCH_HAS_SWAPPED_SIGINFO diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 6657184cef07..928e0025d358 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -298,11 +298,6 @@ static inline void kernel_signal_stop(void) schedule(); } -#ifdef __ARCH_SI_TRAPNO -# define ___ARCH_SI_TRAPNO(_a1) , _a1 -#else -# define ___ARCH_SI_TRAPNO(_a1) -#endif #ifdef __ia64__ # define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3 #else @@ -310,14 +305,11 @@ static inline void kernel_signal_stop(void) #endif int force_sig_fault_to_task(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)); int send_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); diff --git a/kernel/signal.c b/kernel/signal.c index ae06a424aa72..2181423e562a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1666,7 +1666,6 @@ void force_sigsegv(int sig) } int force_sig_fault_to_task(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) { @@ -1677,9 +1676,6 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_errno = 0; info.si_code = code; info.si_addr = addr; -#ifdef __ARCH_SI_TRAPNO - info.si_trapno = trapno; -#endif #ifdef __ia64__ info.si_imm = imm; info.si_flags = flags; @@ -1689,16 +1685,13 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr } int force_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)) { return force_sig_fault_to_task(sig, code, addr - ___ARCH_SI_TRAPNO(trapno) ___ARCH_SI_IA64(imm, flags, isr), current); } int send_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) { @@ -1709,9 +1702,6 @@ int send_sig_fault(int sig, int code, void __user *addr info.si_errno = 0; info.si_code = code; info.si_addr = addr; -#ifdef __ARCH_SI_TRAPNO - info.si_trapno = trapno; -#endif #ifdef __ia64__ info.si_imm = imm; info.si_flags = flags; @@ -3283,10 +3273,6 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) ((sig == SIGFPE) || ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) layout = SIL_FAULT_TRAPNO; -#ifdef __ARCH_SI_TRAPNO - else if (layout == SIL_FAULT) - layout = SIL_FAULT_TRAPNO; -#endif } else if (si_code <= NSIGPOLL) layout = SIL_POLL; -- cgit v1.2.3-71-gd317 From f4ac73023449e6f2f74f69e38f4840c83edfa840 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Apr 2021 17:58:56 -0500 Subject: signal: Rename SIL_PERF_EVENT SIL_FAULT_PERF_EVENT for consistency It helps to know which part of the siginfo structure the siginfo_layout value is talking about. v1: https://lkml.kernel.org/r/m18s4zs7nu.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-9-ebiederm@xmission.com Link: https://lkml.kernel.org/r/87zgumw8cc.fsf_-_@disp2133 Acked-by: Marco Elver Signed-off-by: Eric W. Biederman --- fs/signalfd.c | 4 ++-- include/linux/signal.h | 2 +- kernel/signal.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/signalfd.c b/fs/signalfd.c index 167b5889db4b..040e1cf90528 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -114,10 +114,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, break; case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: /* * Fall through to the SIL_FAULT case. SIL_FAULT_BNDERR, - * SIL_FAULT_PKUERR, and SIL_PERF_EVENT are only + * SIL_FAULT_PKUERR, and SIL_FAULT_PERF_EVENT are only * generated by faults that deliver them synchronously to * userspace. In case someone injects one of these signals * and signalfd catches it treat it as SIL_FAULT. diff --git a/include/linux/signal.h b/include/linux/signal.h index 3454c7ff0778..3f96a6374e4f 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -44,7 +44,7 @@ enum siginfo_layout { SIL_FAULT_MCEERR, SIL_FAULT_BNDERR, SIL_FAULT_PKUERR, - SIL_PERF_EVENT, + SIL_FAULT_PERF_EVENT, SIL_CHLD, SIL_RT, SIL_SYS, diff --git a/kernel/signal.c b/kernel/signal.c index 2181423e562a..332b21f2fe72 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1213,7 +1213,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: case SIL_SYS: ret = false; break; @@ -2580,7 +2580,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig) case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: ksig->info.si_addr = arch_untagged_si_addr( ksig->info.si_addr, ksig->sig, ksig->info.si_code); break; @@ -3265,7 +3265,7 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) layout = SIL_FAULT_PKUERR; #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) - layout = SIL_PERF_EVENT; + layout = SIL_FAULT_PERF_EVENT; else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; @@ -3394,7 +3394,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, to->si_addr = ptr_to_compat(from->si_addr); to->si_pkey = from->si_pkey; break; - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; @@ -3471,7 +3471,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, to->si_addr = compat_ptr(from->si_addr); to->si_pkey = from->si_pkey; break; - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; -- cgit v1.2.3-71-gd317 From 3cee6fb8e69ecd79be891c89a94974c48a25a437 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:19 -0700 Subject: bpf: tcp: Support bpf_(get|set)sockopt in bpf tcp iter This patch allows bpf tcp iter to call bpf_(get|set)sockopt. To allow a specific bpf iter (tcp here) to call a set of helpers, get_func_proto function pointer is added to bpf_iter_reg. The bpf iter is a tracing prog which currently requires CAP_PERFMON or CAP_SYS_ADMIN, so this patch does not impose other capability checks for bpf_(get|set)sockopt. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200619.1036715-1-kafai@fb.com --- include/linux/bpf.h | 8 ++++++++ kernel/bpf/bpf_iter.c | 22 ++++++++++++++++++++++ kernel/trace/bpf_trace.c | 7 ++++++- net/core/filter.c | 34 ++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 15 +++++++++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 978ebd16ae60..c8cc09013210 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1442,6 +1442,9 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, struct seq_file *seq); typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +typedef const struct bpf_func_proto * +(*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id, + const struct bpf_prog *prog); enum bpf_iter_feature { BPF_ITER_RESCHED = BIT(0), @@ -1454,6 +1457,7 @@ struct bpf_iter_reg { bpf_iter_detach_target_t detach_target; bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; + bpf_iter_get_func_proto_t get_func_proto; u32 ctx_arg_info_size; u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; @@ -1476,6 +1480,8 @@ struct bpf_iter__bpf_map_elem { int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); +const struct bpf_func_proto * +bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); @@ -2050,6 +2056,8 @@ extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_proto; +extern const struct bpf_func_proto bpf_sk_getsockopt_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2d4fbdbb194e..2e9d47bb40ff 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -360,6 +360,28 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } +const struct bpf_func_proto * +bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_iter_target_info *tinfo; + const struct bpf_func_proto *fn = NULL; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog->aux->attach_btf_id) { + const struct bpf_iter_reg *reg_info; + + reg_info = tinfo->reg_info; + if (reg_info->get_func_proto) + fn = reg_info->get_func_proto(func_id, prog); + break; + } + } + mutex_unlock(&targets_mutex); + + return fn; +} + static void bpf_iter_link_release(struct bpf_link *link) { struct bpf_iter_link *iter_link = diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1f22ce1fa971..c5e0b6a64091 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1461,6 +1461,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *fn; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_skb_output: @@ -1501,7 +1503,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_d_path: return &bpf_d_path_proto; default: - return raw_tp_prog_func_proto(func_id, prog); + fn = raw_tp_prog_func_proto(func_id, prog); + if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) + fn = bpf_iter_get_func_proto(func_id, prog); + return fn; } } diff --git a/net/core/filter.c b/net/core/filter.c index 3b4986e96e9c..faf29fd82276 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5016,6 +5016,40 @@ err_clear: return -EINVAL; } +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .func = bpf_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_getsockopt_proto = { + .func = bpf_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 84ac0135d389..f9c6e47141fd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3259,6 +3259,20 @@ static const struct bpf_iter_seq_info tcp_seq_info = { .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; +static const struct bpf_func_proto * +bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sk_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sk_getsockopt_proto; + default: + return NULL; + } +} + static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .ctx_arg_info_size = 1, @@ -3266,6 +3280,7 @@ static struct bpf_iter_reg tcp_reg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL }, }, + .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, }; -- cgit v1.2.3-71-gd317 From 463e862ac63ef27fca423782536f6465abc3f180 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:24 +0100 Subject: swiotlb: Convert io_default_tlb_mem to static allocation Since commit 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used"), 'struct device' may hold a copy of the global 'io_default_tlb_mem' pointer if the device is using swiotlb for DMA. A subsequent call to swiotlb_exit() will therefore leave dangling pointers behind in these device structures, resulting in KASAN splats such as: | BUG: KASAN: use-after-free in __iommu_dma_unmap_swiotlb+0x64/0xb0 | Read of size 8 at addr ffff8881d7830000 by task swapper/0/0 | | CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.12.0-rc3-debug #1 | Hardware name: HP HP Desktop M01-F1xxx/87D6, BIOS F.12 12/17/2020 | Call Trace: | | dump_stack+0x9c/0xcf | print_address_description.constprop.0+0x18/0x130 | kasan_report.cold+0x7f/0x111 | __iommu_dma_unmap_swiotlb+0x64/0xb0 | nvme_pci_complete_rq+0x73/0x130 | blk_complete_reqs+0x6f/0x80 | __do_softirq+0xfc/0x3be Convert 'io_default_tlb_mem' to a static structure, so that the per-device pointers remain valid after swiotlb_exit() has been invoked. All users are updated to reference the static structure directly, using the 'nslabs' field to determine whether swiotlb has been initialised. The 'slots' array is still allocated dynamically and referenced via a pointer rather than a flexible array member. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Cc: Konrad Rzeszutek Wilk Fixes: 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- drivers/base/core.c | 2 +- drivers/xen/swiotlb-xen.c | 4 +-- include/linux/swiotlb.h | 4 +-- kernel/dma/swiotlb.c | 66 ++++++++++++++++++++++++++--------------------- 4 files changed, 41 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/drivers/base/core.c b/drivers/base/core.c index ea5b85354526..b49824001cfa 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2848,7 +2848,7 @@ void device_initialize(struct device *dev) dev->dma_coherent = dma_default_coherent; #endif #ifdef CONFIG_SWIOTLB - dev->dma_io_tlb_mem = io_tlb_default_mem; + dev->dma_io_tlb_mem = &io_tlb_default_mem; #endif } EXPORT_SYMBOL_GPL(device_initialize); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 785ec7e8be01..f06d9b4f1e0f 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -164,7 +164,7 @@ int __ref xen_swiotlb_init(void) int rc = -ENOMEM; char *start; - if (io_tlb_default_mem != NULL) { + if (io_tlb_default_mem.nslabs) { pr_warn("swiotlb buffer already initialized\n"); return -EEXIST; } @@ -547,7 +547,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, static int xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return xen_phys_to_dma(hwdev, io_tlb_default_mem->end - 1) <= mask; + return xen_phys_to_dma(hwdev, io_tlb_default_mem.end - 1) <= mask; } const struct dma_map_ops xen_swiotlb_dma_ops = { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 39284ff2a6cd..b0cb2a9973f4 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -103,9 +103,9 @@ struct io_tlb_mem { phys_addr_t orig_addr; size_t alloc_size; unsigned int list; - } slots[]; + } *slots; }; -extern struct io_tlb_mem *io_tlb_default_mem; +extern struct io_tlb_mem io_tlb_default_mem; static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f1a9ae7fad8f..7948f274f9bb 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -70,7 +70,7 @@ enum swiotlb_force swiotlb_force; -struct io_tlb_mem *io_tlb_default_mem; +struct io_tlb_mem io_tlb_default_mem; /* * Max segment that we can provide which (if pages are contingous) will @@ -101,7 +101,7 @@ early_param("swiotlb", setup_io_tlb_npages); unsigned int swiotlb_max_segment(void) { - return io_tlb_default_mem ? max_segment : 0; + return io_tlb_default_mem.nslabs ? max_segment : 0; } EXPORT_SYMBOL_GPL(swiotlb_max_segment); @@ -134,9 +134,9 @@ void __init swiotlb_adjust_size(unsigned long size) void swiotlb_print_info(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; - if (!mem) { + if (!mem->nslabs) { pr_warn("No low mem\n"); return; } @@ -163,11 +163,11 @@ static inline unsigned long nr_slots(u64 val) */ void __init swiotlb_update_mem_attributes(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; void *vaddr; unsigned long bytes; - if (!mem || mem->late_alloc) + if (!mem->nslabs || mem->late_alloc) return; vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); @@ -201,25 +201,24 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - struct io_tlb_mem *mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; size_t alloc_size; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; /* protect against double initialization */ - if (WARN_ON_ONCE(io_tlb_default_mem)) + if (WARN_ON_ONCE(mem->nslabs)) return -ENOMEM; - alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs)); - mem = memblock_alloc(alloc_size, PAGE_SIZE); - if (!mem) + alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); + mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem->slots) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); - io_tlb_default_mem = mem; if (verbose) swiotlb_print_info(); swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); @@ -304,26 +303,24 @@ swiotlb_late_init_with_default_size(size_t default_size) int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - struct io_tlb_mem *mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long bytes = nslabs << IO_TLB_SHIFT; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; /* protect against double initialization */ - if (WARN_ON_ONCE(io_tlb_default_mem)) + if (WARN_ON_ONCE(mem->nslabs)) return -ENOMEM; - mem = (void *)__get_free_pages(GFP_KERNEL, - get_order(struct_size(mem, slots, nslabs))); - if (!mem) + mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(array_size(sizeof(*mem->slots), nslabs))); + if (!mem->slots) return -ENOMEM; - memset(mem, 0, sizeof(*mem)); set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); - io_tlb_default_mem = mem; swiotlb_print_info(); swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; @@ -331,18 +328,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) void __init swiotlb_exit(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; size_t size; + struct io_tlb_mem *mem = &io_tlb_default_mem; - if (!mem) + if (!mem->nslabs) return; - size = struct_size(mem, slots, mem->nslabs); + size = array_size(sizeof(*mem->slots), mem->nslabs); if (mem->late_alloc) - free_pages((unsigned long)mem, get_order(size)); + free_pages((unsigned long)mem->slots, get_order(size)); else - memblock_free_late(__pa(mem), PAGE_ALIGN(size)); - io_tlb_default_mem = NULL; + memblock_free_late(__pa(mem->slots), PAGE_ALIGN(size)); + memset(mem, 0, sizeof(*mem)); } /* @@ -696,7 +693,9 @@ size_t swiotlb_max_mapping_size(struct device *dev) bool is_swiotlb_active(struct device *dev) { - return dev->dma_io_tlb_mem != NULL; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + + return mem && mem->nslabs; } EXPORT_SYMBOL_GPL(is_swiotlb_active); @@ -711,10 +710,10 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) static int __init swiotlb_create_default_debugfs(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; debugfs_dir = debugfs_create_dir("swiotlb", NULL); - if (mem) { + if (mem->nslabs) { mem->debugfs = debugfs_dir; swiotlb_create_debugfs_files(mem); } @@ -783,10 +782,17 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, * to it. */ if (!mem) { - mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL); + mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; + mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs), + GFP_KERNEL); + if (!mem->slots) { + kfree(mem); + return -ENOMEM; + } + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); @@ -806,7 +812,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, static void rmem_swiotlb_device_release(struct reserved_mem *rmem, struct device *dev) { - dev->dma_io_tlb_mem = io_tlb_default_mem; + dev->dma_io_tlb_mem = &io_tlb_default_mem; } static const struct reserved_mem_ops rmem_swiotlb_ops = { -- cgit v1.2.3-71-gd317 From 1efd3fc0ccf52e1aa5f0bf5b0d82847180d20951 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:25 +0100 Subject: swiotlb: Emit diagnostic in swiotlb_exit() A recent debugging session would have been made a little bit easier if we had noticed sooner that swiotlb_exit() was being called during boot. Add a simple diagnostic message to swiotlb_exit() to complement the one from swiotlb_print_info() during initialisation. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Link: https://lore.kernel.org/r/20210705190352.GA19461@willie-the-truck Suggested-by: Konrad Rzeszutek Wilk Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 7948f274f9bb..b3c793ed9e64 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -334,6 +334,7 @@ void __init swiotlb_exit(void) if (!mem->nslabs) return; + pr_info("tearing down default memory pool\n"); size = array_size(sizeof(*mem->slots), mem->nslabs); if (mem->late_alloc) free_pages((unsigned long)mem->slots, get_order(size)); -- cgit v1.2.3-71-gd317 From ad6c00283163cb7ad52cdf97d2850547446f7d98 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:26 +0100 Subject: swiotlb: Free tbl memory in swiotlb_exit() Although swiotlb_exit() frees the 'slots' metadata array referenced by 'io_tlb_default_mem', it leaves the underlying buffer pages allocated despite no longer being usable. Extend swiotlb_exit() to free the buffer pages as well as the slots array. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Cc: Konrad Rzeszutek Wilk Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b3c793ed9e64..87c40517e822 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -328,18 +328,27 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) void __init swiotlb_exit(void) { - size_t size; struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long tbl_vaddr; + size_t tbl_size, slots_size; if (!mem->nslabs) return; pr_info("tearing down default memory pool\n"); - size = array_size(sizeof(*mem->slots), mem->nslabs); - if (mem->late_alloc) - free_pages((unsigned long)mem->slots, get_order(size)); - else - memblock_free_late(__pa(mem->slots), PAGE_ALIGN(size)); + tbl_vaddr = (unsigned long)phys_to_virt(mem->start); + tbl_size = PAGE_ALIGN(mem->end - mem->start); + slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs)); + + set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); + if (mem->late_alloc) { + free_pages(tbl_vaddr, get_order(tbl_size)); + free_pages((unsigned long)mem->slots, get_order(slots_size)); + } else { + memblock_free_late(mem->start, tbl_size); + memblock_free_late(__pa(mem->slots), slots_size); + } + memset(mem, 0, sizeof(*mem)); } -- cgit v1.2.3-71-gd317 From a1833a54033e4ca760ad58fa2a6469ad59b3fa1a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Jul 2021 11:06:37 -0700 Subject: smpboot: fix duplicate and misplaced inlining directive gcc doesn't care, but clang quite reasonably pointed out that the recent commit e9ba16e68cce ("smpboot: Mark idle_init() as __always_inlined to work around aggressive compiler un-inlining") did some really odd things: kernel/smpboot.c:50:20: warning: duplicate 'inline' declaration specifier [-Wduplicate-decl-specifier] static inline void __always_inline idle_init(unsigned int cpu) ^ which not only has that duplicate inlining specifier, but the new __always_inline was put in the wrong place of the function definition. We put the storage class specifiers (ie things like "static" and "extern") first, and the type information after that. And while the compiler may not care, we put the inline specifier before the types. So it should be just static __always_inline void idle_init(unsigned int cpu) instead. Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 21b7953f8242..cf6acab78538 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -47,7 +47,7 @@ void __init idle_thread_set_boot_cpu(void) * * Creates the thread if it does not exist. */ -static inline void __always_inline idle_init(unsigned int cpu) +static __always_inline void idle_init(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); -- cgit v1.2.3-71-gd317 From 0f0aa84850a4105401723c6c0eeb61c2e67c869a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 23 Jul 2021 09:47:07 +0200 Subject: printk/index: Fix warning about missing prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 337015573718b161 ("printk: Userspace format indexing support") triggered the following build failure: kernel/printk/index.c:140:6: warning: no previous prototype for ‘pi_create_file’ [-Wmissing-prototypes] void pi_create_file(struct module *mod) ^~~~~~~~~~~~~~ kernel/printk/index.c:146:6: warning: no previous prototype for ‘pi_remove_file’ [-Wmissing-prototypes] void pi_remove_file(struct module *mod) ^~~~~~~~~~~~~~ Fixes: 337015573718b161 ("printk: Userspace format indexing support") Reported-by: kernel test robot Suggested-by: Chris Down [pmladek@suse.com: Let the compiler decide about inlining.] Signed-off-by: Petr Mladek Link: https://lore.kernel.org/lkml/YPql089IwSpudw%2F1@alley/ --- kernel/printk/index.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/index.c b/kernel/printk/index.c index ca062f5e1779..58d27272f992 100644 --- a/kernel/printk/index.c +++ b/kernel/printk/index.c @@ -137,13 +137,13 @@ static const char *pi_get_module_name(struct module *mod) } #endif -void pi_create_file(struct module *mod) +static void pi_create_file(struct module *mod) { debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index, mod, &dfs_index_fops); } -void pi_remove_file(struct module *mod) +static void pi_remove_file(struct module *mod) { debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index)); } -- cgit v1.2.3-71-gd317 From 7d9e2661f268585ca24ab4edbc1e2925b08374b2 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 25 Jul 2021 15:16:00 -0600 Subject: printk: Move the printk() kerneldoc comment to its new home Commit 337015573718 ("printk: Userspace format indexing support") turned printk() into a macro, but left the kerneldoc comment for it with the (now) _printk() function, resulting in this docs-build warning: kernel/printk/printk.c:1: warning: 'printk' not found Move the kerneldoc comment back next to the (now) macro it's meant to describe and have the docs build find it there. Fixes: 337015573718b161 ("printk: Userspace format indexing support") Signed-off-by: Jonathan Corbet Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/87o8aqt7qn.fsf@meer.lwn.net --- Documentation/core-api/printk-basics.rst | 5 +---- include/linux/printk.h | 24 ++++++++++++++++++++++++ kernel/printk/printk.c | 24 ------------------------ 3 files changed, 25 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/printk-basics.rst b/Documentation/core-api/printk-basics.rst index 965e4281eddd..2dde24ca7d9f 100644 --- a/Documentation/core-api/printk-basics.rst +++ b/Documentation/core-api/printk-basics.rst @@ -107,9 +107,6 @@ also ``CONFIG_DYNAMIC_DEBUG`` in the case of pr_debug()) is defined. Function reference ================== -.. kernel-doc:: kernel/printk/printk.c - :functions: printk - .. kernel-doc:: include/linux/printk.h - :functions: pr_emerg pr_alert pr_crit pr_err pr_warn pr_notice pr_info + :functions: printk pr_emerg pr_alert pr_crit pr_err pr_warn pr_notice pr_info pr_fmt pr_debug pr_devel pr_cont diff --git a/include/linux/printk.h b/include/linux/printk.h index 2651b82ed352..c1e176403967 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -431,6 +431,30 @@ struct pi_entry { }) +/** + * printk - print a kernel message + * @fmt: format string + * + * This is printk(). It can be called from any context. We want it to work. + * + * If printk indexing is enabled, _printk() is called from printk_index_wrap. + * Otherwise, printk is simply #defined to _printk. + * + * We try to grab the console_lock. If we succeed, it's easy - we log the + * output and call the console drivers. If we fail to get the semaphore, we + * place the output into the log buffer and return. The current holder of + * the console_sem will notice the new output in console_unlock(); and will + * send it to the consoles before releasing the lock. + * + * One effect of this deferred printing is that code which calls printk() and + * then changes console_loglevel may break. This is because console_loglevel + * is inspected when the actual printing occurs. + * + * See also: + * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. + */ #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__) #define printk_deferred(fmt, ...) \ printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 765f7af6ce56..8030c670f0bc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2183,30 +2183,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/** - * _printk - print a kernel message - * @fmt: format string - * - * This is _printk(). It can be called from any context. We want it to work. - * - * If printk indexing is enabled, _printk() is called from printk_index_wrap. - * Otherwise, printk is simply #defined to _printk. - * - * We try to grab the console_lock. If we succeed, it's easy - we log the - * output and call the console drivers. If we fail to get the semaphore, we - * place the output into the log buffer and return. The current holder of - * the console_sem will notice the new output in console_unlock(); and will - * send it to the consoles before releasing the lock. - * - * One effect of this deferred printing is that code which calls printk() and - * then changes console_loglevel may break. This is because console_loglevel - * is inspected when the actual printing occurs. - * - * See also: - * printf(3) - * - * See the vsnprintf() documentation for format string extensions over C99. - */ asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; -- cgit v1.2.3-71-gd317 From 002eb6ad075142e5940122c7fcee71cf1e906e29 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:55 +0206 Subject: printk: track/limit recursion Currently the printk safe buffers provide a form of recursion protection by redirecting to the safe buffers whenever printk() is recursively called. In preparation for removal of the safe buffers, provide an alternate explicit recursion protection. Recursion is limited to 3 levels per-CPU and per-context. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-3-john.ogness@linutronix.de --- kernel/printk/printk.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 142a58d124d9..7fa0b4d91975 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1940,6 +1940,76 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, } } +/* + * Recursion is tracked separately on each CPU. If NMIs are supported, an + * additional NMI context per CPU is also separately tracked. Until per-CPU + * is available, a separate "early tracking" is performed. + */ +static DEFINE_PER_CPU(u8, printk_count); +static u8 printk_count_early; +#ifdef CONFIG_HAVE_NMI +static DEFINE_PER_CPU(u8, printk_count_nmi); +static u8 printk_count_nmi_early; +#endif + +/* + * Recursion is limited to keep the output sane. printk() should not require + * more than 1 level of recursion (allowing, for example, printk() to trigger + * a WARN), but a higher value is used in case some printk-internal errors + * exist, such as the ringbuffer validation checks failing. + */ +#define PRINTK_MAX_RECURSION 3 + +/* + * Return a pointer to the dedicated counter for the CPU+context of the + * caller. + */ +static u8 *__printk_recursion_counter(void) +{ +#ifdef CONFIG_HAVE_NMI + if (in_nmi()) { + if (printk_percpu_data_ready()) + return this_cpu_ptr(&printk_count_nmi); + return &printk_count_nmi_early; + } +#endif + if (printk_percpu_data_ready()) + return this_cpu_ptr(&printk_count); + return &printk_count_early; +} + +/* + * Enter recursion tracking. Interrupts are disabled to simplify tracking. + * The caller must check the boolean return value to see if the recursion is + * allowed. On failure, interrupts are not disabled. + * + * @recursion_ptr must be a variable of type (u8 *) and is the same variable + * that is passed to printk_exit_irqrestore(). + */ +#define printk_enter_irqsave(recursion_ptr, flags) \ +({ \ + bool success = true; \ + \ + typecheck(u8 *, recursion_ptr); \ + local_irq_save(flags); \ + (recursion_ptr) = __printk_recursion_counter(); \ + if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ + local_irq_restore(flags); \ + success = false; \ + } else { \ + (*(recursion_ptr))++; \ + } \ + success; \ +}) + +/* Exit recursion tracking, restoring interrupts. */ +#define printk_exit_irqrestore(recursion_ptr, flags) \ + do { \ + typecheck(u8 *, recursion_ptr); \ + (*(recursion_ptr))--; \ + local_irq_restore(flags); \ + } while (0) + int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -2040,11 +2110,14 @@ int vprintk_store(int facility, int level, struct prb_reserved_entry e; enum log_flags lflags = 0; struct printk_record r; + unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; + u8 *recursion_ptr; u16 reserve_size; va_list args2; u16 text_len; + int ret = 0; u64 ts_nsec; /* @@ -2055,6 +2128,9 @@ int vprintk_store(int facility, int level, */ ts_nsec = local_clock(); + if (!printk_enter_irqsave(recursion_ptr, irqflags)) + return 0; + /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. An extra byte must be reserved so that @@ -2092,7 +2168,8 @@ int vprintk_store(int facility, int level, prb_commit(&e); } - return text_len; + ret = text_len; + goto out; } } @@ -2108,7 +2185,7 @@ int vprintk_store(int facility, int level, prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) - return 0; + goto out; } /* fill message */ @@ -2130,7 +2207,10 @@ int vprintk_store(int facility, int level, else prb_final_commit(&e); - return (text_len + trunc_msg_len); + ret = text_len + trunc_msg_len; +out: + printk_exit_irqrestore(recursion_ptr, irqflags); + return ret; } asmlinkage int vprintk_emit(int facility, int level, -- cgit v1.2.3-71-gd317 From 93d102f094be9beab28e5afb656c188b16a3793b Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:56 +0206 Subject: printk: remove safe buffers With @logbuf_lock removed, the high level printk functions for storing messages are lockless. Messages can be stored from any context, so there is no need for the NMI and safe buffers anymore. Remove the NMI and safe buffers. Although the safe buffers are removed, the NMI and safe context tracking is still in place. In these contexts, store the message immediately but still use irq_work to defer the console printing. Since printk recursion tracking is in place, safe context tracking for most of printk is not needed. Remove it. Only safe context tracking relating to the console and console_owner locks is left in place. This is because the console and console_owner locks are needed for the actual printing. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-4-john.ogness@linutronix.de --- arch/powerpc/kernel/traps.c | 1 - arch/powerpc/kernel/watchdog.c | 5 - include/linux/printk.h | 10 -- kernel/kexec_core.c | 1 - kernel/panic.c | 3 - kernel/printk/internal.h | 17 --- kernel/printk/printk.c | 120 ++++++--------- kernel/printk/printk_safe.c | 335 +---------------------------------------- lib/nmi_backtrace.c | 6 - 9 files changed, 48 insertions(+), 450 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index b4ab95c9e94a..2522800217d1 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -170,7 +170,6 @@ extern void panic_flush_kmsg_start(void) extern void panic_flush_kmsg_end(void) { - printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(0); debug_locks_off(); diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index c9a8f4781a10..dc17d8903d4f 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -183,11 +183,6 @@ static void watchdog_smp_panic(int cpu, u64 tb) wd_smp_unlock(&flags); - printk_safe_flush(); - /* - * printk_safe_flush() seems to require another print - * before anything actually goes out to console. - */ if (sysctl_hardlockup_all_cpu_backtrace) trigger_allbutself_cpu_backtrace(); diff --git a/include/linux/printk.h b/include/linux/printk.h index d796183f26c9..719d919f9b67 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -208,8 +208,6 @@ void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; -extern void printk_safe_flush(void); -extern void printk_safe_flush_on_panic(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -277,14 +275,6 @@ static inline void dump_stack_lvl(const char *log_lvl) static inline void dump_stack(void) { } - -static inline void printk_safe_flush(void) -{ -} - -static inline void printk_safe_flush_on_panic(void) -{ -} #endif #ifdef CONFIG_SMP diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index f099baee3578..69c6e9b7761c 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -978,7 +978,6 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ - printk_safe_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index 332736a72a58..1f0df42f8d0c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -247,7 +247,6 @@ void panic(const char *fmt, ...) * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) { - printk_safe_flush_on_panic(); __crash_kexec(NULL); /* @@ -271,8 +270,6 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - /* Call flush even twice. It tries harder with a single online CPU */ - printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 51615c909b2f..6cc35c5de890 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -22,7 +22,6 @@ __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); -void printk_safe_init(void); bool printk_percpu_data_ready(void); #define printk_safe_enter_irqsave(flags) \ @@ -37,18 +36,6 @@ bool printk_percpu_data_ready(void); local_irq_restore(flags); \ } while (0) -#define printk_safe_enter_irq() \ - do { \ - local_irq_disable(); \ - __printk_safe_enter(); \ - } while (0) - -#define printk_safe_exit_irq() \ - do { \ - __printk_safe_exit(); \ - local_irq_enable(); \ - } while (0) - void defer_console_output(void); #else @@ -61,9 +48,5 @@ void defer_console_output(void); #define printk_safe_enter_irqsave(flags) local_irq_save(flags) #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) -#define printk_safe_enter_irq() local_irq_disable() -#define printk_safe_exit_irq() local_irq_enable() - -static inline void printk_safe_init(void) { } static inline bool printk_percpu_data_ready(void) { return false; } #endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7fa0b4d91975..219ad710a9e8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -732,27 +732,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; - printk_safe_enter_irq(); if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - printk_safe_exit_irq(); goto out; } - printk_safe_exit_irq(); ret = wait_event_interruptible(log_wait, prb_read_valid(prb, atomic64_read(&user->seq), r)); if (ret) goto out; - printk_safe_enter_irq(); } if (r->info->seq != atomic64_read(&user->seq)) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, r->info->seq); ret = -EPIPE; - printk_safe_exit_irq(); goto out; } @@ -762,7 +757,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, &r->info->dev_info); atomic64_set(&user->seq, r->info->seq + 1); - printk_safe_exit_irq(); if (len > count) { ret = -EINVAL; @@ -797,7 +791,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - printk_safe_enter_irq(); switch (whence) { case SEEK_SET: /* the first record */ @@ -818,7 +811,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - printk_safe_exit_irq(); return ret; } @@ -833,7 +825,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - printk_safe_enter_irq(); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) @@ -841,7 +832,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) else ret = EPOLLIN|EPOLLRDNORM; } - printk_safe_exit_irq(); return ret; } @@ -874,9 +864,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) prb_rec_init_rd(&user->record, &user->info, &user->text_buf[0], sizeof(user->text_buf)); - printk_safe_enter_irq(); atomic64_set(&user->seq, prb_first_valid_seq(prb)); - printk_safe_exit_irq(); file->private_data = user; return 0; @@ -1042,9 +1030,6 @@ static inline void log_buf_add_cpu(void) {} static void __init set_percpu_data_ready(void) { - printk_safe_init(); - /* Make sure we set this flag only after printk_safe() init is done */ - barrier(); __printk_percpu_data_ready = true; } @@ -1082,6 +1067,7 @@ void __init setup_log_buf(int early) struct prb_desc *new_descs; struct printk_info info; struct printk_record r; + unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; @@ -1142,24 +1128,37 @@ void __init setup_log_buf(int early) new_descs, ilog2(new_descs_count), new_infos); - printk_safe_enter_irqsave(flags); + local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; - prb_for_each_record(0, &printk_rb_static, seq, &r) - free -= add_to_rb(&printk_rb_dynamic, &r); + prb_for_each_record(0, &printk_rb_static, seq, &r) { + text_size = add_to_rb(&printk_rb_dynamic, &r); + if (text_size > free) + free = 0; + else + free -= text_size; + } - /* - * This is early enough that everything is still running on the - * boot CPU and interrupts are disabled. So no new messages will - * appear during the transition to the dynamic buffer. - */ prb = &printk_rb_dynamic; - printk_safe_exit_irqrestore(flags); + local_irq_restore(flags); + + /* + * Copy any remaining messages that might have appeared from + * NMI context after copying but before switching to the + * dynamic buffer. + */ + prb_for_each_record(seq, &printk_rb_static, seq, &r) { + text_size = add_to_rb(&printk_rb_dynamic, &r); + if (text_size > free) + free = 0; + else + free -= text_size; + } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", @@ -1498,11 +1497,9 @@ static int syslog_print(char __user *buf, int size) size_t n; size_t skip; - printk_safe_enter_irq(); - raw_spin_lock(&syslog_lock); + raw_spin_lock_irq(&syslog_lock); if (!prb_read_valid(prb, syslog_seq, &r)) { - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); break; } if (r.info->seq != syslog_seq) { @@ -1531,8 +1528,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); if (!n) break; @@ -1566,7 +1562,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return -ENOMEM; time = printk_time; - printk_safe_enter_irq(); /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. @@ -1587,23 +1582,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear) break; } - printk_safe_exit_irq(); if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; - printk_safe_enter_irq(); if (len < 0) break; } if (clear) { - raw_spin_lock(&syslog_lock); + raw_spin_lock_irq(&syslog_lock); latched_seq_write(&clear_seq, seq); - raw_spin_unlock(&syslog_lock); + raw_spin_unlock_irq(&syslog_lock); } - printk_safe_exit_irq(); kfree(text); return len; @@ -1611,11 +1603,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { - printk_safe_enter_irq(); - raw_spin_lock(&syslog_lock); + raw_spin_lock_irq(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); } /* Return a consistent copy of @syslog_seq. */ @@ -1703,12 +1693,10 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - printk_safe_enter_irq(); - raw_spin_lock(&syslog_lock); + raw_spin_lock_irq(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); return 0; } if (info.seq != syslog_seq) { @@ -1736,8 +1724,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -2219,7 +2206,6 @@ asmlinkage int vprintk_emit(int facility, int level, { int printed_len; bool in_sched = false; - unsigned long flags; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -2233,9 +2219,7 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - printk_safe_enter_irqsave(flags); printed_len = vprintk_store(facility, level, dev_info, fmt, args); - printk_safe_exit_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { @@ -2664,9 +2648,9 @@ again: for (;;) { size_t ext_len = 0; + int handover; size_t len; - printk_safe_enter_irqsave(flags); skip: if (!prb_read_valid(prb, console_seq, &r)) break; @@ -2716,19 +2700,22 @@ skip: * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). */ + printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); - if (console_lock_spinning_disable_and_check()) { - printk_safe_exit_irqrestore(flags); - return; - } - + handover = console_lock_spinning_disable_and_check(); printk_safe_exit_irqrestore(flags); + if (handover) + return; if (do_cond_resched) cond_resched(); @@ -2745,8 +2732,6 @@ skip: * flush, no worries. */ retry = prb_read_valid(prb, console_seq, NULL); - printk_safe_exit_irqrestore(flags); - if (retry && console_trylock()) goto again; } @@ -2808,13 +2793,8 @@ void console_flush_on_panic(enum con_flush_mode mode) console_trylock(); console_may_schedule = 0; - if (mode == CONSOLE_REPLAY_ALL) { - unsigned long flags; - - printk_safe_enter_irqsave(flags); + if (mode == CONSOLE_REPLAY_ALL) console_seq = prb_first_valid_seq(prb); - printk_safe_exit_irqrestore(flags); - } console_unlock(); } @@ -3466,14 +3446,12 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, struct printk_info info; unsigned int line_count; struct printk_record r; - unsigned long flags; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; - printk_safe_enter_irqsave(flags); prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? */ @@ -3494,7 +3472,6 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, iter->cur_seq = r.info->seq + 1; ret = true; out: - printk_safe_exit_irqrestore(flags); if (len) *len = l; return ret; @@ -3526,7 +3503,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; - unsigned long flags; u64 seq; u64 next_seq; size_t len = 0; @@ -3539,7 +3515,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; - printk_safe_enter_irqsave(flags); if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ @@ -3548,10 +3523,8 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, } /* last entry */ - if (iter->cur_seq >= iter->next_seq) { - printk_safe_exit_irqrestore(flags); + if (iter->cur_seq >= iter->next_seq) goto out; - } /* * Find first record that fits, including all following records, @@ -3583,7 +3556,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, iter->next_seq = next_seq; ret = true; - printk_safe_exit_irqrestore(flags); out: if (len_out) *len_out = len; @@ -3601,12 +3573,8 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { - unsigned long flags; - - printk_safe_enter_irqsave(flags); iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); - printk_safe_exit_irqrestore(flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 94232186fccb..29c580dac93d 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -15,286 +15,9 @@ #include "internal.h" -/* - * In NMI and safe mode, printk() avoids taking locks. Instead, - * it uses an alternative implementation that temporary stores - * the strings into a per-CPU buffer. The content of the buffer - * is later flushed into the main ring buffer via IRQ work. - * - * The alternative implementation is chosen transparently - * by examining current printk() context mask stored in @printk_context - * per-CPU variable. - * - * The implementation allows to flush the strings also from another CPU. - * There are situations when we want to make sure that all buffers - * were handled or when IRQs are blocked. - */ - -#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ - sizeof(atomic_t) - \ - sizeof(atomic_t) - \ - sizeof(struct irq_work)) - -struct printk_safe_seq_buf { - atomic_t len; /* length of written data */ - atomic_t message_lost; - struct irq_work work; /* IRQ work that flushes the buffer */ - unsigned char buffer[SAFE_LOG_BUF_LEN]; -}; - -static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); static DEFINE_PER_CPU(int, printk_context); -static DEFINE_RAW_SPINLOCK(safe_read_lock); - -#ifdef CONFIG_PRINTK_NMI -static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); -#endif - -/* Get flushed in a more safe context. */ -static void queue_flush_work(struct printk_safe_seq_buf *s) -{ - if (printk_percpu_data_ready()) - irq_work_queue(&s->work); -} - -/* - * Add a message to per-CPU context-dependent buffer. NMI and printk-safe - * have dedicated buffers, because otherwise printk-safe preempted by - * NMI-printk would have overwritten the NMI messages. - * - * The messages are flushed from irq work (or from panic()), possibly, - * from other CPU, concurrently with printk_safe_log_store(). Should this - * happen, printk_safe_log_store() will notice the buffer->len mismatch - * and repeat the write. - */ -static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, - const char *fmt, va_list args) -{ - int add; - size_t len; - va_list ap; - -again: - len = atomic_read(&s->len); - - /* The trailing '\0' is not counted into len. */ - if (len >= sizeof(s->buffer) - 1) { - atomic_inc(&s->message_lost); - queue_flush_work(s); - return 0; - } - - /* - * Make sure that all old data have been read before the buffer - * was reset. This is not needed when we just append data. - */ - if (!len) - smp_rmb(); - - va_copy(ap, args); - add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); - va_end(ap); - if (!add) - return 0; - - /* - * Do it once again if the buffer has been flushed in the meantime. - * Note that atomic_cmpxchg() is an implicit memory barrier that - * makes sure that the data were written before updating s->len. - */ - if (atomic_cmpxchg(&s->len, len, len + add) != len) - goto again; - - queue_flush_work(s); - return add; -} - -static inline void printk_safe_flush_line(const char *text, int len) -{ - /* - * Avoid any console drivers calls from here, because we may be - * in NMI or printk_safe context (when in panic). The messages - * must go only into the ring buffer at this stage. Consoles will - * get explicitly called later when a crashdump is not generated. - */ - printk_deferred("%.*s", len, text); -} - -/* printk part of the temporary buffer line by line */ -static int printk_safe_flush_buffer(const char *start, size_t len) -{ - const char *c, *end; - bool header; - - c = start; - end = start + len; - header = true; - - /* Print line by line. */ - while (c < end) { - if (*c == '\n') { - printk_safe_flush_line(start, c - start + 1); - start = ++c; - header = true; - continue; - } - - /* Handle continuous lines or missing new line. */ - if ((c + 1 < end) && printk_get_level(c)) { - if (header) { - c = printk_skip_level(c); - continue; - } - - printk_safe_flush_line(start, c - start); - start = c++; - header = true; - continue; - } - - header = false; - c++; - } - - /* Check if there was a partial line. Ignore pure header. */ - if (start < end && !header) { - static const char newline[] = KERN_CONT "\n"; - - printk_safe_flush_line(start, end - start); - printk_safe_flush_line(newline, strlen(newline)); - } - - return len; -} - -static void report_message_lost(struct printk_safe_seq_buf *s) -{ - int lost = atomic_xchg(&s->message_lost, 0); - - if (lost) - printk_deferred("Lost %d message(s)!\n", lost); -} - -/* - * Flush data from the associated per-CPU buffer. The function - * can be called either via IRQ work or independently. - */ -static void __printk_safe_flush(struct irq_work *work) -{ - struct printk_safe_seq_buf *s = - container_of(work, struct printk_safe_seq_buf, work); - unsigned long flags; - size_t len; - int i; - - /* - * The lock has two functions. First, one reader has to flush all - * available message to make the lockless synchronization with - * writers easier. Second, we do not want to mix messages from - * different CPUs. This is especially important when printing - * a backtrace. - */ - raw_spin_lock_irqsave(&safe_read_lock, flags); - - i = 0; -more: - len = atomic_read(&s->len); - - /* - * This is just a paranoid check that nobody has manipulated - * the buffer an unexpected way. If we printed something then - * @len must only increase. Also it should never overflow the - * buffer size. - */ - if ((i && i >= len) || len > sizeof(s->buffer)) { - const char *msg = "printk_safe_flush: internal error\n"; - - printk_safe_flush_line(msg, strlen(msg)); - len = 0; - } - - if (!len) - goto out; /* Someone else has already flushed the buffer. */ - - /* Make sure that data has been written up to the @len */ - smp_rmb(); - i += printk_safe_flush_buffer(s->buffer + i, len - i); - - /* - * Check that nothing has got added in the meantime and truncate - * the buffer. Note that atomic_cmpxchg() is an implicit memory - * barrier that makes sure that the data were copied before - * updating s->len. - */ - if (atomic_cmpxchg(&s->len, len, 0) != len) - goto more; - -out: - report_message_lost(s); - raw_spin_unlock_irqrestore(&safe_read_lock, flags); -} - -/** - * printk_safe_flush - flush all per-cpu nmi buffers. - * - * The buffers are flushed automatically via IRQ work. This function - * is useful only when someone wants to be sure that all buffers have - * been flushed at some point. - */ -void printk_safe_flush(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { -#ifdef CONFIG_PRINTK_NMI - __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); -#endif - __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); - } -} - -/** - * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system - * goes down. - * - * Similar to printk_safe_flush() but it can be called even in NMI context when - * the system goes down. It does the best effort to get NMI messages into - * the main ring buffer. - * - * Note that it could try harder when there is only one CPU online. - */ -void printk_safe_flush_on_panic(void) -{ - /* - * Make sure that we could access the safe buffers. - * Do not risk a double release when more CPUs are up. - */ - if (raw_spin_is_locked(&safe_read_lock)) { - if (num_online_cpus() > 1) - return; - - debug_locks_off(); - raw_spin_lock_init(&safe_read_lock); - } - - printk_safe_flush(); -} - #ifdef CONFIG_PRINTK_NMI -/* - * Safe printk() for NMI context. It uses a per-CPU buffer to - * store the message. NMIs are not nested, so there is always only - * one writer running. But the buffer might get flushed from another - * CPU, so we need to be careful. - */ -static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) -{ - struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - - return printk_safe_log_store(s, fmt, args); -} - void noinstr printk_nmi_enter(void) { this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); @@ -309,9 +32,6 @@ void noinstr printk_nmi_exit(void) * Marks a code that might produce many messages in NMI context * and the risk of losing them is more critical than eventual * reordering. - * - * It has effect only when called in NMI context. Then printk() - * will store the messages into the main logbuf directly. */ void printk_nmi_direct_enter(void) { @@ -324,27 +44,8 @@ void printk_nmi_direct_exit(void) this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); } -#else - -static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) -{ - return 0; -} - #endif /* CONFIG_PRINTK_NMI */ -/* - * Lock-less printk(), to avoid deadlocks should the printk() recurse - * into itself. It uses a per-CPU buffer to store the message, just like - * NMI. - */ -static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) -{ - struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); - - return printk_safe_log_store(s, fmt, args); -} - /* Can be preempted by NMI. */ void __printk_safe_enter(void) { @@ -369,46 +70,18 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) { - unsigned long flags; + if (this_cpu_read(printk_context) & + (PRINTK_NMI_DIRECT_CONTEXT_MASK | + PRINTK_NMI_CONTEXT_MASK | + PRINTK_SAFE_CONTEXT_MASK)) { int len; - printk_safe_enter_irqsave(flags); len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - printk_safe_exit_irqrestore(flags); defer_console_output(); return len; } - /* Use extra buffer in NMI. */ - if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) - return vprintk_nmi(fmt, args); - - /* Use extra buffer to prevent a recursion deadlock in safe mode. */ - if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) - return vprintk_safe(fmt, args); - /* No obstacles. */ return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); - -void __init printk_safe_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct printk_safe_seq_buf *s; - - s = &per_cpu(safe_print_seq, cpu); - init_irq_work(&s->work, __printk_safe_flush); - -#ifdef CONFIG_PRINTK_NMI - s = &per_cpu(nmi_print_seq, cpu); - init_irq_work(&s->work, __printk_safe_flush); -#endif - } - - /* Flush pending messages that did not have scheduled IRQ works. */ - printk_safe_flush(); -} diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index dae233c5f597..9813a983d024 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, touch_softlockup_watchdog(); } - /* - * Force flush any remote buffers that might be stuck in IRQ context - * and therefore could not run their irq_work. - */ - printk_safe_flush(); - clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -- cgit v1.2.3-71-gd317 From 85e3e7fbbb720b9897fba9a99659e31cbd1c082e Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:57 +0206 Subject: printk: remove NMI tracking All NMI contexts are handled the same as the safe context: store the message and defer printing. There is no need to have special NMI context tracking for this. Using in_nmi() is enough. There are several parts of the kernel that are manually calling into the printk NMI context tracking in order to cause general printk deferred printing: arch/arm/kernel/smp.c arch/powerpc/kexec/crash.c kernel/trace/trace.c For arm/kernel/smp.c and powerpc/kexec/crash.c, provide a new function pair printk_deferred_enter/exit that explicitly achieves the same objective. For ftrace, remove the printk context manipulation completely. It was added in commit 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when accessing the main log buffer in NMI"). The purpose was to enforce storing messages directly into the ring buffer even in NMI context. It really should have only modified the behavior in NMI context. There is no need for a special behavior any longer. All messages are always stored directly now. The console deferring is handled transparently in vprintk(). Signed-off-by: John Ogness [pmladek@suse.com: Remove special handling in ftrace.c completely. Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-5-john.ogness@linutronix.de --- arch/arm/kernel/smp.c | 4 ++-- arch/powerpc/kexec/crash.c | 2 +- include/linux/hardirq.h | 2 -- include/linux/printk.h | 31 +++++++++++++++++++------------ init/Kconfig | 5 ----- kernel/printk/internal.h | 8 -------- kernel/printk/printk_safe.c | 37 +------------------------------------ kernel/trace/trace.c | 2 -- 8 files changed, 23 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index c7bb168b0d97..842427ff2b3c 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -667,9 +667,9 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_BACKTRACE: - printk_nmi_enter(); + printk_deferred_enter(); nmi_cpu_backtrace(get_irq_regs()); - printk_nmi_exit(); + printk_deferred_exit(); break; default: diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index 0196d0c211ac..1070378c8e35 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -313,7 +313,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs) int (*old_handler)(struct pt_regs *regs); /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */ - printk_nmi_enter(); + printk_deferred_enter(); /* * This function is only called after the system diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 69bc86ea382c..76878b357ffa 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -116,7 +116,6 @@ extern void rcu_nmi_exit(void); do { \ lockdep_off(); \ arch_nmi_enter(); \ - printk_nmi_enter(); \ BUG_ON(in_nmi() == NMI_MASK); \ __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ } while (0) @@ -135,7 +134,6 @@ extern void rcu_nmi_exit(void); do { \ BUG_ON(!in_nmi()); \ __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ - printk_nmi_exit(); \ arch_nmi_exit(); \ lockdep_on(); \ } while (0) diff --git a/include/linux/printk.h b/include/linux/printk.h index 719d919f9b67..a1379df43251 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -149,18 +149,6 @@ static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } #endif -#ifdef CONFIG_PRINTK_NMI -extern void printk_nmi_enter(void); -extern void printk_nmi_exit(void); -extern void printk_nmi_direct_enter(void); -extern void printk_nmi_direct_exit(void); -#else -static inline void printk_nmi_enter(void) { } -static inline void printk_nmi_exit(void) { } -static inline void printk_nmi_direct_enter(void) { } -static inline void printk_nmi_direct_exit(void) { } -#endif /* PRINTK_NMI */ - struct dev_printk_info; #ifdef CONFIG_PRINTK @@ -180,6 +168,16 @@ int printk(const char *fmt, ...); */ __printf(1, 2) __cold int printk_deferred(const char *fmt, ...); +extern void __printk_safe_enter(void); +extern void __printk_safe_exit(void); +/* + * The printk_deferred_enter/exit macros are available only as a hack for + * some code paths that need to defer all printk console printing. Interrupts + * must be disabled for the deferred duration. + */ +#define printk_deferred_enter __printk_safe_enter +#define printk_deferred_exit __printk_safe_exit + /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -224,6 +222,15 @@ int printk_deferred(const char *s, ...) { return 0; } + +static inline void printk_deferred_enter(void) +{ +} + +static inline void printk_deferred_exit(void) +{ +} + static inline int printk_ratelimit(void) { return 0; diff --git a/init/Kconfig b/init/Kconfig index a61c92066c2e..9c0510693543 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1506,11 +1506,6 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. -config PRINTK_NMI - def_bool y - depends on PRINTK - depends on HAVE_NMI - config BUG bool "BUG() support" if EXPERT default y diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 6cc35c5de890..b6d310c72fc9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,12 +6,6 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 -#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 - -#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 - __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, @@ -19,8 +13,6 @@ int vprintk_store(int facility, int level, __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); -void __printk_safe_enter(void); -void __printk_safe_exit(void); bool printk_percpu_data_ready(void); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 29c580dac93d..ef0f9a2044da 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -4,12 +4,9 @@ */ #include -#include -#include #include #include #include -#include #include #include @@ -17,35 +14,6 @@ static DEFINE_PER_CPU(int, printk_context); -#ifdef CONFIG_PRINTK_NMI -void noinstr printk_nmi_enter(void) -{ - this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); -} - -void noinstr printk_nmi_exit(void) -{ - this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); -} - -/* - * Marks a code that might produce many messages in NMI context - * and the risk of losing them is more critical than eventual - * reordering. - */ -void printk_nmi_direct_enter(void) -{ - if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) - this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); -} - -void printk_nmi_direct_exit(void) -{ - this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); -} - -#endif /* CONFIG_PRINTK_NMI */ - /* Can be preempted by NMI. */ void __printk_safe_enter(void) { @@ -70,10 +38,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if (this_cpu_read(printk_context) & - (PRINTK_NMI_DIRECT_CONTEXT_MASK | - PRINTK_NMI_CONTEXT_MASK | - PRINTK_SAFE_CONTEXT_MASK)) { + if (this_cpu_read(printk_context) || in_nmi()) { int len; len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d23a09d3eb37..2f41311c61d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9647,7 +9647,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) tracing_off(); local_irq_save(flags); - printk_nmi_direct_enter(); /* Simulate the iterator */ trace_init_global_iter(&iter); @@ -9729,7 +9728,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } atomic_dec(&dump_running); - printk_nmi_direct_exit(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ftrace_dump); -- cgit v1.2.3-71-gd317 From b371cbb584d843bc4194d0cd4ce5ecd19b0cf55f Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:58 +0206 Subject: printk: convert @syslog_lock to mutex @syslog_lock was a raw_spin_lock to simplify the transition of removing @logbuf_lock and the safe buffers. With that transition complete, and since all uses of @syslog_lock are within sleepable contexts, @syslog_lock can become a mutex. Note that until now register_console() would disable interrupts using irqsave, which implies that it may be called with interrupts disabled. And indeed, there is one possible call chain on parisc where this happens: handle_interruption(code=1) /* High-priority machine check (HPMC) */ pdc_console_restart() pdc_console_init_force() register_console() However, register_console() calls console_lock(), which might sleep. So it has never been allowed to call register_console() from an atomic context and the above call chain is a bug. Note that the removal of read_syslog_seq_irq() is slightly changing the behavior of SYSLOG_ACTION_READ by testing against a possibly outdated @seq value. However, the value of @seq could have changed after the test, so it is not a new window. A follow-up commit closes this window. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-6-john.ogness@linutronix.de --- kernel/printk/printk.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 219ad710a9e8..86f1258d08b0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -356,7 +356,7 @@ enum log_flags { }; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ -static DEFINE_RAW_SPINLOCK(syslog_lock); +static DEFINE_MUTEX(syslog_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); @@ -1497,9 +1497,9 @@ static int syslog_print(char __user *buf, int size) size_t n; size_t skip; - raw_spin_lock_irq(&syslog_lock); + mutex_lock(&syslog_lock); if (!prb_read_valid(prb, syslog_seq, &r)) { - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); break; } if (r.info->seq != syslog_seq) { @@ -1528,7 +1528,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); if (!n) break; @@ -1592,9 +1592,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) } if (clear) { - raw_spin_lock_irq(&syslog_lock); + mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); } kfree(text); @@ -1603,21 +1603,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { - raw_spin_lock_irq(&syslog_lock); + mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); - raw_spin_unlock_irq(&syslog_lock); -} - -/* Return a consistent copy of @syslog_seq. */ -static u64 read_syslog_seq_irq(void) -{ - u64 seq; - - raw_spin_lock_irq(&syslog_lock); - seq = syslog_seq; - raw_spin_unlock_irq(&syslog_lock); - - return seq; + mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) @@ -1626,6 +1614,7 @@ int do_syslog(int type, char __user *buf, int len, int source) bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; + u64 seq; error = check_syslog_permissions(type, source); if (error) @@ -1644,8 +1633,12 @@ int do_syslog(int type, char __user *buf, int len, int source) if (!access_ok(buf, len)) return -EFAULT; - error = wait_event_interruptible(log_wait, - prb_read_valid(prb, read_syslog_seq_irq(), NULL)); + /* Get a consistent copy of @syslog_seq. */ + mutex_lock(&syslog_lock); + seq = syslog_seq; + mutex_unlock(&syslog_lock); + + error = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1693,10 +1686,10 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - raw_spin_lock_irq(&syslog_lock); + mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { @@ -1714,7 +1707,6 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; - u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { @@ -1724,7 +1716,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -2929,7 +2921,6 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) */ void register_console(struct console *newcon) { - unsigned long flags; struct console *bcon = NULL; int err; @@ -3034,9 +3025,9 @@ void register_console(struct console *newcon) exclusive_console_stop_seq = console_seq; /* Get a consistent copy of @syslog_seq. */ - raw_spin_lock_irqsave(&syslog_lock, flags); + mutex_lock(&syslog_lock); console_seq = syslog_seq; - raw_spin_unlock_irqrestore(&syslog_lock, flags); + mutex_unlock(&syslog_lock); } console_unlock(); console_sysfs_notify(); -- cgit v1.2.3-71-gd317 From 8d909b2333f37e5da84a9e6a2cbe21f52be5f42a Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:59 +0206 Subject: printk: syslog: close window between wait and read Syslog's SYSLOG_ACTION_READ is supposed to block until the next syslog record can be read, and then it should read that record. However, because @syslog_lock is not held between waking up and reading the record, another reader could read the record first, thus causing SYSLOG_ACTION_READ to return with a value of 0, never having read _anything_. By holding @syslog_lock between waking up and reading, it can be guaranteed that SYSLOG_ACTION_READ blocks until it successfully reads a syslog record (or a real error occurs). Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-7-john.ogness@linutronix.de --- kernel/printk/printk.c | 55 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 86f1258d08b0..65fffa6368c9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1480,12 +1480,14 @@ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, return seq; } +/* The caller is responsible for making sure @size is greater than 0. */ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; + u64 seq; text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) @@ -1493,15 +1495,35 @@ static int syslog_print(char __user *buf, int size) prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); - while (size > 0) { + mutex_lock(&syslog_lock); + + /* + * Wait for the @syslog_seq record to be available. @syslog_seq may + * change while waiting. + */ + do { + seq = syslog_seq; + + mutex_unlock(&syslog_lock); + len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); + mutex_lock(&syslog_lock); + + if (len) + goto out; + } while (syslog_seq != seq); + + /* + * Copy records that fit into the buffer. The above cycle makes sure + * that the first record is always available. + */ + do { size_t n; size_t skip; + int err; - mutex_lock(&syslog_lock); - if (!prb_read_valid(prb, syslog_seq, &r)) { - mutex_unlock(&syslog_lock); + if (!prb_read_valid(prb, syslog_seq, &r)) break; - } + if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; @@ -1528,12 +1550,15 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - mutex_unlock(&syslog_lock); if (!n) break; - if (copy_to_user(buf, text + skip, n)) { + mutex_unlock(&syslog_lock); + err = copy_to_user(buf, text + skip, n); + mutex_lock(&syslog_lock); + + if (err) { if (!len) len = -EFAULT; break; @@ -1542,8 +1567,9 @@ static int syslog_print(char __user *buf, int size) len += n; size -= n; buf += n; - } - + } while (size); +out: + mutex_unlock(&syslog_lock); kfree(text); return len; } @@ -1614,7 +1640,6 @@ int do_syslog(int type, char __user *buf, int len, int source) bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; - u64 seq; error = check_syslog_permissions(type, source); if (error) @@ -1632,15 +1657,6 @@ int do_syslog(int type, char __user *buf, int len, int source) return 0; if (!access_ok(buf, len)) return -EFAULT; - - /* Get a consistent copy of @syslog_seq. */ - mutex_lock(&syslog_lock); - seq = syslog_seq; - mutex_unlock(&syslog_lock); - - error = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); - if (error) - return error; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ @@ -1707,6 +1723,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; + u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { -- cgit v1.2.3-71-gd317 From 0f3adb8a1e5f36e792598c1d77a2cfac9c90a4f9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Jul 2021 10:18:26 -0400 Subject: cgroup/cpuset: Miscellaneous code cleanup Use more descriptive variable names for update_prstate(), remove unnecessary code and fix some typos. There is no functional change. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index adb5190c4429..f5fef5516d99 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1114,7 +1114,7 @@ enum subparts_cmd { * cpus_allowed can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transofrmed from a partition - * root back to a non-partition root. any CPUs in cpus_allowed that are in + * root back to a non-partition root. Any CPUs in cpus_allowed that are in * parent's subparts_cpus will be taken away from that cpumask and put back * into parent's effective_cpus. 0 should always be returned. * @@ -1225,7 +1225,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, /* * partcmd_update w/o newmask: * - * addmask = cpus_allowed & parent->effectiveb_cpus + * addmask = cpus_allowed & parent->effective_cpus * * Note that parent's subparts_cpus may have been * pre-shrunk in case there is a change in the cpu list. @@ -1365,12 +1365,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) case PRS_DISABLED: /* * If parent is not a partition root or an - * invalid partition root, clear the state - * state and the CS_CPU_EXCLUSIVE flag. + * invalid partition root, clear its state + * and its CS_CPU_EXCLUSIVE flag. */ WARN_ON_ONCE(cp->partition_root_state != PRS_ERROR); - cp->partition_root_state = 0; + cp->partition_root_state = PRS_DISABLED; /* * clear_bit() is an atomic operation and @@ -1937,30 +1937,28 @@ out: /* * update_prstate - update partititon_root_state - * cs: the cpuset to update - * val: 0 - disabled, 1 - enabled + * cs: the cpuset to update + * new_prs: new partition root state * * Call with cpuset_mutex held. */ -static int update_prstate(struct cpuset *cs, int val) +static int update_prstate(struct cpuset *cs, int new_prs) { int err; struct cpuset *parent = parent_cs(cs); - struct tmpmasks tmp; + struct tmpmasks tmpmask; - if ((val != 0) && (val != 1)) - return -EINVAL; - if (val == cs->partition_root_state) + if (new_prs == cs->partition_root_state) return 0; /* * Cannot force a partial or invalid partition root to a full * partition root. */ - if (val && cs->partition_root_state) + if (new_prs && (cs->partition_root_state < 0)) return -EINVAL; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; err = -EINVAL; @@ -1978,7 +1976,7 @@ static int update_prstate(struct cpuset *cs, int val) goto out; err = update_parent_subparts_cpumask(cs, partcmd_enable, - NULL, &tmp); + NULL, &tmpmask); if (err) { update_flag(CS_CPU_EXCLUSIVE, cs, 0); goto out; @@ -1990,18 +1988,18 @@ static int update_prstate(struct cpuset *cs, int val) * CS_CPU_EXCLUSIVE bit. */ if (cs->partition_root_state == PRS_ERROR) { - cs->partition_root_state = 0; + cs->partition_root_state = PRS_DISABLED; update_flag(CS_CPU_EXCLUSIVE, cs, 0); err = 0; goto out; } err = update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, &tmp); + NULL, &tmpmask); if (err) goto out; - cs->partition_root_state = 0; + cs->partition_root_state = PRS_DISABLED; /* Turning off CS_CPU_EXCLUSIVE will not return error */ update_flag(CS_CPU_EXCLUSIVE, cs, 0); @@ -2015,11 +2013,11 @@ static int update_prstate(struct cpuset *cs, int val) update_tasks_cpumask(parent); if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmp); + update_sibling_cpumasks(parent, cs, &tmpmask); rebuild_sched_domains_locked(); out: - free_cpumasks(NULL, &tmp); + free_cpumasks(NULL, &tmpmask); return err; } @@ -3060,7 +3058,7 @@ retry: goto retry; } - parent = parent_cs(cs); + parent = parent_cs(cs); compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); -- cgit v1.2.3-71-gd317 From 15d428e6fe77fffc3f4fff923336036f5496ef17 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Jul 2021 10:18:27 -0400 Subject: cgroup/cpuset: Fix a partition bug with hotplug In cpuset_hotplug_workfn(), the detection of whether the cpu list has been changed is done by comparing the effective cpus of the top cpuset with the cpu_active_mask. However, in the rare case that just all the CPUs in the subparts_cpus are offlined, the detection fails and the partition states are not updated correctly. Fix it by forcing the cpus_updated flag to true in this particular case. Fixes: 4b842da276a8 ("cpuset: Make CPU hotplug work with partition") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f5fef5516d99..28a784bf64b1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3166,6 +3166,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); + /* + * In the rare case that hotplug removes all the cpus in subparts_cpus, + * we assumed that cpus are updated. + */ + if (!cpus_updated && top_cpuset.nr_subparts_cpus) + cpus_updated = true; + /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { spin_lock_irq(&callback_lock); -- cgit v1.2.3-71-gd317 From 95f7f15461fa482a05237669507b4c9b06865b73 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Wed, 14 Jul 2021 11:26:20 +0530 Subject: kdb: Get rid of custom debug heap allocator Currently the only user for debug heap is kdbnearsym() which can be modified to rather use statically allocated buffer for symbol name as per it's current usage. So do that and hence remove custom debug heap allocator. Note that this change puts a restriction on kdbnearsym() callers to carefully use shared namebuf such that a caller should consume the symbol returned immediately prior to another call to fetch a different symbol. Also, this change uses standard KSYM_NAME_LEN macro for namebuf allocation instead of local variable: knt1_size which should avoid any conflicts caused by changes to KSYM_NAME_LEN macro value. This change has been tested using kgdbtest on arm64 which doesn't show any regressions. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210714055620.369915-1-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_debugger.c | 1 - kernel/debug/kdb/kdb_private.h | 5 - kernel/debug/kdb/kdb_support.c | 329 ++++------------------------------------ 3 files changed, 28 insertions(+), 307 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 0220afda3200..e91fc3e4edd5 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -140,7 +140,6 @@ int kdb_stub(struct kgdb_state *ks) */ kdb_common_deinit_state(); KDB_STATE_CLEAR(PAGER); - kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { if (KDB_STATE(DOING_KGDB)) KDB_STATE_CLEAR(DOING_KGDB); diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 170c69aedebb..8dbc840113c9 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -109,7 +109,6 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); extern int kdbgetsymval(const char *, kdb_symtab_t *); extern int kdbnearsym(unsigned long, kdb_symtab_t *); -extern void kdbnearsym_cleanup(void); extern char *kdb_strdup(const char *str, gfp_t type); extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int); @@ -233,10 +232,6 @@ extern struct task_struct *kdb_curr_task(int); #define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL) -extern void *debug_kmalloc(size_t size, gfp_t flags); -extern void debug_kfree(void *); -extern void debug_kusage(void); - extern struct task_struct *kdb_current_task; extern struct pt_regs *kdb_current_regs; diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 9f50d22d68e6..c605b17b2a0d 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -52,48 +52,48 @@ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) } EXPORT_SYMBOL(kdbgetsymval); -static char *kdb_name_table[100]; /* arbitrary size */ - -/* - * kdbnearsym - Return the name of the symbol with the nearest address - * less than 'addr'. +/** + * kdbnearsym() - Return the name of the symbol with the nearest address + * less than @addr. + * @addr: Address to check for near symbol + * @symtab: Structure to receive results * - * Parameters: - * addr Address to check for symbol near - * symtab Structure to receive results - * Returns: - * 0 No sections contain this address, symtab zero filled - * 1 Address mapped to module/symbol/section, data in symtab - * Remarks: - * 2.6 kallsyms has a "feature" where it unpacks the name into a - * string. If that string is reused before the caller expects it - * then the caller sees its string change without warning. To - * avoid cluttering up the main kdb code with lots of kdb_strdup, - * tests and kfree calls, kdbnearsym maintains an LRU list of the - * last few unique strings. The list is sized large enough to - * hold active strings, no kdb caller of kdbnearsym makes more - * than ~20 later calls before using a saved value. + * WARNING: This function may return a pointer to a single statically + * allocated buffer (namebuf). kdb's unusual calling context (single + * threaded, all other CPUs halted) provides us sufficient locking for + * this to be safe. The only constraint imposed by the static buffer is + * that the caller must consume any previous reply prior to another call + * to lookup a new symbol. + * + * Note that, strictly speaking, some architectures may re-enter the kdb + * trap if the system turns out to be very badly damaged and this breaks + * the single-threaded assumption above. In these circumstances successful + * continuation and exit from the inner trap is unlikely to work and any + * user attempting this receives a prominent warning before being allowed + * to progress. In these circumstances we remain memory safe because + * namebuf[KSYM_NAME_LEN-1] will never change from '\0' although we do + * tolerate the possibility of garbled symbol display from the outer kdb + * trap. + * + * Return: + * * 0 - No sections contain this address, symtab zero filled + * * 1 - Address mapped to module/symbol/section, data in symtab */ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) { int ret = 0; unsigned long symbolsize = 0; unsigned long offset = 0; -#define knt1_size 128 /* must be >= kallsyms table size */ - char *knt1 = NULL; + static char namebuf[KSYM_NAME_LEN]; kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab); memset(symtab, 0, sizeof(*symtab)); if (addr < 4096) goto out; - knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC); - if (!knt1) { - kdb_func_printf("addr=0x%lx cannot kmalloc knt1\n", addr); - goto out; - } + symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset, - (char **)(&symtab->mod_name), knt1); + (char **)(&symtab->mod_name), namebuf); if (offset > 8*1024*1024) { symtab->sym_name = NULL; addr = offset = symbolsize = 0; @@ -102,63 +102,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) symtab->sym_end = symtab->sym_start + symbolsize; ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0'; - if (ret) { - int i; - /* Another 2.6 kallsyms "feature". Sometimes the sym_name is - * set but the buffer passed into kallsyms_lookup is not used, - * so it contains garbage. The caller has to work out which - * buffer needs to be saved. - * - * What was Rusty smoking when he wrote that code? - */ - if (symtab->sym_name != knt1) { - strncpy(knt1, symtab->sym_name, knt1_size); - knt1[knt1_size-1] = '\0'; - } - for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { - if (kdb_name_table[i] && - strcmp(kdb_name_table[i], knt1) == 0) - break; - } - if (i >= ARRAY_SIZE(kdb_name_table)) { - debug_kfree(kdb_name_table[0]); - memmove(kdb_name_table, kdb_name_table+1, - sizeof(kdb_name_table[0]) * - (ARRAY_SIZE(kdb_name_table)-1)); - } else { - debug_kfree(knt1); - knt1 = kdb_name_table[i]; - memmove(kdb_name_table+i, kdb_name_table+i+1, - sizeof(kdb_name_table[0]) * - (ARRAY_SIZE(kdb_name_table)-i-1)); - } - i = ARRAY_SIZE(kdb_name_table) - 1; - kdb_name_table[i] = knt1; - symtab->sym_name = kdb_name_table[i]; - knt1 = NULL; - } - if (symtab->mod_name == NULL) symtab->mod_name = "kernel"; kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name); - out: - debug_kfree(knt1); return ret; } -void kdbnearsym_cleanup(void) -{ - int i; - for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { - if (kdb_name_table[i]) { - debug_kfree(kdb_name_table[i]); - kdb_name_table[i] = NULL; - } - } -} - static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1]; /* @@ -656,230 +607,6 @@ unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask) return (mask & kdb_task_state_string(state)) != 0; } -/* Last ditch allocator for debugging, so we can still debug even when - * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned - * for space usage, not for speed. One smallish memory pool, the free - * chain is always in ascending address order to allow coalescing, - * allocations are done in brute force best fit. - */ - -struct debug_alloc_header { - u32 next; /* offset of next header from start of pool */ - u32 size; - void *caller; -}; - -/* The memory returned by this allocator must be aligned, which means - * so must the header size. Do not assume that sizeof(struct - * debug_alloc_header) is a multiple of the alignment, explicitly - * calculate the overhead of this header, including the alignment. - * The rest of this code must not use sizeof() on any header or - * pointer to a header. - */ -#define dah_align 8 -#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align) - -static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */ -static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned; -static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max; - -/* Locking is awkward. The debug code is called from all contexts, - * including non maskable interrupts. A normal spinlock is not safe - * in NMI context. Try to get the debug allocator lock, if it cannot - * be obtained after a second then give up. If the lock could not be - * previously obtained on this cpu then only try once. - * - * sparse has no annotation for "this function _sometimes_ acquires a - * lock", so fudge the acquire/release notation. - */ -static DEFINE_SPINLOCK(dap_lock); -static int get_dap_lock(void) - __acquires(dap_lock) -{ - static int dap_locked = -1; - int count; - if (dap_locked == smp_processor_id()) - count = 1; - else - count = 1000; - while (1) { - if (spin_trylock(&dap_lock)) { - dap_locked = -1; - return 1; - } - if (!count--) - break; - udelay(1000); - } - dap_locked = smp_processor_id(); - __acquire(dap_lock); - return 0; -} - -void *debug_kmalloc(size_t size, gfp_t flags) -{ - unsigned int rem, h_offset; - struct debug_alloc_header *best, *bestprev, *prev, *h; - void *p = NULL; - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return NULL; - } - h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); - if (dah_first_call) { - h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead; - dah_first_call = 0; - } - size = ALIGN(size, dah_align); - prev = best = bestprev = NULL; - while (1) { - if (h->size >= size && (!best || h->size < best->size)) { - best = h; - bestprev = prev; - if (h->size == size) - break; - } - if (!h->next) - break; - prev = h; - h = (struct debug_alloc_header *)(debug_alloc_pool + h->next); - } - if (!best) - goto out; - rem = best->size - size; - /* The pool must always contain at least one header */ - if (best->next == 0 && bestprev == NULL && rem < dah_overhead) - goto out; - if (rem >= dah_overhead) { - best->size = size; - h_offset = ((char *)best - debug_alloc_pool) + - dah_overhead + best->size; - h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset); - h->size = rem - dah_overhead; - h->next = best->next; - } else - h_offset = best->next; - best->caller = __builtin_return_address(0); - dah_used += best->size; - dah_used_max = max(dah_used, dah_used_max); - if (bestprev) - bestprev->next = h_offset; - else - dah_first = h_offset; - p = (char *)best + dah_overhead; - memset(p, POISON_INUSE, best->size - 1); - *((char *)p + best->size - 1) = POISON_END; -out: - spin_unlock(&dap_lock); - return p; -} - -void debug_kfree(void *p) -{ - struct debug_alloc_header *h; - unsigned int h_offset; - if (!p) - return; - if ((char *)p < debug_alloc_pool || - (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) { - kfree(p); - return; - } - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return; /* memory leak, cannot be helped */ - } - h = (struct debug_alloc_header *)((char *)p - dah_overhead); - memset(p, POISON_FREE, h->size - 1); - *((char *)p + h->size - 1) = POISON_END; - h->caller = NULL; - dah_used -= h->size; - h_offset = (char *)h - debug_alloc_pool; - if (h_offset < dah_first) { - h->next = dah_first; - dah_first = h_offset; - } else { - struct debug_alloc_header *prev; - unsigned int prev_offset; - prev = (struct debug_alloc_header *)(debug_alloc_pool + - dah_first); - while (1) { - if (!prev->next || prev->next > h_offset) - break; - prev = (struct debug_alloc_header *) - (debug_alloc_pool + prev->next); - } - prev_offset = (char *)prev - debug_alloc_pool; - if (prev_offset + dah_overhead + prev->size == h_offset) { - prev->size += dah_overhead + h->size; - memset(h, POISON_FREE, dah_overhead - 1); - *((char *)h + dah_overhead - 1) = POISON_END; - h = prev; - h_offset = prev_offset; - } else { - h->next = prev->next; - prev->next = h_offset; - } - } - if (h_offset + dah_overhead + h->size == h->next) { - struct debug_alloc_header *next; - next = (struct debug_alloc_header *) - (debug_alloc_pool + h->next); - h->size += dah_overhead + next->size; - h->next = next->next; - memset(next, POISON_FREE, dah_overhead - 1); - *((char *)next + dah_overhead - 1) = POISON_END; - } - spin_unlock(&dap_lock); -} - -void debug_kusage(void) -{ - struct debug_alloc_header *h_free, *h_used; -#ifdef CONFIG_IA64 - /* FIXME: using dah for ia64 unwind always results in a memory leak. - * Fix that memory leak first, then set debug_kusage_one_time = 1 for - * all architectures. - */ - static int debug_kusage_one_time; -#else - static int debug_kusage_one_time = 1; -#endif - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return; - } - h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); - if (dah_first == 0 && - (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead || - dah_first_call)) - goto out; - if (!debug_kusage_one_time) - goto out; - debug_kusage_one_time = 0; - kdb_func_printf("debug_kmalloc memory leak dah_first %d\n", dah_first); - if (dah_first) { - h_used = (struct debug_alloc_header *)debug_alloc_pool; - kdb_func_printf("h_used %px size %d\n", h_used, h_used->size); - } - do { - h_used = (struct debug_alloc_header *) - ((char *)h_free + dah_overhead + h_free->size); - kdb_func_printf("h_used %px size %d caller %px\n", - h_used, h_used->size, h_used->caller); - h_free = (struct debug_alloc_header *) - (debug_alloc_pool + h_free->next); - } while (h_free->next); - h_used = (struct debug_alloc_header *) - ((char *)h_free + dah_overhead + h_free->size); - if ((char *)h_used - debug_alloc_pool != - sizeof(debug_alloc_pool_aligned)) - kdb_func_printf("h_used %px size %d caller %px\n", - h_used, h_used->size, h_used->caller); -out: - spin_unlock(&dap_lock); -} - /* Maintain a small stack of kdb_flags to allow recursion without disturbing * the global kdb state. */ -- cgit v1.2.3-71-gd317 From b39cded834154cf54442489b56b33d047edd6d8f Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:17 +0530 Subject: kdb: Rename struct defcmd_set to struct kdb_macro Rename struct defcmd_set to struct kdb_macro as that sounds more appropriate given its purpose. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-2-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d8ee5647b732..5cf9867fa118 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -654,7 +654,7 @@ static void kdb_cmderror(int diag) * Returns: * zero for success, a kdb diagnostic if error */ -struct defcmd_set { +struct kdb_macro { int count; bool usable; char *name; @@ -662,8 +662,8 @@ struct defcmd_set { char *help; char **command; }; -static struct defcmd_set *defcmd_set; -static int defcmd_set_count; +static struct kdb_macro *kdb_macro; +static int kdb_macro_count; static bool defcmd_in_progress; /* Forward references */ @@ -671,7 +671,7 @@ static int kdb_exec_defcmd(int argc, const char **argv); static int kdb_defcmd2(const char *cmdstr, const char *argv0) { - struct defcmd_set *s = defcmd_set + defcmd_set_count - 1; + struct kdb_macro *s = kdb_macro + kdb_macro_count - 1; char **save_command = s->command; if (strcmp(argv0, "endefcmd") == 0) { defcmd_in_progress = false; @@ -704,7 +704,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) static int kdb_defcmd(int argc, const char **argv) { - struct defcmd_set *save_defcmd_set = defcmd_set, *s; + struct kdb_macro *save_kdb_macro = kdb_macro, *s; if (defcmd_in_progress) { kdb_printf("kdb: nested defcmd detected, assuming missing " "endefcmd\n"); @@ -712,7 +712,7 @@ static int kdb_defcmd(int argc, const char **argv) } if (argc == 0) { int i; - for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) { + for (s = kdb_macro; s < kdb_macro + kdb_macro_count; ++s) { kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, s->usage, s->help); for (i = 0; i < s->count; ++i) @@ -727,13 +727,13 @@ static int kdb_defcmd(int argc, const char **argv) kdb_printf("Command only available during kdb_init()\n"); return KDB_NOTIMP; } - defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set), - GFP_KDB); - if (!defcmd_set) + kdb_macro = kmalloc_array(kdb_macro_count + 1, sizeof(*kdb_macro), + GFP_KDB); + if (!kdb_macro) goto fail_defcmd; - memcpy(defcmd_set, save_defcmd_set, - defcmd_set_count * sizeof(*defcmd_set)); - s = defcmd_set + defcmd_set_count; + memcpy(kdb_macro, save_kdb_macro, + kdb_macro_count * sizeof(*kdb_macro)); + s = kdb_macro + kdb_macro_count; memset(s, 0, sizeof(*s)); s->usable = true; s->name = kdb_strdup(argv[1], GFP_KDB); @@ -753,19 +753,19 @@ static int kdb_defcmd(int argc, const char **argv) strcpy(s->help, argv[3]+1); s->help[strlen(s->help)-1] = '\0'; } - ++defcmd_set_count; + ++kdb_macro_count; defcmd_in_progress = true; - kfree(save_defcmd_set); + kfree(save_kdb_macro); return 0; fail_help: kfree(s->usage); fail_usage: kfree(s->name); fail_name: - kfree(defcmd_set); + kfree(kdb_macro); fail_defcmd: - kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]); - defcmd_set = save_defcmd_set; + kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]); + kdb_macro = save_kdb_macro; return KDB_NOTIMP; } @@ -781,14 +781,14 @@ fail_defcmd: static int kdb_exec_defcmd(int argc, const char **argv) { int i, ret; - struct defcmd_set *s; + struct kdb_macro *s; if (argc != 0) return KDB_ARGCOUNT; - for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) { + for (s = kdb_macro, i = 0; i < kdb_macro_count; ++i, ++s) { if (strcmp(s->name, argv[0]) == 0) break; } - if (i == defcmd_set_count) { + if (i == kdb_macro_count) { kdb_printf("kdb_exec_defcmd: could not find commands for %s\n", argv[0]); return KDB_NOTIMP; -- cgit v1.2.3-71-gd317 From c25abcd625505f53b72dc156bac32b5120826742 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:18 +0530 Subject: kdb: Get rid of redundant kdb_register_flags() Commit e4f291b3f7bb ("kdb: Simplify kdb commands registration") allowed registration of pre-allocated kdb commands with pointer to struct kdbtab_t. Lets switch other users as well to register pre- allocated kdb commands via: - Changing prototype for kdb_register() to pass a pointer to struct kdbtab_t instead. - Embed kdbtab_t structure in kdb_macro_t rather than individual params. With these changes kdb_register_flags() becomes redundant and hence removed. Also, since we have switched all users to register pre-allocated commands, "is_dynamic" flag in struct kdbtab_t becomes redundant and hence removed as well. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Acked-by: Steven Rostedt (VMware) Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-3-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 27 ++++--- kernel/debug/kdb/kdb_main.c | 167 +++++++++++++---------------------------- kernel/debug/kdb/kdb_private.h | 13 ---- kernel/trace/trace_kdb.c | 12 ++- samples/kdb/kdb_hello.c | 20 +++-- 5 files changed, 88 insertions(+), 151 deletions(-) (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 0125a677b67f..de858edfb3b8 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -13,6 +13,8 @@ * Copyright (C) 2009 Jason Wessel */ +#include + /* Shifted versions of the command enable bits are be used if the command * has no arguments (see kdb_check_flags). This allows commands, such as * go, to have different permissions depending upon whether it is called @@ -64,6 +66,17 @@ typedef enum { typedef int (*kdb_func_t)(int, const char **); +/* The KDB shell command table */ +typedef struct _kdbtab { + char *cmd_name; /* Command name */ + kdb_func_t cmd_func; /* Function to execute command */ + char *cmd_usage; /* Usage String for this command */ + char *cmd_help; /* Help message for this command */ + short cmd_minlen; /* Minimum legal # cmd chars required */ + kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ + struct list_head list_node; /* Command list */ +} kdbtab_t; + #ifdef CONFIG_KGDB_KDB #include #include @@ -193,19 +206,13 @@ static inline const char *kdb_walk_kallsyms(loff_t *pos) #endif /* ! CONFIG_KALLSYMS */ /* Dynamic kdb shell command registration */ -extern int kdb_register(char *, kdb_func_t, char *, char *, short); -extern int kdb_register_flags(char *, kdb_func_t, char *, char *, - short, kdb_cmdflags_t); -extern int kdb_unregister(char *); +extern int kdb_register(kdbtab_t *cmd); +extern void kdb_unregister(kdbtab_t *cmd); #else /* ! CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; } static inline void kdb_init(int level) {} -static inline int kdb_register(char *cmd, kdb_func_t func, char *usage, - char *help, short minlen) { return 0; } -static inline int kdb_register_flags(char *cmd, kdb_func_t func, char *usage, - char *help, short minlen, - kdb_cmdflags_t flags) { return 0; } -static inline int kdb_unregister(char *cmd) { return 0; } +static inline int kdb_register(kdbtab_t *cmd) { return 0; } +static inline void kdb_unregister(kdbtab_t *cmd) {} #endif /* CONFIG_KGDB_KDB */ enum { KDB_NOT_INITIALIZED, diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5cf9867fa118..b2880fad26d4 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -657,9 +656,7 @@ static void kdb_cmderror(int diag) struct kdb_macro { int count; bool usable; - char *name; - char *usage; - char *help; + kdbtab_t cmd; char **command; }; static struct kdb_macro *kdb_macro; @@ -678,13 +675,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) if (!s->count) s->usable = false; if (s->usable) - /* macros are always safe because when executed each - * internal command re-enters kdb_parse() and is - * safety checked individually. - */ - kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, - s->help, 0, - KDB_ENABLE_ALWAYS_SAFE); + kdb_register(&s->cmd); return 0; } if (!s->usable) @@ -705,6 +696,8 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) static int kdb_defcmd(int argc, const char **argv) { struct kdb_macro *save_kdb_macro = kdb_macro, *s; + kdbtab_t *mp; + if (defcmd_in_progress) { kdb_printf("kdb: nested defcmd detected, assuming missing " "endefcmd\n"); @@ -713,8 +706,8 @@ static int kdb_defcmd(int argc, const char **argv) if (argc == 0) { int i; for (s = kdb_macro; s < kdb_macro + kdb_macro_count; ++s) { - kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, - s->usage, s->help); + kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->cmd.cmd_name, + s->cmd.cmd_usage, s->cmd.cmd_help); for (i = 0; i < s->count; ++i) kdb_printf("%s", s->command[i]); kdb_printf("endefcmd\n"); @@ -736,31 +729,36 @@ static int kdb_defcmd(int argc, const char **argv) s = kdb_macro + kdb_macro_count; memset(s, 0, sizeof(*s)); s->usable = true; - s->name = kdb_strdup(argv[1], GFP_KDB); - if (!s->name) + + mp = &s->cmd; + mp->cmd_func = kdb_exec_defcmd; + mp->cmd_minlen = 0; + mp->cmd_flags = KDB_ENABLE_ALWAYS_SAFE; + mp->cmd_name = kdb_strdup(argv[1], GFP_KDB); + if (!mp->cmd_name) goto fail_name; - s->usage = kdb_strdup(argv[2], GFP_KDB); - if (!s->usage) + mp->cmd_usage = kdb_strdup(argv[2], GFP_KDB); + if (!mp->cmd_usage) goto fail_usage; - s->help = kdb_strdup(argv[3], GFP_KDB); - if (!s->help) + mp->cmd_help = kdb_strdup(argv[3], GFP_KDB); + if (!mp->cmd_help) goto fail_help; - if (s->usage[0] == '"') { - strcpy(s->usage, argv[2]+1); - s->usage[strlen(s->usage)-1] = '\0'; + if (mp->cmd_usage[0] == '"') { + strcpy(mp->cmd_usage, argv[2]+1); + mp->cmd_usage[strlen(mp->cmd_usage)-1] = '\0'; } - if (s->help[0] == '"') { - strcpy(s->help, argv[3]+1); - s->help[strlen(s->help)-1] = '\0'; + if (mp->cmd_help[0] == '"') { + strcpy(mp->cmd_help, argv[3]+1); + mp->cmd_help[strlen(mp->cmd_help)-1] = '\0'; } ++kdb_macro_count; defcmd_in_progress = true; kfree(save_kdb_macro); return 0; fail_help: - kfree(s->usage); + kfree(mp->cmd_usage); fail_usage: - kfree(s->name); + kfree(mp->cmd_name); fail_name: kfree(kdb_macro); fail_defcmd: @@ -785,7 +783,7 @@ static int kdb_exec_defcmd(int argc, const char **argv) if (argc != 0) return KDB_ARGCOUNT; for (s = kdb_macro, i = 0; i < kdb_macro_count; ++i, ++s) { - if (strcmp(s->name, argv[0]) == 0) + if (strcmp(s->cmd.cmd_name, argv[0]) == 0) break; } if (i == kdb_macro_count) { @@ -797,7 +795,7 @@ static int kdb_exec_defcmd(int argc, const char **argv) /* Recursive use of kdb_parse, do not use argv after * this point */ argv = NULL; - kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]); + kdb_printf("[%s]kdb> %s\n", s->cmd.cmd_name, s->command[i]); ret = kdb_parse(s->command[i]); if (ret) return ret; @@ -2613,56 +2611,32 @@ static int kdb_grep_help(int argc, const char **argv) return 0; } -/* - * kdb_register_flags - This function is used to register a kernel - * debugger command. - * Inputs: - * cmd Command name - * func Function to execute the command - * usage A simple usage string showing arguments - * help A simple help string describing command - * repeat Does the command auto repeat on enter? - * Returns: - * zero for success, one if a duplicate command. +/** + * kdb_register() - This function is used to register a kernel debugger + * command. + * @cmd: pointer to kdb command + * + * Note that it's the job of the caller to keep the memory for the cmd + * allocated until unregister is called. */ -int kdb_register_flags(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen, - kdb_cmdflags_t flags) +int kdb_register(kdbtab_t *cmd) { kdbtab_t *kp; list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (strcmp(kp->cmd_name, cmd) == 0) { - kdb_printf("Duplicate kdb command registered: " - "%s, func %px help %s\n", cmd, func, help); + if (strcmp(kp->cmd_name, cmd->cmd_name) == 0) { + kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n", + cmd->cmd_name, cmd->cmd_func, cmd->cmd_help); return 1; } } - kp = kmalloc(sizeof(*kp), GFP_KDB); - if (!kp) { - kdb_printf("Could not allocate new kdb_command table\n"); - return 1; - } - - kp->cmd_name = cmd; - kp->cmd_func = func; - kp->cmd_usage = usage; - kp->cmd_help = help; - kp->cmd_minlen = minlen; - kp->cmd_flags = flags; - kp->is_dynamic = true; - - list_add_tail(&kp->list_node, &kdb_cmds_head); - + list_add_tail(&cmd->list_node, &kdb_cmds_head); return 0; } -EXPORT_SYMBOL_GPL(kdb_register_flags); +EXPORT_SYMBOL_GPL(kdb_register); -/* +/** * kdb_register_table() - This function is used to register a kdb command * table. * @kp: pointer to kdb command table @@ -2676,55 +2650,15 @@ void kdb_register_table(kdbtab_t *kp, size_t len) } } -/* - * kdb_register - Compatibility register function for commands that do - * not need to specify a repeat state. Equivalent to - * kdb_register_flags with flags set to 0. - * Inputs: - * cmd Command name - * func Function to execute the command - * usage A simple usage string showing arguments - * help A simple help string describing command - * Returns: - * zero for success, one if a duplicate command. - */ -int kdb_register(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen) -{ - return kdb_register_flags(cmd, func, usage, help, minlen, 0); -} -EXPORT_SYMBOL_GPL(kdb_register); - -/* - * kdb_unregister - This function is used to unregister a kernel - * debugger command. It is generally called when a module which - * implements kdb commands is unloaded. - * Inputs: - * cmd Command name - * Returns: - * zero for success, one command not registered. +/** + * kdb_unregister() - This function is used to unregister a kernel debugger + * command. It is generally called when a module which + * implements kdb command is unloaded. + * @cmd: pointer to kdb command */ -int kdb_unregister(char *cmd) +void kdb_unregister(kdbtab_t *cmd) { - kdbtab_t *kp; - - /* - * find the command. - */ - list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (strcmp(kp->cmd_name, cmd) == 0) { - list_del(&kp->list_node); - if (kp->is_dynamic) - kfree(kp); - return 0; - } - } - - /* Couldn't find it. */ - return 1; + list_del(&cmd->list_node); } EXPORT_SYMBOL_GPL(kdb_unregister); @@ -2900,6 +2834,11 @@ static kdbtab_t maintab[] = { .cmd_func = kdb_defcmd, .cmd_usage = "name \"usage\" \"help\"", .cmd_help = "Define a set of commands, down to endefcmd", + /* + * Macros are always safe because when executed each + * internal command re-enters kdb_parse() and is safety + * checked individually. + */ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, }, { .cmd_name = "kill", diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 8dbc840113c9..629590084a0d 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -164,19 +164,6 @@ typedef struct _kdb_bp { #ifdef CONFIG_KGDB_KDB extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */]; -/* The KDB shell command table */ -typedef struct _kdbtab { - char *cmd_name; /* Command name */ - kdb_func_t cmd_func; /* Function to execute command */ - char *cmd_usage; /* Usage String for this command */ - char *cmd_help; /* Help message for this command */ - short cmd_minlen; /* Minimum legal # command - * chars required */ - kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ - struct list_head list_node; /* Command list */ - bool is_dynamic; /* Command table allocation type */ -} kdbtab_t; - extern void kdb_register_table(kdbtab_t *kp, size_t len); extern int kdb_bt(int, const char **); /* KDB display back trace */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 9da76104f7a2..6c4f92c79e43 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -147,11 +147,17 @@ static int kdb_ftdump(int argc, const char **argv) return 0; } +static kdbtab_t ftdump_cmd = { + .cmd_name = "ftdump", + .cmd_func = kdb_ftdump, + .cmd_usage = "[skip_#entries] [cpu]", + .cmd_help = "Dump ftrace log; -skip dumps last #entries", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, +}; + static __init int kdb_ftrace_register(void) { - kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", - "Dump ftrace log; -skip dumps last #entries", 0, - KDB_ENABLE_ALWAYS_SAFE); + kdb_register(&ftdump_cmd); return 0; } diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c index c1c2fa0f62c2..9ad514a6648b 100644 --- a/samples/kdb/kdb_hello.c +++ b/samples/kdb/kdb_hello.c @@ -28,28 +28,26 @@ static int kdb_hello_cmd(int argc, const char **argv) return 0; } +static kdbtab_t hello_cmd = { + .cmd_name = "hello", + .cmd_func = kdb_hello_cmd, + .cmd_usage = "[string]", + .cmd_help = "Say Hello World or Hello [string]", +}; static int __init kdb_hello_cmd_init(void) { /* * Registration of a dynamically added kdb command is done with - * kdb_register() with the arguments being: - * 1: The name of the shell command - * 2: The function that processes the command - * 3: Description of the usage of any arguments - * 4: Descriptive text when you run help - * 5: Number of characters to complete the command - * 0 == type the whole command - * 1 == match both "g" and "go" for example + * kdb_register(). */ - kdb_register("hello", kdb_hello_cmd, "[string]", - "Say Hello World or Hello [string]", 0); + kdb_register(&hello_cmd); return 0; } static void __exit kdb_hello_cmd_exit(void) { - kdb_unregister("hello"); + kdb_unregister(&hello_cmd); } module_init(kdb_hello_cmd_init); -- cgit v1.2.3-71-gd317 From 9a5db530aa7d98b10c4f5104027565c98cca49e6 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:19 +0530 Subject: kdb: Simplify kdb_defcmd macro logic Switch to use a linked list instead of dynamic array which makes allocation of kdb macro and traversing the kdb macro commands list simpler. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-4-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 107 ++++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b2880fad26d4..7c7a2ef834fc 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -654,13 +654,16 @@ static void kdb_cmderror(int diag) * zero for success, a kdb diagnostic if error */ struct kdb_macro { - int count; - bool usable; - kdbtab_t cmd; - char **command; + kdbtab_t cmd; /* Macro command */ + struct list_head statements; /* Associated statement list */ }; + +struct kdb_macro_statement { + char *statement; /* Statement text */ + struct list_head list_node; /* Statement list node */ +}; + static struct kdb_macro *kdb_macro; -static int kdb_macro_count; static bool defcmd_in_progress; /* Forward references */ @@ -668,34 +671,33 @@ static int kdb_exec_defcmd(int argc, const char **argv); static int kdb_defcmd2(const char *cmdstr, const char *argv0) { - struct kdb_macro *s = kdb_macro + kdb_macro_count - 1; - char **save_command = s->command; + struct kdb_macro_statement *kms; + + if (!kdb_macro) + return KDB_NOTIMP; + if (strcmp(argv0, "endefcmd") == 0) { defcmd_in_progress = false; - if (!s->count) - s->usable = false; - if (s->usable) - kdb_register(&s->cmd); + if (!list_empty(&kdb_macro->statements)) + kdb_register(&kdb_macro->cmd); return 0; } - if (!s->usable) - return KDB_NOTIMP; - s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB); - if (!s->command) { - kdb_printf("Could not allocate new kdb_defcmd table for %s\n", + + kms = kmalloc(sizeof(*kms), GFP_KDB); + if (!kms) { + kdb_printf("Could not allocate new kdb macro command: %s\n", cmdstr); - s->usable = false; return KDB_NOTIMP; } - memcpy(s->command, save_command, s->count * sizeof(*(s->command))); - s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB); - kfree(save_command); + + kms->statement = kdb_strdup(cmdstr, GFP_KDB); + list_add_tail(&kms->list_node, &kdb_macro->statements); + return 0; } static int kdb_defcmd(int argc, const char **argv) { - struct kdb_macro *save_kdb_macro = kdb_macro, *s; kdbtab_t *mp; if (defcmd_in_progress) { @@ -704,13 +706,21 @@ static int kdb_defcmd(int argc, const char **argv) kdb_defcmd2("endefcmd", "endefcmd"); } if (argc == 0) { - int i; - for (s = kdb_macro; s < kdb_macro + kdb_macro_count; ++s) { - kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->cmd.cmd_name, - s->cmd.cmd_usage, s->cmd.cmd_help); - for (i = 0; i < s->count; ++i) - kdb_printf("%s", s->command[i]); - kdb_printf("endefcmd\n"); + kdbtab_t *kp; + struct kdb_macro *kmp; + struct kdb_macro_statement *kms; + + list_for_each_entry(kp, &kdb_cmds_head, list_node) { + if (kp->cmd_func == kdb_exec_defcmd) { + kdb_printf("defcmd %s \"%s\" \"%s\"\n", + kp->cmd_name, kp->cmd_usage, + kp->cmd_help); + kmp = container_of(kp, struct kdb_macro, cmd); + list_for_each_entry(kms, &kmp->statements, + list_node) + kdb_printf("%s", kms->statement); + kdb_printf("endefcmd\n"); + } } return 0; } @@ -720,17 +730,11 @@ static int kdb_defcmd(int argc, const char **argv) kdb_printf("Command only available during kdb_init()\n"); return KDB_NOTIMP; } - kdb_macro = kmalloc_array(kdb_macro_count + 1, sizeof(*kdb_macro), - GFP_KDB); + kdb_macro = kzalloc(sizeof(*kdb_macro), GFP_KDB); if (!kdb_macro) goto fail_defcmd; - memcpy(kdb_macro, save_kdb_macro, - kdb_macro_count * sizeof(*kdb_macro)); - s = kdb_macro + kdb_macro_count; - memset(s, 0, sizeof(*s)); - s->usable = true; - mp = &s->cmd; + mp = &kdb_macro->cmd; mp->cmd_func = kdb_exec_defcmd; mp->cmd_minlen = 0; mp->cmd_flags = KDB_ENABLE_ALWAYS_SAFE; @@ -751,9 +755,9 @@ static int kdb_defcmd(int argc, const char **argv) strcpy(mp->cmd_help, argv[3]+1); mp->cmd_help[strlen(mp->cmd_help)-1] = '\0'; } - ++kdb_macro_count; + + INIT_LIST_HEAD(&kdb_macro->statements); defcmd_in_progress = true; - kfree(save_kdb_macro); return 0; fail_help: kfree(mp->cmd_usage); @@ -763,7 +767,6 @@ fail_name: kfree(kdb_macro); fail_defcmd: kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]); - kdb_macro = save_kdb_macro; return KDB_NOTIMP; } @@ -778,25 +781,31 @@ fail_defcmd: */ static int kdb_exec_defcmd(int argc, const char **argv) { - int i, ret; - struct kdb_macro *s; + int ret; + kdbtab_t *kp; + struct kdb_macro *kmp; + struct kdb_macro_statement *kms; + if (argc != 0) return KDB_ARGCOUNT; - for (s = kdb_macro, i = 0; i < kdb_macro_count; ++i, ++s) { - if (strcmp(s->cmd.cmd_name, argv[0]) == 0) + + list_for_each_entry(kp, &kdb_cmds_head, list_node) { + if (strcmp(kp->cmd_name, argv[0]) == 0) break; } - if (i == kdb_macro_count) { + if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) { kdb_printf("kdb_exec_defcmd: could not find commands for %s\n", argv[0]); return KDB_NOTIMP; } - for (i = 0; i < s->count; ++i) { - /* Recursive use of kdb_parse, do not use argv after - * this point */ + kmp = container_of(kp, struct kdb_macro, cmd); + list_for_each_entry(kms, &kmp->statements, list_node) { + /* + * Recursive use of kdb_parse, do not use argv after this point. + */ argv = NULL; - kdb_printf("[%s]kdb> %s\n", s->cmd.cmd_name, s->command[i]); - ret = kdb_parse(s->command[i]); + kdb_printf("[%s]kdb> %s\n", kmp->cmd.cmd_name, kms->statement); + ret = kdb_parse(kms->statement); if (ret) return ret; } -- cgit v1.2.3-71-gd317 From e868f0a3c4b9c1d7721f08b703142a876814a3f8 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:20 +0530 Subject: kdb: Rename members of struct kdbtab_t Remove redundant prefix "cmd_" from name of members in struct kdbtab_t for better readibility. Suggested-by: Doug Anderson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-5-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 12 +- kernel/debug/kdb/kdb_bp.c | 72 ++++---- kernel/debug/kdb/kdb_main.c | 404 ++++++++++++++++++++++---------------------- kernel/trace/trace_kdb.c | 10 +- samples/kdb/kdb_hello.c | 8 +- 5 files changed, 252 insertions(+), 254 deletions(-) (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index de858edfb3b8..ea0f5e580fac 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -68,12 +68,12 @@ typedef int (*kdb_func_t)(int, const char **); /* The KDB shell command table */ typedef struct _kdbtab { - char *cmd_name; /* Command name */ - kdb_func_t cmd_func; /* Function to execute command */ - char *cmd_usage; /* Usage String for this command */ - char *cmd_help; /* Help message for this command */ - short cmd_minlen; /* Minimum legal # cmd chars required */ - kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ + char *name; /* Command name */ + kdb_func_t func; /* Function to execute command */ + char *usage; /* Usage String for this command */ + char *help; /* Help message for this command */ + short minlen; /* Minimum legal # cmd chars required */ + kdb_cmdflags_t flags; /* Command behaviour flags */ struct list_head list_node; /* Command list */ } kdbtab_t; diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 2168f8dacb99..372025cf1ca3 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -523,51 +523,51 @@ static int kdb_ss(int argc, const char **argv) } static kdbtab_t bptab[] = { - { .cmd_name = "bp", - .cmd_func = kdb_bp, - .cmd_usage = "[]", - .cmd_help = "Set/Display breakpoints", - .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + { .name = "bp", + .func = kdb_bp, + .usage = "[]", + .help = "Set/Display breakpoints", + .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "bl", - .cmd_func = kdb_bp, - .cmd_usage = "[]", - .cmd_help = "Display breakpoints", - .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + { .name = "bl", + .func = kdb_bp, + .usage = "[]", + .help = "Display breakpoints", + .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "bc", - .cmd_func = kdb_bc, - .cmd_usage = "", - .cmd_help = "Clear Breakpoint", - .cmd_flags = KDB_ENABLE_FLOW_CTRL, + { .name = "bc", + .func = kdb_bc, + .usage = "", + .help = "Clear Breakpoint", + .flags = KDB_ENABLE_FLOW_CTRL, }, - { .cmd_name = "be", - .cmd_func = kdb_bc, - .cmd_usage = "", - .cmd_help = "Enable Breakpoint", - .cmd_flags = KDB_ENABLE_FLOW_CTRL, + { .name = "be", + .func = kdb_bc, + .usage = "", + .help = "Enable Breakpoint", + .flags = KDB_ENABLE_FLOW_CTRL, }, - { .cmd_name = "bd", - .cmd_func = kdb_bc, - .cmd_usage = "", - .cmd_help = "Disable Breakpoint", - .cmd_flags = KDB_ENABLE_FLOW_CTRL, + { .name = "bd", + .func = kdb_bc, + .usage = "", + .help = "Disable Breakpoint", + .flags = KDB_ENABLE_FLOW_CTRL, }, - { .cmd_name = "ss", - .cmd_func = kdb_ss, - .cmd_usage = "", - .cmd_help = "Single Step", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + { .name = "ss", + .func = kdb_ss, + .usage = "", + .help = "Single Step", + .minlen = 1, + .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, }, }; static kdbtab_t bphcmd = { - .cmd_name = "bph", - .cmd_func = kdb_bp, - .cmd_usage = "[]", - .cmd_help = "[datar [length]|dataw [length]] Set hw brk", - .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + .name = "bph", + .func = kdb_bp, + .usage = "[]", + .help = "[datar [length]|dataw [length]] Set hw brk", + .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, }; /* Initialize the breakpoint table and register breakpoint commands. */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 7c7a2ef834fc..fa6deda894a1 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -711,10 +711,9 @@ static int kdb_defcmd(int argc, const char **argv) struct kdb_macro_statement *kms; list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (kp->cmd_func == kdb_exec_defcmd) { + if (kp->func == kdb_exec_defcmd) { kdb_printf("defcmd %s \"%s\" \"%s\"\n", - kp->cmd_name, kp->cmd_usage, - kp->cmd_help); + kp->name, kp->usage, kp->help); kmp = container_of(kp, struct kdb_macro, cmd); list_for_each_entry(kms, &kmp->statements, list_node) @@ -735,34 +734,34 @@ static int kdb_defcmd(int argc, const char **argv) goto fail_defcmd; mp = &kdb_macro->cmd; - mp->cmd_func = kdb_exec_defcmd; - mp->cmd_minlen = 0; - mp->cmd_flags = KDB_ENABLE_ALWAYS_SAFE; - mp->cmd_name = kdb_strdup(argv[1], GFP_KDB); - if (!mp->cmd_name) + mp->func = kdb_exec_defcmd; + mp->minlen = 0; + mp->flags = KDB_ENABLE_ALWAYS_SAFE; + mp->name = kdb_strdup(argv[1], GFP_KDB); + if (!mp->name) goto fail_name; - mp->cmd_usage = kdb_strdup(argv[2], GFP_KDB); - if (!mp->cmd_usage) + mp->usage = kdb_strdup(argv[2], GFP_KDB); + if (!mp->usage) goto fail_usage; - mp->cmd_help = kdb_strdup(argv[3], GFP_KDB); - if (!mp->cmd_help) + mp->help = kdb_strdup(argv[3], GFP_KDB); + if (!mp->help) goto fail_help; - if (mp->cmd_usage[0] == '"') { - strcpy(mp->cmd_usage, argv[2]+1); - mp->cmd_usage[strlen(mp->cmd_usage)-1] = '\0'; + if (mp->usage[0] == '"') { + strcpy(mp->usage, argv[2]+1); + mp->usage[strlen(mp->usage)-1] = '\0'; } - if (mp->cmd_help[0] == '"') { - strcpy(mp->cmd_help, argv[3]+1); - mp->cmd_help[strlen(mp->cmd_help)-1] = '\0'; + if (mp->help[0] == '"') { + strcpy(mp->help, argv[3]+1); + mp->help[strlen(mp->help)-1] = '\0'; } INIT_LIST_HEAD(&kdb_macro->statements); defcmd_in_progress = true; return 0; fail_help: - kfree(mp->cmd_usage); + kfree(mp->usage); fail_usage: - kfree(mp->cmd_name); + kfree(mp->name); fail_name: kfree(kdb_macro); fail_defcmd: @@ -790,7 +789,7 @@ static int kdb_exec_defcmd(int argc, const char **argv) return KDB_ARGCOUNT; list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (strcmp(kp->cmd_name, argv[0]) == 0) + if (strcmp(kp->name, argv[0]) == 0) break; } if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) { @@ -804,7 +803,7 @@ static int kdb_exec_defcmd(int argc, const char **argv) * Recursive use of kdb_parse, do not use argv after this point. */ argv = NULL; - kdb_printf("[%s]kdb> %s\n", kmp->cmd.cmd_name, kms->statement); + kdb_printf("[%s]kdb> %s\n", kmp->cmd.name, kms->statement); ret = kdb_parse(kms->statement); if (ret) return ret; @@ -1016,11 +1015,11 @@ int kdb_parse(const char *cmdstr) * If this command is allowed to be abbreviated, * check to see if this is it. */ - if (tp->cmd_minlen && (strlen(argv[0]) <= tp->cmd_minlen) && - (strncmp(argv[0], tp->cmd_name, tp->cmd_minlen) == 0)) + if (tp->minlen && (strlen(argv[0]) <= tp->minlen) && + (strncmp(argv[0], tp->name, tp->minlen) == 0)) break; - if (strcmp(argv[0], tp->cmd_name) == 0) + if (strcmp(argv[0], tp->name) == 0) break; } @@ -1031,8 +1030,7 @@ int kdb_parse(const char *cmdstr) */ if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) { list_for_each_entry(tp, &kdb_cmds_head, list_node) { - if (strncmp(argv[0], tp->cmd_name, - strlen(tp->cmd_name)) == 0) + if (strncmp(argv[0], tp->name, strlen(tp->name)) == 0) break; } } @@ -1040,19 +1038,19 @@ int kdb_parse(const char *cmdstr) if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) { int result; - if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) + if (!kdb_check_flags(tp->flags, kdb_cmd_enabled, argc <= 1)) return KDB_NOPERM; KDB_STATE_SET(CMD); - result = (*tp->cmd_func)(argc-1, (const char **)argv); + result = (*tp->func)(argc-1, (const char **)argv); if (result && ignore_errors && result > KDB_CMD_GO) result = 0; KDB_STATE_CLEAR(CMD); - if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) + if (tp->flags & KDB_REPEAT_WITH_ARGS) return result; - argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; + argc = tp->flags & KDB_REPEAT_NO_ARGS ? 1 : 0; if (argv[argc]) *(argv[argc]) = '\0'; return result; @@ -2419,12 +2417,12 @@ static int kdb_help(int argc, const char **argv) char *space = ""; if (KDB_FLAG(CMD_INTERRUPT)) return 0; - if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) + if (!kdb_check_flags(kt->flags, kdb_cmd_enabled, true)) continue; - if (strlen(kt->cmd_usage) > 20) + if (strlen(kt->usage) > 20) space = "\n "; - kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, - kt->cmd_usage, space, kt->cmd_help); + kdb_printf("%-15.15s %-20s%s%s\n", kt->name, + kt->usage, space, kt->help); } return 0; } @@ -2633,9 +2631,9 @@ int kdb_register(kdbtab_t *cmd) kdbtab_t *kp; list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (strcmp(kp->cmd_name, cmd->cmd_name) == 0) { + if (strcmp(kp->name, cmd->name) == 0) { kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n", - cmd->cmd_name, cmd->cmd_func, cmd->cmd_help); + cmd->name, cmd->func, cmd->help); return 1; } } @@ -2672,218 +2670,218 @@ void kdb_unregister(kdbtab_t *cmd) EXPORT_SYMBOL_GPL(kdb_unregister); static kdbtab_t maintab[] = { - { .cmd_name = "md", - .cmd_func = kdb_md, - .cmd_usage = "", - .cmd_help = "Display Memory Contents, also mdWcN, e.g. md8c1", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + { .name = "md", + .func = kdb_md, + .usage = "", + .help = "Display Memory Contents, also mdWcN, e.g. md8c1", + .minlen = 1, + .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "mdr", - .cmd_func = kdb_md, - .cmd_usage = " ", - .cmd_help = "Display Raw Memory", - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + { .name = "mdr", + .func = kdb_md, + .usage = " ", + .help = "Display Raw Memory", + .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "mdp", - .cmd_func = kdb_md, - .cmd_usage = " ", - .cmd_help = "Display Physical Memory", - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + { .name = "mdp", + .func = kdb_md, + .usage = " ", + .help = "Display Physical Memory", + .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "mds", - .cmd_func = kdb_md, - .cmd_usage = "", - .cmd_help = "Display Memory Symbolically", - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + { .name = "mds", + .func = kdb_md, + .usage = "", + .help = "Display Memory Symbolically", + .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "mm", - .cmd_func = kdb_mm, - .cmd_usage = " ", - .cmd_help = "Modify Memory Contents", - .cmd_flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS, + { .name = "mm", + .func = kdb_mm, + .usage = " ", + .help = "Modify Memory Contents", + .flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "go", - .cmd_func = kdb_go, - .cmd_usage = "[]", - .cmd_help = "Continue Execution", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_REG_WRITE | + { .name = "go", + .func = kdb_go, + .usage = "[]", + .help = "Continue Execution", + .minlen = 1, + .flags = KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS, }, - { .cmd_name = "rd", - .cmd_func = kdb_rd, - .cmd_usage = "", - .cmd_help = "Display Registers", - .cmd_flags = KDB_ENABLE_REG_READ, + { .name = "rd", + .func = kdb_rd, + .usage = "", + .help = "Display Registers", + .flags = KDB_ENABLE_REG_READ, }, - { .cmd_name = "rm", - .cmd_func = kdb_rm, - .cmd_usage = " ", - .cmd_help = "Modify Registers", - .cmd_flags = KDB_ENABLE_REG_WRITE, + { .name = "rm", + .func = kdb_rm, + .usage = " ", + .help = "Modify Registers", + .flags = KDB_ENABLE_REG_WRITE, }, - { .cmd_name = "ef", - .cmd_func = kdb_ef, - .cmd_usage = "", - .cmd_help = "Display exception frame", - .cmd_flags = KDB_ENABLE_MEM_READ, + { .name = "ef", + .func = kdb_ef, + .usage = "", + .help = "Display exception frame", + .flags = KDB_ENABLE_MEM_READ, }, - { .cmd_name = "bt", - .cmd_func = kdb_bt, - .cmd_usage = "[]", - .cmd_help = "Stack traceback", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, + { .name = "bt", + .func = kdb_bt, + .usage = "[]", + .help = "Stack traceback", + .minlen = 1, + .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, }, - { .cmd_name = "btp", - .cmd_func = kdb_bt, - .cmd_usage = "", - .cmd_help = "Display stack for process ", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "btp", + .func = kdb_bt, + .usage = "", + .help = "Display stack for process ", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "bta", - .cmd_func = kdb_bt, - .cmd_usage = "[D|R|S|T|C|Z|E|U|I|M|A]", - .cmd_help = "Backtrace all processes matching state flag", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "bta", + .func = kdb_bt, + .usage = "[D|R|S|T|C|Z|E|U|I|M|A]", + .help = "Backtrace all processes matching state flag", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "btc", - .cmd_func = kdb_bt, - .cmd_usage = "", - .cmd_help = "Backtrace current process on each cpu", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "btc", + .func = kdb_bt, + .usage = "", + .help = "Backtrace current process on each cpu", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "btt", - .cmd_func = kdb_bt, - .cmd_usage = "", - .cmd_help = "Backtrace process given its struct task address", - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, + { .name = "btt", + .func = kdb_bt, + .usage = "", + .help = "Backtrace process given its struct task address", + .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, }, - { .cmd_name = "env", - .cmd_func = kdb_env, - .cmd_usage = "", - .cmd_help = "Show environment variables", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "env", + .func = kdb_env, + .usage = "", + .help = "Show environment variables", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "set", - .cmd_func = kdb_set, - .cmd_usage = "", - .cmd_help = "Set environment variables", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "set", + .func = kdb_set, + .usage = "", + .help = "Set environment variables", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "help", - .cmd_func = kdb_help, - .cmd_usage = "", - .cmd_help = "Display Help Message", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "help", + .func = kdb_help, + .usage = "", + .help = "Display Help Message", + .minlen = 1, + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "?", - .cmd_func = kdb_help, - .cmd_usage = "", - .cmd_help = "Display Help Message", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "?", + .func = kdb_help, + .usage = "", + .help = "Display Help Message", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "cpu", - .cmd_func = kdb_cpu, - .cmd_usage = "", - .cmd_help = "Switch to new cpu", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS, + { .name = "cpu", + .func = kdb_cpu, + .usage = "", + .help = "Switch to new cpu", + .flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS, }, - { .cmd_name = "kgdb", - .cmd_func = kdb_kgdb, - .cmd_usage = "", - .cmd_help = "Enter kgdb mode", - .cmd_flags = 0, + { .name = "kgdb", + .func = kdb_kgdb, + .usage = "", + .help = "Enter kgdb mode", + .flags = 0, }, - { .cmd_name = "ps", - .cmd_func = kdb_ps, - .cmd_usage = "[|A]", - .cmd_help = "Display active task list", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "ps", + .func = kdb_ps, + .usage = "[|A]", + .help = "Display active task list", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "pid", - .cmd_func = kdb_pid, - .cmd_usage = "", - .cmd_help = "Switch to another task", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "pid", + .func = kdb_pid, + .usage = "", + .help = "Switch to another task", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "reboot", - .cmd_func = kdb_reboot, - .cmd_usage = "", - .cmd_help = "Reboot the machine immediately", - .cmd_flags = KDB_ENABLE_REBOOT, + { .name = "reboot", + .func = kdb_reboot, + .usage = "", + .help = "Reboot the machine immediately", + .flags = KDB_ENABLE_REBOOT, }, #if defined(CONFIG_MODULES) - { .cmd_name = "lsmod", - .cmd_func = kdb_lsmod, - .cmd_usage = "", - .cmd_help = "List loaded kernel modules", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "lsmod", + .func = kdb_lsmod, + .usage = "", + .help = "List loaded kernel modules", + .flags = KDB_ENABLE_INSPECT, }, #endif #if defined(CONFIG_MAGIC_SYSRQ) - { .cmd_name = "sr", - .cmd_func = kdb_sr, - .cmd_usage = "", - .cmd_help = "Magic SysRq key", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "sr", + .func = kdb_sr, + .usage = "", + .help = "Magic SysRq key", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, #endif #if defined(CONFIG_PRINTK) - { .cmd_name = "dmesg", - .cmd_func = kdb_dmesg, - .cmd_usage = "[lines]", - .cmd_help = "Display syslog buffer", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "dmesg", + .func = kdb_dmesg, + .usage = "[lines]", + .help = "Display syslog buffer", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, #endif - { .cmd_name = "defcmd", - .cmd_func = kdb_defcmd, - .cmd_usage = "name \"usage\" \"help\"", - .cmd_help = "Define a set of commands, down to endefcmd", + { .name = "defcmd", + .func = kdb_defcmd, + .usage = "name \"usage\" \"help\"", + .help = "Define a set of commands, down to endefcmd", /* * Macros are always safe because when executed each * internal command re-enters kdb_parse() and is safety * checked individually. */ - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "kill", - .cmd_func = kdb_kill, - .cmd_usage = "<-signal> ", - .cmd_help = "Send a signal to a process", - .cmd_flags = KDB_ENABLE_SIGNAL, + { .name = "kill", + .func = kdb_kill, + .usage = "<-signal> ", + .help = "Send a signal to a process", + .flags = KDB_ENABLE_SIGNAL, }, - { .cmd_name = "summary", - .cmd_func = kdb_summary, - .cmd_usage = "", - .cmd_help = "Summarize the system", - .cmd_minlen = 4, - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "summary", + .func = kdb_summary, + .usage = "", + .help = "Summarize the system", + .minlen = 4, + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "per_cpu", - .cmd_func = kdb_per_cpu, - .cmd_usage = " [] []", - .cmd_help = "Display per_cpu variables", - .cmd_minlen = 3, - .cmd_flags = KDB_ENABLE_MEM_READ, + { .name = "per_cpu", + .func = kdb_per_cpu, + .usage = " [] []", + .help = "Display per_cpu variables", + .minlen = 3, + .flags = KDB_ENABLE_MEM_READ, }, - { .cmd_name = "grephelp", - .cmd_func = kdb_grep_help, - .cmd_usage = "", - .cmd_help = "Display help on | grep", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "grephelp", + .func = kdb_grep_help, + .usage = "", + .help = "Display help on | grep", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, }; static kdbtab_t nmicmd = { - .cmd_name = "disable_nmi", - .cmd_func = kdb_disable_nmi, - .cmd_usage = "", - .cmd_help = "Disable NMI entry to KDB", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + .name = "disable_nmi", + .func = kdb_disable_nmi, + .usage = "", + .help = "Disable NMI entry to KDB", + .flags = KDB_ENABLE_ALWAYS_SAFE, }; /* Initialize the kdb command table. */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 6c4f92c79e43..59857a1ee44c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -148,11 +148,11 @@ static int kdb_ftdump(int argc, const char **argv) } static kdbtab_t ftdump_cmd = { - .cmd_name = "ftdump", - .cmd_func = kdb_ftdump, - .cmd_usage = "[skip_#entries] [cpu]", - .cmd_help = "Dump ftrace log; -skip dumps last #entries", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + .name = "ftdump", + .func = kdb_ftdump, + .usage = "[skip_#entries] [cpu]", + .help = "Dump ftrace log; -skip dumps last #entries", + .flags = KDB_ENABLE_ALWAYS_SAFE, }; static __init int kdb_ftrace_register(void) diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c index 9ad514a6648b..82736e5a5e32 100644 --- a/samples/kdb/kdb_hello.c +++ b/samples/kdb/kdb_hello.c @@ -29,10 +29,10 @@ static int kdb_hello_cmd(int argc, const char **argv) } static kdbtab_t hello_cmd = { - .cmd_name = "hello", - .cmd_func = kdb_hello_cmd, - .cmd_usage = "[string]", - .cmd_help = "Say Hello World or Hello [string]", + .name = "hello", + .func = kdb_hello_cmd, + .usage = "[string]", + .help = "Say Hello World or Hello [string]", }; static int __init kdb_hello_cmd_init(void) -- cgit v1.2.3-71-gd317 From 25f6fa53a07422e2bb004229eefd32760c469fb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 May 2021 17:04:57 -0700 Subject: refscale: Add measurement of clock readout This commit adds a "clock" type to refscale, which checks the performance of ktime_get_real_fast_ns(). Use the "clocksource=" kernel boot parameter to select the underlying clock source. [ paulmck: Work around compiler false positive per kernel test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d998a76fb542..66dc14cf5687 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -467,6 +467,40 @@ static struct ref_scale_ops acqrel_ops = { .name = "acqrel" }; +static volatile u64 stopopts; + +static void ref_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += ktime_get_real_fast_ns(); + preempt_enable(); + stopopts = x; +} + +static void ref_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += ktime_get_real_fast_ns(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static struct ref_scale_ops clock_ops = { + .readsection = ref_clock_section, + .delaysection = ref_clock_delay_section, + .name = "clock" +}; + static void rcu_scale_one_reader(void) { if (readdelay <= 0) @@ -759,7 +793,7 @@ ref_scale_init(void) int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; if (!torture_init_begin(scale_type, verbose)) -- cgit v1.2.3-71-gd317 From 59e836662860a28880d45b35e1fbc5afca4847ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 16 May 2021 21:17:27 -0700 Subject: rcutorture: Preempt rather than block when testing task stalls Currently, rcu_torture_stall() does a one-jiffy timed wait when stall_cpu_block is set. This works, but emits a pointless splat in CONFIG_PREEMPT=y kernels. This commit avoids this splat by instead invoking preempt_schedule() in CONFIG_PREEMPT=y kernels. This uses an admittedly ugly #ifdef, but abstracted approaches just looked worse. A prettier approach would provide a preempt_schedule() definition with a WARN_ON() for CONFIG_PREEMPT=n kernels, but this seems quite silly. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 40ef5417d954..ab4215266ebe 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2022,8 +2022,13 @@ static int rcu_torture_stall(void *args) __func__, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - if (stall_cpu_block) + if (stall_cpu_block) { +#ifdef CONFIG_PREEMPTION + preempt_schedule(); +#else schedule_timeout_uninterruptible(HZ); +#endif + } if (stall_cpu_irqsoff) local_irq_enable(); else if (!stall_cpu_block) -- cgit v1.2.3-71-gd317 From 811192c5f24bfd7246ce9ce06f668d8c408bf39b Mon Sep 17 00:00:00 2001 From: "Jiangong.Han" Date: Tue, 22 Jun 2021 18:37:08 +0800 Subject: rcuscale: Console output claims too few grace periods The rcuscale console output claims N grace periods, numbered from zero to N, which means that there were really N+1 grace periods. The root cause of this bug is that rcu_scale_writer() stores the number of the last grace period (numbered from zero) into writer_n_durations[me] instead of the number of grace periods. This commit therefore assigns the actual number of grace periods to writer_n_durations[me], and also makes the corresponding adjustment to the loop outputting per-grace-period measurements. Sample of old console output: rcu-scale: writer 0 gps: 133 ...... rcu-scale: 0 writer-duration: 0 44003961 rcu-scale: 0 writer-duration: 1 32003582 ...... rcu-scale: 0 writer-duration: 132 28004391 rcu-scale: 0 writer-duration: 133 27996410 Sample of new console output: rcu-scale: writer 0 gps: 134 ...... rcu-scale: 0 writer-duration: 0 44003961 rcu-scale: 0 writer-duration: 1 32003582 ...... rcu-scale: 0 writer-duration: 132 28004391 rcu-scale: 0 writer-duration: 133 27996410 Signed-off-by: Jiangong.Han Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index dca51fe9c73f..2cc34a22a506 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -487,7 +487,7 @@ retry: if (gp_async) { cur_ops->gp_barrier(); } - writer_n_durations[me] = i_max; + writer_n_durations[me] = i_max + 1; torture_kthread_stopping("rcu_scale_writer"); return 0; } @@ -561,7 +561,7 @@ rcu_scale_cleanup(void) wdpp = writer_durations[i]; if (!wdpp) continue; - for (j = 0; j <= writer_n_durations[i]; j++) { + for (j = 0; j < writer_n_durations[i]; j++) { wdp = &wdpp[j]; pr_alert("%s%s %4d writer-duration: %5d %llu\n", scale_type, SCALE_FLAG, -- cgit v1.2.3-71-gd317 From 5b237d650eb8b0870b5d816fecc0be00237cbfff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Jun 2021 15:51:48 -0700 Subject: locktorture: Mark statistics data races The lock_stress_stats structure's ->n_lock_fail and ->n_lock_acquired fields are incremented and sampled locklessly using plain C-language statements, which KCSAN objects to. This commit therefore marks the statistics gathering with data_race() to flag the intent. While in the area, this commit also reduces the number of accesses to the ->n_lock_acquired field, thus eliminating some possible check/use confusion. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index b3adb40549bf..313d5e613fbe 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -738,20 +738,22 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { + long cur; bool fail = false; int i, n_stress; - long max = 0, min = statp ? statp[0].n_lock_acquired : 0; + long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; for (i = 0; i < n_stress; i++) { - if (statp[i].n_lock_fail) + if (data_race(statp[i].n_lock_fail)) fail = true; - sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_acquired) - max = statp[i].n_lock_acquired; - if (min > statp[i].n_lock_acquired) - min = statp[i].n_lock_acquired; + cur = data_race(statp[i].n_lock_acquired); + sum += cur; + if (max < cur) + max = cur; + if (min > cur) + min = cur; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", -- cgit v1.2.3-71-gd317 From af5f6e27d52cdb2cb3826df19a69a74e9d5eff5e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Jun 2021 16:04:11 -0700 Subject: locktorture: Count lock readers Currently, the lock_is_read_held variable is bool, so that a reader sets it to true just after lock acquisition and then to false just before lock release. This works in a rough statistical sense, but can result in false negatives just after one of a pair of concurrent readers has released the lock. This approach does have low overhead, but at the expense of the setting to true potentially never leaving the reader's store buffer, thus resulting in an unconditional false negative. This commit therefore converts this variable to atomic_t and makes the reader use atomic_inc() just after acquisition and atomic_dec() just before release. This does increase overhead, but this increase is negligible compared to the 10-microsecond lock hold time. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 313d5e613fbe..7c5a4a087cc7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -59,7 +59,7 @@ static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; static bool lock_is_write_held; -static bool lock_is_read_held; +static atomic_t lock_is_read_held; static unsigned long last_lock_release; struct lock_stress_stats { @@ -682,7 +682,7 @@ static int lock_torture_writer(void *arg) if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = true; - if (WARN_ON_ONCE(lock_is_read_held)) + if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; @@ -717,13 +717,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(tid); - lock_is_read_held = true; + atomic_inc(&lock_is_read_held); if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = false; + atomic_dec(&lock_is_read_held); cxt.cur_ops->readunlock(tid); stutter_wait("lock_torture_reader"); @@ -998,7 +998,6 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); -- cgit v1.2.3-71-gd317 From 9b9a80677fd80bd531cb05bfe205a40a51955939 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Jun 2021 17:53:43 -0700 Subject: scftorture: Add RPC-like IPI tests This commit adds the single_weight_rpc module parameter, which causes the IPI handler to awaken the IPI sender. In many scheduler configurations, this will result in an IPI back to the sender that is likely to be received at a time when the sender CPU is idle. The intent is to stress IPI reception during CPU busy-to-idle transitions. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 76 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 29e8fc5d91a7..5cf40e438319 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -64,6 +64,7 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations."); torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_rpc, -1, "Testing weight for single-CPU RPC operations."); torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); @@ -86,6 +87,8 @@ struct scf_statistics { long long n_resched; long long n_single; long long n_single_ofl; + long long n_single_rpc; + long long n_single_rpc_ofl; long long n_single_wait; long long n_single_wait_ofl; long long n_many; @@ -101,14 +104,17 @@ static DEFINE_PER_CPU(long long, scf_invoked_count); // Data for random primitive selection #define SCF_PRIM_RESCHED 0 #define SCF_PRIM_SINGLE 1 -#define SCF_PRIM_MANY 2 -#define SCF_PRIM_ALL 3 -#define SCF_NPRIMS 7 // Need wait and no-wait versions of each, - // except for SCF_PRIM_RESCHED. +#define SCF_PRIM_SINGLE_RPC 2 +#define SCF_PRIM_MANY 3 +#define SCF_PRIM_ALL 4 +#define SCF_NPRIMS 8 // Need wait and no-wait versions of each, + // except for SCF_PRIM_RESCHED and + // SCF_PRIM_SINGLE_RPC. static char *scf_prim_name[] = { "resched_cpu", "smp_call_function_single", + "smp_call_function_single_rpc", "smp_call_function_many", "smp_call_function", }; @@ -128,6 +134,8 @@ struct scf_check { bool scfc_out; int scfc_cpu; // -1 for not _single(). bool scfc_wait; + bool scfc_rpc; + struct completion scfc_completion; }; // Use to wait for all threads to start. @@ -158,6 +166,7 @@ static void scf_torture_stats_print(void) scfs.n_resched += scf_stats_p[i].n_resched; scfs.n_single += scf_stats_p[i].n_single; scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_rpc += scf_stats_p[i].n_single_rpc; scfs.n_single_wait += scf_stats_p[i].n_single_wait; scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; scfs.n_many += scf_stats_p[i].n_many; @@ -168,9 +177,10 @@ static void scf_torture_stats_print(void) if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) bangstr = "!!! "; - pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ", SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched, scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_single_rpc, scfs.n_single_rpc_ofl, scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), @@ -282,10 +292,13 @@ static void scf_handler(void *scfc_in) out: if (unlikely(!scfcp)) return; - if (scfcp->scfc_wait) + if (scfcp->scfc_wait) { WRITE_ONCE(scfcp->scfc_out, true); - else + if (scfcp->scfc_rpc) + complete(&scfcp->scfc_completion); + } else { kfree(scfcp); + } } // As above, but check for correct CPU. @@ -319,6 +332,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_cpu = -1; scfcp->scfc_wait = scfsp->scfs_wait; scfcp->scfc_out = false; + scfcp->scfc_rpc = false; } } switch (scfsp->scfs_prim) { @@ -350,6 +364,34 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp = NULL; } break; + case SCF_PRIM_SINGLE_RPC: + if (!scfcp) + break; + cpu = torture_random(trsp) % nr_cpu_ids; + scfp->n_single_rpc++; + scfcp->scfc_cpu = cpu; + scfcp->scfc_wait = true; + init_completion(&scfcp->scfc_completion); + scfcp->scfc_rpc = true; + barrier(); // Prevent race-reduction compiler optimizations. + scfcp->scfc_in = true; + ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, 0); + if (!ret) { + if (use_cpus_read_lock) + cpus_read_unlock(); + else + preempt_enable(); + wait_for_completion(&scfcp->scfc_completion); + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); + } else { + scfp->n_single_rpc_ofl++; + kfree(scfcp); + scfcp = NULL; + } + break; case SCF_PRIM_MANY: if (scfsp->scfs_wait) scfp->n_many_wait++; @@ -379,10 +421,12 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } if (scfcp && scfsp->scfs_wait) { if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) && - !scfcp->scfc_out)) + !scfcp->scfc_out)) { + pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim); atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else + } else { kfree(scfcp); + } barrier(); // Prevent race-reduction compiler optimizations. } if (use_cpus_read_lock) @@ -453,8 +497,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_rpc=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_rpc, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -497,6 +541,7 @@ static int __init scf_torture_init(void) int firsterr = 0; unsigned long weight_resched1 = weight_resched; unsigned long weight_single1 = weight_single; + unsigned long weight_single_rpc1 = weight_single_rpc; unsigned long weight_single_wait1 = weight_single_wait; unsigned long weight_many1 = weight_many; unsigned long weight_many_wait1 = weight_many_wait; @@ -508,11 +553,13 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); - if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 && + if (weight_resched == -1 && + weight_single == -1 && weight_single_rpc == -1 && weight_single_wait == -1 && weight_many == -1 && weight_many_wait == -1 && weight_all == -1 && weight_all_wait == -1) { weight_resched1 = 2 * nr_cpu_ids; weight_single1 = 2 * nr_cpu_ids; + weight_single_rpc1 = 2 * nr_cpu_ids; weight_single_wait1 = 2 * nr_cpu_ids; weight_many1 = 2; weight_many_wait1 = 2; @@ -523,6 +570,8 @@ static int __init scf_torture_init(void) weight_resched1 = 0; if (weight_single == -1) weight_single1 = 0; + if (weight_single_rpc == -1) + weight_single_rpc1 = 0; if (weight_single_wait == -1) weight_single_wait1 = 0; if (weight_many == -1) @@ -534,7 +583,7 @@ static int __init scf_torture_init(void) if (weight_all_wait == -1) weight_all_wait1 = 0; } - if (weight_single1 == 0 && weight_single_wait1 == 0 && + if (weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 && weight_all1 == 0 && weight_all_wait1 == 0) { VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); @@ -546,6 +595,7 @@ static int __init scf_torture_init(void) else if (weight_resched1) VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); + scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true); scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); scf_sel_add(weight_many1, SCF_PRIM_MANY, false); scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true); -- cgit v1.2.3-71-gd317 From 586e4d4193a653eef21f02b50dee89e2e4be208c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Jul 2021 17:53:27 -0700 Subject: scftorture: Avoid NULL pointer exception on early exit When scftorture finds an error in the module parameters controlling the relative frequencies of smp_call_function*() variants, it takes an early exit. So early that it has not allocated memory to track the kthreads running the test, which results in a segfault. This commit therefore checks for the existence of the memory before attempting to stop the kthreads that would otherwise have been recorded in that non-existent memory. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5cf40e438319..64a08288b1a6 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -513,7 +513,7 @@ static void scf_torture_cleanup(void) return; WRITE_ONCE(scfdone, true); - if (nthreads) + if (nthreads && scf_stats_p) for (i = 0; i < nthreads; i++) torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); else -- cgit v1.2.3-71-gd317 From bb7262b295472eb6858b5c49893954794027cd84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:40:07 +0100 Subject: timers: Move clearing of base::timer_running under base:: Lock syzbot reported KCSAN data races vs. timer_base::timer_running being set to NULL without holding base::lock in expire_timers(). This looks innocent and most reads are clearly not problematic, but Frederic identified an issue which is: int data = 0; void timer_func(struct timer_list *t) { data = 1; } CPU 0 CPU 1 ------------------------------ -------------------------- base = lock_timer_base(timer, &flags); raw_spin_unlock(&base->lock); if (base->running_timer != timer) call_timer_fn(timer, fn, baseclk); ret = detach_if_pending(timer, base, true); base->running_timer = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); raw_spin_lock(&base->lock); x = data; If the timer has previously executed on CPU 1 and then CPU 0 can observe base->running_timer == NULL and returns, assuming the timer has completed, but it's not guaranteed on all architectures. The comment for del_timer_sync() makes that guarantee. Moving the assignment under base->lock prevents this. For non-RT kernel it's performance wise completely irrelevant whether the store happens before or after taking the lock. For an RT kernel moving the store under the lock requires an extra unlock/lock pair in the case that there is a waiter for the timer, but that's not the end of the world. Reported-by: syzbot+aa7c2385d46c5eba0b89@syzkaller.appspotmail.com Reported-by: syzbot+abea4558531bae1ba9fe@syzkaller.appspotmail.com Fixes: 030dcdd197d7 ("timers: Prepare support for PREEMPT_RT") Signed-off-by: Thomas Gleixner Tested-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/87lfea7gw8.fsf@nanos.tec.linutronix.de Cc: stable@vger.kernel.org --- kernel/time/timer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9eb11c2209e5..e3d2c23c413d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1265,8 +1265,10 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) static void timer_sync_wait_running(struct timer_base *base) { if (atomic_read(&base->timer_waiters)) { + raw_spin_unlock_irq(&base->lock); spin_unlock(&base->expiry_lock); spin_lock(&base->expiry_lock); + raw_spin_lock_irq(&base->lock); } } @@ -1457,14 +1459,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); - base->running_timer = NULL; raw_spin_lock(&base->lock); + base->running_timer = NULL; } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); + raw_spin_lock_irq(&base->lock); base->running_timer = NULL; timer_sync_wait_running(base); - raw_spin_lock_irq(&base->lock); } } } -- cgit v1.2.3-71-gd317 From 33b57e0cc78eb82f2921eb4c6d1c8fcaa733823b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 27 Jul 2021 15:23:35 -0700 Subject: bpf: Increase supported cgroup storage value size Current max cgroup storage value size is 4k (PAGE_SIZE). The other local storages accept up to 64k (BPF_LOCAL_STORAGE_MAX_VALUE_SIZE). Let's align max cgroup value size with the other storages. For percpu, the max is 32k (PCPU_MIN_UNIT_SIZE) because percpu allocator is not happy about larger values. netcnt test is extended to exercise those maximum values (non-percpu max size is close to, but not real max). v4: * remove inner union (Andrii Nakryiko) * keep net_cnt on the stack (Andrii Nakryiko) v3: * refine SIZEOF_BPF_LOCAL_STORAGE_ELEM comment (Yonghong Song) * anonymous struct in percpu_net_cnt & net_cnt (Yonghong Song) * reorder free (Yonghong Song) v2: * cap max_value_size instead of BUILD_BUG_ON (Martin KaFai Lau) Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210727222335.4029096-1-sdf@google.com --- kernel/bpf/local_storage.c | 11 ++++++- tools/testing/selftests/bpf/netcnt_common.h | 38 +++++++++++++++++++------ tools/testing/selftests/bpf/progs/netcnt_prog.c | 8 +++--- tools/testing/selftests/bpf/test_netcnt.c | 4 +-- 4 files changed, 45 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 7ed2a14dc0de..035e9e3a7132 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,6 +1,7 @@ //SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -283,9 +284,17 @@ enoent: static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { + __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE; int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu + * is the same as other local storages. + */ + if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + max_value_size = min_t(__u32, max_value_size, + PCPU_MIN_UNIT_SIZE); + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); @@ -293,7 +302,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size == 0) return ERR_PTR(-EINVAL); - if (attr->value_size > PAGE_SIZE) + if (attr->value_size > max_value_size) return ERR_PTR(-E2BIG); if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || diff --git a/tools/testing/selftests/bpf/netcnt_common.h b/tools/testing/selftests/bpf/netcnt_common.h index 81084c1c2c23..0ab1c88041cd 100644 --- a/tools/testing/selftests/bpf/netcnt_common.h +++ b/tools/testing/selftests/bpf/netcnt_common.h @@ -6,19 +6,39 @@ #define MAX_PERCPU_PACKETS 32 -struct percpu_net_cnt { - __u64 packets; - __u64 bytes; +/* sizeof(struct bpf_local_storage_elem): + * + * It really is about 128 bytes on x86_64, but allocate more to account for + * possible layout changes, different architectures, etc. + * The kernel will wrap up to PAGE_SIZE internally anyway. + */ +#define SIZEOF_BPF_LOCAL_STORAGE_ELEM 256 - __u64 prev_ts; +/* Try to estimate kernel's BPF_LOCAL_STORAGE_MAX_VALUE_SIZE: */ +#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE (0xFFFF - \ + SIZEOF_BPF_LOCAL_STORAGE_ELEM) - __u64 prev_packets; - __u64 prev_bytes; +#define PCPU_MIN_UNIT_SIZE 32768 + +union percpu_net_cnt { + struct { + __u64 packets; + __u64 bytes; + + __u64 prev_ts; + + __u64 prev_packets; + __u64 prev_bytes; + }; + __u8 data[PCPU_MIN_UNIT_SIZE]; }; -struct net_cnt { - __u64 packets; - __u64 bytes; +union net_cnt { + struct { + __u64 packets; + __u64 bytes; + }; + __u8 data[BPF_LOCAL_STORAGE_MAX_VALUE_SIZE]; }; #endif diff --git a/tools/testing/selftests/bpf/progs/netcnt_prog.c b/tools/testing/selftests/bpf/progs/netcnt_prog.c index d071adf178bd..43649bce4c54 100644 --- a/tools/testing/selftests/bpf/progs/netcnt_prog.c +++ b/tools/testing/selftests/bpf/progs/netcnt_prog.c @@ -13,21 +13,21 @@ struct { __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); __type(key, struct bpf_cgroup_storage_key); - __type(value, struct percpu_net_cnt); + __type(value, union percpu_net_cnt); } percpu_netcnt SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); __type(key, struct bpf_cgroup_storage_key); - __type(value, struct net_cnt); + __type(value, union net_cnt); } netcnt SEC(".maps"); SEC("cgroup/skb") int bpf_nextcnt(struct __sk_buff *skb) { - struct percpu_net_cnt *percpu_cnt; + union percpu_net_cnt *percpu_cnt; char fmt[] = "%d %llu %llu\n"; - struct net_cnt *cnt; + union net_cnt *cnt; __u64 ts, dt; int ret; diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c index a7b9a69f4fd5..4990a99e7381 100644 --- a/tools/testing/selftests/bpf/test_netcnt.c +++ b/tools/testing/selftests/bpf/test_netcnt.c @@ -33,14 +33,14 @@ static int bpf_find_map(const char *test, struct bpf_object *obj, int main(int argc, char **argv) { - struct percpu_net_cnt *percpu_netcnt; + union percpu_net_cnt *percpu_netcnt; struct bpf_cgroup_storage_key key; int map_fd, percpu_map_fd; int error = EXIT_FAILURE; - struct net_cnt netcnt; struct bpf_object *obj; int prog_fd, cgroup_fd; unsigned long packets; + union net_cnt netcnt; unsigned long bytes; int cpu, nproc; __u32 prog_cnt; -- cgit v1.2.3-71-gd317 From c3df5fb57fe8756d67fd56ed29da65cdfde839f9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jul 2021 13:12:20 -1000 Subject: cgroup: rstat: fix A-A deadlock on 32bit around u64_stats_sync 0fa294fb1985 ("cgroup: Replace cgroup_rstat_mutex with a spinlock") added cgroup_rstat_flush_irqsafe() allowing flushing to happen from the irq context. However, rstat paths use u64_stats_sync to synchronize access to 64bit stat counters on 32bit machines. u64_stats_sync is implemented using seq_lock and trying to read from an irq context can lead to A-A deadlock if the irq happens to interrupt the stat update. Fix it by using the irqsafe variants - u64_stats_update_begin_irqsave() and u64_stats_update_end_irqrestore() - in the update paths. Note that none of this matters on 64bit machines. All these are just for 32bit SMP setups. Note that the interface was introduced way back, its first and currently only use was recently added by 2d146aa3aa84 ("mm: memcontrol: switch to rstat"). Stable tagging targets this commit. Signed-off-by: Tejun Heo Reported-by: Rik van Riel Fixes: 2d146aa3aa84 ("mm: memcontrol: switch to rstat") Cc: stable@vger.kernel.org # v5.13+ --- block/blk-cgroup.c | 14 ++++++++------ kernel/cgroup/rstat.c | 19 +++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 575d7a2e7203..31fe9be179d9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -790,6 +790,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct blkcg_gq *parent = blkg->parent; struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); struct blkg_iostat cur, delta; + unsigned long flags; unsigned int seq; /* fetch the current per-cpu values */ @@ -799,21 +800,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) } while (u64_stats_fetch_retry(&bisc->sync, seq)); /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, &cur); blkg_iostat_sub(&delta, &bisc->last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { - u64_stats_update_begin(&parent->iostat.sync); + flags = u64_stats_update_begin_irqsave(&parent->iostat.sync); blkg_iostat_set(&delta, &blkg->iostat.cur); blkg_iostat_sub(&delta, &blkg->iostat.last); blkg_iostat_add(&parent->iostat.cur, &delta); blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); + u64_stats_update_end_irqrestore(&parent->iostat.sync, flags); } } @@ -848,6 +849,7 @@ static void blkcg_fill_root_iostats(void) memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; + unsigned long flags; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += @@ -864,9 +866,9 @@ static void blkcg_fill_root_iostats(void) tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; - u64_stats_update_begin(&blkg->iostat.sync); + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); - u64_stats_update_end(&blkg->iostat.sync); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } } } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 7f0e58917432..b264ab5652ba 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -347,19 +347,20 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) } static struct cgroup_rstat_cpu * -cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) +cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { struct cgroup_rstat_cpu *rstatc; rstatc = get_cpu_ptr(cgrp->rstat_cpu); - u64_stats_update_begin(&rstatc->bsync); + *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); return rstatc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, - struct cgroup_rstat_cpu *rstatc) + struct cgroup_rstat_cpu *rstatc, + unsigned long flags) { - u64_stats_update_end(&rstatc->bsync); + u64_stats_update_end_irqrestore(&rstatc->bsync, flags); cgroup_rstat_updated(cgrp, smp_processor_id()); put_cpu_ptr(rstatc); } @@ -367,18 +368,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; + unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); rstatc->bstat.cputime.sum_exec_runtime += delta_exec; - cgroup_base_stat_cputime_account_end(cgrp, rstatc); + cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; + unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_USER: @@ -394,7 +397,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, break; } - cgroup_base_stat_cputime_account_end(cgrp, rstatc); + cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } /* -- cgit v1.2.3-71-gd317 From 345daff2e994ee844d6a609c37f085695fbb4c4d Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Tue, 27 Jul 2021 17:24:18 +0200 Subject: ucounts: Fix race condition between alloc_ucounts and put_ucounts The race happens because put_ucounts() doesn't use spinlock and get_ucounts is not under spinlock: CPU0 CPU1 ---- ---- alloc_ucounts() put_ucounts() spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); atomic_dec_and_test(&ucounts->count)) spin_unlock_irq(&ucounts_lock); spin_lock_irqsave(&ucounts_lock, flags); hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); kfree(ucounts); ucounts = get_ucounts(ucounts); ================================================================== BUG: KASAN: use-after-free in instrument_atomic_read_write include/linux/instrumented.h:101 [inline] BUG: KASAN: use-after-free in atomic_add_negative include/asm-generic/atomic-instrumented.h:556 [inline] BUG: KASAN: use-after-free in get_ucounts kernel/ucount.c:152 [inline] BUG: KASAN: use-after-free in get_ucounts kernel/ucount.c:150 [inline] BUG: KASAN: use-after-free in alloc_ucounts+0x19b/0x5b0 kernel/ucount.c:188 Write of size 4 at addr ffff88802821e41c by task syz-executor.4/16785 CPU: 1 PID: 16785 Comm: syz-executor.4 Not tainted 5.14.0-rc1-next-20210712-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:105 print_address_description.constprop.0.cold+0x6c/0x309 mm/kasan/report.c:233 __kasan_report mm/kasan/report.c:419 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:436 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x13d/0x180 mm/kasan/generic.c:189 instrument_atomic_read_write include/linux/instrumented.h:101 [inline] atomic_add_negative include/asm-generic/atomic-instrumented.h:556 [inline] get_ucounts kernel/ucount.c:152 [inline] get_ucounts kernel/ucount.c:150 [inline] alloc_ucounts+0x19b/0x5b0 kernel/ucount.c:188 set_cred_ucounts+0x171/0x3a0 kernel/cred.c:684 __sys_setuid+0x285/0x400 kernel/sys.c:623 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x4665d9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fde54097188 EFLAGS: 00000246 ORIG_RAX: 0000000000000069 RAX: ffffffffffffffda RBX: 000000000056bf80 RCX: 00000000004665d9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000000ff RBP: 00000000004bfcb9 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000056bf80 R13: 00007ffc8655740f R14: 00007fde54097300 R15: 0000000000022000 Allocated by task 16784: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:434 [inline] ____kasan_kmalloc mm/kasan/common.c:513 [inline] ____kasan_kmalloc mm/kasan/common.c:472 [inline] __kasan_kmalloc+0x9b/0xd0 mm/kasan/common.c:522 kmalloc include/linux/slab.h:591 [inline] kzalloc include/linux/slab.h:721 [inline] alloc_ucounts+0x23d/0x5b0 kernel/ucount.c:169 set_cred_ucounts+0x171/0x3a0 kernel/cred.c:684 __sys_setuid+0x285/0x400 kernel/sys.c:623 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 16785: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_set_track+0x1c/0x30 mm/kasan/common.c:46 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360 ____kasan_slab_free mm/kasan/common.c:366 [inline] ____kasan_slab_free mm/kasan/common.c:328 [inline] __kasan_slab_free+0xfb/0x130 mm/kasan/common.c:374 kasan_slab_free include/linux/kasan.h:229 [inline] slab_free_hook mm/slub.c:1650 [inline] slab_free_freelist_hook+0xdf/0x240 mm/slub.c:1675 slab_free mm/slub.c:3235 [inline] kfree+0xeb/0x650 mm/slub.c:4295 put_ucounts kernel/ucount.c:200 [inline] put_ucounts+0x117/0x150 kernel/ucount.c:192 put_cred_rcu+0x27a/0x520 kernel/cred.c:124 rcu_do_batch kernel/rcu/tree.c:2550 [inline] rcu_core+0x7ab/0x1380 kernel/rcu/tree.c:2785 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Last potentially related work creation: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_record_aux_stack+0xe5/0x110 mm/kasan/generic.c:348 insert_work+0x48/0x370 kernel/workqueue.c:1332 __queue_work+0x5c1/0xed0 kernel/workqueue.c:1498 queue_work_on+0xee/0x110 kernel/workqueue.c:1525 queue_work include/linux/workqueue.h:507 [inline] call_usermodehelper_exec+0x1f0/0x4c0 kernel/umh.c:435 kobject_uevent_env+0xf8f/0x1650 lib/kobject_uevent.c:618 netdev_queue_add_kobject net/core/net-sysfs.c:1621 [inline] netdev_queue_update_kobjects+0x374/0x450 net/core/net-sysfs.c:1655 register_queue_kobjects net/core/net-sysfs.c:1716 [inline] netdev_register_kobject+0x35a/0x430 net/core/net-sysfs.c:1959 register_netdevice+0xd33/0x1500 net/core/dev.c:10331 nsim_init_netdevsim drivers/net/netdevsim/netdev.c:317 [inline] nsim_create+0x381/0x4d0 drivers/net/netdevsim/netdev.c:364 __nsim_dev_port_add+0x32e/0x830 drivers/net/netdevsim/dev.c:1295 nsim_dev_port_add_all+0x53/0x150 drivers/net/netdevsim/dev.c:1355 nsim_dev_probe+0xcb5/0x1190 drivers/net/netdevsim/dev.c:1496 call_driver_probe drivers/base/dd.c:517 [inline] really_probe+0x23c/0xcd0 drivers/base/dd.c:595 __driver_probe_device+0x338/0x4d0 drivers/base/dd.c:747 driver_probe_device+0x4c/0x1a0 drivers/base/dd.c:777 __device_attach_driver+0x20b/0x2f0 drivers/base/dd.c:894 bus_for_each_drv+0x15f/0x1e0 drivers/base/bus.c:427 __device_attach+0x228/0x4a0 drivers/base/dd.c:965 bus_probe_device+0x1e4/0x290 drivers/base/bus.c:487 device_add+0xc2f/0x2180 drivers/base/core.c:3356 nsim_bus_dev_new drivers/net/netdevsim/bus.c:431 [inline] new_device_store+0x436/0x710 drivers/net/netdevsim/bus.c:298 bus_attr_store+0x72/0xa0 drivers/base/bus.c:122 sysfs_kf_write+0x110/0x160 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x342/0x500 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2152 [inline] new_sync_write+0x426/0x650 fs/read_write.c:518 vfs_write+0x75a/0xa40 fs/read_write.c:605 ksys_write+0x12d/0x250 fs/read_write.c:658 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Second to last potentially related work creation: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_record_aux_stack+0xe5/0x110 mm/kasan/generic.c:348 insert_work+0x48/0x370 kernel/workqueue.c:1332 __queue_work+0x5c1/0xed0 kernel/workqueue.c:1498 queue_work_on+0xee/0x110 kernel/workqueue.c:1525 queue_work include/linux/workqueue.h:507 [inline] call_usermodehelper_exec+0x1f0/0x4c0 kernel/umh.c:435 kobject_uevent_env+0xf8f/0x1650 lib/kobject_uevent.c:618 kobject_synth_uevent+0x701/0x850 lib/kobject_uevent.c:208 uevent_store+0x20/0x50 drivers/base/core.c:2371 dev_attr_store+0x50/0x80 drivers/base/core.c:2072 sysfs_kf_write+0x110/0x160 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x342/0x500 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2152 [inline] new_sync_write+0x426/0x650 fs/read_write.c:518 vfs_write+0x75a/0xa40 fs/read_write.c:605 ksys_write+0x12d/0x250 fs/read_write.c:658 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88802821e400 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 28 bytes inside of 192-byte region [ffff88802821e400, ffff88802821e4c0) The buggy address belongs to the page: page:ffffea0000a08780 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x2821e flags: 0xfff00000000200(slab|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000000200 dead000000000100 dead000000000122 ffff888010841a00 raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x12cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY), pid 1, ts 12874702440, free_ts 12637793385 prep_new_page mm/page_alloc.c:2433 [inline] get_page_from_freelist+0xa72/0x2f80 mm/page_alloc.c:4166 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5374 alloc_page_interleave+0x1e/0x200 mm/mempolicy.c:2119 alloc_pages+0x238/0x2a0 mm/mempolicy.c:2242 alloc_slab_page mm/slub.c:1713 [inline] allocate_slab+0x32b/0x4c0 mm/slub.c:1853 new_slab mm/slub.c:1916 [inline] new_slab_objects mm/slub.c:2662 [inline] ___slab_alloc+0x4ba/0x820 mm/slub.c:2825 __slab_alloc.constprop.0+0xa7/0xf0 mm/slub.c:2865 slab_alloc_node mm/slub.c:2947 [inline] slab_alloc mm/slub.c:2989 [inline] __kmalloc+0x312/0x330 mm/slub.c:4133 kmalloc include/linux/slab.h:596 [inline] kzalloc include/linux/slab.h:721 [inline] __register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1318 rds_tcp_init_net+0x1db/0x4f0 net/rds/tcp.c:551 ops_init+0xaf/0x470 net/core/net_namespace.c:140 __register_pernet_operations net/core/net_namespace.c:1137 [inline] register_pernet_operations+0x35a/0x850 net/core/net_namespace.c:1214 register_pernet_device+0x26/0x70 net/core/net_namespace.c:1301 rds_tcp_init+0x77/0xe0 net/rds/tcp.c:717 do_one_initcall+0x103/0x650 init/main.c:1285 do_initcall_level init/main.c:1360 [inline] do_initcalls init/main.c:1376 [inline] do_basic_setup init/main.c:1396 [inline] kernel_init_freeable+0x6b8/0x741 init/main.c:1598 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1343 [inline] free_pcp_prepare+0x312/0x7d0 mm/page_alloc.c:1394 free_unref_page_prepare mm/page_alloc.c:3329 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3408 __vunmap+0x783/0xb70 mm/vmalloc.c:2587 free_work+0x58/0x70 mm/vmalloc.c:82 process_one_work+0x98d/0x1630 kernel/workqueue.c:2276 worker_thread+0x658/0x11f0 kernel/workqueue.c:2422 kthread+0x3e5/0x4d0 kernel/kthread.c:319 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Memory state around the buggy address: ffff88802821e300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff88802821e380: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc >ffff88802821e400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88802821e480: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff88802821e500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== - The race fix has two parts. * Changing the code to guarantee that ucounts->count is only decremented when ucounts_lock is held. This guarantees that find_ucounts will never find a structure with a zero reference count. * Changing alloc_ucounts to increment ucounts->count while ucounts_lock is held. This guarantees the reference count on the found data structure will not be decremented to zero (and the data structure freed) before the reference count is incremented. -- Eric Biederman Reported-by: syzbot+01985d7909f9468f013c@syzkaller.appspotmail.com Reported-by: syzbot+59dd63761094a80ad06d@syzkaller.appspotmail.com Reported-by: syzbot+6cd79f45bb8fa1c9eeae@syzkaller.appspotmail.com Reported-by: syzbot+b6e65bd125a05f803d6b@syzkaller.appspotmail.com Fixes: b6c336528926 ("Use atomic_t for ucounts reference counting") Cc: Hillf Danton Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/7b2ace1759b281cdd2d66101d6b305deef722efb.1627397820.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 87799e2379bd..77be3bbe3cc4 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -160,6 +160,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; + long overflow; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -184,8 +185,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) return new; } } + overflow = atomic_add_negative(1, &ucounts->count); spin_unlock_irq(&ucounts_lock); - ucounts = get_ucounts(ucounts); + if (overflow) { + put_ucounts(ucounts); + return NULL; + } return ucounts; } @@ -193,8 +198,7 @@ void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - if (atomic_dec_and_test(&ucounts->count)) { - spin_lock_irqsave(&ucounts_lock, flags); + if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); kfree(ucounts); -- cgit v1.2.3-71-gd317 From f5e81d1117501546b7be050c5fbafa6efd2c722c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4 In case of JITs, each of the JIT backends compiles the BPF nospec instruction /either/ to a machine instruction which emits a speculation barrier /or/ to /no/ machine instruction in case the underlying architecture is not affected by Speculative Store Bypass or has different mitigations in place already. This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' instruction for mitigation. In case of arm64, we rely on the firmware mitigation as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled, it works for all of the kernel code with no need to provide any additional instructions here (hence only comment in arm64 JIT). Other archs can follow as needed. The BPF nospec instruction is specifically targeting Spectre v4 since i) we don't use a serialization barrier for the Spectre v1 case, and ii) mitigation instructions for v1 and v4 might be different on some archs. The BPF nospec is required for a future commit, where the BPF verifier does annotate intermediate BPF programs with speculation barriers. Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- arch/arm/net/bpf_jit_32.c | 3 +++ arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++ arch/mips/net/ebpf_jit.c | 3 +++ arch/powerpc/net/bpf_jit_comp32.c | 6 ++++++ arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++ arch/riscv/net/bpf_jit_comp32.c | 4 ++++ arch/riscv/net/bpf_jit_comp64.c | 4 ++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/sparc/net/bpf_jit_comp_64.c | 3 +++ arch/x86/net/bpf_jit_comp.c | 7 +++++++ arch/x86/net/bpf_jit_comp32.c | 6 ++++++ include/linux/filter.h | 15 +++++++++++++++ kernel/bpf/core.c | 19 ++++++++++++++++++- kernel/bpf/disasm.c | 16 +++++++++------- 14 files changed, 102 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 897634d0a67c..a951276f0547 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1602,6 +1602,9 @@ exit: rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index dccf98a37283..41c23f474ea6 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -823,6 +823,19 @@ emit_cond_jmp: return ret; break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + /* + * Nothing required here. + * + * In case of arm64, we rely on the firmware mitigation of + * Speculative Store Bypass as controlled via the ssbd kernel + * parameter. Whenever the mitigation is enabled, it works + * for all of the kernel code with no need to provide any + * additional instructions. + */ + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 939dd06764bc..3a73e9375712 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1355,6 +1355,9 @@ jeq_common: } break; + case BPF_ST | BPF_NOSPEC: /* speculation barrier */ + break; + case BPF_ST | BPF_B | BPF_MEM: case BPF_ST | BPF_H | BPF_MEM: case BPF_ST | BPF_W | BPF_MEM: diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 34bb1583fc0c..beb12cbc8c29 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -737,6 +737,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index de8595880fee..b87a63dba9c8 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -627,6 +627,12 @@ emit_clear: } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 81de865f4c7c..e6497424cbf6 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -1251,6 +1251,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return -1; break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_W: diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 87e3bf5b9086..3af4131c22c7 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -939,6 +939,10 @@ out_be: emit_ld(rd, 0, RV_REG_T1, ctx); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: emit_imm(RV_REG_T1, imm, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 2ae419f5115a..88419263a89a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1153,6 +1153,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, break; } break; + /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; /* * BPF_ST(X) */ diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 4b8d3c65d266..9a2f20cbd48b 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1287,6 +1287,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) return 1; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4b951458c9fc..16d76f814e9b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1219,6 +1219,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; + /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: if (is_ereg(dst_reg)) diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 3da88ded6ee3..3bfda5f502cb 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1886,6 +1886,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, i++; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: diff --git a/include/linux/filter.h b/include/linux/filter.h index 472f97074da0..83b896044e79 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -73,6 +73,11 @@ struct ctl_table_header; /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 +/* unused opcode to mark speculation barrier for mitigating + * Speculative Store Bypass + */ +#define BPF_NOSPEC 0xc0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -390,6 +395,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = 0, \ .imm = 0 }) +/* Speculation barrier */ + +#define BPF_ST_NOSPEC() \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_NOSPEC, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9b1577498373..b1a5fc04492b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -32,6 +32,8 @@ #include #include #include + +#include #include /* Registers */ @@ -1377,6 +1379,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, @@ -1621,7 +1624,21 @@ out: COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP - /* STX and ST and LDX*/ + /* ST, STX and LDX*/ + ST_NOSPEC: + /* Speculation barrier for mitigating Speculative Store Bypass. + * In case of arm64, we rely on the firmware mitigation as + * controlled via the ssbd kernel parameter. Whenever the + * mitigation is enabled, it works for all of the kernel code + * with no need to provide any additional instructions here. + * In case of x86, we use 'lfence' insn for mitigation. We + * reuse preexisting logic from Spectre v1 mitigation that + * happens to produce the required code on x86 for v4 as well. + */ +#ifdef CONFIG_X86 + barrier_nospec(); +#endif + CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bbfc6bb79240..ca3cd9aaa6ce 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_%02x\n", insn->code); } } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { + if (BPF_MODE(insn->code) == BPF_MEM) { + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { + verbose(cbs->private_data, "(%02x) nospec\n", insn->code); + } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); - return; } - verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); -- cgit v1.2.3-71-gd317 From 2039f26f3aca5b0e419b98f65dd36481337b86ee Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Fix leakage due to insufficient speculative store bypass mitigation Spectre v4 gadgets make use of memory disambiguation, which is a set of techniques that execute memory access instructions, that is, loads and stores, out of program order; Intel's optimization manual, section 2.4.4.5: A load instruction micro-op may depend on a preceding store. Many microarchitectures block loads until all preceding store addresses are known. The memory disambiguator predicts which loads will not depend on any previous stores. When the disambiguator predicts that a load does not have such a dependency, the load takes its data from the L1 data cache. Eventually, the prediction is verified. If an actual conflict is detected, the load and all succeeding instructions are re-executed. af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate this attack by sanitizing the memory locations through preemptive "fast" (low latency) stores of zero prior to the actual "slow" (high latency) store of a pointer value such that upon dependency misprediction the CPU then speculatively executes the load of the pointer value and retrieves the zero value instead of the attacker controlled scalar value previously stored at that location, meaning, subsequent access in the speculative domain is then redirected to the "zero page". The sanitized preemptive store of zero prior to the actual "slow" store is done through a simple ST instruction based on r10 (frame pointer) with relative offset to the stack location that the verifier has been tracking on the original used register for STX, which does not have to be r10. Thus, there are no memory dependencies for this store, since it's only using r10 and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency operation. However, a recent attack demonstrated that this mitigation is not sufficient since the preemptive store of zero could also be turned into a "slow" store and is thus bypassed as well: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value 31: (7b) *(u64 *)(r10 -16) = r2 // r9 will remain "fast" register, r10 will become "slow" register below 32: (bf) r9 = r10 // JIT maps BPF reg to x86 reg: // r9 -> r15 (callee saved) // r10 -> rbp // train store forward prediction to break dependency link between both r9 // and r10 by evicting them from the predictor's LRU table. 33: (61) r0 = *(u32 *)(r7 +24576) 34: (63) *(u32 *)(r7 +29696) = r0 35: (61) r0 = *(u32 *)(r7 +24580) 36: (63) *(u32 *)(r7 +29700) = r0 37: (61) r0 = *(u32 *)(r7 +24584) 38: (63) *(u32 *)(r7 +29704) = r0 39: (61) r0 = *(u32 *)(r7 +24588) 40: (63) *(u32 *)(r7 +29708) = r0 [...] 543: (61) r0 = *(u32 *)(r7 +25596) 544: (63) *(u32 *)(r7 +30716) = r0 // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain // in hardware registers. rbp becomes slow due to push/pop latency. below is // disasm of bpf_ringbuf_output() helper for better visual context: // // ffffffff8117ee20: 41 54 push r12 // ffffffff8117ee22: 55 push rbp // ffffffff8117ee23: 53 push rbx // ffffffff8117ee24: 48 f7 c1 fc ff ff ff test rcx,0xfffffffffffffffc // ffffffff8117ee2b: 0f 85 af 00 00 00 jne ffffffff8117eee0 <-- jump taken // [...] // ffffffff8117eee0: 49 c7 c4 ea ff ff ff mov r12,0xffffffffffffffea // ffffffff8117eee7: 5b pop rbx // ffffffff8117eee8: 5d pop rbp // ffffffff8117eee9: 4c 89 e0 mov rax,r12 // ffffffff8117eeec: 41 5c pop r12 // ffffffff8117eeee: c3 ret 545: (18) r1 = map[id:4] 547: (bf) r2 = r7 548: (b7) r3 = 0 549: (b7) r4 = 4 550: (85) call bpf_ringbuf_output#194288 // instruction 551 inserted by verifier \ 551: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here // storing map value pointer r7 at fp-16 | since value of r10 is "slow". 552: (7b) *(u64 *)(r10 -16) = r7 / // following "fast" read to the same memory location, but due to dependency // misprediction it will speculatively execute before insn 551/552 completes. 553: (79) r2 = *(u64 *)(r9 -16) // in speculative domain contains attacker controlled r2. in non-speculative // domain this contains r7, and thus accesses r7 +0 below. 554: (71) r3 = *(u8 *)(r2 +0) // leak r3 As can be seen, the current speculative store bypass mitigation which the verifier inserts at line 551 is insufficient since /both/, the write of the zero sanitation as well as the map value pointer are a high latency instruction due to prior memory access via push/pop of r10 (rbp) in contrast to the low latency read in line 553 as r9 (r15) which stays in hardware registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally, fp-16 can still be r2. Initial thoughts to address this issue was to track spilled pointer loads from stack and enforce their load via LDX through r10 as well so that /both/ the preemptive store of zero /as well as/ the load use the /same/ register such that a dependency is created between the store and load. However, this option is not sufficient either since it can be bypassed as well under speculation. An updated attack with pointer spill/fills now _all_ based on r10 would look as follows: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value [...] // longer store forward prediction training sequence than before. 2062: (61) r0 = *(u32 *)(r7 +25588) 2063: (63) *(u32 *)(r7 +30708) = r0 2064: (61) r0 = *(u32 *)(r7 +25592) 2065: (63) *(u32 *)(r7 +30712) = r0 2066: (61) r0 = *(u32 *)(r7 +25596) 2067: (63) *(u32 *)(r7 +30716) = r0 // store the speculative load address (scalar) this time after the store // forward prediction training. 2068: (7b) *(u64 *)(r10 -16) = r2 // preoccupy the CPU store port by running sequence of dummy stores. 2069: (63) *(u32 *)(r7 +29696) = r0 2070: (63) *(u32 *)(r7 +29700) = r0 2071: (63) *(u32 *)(r7 +29704) = r0 2072: (63) *(u32 *)(r7 +29708) = r0 2073: (63) *(u32 *)(r7 +29712) = r0 2074: (63) *(u32 *)(r7 +29716) = r0 2075: (63) *(u32 *)(r7 +29720) = r0 2076: (63) *(u32 *)(r7 +29724) = r0 2077: (63) *(u32 *)(r7 +29728) = r0 2078: (63) *(u32 *)(r7 +29732) = r0 2079: (63) *(u32 *)(r7 +29736) = r0 2080: (63) *(u32 *)(r7 +29740) = r0 2081: (63) *(u32 *)(r7 +29744) = r0 2082: (63) *(u32 *)(r7 +29748) = r0 2083: (63) *(u32 *)(r7 +29752) = r0 2084: (63) *(u32 *)(r7 +29756) = r0 2085: (63) *(u32 *)(r7 +29760) = r0 2086: (63) *(u32 *)(r7 +29764) = r0 2087: (63) *(u32 *)(r7 +29768) = r0 2088: (63) *(u32 *)(r7 +29772) = r0 2089: (63) *(u32 *)(r7 +29776) = r0 2090: (63) *(u32 *)(r7 +29780) = r0 2091: (63) *(u32 *)(r7 +29784) = r0 2092: (63) *(u32 *)(r7 +29788) = r0 2093: (63) *(u32 *)(r7 +29792) = r0 2094: (63) *(u32 *)(r7 +29796) = r0 2095: (63) *(u32 *)(r7 +29800) = r0 2096: (63) *(u32 *)(r7 +29804) = r0 2097: (63) *(u32 *)(r7 +29808) = r0 2098: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; same as before, also including the // sanitation store with 0 from the current mitigation by the verifier. 2099: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here 2100: (7b) *(u64 *)(r10 -16) = r7 | since store unit is still busy. // load from stack intended to bypass stores. 2101: (79) r2 = *(u64 *)(r10 -16) 2102: (71) r3 = *(u8 *)(r2 +0) // leak r3 [...] Looking at the CPU microarchitecture, the scheduler might issue loads (such as seen in line 2101) before stores (line 2099,2100) because the load execution units become available while the store execution unit is still busy with the sequence of dummy stores (line 2069-2098). And so the load may use the prior stored scalar from r2 at address r10 -16 for speculation. The updated attack may work less reliable on CPU microarchitectures where loads and stores share execution resources. This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf: Prevent memory disambiguation attack") is insufficient. Moreover, the detection of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been written to a given stack slot where a pointer value is now to be stored does not have sufficient coverage as precondition for the mitigation either; for several reasons outlined as follows: 1) Stack content from prior program runs could still be preserved and is therefore not "random", best example is to split a speculative store bypass attack between tail calls, program A would prepare and store the oob address at a given stack slot and then tail call into program B which does the "slow" store of a pointer to the stack with subsequent "fast" read. From program B PoV such stack slot type is STACK_INVALID, and therefore also must be subject to mitigation. 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr) condition, for example, the previous content of that memory location could also be a pointer to map or map value. Without the fix, a speculative store bypass is not mitigated in such precondition and can then lead to a type confusion in the speculative domain leaking kernel memory near these pointer types. While brainstorming on various alternative mitigation possibilities, we also stumbled upon a retrospective from Chrome developers [0]: [...] For variant 4, we implemented a mitigation to zero the unused memory of the heap prior to allocation, which cost about 1% when done concurrently and 4% for scavenging. Variant 4 defeats everything we could think of. We explored more mitigations for variant 4 but the threat proved to be more pervasive and dangerous than we anticipated. For example, stack slots used by the register allocator in the optimizing compiler could be subject to type confusion, leading to pointer crafting. Mitigating type confusion for stack slots alone would have required a complete redesign of the backend of the optimizing compiler, perhaps man years of work, without a guarantee of completeness. [...] From BPF side, the problem space is reduced, however, options are rather limited. One idea that has been explored was to xor-obfuscate pointer spills to the BPF stack: [...] // preoccupy the CPU store port by running sequence of dummy stores. [...] 2106: (63) *(u32 *)(r7 +29796) = r0 2107: (63) *(u32 *)(r7 +29800) = r0 2108: (63) *(u32 *)(r7 +29804) = r0 2109: (63) *(u32 *)(r7 +29808) = r0 2110: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; xored with random 'secret' value // of 943576462 before store ... 2111: (b4) w11 = 943576462 2112: (af) r11 ^= r7 2113: (7b) *(u64 *)(r10 -16) = r11 2114: (79) r11 = *(u64 *)(r10 -16) 2115: (b4) w2 = 943576462 2116: (af) r2 ^= r11 // ... and restored with the same 'secret' value with the help of AX reg. 2117: (71) r3 = *(u8 *)(r2 +0) [...] While the above would not prevent speculation, it would make data leakage infeasible by directing it to random locations. In order to be effective and prevent type confusion under speculation, such random secret would have to be regenerated for each store. The additional complexity involved for a tracking mechanism that prevents jumps such that restoring spilled pointers would not get corrupted is not worth the gain for unprivileged. Hence, the fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC instruction which the x86 JIT translates into a lfence opcode. Inserting the latter in between the store and load instruction is one of the mitigations options [1]. The x86 instruction manual notes: [...] An LFENCE that follows an instruction that stores to memory might complete before the data being stored have become globally visible. [...] The latter meaning that the preceding store instruction finished execution and the store is at minimum guaranteed to be in the CPU's store queue, but it's not guaranteed to be in that CPU's L1 cache at that point (globally visible). The latter would only be guaranteed via sfence. So the load which is guaranteed to execute after the lfence for that local CPU would have to rely on store-to-load forwarding. [2], in section 2.3 on store buffers says: [...] For every store operation that is added to the ROB, an entry is allocated in the store buffer. This entry requires both the virtual and physical address of the target. Only if there is no free entry in the store buffer, the frontend stalls until there is an empty slot available in the store buffer again. Otherwise, the CPU can immediately continue adding subsequent instructions to the ROB and execute them out of order. On Intel CPUs, the store buffer has up to 56 entries. [...] One small upside on the fix is that it lifts constraints from af86ca4e3088 where the sanitize_stack_off relative to r10 must be the same when coming from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX or BPF_ST instruction. This happens either when we store a pointer or data value to the BPF stack for the first time, or upon later pointer spills. The former needs to be enforced since otherwise stale stack data could be leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST | BPF_NOSPEC mapping is currently optimized away, but others could emit a speculation barrier as well if necessary. For real-world unprivileged programs e.g. generated by LLVM, pointer spill/fill is only generated upon register pressure and LLVM only tries to do that for pointers which are not used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC sanitation for the STACK_INVALID case when the first write to a stack slot occurs e.g. upon map lookup. In future we might refine ways to mitigate the latter cost. [0] https://arxiv.org/pdf/1902.05178.pdf [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/ [2] https://arxiv.org/pdf/1905.05725.pdf Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack") Fixes: f7cf25b2026d ("bpf: track spill/fill of constants") Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 87 ++++++++++++++++---------------------------- 2 files changed, 33 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7ba7e800d472..828d08afeee0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -340,8 +340,8 @@ struct bpf_insn_aux_data { }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ - int sanitize_stack_off; /* stack slot to be cleared */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ + bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 657062cb4d85..f9bda5476ea5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2610,6 +2610,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0) reg = &cur->regs[value_regno]; + if (!env->bypass_spec_v4) { + bool sanitize = reg && is_spillable_regtype(reg->type); + + for (i = 0; i < size; i++) { + if (state->stack[spi].slot_type[i] == STACK_INVALID) { + sanitize = true; + break; + } + } + + if (sanitize) + env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + } if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { @@ -2632,47 +2645,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - - if (!env->bypass_spec_v4) { - bool sanitize = false; - - if (state->stack[spi].slot_type[0] == STACK_SPILL && - register_is_const(&state->stack[spi].spilled_ptr)) - sanitize = true; - for (i = 0; i < BPF_REG_SIZE; i++) - if (state->stack[spi].slot_type[i] == STACK_MISC) { - sanitize = true; - break; - } - if (sanitize) { - int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; - int soff = (-spi - 1) * BPF_REG_SIZE; - - /* detected reuse of integer stack slot with a pointer - * which means either llvm is reusing stack slot or - * an attacker is trying to exploit CVE-2018-3639 - * (speculative store bypass) - * Have to sanitize that slot with preemptive - * store of zero. - */ - if (*poff && *poff != soff) { - /* disallow programs where single insn stores - * into two different stack slots, since verifier - * cannot sanitize them - */ - verbose(env, - "insn %d cannot access two stack slots fp%d and fp%d", - insn_idx, *poff, soff); - return -EINVAL; - } - *poff = soff; - } - } save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -11913,35 +11889,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; + bool ctx_access; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || - insn->code == (BPF_STX | BPF_MEM | BPF_H) || - insn->code == (BPF_STX | BPF_MEM | BPF_W) || - insn->code == (BPF_STX | BPF_MEM | BPF_DW)) + ctx_access = true; + } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW) || + insn->code == (BPF_ST | BPF_MEM | BPF_B) || + insn->code == (BPF_ST | BPF_MEM | BPF_H) || + insn->code == (BPF_ST | BPF_MEM | BPF_W) || + insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - else + ctx_access = BPF_CLASS(insn->code) == BPF_STX; + } else { continue; + } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_off) { + env->insn_aux_data[i + delta].sanitize_stack_spill) { struct bpf_insn patch[] = { - /* Sanitize suspicious stack slot with zero. - * There are no memory dependencies for this store, - * since it's only using frame pointer and immediate - * constant of zero - */ - BPF_ST_MEM(BPF_DW, BPF_REG_FP, - env->insn_aux_data[i + delta].sanitize_stack_off, - 0), - /* the original STX instruction will immediately - * overwrite the same stack slot with appropriate value - */ *insn, + BPF_ST_NOSPEC(), }; cnt = ARRAY_SIZE(patch); @@ -11955,6 +11929,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } + if (!ctx_access) + continue; + switch (env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) -- cgit v1.2.3-71-gd317 From 10102a890b543a8a08457dc69fa55bc032403c7d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 27 Jul 2021 14:06:35 +0100 Subject: printk: Add printk.console_no_auto_verbose boot parameter console_verbose() increases console loglevel to CONSOLE_LOGLEVEL_MOTORMOUTH, which provides more information to debug a panic/oops. Unfortunately, in Arista we maintain some DUTs (Device Under Test) that are configured to have 9600 baud rate. While verbose console messages have their value to post-analyze crashes, on such setup they: - may prevent panic/oops messages being printed - take too long to flush on console resulting in watchdog reboot In all our setups we use kdump which saves dmesg buffer after panic, so in reality those extra messages on console provide no additional value, but rather add risk of not getting to __crash_kexec(). Provide printk.console_no_auto_verbose boot parameter, which allows to switch off printk being verbose on oops/panic/lockdep. Cc: Andrew Morton Cc: John Ogness Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Dmitry Safonov Suggested-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210727130635.675184-3-dima@arista.com --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++++ include/linux/printk.h | 6 +----- kernel/printk/printk.c | 12 ++++++++++++ 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 26453f250683..fdd80888217a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4101,6 +4101,15 @@ Format: (1/Y/y=enable, 0/N/n=disable) default: disabled + printk.console_no_auto_verbose= + Disable console loglevel raise on oops, panic + or lockdep-detected issues (only if lock debug is on). + With an exception to setups with low baudrate on + serial console, keeping this 0 is a good choice + in order to provide more debug information. + Format: + default: 0 (auto_verbose is enabled) + printk.devkmsg={on,off,ratelimit} Control writing to /dev/kmsg. on - unlimited logging to /dev/kmsg from userspace diff --git a/include/linux/printk.h b/include/linux/printk.h index f3f1a1eb19bd..a5e1c5adfc3f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -69,11 +69,7 @@ extern int console_printk[]; #define minimum_console_loglevel (console_printk[2]) #define default_console_loglevel (console_printk[3]) -static inline void console_verbose(void) -{ - if (console_loglevel) - console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; -} +extern void console_verbose(void); /* strlen("ratelimit") + 1 */ #define DEVKMSG_STR_MAX_SIZE 10 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 142a58d124d9..a6b94c3c5ac5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2404,6 +2404,18 @@ module_param_named(console_suspend, console_suspend_enabled, MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); +static bool printk_console_no_auto_verbose; + +void console_verbose(void) +{ + if (console_loglevel && !printk_console_no_auto_verbose) + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; +} +EXPORT_SYMBOL_GPL(console_verbose); + +module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); +MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); + /** * suspend_console - suspend the console subsystem * -- cgit v1.2.3-71-gd317 From f728c4a9e8405caae69d4bc1232c54ff57b5d20f Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 22 Jul 2021 11:03:52 +0800 Subject: workqueue: Fix possible memory leaks in wq_numa_init() In error handling branch "if (WARN_ON(node == NUMA_NO_NODE))", the previously allocated memories are not released. Doing this before allocating memory eliminates memory leaks. tj: Note that the condition only occurs when the arch code is pretty broken and the WARN_ON might as well be BUG_ON(). Signed-off-by: Zhen Lei Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f148eacda55a..542c2d03dab6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5902,6 +5902,13 @@ static void __init wq_numa_init(void) return; } + for_each_possible_cpu(cpu) { + if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + return; + } + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_unbound_numa_attrs_buf); @@ -5919,11 +5926,6 @@ static void __init wq_numa_init(void) for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); - if (WARN_ON(node == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - /* happens iff arch is bonkers, let's just proceed */ - return; - } cpumask_set_cpu(cpu, tbl[node]); } -- cgit v1.2.3-71-gd317 From d36216429ff3e69db4f6ea5e0c86b80010f5f30b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 28 Jul 2021 11:30:25 -0700 Subject: bpf: Emit better log message if bpf_iter ctx arg btf_id == 0 To avoid kernel build failure due to some missing .BTF-ids referenced functions/types, the patch ([1]) tries to fill btf_id 0 for these types. In bpf verifier, for percpu variable and helper returning btf_id cases, verifier already emitted proper warning with something like verbose(env, "Helper has invalid btf_id in R%d\n", regno); verbose(env, "invalid return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); But this is not the case for bpf_iter context arguments. I hacked resolve_btfids to encode btf_id 0 for struct task_struct. With `./test_progs -n 7/5`, I got, 0: (79) r2 = *(u64 *)(r1 +0) func 'bpf_iter_task' arg0 has btf_id 29739 type STRUCT 'bpf_iter_meta' ; struct seq_file *seq = ctx->meta->seq; 1: (79) r6 = *(u64 *)(r2 +0) ; struct task_struct *task = ctx->task; 2: (79) r7 = *(u64 *)(r1 +8) ; if (task == (void *)0) { 3: (55) if r7 != 0x0 goto pc+11 ... ; BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid); 26: (61) r1 = *(u32 *)(r7 +1372) Type '(anon)' is not a struct Basically, verifier will return btf_id 0 for task_struct. Later on, when the code tries to access task->tgid, the verifier correctly complains the type is '(anon)' and it is not a struct. Users still need to backtrace to find out what is going on. Let us catch the invalid btf_id 0 earlier and provide better message indicating btf_id is wrong. The new error message looks like below: R1 type=ctx expected=fp ; struct seq_file *seq = ctx->meta->seq; 0: (79) r2 = *(u64 *)(r1 +0) func 'bpf_iter_task' arg0 has btf_id 29739 type STRUCT 'bpf_iter_meta' ; struct seq_file *seq = ctx->meta->seq; 1: (79) r6 = *(u64 *)(r2 +0) ; struct task_struct *task = ctx->task; 2: (79) r7 = *(u64 *)(r1 +8) invalid btf_id for context argument offset 8 invalid bpf_context access off=8 size=8 [1] https://lore.kernel.org/bpf/20210727132532.2473636-1-hengqi.chen@gmail.com/ Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210728183025.1461750-1-yhs@fb.com --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7780131f710e..c395024610ed 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4825,6 +4825,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { + if (!ctx_arg_info->btf_id) { + bpf_log(log,"invalid btf_id for context argument offset %u\n", off); + return false; + } + info->reg_type = ctx_arg_info->reg_type; info->btf = btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; -- cgit v1.2.3-71-gd317 From d5ee8e750c9449e9849a09ce6fb6b8adeaa66adc Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Tue, 20 Jul 2021 11:05:11 -0400 Subject: padata: Convert from atomic_t to refcount_t on parallel_data->refcnt refcount_t type and corresponding API can protect refcounters from accidental underflow and overflow and further use-after-free situations. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Acked-by: Daniel Jordan Signed-off-by: Herbert Xu --- include/linux/padata.h | 3 ++- kernel/padata.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/padata.h b/include/linux/padata.h index a433f13fc4bf..495b16b6b4d7 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -12,6 +12,7 @@ #ifndef PADATA_H #define PADATA_H +#include #include #include #include @@ -96,7 +97,7 @@ struct parallel_data { struct padata_shell *ps; struct padata_list __percpu *reorder_list; struct padata_serial_queue __percpu *squeue; - atomic_t refcnt; + refcount_t refcnt; unsigned int seq_nr; unsigned int processed; int cpu; diff --git a/kernel/padata.c b/kernel/padata.c index d4d3ba6e1728..378c36080781 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -211,7 +211,7 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - atomic_inc(&pd->refcnt); + refcount_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = *cb_cpu; @@ -383,7 +383,7 @@ static void padata_serial_worker(struct work_struct *serial_work) } local_bh_enable(); - if (atomic_sub_and_test(cnt, &pd->refcnt)) + if (refcount_sub_and_test(cnt, &pd->refcnt)) padata_free_pd(pd); } @@ -593,7 +593,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_reorder_list(pd); padata_init_squeues(pd); pd->seq_nr = -1; - atomic_set(&pd->refcnt, 1); + refcount_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); INIT_WORK(&pd->reorder_work, invoke_padata_reorder); @@ -667,7 +667,7 @@ static int padata_replace(struct padata_instance *pinst) synchronize_rcu(); list_for_each_entry_continue_reverse(ps, &pinst->pslist, list) - if (atomic_dec_and_test(&ps->opd->refcnt)) + if (refcount_dec_and_test(&ps->opd->refcnt)) padata_free_pd(ps->opd); pinst->flags &= ~PADATA_RESET; -- cgit v1.2.3-71-gd317 From 09b1b13461e12e6962baf0c5bb9f65bedf284d90 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 29 Jul 2021 16:28:11 +0200 Subject: kcsan: use u64 instead of cycles_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cycles_t has a different type across architectures: unsigned int, unsinged long, or unsigned long long. Depending on architecture this will generate this warning: kernel/kcsan/debugfs.c: In function ‘microbenchmark’: ./include/linux/kern_levels.h:5:25: warning: format ‘%llu’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘cycles_t’ {aka ‘long unsigned int’} [-Wformat=] To avoid this simply change the type of cycle to u64 in microbenchmark(), since u64 is of type unsigned long long for all architectures. Acked-by: Marco Elver Link: https://lore.kernel.org/r/20210729142811.1309391-1-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- kernel/kcsan/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index e65de172ccf7..1d1d1b0e4248 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -64,7 +64,7 @@ static noinline void microbenchmark(unsigned long iters) { const struct kcsan_ctx ctx_save = current->kcsan_ctx; const bool was_enabled = READ_ONCE(kcsan_enabled); - cycles_t cycles; + u64 cycles; /* We may have been called from an atomic region; reset context. */ memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); -- cgit v1.2.3-71-gd317 From d92df42d7685445a2b6c815d9230d9699d9d400b Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:50 +0200 Subject: genirq: Improve "hwirq" output in /proc and /sys/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HW IRQ numbers generated by the PCI MSI layer can be quite large on a pSeries machine when running under the IBM Hypervisor and they appear as negative. Use '%lu' instead to show them correctly. Signed-off-by: Cédric Le Goater Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 2 +- kernel/irq/proc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index fadb93766020..4e3c29bb603c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -188,7 +188,7 @@ static ssize_t hwirq_show(struct kobject *kobj, raw_spin_lock_irq(&desc->lock); if (desc->irq_data.domain) - ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); + ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq); raw_spin_unlock_irq(&desc->lock); return ret; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 7c5cd42df3b9..ee595ec09778 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -513,7 +513,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, " %8s", "None"); } if (desc->irq_data.domain) - seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); + seq_printf(p, " %*lu", prec, desc->irq_data.hwirq); else seq_printf(p, " %*s", prec, ""); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL -- cgit v1.2.3-71-gd317 From ff41c28c4b54052942180d8b3f49e75f1445135a Mon Sep 17 00:00:00 2001 From: Kamal Agrawal Date: Fri, 30 Jul 2021 18:53:06 +0530 Subject: tracing: Fix NULL pointer dereference in start_creating The event_trace_add_tracer() can fail. In this case, it leads to a crash in start_creating with below call stack. Handle the error scenario properly in trace_array_create_dir. Call trace: down_write+0x7c/0x204 start_creating.25017+0x6c/0x194 tracefs_create_file+0xc4/0x2b4 init_tracer_tracefs+0x5c/0x940 trace_array_create_dir+0x58/0xb4 trace_array_create+0x1bc/0x2b8 trace_array_get_by_name+0xdc/0x18c Link: https://lkml.kernel.org/r/1627651386-21315-1-git-send-email-kamaagra@codeaurora.org Cc: stable@vger.kernel.org Fixes: 4114fbfd02f1 ("tracing: Enable creating new instance early boot") Signed-off-by: Kamal Agrawal Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c59dd35a6da5..33899a71fdc1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9135,8 +9135,10 @@ static int trace_array_create_dir(struct trace_array *tr) return -EINVAL; ret = event_trace_add_tracer(tr->dir, tr); - if (ret) + if (ret) { tracefs_remove(tr->dir); + return ret; + } init_tracer_tracefs(tr, tr->dir); __update_tracer_options(tr); -- cgit v1.2.3-71-gd317 From b61a28cf11d61f512172e673b8f8c4a6c789b425 Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Wed, 28 Jul 2021 18:47:41 +0200 Subject: bpf: Fix off-by-one in tail call count limiting Before, the interpreter allowed up to MAX_TAIL_CALL_CNT + 1 tail calls. Now precisely MAX_TAIL_CALL_CNT is allowed, which is in line with the behavior of the x86 JITs. Signed-off-by: Johan Almbladh Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210728164741.350370-1-johan.almbladh@anyfinetworks.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b1a5fc04492b..fe807b203a6f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1562,7 +1562,7 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; -- cgit v1.2.3-71-gd317 From 67ccddf86621b18dbffe56f11a106774ee8f44bd Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 28 Jul 2021 23:25:45 +0200 Subject: ftrace: Introduce ftrace_need_init_nop() Implementing live patching on s390 requires each function's prologue to contain a very special kind of nop, which gcc and clang don't generate. However, the current code assumes that if CC_USING_NOP_MCOUNT is defined, then whatever the compiler generates is good enough. Move the CC_USING_NOP_MCOUNT check into the new ftrace_need_init_nop() macro, that the architectures can override. An alternative solution is to disable using -mnop-mcount in the Makefile, however, this makes the build logic (even) more complicated and forces the arch-specific code to deal with the useless __fentry__ symbol. Signed-off-by: Ilya Leoshkevich Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20210728212546.128248-2-iii@linux.ibm.com Signed-off-by: Heiko Carstens --- include/linux/ftrace.h | 16 ++++++++++++++++ kernel/trace/ftrace.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a69f363b61bf..832e65f06754 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -643,6 +643,22 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } extern int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr); +/** + * ftrace_need_init_nop - return whether nop call sites should be initialized + * + * Normally the compiler's -mnop-mcount generates suitable nops, so we don't + * need to call ftrace_init_nop() if the code is built with that flag. + * Architectures where this is not always the case may define their own + * condition. + * + * Return must be: + * 0 if ftrace_init_nop() should be called + * Nonzero if ftrace_init_nop() should not be called + */ + +#ifndef ftrace_need_init_nop +#define ftrace_need_init_nop() (!__is_defined(CC_USING_NOP_MCOUNT)) +#endif /** * ftrace_init_nop - initialize a nop call site diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b180f61e6d3..7efbc8aaf7f6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3100,6 +3100,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { + bool init_nop = ftrace_need_init_nop(); struct ftrace_page *pg; struct dyn_ftrace *p; u64 start, stop; @@ -3138,8 +3139,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) * Do the initial record conversion from mcount jump * to the NOP instructions. */ - if (!__is_defined(CC_USING_NOP_MCOUNT) && - !ftrace_nop_initialize(mod, p)) + if (init_nop && !ftrace_nop_initialize(mod, p)) break; update_cnt++; -- cgit v1.2.3-71-gd317 From f558c2b834ec27e75d37b1c860c139e7b7c3a8e4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Aug 2021 12:45:01 +0200 Subject: sched/rt: Fix double enqueue caused by rt_effective_prio Double enqueues in rt runqueues (list) have been reported while running a simple test that spawns a number of threads doing a short sleep/run pattern while being concurrently setscheduled between rt and fair class. WARNING: CPU: 3 PID: 2825 at kernel/sched/rt.c:1294 enqueue_task_rt+0x355/0x360 CPU: 3 PID: 2825 Comm: setsched__13 RIP: 0010:enqueue_task_rt+0x355/0x360 Call Trace: __sched_setscheduler+0x581/0x9d0 _sched_setscheduler+0x63/0xa0 do_sched_setscheduler+0xa0/0x150 __x64_sys_sched_setscheduler+0x1a/0x30 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae list_add double add: new=ffff9867cb629b40, prev=ffff9867cb629b40, next=ffff98679fc67ca0. kernel BUG at lib/list_debug.c:31! invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI CPU: 3 PID: 2825 Comm: setsched__13 RIP: 0010:__list_add_valid+0x41/0x50 Call Trace: enqueue_task_rt+0x291/0x360 __sched_setscheduler+0x581/0x9d0 _sched_setscheduler+0x63/0xa0 do_sched_setscheduler+0xa0/0x150 __x64_sys_sched_setscheduler+0x1a/0x30 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae __sched_setscheduler() uses rt_effective_prio() to handle proper queuing of priority boosted tasks that are setscheduled while being boosted. rt_effective_prio() is however called twice per each __sched_setscheduler() call: first directly by __sched_setscheduler() before dequeuing the task and then by __setscheduler() to actually do the priority change. If the priority of the pi_top_task is concurrently being changed however, it might happen that the two calls return different results. If, for example, the first call returned the same rt priority the task was running at and the second one a fair priority, the task won't be removed by the rt list (on_list still set) and then enqueued in the fair runqueue. When eventually setscheduled back to rt it will be seen as enqueued already and the WARNING/BUG be issued. Fix this by calling rt_effective_prio() only once and then reusing the return value. While at it refactor code as well for clarity. Concurrent priority inheritance handling is still safe and will eventually converge to a new state by following the inheritance chain(s). Fixes: 0782e63bc6fe ("sched: Handle priority boosted tasks proper in setscheduler()") [squashed Peterz changes; added changelog] Reported-by: Mark Simmons Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210803104501.38333-1-juri.lelli@redhat.com --- kernel/sched/core.c | 90 +++++++++++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d9ff40f4661..20ffcc044134 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1981,12 +1981,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) +static inline int __normal_prio(int policy, int rt_prio, int nice) { - return p->static_prio; + int prio; + + if (dl_policy(policy)) + prio = MAX_DL_PRIO - 1; + else if (rt_policy(policy)) + prio = MAX_RT_PRIO - 1 - rt_prio; + else + prio = NICE_TO_PRIO(nice); + + return prio; } /* @@ -1998,15 +2004,7 @@ static inline int __normal_prio(struct task_struct *p) */ static inline int normal_prio(struct task_struct *p) { - int prio; - - if (task_has_dl_policy(p)) - prio = MAX_DL_PRIO-1; - else if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; + return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); } /* @@ -4099,7 +4097,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); - p->prio = p->normal_prio = __normal_prio(p); + p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); /* @@ -6341,6 +6339,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); +static void __setscheduler_prio(struct task_struct *p, int prio) +{ + if (dl_prio(prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + p->prio = prio; +} + #ifdef CONFIG_RT_MUTEXES static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) @@ -6456,22 +6466,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } else { p->dl.pi_se = &p->dl; } - p->sched_class = &dl_sched_class; } else if (rt_prio(prio)) { if (dl_prio(oldprio)) p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; - p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; - p->sched_class = &fair_sched_class; } - p->prio = prio; + __setscheduler_prio(p, prio); if (queued) enqueue_task(rq, p, queue_flag); @@ -6824,35 +6831,6 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr, bool keep_boost) -{ - /* - * If params can't change scheduling class changes aren't allowed - * either. - */ - if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) - return; - - __setscheduler_params(p, attr); - - /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). - */ - p->prio = normal_prio(p); - if (keep_boost) - p->prio = rt_effective_prio(p, p->prio); - - if (dl_prio(p->prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; -} - /* * Check the target process has a UID that matches the current process's: */ @@ -6873,10 +6851,8 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi) { - int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : - MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, queued, running; - int new_effective_prio, policy = attr->sched_policy; + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; struct callback_head *head; struct rq_flags rf; @@ -7074,6 +7050,7 @@ change: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; + newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); if (pi) { /* * Take priority boosted tasks into account. If the new @@ -7082,8 +7059,8 @@ change: * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_effective_prio(p, newprio); - if (new_effective_prio == oldprio) + newprio = rt_effective_prio(p, newprio); + if (newprio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } @@ -7096,7 +7073,10 @@ change: prev_class = p->sched_class; - __setscheduler(rq, p, attr, pi); + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } __setscheduler_uclamp(p, attr); if (queued) { -- cgit v1.2.3-71-gd317 From 1c6829cfd3d5124b125e6df41158665aea413b35 Mon Sep 17 00:00:00 2001 From: Mika Penttilä Date: Thu, 22 Jul 2021 09:39:46 +0300 Subject: sched/numa: Fix is_core_idle() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the loop variable instead of the function argument to test the other SMT siblings for idle. Fixes: ff7db0bf24db ("sched/numa: Prefer using an idle CPU as a migration target instead of comparing tasks") Signed-off-by: Mika Penttilä Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Acked-by: Pankaj Gupta Link: https://lkml.kernel.org/r/20210722063946.28951-1-mika.penttila@gmail.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11d22943753f..13c3fd40bd34 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1486,7 +1486,7 @@ static inline bool is_core_idle(int cpu) if (cpu == sibling) continue; - if (!idle_cpu(cpu)) + if (!idle_cpu(sibling)) return false; } #endif -- cgit v1.2.3-71-gd317 From f912d051619d11411867f642d2004928eb0b41b1 Mon Sep 17 00:00:00 2001 From: Wang Hui Date: Wed, 21 Jul 2021 17:11:09 +0800 Subject: sched: remove redundant on_rq status change activate_task/deactivate_task will change on_rq status, no need to do it again. Signed-off-by: Wang Hui Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210721091109.1406043-1-john.wanghui@huawei.com --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c22cd026440..6c562ad1ad5d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5659,11 +5659,9 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src, p, 0); set_task_cpu(p, this); activate_task(dst, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; resched_curr(dst); -- cgit v1.2.3-71-gd317 From f95091536f78971b269ec321b057b8d630b0ad8a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 27 Jul 2021 11:11:01 +0100 Subject: sched/deadline: Fix reset_on_fork reporting of DL tasks It is possible for sched_getattr() to incorrectly report the state of the reset_on_fork flag when called on a deadline task. Indeed, if the flag was set on a deadline task using sched_setattr() with flags (SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_KEEP_PARAMS), then p->sched_reset_on_fork will be set, but __setscheduler() will bail out early, which means that the dl_se->flags will not get updated by __setscheduler_params()->__setparam_dl(). Consequently, if sched_getattr() is then called on the task, __getparam_dl() will override kattr.sched_flags with the now out-of-date copy in dl_se->flags and report the stale value to userspace. To fix this, make sure to only copy the flags that are relevant to sched_deadline to and from the dl_se->flags field. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210727101103.2729607-2-qperret@google.com --- kernel/sched/deadline.c | 7 ++++--- kernel/sched/sched.h | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aaacd6cfd42f..5cafc642e647 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2741,7 +2741,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2754,7 +2754,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2851,7 +2852,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a1c6aeb9165..75a5c120e7d9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -- cgit v1.2.3-71-gd317 From 7ad721bf10718a4e480a27ded8bb16b8f6feb2d1 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 27 Jul 2021 11:11:02 +0100 Subject: sched: Don't report SCHED_FLAG_SUGOV in sched_getattr() SCHED_FLAG_SUGOV is supposed to be a kernel-only flag that userspace cannot interact with. However, sched_getattr() currently reports it in sched_flags if called on a sugov worker even though it is not actually defined in a UAPI header. To avoid this, make sure to clean-up the sched_flags field in sched_getattr() before returning to userspace. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210727101103.2729607-3-qperret@google.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c562ad1ad5d..314f70db3e5c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7530,6 +7530,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_priority = p->rt_priority; else kattr.sched_nice = task_nice(p); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK /* -- cgit v1.2.3-71-gd317 From 89aafd67f28c9e3b725aa30b44b7f61ad3e348ce Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Aug 2021 12:58:56 +0100 Subject: sched/fair: Use prev instead of new target as recent_used_cpu After select_idle_sibling, p->recent_used_cpu is set to the new target. However on the next wakeup, prev will be the same as recent_used_cpu unless the load balancer has moved the task since the last wakeup. It still works, but is less efficient than it could be. This patch preserves recent_used_cpu for longer. The impact on SIS efficiency is tiny so the SIS statistic patches were used to track the hit rate for using recent_used_cpu. With perf bench pipe on a 2-socket Cascadelake machine, the hit rate went from 57.14% to 85.32%. For more intensive wakeup loads like hackbench, the hit rate is almost negligible but rose from 0.21% to 6.64%. For scaling loads like tbench, the hit rate goes from almost 0% to 25.42% overall. Broadly speaking, on tbench, the success rate is much higher for lower thread counts and drops to almost 0 as the workload scales to towards saturation. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210804115857.6253-2-mgorman@techsingularity.net --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13c3fd40bd34..7ee394b4b4cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6347,6 +6347,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && @@ -6873,9 +6874,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - - if (want_affine) - current->recent_used_cpu = cpu; } rcu_read_unlock(); -- cgit v1.2.3-71-gd317 From 56498cfb045d7147cdcba33795d19429afcd1d00 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Aug 2021 12:58:57 +0100 Subject: sched/fair: Avoid a second scan of target in select_idle_cpu When select_idle_cpu starts scanning for an idle CPU, it starts with a target CPU that has already been checked by select_idle_sibling. This patch starts with the next CPU instead. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210804115857.6253-3-mgorman@techsingularity.net --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ee394b4b4cd..47a0fbf224b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6220,7 +6220,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); } - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits) -- cgit v1.2.3-71-gd317 From d2c8cce647f3022d5960a3bf2b50a2da341d9c8b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:13 +0200 Subject: PM: sleep: s2idle: Replace deprecated CPU-hotplug functions The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d8cae434f9eb..eb75f394a059 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -96,7 +96,7 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - get_online_cpus(); + cpus_read_lock(); cpuidle_resume(); /* Push all the CPUs into the idle loop. */ @@ -106,7 +106,7 @@ static void s2idle_enter(void) s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); - put_online_cpus(); + cpus_read_unlock(); raw_spin_lock_irq(&s2idle_lock); -- cgit v1.2.3-71-gd317 From 4fac49fd0a349aa3afb3ad7ec778a00592c7ab59 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 4 Aug 2021 12:44:07 +0200 Subject: PM: sleep: check RTC features instead of ops in suspend_test Test RTC_FEATURE_ALARM instead of relying on ops->set_alarm to know whether alarms are available. Signed-off-by: Alexandre Belloni Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index e1ed58adb69e..d20526c5be15 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -129,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data) { struct rtc_device *candidate = to_rtc_device(dev); - if (!candidate->ops->set_alarm) + if (!test_bit(RTC_FEATURE_ALARM, candidate->features)) return 0; if (!device_may_wakeup(candidate->dev.parent)) return 0; -- cgit v1.2.3-71-gd317 From 2c05caa7ba8803209769b9e4fe02c38d77ae88d0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 30 Jul 2021 17:19:51 -0400 Subject: tracing / histogram: Give calculation hist_fields a size When working on my user space applications, I found a bug in the synthetic event code where the automated synthetic event field was not matching the event field calculation it was attached to. Looking deeper into it, it was because the calculation hist_field was not given a size. The synthetic event fields are matched to their hist_fields either by having the field have an identical string type, or if that does not match, then the size and signed values are used to match the fields. The problem arose when I tried to match a calculation where the fields were "unsigned int". My tool created a synthetic event of type "u32". But it failed to match. The string was: diff=field1-field2:onmatch(event).trace(synth,$diff) Adding debugging into the kernel, I found that the size of "diff" was 0. And since it was given "unsigned int" as a type, the histogram fallback code used size and signed. The signed matched, but the size of u32 (4) did not match zero, and the event failed to be created. This can be worse if the field you want to match is not one of the acceptable fields for a synthetic event. As event fields can have any type that is supported in Linux, this can cause an issue. For example, if a type is an enum. Then there's no way to use that with any calculations. Have the calculation field simply take on the size of what it is calculating. Link: https://lkml.kernel.org/r/20210730171951.59c7743f@oasis.local.home Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Ingo Molnar Cc: Andrew Morton Cc: stable@vger.kernel.org Fixes: 100719dcef447 ("tracing: Add simple expression support to hist triggers") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 34325f41ebc0..362db9b81b8d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2287,6 +2287,10 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->operands[0] = operand1; expr->operands[1] = operand2; + + /* The operand sizes should be the same, so just pick one */ + expr->size = operand1->size; + expr->operator = field_op; expr->name = expr_str(expr, 0); expr->type = kstrdup(operand1->type, GFP_KERNEL); -- cgit v1.2.3-71-gd317 From a9d10ca4986571bffc19778742d508cc8dd13e02 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 28 Jul 2021 07:55:43 +0900 Subject: tracing: Reject string operand in the histogram expression Since the string type can not be the target of the addition / subtraction operation, it must be rejected. Without this fix, the string type silently converted to digits. Link: https://lkml.kernel.org/r/162742654278.290973.1523000673366456634.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 100719dcef447 ("tracing: Add simple expression support to hist triggers") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 362db9b81b8d..949ef09dc537 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -65,7 +65,8 @@ C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \ C(EMPTY_SORT_FIELD, "Empty sort field"), \ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ - C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), + C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ + C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), #undef C #define C(a, b) HIST_ERR_##a @@ -2156,6 +2157,13 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, ret = PTR_ERR(operand1); goto free; } + if (operand1->flags & HIST_FIELD_FL_STRING) { + /* String type can not be the operand of unary operator. */ + hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str)); + destroy_hist_field(operand1, 0); + ret = -EINVAL; + goto free; + } expr->flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); @@ -2257,6 +2265,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, operand1 = NULL; goto free; } + if (operand1->flags & HIST_FIELD_FL_STRING) { + hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str)); + ret = -EINVAL; + goto free; + } /* rest of string could be another expression e.g. b+c in a+b+c */ operand_flags = 0; @@ -2266,6 +2279,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, operand2 = NULL; goto free; } + if (operand2->flags & HIST_FIELD_FL_STRING) { + hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str)); + ret = -EINVAL; + goto free; + } ret = check_expr_operands(file->tr, operand1, operand2); if (ret) -- cgit v1.2.3-71-gd317 From 51397dc6f283bb570e1cf8226017d300d8ea1f5b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 4 Aug 2021 14:18:48 -0400 Subject: tracing: Quiet smp_processor_id() use in preemptable warning in hwlat The hardware latency detector (hwlat) has a mode that it runs one thread across CPUs. The logic to move from the currently running CPU to the next one in the list does a smp_processor_id() to find where it currently is. Unfortunately, it's done with preemption enabled, and this triggers a warning for using smp_processor_id() in a preempt enabled section. As it is only using smp_processor_id() to get information on where it currently is in order to simply move it to the next CPU, it doesn't really care if it got moved in the mean time. It will simply balance out later if such a case arises. Switch smp_processor_id() to raw_smp_processor_id() to quiet that warning. Link: https://lkml.kernel.org/r/20210804141848.79edadc0@oasis.local.home Acked-by: Daniel Bristot de Oliveira Fixes: 8fa826b7344d ("trace/hwlat: Implement the mode config option") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index a6c0cdaf4b87..14f46aae1981 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -327,7 +327,7 @@ static void move_to_next_cpu(void) get_online_cpus(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); - next_cpu = cpumask_next(smp_processor_id(), current_mask); + next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); put_online_cpus(); if (next_cpu >= nr_cpu_ids) -- cgit v1.2.3-71-gd317 From f7ec4121256393e1d03274acdca73eb18958f27e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 5 Aug 2021 09:27:15 -0400 Subject: tracepoint: static call: Compare data on transition from 2->1 callees On transition from 2->1 callees, we should be comparing .data rather than .func, because the same callback can be registered twice with different data, and what we care about here is that the data of array element 0 is unchanged to skip rcu sync. Link: https://lkml.kernel.org/r/20210805132717.23813-2-mathieu.desnoyers@efficios.com Link: https://lore.kernel.org/io-uring/4ebea8f0-58c9-e571-fd30-0ce4f6f09c70@samba.org/ Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrew Morton Cc: "Paul E. McKenney" Cc: Stefan Metzmacher Fixes: 547305a64632 ("tracepoint: Fix out of sync data passing by static caller") Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index fc32821f8240..133b6454b287 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -338,7 +338,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, } else { rcu_assign_pointer(tp->funcs, tp_funcs); tracepoint_update_call(tp, tp_funcs, - tp_funcs[0].func != old[0].func); + tp_funcs[0].data != old[0].data); } release_probes(old); return 0; -- cgit v1.2.3-71-gd317 From 231264d6927f6740af36855a622d0e240be9d94c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 5 Aug 2021 09:27:16 -0400 Subject: tracepoint: Fix static call function vs data state mismatch On a 1->0->1 callbacks transition, there is an issue with the new callback using the old callback's data. Considering __DO_TRACE_CALL: do { \ struct tracepoint_func *it_func_ptr; \ void *__data; \ it_func_ptr = \ rcu_dereference_raw((&__tracepoint_##name)->funcs); \ if (it_func_ptr) { \ __data = (it_func_ptr)->data; \ ----> [ delayed here on one CPU (e.g. vcpu preempted by the host) ] static_call(tp_func_##name)(__data, args); \ } \ } while (0) It has loaded the tp->funcs of the old callback, so it will try to use the old data. This can be fixed by adding a RCU sync anywhere in the 1->0->1 transition chain. On a N->2->1 transition, we need an rcu-sync because you may have a sequence of 3->2->1 (or 1->2->1) where the element 0 data is unchanged between 2->1, but was changed from 3->2 (or from 1->2), which may be observed by the static call. This can be fixed by adding an unconditional RCU sync in transition 2->1. Note, this fixes a correctness issue at the cost of adding a tremendous performance regression to the disabling of tracepoints. Before this commit: # trace-cmd start -e all # time trace-cmd start -p nop real 0m0.778s user 0m0.000s sys 0m0.061s After this commit: # trace-cmd start -e all # time trace-cmd start -p nop real 0m10.593s user 0m0.017s sys 0m0.259s A follow up fix will introduce a more lightweight scheme based on RCU get_state and cond_sync, that will return the performance back to what it was. As both this change and the lightweight versions are complex on their own, for bisecting any issues that this may cause, they are kept as two separate changes. Link: https://lkml.kernel.org/r/20210805132717.23813-3-mathieu.desnoyers@efficios.com Link: https://lore.kernel.org/io-uring/4ebea8f0-58c9-e571-fd30-0ce4f6f09c70@samba.org/ Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrew Morton Cc: "Paul E. McKenney" Cc: Stefan Metzmacher Fixes: d25e37d89dd2 ("tracepoint: Optimize using static_call()") Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 102 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 82 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 133b6454b287..8d772bd6894d 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -15,6 +15,13 @@ #include #include +enum tp_func_state { + TP_FUNC_0, + TP_FUNC_1, + TP_FUNC_2, + TP_FUNC_N, +}; + extern tracepoint_ptr_t __start___tracepoints_ptrs[]; extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; @@ -246,26 +253,29 @@ static void *func_remove(struct tracepoint_func **funcs, return old; } -static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs, bool sync) +/* + * Count the number of functions (enum tp_func_state) in a tp_funcs array. + */ +static enum tp_func_state nr_func_state(const struct tracepoint_func *tp_funcs) +{ + if (!tp_funcs) + return TP_FUNC_0; + if (!tp_funcs[1].func) + return TP_FUNC_1; + if (!tp_funcs[2].func) + return TP_FUNC_2; + return TP_FUNC_N; /* 3 or more */ +} + +static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs) { void *func = tp->iterator; /* Synthetic events do not have static call sites */ if (!tp->static_call_key) return; - - if (!tp_funcs[1].func) { + if (nr_func_state(tp_funcs) == TP_FUNC_1) func = tp_funcs[0].func; - /* - * If going from the iterator back to a single caller, - * we need to synchronize with __DO_TRACE to make sure - * that the data passed to the callback is the one that - * belongs to that callback. - */ - if (sync) - tracepoint_synchronize_unregister(); - } - __static_call_update(tp->static_call_key, tp->static_call_tramp, func); } @@ -299,9 +309,31 @@ static int tracepoint_add_func(struct tracepoint *tp, * a pointer to it. This array is referenced by __DO_TRACE from * include/linux/tracepoint.h using rcu_dereference_sched(). */ - tracepoint_update_call(tp, tp_funcs, false); - rcu_assign_pointer(tp->funcs, tp_funcs); - static_key_enable(&tp->key); + switch (nr_func_state(tp_funcs)) { + case TP_FUNC_1: /* 0->1 */ + /* Set static call to first function */ + tracepoint_update_call(tp, tp_funcs); + /* Both iterator and static call handle NULL tp->funcs */ + rcu_assign_pointer(tp->funcs, tp_funcs); + static_key_enable(&tp->key); + break; + case TP_FUNC_2: /* 1->2 */ + /* Set iterator static call */ + tracepoint_update_call(tp, tp_funcs); + /* + * Iterator callback installed before updating tp->funcs. + * Requires ordering between RCU assign/dereference and + * static call update/call. + */ + rcu_assign_pointer(tp->funcs, tp_funcs); + break; + case TP_FUNC_N: /* N->N+1 (N>1) */ + rcu_assign_pointer(tp->funcs, tp_funcs); + break; + default: + WARN_ON_ONCE(1); + break; + } release_probes(old); return 0; @@ -328,17 +360,47 @@ static int tracepoint_remove_func(struct tracepoint *tp, /* Failed allocating new tp_funcs, replaced func with stub */ return 0; - if (!tp_funcs) { + switch (nr_func_state(tp_funcs)) { + case TP_FUNC_0: /* 1->0 */ /* Removed last function */ if (tp->unregfunc && static_key_enabled(&tp->key)) tp->unregfunc(); static_key_disable(&tp->key); + /* Set iterator static call */ + tracepoint_update_call(tp, tp_funcs); + /* Both iterator and static call handle NULL tp->funcs */ + rcu_assign_pointer(tp->funcs, NULL); + /* + * Make sure new func never uses old data after a 1->0->1 + * transition sequence. + * Considering that transition 0->1 is the common case + * and don't have rcu-sync, issue rcu-sync after + * transition 1->0 to break that sequence by waiting for + * readers to be quiescent. + */ + tracepoint_synchronize_unregister(); + break; + case TP_FUNC_1: /* 2->1 */ rcu_assign_pointer(tp->funcs, tp_funcs); - } else { + /* + * On 2->1 transition, RCU sync is needed before setting + * static call to first callback, because the observer + * may have loaded any prior tp->funcs after the last one + * associated with an rcu-sync. + */ + tracepoint_synchronize_unregister(); + /* Set static call to first function */ + tracepoint_update_call(tp, tp_funcs); + break; + case TP_FUNC_2: /* N->N-1 (N>2) */ + fallthrough; + case TP_FUNC_N: rcu_assign_pointer(tp->funcs, tp_funcs); - tracepoint_update_call(tp, tp_funcs, - tp_funcs[0].data != old[0].data); + break; + default: + WARN_ON_ONCE(1); + break; } release_probes(old); return 0; -- cgit v1.2.3-71-gd317 From b4da13aa28d4fd0071247b7b41c579ee8a86c81a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 4 Aug 2021 15:59:25 +0200 Subject: sched/deadline: Fix missing clock update in migrate_task_rq_dl() A missing clock update is causing the following warning: rq->clock_update_flags < RQCF_ACT_SKIP WARNING: CPU: 112 PID: 2041 at kernel/sched/sched.h:1453 sub_running_bw.isra.0+0x190/0x1a0 ... CPU: 112 PID: 2041 Comm: sugov:112 Tainted: G W 5.14.0-rc1 #1 Hardware name: WIWYNN Mt.Jade Server System B81.030Z1.0007/Mt.Jade Motherboard, BIOS 1.6.20210526 (SCP: 1.06.20210526) 2021/05/26 ... Call trace: sub_running_bw.isra.0+0x190/0x1a0 migrate_task_rq_dl+0xf8/0x1e0 set_task_cpu+0xa8/0x1f0 try_to_wake_up+0x150/0x3d4 wake_up_q+0x64/0xc0 __up_write+0xd0/0x1c0 up_write+0x4c/0x2b0 cppc_set_perf+0x120/0x2d0 cppc_cpufreq_set_target+0xe0/0x1a4 [cppc_cpufreq] __cpufreq_driver_target+0x74/0x140 sugov_work+0x64/0x80 kthread_worker_fn+0xe0/0x230 kthread+0x138/0x140 ret_from_fork+0x10/0x18 The task causing this is the `cppc_fie` DL task introduced by commit 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance"). With CONFIG_ACPI_CPPC_CPUFREQ_FIE=y and schedutil cpufreq governor on slow-switching system (like on this Ampere Altra WIWYNN Mt. Jade Arm Server): DL task `curr=sugov:112` lets `p=cppc_fie` migrate and since the latter is in `non_contending` state, migrate_task_rq_dl() calls sub_running_bw()->__sub_running_bw()->cpufreq_update_util()-> rq_clock()->assert_clock_updated() on p. Fix this by updating the clock for a non_contending task in migrate_task_rq_dl() before calling sub_running_bw(). Reported-by: Bruno Goncalves Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20210804135925.3734605-1-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5cafc642e647..e94314633b39 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused */ raw_spin_rq_lock(rq); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* -- cgit v1.2.3-71-gd317 From ca4984a7dd863f3e1c0df775ae3e744bff24c303 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 5 Aug 2021 11:21:53 +0100 Subject: sched: Fix UCLAMP_FLAG_IDLE setting The UCLAMP_FLAG_IDLE flag is set on a runqueue when dequeueing the last uclamp active task (that is, when buckets.tasks reaches 0 for all buckets) to maintain the last uclamp.max and prevent blocked util from suddenly becoming visible. However, there is an asymmetry in how the flag is set and cleared which can lead to having the flag set whilst there are active tasks on the rq. Specifically, the flag is cleared in the uclamp_rq_inc() path, which is called at enqueue time, but set in uclamp_rq_dec_id() which is called both when dequeueing a task _and_ in the update_uclamp_active() path. As a result, when both uclamp_rq_{dec,ind}_id() are called from update_uclamp_active(), the flag ends up being set but not cleared, hence leaving the runqueue in a broken state. Fix this by clearing the flag in update_uclamp_active() as well. Fixes: e496187da710 ("sched/uclamp: Enforce last task's UCLAMP_MAX") Reported-by: Rick Yiu Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20210805102154.590709-2-qperret@google.com --- kernel/sched/core.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 314f70db3e5c..df0480ad59b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1621,6 +1621,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void uclamp_update_active(struct task_struct *p) { @@ -1644,12 +1661,8 @@ uclamp_update_active(struct task_struct *p) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - for_each_clamp_id(clamp_id) { - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } -- cgit v1.2.3-71-gd317 From f4dddf90d58d77b48492b775868af4041a217f4c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 5 Aug 2021 11:21:54 +0100 Subject: sched: Skip priority checks with SCHED_FLAG_KEEP_PARAMS SCHED_FLAG_KEEP_PARAMS can be passed to sched_setattr to specify that the call must not touch scheduling parameters (nice or priority). This is particularly handy for uclamp when used in conjunction with SCHED_FLAG_KEEP_POLICY as that allows to issue a syscall that only impacts uclamp values. However, sched_setattr always checks whether the priorities and nice values passed in sched_attr are valid first, even if those never get used down the line. This is useless at best since userspace can trivially bypass this check to set the uclamp values by specifying low priorities. However, it is cumbersome to do so as there is no single expression of this that skips both RT and CFS checks at once. As such, userspace needs to query the task policy first with e.g. sched_getattr and then set sched_attr.sched_priority accordingly. This is racy and slower than a single call. As the priority and nice checks are useless when SCHED_FLAG_KEEP_PARAMS is specified, simply inherit them in this case to match the policy inheritance of SCHED_FLAG_KEEP_POLICY. Reported-by: Wei Wang Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Qais Yousef Link: https://lore.kernel.org/r/20210805102154.590709-3-qperret@google.com --- kernel/sched/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df0480ad59b0..433b4001e71e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7328,6 +7328,16 @@ err_size: return -E2BIG; } +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) + __getparam_dl(p, attr); + else if (task_has_rt_policy(p)) + attr->sched_priority = p->rt_priority; + else + attr->sched_nice = task_nice(p); +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -7389,6 +7399,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_unlock(); if (likely(p)) { + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); retval = sched_setattr(p, &attr); put_task_struct(p); } @@ -7537,12 +7549,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &kattr); - else if (task_has_rt_policy(p)) - kattr.sched_priority = p->rt_priority; - else - kattr.sched_nice = task_nice(p); + get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK -- cgit v1.2.3-71-gd317 From 7fcc17d0cb12938d2b3507973a6f93fc9ed2c7a1 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 3 Aug 2021 11:27:43 +0100 Subject: PM: EM: Increase energy calculation precision The Energy Model (EM) provides useful information about device power in each performance state to other subsystems like: Energy Aware Scheduler (EAS). The energy calculation in EAS does arithmetic operation based on the EM em_cpu_energy(). Current implementation of that function uses em_perf_state::cost as a pre-computed cost coefficient equal to: cost = power * max_frequency / frequency. The 'power' is expressed in milli-Watts (or in abstract scale). There are corner cases when the EAS energy calculation for two Performance Domains (PDs) return the same value. The EAS compares these values to choose smaller one. It might happen that this values are equal due to rounding error. In such scenario, we need better resolution, e.g. 1000 times better. To provide this possibility increase the resolution in the em_perf_state::cost for 64-bit architectures. The cost of increasing resolution on 32-bit is pretty high (64-bit division) and is not justified since there are no new 32bit big.LITTLE EAS systems expected which would benefit from this higher resolution. This patch allows to avoid the rounding to milli-Watt errors, which might occur in EAS energy estimation for each PD. The rounding error is common for small tasks which have small utilization value. There are two places in the code where it makes a difference: 1. In the find_energy_efficient_cpu() where we are searching for best_delta. We might suffer there when two PDs return the same result, like in the example below. Scenario: Low utilized system e.g. ~200 sum_util for PD0 and ~220 for PD1. There are quite a few small tasks ~10-15 util. These tasks would suffer for the rounding error. These utilization values are typical when running games on Android. One of our partners has reported 5..10mA less battery drain when running with increased resolution. Some details: We have two PDs: PD0 (big) and PD1 (little) Let's compare w/o patch set ('old') and w/ patch set ('new') We are comparing energy w/ task and w/o task placed in the PDs a) 'old' w/o patch set, PD0 task_util = 13 cost = 480 sum_util_w/o_task = 215 sum_util_w_task = 228 scale_cpu = 1024 energy_w/o_task = 480 * 215 / 1024 = 100.78 => 100 energy_w_task = 480 * 228 / 1024 = 106.87 => 106 energy_diff = 106 - 100 = 6 (this is equal to 'old' PD1's energy_diff in 'c)') b) 'new' w/ patch set, PD0 task_util = 13 cost = 480 * 1000 = 480000 sum_util_w/o_task = 215 sum_util_w_task = 228 energy_w/o_task = 480000 * 215 / 1024 = 100781 energy_w_task = 480000 * 228 / 1024 = 106875 energy_diff = 106875 - 100781 = 6094 (this is not equal to 'new' PD1's energy_diff in 'd)') c) 'old' w/o patch set, PD1 task_util = 13 cost = 160 sum_util_w/o_task = 283 sum_util_w_task = 293 scale_cpu = 355 energy_w/o_task = 160 * 283 / 355 = 127.55 => 127 energy_w_task = 160 * 296 / 355 = 133.41 => 133 energy_diff = 133 - 127 = 6 (this is equal to 'old' PD0's energy_diff in 'a)') d) 'new' w/ patch set, PD1 task_util = 13 cost = 160 * 1000 = 160000 sum_util_w/o_task = 283 sum_util_w_task = 293 scale_cpu = 355 energy_w/o_task = 160000 * 283 / 355 = 127549 energy_w_task = 160000 * 296 / 355 = 133408 energy_diff = 133408 - 127549 = 5859 (this is not equal to 'new' PD0's energy_diff in 'b)') 2. Difference in the 6% energy margin filter at the end of find_energy_efficient_cpu(). With this patch the margin comparison also has better resolution, so it's possible to have better task placement thanks to that. Fixes: 27871f7a8a341ef ("PM: Introduce an Energy Model management framework") Reported-by: CCJ Yeh Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 16 ++++++++++++++++ kernel/power/energy_model.c | 4 +++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 3f221dbf5f95..1834752c5617 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -53,6 +53,22 @@ struct em_perf_domain { #ifdef CONFIG_ENERGY_MODEL #define EM_MAX_POWER 0xFFFF +/* + * Increase resolution of energy estimation calculations for 64-bit + * architectures. The extra resolution improves decision made by EAS for the + * task placement when two Performance Domains might provide similar energy + * estimation values (w/o better resolution the values could be equal). + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. + */ +#ifdef CONFIG_64BIT +#define em_scale_power(p) ((p) * 1000) +#else +#define em_scale_power(p) (p) +#endif + struct em_data_callback { /** * active_power() - Provide power at the next performance state of diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0f4530b3a8cd..a332ccd829e2 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -170,7 +170,9 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { - table[i].cost = div64_u64(fmax * table[i].power, + unsigned long power_res = em_scale_power(table[i].power); + + table[i].cost = div64_u64(fmax * power_res, table[i].frequency); } -- cgit v1.2.3-71-gd317 From e5c6b312ce3cc97e90ea159446e6bfa06645364d Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 5 Aug 2021 15:29:17 +0800 Subject: cpufreq: schedutil: Use kobject release() method to free sugov_tunables The struct sugov_tunables is protected by the kobject, so we can't free it directly. Otherwise we would get a call trace like this: ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x30 WARNING: CPU: 3 PID: 720 at lib/debugobjects.c:505 debug_print_object+0xb8/0x100 Modules linked in: CPU: 3 PID: 720 Comm: a.sh Tainted: G W 5.14.0-rc1-next-20210715-yocto-standard+ #507 Hardware name: Marvell OcteonTX CN96XX board (DT) pstate: 40400009 (nZcv daif +PAN -UAO -TCO BTYPE=--) pc : debug_print_object+0xb8/0x100 lr : debug_print_object+0xb8/0x100 sp : ffff80001ecaf910 x29: ffff80001ecaf910 x28: ffff00011b10b8d0 x27: ffff800011043d80 x26: ffff00011a8f0000 x25: ffff800013cb3ff0 x24: 0000000000000000 x23: ffff80001142aa68 x22: ffff800011043d80 x21: ffff00010de46f20 x20: ffff800013c0c520 x19: ffff800011d8f5b0 x18: 0000000000000010 x17: 6e6968207473696c x16: 5f72656d6974203a x15: 6570797420746365 x14: 6a626f2029302065 x13: 303378302f307830 x12: 2b6e665f72656d69 x11: ffff8000124b1560 x10: ffff800012331520 x9 : ffff8000100ca6b0 x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 0000000000000001 x5 : ffff800011d8c000 x4 : ffff800011d8c740 x3 : 0000000000000000 x2 : ffff0001108301c0 x1 : ab3c90eedf9c0f00 x0 : 0000000000000000 Call trace: debug_print_object+0xb8/0x100 __debug_check_no_obj_freed+0x1c0/0x230 debug_check_no_obj_freed+0x20/0x88 slab_free_freelist_hook+0x154/0x1c8 kfree+0x114/0x5d0 sugov_exit+0xbc/0xc0 cpufreq_exit_governor+0x44/0x90 cpufreq_set_policy+0x268/0x4a8 store_scaling_governor+0xe0/0x128 store+0xc0/0xf0 sysfs_kf_write+0x54/0x80 kernfs_fop_write_iter+0x128/0x1c0 new_sync_write+0xf0/0x190 vfs_write+0x2d4/0x478 ksys_write+0x74/0x100 __arm64_sys_write+0x24/0x30 invoke_syscall.constprop.0+0x54/0xe0 do_el0_svc+0x64/0x158 el0_svc+0x2c/0xb0 el0t_64_sync_handler+0xb0/0xb8 el0t_64_sync+0x198/0x19c irq event stamp: 5518 hardirqs last enabled at (5517): [] console_unlock+0x554/0x6c8 hardirqs last disabled at (5518): [] el1_dbg+0x28/0xa0 softirqs last enabled at (5504): [] __do_softirq+0x4d0/0x6c0 softirqs last disabled at (5483): [] irq_exit+0x1b0/0x1b8 So split the original sugov_tunables_free() into two functions, sugov_clear_global_tunables() is just used to clear the global_tunables and the new sugov_tunables_free() is used as kobj_type::release to release the sugov_tunables safely. Fixes: 9bdcb44e391d ("cpufreq: schedutil: New governor based on scheduler utilization data") Cc: 4.7+ # 4.7+ Signed-off-by: Kevin Hao Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 57124614363d..e7af18857371 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -537,9 +537,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + + kfree(to_sugov_tunables(attr_set)); +} + static struct kobj_type sugov_tunables_ktype = { .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -639,12 +647,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -707,7 +713,7 @@ out: fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -734,7 +740,7 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); -- cgit v1.2.3-71-gd317 From 7b40066c97ec66a44e388f82fcf694987451768f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 5 Aug 2021 15:29:54 -0400 Subject: tracepoint: Use rcu get state and cond sync for static call updates State transitions from 1->0->1 and N->2->1 callbacks require RCU synchronization. Rather than performing the RCU synchronization every time the state change occurs, which is quite slow when many tracepoints are registered in batch, instead keep a snapshot of the RCU state on the most recent transitions which belong to a chain, and conditionally wait for a grace period on the last transition of the chain if one g.p. has not elapsed since the last snapshot. This applies to both RCU and SRCU. This brings the performance regression caused by commit 231264d6927f ("Fix: tracepoint: static call function vs data state mismatch") back to what it was originally. Before this commit: # trace-cmd start -e all # time trace-cmd start -p nop real 0m10.593s user 0m0.017s sys 0m0.259s After this commit: # trace-cmd start -e all # time trace-cmd start -p nop real 0m0.878s user 0m0.000s sys 0m0.103s Link: https://lkml.kernel.org/r/20210805192954.30688-1-mathieu.desnoyers@efficios.com Link: https://lore.kernel.org/io-uring/4ebea8f0-58c9-e571-fd30-0ce4f6f09c70@samba.org/ Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrew Morton Cc: "Paul E. McKenney" Cc: Stefan Metzmacher Fixes: 231264d6927f ("Fix: tracepoint: static call function vs data state mismatch") Signed-off-by: Mathieu Desnoyers Reviewed-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 81 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 8d772bd6894d..efd14c79fab4 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -28,6 +28,44 @@ extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; DEFINE_SRCU(tracepoint_srcu); EXPORT_SYMBOL_GPL(tracepoint_srcu); +enum tp_transition_sync { + TP_TRANSITION_SYNC_1_0_1, + TP_TRANSITION_SYNC_N_2_1, + + _NR_TP_TRANSITION_SYNC, +}; + +struct tp_transition_snapshot { + unsigned long rcu; + unsigned long srcu; + bool ongoing; +}; + +/* Protected by tracepoints_mutex */ +static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC]; + +static void tp_rcu_get_state(enum tp_transition_sync sync) +{ + struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync]; + + /* Keep the latest get_state snapshot. */ + snapshot->rcu = get_state_synchronize_rcu(); + snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu); + snapshot->ongoing = true; +} + +static void tp_rcu_cond_sync(enum tp_transition_sync sync) +{ + struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync]; + + if (!snapshot->ongoing) + return; + cond_synchronize_rcu(snapshot->rcu); + if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu)) + synchronize_srcu(&tracepoint_srcu); + snapshot->ongoing = false; +} + /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; @@ -311,6 +349,11 @@ static int tracepoint_add_func(struct tracepoint *tp, */ switch (nr_func_state(tp_funcs)) { case TP_FUNC_1: /* 0->1 */ + /* + * Make sure new static func never uses old data after a + * 1->0->1 transition sequence. + */ + tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1); /* Set static call to first function */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ @@ -325,10 +368,15 @@ static int tracepoint_add_func(struct tracepoint *tp, * Requires ordering between RCU assign/dereference and * static call update/call. */ - rcu_assign_pointer(tp->funcs, tp_funcs); - break; + fallthrough; case TP_FUNC_N: /* N->N+1 (N>1) */ rcu_assign_pointer(tp->funcs, tp_funcs); + /* + * Make sure static func never uses incorrect data after a + * N->...->2->1 (N>1) transition sequence. + */ + if (tp_funcs[0].data != old[0].data) + tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); break; default: WARN_ON_ONCE(1); @@ -372,24 +420,23 @@ static int tracepoint_remove_func(struct tracepoint *tp, /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, NULL); /* - * Make sure new func never uses old data after a 1->0->1 - * transition sequence. - * Considering that transition 0->1 is the common case - * and don't have rcu-sync, issue rcu-sync after - * transition 1->0 to break that sequence by waiting for - * readers to be quiescent. + * Make sure new static func never uses old data after a + * 1->0->1 transition sequence. */ - tracepoint_synchronize_unregister(); + tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1); break; case TP_FUNC_1: /* 2->1 */ rcu_assign_pointer(tp->funcs, tp_funcs); /* - * On 2->1 transition, RCU sync is needed before setting - * static call to first callback, because the observer - * may have loaded any prior tp->funcs after the last one - * associated with an rcu-sync. + * Make sure static func never uses incorrect data after a + * N->...->2->1 (N>2) transition sequence. If the first + * element's data has changed, then force the synchronization + * to prevent current readers that have loaded the old data + * from calling the new function. */ - tracepoint_synchronize_unregister(); + if (tp_funcs[0].data != old[0].data) + tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); + tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1); /* Set static call to first function */ tracepoint_update_call(tp, tp_funcs); break; @@ -397,6 +444,12 @@ static int tracepoint_remove_func(struct tracepoint *tp, fallthrough; case TP_FUNC_N: rcu_assign_pointer(tp->funcs, tp_funcs); + /* + * Make sure static func never uses incorrect data after a + * N->...->2->1 (N>2) transition sequence. + */ + if (tp_funcs[0].data != old[0].data) + tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); break; default: WARN_ON_ONCE(1); -- cgit v1.2.3-71-gd317 From e6a901a44f76878ed1653626c9ff4cfc5a3f58f8 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Sun, 16 May 2021 00:45:11 +0800 Subject: rcu: Fix to include first blocked task in stall warning The for loop in rcu_print_task_stall() always omits ts[0], which points to the first task blocking the stalled grace period. This in turn fails to count this first task, which means that ndetected will be equal to zero when all CPUs have passed through their quiescent states and only one task is blocking the stalled grace period. This zero value for ndetected will in turn result in an incorrect "All QSes seen" message: rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: Tasks blocked on level-1 rcu_node (CPUs 12-23): (detected by 15, t=6504 jiffies, g=164777, q=9011209) rcu: All QSes seen, last rcu_preempt kthread activity 1 (4295252379-4295252378), jiffies_till_next_fqs=1, root ->qsmask 0x2 BUG: sleeping function called from invalid context at include/linux/uaccess.h:156 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 70613, name: msgstress04 INFO: lockdep is turned off. Preemption disabled at: [] create_object.isra.0+0x204/0x4b0 CPU: 15 PID: 70613 Comm: msgstress04 Kdump: loaded Not tainted 5.12.2-yoctodev-standard #1 Hardware name: Marvell OcteonTX CN96XX board (DT) Call trace: dump_backtrace+0x0/0x2cc show_stack+0x24/0x30 dump_stack+0x110/0x188 ___might_sleep+0x214/0x2d0 __might_sleep+0x7c/0xe0 This commit therefore fixes the loop to include ts[0]. Fixes: c583bcb8f5ed ("rcu: Don't invoke try_invoke_on_locked_down_task() with irqs disabled") Tested-by: Qais Yousef Signed-off-by: Yanfei Xu Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6c76988cc019..2e96f9741666 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -280,8 +280,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) break; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - for (i--; i; i--) { - t = ts[i]; + while (i) { + t = ts[--i]; if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else -- cgit v1.2.3-71-gd317 From dc87740c8a6806bd2162bfb441770e4e53be5601 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Sun, 16 May 2021 17:50:10 +0800 Subject: rcu: Fix stall-warning deadlock due to non-release of rcu_node ->lock If rcu_print_task_stall() is invoked on an rcu_node structure that does not contain any tasks blocking the current grace period, it takes an early exit that fails to release that rcu_node structure's lock. This results in a self-deadlock, which is detected by lockdep. To reproduce this bug: tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 3 --trust-make --configs "TREE03" --kconfig "CONFIG_PROVE_LOCKING=y" --bootargs "rcutorture.stall_cpu=30 rcutorture.stall_cpu_block=1 rcutorture.fwd_progress=0 rcutorture.test_boost=0" This will also result in other complaints, including RCU's scheduler hook complaining about blocking rather than preemption and an rcutorture writer stall. Only a partial RCU CPU stall warning message will be printed because of the self-deadlock. This commit therefore releases the lock on the rcu_print_task_stall() function's early exit path. Fixes: c583bcb8f5ed ("rcu: Don't invoke try_invoke_on_locked_down_task() with irqs disabled") Tested-by: Qais Yousef Signed-off-by: Yanfei Xu Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2e96f9741666..bd4de5bc5807 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -267,8 +267,10 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) struct task_struct *ts[8]; lockdep_assert_irqs_disabled(); - if (!rcu_preempt_blocked_readers_cgp(rnp)) + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; + } pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", rnp->level, rnp->grplo, rnp->grphi); t = list_entry(rnp->gp_tasks->prev, -- cgit v1.2.3-71-gd317 From a86baa69c2b7b85bab41692fa3ec188a5aae1d27 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 18 May 2021 19:17:16 -0700 Subject: rcu: Remove special bit at the bottom of the ->dynticks counter Commit b8c17e6664c4 ("rcu: Maintain special bits at bottom of ->dynticks counter") reserved a bit at the bottom of the ->dynticks counter to defer flushing of TLBs, but this facility never has been used. This commit therefore removes this capability along with the rcu_eqs_special_set() function used to trigger it. Link: https://lore.kernel.org/linux-doc/CALCETrWNPOOdTrFabTDd=H7+wc6xJ9rJceg6OL1S0rTV5pfSsA@mail.gmail.com/ Suggested-by: Andy Lutomirski Signed-off-by: "Joel Fernandes (Google)" [ paulmck: Forward-port to v5.13-rc1. ] Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 3 -- kernel/rcu/tree.c | 77 +++++++++---------------------------------------- 2 files changed, 14 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 953e70fafe38..9be015305f9f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -14,9 +14,6 @@ #include /* for HZ */ -/* Never flag non-existent other CPUs! */ -static inline bool rcu_eqs_special_set(int cpu) { return false; } - unsigned long get_state_synchronize_rcu(void); unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51f24ecd94b2..42a0032dd99f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -74,17 +74,10 @@ /* Data structures. */ -/* - * Steal a bit from the bottom of ->dynticks for idle entry/exit - * control. Initially this is for TLB flushing. - */ -#define RCU_DYNTICK_CTRL_MASK 0x1 -#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) - static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), + .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, #endif @@ -266,7 +259,6 @@ void rcu_softirq_qs(void) */ static noinstr void rcu_dynticks_eqs_enter(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -275,13 +267,9 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_inc_return(&this_cpu_ptr(&rcu_data)->dynticks); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_CTR)); - /* Better not have special action (TLB flush) pending! */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_MASK)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); } /* @@ -291,7 +279,6 @@ static noinstr void rcu_dynticks_eqs_enter(void) */ static noinstr void rcu_dynticks_eqs_exit(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -299,15 +286,10 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_inc_return(&this_cpu_ptr(&rcu_data)->dynticks); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(seq & RCU_DYNTICK_CTRL_CTR)); - if (seq & RCU_DYNTICK_CTRL_MASK) { - arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); - smp_mb__after_atomic(); /* _exit after clearing mask. */ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); } /* @@ -324,9 +306,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) + if (atomic_read(&rdp->dynticks) & 0x1) return; - atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + atomic_inc(&rdp->dynticks); } /* @@ -336,9 +318,7 @@ static void rcu_dynticks_eqs_online(void) */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(arch_atomic_read(&this_cpu_ptr(&rcu_data)->dynticks) & 0x1); } /* @@ -347,9 +327,7 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) */ static int rcu_dynticks_snap(struct rcu_data *rdp) { - int snap = atomic_add_return(0, &rdp->dynticks); - - return snap & ~RCU_DYNTICK_CTRL_MASK; + return atomic_add_return(0, &rdp->dynticks); } /* @@ -358,7 +336,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & RCU_DYNTICK_CTRL_CTR); + return !(snap & 0x1); } /* Return true if the specified CPU is currently idle from an RCU viewpoint. */ @@ -389,8 +367,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | - RCU_DYNTICK_CTRL_CTR); + snap = atomic_read(&rdp->dynticks) & ~0x1; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) @@ -398,32 +375,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) smp_rmb(); // Order *vp read and ->dynticks re-read. // If still in the same extended quiescent state, we are good! - return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); -} - -/* - * Set the special (bottom) bit of the specified CPU so that it - * will take special action (such as flushing its TLB) on the - * next exit from an extended quiescent state. Returns true if - * the bit was successfully set, or false if the CPU was not in - * an extended quiescent state. - */ -bool rcu_eqs_special_set(int cpu) -{ - int old; - int new; - int new_old; - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - - new_old = atomic_read(&rdp->dynticks); - do { - old = new_old; - if (old & RCU_DYNTICK_CTRL_CTR) - return false; - new = old | RCU_DYNTICK_CTRL_MASK; - new_old = atomic_cmpxchg(&rdp->dynticks, old, new); - } while (new_old != old); - return true; + return snap == atomic_read(&rdp->dynticks); } /* @@ -442,10 +394,9 @@ notrace void rcu_momentary_dyntick_idle(void) int special; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &this_cpu_ptr(&rcu_data)->dynticks); + special = atomic_add_return(2, &this_cpu_ptr(&rcu_data)->dynticks); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); + WARN_ON_ONCE(!(special & 0x1)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); -- cgit v1.2.3-71-gd317 From 2be57f732889277b07ccddd205ef0616c8c1941f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 May 2021 17:25:42 -0700 Subject: rcu: Weaken ->dynticks accesses and updates Accesses to the rcu_data structure's ->dynticks field have always been fully ordered because it was not possible to prove that weaker ordering was safe. However, with the removal of the rcu_eqs_special_set() function and the advent of the Linux-kernel memory model, it is now easy to show that two of the four original full memory barriers can be weakened to acquire and release operations. The remaining pair must remain full memory barriers. This change makes the memory ordering requirements more evident, and it might well also speed up the to-idle and from-idle fastpaths on some architectures. The following litmus test, adapted from one supplied off-list by Frederic Weisbecker, models the RCU grace-period kthread detecting an idle CPU that is concurrently transitioning to non-idle: C dynticks-from-idle { DYNTICKS=0; (* Initially idle. *) } P0(int *X, int *DYNTICKS) { int dynticks; int x; // Idle. dynticks = READ_ONCE(*DYNTICKS); smp_store_release(DYNTICKS, dynticks + 1); smp_mb(); // Now non-idle x = READ_ONCE(*X); } P1(int *X, int *DYNTICKS) { int dynticks; WRITE_ONCE(*X, 1); smp_mb(); dynticks = smp_load_acquire(DYNTICKS); } exists (1:dynticks=0 /\ 0:x=1) Running "herd7 -conf linux-kernel.cfg dynticks-from-idle.litmus" verifies this transition, namely, showing that if the RCU grace-period kthread (P1) sees another CPU as idle (P0), then any memory access prior to the start of the grace period (P1's write to X) will be seen by any RCU read-side critical section following the to-non-idle transition (P0's read from X). This is a straightforward use of full memory barriers to force ordering in a store-buffering (SB) litmus test. The following litmus test, also adapted from the one supplied off-list by Frederic Weisbecker, models the RCU grace-period kthread detecting a non-idle CPU that is concurrently transitioning to idle: C dynticks-into-idle { DYNTICKS=1; (* Initially non-idle. *) } P0(int *X, int *DYNTICKS) { int dynticks; // Non-idle. WRITE_ONCE(*X, 1); dynticks = READ_ONCE(*DYNTICKS); smp_store_release(DYNTICKS, dynticks + 1); smp_mb(); // Now idle. } P1(int *X, int *DYNTICKS) { int x; int dynticks; smp_mb(); dynticks = smp_load_acquire(DYNTICKS); x = READ_ONCE(*X); } exists (1:dynticks=2 /\ 1:x=0) Running "herd7 -conf linux-kernel.cfg dynticks-into-idle.litmus" verifies this transition, namely, showing that if the RCU grace-period kthread (P1) sees another CPU as newly idle (P0), then any pre-idle memory access (P0's write to X) will be seen by any code following the grace period (P1's read from X). This is a simple release-acquire pair forcing ordering in a message-passing (MP) litmus test. Of course, if the grace-period kthread detects the CPU as non-idle, it will refrain from reporting a quiescent state on behalf of that CPU, so there are no ordering requirements from the grace-period kthread in that case. However, other subsystems call rcu_is_idle_cpu() to check for CPUs being non-idle from an RCU perspective. That case is also verified by the above litmus tests with the proviso that the sense of the low-order bit of the DYNTICKS counter be inverted. Unfortunately, on x86 smp_mb() is as expensive as a cache-local atomic increment. This commit therefore weakens only the read from ->dynticks. However, the updates are abstracted into a rcu_dynticks_inc() function to ease any future changes that might be needed. [ paulmck: Apply Linus Torvalds feedback. ] Link: https://lore.kernel.org/lkml/20210721202127.2129660-4-paulmck@kernel.org/ Suggested-by: Linus Torvalds Acked-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 42a0032dd99f..381ebd3ae26c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -251,6 +251,15 @@ void rcu_softirq_qs(void) rcu_tasks_qs(current, false); } +/* + * Increment the current CPU's rcu_data structure's ->dynticks field + * with ordering. Return the new value. + */ +static noinline noinstr unsigned long rcu_dynticks_inc(int incby) +{ + return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks)); +} + /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state, that is, @@ -267,7 +276,7 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = arch_atomic_inc_return(&this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(1); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); } @@ -286,7 +295,7 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = arch_atomic_inc_return(&this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(1); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); @@ -308,7 +317,7 @@ static void rcu_dynticks_eqs_online(void) if (atomic_read(&rdp->dynticks) & 0x1) return; - atomic_inc(&rdp->dynticks); + rcu_dynticks_inc(1); } /* @@ -318,7 +327,7 @@ static void rcu_dynticks_eqs_online(void) */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - return !(arch_atomic_read(&this_cpu_ptr(&rcu_data)->dynticks) & 0x1); + return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1); } /* @@ -327,7 +336,8 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) */ static int rcu_dynticks_snap(struct rcu_data *rdp) { - return atomic_add_return(0, &rdp->dynticks); + smp_mb(); // Fundamental RCU ordering guarantee. + return atomic_read_acquire(&rdp->dynticks); } /* @@ -391,12 +401,12 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) */ notrace void rcu_momentary_dyntick_idle(void) { - int special; + int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - special = atomic_add_return(2, &this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(2); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & 0x1)); + WARN_ON_ONCE(!(seq & 0x1)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); -- cgit v1.2.3-71-gd317 From 5fcb3a5f04ee6422714adb02f5364042228bfc2e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 May 2021 13:35:50 -0700 Subject: rcu: Mark accesses to ->rcu_read_lock_nesting KCSAN flags accesses to ->rcu_read_lock_nesting as data races, but in the past, the overhead of marked accesses was excessive. However, that was long ago, and much has changed since then, both in terms of hardware and of compilers. Here is data taken on an eight-core laptop using Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz with a kernel built using gcc version 9.3.0, with all data in nanoseconds. Unmarked accesses (status quo), measured by three refscale runs: Minimum reader duration: 3.286 2.851 3.395 Median reader duration: 3.698 3.531 3.4695 Maximum reader duration: 4.481 5.215 5.157 Marked accesses, also measured by three refscale runs: Minimum reader duration: 3.501 3.677 3.580 Median reader duration: 4.053 3.723 3.895 Maximum reader duration: 7.307 4.999 5.511 This focused microbenhmark shows only sub-nanosecond differences which are unlikely to be visible at the system level. This commit therefore marks data-racing accesses to ->rcu_read_lock_nesting. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- kernel/rcu/tree_plugin.h | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d9680b798b21..cfeb43bfc719 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -53,7 +53,7 @@ void __rcu_read_unlock(void); * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ -#define rcu_preempt_depth() (current->rcu_read_lock_nesting) +#define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index de1dc3bb7f70..83a702a4e296 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -405,17 +405,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) static void rcu_preempt_read_enter(void) { - current->rcu_read_lock_nesting++; + WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1); } static int rcu_preempt_read_exit(void) { - return --current->rcu_read_lock_nesting; + int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1; + + WRITE_ONCE(current->rcu_read_lock_nesting, ret); + return ret; } static void rcu_preempt_depth_set(int val) { - current->rcu_read_lock_nesting = val; + WRITE_ONCE(current->rcu_read_lock_nesting, val); } /* -- cgit v1.2.3-71-gd317 From ccfc9dd6914feaa9a81f10f9cce56eb0f7712264 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 May 2021 00:56:23 +0900 Subject: rcu/tree: Handle VM stoppage in stall detection The soft watchdog timer function checks if a virtual machine was suspended and hence what looks like a lockup in fact is a false positive. This is what kvm_check_and_clear_guest_paused() does: it tests guest PVCLOCK_GUEST_STOPPED (which is set by the host) and if it's set then we need to touch all watchdogs and bail out. Watchdog timer function runs from IRQ, so PVCLOCK_GUEST_STOPPED check works fine. There is, however, one more watchdog that runs from IRQ, so watchdog timer fn races with it, and that watchdog is not aware of PVCLOCK_GUEST_STOPPED - RCU stall detector. apic_timer_interrupt() smp_apic_timer_interrupt() hrtimer_interrupt() __hrtimer_run_queues() tick_sched_timer() tick_sched_handle() update_process_times() rcu_sched_clock_irq() This triggers RCU stalls on our devices during VM resume. If tick_sched_handle()->rcu_sched_clock_irq() runs on a VCPU before watchdog_timer_fn()->kvm_check_and_clear_guest_paused() then there is nothing on this VCPU that touches watchdogs and RCU reads stale gp stall timestamp and new jiffies value, which makes it think that RCU has stalled. Make RCU stall watchdog aware of PVCLOCK_GUEST_STOPPED and don't report RCU stalls when we resume the VM. Signed-off-by: Sergey Senozhatsky Signed-off-by: Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index bd4de5bc5807..0e7a60706d1c 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,6 +7,8 @@ * Author: Paul E. McKenney */ +#include + ////////////////////////////////////////////////////////////////////////////// // // Controlling CPU stall warnings, including delay calculation. @@ -698,6 +700,14 @@ static void check_cpu_stall(struct rcu_data *rdp) (READ_ONCE(rnp->qsmask) & rdp->grpmask) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like an RCU stall. Check to see if the host + * stopped the vm. + */ + if (kvm_check_and_clear_guest_paused()) + return; + /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) @@ -707,6 +717,14 @@ static void check_cpu_stall(struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like an RCU stall. Check to see if the host + * stopped the vm. + */ + if (kvm_check_and_clear_guest_paused()) + return; + /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) -- cgit v1.2.3-71-gd317 From a80be428fbc1f1f3bc9ed9245906dd60850887f5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 May 2021 00:56:24 +0900 Subject: rcu: Do not disable GP stall detection in rcu_cpu_stall_reset() rcu_cpu_stall_reset() is one of the functions virtual CPUs execute during VM resume in order to handle jiffies skew that can trigger false positive stall warnings. Paul has pointed out that this approach is problematic because rcu_cpu_stall_reset() disables RCU grace period stall-detection virtually forever, while in fact it can just restart the stall-detection timeout. Suggested-by: "Paul E. McKenney" Signed-off-by: Sergey Senozhatsky Signed-off-by: Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0e7a60706d1c..e199022dce9d 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -119,17 +119,14 @@ static void panic_on_rcu_stall(void) } /** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. + * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period * * The caller must disable hard irqs. */ void rcu_cpu_stall_reset(void) { - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); } ////////////////////////////////////////////////////////////////////////////// -- cgit v1.2.3-71-gd317 From b169246feb1d82dbee5f3f6a4ce57368644dce95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 May 2021 14:23:03 -0700 Subject: rcu: Start timing stall repetitions after warning complete Systems with low-bandwidth consoles can have very large printk() latencies, and on such systems it makes no sense to have the next RCU CPU stall warning message start output before the prior message completed. This commit therefore sets the time of the next stall only after the prints have completed. While printing, the time of the next stall message is set to ULONG_MAX/2 jiffies into the future. Reviewed-by: Sergey Senozhatsky Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index e199022dce9d..42847caa3909 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -647,6 +647,7 @@ static void print_cpu_stall(unsigned long gps) static void check_cpu_stall(struct rcu_data *rdp) { + bool didstall = false; unsigned long gs1; unsigned long gs2; unsigned long gps; @@ -692,7 +693,7 @@ static void check_cpu_stall(struct rcu_data *rdp) ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + jn = jiffies + ULONG_MAX / 2; if (rcu_gp_in_progress() && (READ_ONCE(rnp->qsmask) & rdp->grpmask) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { @@ -709,6 +710,7 @@ static void check_cpu_stall(struct rcu_data *rdp) print_cpu_stall(gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); + didstall = true; } else if (rcu_gp_in_progress() && ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && @@ -726,6 +728,11 @@ static void check_cpu_stall(struct rcu_data *rdp) print_other_cpu_stall(gs2, gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); + didstall = true; + } + if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) { + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcu_state.jiffies_stall, jn); } } -- cgit v1.2.3-71-gd317 From 65bfdd36c113f5d579a382d8f2847210ea4cdca6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Jun 2021 16:31:38 -0700 Subject: srcutiny: Mark read-side data races This commit marks some interrupt-induced read-side data races in __srcu_read_lock(), __srcu_read_unlock(), and srcu_torture_stats_print(). Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 8 ++++---- kernel/rcu/srcutiny.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 0e0cf4d6a72a..6cfaa0a9a9b9 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -61,7 +61,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) int idx; idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; - WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1); + WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); return idx; } @@ -81,11 +81,11 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, { int idx; - idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; + idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1; pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", tt, tf, idx, - READ_ONCE(ssp->srcu_lock_nesting[!idx]), - READ_ONCE(ssp->srcu_lock_nesting[idx])); + data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])), + data_race(READ_ONCE(ssp->srcu_lock_nesting[idx]))); } #endif diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 26344dc6483b..a0ba2ed49bc6 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); */ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = ssp->srcu_lock_nesting[idx] - 1; + int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) -- cgit v1.2.3-71-gd317 From d9ee962feb4f26d4eac0042861457d941aa2df5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Jun 2021 10:17:36 -0700 Subject: rcu: Mark lockless ->qsmask read in rcu_check_boost_fail() Accesses to ->qsmask are normally protected by ->lock, but there is an exception in the diagnostic code in rcu_check_boost_fail(). This commit therefore applies data_race() to this access to avoid KCSAN complaining about the C-language writes protected by ->lock. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 42847caa3909..6dd6c9aa3f75 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -766,7 +766,7 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) rcu_for_each_leaf_node(rnp) { if (!cpup) { - if (READ_ONCE(rnp->qsmask)) { + if (data_race(READ_ONCE(rnp->qsmask))) { return false; } else { if (READ_ONCE(rnp->gp_tasks)) -- cgit v1.2.3-71-gd317 From f74126dcbcbffe0d9fc3cb9bbf171b124a6791e5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Jun 2021 21:57:02 -0700 Subject: rcu: Make rcu_gp_init() and rcu_gp_fqs_loop noinline to conserve stack The kbuild test project found an oversized stack frame in rcu_gp_kthread() for some kernel configurations. This oversizing was due to a very large amount of inlining, which is unnecessary due to the fact that this code executes infrequently. This commit therefore marks rcu_gp_init() and rcu_gp_fqs_loop noinline_for_stack to conserve stack space. Reported-by: kernel test robot Tested-by: Rong Chen [ paulmck: noinline_for_stack per Nathan Chancellor. ] Reviewed-by: Nathan Chancellor Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 381ebd3ae26c..d791c4d32987 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1733,7 +1733,7 @@ static void rcu_strict_gp_boundary(void *unused) /* * Initialize a new grace period. Return false if no grace period required. */ -static bool rcu_gp_init(void) +static noinline_for_stack bool rcu_gp_init(void) { unsigned long firstseq; unsigned long flags; @@ -1927,7 +1927,7 @@ static void rcu_gp_fqs(bool first_time) /* * Loop doing repeated quiescent-state forcing until the grace period ends. */ -static void rcu_gp_fqs_loop(void) +static noinline_for_stack void rcu_gp_fqs_loop(void) { bool first_gp_fqs; int gf = 0; -- cgit v1.2.3-71-gd317 From d283aa1b04d9ad9ed34bfc2f51ffe0371a16ee3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 May 2021 21:08:38 -0700 Subject: rcu: Mark accesses in tree_stall.h This commit marks the accesses in tree_stall.h so as to both avoid undesirable compiler optimizations and to keep KCSAN focused on the accesses of the core algorithm. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 63 ++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6dd6c9aa3f75..a8d0fcf0826f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -465,9 +465,10 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - data_race(rcu_state.gp_flags), - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->__state : ~0, cpu); + data_race(READ_ONCE(rcu_state.gp_flags)), + gp_state_getname(rcu_state.gp_state), + data_race(READ_ONCE(rcu_state.gp_state)), + gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu); if (gpk) { pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); @@ -510,7 +511,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void) (long)rcu_seq_current(&rcu_state.gp_seq), data_race(rcu_state.gp_flags), gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, - gpk->__state); + data_race(READ_ONCE(gpk->__state))); pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu)); } @@ -569,11 +570,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = data_race(rcu_state.gp_activity); + gpa = data_race(READ_ONCE(rcu_state.gp_activity)); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - data_race(jiffies_till_next_fqs), - rcu_get_root()->qsmask); + data_race(READ_ONCE(jiffies_till_next_fqs)), + data_race(READ_ONCE(rcu_get_root()->qsmask))); } } /* Rewrite if needed in case of slow consoles. */ @@ -815,32 +816,34 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - data_race(rcu_state.gp_activity); - jr = j - data_race(rcu_state.gp_req_activity); - js = j - data_race(rcu_state.gp_start); - jw = j - data_race(rcu_state.gp_wake_time); + ja = j - data_race(READ_ONCE(rcu_state.gp_activity)); + jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity)); + js = j - data_race(READ_ONCE(rcu_state.gp_start)); + jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time)); pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, t ? t->__state : 0x1ffff, t ? t->rt_priority : 0xffU, - js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), - (long)data_race(rcu_state.gp_seq), - (long)data_race(rcu_get_root()->gp_seq_needed), - data_race(rcu_state.gp_max), - data_race(rcu_state.gp_flags)); + data_race(READ_ONCE(rcu_state.gp_state)), + t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU, + js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)), + (long)data_race(READ_ONCE(rcu_state.gp_seq)), + (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)), + data_race(READ_ONCE(rcu_state.gp_max)), + data_race(READ_ONCE(rcu_state.gp_flags))); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) && - !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) && - !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks)) + !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) && + !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n", rnp->grplo, rnp->grphi, - (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed), - data_race(rnp->qsmask), - ".b"[!!data_race(rnp->boost_kthread_task)], - ".B"[!!data_race(rnp->boost_tasks)], - ".E"[!!data_race(rnp->exp_tasks)], - ".G"[!!data_race(rnp->gp_tasks)], - data_race(rnp->n_boosts)); + (long)data_race(READ_ONCE(rnp->gp_seq)), + (long)data_race(READ_ONCE(rnp->gp_seq_needed)), + data_race(READ_ONCE(rnp->qsmask)), + ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))], + ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))], + ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))], + ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))], + data_race(READ_ONCE(rnp->n_boosts))); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -850,12 +853,12 @@ void show_rcu_gp_kthreads(void) READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)data_race(rdp->gp_seq_needed)); + cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed))); } } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - cbs += data_race(rdp->n_cbs_invoked); + cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } @@ -937,11 +940,11 @@ void rcu_fwd_progress_check(unsigned long j) if (rcu_gp_in_progress()) { pr_info("%s: GP age %lu jiffies\n", - __func__, jiffies - rcu_state.gp_start); + __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start))); show_rcu_gp_kthreads(); } else { pr_info("%s: Last GP end %lu jiffies ago\n", - __func__, jiffies - rcu_state.gp_end); + __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end))); preempt_disable(); rdp = this_cpu_ptr(&rcu_data); rcu_check_gp_start_stall(rdp->mynode, rdp, j); -- cgit v1.2.3-71-gd317 From eb880949ef41c98a203c4a033e06e05854d902ef Mon Sep 17 00:00:00 2001 From: Liu Song Date: Tue, 29 Jun 2021 21:55:51 +0800 Subject: rcu: Remove useless "ret" update in rcu_gp_fqs_loop() Within rcu_gp_fqs_loop(), the "ret" local variable is set to the return value from swait_event_idle_timeout_exclusive(), but "ret" is unconditionally overwritten later in the code. This commit therefore removes this useless assignment. Signed-off-by: Liu Song Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d791c4d32987..2f000e8a315d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1954,8 +1954,8 @@ static noinline_for_stack void rcu_gp_fqs_loop(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); - ret = swait_event_idle_timeout_exclusive( - rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq, + rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ -- cgit v1.2.3-71-gd317 From 8211e922de2854130e3633f52cd4fc2d7817ceb0 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Wed, 30 Jun 2021 22:08:02 +0800 Subject: rcu: Use per_cpu_ptr to get the pointer of per_cpu variable There are a few remaining locations in kernel/rcu that still use "&per_cpu()". This commit replaces them with "per_cpu_ptr(&)", and does not introduce any functional change. Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Neeraj Upadhyay Reviewed-by: Joel Fernandes (Google) Signed-off-by: Liu Song Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_stall.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8536c55df514..21f00194e69d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -920,7 +920,7 @@ reset_ipi: // Allow future IPIs to be sent on CPU and for task. // Also order this IPI handler against any later manipulations of // the intended task. - smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2f000e8a315d..7403328cdf99 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1286,7 +1286,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) */ jtsq = READ_ONCE(jiffies_to_sched_qs); ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); - rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); + rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || time_after(jiffies, rcu_state.jiffies_resched) || diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index a8d0fcf0826f..677ee3d8671b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -351,7 +351,7 @@ static void rcu_dump_cpu_stacks(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", rdp->last_accelerate & 0xffff, jiffies & 0xffff, -- cgit v1.2.3-71-gd317 From 508958259bb3d9ca4ec37f0abdb211e9a6f3daa2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 6 Jul 2021 01:43:43 +0200 Subject: rcu: Explain why rcu_all_qs() is a stub in preemptible TREE RCU The cond_resched() function reports an RCU quiescent state only in non-preemptible TREE RCU implementation. This commit therefore adds a comment explaining why cond_resched() does nothing in preemptible kernels. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Joel Fernandes Cc: Uladzislau Rezki Cc: Boqun Feng Cc: Peter Zijlstra (Intel) Cc: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d9ff40f4661..6a03c3fac55c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7781,6 +7781,17 @@ int __sched __cond_resched(void) preempt_schedule_common(); return 1; } + /* + * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, + * RCU readers leave no in-memory hints, which means that CPU-bound + * processes executing in kernel context might never report an + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. + */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); #endif -- cgit v1.2.3-71-gd317 From 521c89b3a4022269c75b35062358d1dae4ebfa79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jul 2021 11:52:12 -0700 Subject: rcu: Print human-readable message for schedule() in RCU reader The WARN_ON_ONCE() invocation within the CONFIG_PREEMPT=y version of rcu_note_context_switch() triggers when there is a voluntary context switch in an RCU read-side critical section, but there is quite a gap between the output of that WARN_ON_ONCE() and this RCU-usage error. This commit therefore converts the WARN_ON_ONCE() to a WARN_ONCE() that explicitly describes the problem in its message. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 83a702a4e296..e8b45ab72a79 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -346,7 +346,7 @@ void rcu_note_context_switch(bool preempt) trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0); + WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!"); if (rcu_preempt_depth() > 0 && !t->rcu_read_unlock_special.b.blocked) { -- cgit v1.2.3-71-gd317 From c4eb1f403243fc7bbb7de644db8587c03de36da6 Mon Sep 17 00:00:00 2001 From: Tatsuhiko Yasumatsu Date: Sat, 7 Aug 2021 00:04:18 +0900 Subject: bpf: Fix integer overflow involving bucket_size In __htab_map_lookup_and_delete_batch(), hash buckets are iterated over to count the number of elements in each bucket (bucket_size). If bucket_size is large enough, the multiplication to calculate kvmalloc() size could overflow, resulting in out-of-bounds write as reported by KASAN: [...] [ 104.986052] BUG: KASAN: vmalloc-out-of-bounds in __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.986489] Write of size 4194224 at addr ffffc9010503be70 by task crash/112 [ 104.986889] [ 104.987193] CPU: 0 PID: 112 Comm: crash Not tainted 5.14.0-rc4 #13 [ 104.987552] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 104.988104] Call Trace: [ 104.988410] dump_stack_lvl+0x34/0x44 [ 104.988706] print_address_description.constprop.0+0x21/0x140 [ 104.988991] ? __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.989327] ? __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.989622] kasan_report.cold+0x7f/0x11b [ 104.989881] ? __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.990239] kasan_check_range+0x17c/0x1e0 [ 104.990467] memcpy+0x39/0x60 [ 104.990670] __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.990982] ? __wake_up_common+0x4d/0x230 [ 104.991256] ? htab_of_map_free+0x130/0x130 [ 104.991541] bpf_map_do_batch+0x1fb/0x220 [...] In hashtable, if the elements' keys have the same jhash() value, the elements will be put into the same bucket. By putting a lot of elements into a single bucket, the value of bucket_size can be increased to trigger the integer overflow. Triggering the overflow is possible for both callers with CAP_SYS_ADMIN and callers without CAP_SYS_ADMIN. It will be trivial for a caller with CAP_SYS_ADMIN to intentionally reach this overflow by enabling BPF_F_ZERO_SEED. As this flag will set the random seed passed to jhash() to 0, it will be easy for the caller to prepare keys which will be hashed into the same value, and thus put all the elements into the same bucket. If the caller does not have CAP_SYS_ADMIN, BPF_F_ZERO_SEED cannot be used. However, it will be still technically possible to trigger the overflow, by guessing the random seed value passed to jhash() (32bit) and repeating the attempt to trigger the overflow. In this case, the probability to trigger the overflow will be low and will take a very long time. Fix the integer overflow by calling kvmalloc_array() instead of kvmalloc() to allocate memory. Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Signed-off-by: Tatsuhiko Yasumatsu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210806150419.109658-1-th.yasumatsu@gmail.com --- kernel/bpf/hashtab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 72c58cc516a3..9c011f3a2687 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1565,8 +1565,8 @@ alloc: /* We cannot do copy_from_user or copy_to_user inside * the rcu_read_lock. Allocate enough space here. */ - keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN); - values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN); + keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN); if (!keys || !values) { ret = -ENOMEM; goto after_loop; -- cgit v1.2.3-71-gd317 From 1d7db834a027ecfb5ac93be2e8bb45bfcbbf6eaa Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 22 Jul 2021 18:55:12 +0800 Subject: dma-debug: use memory_intersects() directly There is already a memory_intersects() helper in sections.h, use memory_intersects() directly instead of private overlap(). Signed-off-by: Kefeng Wang Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index dadae6255d05..fe6efd181614 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1064,20 +1064,10 @@ static void check_for_stack(struct device *dev, } } -static inline bool overlap(void *addr, unsigned long len, void *start, void *end) -{ - unsigned long a1 = (unsigned long)addr; - unsigned long b1 = a1 + len; - unsigned long a2 = (unsigned long)start; - unsigned long b2 = (unsigned long)end; - - return !(b1 <= a2 || a1 >= b2); -} - static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) { - if (overlap(addr, len, _stext, _etext) || - overlap(addr, len, __start_rodata, __end_rodata)) + if (memory_intersects(_stext, _etext, addr, len) || + memory_intersects(__start_rodata, __end_rodata, addr, len)) err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); } -- cgit v1.2.3-71-gd317 From 173735c346c412d9f084825ecb04f24ada0e2986 Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Thu, 22 Jul 2021 16:10:55 +0200 Subject: dma-debug: fix debugfs initialization order Due to link order, dma_debug_init is called before debugfs has a chance to initialize (via debugfs_init which also happens in the core initcall stage), so the directories for dma-debug are never created. Decouple dma_debug_fs_init from dma_debug_init and defer its init until core_initcall_sync (after debugfs has been initialized) while letting dma-debug initialization occur as soon as possible to catch any early mappings, as suggested in [1]. [1] https://lore.kernel.org/linux-iommu/YIgGa6yF%2Fadg8OSN@kroah.com/ Fixes: 15b28bbcd567 ("dma-debug: move initialization to common code") Signed-off-by: Anthony Iliopoulos Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index fe6efd181614..6c90c69e5311 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -792,7 +792,7 @@ static int dump_show(struct seq_file *seq, void *v) } DEFINE_SHOW_ATTRIBUTE(dump); -static void dma_debug_fs_init(void) +static int __init dma_debug_fs_init(void) { struct dentry *dentry = debugfs_create_dir("dma-api", NULL); @@ -805,7 +805,10 @@ static void dma_debug_fs_init(void) debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries); debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops); debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops); + + return 0; } +core_initcall_sync(dma_debug_fs_init); static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) { @@ -890,8 +893,6 @@ static int dma_debug_init(void) spin_lock_init(&dma_entry_hash[i].lock); } - dma_debug_fs_init(); - nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES); for (i = 0; i < nr_pages; ++i) dma_debug_create_entries(GFP_KERNEL); -- cgit v1.2.3-71-gd317 From fffe3cc8c2194f60c4af4fac7f27d25e8828f001 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:19 -0600 Subject: dma-mapping: allow map_sg() ops to return negative error codes Allow dma_map_sgtable() to pass errors from the map_sg() ops. This will be required for returning appropriate error codes when mapping P2PDMA memory. Introduce __dma_map_sg_attrs() which will return the raw error code from the map_sg operation (whether it be negative or zero). Then add a dma_map_sg_attrs() wrapper to convert any negative errors to zero to satisfy the existing calling convention. dma_map_sgtable() defines three error codes that .map_sg implementations are allowed to return: -EINVAL, -ENOMEM and -EIO. The latter of which is a generic return for cases that are passing DMA_MAPPING_ERROR through. dma_map_sgtable() will convert a zero error return for old map_sg() ops into a -EIO return and return any negative errors as reported. This allows map_sg implementations to start returning multiple negative error codes. Legacy map_sg implementations can continue to return zero until they are all converted. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- include/linux/dma-map-ops.h | 5 +-- include/linux/dma-mapping.h | 35 ++++--------------- kernel/dma/mapping.c | 82 ++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 84 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 0d53a96a3d64..2f842498c448 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -41,8 +41,9 @@ struct dma_map_ops { size_t size, enum dma_data_direction dir, unsigned long attrs); /* - * map_sg returns 0 on error and a value > 0 on success. - * It should never return a value < 0. + * map_sg should return a negative error code on error. See + * dma_map_sgtable() for a list of appropriate error codes + * and their meanings. */ int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 183e7103a66d..daa1e360f0ee 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -110,6 +110,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, @@ -174,6 +176,11 @@ static inline void dma_unmap_sg_attrs(struct device *dev, unsigned long attrs) { } +static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + return -EOPNOTSUPP; +} static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -343,34 +350,6 @@ static inline void dma_sync_single_range_for_device(struct device *dev, return dma_sync_single_for_device(dev, addr + offset, size, dir); } -/** - * dma_map_sgtable - Map the given buffer for DMA - * @dev: The device for which to perform the DMA operation - * @sgt: The sg_table object describing the buffer - * @dir: DMA direction - * @attrs: Optional DMA attributes for the map operation - * - * Maps a buffer described by a scatterlist stored in the given sg_table - * object for the @dir DMA operation by the @dev device. After success the - * ownership for the buffer is transferred to the DMA domain. One has to - * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the - * ownership of the buffer back to the CPU domain before touching the - * buffer by the CPU. - * - * Returns 0 on success or -EINVAL on error during mapping the buffer. - */ -static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, - enum dma_data_direction dir, unsigned long attrs) -{ - int nents; - - nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); - if (nents <= 0) - return -EINVAL; - sgt->nents = nents; - return 0; -} - /** * dma_unmap_sgtable - Unmap the given buffer for DMA * @dev: The device for which to perform the DMA operation diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 2b06a809d0b9..21e550076be5 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -177,12 +177,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, } EXPORT_SYMBOL(dma_unmap_page_attrs); -/* - * dma_maps_sg_attrs returns 0 on error and > 0 on success. - * It should never return a value < 0. - */ -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) +static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); int ents; @@ -197,13 +193,83 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); - BUG_ON(ents < 0); - debug_dma_map_sg(dev, sg, nents, ents, dir); + + if (ents > 0) + debug_dma_map_sg(dev, sg, nents, ents, dir); + else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO && ents != 0)) + return -EIO; return ents; } + +/** + * dma_map_sg_attrs - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sg: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist passed in the sg argument with + * nents segments for the @dir DMA operation by the @dev device. + * + * Returns the number of mapped entries (which can be less than nents) + * on success. Zero is returned for any error. + * + * dma_unmap_sg_attrs() should be used to unmap the buffer with the + * original sg and original nents (not the value returned by this funciton). + */ +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + int ret; + + ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs); + if (ret < 0) + return 0; + return ret; +} EXPORT_SYMBOL(dma_map_sg_attrs); +/** + * dma_map_sgtable - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After success, the + * ownership for the buffer is transferred to the DMA domain. One has to + * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the + * ownership of the buffer back to the CPU domain before touching the + * buffer by the CPU. + * + * Returns 0 on success or a negative error code on error. The following + * error codes are supported with the given meaning: + * + * -EINVAL - An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM - Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO - Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned DMA_MAPPING_ERROR. + */ +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + int nents; + + nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); + if (nents == 0) + return -EIO; + if (nents < 0) + return nents; + sgt->nents = nents; + return 0; +} +EXPORT_SYMBOL_GPL(dma_map_sgtable); + void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) -- cgit v1.2.3-71-gd317 From c81be74e7d790f090d3c8a52e20a334dc2506a3f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:20 -0600 Subject: dma-direct: return appropriate error code from dma_direct_map_sg() Now that the map_sg() op expects error codes instead of return zero on error, convert dma_direct_map_sg() to return an error code. Per the documentation for dma_map_sgtable(), -EIO is returned due to an DMA_MAPPING_ERROR with unknown cause. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f737e3347059..f33ceb68aef2 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -411,7 +411,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return 0; + return -EIO; } dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, -- cgit v1.2.3-71-gd317 From 66ab63104f9cab09209851fef168f5972d791903 Mon Sep 17 00:00:00 2001 From: Martin Oliveira Date: Thu, 29 Jul 2021 14:15:38 -0600 Subject: dma-mapping: return error code from dma_dummy_map_sg() The .map_sg() op now expects an error code instead of zero on failure. The only errno to return is -EINVAL in the case when DMA is not supported. Signed-off-by: Martin Oliveira Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- kernel/dma/dummy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index eacd4c5b10bf..b492d59ac77e 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, unsigned long attrs) { - return 0; + return -EINVAL; } static int dma_dummy_supported(struct device *hwdev, u64 mask) -- cgit v1.2.3-71-gd317 From d03c54419274f96434e2ad74e59e67ec6d54ca86 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 9 Aug 2021 17:13:31 +0200 Subject: dma-mapping: disallow .map_sg operations from returning zero on error Now that all the .map_sg operations have been converted to returning proper error codes, drop the code to handle a zero return value, add a warning if a zero is returned. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 21e550076be5..967b62692102 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -197,7 +197,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, if (ents > 0) debug_dma_map_sg(dev, sg, nents, ents, dir); else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO && ents != 0)) + ents != -EIO)) return -EIO; return ents; @@ -261,8 +261,6 @@ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, int nents; nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); - if (nents == 0) - return -EIO; if (nents < 0) return nents; sgt->nents = nents; -- cgit v1.2.3-71-gd317 From 71330842ff93ae67a066c1fa68d75672527312fa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Aug 2021 21:45:32 +0200 Subject: bpf: Add _kernel suffix to internal lockdown_bpf_read Rename LOCKDOWN_BPF_READ into LOCKDOWN_BPF_READ_KERNEL so we have naming more consistent with a LOCKDOWN_BPF_WRITE_USER option that we are adding. Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko --- include/linux/security.h | 2 +- kernel/bpf/helpers.c | 4 ++-- kernel/trace/bpf_trace.c | 8 ++++---- security/security.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 24eda04221e9..724d7a4a0c91 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -123,7 +123,7 @@ enum lockdown_reason { LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, - LOCKDOWN_BPF_READ, + LOCKDOWN_BPF_READ_KERNEL, LOCKDOWN_PERF, LOCKDOWN_TRACEFS, LOCKDOWN_XMON_RW, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 62cf00383910..0b04553e8c44 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1070,12 +1070,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b4916ef388ad..1836591197a5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -999,19 +999,19 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif #ifdef CONFIG_CGROUPS diff --git a/security/security.c b/security/security.c index 09533cbb7221..6b83ab4e9d66 100644 --- a/security/security.c +++ b/security/security.c @@ -61,7 +61,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", - [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", + [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", -- cgit v1.2.3-71-gd317 From f153c2246783ba210493054d99c66353f56423c9 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 30 Jul 2021 08:28:54 +0200 Subject: ucounts: add missing data type changes commit f9c82a4ea89c3 ("Increase size of ucounts to atomic_long_t") changed the data type of ucounts/ucounts_max to long, but missed to adjust a few other places. This is noticeable on big endian platforms from user space because the /proc/sys/user/max_*_names files all contain 0. v4 - Made the min and max constants long so the sysctl values are actually settable on little endian machines. -- EWB Fixes: f9c82a4ea89c ("Increase size of ucounts to atomic_long_t") Signed-off-by: Sven Schnelle Tested-by: Nathan Chancellor Tested-by: Linux Kernel Functional Testing Acked-by: Alexey Gladkov v1: https://lkml.kernel.org/r/20210721115800.910778-1-svens@linux.ibm.com v2: https://lkml.kernel.org/r/20210721125233.1041429-1-svens@linux.ibm.com v3: https://lkml.kernel.org/r/20210730062854.3601635-1-svens@linux.ibm.com Link: https://lkml.kernel.org/r/8735rijqlv.fsf_-_@disp2133 Signed-off-by: Eric W. Biederman --- fs/notify/fanotify/fanotify_user.c | 17 +++++++++++------ fs/notify/inotify/inotify_user.c | 17 +++++++++++------ kernel/ucount.c | 19 +++++++++++-------- 3 files changed, 33 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 64864fb40b40..28b67cb9458d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -54,22 +54,27 @@ static int fanotify_max_queued_events __read_mostly; #include +static long ft_zero = 0; +static long ft_int_max = INT_MAX; + struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &ft_zero, + .extra2 = &ft_int_max, }, { .procname = "max_user_marks", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &ft_zero, + .extra2 = &ft_int_max, }, { .procname = "max_queued_events", diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 98f61b31745a..62051247f6d2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -55,22 +55,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include +static long it_zero = 0; +static long it_int_max = INT_MAX; + struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_queued_events", diff --git a/kernel/ucount.c b/kernel/ucount.c index 77be3bbe3cc4..bb51849e6375 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -58,14 +58,17 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; -#define UCOUNT_ENTRY(name) \ - { \ - .procname = name, \ - .maxlen = sizeof(int), \ - .mode = 0644, \ - .proc_handler = proc_dointvec_minmax, \ - .extra1 = SYSCTL_ZERO, \ - .extra2 = SYSCTL_INT_MAX, \ +static long ue_zero = 0; +static long ue_int_max = INT_MAX; + +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(long), \ + .mode = 0644, \ + .proc_handler = proc_doulongvec_minmax, \ + .extra1 = &ue_zero, \ + .extra2 = &ue_int_max, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), -- cgit v1.2.3-71-gd317 From aeea1b86f9363f3feabb496534d886f082a89f21 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Sat, 31 Jul 2021 05:57:35 +0000 Subject: bpf, devmap: Exclude XDP broadcast to master device If the ingress device is bond slave, do not broadcast back through it or the bond master. Signed-off-by: Jussi Maki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210731055738.16820-5-joamaki@gmail.com --- kernel/bpf/devmap.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 542e94fa30b4..f02d04540c0c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -534,10 +534,9 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); } -static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp, - int exclude_ifindex) +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp) { - if (!obj || obj->dev->ifindex == exclude_ifindex || + if (!obj || !obj->dev->netdev_ops->ndo_xdp_xmit) return false; @@ -562,17 +561,48 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, return 0; } +static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) +{ + while (num_excluded--) { + if (ifindex == excluded[num_excluded]) + return true; + } + return false; +} + +/* Get ifindex of each upper device. 'indexes' must be able to hold at + * least MAX_NEST_DEV elements. + * Returns the number of ifindexes added. + */ +static int get_upper_ifindexes(struct net_device *dev, int *indexes) +{ + struct net_device *upper; + struct list_head *iter; + int n = 0; + + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + indexes[n++] = upper->ifindex; + } + return n; +} + int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0; struct bpf_dtab_netdev *dst, *last_dst = NULL; + int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct xdp_frame *xdpf; + int num_excluded = 0; unsigned int i; int err; + if (exclude_ingress) { + num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); + excluded_devices[num_excluded++] = dev_rx->ifindex; + } + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -581,7 +611,10 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!is_valid_dst(dst, xdp, exclude_ifindex)) + if (!is_valid_dst(dst, xdp)) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ @@ -601,7 +634,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { - if (!is_valid_dst(dst, xdp, exclude_ifindex)) + if (!is_valid_dst(dst, xdp)) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, + dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ @@ -675,18 +712,27 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int exclude_ifindex = exclude_ingress ? dev->ifindex : 0; struct bpf_dtab_netdev *dst, *last_dst = NULL; + int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct hlist_node *next; + int num_excluded = 0; unsigned int i; int err; + if (exclude_ingress) { + num_excluded = get_upper_ifindexes(dev, excluded_devices); + excluded_devices[num_excluded++] = dev->ifindex; + } + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!dst || dst->dev->ifindex == exclude_ifindex) + if (!dst) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ @@ -700,12 +746,17 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, return err; last_dst = dst; + } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { - if (!dst || dst->dev->ifindex == exclude_ifindex) + if (!dst) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, + dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ -- cgit v1.2.3-71-gd317 From 67dc8325370844ffce92aa59abe8b453aa6aa83c Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Sat, 31 Jul 2021 08:01:29 +0800 Subject: workqueue: Fix typo in comments Fix typo: *assing ==> assign *alloced ==> allocated *Retun ==> Return *excute ==> execute v1->v2: *reverse 'iff' *update changelog Signed-off-by: Cai Huoqing Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index d15a7730ee18..5fcf3d048a5a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -324,7 +324,7 @@ enum { * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to - * excute the work item on that CPU breaking the idleness, which in + * execute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. * diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 542c2d03dab6..d3c35e47aa90 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -524,7 +524,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** - * worker_pool_assign_id - allocate ID and assing it to @pool + * worker_pool_assign_id - allocate ID and assign it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned @@ -3763,7 +3763,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } -/* initialize newly alloced @pwq which is associated with @wq and @pool */ +/* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { @@ -5331,7 +5331,7 @@ static int workqueue_apply_unbound_cpumask(void) * the affinity of all unbound workqueues. This function check the @cpumask * and apply it to all unbound workqueues and updates all pwqs of them. * - * Retun: 0 - Success + * Return: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ -- cgit v1.2.3-71-gd317 From e441b56fe438fd126b9eea7d30c57d3cd3f34e14 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 4 Aug 2021 11:50:36 +0800 Subject: workqueue: Replace deprecated ida_simple_*() with ida_alloc()/ida_free() Replace ida_simple_get() with ida_alloc() and ida_simple_remove() with ida_free(), the latter is more concise and intuitive. In addition, if ida_alloc() fails, NULL is returned directly. This eliminates unnecessary initialization of two local variables and an 'if' judgment. Signed-off-by: Zhen Lei Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d3c35e47aa90..29f049463d88 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1912,14 +1912,14 @@ static void worker_detach_from_pool(struct worker *worker) */ static struct worker *create_worker(struct worker_pool *pool) { - struct worker *worker = NULL; - int id = -1; + struct worker *worker; + int id; char id_buf[16]; /* ID is needed to determine kthread name */ - id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); + id = ida_alloc(&pool->worker_ida, GFP_KERNEL); if (id < 0) - goto fail; + return NULL; worker = alloc_worker(pool->node); if (!worker) @@ -1954,8 +1954,7 @@ static struct worker *create_worker(struct worker_pool *pool) return worker; fail: - if (id >= 0) - ida_simple_remove(&pool->worker_ida, id); + ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } @@ -2378,7 +2377,7 @@ woke_up: set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); - ida_simple_remove(&pool->worker_ida, worker->id); + ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); kfree(worker); return 0; -- cgit v1.2.3-71-gd317 From ffd8bea81fbb5abe6518bce8d6297a149b935cf7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:20 +0200 Subject: workqueue: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Tejun Heo Reviewed-by: Lai Jiangshan Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29f049463d88..9bce39dba297 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3292,7 +3292,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -3304,7 +3304,7 @@ int schedule_on_each_cpu(work_func_t func) for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); + cpus_read_unlock(); free_percpu(works); return 0; } @@ -4015,14 +4015,14 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) static void apply_wqattrs_lock(void) { /* CPUs should stay stable across pwq creations and installations */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); - put_online_cpus(); + cpus_read_unlock(); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, @@ -4067,7 +4067,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Performs GFP_KERNEL allocations. * - * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus(). + * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). * * Return: 0 on success and -errno on failure. */ @@ -4195,7 +4195,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } - get_online_cpus(); + cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ @@ -4205,7 +4205,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -5167,10 +5167,10 @@ long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { long ret = -ENODEV; - get_online_cpus(); + cpus_read_lock(); if (cpu_online(cpu)) ret = work_on_cpu(cpu, fn, arg); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_on_cpu_safe); @@ -5442,7 +5442,7 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - get_online_cpus(); + cpus_read_lock(); rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, @@ -5452,7 +5452,7 @@ static ssize_t wq_pool_ids_show(struct device *dev, } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock(); - put_online_cpus(); + cpus_read_unlock(); return written; } -- cgit v1.2.3-71-gd317 From 6ba34d3c73674e46d9e126e4f0cee79e5ef2481c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Jul 2021 10:18:28 -0400 Subject: cgroup/cpuset: Fix violation of cpuset locking rule The cpuset fields that manage partition root state do not strictly follow the cpuset locking rule that update to cpuset has to be done with both the callback_lock and cpuset_mutex held. This is now fixed by making sure that the locking rule is upheld. Fixes: 3881b86128d0 ("cpuset: Add an error state to cpuset.sched.partition") Fixes: 4b842da276a8 ("cpuset: Make CPU hotplug work with partition") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 58 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 28a784bf64b1..13b5be6df4da 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1148,6 +1148,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpuset *parent = parent_cs(cpuset); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + int new_prs; bool part_error = false; /* Partition error? */ percpu_rwsem_assert_held(&cpuset_rwsem); @@ -1183,6 +1184,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * A cpumask update cannot make parent's effective_cpus become empty. */ adding = deleting = false; + new_prs = cpuset->partition_root_state; if (cmd == partcmd_enable) { cpumask_copy(tmp->addmask, cpuset->cpus_allowed); adding = true; @@ -1247,11 +1249,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, switch (cpuset->partition_root_state) { case PRS_ENABLED: if (part_error) - cpuset->partition_root_state = PRS_ERROR; + new_prs = PRS_ERROR; break; case PRS_ERROR: if (!part_error) - cpuset->partition_root_state = PRS_ENABLED; + new_prs = PRS_ENABLED; break; } /* @@ -1260,10 +1262,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, part_error = (prev_prs == PRS_ERROR); } - if (!part_error && (cpuset->partition_root_state == PRS_ERROR)) + if (!part_error && (new_prs == PRS_ERROR)) return 0; /* Nothing need to be done */ - if (cpuset->partition_root_state == PRS_ERROR) { + if (new_prs == PRS_ERROR) { /* * Remove all its cpus from parent's subparts_cpus. */ @@ -1272,7 +1274,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->subparts_cpus); } - if (!adding && !deleting) + if (!adding && !deleting && (new_prs == cpuset->partition_root_state)) return 0; /* @@ -1299,6 +1301,9 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, } parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); + + if (cpuset->partition_root_state != new_prs) + cpuset->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); return cmd == partcmd_update; @@ -1321,6 +1326,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; + int new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { @@ -1360,7 +1366,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's subparts_cpus changes. */ - if ((cp != cs) && cp->partition_root_state) { + new_prs = cp->partition_root_state; + if ((cp != cs) && new_prs) { switch (parent->partition_root_state) { case PRS_DISABLED: /* @@ -1370,7 +1377,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) */ WARN_ON_ONCE(cp->partition_root_state != PRS_ERROR); - cp->partition_root_state = PRS_DISABLED; + new_prs = PRS_DISABLED; /* * clear_bit() is an atomic operation and @@ -1391,11 +1398,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) /* * When parent is invalid, it has to be too. */ - cp->partition_root_state = PRS_ERROR; - if (cp->nr_subparts_cpus) { - cp->nr_subparts_cpus = 0; - cpumask_clear(cp->subparts_cpus); - } + new_prs = PRS_ERROR; break; } } @@ -1407,8 +1410,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus && - (cp->partition_root_state != PRS_ENABLED)) { + if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { cp->nr_subparts_cpus = 0; cpumask_clear(cp->subparts_cpus); } else if (cp->nr_subparts_cpus) { @@ -1435,6 +1437,10 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) = cpumask_weight(cp->subparts_cpus); } } + + if (new_prs != cp->partition_root_state) + cp->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && @@ -1944,25 +1950,25 @@ out: */ static int update_prstate(struct cpuset *cs, int new_prs) { - int err; + int err, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; - if (new_prs == cs->partition_root_state) + if (old_prs == new_prs) return 0; /* * Cannot force a partial or invalid partition root to a full * partition root. */ - if (new_prs && (cs->partition_root_state < 0)) + if (new_prs && (old_prs == PRS_ERROR)) return -EINVAL; if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; err = -EINVAL; - if (!cs->partition_root_state) { + if (!old_prs) { /* * Turning on partition root requires setting the * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed @@ -1981,14 +1987,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) update_flag(CS_CPU_EXCLUSIVE, cs, 0); goto out; } - cs->partition_root_state = PRS_ENABLED; } else { /* * Turning off partition root will clear the * CS_CPU_EXCLUSIVE bit. */ - if (cs->partition_root_state == PRS_ERROR) { - cs->partition_root_state = PRS_DISABLED; + if (old_prs == PRS_ERROR) { update_flag(CS_CPU_EXCLUSIVE, cs, 0); err = 0; goto out; @@ -1999,8 +2003,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (err) goto out; - cs->partition_root_state = PRS_DISABLED; - /* Turning off CS_CPU_EXCLUSIVE will not return error */ update_flag(CS_CPU_EXCLUSIVE, cs, 0); } @@ -2017,6 +2019,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) rebuild_sched_domains_locked(); out: + if (!err) { + spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + } + free_cpumasks(NULL, &tmpmask); return err; } @@ -3080,8 +3088,10 @@ retry: if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || (parent->partition_root_state == PRS_ERROR))) { if (cs->nr_subparts_cpus) { + spin_lock_irq(&callback_lock); cs->nr_subparts_cpus = 0; cpumask_clear(cs->subparts_cpus); + spin_unlock_irq(&callback_lock); compute_effective_cpumask(&new_cpus, cs, parent); } @@ -3095,7 +3105,9 @@ retry: cpumask_empty(&new_cpus)) { update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); + spin_lock_irq(&callback_lock); cs->partition_root_state = PRS_ERROR; + spin_unlock_irq(&callback_lock); } cpuset_force_rebuild(); } -- cgit v1.2.3-71-gd317 From c5c63b9a6a2e53757b598485b8adbafa56d6d9ee Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:07 +0200 Subject: cgroup: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Zefan Li Cc: Tejun Heo Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 13b5be6df4da..65258102e030 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -979,7 +979,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). + * Call with cpuset_mutex held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1040,11 +1040,11 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); rebuild_sched_domains_locked(); percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); } /** @@ -2288,7 +2288,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -2326,7 +2326,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return retval; } @@ -2337,7 +2337,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2352,7 +2352,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return retval; } @@ -2391,7 +2391,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2417,7 +2417,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2548,7 +2548,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, return -EINVAL; css_get(&cs->css); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2556,7 +2556,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, retval = update_prstate(cs, val); out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); css_put(&cs->css); return retval ?: nbytes; } @@ -2762,7 +2762,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); set_bit(CS_ONLINE, &cs->flags); @@ -2815,7 +2815,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -2834,7 +2834,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (is_partition_root(cs)) @@ -2855,7 +2855,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3-71-gd317 From 07d25971b220e477eb019fcb520a9f2e3ac966af Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 31 Jul 2021 20:30:11 +0800 Subject: locking/rtmutex: Use the correct rtmutex debugging config option It's CONFIG_DEBUG_RT_MUTEXES not CONFIG_DEBUG_RT_MUTEX. Fixes: f7efc4799f81 ("locking/rtmutex: Inline chainwalk depth check") Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Acked-by: Boqun Feng Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210731123011.4555-1-thunder.leizhen@huawei.com --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b5d9bb5202c6..ad0db322ed3b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -343,7 +343,7 @@ static __always_inline bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, enum rtmutex_chainwalk chwalk) { - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX)) + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) return waiter != NULL; return chwalk == RT_MUTEX_FULL_CHAINWALK; } -- cgit v1.2.3-71-gd317 From 51e1bb9eeaf7868db56e58f47848e364ab4c4129 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Aug 2021 12:43:17 +0200 Subject: bpf: Add lockdown check for probe_write_user helper Back then, commit 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") added the bpf_probe_write_user() helper in order to allow to override user space memory. Its original goal was to have a facility to "debug, divert, and manipulate execution of semi-cooperative processes" under CAP_SYS_ADMIN. Write to kernel was explicitly disallowed since it would otherwise tamper with its integrity. One use case was shown in cf9b1199de27 ("samples/bpf: Add test/example of using bpf_probe_write_user bpf helper") where the program DNATs traffic at the time of connect(2) syscall, meaning, it rewrites the arguments to a syscall while they're still in userspace, and before the syscall has a chance to copy the argument into kernel space. These days we have better mechanisms in BPF for achieving the same (e.g. for load-balancers), but without having to write to userspace memory. Of course the bpf_probe_write_user() helper can also be used to abuse many other things for both good or bad purpose. Outside of BPF, there is a similar mechanism for ptrace(2) such as PTRACE_PEEK{TEXT,DATA} and PTRACE_POKE{TEXT,DATA}, but would likely require some more effort. Commit 96ae52279594 explicitly dedicated the helper for experimentation purpose only. Thus, move the helper's availability behind a newly added LOCKDOWN_BPF_WRITE_USER lockdown knob so that the helper is disabled under the "integrity" mode. More fine-grained control can be implemented also from LSM side with this change. Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko --- include/linux/security.h | 1 + kernel/trace/bpf_trace.c | 5 +++-- security/security.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 724d7a4a0c91..5b7288521300 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -120,6 +120,7 @@ enum lockdown_reason { LOCKDOWN_MMIOTRACE, LOCKDOWN_DEBUGFS, LOCKDOWN_XMON_WR, + LOCKDOWN_BPF_WRITE_USER, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1836591197a5..fdd14072fc3b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -990,12 +990,13 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; - case BPF_FUNC_probe_write_user: - return bpf_get_probe_write_proto(); case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_write_user: + return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? + NULL : bpf_get_probe_write_proto(); case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: diff --git a/security/security.c b/security/security.c index 6b83ab4e9d66..9ffa9e9c5c55 100644 --- a/security/security.c +++ b/security/security.c @@ -58,6 +58,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", + [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", -- cgit v1.2.3-71-gd317 From a2baf4e8bb0f306fbed7b5e6197c02896a638ab5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 9 Aug 2021 18:04:13 -0700 Subject: bpf: Fix potentially incorrect results with bpf_get_local_storage() Commit b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed a bug for bpf_get_local_storage() helper so different tasks won't mess up with each other's percpu local storage. The percpu data contains 8 slots so it can hold up to 8 contexts (same or different tasks), for 8 different program runs, at the same time. This in general is sufficient. But our internal testing showed the following warning multiple times: [...] warning: WARNING: CPU: 13 PID: 41661 at include/linux/bpf-cgroup.h:193 __cgroup_bpf_run_filter_sock_ops+0x13e/0x180 RIP: 0010:__cgroup_bpf_run_filter_sock_ops+0x13e/0x180 tcp_call_bpf.constprop.99+0x93/0xc0 tcp_conn_request+0x41e/0xa50 ? tcp_rcv_state_process+0x203/0xe00 tcp_rcv_state_process+0x203/0xe00 ? sk_filter_trim_cap+0xbc/0x210 ? tcp_v6_inbound_md5_hash.constprop.41+0x44/0x160 tcp_v6_do_rcv+0x181/0x3e0 tcp_v6_rcv+0xc65/0xcb0 ip6_protocol_deliver_rcu+0xbd/0x450 ip6_input_finish+0x11/0x20 ip6_input+0xb5/0xc0 ip6_sublist_rcv_finish+0x37/0x50 ip6_sublist_rcv+0x1dc/0x270 ipv6_list_rcv+0x113/0x140 __netif_receive_skb_list_core+0x1a0/0x210 netif_receive_skb_list_internal+0x186/0x2a0 gro_normal_list.part.170+0x19/0x40 napi_complete_done+0x65/0x150 mlx5e_napi_poll+0x1ae/0x680 __napi_poll+0x25/0x120 net_rx_action+0x11e/0x280 __do_softirq+0xbb/0x271 irq_exit_rcu+0x97/0xa0 common_interrupt+0x7f/0xa0 asm_common_interrupt+0x1e/0x40 RIP: 0010:bpf_prog_1835a9241238291a_tw_egress+0x5/0xbac ? __cgroup_bpf_run_filter_skb+0x378/0x4e0 ? do_softirq+0x34/0x70 ? ip6_finish_output2+0x266/0x590 ? ip6_finish_output+0x66/0xa0 ? ip6_output+0x6c/0x130 ? ip6_xmit+0x279/0x550 ? ip6_dst_check+0x61/0xd0 [...] Using drgn [0] to dump the percpu buffer contents showed that on this CPU slot 0 is still available, but slots 1-7 are occupied and those tasks in slots 1-7 mostly don't exist any more. So we might have issues in bpf_cgroup_storage_unset(). Further debugging confirmed that there is a bug in bpf_cgroup_storage_unset(). Currently, it tries to unset "current" slot with searching from the start. So the following sequence is possible: 1. A task is running and claims slot 0 2. Running BPF program is done, and it checked slot 0 has the "task" and ready to reset it to NULL (not yet). 3. An interrupt happens, another BPF program runs and it claims slot 1 with the *same* task. 4. The unset() in interrupt context releases slot 0 since it matches "task". 5. Interrupt is done, the task in process context reset slot 0. At the end, slot 1 is not reset and the same process can continue to occupy slots 2-7 and finally, when the above step 1-5 is repeated again, step 3 BPF program won't be able to claim an empty slot and a warning will be issued. To fix the issue, for unset() function, we should traverse from the last slot to the first. This way, the above issue can be avoided. The same reverse traversal should also be done in bpf_get_local_storage() helper itself. Otherwise, incorrect local storage may be returned to BPF program. [0] https://github.com/osandov/drgn Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210810010413.1976277-1-yhs@fb.com --- include/linux/bpf-cgroup.h | 4 ++-- kernel/bpf/helpers.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8b77d08d4b47..6c9b10d82c80 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -201,8 +201,8 @@ static inline void bpf_cgroup_storage_unset(void) { int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { + if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) continue; this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0b04553e8c44..7a97b2f4747d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -397,8 +397,8 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) void *ptr; int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { + if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) continue; storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); -- cgit v1.2.3-71-gd317 From 826da771291fc25a428e871f9e7fb465e390f852 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:48 +0200 Subject: genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP X86 IO/APIC and MSI interrupts (when used without interrupts remapping) require that the affinity setup on startup is done before the interrupt is enabled for the first time as the non-remapped operation mode cannot safely migrate enabled interrupts from arbitrary contexts. Provide a new irq chip flag which allows affected hardware to request this. This has to be opt-in because there have been reports in the past that some interrupt chips cannot handle affinity setting before startup. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.779791738@linutronix.de --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 8e9a9ae471a6..c8293c817646 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -569,6 +569,7 @@ struct irq_chip { * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state + * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -581,6 +582,7 @@ enum { IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), + IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), }; #include diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 7f04c7d8296e..a98bcfc4be7b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) } else { switch (__irq_startup_managed(desc, aff, force)) { case IRQ_STARTUP_NORMAL: + if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP) + irq_setup_affinity(desc); ret = __irq_startup(desc); - irq_setup_affinity(desc); + if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)) + irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: irq_do_set_affinity(d, aff, false); -- cgit v1.2.3-71-gd317 From c576e0fcd6188d0edb50b0fb83f853433ef4819b Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Sun, 8 Aug 2021 15:24:33 +1000 Subject: kernel/pid.c: remove static qualifier from pidfd_create() With the idea of returning pidfds from the fanotify API, we need to expose a mechanism for creating pidfds. We drop the static qualifier from pidfd_create() and add its declaration to linux/pid.h so that the pidfd_create() helper can be called from other kernel subsystems i.e. fanotify. Link: https://lore.kernel.org/r/0c68653ec32f1b7143301f0231f7ed14062fd82b.1628398044.git.repnop@google.com Signed-off-by: Matthew Bobrowski Acked-by: Christian Brauner Signed-off-by: Jan Kara --- include/linux/pid.h | 1 + kernel/pid.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/pid.h b/include/linux/pid.h index fa10acb8d6a4..af308e15f174 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -78,6 +78,7 @@ struct file; extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); +int pidfd_create(struct pid *pid, unsigned int flags); static inline struct pid *get_pid(struct pid *pid) { diff --git a/kernel/pid.c b/kernel/pid.c index ebdf9c60cd0b..d3cd95b8b080 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -550,10 +550,12 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * + * This symbol should not be explicitly exported to loadable modules. + * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -static int pidfd_create(struct pid *pid, unsigned int flags) +int pidfd_create(struct pid *pid, unsigned int flags) { int fd; -- cgit v1.2.3-71-gd317 From 490b9ba881e2c6337bb09b68010803ae98e59f4a Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Sun, 8 Aug 2021 15:25:05 +1000 Subject: kernel/pid.c: implement additional checks upon pidfd_create() parameters By adding the pidfd_create() declaration to linux/pid.h, we effectively expose this function to the rest of the kernel. In order to avoid any unintended behavior, or set false expectations upon this function, ensure that constraints are forced upon each of the passed parameters. This includes the checking of whether the passed struct pid is a thread-group leader as pidfd creation is currently limited to such pid types. Link: https://lore.kernel.org/r/2e9b91c2d529d52a003b8b86c45f866153be9eb5.1628398044.git.repnop@google.com Signed-off-by: Matthew Bobrowski Acked-by: Christian Brauner Signed-off-by: Jan Kara --- kernel/pid.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index d3cd95b8b080..efe87db44683 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -559,6 +559,12 @@ int pidfd_create(struct pid *pid, unsigned int flags) { int fd; + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), flags | O_RDWR | O_CLOEXEC); if (fd < 0) @@ -598,10 +604,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) if (!p) return -ESRCH; - if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p, flags); - else - fd = -EINVAL; + fd = pidfd_create(p, flags); put_pid(p); return fd; -- cgit v1.2.3-71-gd317 From 019d0454c61707879cf9853c894e0a191f6b9774 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Aug 2021 14:52:29 -0700 Subject: bpf, core: Fix kernel-doc notation Fix kernel-doc warnings in kernel/bpf/core.c (found by scripts/kernel-doc and W=1 builds). That is, correct a function name in a comment and add return descriptions for 2 functions. Fixes these kernel-doc warnings: kernel/bpf/core.c:1372: warning: expecting prototype for __bpf_prog_run(). Prototype was for ___bpf_prog_run() instead kernel/bpf/core.c:1372: warning: No description found for return value of '___bpf_prog_run' kernel/bpf/core.c:1883: warning: No description found for return value of 'bpf_prog_select_runtime' Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210809215229.7556-1-rdunlap@infradead.org --- kernel/bpf/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b1a5fc04492b..0a28a8095d3e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1362,11 +1362,13 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) } /** - * __bpf_prog_run - run eBPF program on a given context + * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions * * Decode and execute eBPF instructions. + * + * Return: whatever value is in %BPF_R0 at program exit */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { @@ -1878,6 +1880,9 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via BPF_PROG_RUN() macro. + * + * Return: the &fp argument along with &err set to 0 for success or + * a negative errno code on failure */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { -- cgit v1.2.3-71-gd317 From 428e211641ed808b55cdc7d880a0ee349eff354b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:08 +0200 Subject: genirq/affinity: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210803141621.780504-26-bigeasy@linutronix.de --- kernel/irq/affinity.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4d89ad4fae3b..f7ff8919dc9b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -355,7 +355,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, goto fail_npresmsk; /* Stabilize the cpumasks */ - get_online_cpus(); + cpus_read_lock(); build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ @@ -384,7 +384,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, nr_others = ret; fail_build_affinity: - put_online_cpus(); + cpus_read_unlock(); if (ret >= 0) WARN_ON(nr_present + nr_others < numvecs); @@ -505,9 +505,9 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, if (affd->calc_sets) { set_vecs = maxvec - resv; } else { - get_online_cpus(); + cpus_read_lock(); set_vecs = cpumask_weight(cpu_possible_mask); - put_online_cpus(); + cpus_read_unlock(); } return resv + min(set_vecs, maxvec - resv); -- cgit v1.2.3-71-gd317 From 746f5ea9c4283d98353c1cd41864aec475e0edbd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:15 +0200 Subject: sched: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210803141621.780504-33-bigeasy@linutronix.de --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 433b4001e71e..d67c5dd51c16 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9840,7 +9840,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -9884,7 +9884,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } -- cgit v1.2.3-71-gd317 From 698429f9d0e54ce3964151adff886ee5fc59714b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:17 +0200 Subject: clocksource: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210803141621.780504-35-bigeasy@linutronix.de --- kernel/time/clocksource.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b89c76e1c02c..b8a14d2fb5ba 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -306,12 +306,12 @@ void clocksource_verify_percpu(struct clocksource *cs) return; cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); - get_online_cpus(); + cpus_read_lock(); preempt_disable(); clocksource_verify_choose_cpus(); if (cpumask_weight(&cpus_chosen) == 0) { preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } @@ -337,7 +337,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_ahead), testcpu, cs->name); -- cgit v1.2.3-71-gd317 From 844d87871b6e0ac3ceb177535dcdf6e6a9f1fd4b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:16 +0200 Subject: smpboot: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210803141621.780504-34-bigeasy@linutronix.de --- kernel/smpboot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index cf6acab78538..f6bc0bc8a2aa 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -291,7 +291,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); @@ -304,7 +304,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) list_add(&plug_thread->list, &hotplug_threads); out: mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); @@ -317,12 +317,12 @@ EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); */ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) { - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smpboot_threads_lock); list_del(&plug_thread->list); smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); -- cgit v1.2.3-71-gd317 From 61377ec144574313ebfbf31685895a7b9b9b7a9a Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Sat, 31 Jul 2021 01:07:40 -0400 Subject: genirq: Clarify documentation for request_threaded_irq() Clarify wording and document commonly used IRQF_ONESHOT flag. Signed-off-by: Joel Savitz Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210731050740.444454-1-jsavitz@redhat.com --- kernel/irq/manage.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef30b4762947..766468a2fc5a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2072,9 +2072,9 @@ const void *free_nmi(unsigned int irq, void *dev_id) * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. - * Primary handler for threaded interrupts - * If NULL and thread_fn != NULL the default - * primary handler is installed + * Primary handler for threaded interrupts. + * If handler is NULL and thread_fn != NULL + * the default primary handler is installed. * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags @@ -2108,6 +2108,8 @@ const void *free_nmi(unsigned int irq, void *dev_id) * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level + * IRQF_ONESHOT Do not unmask interrupt line until + * thread_fn returns * */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, -- cgit v1.2.3-71-gd317 From 51be9e51a8000ffc6a33083ceca9da9303ed4dc5 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:33 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI MSI interrupt numbers are now mapped in a PCI-MSI domain but the underlying calls handling the passthrough of the interrupt in the guest need a number in the XIVE IRQ domain. Use the IRQ data mapped in the XIVE IRQ domain and not the one in the PCI-MSI domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-16-clg@kaod.org --- arch/powerpc/kvm/book3s_xive.c | 3 ++- kernel/irq/irqdomain.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 45a0434e9d85..6878026ee8ec 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -927,7 +927,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - struct irq_data *host_data = irq_get_irq_data(host_irq); + struct irq_data *host_data = + irq_domain_get_irq_data(irq_get_default_host(), host_irq); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); u16 idx; u8 prio; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 51c483ce2447..0eee4816edac 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -491,6 +491,7 @@ struct irq_domain *irq_get_default_host(void) { return irq_default_domain; } +EXPORT_SYMBOL_GPL(irq_get_default_host); static bool irq_domain_is_nomap(struct irq_domain *domain) { -- cgit v1.2.3-71-gd317 From 5a6c76b5de59ed508d7cb133327a7c54e77fed97 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 13 May 2021 16:27:29 -0500 Subject: genirq/generic_chip: Use struct_size() in kzalloc() Make use of the struct_size() helper instead of an open-coded version, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210513212729.GA214145@embeddedor --- kernel/irq/generic-chip.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f8f23af6ab0d..cc7cdd26e23e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -240,9 +240,8 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler) { struct irq_chip_generic *gc; - unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); - gc = kzalloc(sz, GFP_KERNEL); + gc = kzalloc(struct_size(gc, chip_types, num_ct), GFP_KERNEL); if (gc) { irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, handler); @@ -288,8 +287,11 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; - int numchips, sz, i; unsigned long flags; + int numchips, i; + size_t dgc_sz; + size_t gc_sz; + size_t sz; void *tmp; if (d->gc) @@ -300,8 +302,9 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, return -EINVAL; /* Allocate a pointer, generic chip and chiptypes for each chip */ - sz = sizeof(*dgc) + numchips * sizeof(gc); - sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type)); + gc_sz = struct_size(gc, chip_types, num_ct); + dgc_sz = struct_size(dgc, gc, numchips); + sz = dgc_sz + numchips * gc_sz; tmp = dgc = kzalloc(sz, GFP_KERNEL); if (!dgc) @@ -314,7 +317,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, d->gc = dgc; /* Calc pointer to the first generic chip */ - tmp += sizeof(*dgc) + numchips * sizeof(gc); + tmp += dgc_sz; for (i = 0; i < numchips; i++) { /* Store the pointer to the generic chip */ dgc->gc[i] = gc = tmp; @@ -331,7 +334,7 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, list_add_tail(&gc->list, &gc_list); raw_spin_unlock_irqrestore(&gc_lock, flags); /* Calc pointer to the next generic chip */ - tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + tmp += gc_sz; } return 0; } -- cgit v1.2.3-71-gd317 From b9cc7d8a4656a6e815852c27ab50365009cb69c1 Mon Sep 17 00:00:00 2001 From: Ben Dai Date: Sun, 25 Apr 2021 23:09:03 +0800 Subject: genirq/timings: Prevent potential array overflow in __irq_timings_store() When the interrupt interval is greater than 2 ^ PREDICTION_BUFFER_SIZE * PREDICTION_FACTOR us and less than 1s, the calculated index will be greater than the length of irqs->ema_time[]. Check the calculated index before using it to prevent array overflow. Fixes: 23aa3b9a6b7d ("genirq/timings: Encapsulate storing function") Signed-off-by: Ben Dai Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210425150903.25456-1-ben.dai9703@gmail.com --- kernel/irq/timings.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index d309d6fbf5bd..4d2a702d7aa9 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -453,6 +453,11 @@ static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, */ index = irq_timings_interval_index(interval); + if (index > PREDICTION_BUFFER_SIZE - 1) { + irqs->count = 0; + return; + } + /* * Store the index as an element of the pattern in another * circular array. -- cgit v1.2.3-71-gd317 From dbbc93576e03fbe24b365fab0e901eb442237a8a Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Tue, 18 May 2021 11:31:17 +0800 Subject: genirq/msi: Ensure deactivation on teardown msi_domain_alloc_irqs() invokes irq_domain_activate_irq(), but msi_domain_free_irqs() does not enforce deactivation before tearing down the interrupts. This happens when PCI/MSI interrupts are set up and never used before being torn down again, e.g. in error handling pathes. The only place which cleans that up is the error handling path in msi_domain_alloc_irqs(). Move the cleanup from msi_domain_alloc_irqs() into msi_domain_free_irqs() to cure that. Fixes: f3b0946d629c ("genirq/msi: Make sure PCI MSIs are activated early") Signed-off-by: Bixuan Cui Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210518033117.78104-1-cuibixuan@huawei.com --- kernel/irq/msi.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index c41965e348b5..85df3ca03efe 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -476,11 +476,6 @@ skip_activate: return 0; cleanup: - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - if (irqd_is_activated(irq_data)) - irq_domain_deactivate_irq(irq_data); - } msi_domain_free_irqs(domain, dev); return ret; } @@ -505,7 +500,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { + struct irq_data *irq_data; struct msi_desc *desc; + int i; + + for_each_msi_vector(desc, i, dev) { + irq_data = irq_domain_get_irq_data(domain, i); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); + } for_each_msi_entry(desc, dev) { /* -- cgit v1.2.3-71-gd317 From 1dae37c7e41d9a75a615ba7b0480acc2e04094d4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Jul 2021 13:01:47 +0100 Subject: posix-timers: Remove redundant initialization of variable ret The variable ret is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210721120147.109570-1-colin.king@canonical.com --- kernel/time/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..3913222e7bcf 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -336,7 +336,7 @@ void posixtimer_rearm(struct kernel_siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { enum pid_type type; - int ret = -1; + int ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). -- cgit v1.2.3-71-gd317 From a5dec9f82ab2ae486119f0b0820ea16db3e522c3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:08 +0200 Subject: posix-cpu-timers: Assert task sighand is locked while starting cputime counter Starting the process wide cputime counter needs to be done in the same sighand locking sequence than actually arming the related timer otherwise this races against concurrent timers setting/expiring in the same threadgroup. Detecting that the cputime counter is started without holding the sighand lock is a first step toward debugging such situations. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-2-frederic@kernel.org --- include/linux/sched/signal.h | 6 ++++++ kernel/signal.c | 15 +++++++++++++++ kernel/time/posix-cpu-timers.c | 2 ++ 3 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b9126fe06c3f..0310a5add9ab 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -714,6 +714,12 @@ static inline void unlock_task_sighand(struct task_struct *task, spin_unlock_irqrestore(&task->sighand->siglock, *flags); } +#ifdef CONFIG_LOCKDEP +extern void lockdep_assert_task_sighand_held(struct task_struct *task); +#else +static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } +#endif + static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..52b6abec0ff8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1413,6 +1413,21 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, return sighand; } +#ifdef CONFIG_LOCKDEP +void lockdep_assert_task_sighand_held(struct task_struct *task) +{ + struct sighand_struct *sighand; + + rcu_read_lock(); + sighand = rcu_dereference(task->sighand); + if (sighand) + lockdep_assert_held(&sighand->siglock); + else + WARN_ON_ONCE(1); + rcu_read_unlock(); +} +#endif + /* * send signal info to all the members of a group */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 517be7fd175e..4693d3c71e7e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -291,6 +291,8 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct posix_cputimers *pct = &tsk->signal->posix_cputimers; + lockdep_assert_task_sighand_held(tsk); + /* Check if cputimer isn't running. This is accessed without locking. */ if (!READ_ONCE(pct->timers_active)) { struct task_cputime sum; -- cgit v1.2.3-71-gd317 From 175cc3ab28e3509ddee8de4f164b563d99daa570 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:09 +0200 Subject: posix-cpu-timers: Force next_expiration recalc after timer deletion A timer deletion only dequeues the timer but it doesn't shutdown the related costly process wide cputimer counter and the tick dependency. The following code snippet keeps this overhead around for one week after the timer deletion: void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; val.it_value.tv_sec = 604800; timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); timer_settime(id, 0, &val, NULL); timer_delete(id); } Make sure the next target's tick recalculates the nearest expiration and clears the process wide counter and tick dependency if necessary. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-3-frederic@kernel.org --- include/linux/posix-timers.h | 4 +++- kernel/time/posix-cpu-timers.c | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 896c16d2c5fb..4cf1fbe8d1bc 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -82,12 +82,14 @@ static inline bool cpu_timer_enqueue(struct timerqueue_head *head, return timerqueue_add(head, &ctmr->node); } -static inline void cpu_timer_dequeue(struct cpu_timer *ctmr) +static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) { if (ctmr->head) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; + return true; } + return false; } static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4693d3c71e7e..61c78b62fe6a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -407,6 +407,37 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return 0; } +/* + * Dequeue the timer and reset the base if it was its earliest expiration. + * It makes sure the next tick recalculates the base next expiration so we + * don't keep the costly process wide cputime counter around for a random + * amount of time, along with the tick dependency. + * + * If another timer gets queued between this and the next tick, its + * expiration will update the base next event if necessary on the next + * tick. + */ +static void disarm_timer(struct k_itimer *timer, struct task_struct *p) +{ + struct cpu_timer *ctmr = &timer->it.cpu; + struct posix_cputimer_base *base; + int clkidx; + + if (!cpu_timer_dequeue(ctmr)) + return; + + clkidx = CPUCLOCK_WHICH(timer->it_clock); + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + base = p->posix_cputimers.bases + clkidx; + else + base = p->signal->posix_cputimers.bases + clkidx; + + if (cpu_timer_getexpires(ctmr) == base->nextevt) + base->nextevt = 0; +} + + /* * Clean up a CPU-clock timer that is about to be destroyed. * This is called from timer deletion with the timer already locked. @@ -441,7 +472,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer) if (timer->it.cpu.firing) ret = TIMER_RETRY; else - cpu_timer_dequeue(ctmr); + disarm_timer(timer, p); unlock_task_sighand(p, &flags); } -- cgit v1.2.3-71-gd317 From 406dd42bd1ba0c01babf9cde169bb319e52f6147 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:10 +0200 Subject: posix-cpu-timers: Force next expiration recalc after itimer reset When an itimer deactivates a previously armed expiration, it simply doesn't do anything. As a result the process wide cputime counter keeps running and the tick dependency stays set until it reaches the old ghost expiration value. This can be reproduced with the following snippet: void trigger_process_counter(void) { struct itimerval n = {}; n.it_value.tv_sec = 100; setitimer(ITIMER_VIRTUAL, &n, NULL); n.it_value.tv_sec = 0; setitimer(ITIMER_VIRTUAL, &n, NULL); } Fix this with resetting the relevant base expiration. This is similar to disarming a timer. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-4-frederic@kernel.org --- kernel/time/posix-cpu-timers.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 61c78b62fe6a..5c71322b45c7 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1379,8 +1379,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - if (!*newval) - return; *newval += now; } -- cgit v1.2.3-71-gd317 From d9c1b2a1089f606404284b9f5b045a584d73382d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:11 +0200 Subject: posix-cpu-timers: Remove confusing return value override The end of the function cannot be reached with an error in variable ret. Unconfuse reviewers about that. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-5-frederic@kernel.org --- kernel/time/posix-cpu-timers.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5c71322b45c7..4fbbbc89ac4a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -744,8 +744,6 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, */ cpu_timer_fire(timer); } - - ret = 0; out: rcu_read_unlock(); if (old) -- cgit v1.2.3-71-gd317 From 5c8f23e6b73c13d9f7b52614783dcb9169883296 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:12 +0200 Subject: posix-cpu-timers: Consolidate timer base accessor Remove the ad-hoc timer base accessors and provide a consolidated one. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-6-frederic@kernel.org --- kernel/time/posix-cpu-timers.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4fbbbc89ac4a..0d918117a3e0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -407,6 +407,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return 0; } +static struct posix_cputimer_base *timer_base(struct k_itimer *timer, + struct task_struct *tsk) +{ + int clkidx = CPUCLOCK_WHICH(timer->it_clock); + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + return tsk->posix_cputimers.bases + clkidx; + else + return tsk->signal->posix_cputimers.bases + clkidx; +} + /* * Dequeue the timer and reset the base if it was its earliest expiration. * It makes sure the next tick recalculates the base next expiration so we @@ -421,18 +432,11 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p) { struct cpu_timer *ctmr = &timer->it.cpu; struct posix_cputimer_base *base; - int clkidx; if (!cpu_timer_dequeue(ctmr)) return; - clkidx = CPUCLOCK_WHICH(timer->it_clock); - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - base = p->posix_cputimers.bases + clkidx; - else - base = p->signal->posix_cputimers.bases + clkidx; - + base = timer_base(timer, p); if (cpu_timer_getexpires(ctmr) == base->nextevt) base->nextevt = 0; } @@ -531,15 +535,9 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) */ static void arm_timer(struct k_itimer *timer, struct task_struct *p) { - int clkidx = CPUCLOCK_WHICH(timer->it_clock); + struct posix_cputimer_base *base = timer_base(timer, p); struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - struct posix_cputimer_base *base; - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - base = p->posix_cputimers.bases + clkidx; - else - base = p->signal->posix_cputimers.bases + clkidx; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; -- cgit v1.2.3-71-gd317 From ee375328f579f94251eb66d5dc91aba056019a31 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:13 +0200 Subject: posix-cpu-timers: Recalc next expiration when timer_settime() ends up not queueing There are several scenarios that can result in posix_cpu_timer_set() not queueing the timer but still leaving the threadgroup cputime counter running or keeping the tick dependency around for a random amount of time. 1) If timer_settime() is called with a 0 expiration on a timer that is already disabled, the process wide cputime counter will be started and won't ever get a chance to be stopped by stop_process_timer() since no timer is actually armed to be processed. The following snippet is enough to trigger the issue. void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); timer_settime(id, TIMER_ABSTIME, &val, NULL); timer_delete(id); } 2) If timer_settime() is called with a 0 expiration on a timer that is already armed, the timer is dequeued but not really disarmed. So the process wide cputime counter and the tick dependency may still remain a while around. The following code snippet keeps this overhead around for one week after the timer deletion: void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; val.it_value.tv_sec = 604800; timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); timer_settime(id, 0, &val, NULL); timer_delete(id); } 3) If the timer was initially deactivated, this call to timer_settime() with an early expiration may have started the process wide cputime counter even though the timer hasn't been queued and armed because it has fired early and inline within posix_cpu_timer_set() itself. As a result the process wide cputime counter may never stop until a new timer is ever armed in the future. The following code snippet can reproduce this: void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; signal(SIGALRM, SIG_IGN); timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); val.it_value.tv_nsec = 1; timer_settime(id, TIMER_ABSTIME, &val, NULL); } 4) If the timer was initially armed with a former expiration value before this call to timer_settime() and the current call sets an early deadline that has already expired, the timer fires inline within posix_cpu_timer_set(). In this case it must have been dequeued before firing inline with its new expiration value, yet it hasn't been disarmed in this case. So the process wide cputime counter and the tick dependency may still be around for a while even after the timer fired. The following code snippet can reproduce this: void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; signal(SIGALRM, SIG_IGN); timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); val.it_value.tv_sec = 100; timer_settime(id, TIMER_ABSTIME, &val, NULL); val.it_value.tv_sec = 0; val.it_value.tv_nsec = 1; timer_settime(id, TIMER_ABSTIME, &val, NULL); } Fix all these issues with triggering the related base next expiration recalculation on the next tick. This also implies to re-evaluate the need to keep around the process wide cputime counter and the tick dependency, in a similar fashion to disarm_timer(). Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-7-frederic@kernel.org --- include/linux/posix-timers.h | 7 ++++++- kernel/time/posix-cpu-timers.c | 41 +++++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 4cf1fbe8d1bc..00fef0064355 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -82,9 +82,14 @@ static inline bool cpu_timer_enqueue(struct timerqueue_head *head, return timerqueue_add(head, &ctmr->node); } +static inline bool cpu_timer_queued(struct cpu_timer *ctmr) +{ + return !!ctmr->head; +} + static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) { - if (ctmr->head) { + if (cpu_timer_queued(ctmr)) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; return true; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0d918117a3e0..ee736861b18f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -418,6 +418,20 @@ static struct posix_cputimer_base *timer_base(struct k_itimer *timer, return tsk->signal->posix_cputimers.bases + clkidx; } +/* + * Force recalculating the base earliest expiration on the next tick. + * This will also re-evaluate the need to keep around the process wide + * cputime counter and tick dependency and eventually shut these down + * if necessary. + */ +static void trigger_base_recalc_expires(struct k_itimer *timer, + struct task_struct *tsk) +{ + struct posix_cputimer_base *base = timer_base(timer, tsk); + + base->nextevt = 0; +} + /* * Dequeue the timer and reset the base if it was its earliest expiration. * It makes sure the next tick recalculates the base next expiration so we @@ -438,7 +452,7 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p) base = timer_base(timer, p); if (cpu_timer_getexpires(ctmr) == base->nextevt) - base->nextevt = 0; + trigger_base_recalc_expires(timer, p); } @@ -734,13 +748,28 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires != 0 && !(val < new_expires)) { + if (val >= new_expires) { + if (new_expires != 0) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. + * Make sure we don't keep around the process wide cputime + * counter or the tick dependency if they are not necessary. */ - cpu_timer_fire(timer); + sighand = lock_task_sighand(p, &flags); + if (!sighand) + goto out; + + if (!cpu_timer_queued(ctmr)) + trigger_base_recalc_expires(timer, p); + + unlock_task_sighand(p, &flags); } out: rcu_read_unlock(); -- cgit v1.2.3-71-gd317 From 627ef5ae2df8eeccb20d5af0e4cfa4df9e61ed28 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:46 +0200 Subject: hrtimer: Avoid double reprogramming in __hrtimer_start_range_ns() If __hrtimer_start_range_ns() is invoked with an already armed hrtimer then the timer has to be canceled first and then added back. If the timer is the first expiring timer then on removal the clockevent device is reprogrammed to the next expiring timer to avoid that the pending expiry fires needlessly. If the new expiry time ends up to be the first expiry again then the clock event device has to reprogrammed again. Avoid this by checking whether the timer is the first to expire and in that case, keep the timer on the current CPU and delay the reprogramming up to the point where the timer has been enqueued again. Reported-by: Lorenzo Colitti Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135157.873137732@linutronix.de --- kernel/time/hrtimer.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4a66725b1d4a..ba2e0d0a0e5a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1030,12 +1030,13 @@ static void __remove_hrtimer(struct hrtimer *timer, * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { - int reprogram; + bool reprogram; /* * Remove the timer and force reprogramming when high @@ -1048,8 +1049,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + /* + * If the timer is not restarted then reprogramming is + * required if the timer is local. If it is local and about + * to be restarted, avoid programming it twice (on removal + * and a moment later when it's requeued). + */ if (!restart) state = HRTIMER_STATE_INACTIVE; + else + reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; @@ -1103,9 +1112,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; + bool force_local, first; - /* Remove an active timer from the queue: */ - remove_hrtimer(timer, base, true); + /* + * If the timer is on the local cpu base and is the first expiring + * timer then this might end up reprogramming the hardware twice + * (on removal and on enqueue). To avoid that by prevent the + * reprogram on removal, keep the timer local to the current CPU + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. + */ + force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local &= base->cpu_base->next_timer == timer; + + /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. + * + * If it's on the current CPU and the first expiring timer, then + * skip reprogramming, keep the timer local and enforce + * reprogramming later if it was the first expiring timer. This + * avoids programming the underlying clock event twice (once at + * removal and once after enqueue). + */ + remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); @@ -1115,9 +1146,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + if (!force_local) { + new_base = switch_hrtimer_base(timer, base, + mode & HRTIMER_MODE_PINNED); + } else { + new_base = base; + } + + first = enqueue_hrtimer(timer, new_base, mode); + if (!force_local) + return first; - return enqueue_hrtimer(timer, new_base, mode); + /* + * Timer was forced to stay on the current CPU to avoid + * reprogramming on removal and enqueue. Force reprogram the + * hardware by evaluating the new first expiring timer. + */ + hrtimer_force_reprogram(new_base->cpu_base, 1); + return 0; } /** @@ -1183,7 +1229,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false); + ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); -- cgit v1.2.3-71-gd317 From b14bca97c9f5c3e3f133445b01c723e95490d843 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Jul 2021 15:39:47 +0200 Subject: hrtimer: Consolidate reprogramming code This code is mostly duplicated. The redudant store in the force reprogram case does no harm and the in hrtimer interrupt condition cannot be true for the force reprogram invocations. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.054424875@linutronix.de --- kernel/time/hrtimer.c | 72 +++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ba2e0d0a0e5a..5f7c46598d9f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -652,21 +652,24 @@ static inline int hrtimer_hres_active(void) return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +__hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal, + struct hrtimer *next_timer, ktime_t expires_next) { - ktime_t expires_next; + /* + * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. + */ + if (cpu_base->in_hrtirq) + return; - expires_next = hrtimer_update_next_event(cpu_base); + if (expires_next > cpu_base->expires_next) + return; if (skip_equal && expires_next == cpu_base->expires_next) return; + cpu_base->next_timer = next_timer; cpu_base->expires_next = expires_next; /* @@ -689,7 +692,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(expires_next, 1); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + ktime_t expires_next; + + expires_next = hrtimer_update_next_event(cpu_base); + + __hrtimer_reprogram(cpu_base, skip_equal, cpu_base->next_timer, + expires_next); } /* High resolution timer related functions */ @@ -835,40 +854,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (base->cpu_base != cpu_base) return; - /* - * If the hrtimer interrupt is running, then it will - * reevaluate the clock bases and reprogram the clock event - * device. The callbacks are always executed in hard interrupt - * context so we don't need an extra check for a running - * callback. - */ - if (cpu_base->in_hrtirq) - return; - - if (expires >= cpu_base->expires_next) - return; - - /* Update the pointer to the next expiring timer */ - cpu_base->next_timer = timer; - cpu_base->expires_next = expires; - - /* - * If hres is not active, hardware does not have to be - * programmed yet. - * - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. - */ - if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) - return; - - /* - * Program the timer hardware. We enforce the expiry for - * events which are already in the past. - */ - tick_program_event(expires, 1); + __hrtimer_reprogram(cpu_base, true, timer, expires); } /* -- cgit v1.2.3-71-gd317 From 8c3b5e6ec0fee18bc2ce38d1dfe913413205f908 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:48 +0200 Subject: hrtimer: Ensure timerfd notification for HIGHRES=n If high resolution timers are disabled the timerfd notification about a clock was set event is not happening for all cases which use clock_was_set_delayed() because that's a NOP for HIGHRES=n, which is wrong. Make clock_was_set_delayed() unconditially available to fix that. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.196661266@linutronix.de --- include/linux/hrtimer.h | 5 ----- kernel/time/hrtimer.c | 32 ++++++++++++++++---------------- kernel/time/tick-internal.h | 3 +++ 3 files changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bb5e7b0a4274..77295af72426 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -318,16 +318,12 @@ struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern void clock_was_set_delayed(void); - extern unsigned int hrtimer_resolution; #else #define hrtimer_resolution (unsigned int)LOW_RES_NSEC -static inline void clock_was_set_delayed(void) { } - #endif static inline ktime_t @@ -351,7 +347,6 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) timer->base->get_time()); } -extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); #else diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5f7c46598d9f..7ebf642b53f9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -777,22 +777,6 @@ static void hrtimer_switch_to_hres(void) retrigger_next_event(NULL); } -static void clock_was_set_work(struct work_struct *work) -{ - clock_was_set(); -} - -static DECLARE_WORK(hrtimer_work, clock_was_set_work); - -/* - * Called from timekeeping and resume code to reprogram the hrtimer - * interrupt device on all cpus. - */ -void clock_was_set_delayed(void) -{ - schedule_work(&hrtimer_work); -} - #else static inline int hrtimer_is_hres_enabled(void) { return 0; } @@ -877,6 +861,22 @@ void clock_was_set(void) timerfd_clock_was_set(); } +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + +/* + * Called from timekeeping and resume code to reprogram the hrtimer + * interrupt device on all cpus and to notify timerfd. + */ +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + /* * During resume we might have to reprogram the high resolution timer * interrupt on all online CPUs. However, all other CPUs will be diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6a742a29e545..cd610faa2523 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -165,3 +165,6 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); void timer_clear_idle(void); + +void clock_was_set(void); +void clock_was_set_delayed(void); -- cgit v1.2.3-71-gd317 From e71a4153b7c256ec103e79875398553808aeffd2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:49 +0200 Subject: hrtimer: Force clock_was_set() handling for the HIGHRES=n, NOHZ=y case When CONFIG_HIGH_RES_TIMERS is disabled, but NOHZ is enabled then clock_was_set() is not doing anything. With HIGHRES=n the kernel relies on the periodic tick to update the clock offsets, but when NOHZ is enabled and active then CPUs which are in a deep idle sleep do not have a periodic tick which means the expiry of timers affected by clock_was_set() can be arbitrarily delayed up to the point where the CPUs are brought out of idle again. Make the clock_was_set() logic unconditionaly available so that idle CPUs are kicked out of idle to handle the update. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.288697903@linutronix.de --- kernel/time/hrtimer.c | 87 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 7ebf642b53f9..214fd65a9597 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -739,23 +739,7 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - - if (!__hrtimer_hres_active(base)) - return; - - raw_spin_lock(&base->lock); - hrtimer_update_base(base); - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} +static void retrigger_next_event(void *arg); /* * Switch to high resolution mode @@ -781,9 +765,50 @@ static void hrtimer_switch_to_hres(void) static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } -static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * Retrigger next event is called after clock was set with interrupts + * disabled through an SMP function call or directly from low level + * resume code. + * + * This is only invoked when: + * - CONFIG_HIGH_RES_TIMERS is enabled. + * - CONFIG_NOHZ_COMMON is enabled + * + * For the other cases this function is empty and because the call sites + * are optimized out it vanishes as well, i.e. no need for lots of + * #ifdeffery. + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + + /* + * When high resolution mode or nohz is active, then the offsets of + * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the + * next tick will take care of that. + * + * If high resolution mode is active then the next expiring timer + * must be reevaluated and the clock event device reprogrammed if + * necessary. + * + * In the NOHZ case the update of the offset and the reevaluation + * of the next expiring timer is enough. The return from the SMP + * function call will take care of the reprogramming in case the + * CPU was in a NOHZ idle sleep. + */ + if (!__hrtimer_hres_active(base) && !tick_nohz_active) + return; + + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + if (__hrtimer_hres_active(base)) + hrtimer_force_reprogram(base, 0); + else + hrtimer_update_next_event(base); + raw_spin_unlock(&base->lock); +} /* * When a timer is enqueued and expires earlier than the already enqueued @@ -842,22 +867,28 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) } /* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. + * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and + * CLOCK_BOOTTIME (for late sleep time injection). * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. + * This requires to update the offsets for these clocks + * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this + * also requires to eventually reprogram the per CPU clock event devices + * when the change moves an affected timer ahead of the first expiring + * timer on that CPU. Obviously remote per CPU clock event devices cannot + * be reprogrammed. The other reason why an IPI has to be sent is when the + * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets + * in the tick, which obviously might be stopped, so this has to bring out + * the remote CPU which might sleep in idle to get this sorted. */ void clock_was_set(void) { -#ifdef CONFIG_HIGH_RES_TIMERS + if (!hrtimer_hres_active() && !tick_nohz_active) + goto out_timerfd; + /* Retrigger the CPU local events everywhere */ on_each_cpu(retrigger_next_event, NULL, 1); -#endif + +out_timerfd: timerfd_clock_was_set(); } -- cgit v1.2.3-71-gd317 From a761a67f591a8c7476c30bb20ed0f09fdfb1a704 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:51 +0200 Subject: timekeeping: Distangle resume and clock-was-set events Resuming timekeeping is a clock-was-set event and uses the clock-was-set notification mechanism. This is in the way of making the clock-was-set update for hrtimers selective so unnecessary IPIs are avoided when a CPU base does not have timers queued which are affected by the clock setting. Distangle it by invoking hrtimer_resume() on each unfreezing CPU and invoke the new timerfd_resume() function from timekeeping_resume() which is the only place where this is needed. Rename hrtimer_resume() to hrtimer_resume_local() to reflect the change. With this the clock_was_set*() functions are not longer required to IPI all CPUs unconditionally and can get some smarts to avoid them. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.488853478@linutronix.de --- include/linux/hrtimer.h | 1 - kernel/time/hrtimer.c | 15 ++++++--------- kernel/time/tick-common.c | 7 +++++++ kernel/time/tick-internal.h | 2 ++ kernel/time/timekeeping.c | 4 +++- 5 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 253c6e25f331..0ee140176f10 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -354,7 +354,6 @@ extern void timerfd_resume(void); static inline void timerfd_clock_was_set(void) { } static inline void timerfd_resume(void) { } #endif -extern void hrtimers_resume(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 214fd65a9597..68e56f0ecb09 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -900,8 +900,8 @@ static void clock_was_set_work(struct work_struct *work) static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* - * Called from timekeeping and resume code to reprogram the hrtimer - * interrupt device on all cpus and to notify timerfd. + * Called from timekeeping code to reprogram the hrtimer interrupt device + * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { @@ -909,18 +909,15 @@ void clock_was_set_delayed(void) } /* - * During resume we might have to reprogram the high resolution timer - * interrupt on all online CPUs. However, all other CPUs will be - * stopped with IRQs interrupts disabled so the clock_was_set() call - * must be deferred. + * Called during resume either directly from via timekeeping_resume() + * or in the case of s2idle from tick_unfreeze() to ensure that the + * hrtimers are up to date. */ -void hrtimers_resume(void) +void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); - /* And schedule a retrigger for all others */ - clock_was_set_delayed(); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d663249652ef..46789356f856 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -470,6 +470,13 @@ void tick_resume_local(void) else tick_resume_oneshot(); } + + /* + * Ensure that hrtimers are up to date and the clockevents device + * is reprogrammed correctly when high resolution timers are + * enabled. + */ + hrtimers_resume_local(); } /** diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cd610faa2523..22de98cc6dd8 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -168,3 +168,5 @@ void timer_clear_idle(void); void clock_was_set(void); void clock_was_set_delayed(void); + +void hrtimers_resume_local(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8a364aa9881a..c8a9b9e54c9d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1810,8 +1810,10 @@ void timekeeping_resume(void) touch_softlockup_watchdog(); + /* Resume the clockevent device(s) and hrtimers */ tick_resume(); - hrtimers_resume(); + /* Notify timerfd as resume is equivalent to clock_was_set() */ + timerfd_resume(); } int timekeeping_suspend(void) -- cgit v1.2.3-71-gd317 From 1b267793f4fd9a089ea8558f3b6698186b9a3214 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:52 +0200 Subject: time/timekeeping: Avoid invoking clock_was_set() twice do_adjtimex() might end up scheduling a delayed clock_was_set() via timekeeping_advance() and then invoke clock_was_set() directly which is pointless. Make timekeeping_advance() return whether an invocation of clock_was_set() is required and handle it at the call sites which allows do_adjtimex() to issue a single direct call if required. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.580966888@linutronix.de --- kernel/time/timekeeping.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c8a9b9e54c9d..19ed58e97b57 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2127,7 +2127,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static void timekeeping_advance(enum timekeeping_adv_mode mode) +static bool timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; @@ -2198,9 +2198,8 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode) write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - if (clock_set) - /* Have to call _delayed version, since in irq context*/ - clock_was_set_delayed(); + + return !!clock_set; } /** @@ -2209,7 +2208,8 @@ out: */ void update_wall_time(void) { - timekeeping_advance(TK_ADV_TICK); + if (timekeeping_advance(TK_ADV_TICK)) + clock_was_set_delayed(); } /** @@ -2389,8 +2389,9 @@ int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; struct audit_ntp_data ad; - unsigned long flags; + bool clock_set = false; struct timespec64 ts; + unsigned long flags; s32 orig_tai, tai; int ret; @@ -2425,6 +2426,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + clock_set = true; } tk_update_leap_state(tk); @@ -2435,9 +2437,9 @@ int do_adjtimex(struct __kernel_timex *txc) /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - timekeeping_advance(TK_ADV_FREQ); + clock_set |= timekeeping_advance(TK_ADV_FREQ); - if (tai != orig_tai) + if (clock_set) clock_was_set(); ntp_notify_cmos_timer(); -- cgit v1.2.3-71-gd317 From 17a1b8826b451c80e7999a7c68e06b70579b2b8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:53 +0200 Subject: hrtimer: Add bases argument to clock_was_set() clock_was_set() unconditionaly invokes retrigger_next_event() on all online CPUs. This was necessary because that mechanism was also used for resume from suspend to idle which is not longer the case. The bases arguments allows the callers of clock_was_set() to hand in a mask which tells clock_was_set() which of the hrtimer clock bases are affected by the clock setting. This mask will be used in the next step to check whether a CPU base has timers queued on a clock base affected by the event and avoid the SMP function call if there are none. Add a @bases argument, provide defines for the active bases masking and fixup all callsites. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.691083465@linutronix.de --- kernel/time/hrtimer.c | 4 ++-- kernel/time/tick-internal.h | 9 ++++++++- kernel/time/timekeeping.c | 14 +++++++------- 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 68e56f0ecb09..c8af165c04eb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -880,7 +880,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. */ -void clock_was_set(void) +void clock_was_set(unsigned int bases) { if (!hrtimer_hres_active() && !tick_nohz_active) goto out_timerfd; @@ -894,7 +894,7 @@ out_timerfd: static void clock_was_set_work(struct work_struct *work) { - clock_was_set(); + clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 22de98cc6dd8..3548f0829e6d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -166,7 +166,14 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); void timer_clear_idle(void); -void clock_was_set(void); +#define CLOCK_SET_WALL \ + (BIT(HRTIMER_BASE_REALTIME) | BIT(HRTIMER_BASE_REALTIME_SOFT) | \ + BIT(HRTIMER_BASE_TAI) | BIT(HRTIMER_BASE_TAI_SOFT)) + +#define CLOCK_SET_BOOT \ + (BIT(HRTIMER_BASE_BOOTTIME) | BIT(HRTIMER_BASE_BOOTTIME_SOFT)) + +void clock_was_set(unsigned int bases); void clock_was_set_delayed(void); void hrtimers_resume_local(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 19ed58e97b57..b348749a9fc6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1323,8 +1323,8 @@ out: write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); if (!ret) audit_tk_injoffset(ts_delta); @@ -1371,8 +1371,8 @@ error: /* even if we error out, we forwarded the time, so call update */ write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); return ret; } @@ -1746,8 +1746,8 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif @@ -2440,7 +2440,7 @@ int do_adjtimex(struct __kernel_timex *txc) clock_set |= timekeeping_advance(TK_ADV_FREQ); if (clock_set) - clock_was_set(); + clock_was_set(CLOCK_REALTIME); ntp_notify_cmos_timer(); -- cgit v1.2.3-71-gd317 From 81d741d3460ca422843ce0ec8351083f259c6166 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 13 Jul 2021 15:39:54 +0200 Subject: hrtimer: Avoid unnecessary SMP function calls in clock_was_set() Setting of clocks triggers an unconditional SMP function call on all online CPUs to reprogram the clock event device. However, only some clocks have their offsets updated and therefore potentially require a reprogram. That's CLOCK_REALTIME and CLOCK_TAI and in the case of resume (delayed sleep time injection) also CLOCK_BOOTTIME. Instead of sending an IPI unconditionally, check each per CPU hrtimer base whether it has active timers in the affected clock bases which are indicated by the caller in the @bases argument of clock_was_set(). If that's not the case, skip the IPI and update the offsets remotely which ensures that any subsequently armed timers on the affected clocks are evaluated with the correct offsets. [ tglx: Adopted to the new bases argument, removed the softirq_active check, added comment, fixed up stale comment ] Signed-off-by: Marcelo Tosatti Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.787536542@linutronix.de --- kernel/time/hrtimer.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c8af165c04eb..5d44c90d41ea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -882,11 +882,42 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) */ void clock_was_set(unsigned int bases) { + cpumask_var_t mask; + int cpu; + if (!hrtimer_hres_active() && !tick_nohz_active) goto out_timerfd; - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + on_each_cpu(retrigger_next_event, NULL, 1); + goto out_timerfd; + } + + /* Avoid interrupting CPUs if possible */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + /* + * Only send the IPI when there are timers queued in one of + * the affected clock bases. Otherwise update the base + * remote to ensure that the next enqueue of a timer on + * such a clock base will see the correct offsets. + */ + if (cpu_base->active_bases & bases) + cpumask_set_cpu(cpu, mask); + else + hrtimer_update_base(cpu_base); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + } + + preempt_disable(); + smp_call_function_many(mask, retrigger_next_event, NULL, 1); + preempt_enable(); + cpus_read_unlock(); + free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); -- cgit v1.2.3-71-gd317 From 1e7f7fbcd40c69d23e3fe641ead9f3dc128fa8aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:55 +0200 Subject: hrtimer: Avoid more SMP function calls in clock_was_set() By unconditionally updating the offsets there are more indicators whether the SMP function calls on clock_was_set() can be avoided: - When the offset update already happened on the remote CPU then the remote update attempt will yield the same seqeuence number and no IPI is required. - When the remote CPU is currently handling hrtimer_interrupt(). In that case the remote CPU will reevaluate the timer bases before reprogramming anyway, so nothing to do. - After updating it can be checked whether the first expiring timer in the affected clock bases moves before the first expiring (softirq) timer of the CPU. If that's not the case then sending the IPI is not required. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.887322464@linutronix.de --- kernel/time/hrtimer.c | 74 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5d44c90d41ea..88aefc35f7d2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -866,6 +866,68 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) __hrtimer_reprogram(cpu_base, true, timer, expires); } +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, + unsigned int active) +{ + struct hrtimer_clock_base *base; + unsigned int seq; + ktime_t expires; + + /* + * Update the base offsets unconditionally so the following + * checks whether the SMP function call is required works. + * + * The update is safe even when the remote CPU is in the hrtimer + * interrupt or the hrtimer soft interrupt and expiring affected + * bases. Either it will see the update before handling a base or + * it will see it when it finishes the processing and reevaluates + * the next expiring timer. + */ + seq = cpu_base->clock_was_set_seq; + hrtimer_update_base(cpu_base); + + /* + * If the sequence did not change over the update then the + * remote CPU already handled it. + */ + if (seq == cpu_base->clock_was_set_seq) + return false; + + /* + * If the remote CPU is currently handling an hrtimer interrupt, it + * will reevaluate the first expiring timer of all clock bases + * before reprogramming. Nothing to do here. + */ + if (cpu_base->in_hrtirq) + return false; + + /* + * Walk the affected clock bases and check whether the first expiring + * timer in a clock base is moving ahead of the first expiring timer of + * @cpu_base. If so, the IPI must be invoked because per CPU clock + * event devices cannot be remotely reprogrammed. + */ + active &= cpu_base->active_bases; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + expires = ktime_sub(next->expires, base->offset); + if (expires < cpu_base->expires_next) + return true; + + /* Extra check for softirq clock bases */ + if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) + continue; + if (cpu_base->softirq_activated) + continue; + if (expires < cpu_base->softirq_expires_next) + return true; + } + return false; +} + /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). @@ -900,16 +962,10 @@ void clock_was_set(unsigned int bases) unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); - /* - * Only send the IPI when there are timers queued in one of - * the affected clock bases. Otherwise update the base - * remote to ensure that the next enqueue of a timer on - * such a clock base will see the correct offsets. - */ - if (cpu_base->active_bases & bases) + + if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); - else - hrtimer_update_base(cpu_base); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } -- cgit v1.2.3-71-gd317 From ed3cd1da674034c4800abfc48c26f2742d5df17e Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Sat, 5 Jun 2021 14:30:03 +0800 Subject: cpu/hotplug: Fix kernel doc warnings for __cpuhp_setup_state_cpuslocked() Fixes the following W=1 kernel build warning(s): kernel/cpu.c:1949: warning: Function parameter or member 'name' not described in '__cpuhp_setup_state_cpuslocked' Signed-off-by: Baokun Li Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210605063003.681049-1-libaokun1@huawei.com --- kernel/cpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 804b847912dc..8930a4e022ce 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1976,6 +1976,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup + * @name: Name of the step * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state * @startup: startup callback function -- cgit v1.2.3-71-gd317 From 11bc021d1fbaaa1a6e7b92d6631faa875dd40b7d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Aug 2021 15:38:25 -0700 Subject: cpu/hotplug: Eliminate all kernel-doc warnings kernel/cpu.c:57: warning: cannot understand function prototype: 'struct cpuhp_cpu_state ' kernel/cpu.c:115: warning: cannot understand function prototype: 'struct cpuhp_step ' kernel/cpu.c:146: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * cpuhp_invoke_callback _ Invoke the callbacks for a given state kernel/cpu.c:75: warning: Function parameter or member 'fail' not described in 'cpuhp_cpu_state' kernel/cpu.c:75: warning: Function parameter or member 'cpu' not described in 'cpuhp_cpu_state' kernel/cpu.c:75: warning: Function parameter or member 'node' not described in 'cpuhp_cpu_state' kernel/cpu.c:75: warning: Function parameter or member 'last' not described in 'cpuhp_cpu_state' kernel/cpu.c:130: warning: Function parameter or member 'list' not described in 'cpuhp_step' kernel/cpu.c:130: warning: Function parameter or member 'multi_instance' not described in 'cpuhp_step' kernel/cpu.c:158: warning: No description found for return value of 'cpuhp_invoke_callback' kernel/cpu.c:1188: warning: No description found for return value of 'cpu_device_down' kernel/cpu.c:1400: warning: No description found for return value of 'cpu_device_up' kernel/cpu.c:1425: warning: No description found for return value of 'bringup_hibernate_cpu' Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210809223825.24512-1-rdunlap@infradead.org --- kernel/cpu.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8930a4e022ce..62fb67e5c59c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -41,14 +41,19 @@ #include "smpboot.h" /** - * cpuhp_cpu_state - Per cpu hotplug state storage + * struct cpuhp_cpu_state - Per cpu hotplug state storage * @state: The current cpu state * @target: The target state + * @fail: Current CPU hotplug callback state * @thread: Pointer to the hotplug thread * @should_run: Thread should execute * @rollback: Perform a rollback * @single: Single callback invocation * @bringup: Single callback bringup or teardown selector + * @cpu: CPU number + * @node: Remote CPU node; for multi-instance, do a + * single entry callback for install/remove + * @last: For multi-instance rollback, remember how far we got * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation * @done_up: Signal completion to the issuer of the task for cpu-up @@ -106,11 +111,12 @@ static inline void cpuhp_lock_release(bool bringup) { } #endif /** - * cpuhp_step - Hotplug state machine step + * struct cpuhp_step - Hotplug state machine step * @name: Name of the step * @startup: Startup function of the step * @teardown: Teardown function of the step * @cant_stop: Bringup/teardown can't be stopped at this step + * @multi_instance: State has multiple instances which get added afterwards */ struct cpuhp_step { const char *name; @@ -124,7 +130,9 @@ struct cpuhp_step { int (*multi)(unsigned int cpu, struct hlist_node *node); } teardown; + /* private: */ struct hlist_head list; + /* public: */ bool cant_stop; bool multi_instance; }; @@ -143,7 +151,7 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step) } /** - * cpuhp_invoke_callback _ Invoke the callbacks for a given state + * cpuhp_invoke_callback - Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked @@ -151,6 +159,8 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step) * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. + * + * Return: %0 on success or a negative errno code */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node, @@ -1183,6 +1193,8 @@ static int cpu_down(unsigned int cpu, enum cpuhp_state target) * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use remove_cpu() instead. + * + * Return: %0 on success or a negative errno code */ int cpu_device_down(struct device *dev) { @@ -1395,6 +1407,8 @@ out: * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use add_cpu() instead. + * + * Return: %0 on success or a negative errno code */ int cpu_device_up(struct device *dev) { @@ -1420,6 +1434,8 @@ EXPORT_SYMBOL_GPL(add_cpu); * On some architectures like arm64, we can hibernate on any CPU, but on * wake up the CPU we hibernated on might be offline as a side effect of * using maxcpus= for example. + * + * Return: %0 on success or a negative errno code */ int bringup_hibernate_cpu(unsigned int sleep_cpu) { @@ -1985,9 +2001,9 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * added afterwards. * * The caller needs to hold cpus read locked while calling this function. - * Returns: + * Return: * On success: - * Positive state number if @state is CPUHP_AP_ONLINE_DYN + * Positive state number if @state is CPUHP_AP_ONLINE_DYN; * 0 for all other states * On failure: proper (negative) error code */ -- cgit v1.2.3-71-gd317 From 1782dc87b2edcf3a6c350ead748a8941b5835975 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 27 May 2021 22:11:05 +0800 Subject: cpu/hotplug: Use DEVICE_ATTR_*() macro Use DEVICE_ATTR_*() helper instead of plain DEVICE_ATTR, which makes the code a bit shorter and easier to read. Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210527141105.2312-1-yuehaibing@huawei.com --- kernel/cpu.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 62fb67e5c59c..7ef28e1d5ecd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2249,18 +2249,17 @@ int cpuhp_smt_enable(void) #endif #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) -static ssize_t show_cpuhp_state(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t state_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->state); } -static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); +static DEVICE_ATTR_RO(state); -static ssize_t write_cpuhp_target(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t target_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; @@ -2298,19 +2297,17 @@ out: return ret ? ret : count; } -static ssize_t show_cpuhp_target(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t target_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->target); } -static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); - +static DEVICE_ATTR_RW(target); -static ssize_t write_cpuhp_fail(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t fail_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; @@ -2359,15 +2356,15 @@ static ssize_t write_cpuhp_fail(struct device *dev, return count; } -static ssize_t show_cpuhp_fail(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t fail_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->fail); } -static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); +static DEVICE_ATTR_RW(fail); static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, @@ -2382,7 +2379,7 @@ static const struct attribute_group cpuhp_cpu_attr_group = { NULL }; -static ssize_t show_cpuhp_states(struct device *dev, +static ssize_t states_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t cur, res = 0; @@ -2401,7 +2398,7 @@ static ssize_t show_cpuhp_states(struct device *dev, mutex_unlock(&cpuhp_state_mutex); return res; } -static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); +static DEVICE_ATTR_RO(states); static struct attribute *cpuhp_cpu_root_attrs[] = { &dev_attr_states.attr, @@ -2474,28 +2471,27 @@ static const char *smt_states[] = { [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", }; -static ssize_t -show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t control_show(struct device *dev, + struct device_attribute *attr, char *buf) { const char *state = smt_states[cpu_smt_control]; return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); } -static ssize_t -store_smt_control(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t control_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { return __store_smt_control(dev, attr, buf, count); } -static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); +static DEVICE_ATTR_RW(control); -static ssize_t -show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t active_show(struct device *dev, + struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active()); } -static DEVICE_ATTR(active, 0444, show_smt_active, NULL); +static DEVICE_ATTR_RO(active); static struct attribute *cpuhp_smt_attrs[] = { &dev_attr_control.attr, -- cgit v1.2.3-71-gd317 From ebca71a8c96f0af2ba482489ecc64d88979cd825 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Thu, 8 Apr 2021 22:53:16 -0700 Subject: cpu/hotplug: Add debug printks for hotplug callback failures CPU hotplug callbacks can fail and cause a rollback to the previous state. These failures are silent and therefore hard to debug. Add pr_debug() to the up and down paths which provide information about the error code, the CPU and the failed state. The debug printks can be enabled via kernel command line or sysfs. [ tglx: Adopt to current mainline, massage printk and changelog ] Signed-off-by: Dongli Zhang Signed-off-by: Thomas Gleixner Reviewed-by: Qais Yousef Link: https://lore.kernel.org/r/20210409055316.1709-1-dongli.zhang@oracle.com --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 7ef28e1d5ecd..192e43a87407 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -692,6 +692,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret = cpuhp_invoke_callback_range(true, cpu, st, target); if (ret) { + pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n", + ret, cpu, cpuhp_get_step(st->state)->name, + st->state); + cpuhp_reset_state(st, prev_state); if (can_rollback_cpu(st)) WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, @@ -1091,6 +1095,9 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret = cpuhp_invoke_callback_range(false, cpu, st, target); if (ret) { + pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n", + ret, cpu, cpuhp_get_step(st->state)->name, + st->state); cpuhp_reset_state(st, prev_state); -- cgit v1.2.3-71-gd317 From d3dd95a8853f1d588e38e9d9d7c8cc2da412cc36 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:14 +0200 Subject: rcu: Replace deprecated CPU-hotplug functions The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7403328cdf99..f53a3c1bf1d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4011,7 +4011,7 @@ void rcu_barrier(void) */ init_completion(&rcu_state.barrier_completion); atomic_set(&rcu_state.barrier_cpu_count, 2); - get_online_cpus(); + cpus_read_lock(); /* * Force each CPU with callbacks to register a new callback. @@ -4042,7 +4042,7 @@ void rcu_barrier(void) rcu_state.barrier_sequence); } } - put_online_cpus(); + cpus_read_unlock(); /* * Now that we have an rcu_barrier_callback() callback on each -- cgit v1.2.3-71-gd317 From ed4fa2442e87bf9143d608473df117589e4bfc70 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:18 +0200 Subject: torture: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Davidlohr Bueso Cc: "Paul E. McKenney" Cc: Josh Triplett Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/torture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 0a315c387bed..bb8f411c974b 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -521,11 +521,11 @@ static void torture_shuffle_tasks(void) struct shuffle_task *stp; cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); + cpus_read_lock(); /* No point in shuffling if there is only one online CPU (ex: UP) */ if (num_online_cpus() == 1) { - put_online_cpus(); + cpus_read_unlock(); return; } @@ -541,7 +541,7 @@ static void torture_shuffle_tasks(void) set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); mutex_unlock(&shuffle_task_mutex); - put_online_cpus(); + cpus_read_unlock(); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the -- cgit v1.2.3-71-gd317 From 91cc470e797828d779cd4c1efbe8519bcb358bae Mon Sep 17 00:00:00 2001 From: Tanner Love Date: Wed, 2 Jun 2021 14:03:38 -0400 Subject: genirq: Change force_irqthreads to a static key With CONFIG_IRQ_FORCED_THREADING=y, testing the boolean force_irqthreads could incur a cache line miss in invoke_softirq() and other places. Replace the test with a static key to avoid the potential cache miss. [ tglx: Dropped the IDE part, removed the export and updated blk-mq ] Suggested-by: Eric Dumazet Signed-off-by: Tanner Love Signed-off-by: Thomas Gleixner Reviewed-by: Eric Dumazet Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20210602180338.3324213-1-tannerlove.kernel@gmail.com --- block/blk-mq.c | 2 +- include/linux/interrupt.h | 8 +++++--- kernel/irq/manage.c | 11 +++++------ kernel/softirq.c | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c4ac51e54eb..572d8ab34014 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -606,7 +606,7 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) * This is probably worse than completing the request on a different * cache domain. */ - if (force_irqthreads) + if (force_irqthreads()) return false; /* same CPU or cache domain? Complete locally */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 2ed65b01c961..1f22a30c0963 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -474,12 +475,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT -# define force_irqthreads (true) +# define force_irqthreads() (true) # else -extern bool force_irqthreads; +DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); +# define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) # endif #else -#define force_irqthreads (0) +#define force_irqthreads() (false) #endif #ifndef local_softirq_pending diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 766468a2fc5a..34a66c4543a2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -25,12 +25,11 @@ #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) -__read_mostly bool force_irqthreads; -EXPORT_SYMBOL_GPL(force_irqthreads); +DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); static int __init setup_forced_irqthreads(char *arg) { - force_irqthreads = true; + static_branch_enable(&force_irqthreads_key); return 0; } early_param("threadirqs", setup_forced_irqthreads); @@ -1260,8 +1259,8 @@ static int irq_thread(void *data) irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); - if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, - &action->thread_flags)) + if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) handler_fn = irq_forced_thread_fn; else handler_fn = irq_thread_fn; @@ -1322,7 +1321,7 @@ EXPORT_SYMBOL_GPL(irq_wake_thread); static int irq_setup_forced_threading(struct irqaction *new) { - if (!force_irqthreads) + if (!force_irqthreads()) return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; diff --git a/kernel/softirq.c b/kernel/softirq.c index f3a012179f47..322b65d45676 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -422,7 +422,7 @@ static inline void invoke_softirq(void) if (ksoftirqd_running(local_softirq_pending())) return; - if (!force_irqthreads || !__this_cpu_read(ksoftirqd)) { + if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if -- cgit v1.2.3-71-gd317 From 92848731c45f4f9c3d9818e6b4ba1b2884002324 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Sat, 5 Jun 2021 14:34:13 +0800 Subject: genirq/matrix: Fix kernel doc warnings for irq_matrix_alloc_managed() Describe the arguments correctly. Fixes the following W=1 kernel build warning(s): kernel/irq/matrix.c:287: warning: Function parameter or member 'msk' not described in 'irq_matrix_alloc_managed' kernel/irq/matrix.c:287: warning: Function parameter or member 'mapped_cpu' not described in 'irq_matrix_alloc_managed' kernel/irq/matrix.c:287: warning: Excess function parameter 'cpu' description in 'irq_matrix_alloc_managed' Signed-off-by: Baokun Li Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210605063413.684085-1-libaokun1@huawei.com --- kernel/irq/matrix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 578596e41cb6..bbfb26489aa1 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -280,7 +280,8 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) /** * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map * @m: Matrix pointer - * @cpu: On which CPU the interrupt should be allocated + * @msk: Which CPUs to search in + * @mapped_cpu: Pointer to store the CPU for which the irq was allocated */ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu) -- cgit v1.2.3-71-gd317 From 290fdc4b7ef14e33d0e30058042b0e9bfd02b89b Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 17:33:32 +0800 Subject: genirq/timings: Fix error return code in irq_timings_test_irqs() Return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: f52da98d900e ("genirq/timings: Add selftest for irqs circular buffer") Reported-by: Hulk Robot Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210811093333.2376-1-thunder.leizhen@huawei.com --- kernel/irq/timings.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index d309d6fbf5bd..59affb3bfdfa 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -794,12 +794,14 @@ static int __init irq_timings_test_irqs(struct timings_intervals *ti) __irq_timings_store(irq, irqs, ti->intervals[i]); if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { + ret = -EBADSLT; pr_err("Failed to store in the circular buffer\n"); goto out; } } if (irqs->count != ti->count) { + ret = -ERANGE; pr_err("Count differs\n"); goto out; } -- cgit v1.2.3-71-gd317 From 3b35e7e6daef5a8b4819e2bd2d15898b9b4d1669 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Aug 2021 16:48:35 -0700 Subject: genirq: Fix kernel-doc warnings in pm.c, msi.c and ipi.c Fix all kernel-doc warnings in these 3 files and do some simple editing (capitalize acronyms, capitalize Linux). kernel/irq/pm.c:235: warning: expecting prototype for irq_pm_syscore_ops(). Prototype was for irq_pm_syscore_resume() instead kernel/irq/msi.c:530: warning: expecting prototype for __msi_domain_free_irqs(). Prototype was for msi_domain_free_irqs() instead kernel/irq/msi.c:31: warning: No description found for return value of 'alloc_msi_entry' kernel/irq/msi.c:103: warning: No description found for return value of 'msi_domain_set_affinity' kernel/irq/msi.c:288: warning: No description found for return value of 'msi_create_irq_domain' kernel/irq/msi.c:499: warning: No description found for return value of 'msi_domain_alloc_irqs' kernel/irq/msi.c:545: warning: No description found for return value of 'msi_get_domain_info' kernel/irq/ipi.c:264: warning: expecting prototype for ipi_send_mask(). Prototype was for __ipi_send_mask() instead kernel/irq/ipi.c:25: warning: No description found for return value of 'irq_reserve_ipi' kernel/irq/ipi.c:116: warning: No description found for return value of 'irq_destroy_ipi' kernel/irq/ipi.c:163: warning: No description found for return value of 'ipi_get_hwirq' kernel/irq/ipi.c:222: warning: No description found for return value of '__ipi_send_single' kernel/irq/ipi.c:308: warning: No description found for return value of 'ipi_send_single' kernel/irq/ipi.c:329: warning: No description found for return value of 'ipi_send_mask' Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210810234835.12547-1-rdunlap@infradead.org --- kernel/irq/ipi.c | 32 ++++++++++++++++---------------- kernel/irq/msi.c | 19 ++++++++++++------- kernel/irq/pm.c | 2 +- 3 files changed, 29 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 52f11c791bf8..08ce7da3b57c 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -14,11 +14,11 @@ /** * irq_reserve_ipi() - Setup an IPI to destination cpumask * @domain: IPI domain - * @dest: cpumask of cpus which can receive the IPI + * @dest: cpumask of CPUs which can receive the IPI * * Allocate a virq that can be used to send IPI to any CPU in dest mask. * - * On success it'll return linux irq number and error code on failure + * Return: Linux IRQ number on success or error code on failure */ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest) @@ -104,13 +104,13 @@ free_descs: /** * irq_destroy_ipi() - unreserve an IPI that was previously allocated - * @irq: linux irq number to be destroyed - * @dest: cpumask of cpus which should have the IPI removed + * @irq: Linux IRQ number to be destroyed + * @dest: cpumask of CPUs which should have the IPI removed * * The IPIs allocated with irq_reserve_ipi() are returned to the system * destroying all virqs associated with them. * - * Return 0 on success or error code on failure. + * Return: %0 on success or error code on failure. */ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) { @@ -150,14 +150,14 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) } /** - * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu - * @irq: linux irq number - * @cpu: the target cpu + * ipi_get_hwirq - Get the hwirq associated with an IPI to a CPU + * @irq: Linux IRQ number + * @cpu: the target CPU * * When dealing with coprocessors IPI, we need to inform the coprocessor of * the hwirq it needs to use to receive and send IPIs. * - * Returns hwirq value on success and INVALID_HWIRQ on failure. + * Return: hwirq value on success or INVALID_HWIRQ on failure. */ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) { @@ -216,7 +216,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, * This function is for architecture or core code to speed up IPI sending. Not * usable from driver code. * - * Returns zero on success and negative error number on failure. + * Return: %0 on success or negative error number on failure. */ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) { @@ -250,7 +250,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) } /** - * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * __ipi_send_mask - send an IPI to target Linux SMP CPU(s) * @desc: pointer to irq_desc of the IRQ * @dest: dest CPU(s), must be a subset of the mask passed to * irq_reserve_ipi() @@ -258,7 +258,7 @@ int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) * This function is for architecture or core code to speed up IPI sending. Not * usable from driver code. * - * Returns zero on success and negative error number on failure. + * Return: %0 on success or negative error number on failure. */ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) { @@ -298,11 +298,11 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) /** * ipi_send_single - Send an IPI to a single CPU - * @virq: linux irq number from irq_reserve_ipi() + * @virq: Linux IRQ number from irq_reserve_ipi() * @cpu: destination CPU, must in the destination mask passed to * irq_reserve_ipi() * - * Returns zero on success and negative error number on failure. + * Return: %0 on success or negative error number on failure. */ int ipi_send_single(unsigned int virq, unsigned int cpu) { @@ -319,11 +319,11 @@ EXPORT_SYMBOL_GPL(ipi_send_single); /** * ipi_send_mask - Send an IPI to target CPU(s) - * @virq: linux irq number from irq_reserve_ipi() + * @virq: Linux IRQ number from irq_reserve_ipi() * @dest: dest CPU(s), must be a subset of the mask passed to * irq_reserve_ipi() * - * Returns zero on success and negative error number on failure. + * Return: %0 on success or negative error number on failure. */ int ipi_send_mask(unsigned int virq, const struct cpumask *dest) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index c41965e348b5..bb18040efbe8 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -18,13 +18,15 @@ #include "internals.h" /** - * alloc_msi_entry - Allocate an initialize msi_entry + * alloc_msi_entry - Allocate an initialized msi_desc * @dev: Pointer to the device for which this is allocated * @nvec: The number of vectors used in this entry * @affinity: Optional pointer to an affinity mask array size of @nvec * - * If @affinity is not NULL then an affinity array[@nvec] is allocated + * If @affinity is not %NULL then an affinity array[@nvec] is allocated * and the affinity masks and flags from @affinity are copied. + * + * Return: pointer to allocated &msi_desc on success or %NULL on failure */ struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, const struct irq_affinity_desc *affinity) @@ -97,6 +99,8 @@ static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg) * * Intended to be used by MSI interrupt controllers which are * implemented with hierarchical domains. + * + * Return: IRQ_SET_MASK_* result code */ int msi_domain_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) @@ -277,10 +281,12 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) } /** - * msi_create_irq_domain - Create a MSI interrupt domain + * msi_create_irq_domain - Create an MSI interrupt domain * @fwnode: Optional fwnode of the interrupt controller * @info: MSI domain info * @parent: Parent irq domain + * + * Return: pointer to the created &struct irq_domain or %NULL on failure */ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, @@ -492,7 +498,7 @@ cleanup: * are allocated * @nvec: The number of interrupts to allocate * - * Returns 0 on success or an error code. + * Return: %0 on success or an error code. */ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) @@ -521,7 +527,7 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } /** - * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev * @domain: The domain to managing the interrupts * @dev: Pointer to device struct of the device for which the interrupts * are free @@ -538,8 +544,7 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from * - * Returns the pointer to the msi_domain_info stored in - * @domain->host_data. + * Return: the pointer to the msi_domain_info stored in @domain->host_data. */ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) { diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index ce0adb22ee96..ca71123a6130 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -227,7 +227,7 @@ unlock: } /** - * irq_pm_syscore_ops - enable interrupt lines early + * irq_pm_syscore_resume - enable interrupt lines early * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ -- cgit v1.2.3-71-gd317 From 49b3bd213a9f3d685784913c255c6a2cb3d1fcce Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Aug 2021 15:50:51 -0700 Subject: smp: Fix all kernel-doc warnings Fix the following warnings: kernel/smp.c:1189: warning: cannot understand function prototype: 'struct smp_call_on_cpu_struct ' kernel/smp.c:788: warning: No description found for return value of 'smp_call_function_single_async' kernel/smp.c:990: warning: Function parameter or member 'wait' not described in 'smp_call_function_many' kernel/smp.c:990: warning: Excess function parameter 'flags' description in 'smp_call_function_many' kernel/smp.c:1198: warning: Function parameter or member 'work' not described in 'smp_call_on_cpu_struct' kernel/smp.c:1198: warning: Function parameter or member 'done' not described in 'smp_call_on_cpu_struct' kernel/smp.c:1198: warning: Function parameter or member 'func' not described in 'smp_call_on_cpu_struct' kernel/smp.c:1198: warning: Function parameter or member 'data' not described in 'smp_call_on_cpu_struct' kernel/smp.c:1198: warning: Function parameter or member 'ret' not described in 'smp_call_on_cpu_struct' kernel/smp.c:1198: warning: Function parameter or member 'cpu' not described in 'smp_call_on_cpu_struct' Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210810225051.3938-1-rdunlap@infradead.org --- kernel/smp.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 52bf159ec400..f43ede0ab183 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -764,7 +764,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, EXPORT_SYMBOL(smp_call_function_single); /** - * smp_call_function_single_async(): Run an asynchronous function on a + * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure @@ -783,6 +783,8 @@ EXPORT_SYMBOL(smp_call_function_single); * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. + * + * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { @@ -974,7 +976,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @flags: Bitmask that controls the operation. If %SCF_WAIT is set, wait + * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait * (atomically) until function has completed on other CPUs. If * %SCF_RUN_LOCAL is set, the function will also be run locally * if the local CPU is set in the @cpumask. @@ -1180,7 +1182,13 @@ void wake_up_all_idle_cpus(void) EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** - * smp_call_on_cpu - Call a function on a specific cpu + * struct smp_call_on_cpu_struct - Call a function on a specific CPU + * @work: &work_struct + * @done: &completion to signal + * @func: function to call + * @data: function's data argument + * @ret: return value from @func + * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. * Optionally make sure the call is done on a specified physical cpu via vcpu -- cgit v1.2.3-71-gd317 From b4cc61960879025665a46085cc9b3fa334b34cc8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Aug 2021 17:03:49 -0700 Subject: cgroup: cgroup-v1: clean up kernel-doc notation Fix kernel-doc warnings found in cgroup-v1.c: kernel/cgroup/cgroup-v1.c:55: warning: No description found for return value of 'cgroup_attach_task_all' kernel/cgroup/cgroup-v1.c:94: warning: expecting prototype for cgroup_trasnsfer_tasks(). Prototype was for cgroup_transfer_tasks() instead cgroup-v1.c:96: warning: No description found for return value of 'cgroup_transfer_tasks' kernel/cgroup/cgroup-v1.c:687: warning: No description found for return value of 'cgroupstats_build' Signed-off-by: Randy Dunlap Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 8d6bf56ed77a..d25838995151 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -50,6 +50,8 @@ bool cgroup1_ssid_disabled(int ssid) * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task * @tsk: the task to be attached + * + * Return: %0 on success or a negative errno code on failure */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { @@ -80,7 +82,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * cgroup_transfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside * @@ -89,6 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all); * is guaranteed to be either visible in the source cgroup after the * parent's migration is complete or put into the target cgroup. No task * can slip out of migration through forking. + * + * Return: %0 on success or a negative errno code on failure */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { @@ -682,6 +686,8 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * * Build and fill cgroupstats so that taskstats can export it to user * space. + * + * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { -- cgit v1.2.3-71-gd317 From e7cc9888dc57927f0977d346dab64a5bdfc3da59 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Aug 2021 23:06:02 -0400 Subject: cgroup/cpuset: Enable event notification when partition state changes A valid cpuset partition can become invalid if all its CPUs are offlined or somehow removed. This can happen through external events without "cpuset.cpus.partition" being touched at all. Users that rely on the property of a partition being present do not currently have a simple way to get such an event notified other than constant periodic polling which is both inefficient and cumbersome. To make life easier for those users, event notification is now enabled for "cpuset.cpus.partition" whenever its state changes. Suggested-by: Tejun Heo Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 65258102e030..fcc11f2d3b5b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -160,6 +160,9 @@ struct cpuset { */ int use_parent_ecpus; int child_ecpus_count; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; }; /* @@ -263,6 +266,16 @@ static inline int is_partition_root(const struct cpuset *cs) return cs->partition_root_state > 0; } +/* + * Send notification event of whenever partition_root_state changes. + */ +static inline void notify_partition_change(struct cpuset *cs, + int old_prs, int new_prs) +{ + if (old_prs != new_prs) + cgroup_file_notify(&cs->partition_file); +} + static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), @@ -1148,7 +1161,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpuset *parent = parent_cs(cpuset); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ - int new_prs; + int old_prs, new_prs; bool part_error = false; /* Partition error? */ percpu_rwsem_assert_held(&cpuset_rwsem); @@ -1184,7 +1197,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * A cpumask update cannot make parent's effective_cpus become empty. */ adding = deleting = false; - new_prs = cpuset->partition_root_state; + old_prs = new_prs = cpuset->partition_root_state; if (cmd == partcmd_enable) { cpumask_copy(tmp->addmask, cpuset->cpus_allowed); adding = true; @@ -1274,7 +1287,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->subparts_cpus); } - if (!adding && !deleting && (new_prs == cpuset->partition_root_state)) + if (!adding && !deleting && (new_prs == old_prs)) return 0; /* @@ -1302,9 +1315,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); - if (cpuset->partition_root_state != new_prs) + if (old_prs != new_prs) cpuset->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + notify_partition_change(cpuset, old_prs, new_prs); return cmd == partcmd_update; } @@ -1326,7 +1341,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; - int new_prs; + int old_prs, new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { @@ -1366,8 +1381,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's subparts_cpus changes. */ - new_prs = cp->partition_root_state; - if ((cp != cs) && new_prs) { + old_prs = new_prs = cp->partition_root_state; + if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_DISABLED: /* @@ -1438,10 +1453,11 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) } } - if (new_prs != cp->partition_root_state) + if (new_prs != old_prs) cp->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); + notify_partition_change(cp, old_prs, new_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -2023,6 +2039,7 @@ out: spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); + notify_partition_change(cs, old_prs, new_prs); } free_cpumasks(NULL, &tmpmask); @@ -2708,6 +2725,7 @@ static struct cftype dfl_files[] = { .write = sched_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct cpuset, partition_file), }, { @@ -3103,11 +3121,17 @@ retry: */ if ((parent->partition_root_state == PRS_ERROR) || cpumask_empty(&new_cpus)) { + int old_prs; + update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); - spin_lock_irq(&callback_lock); - cs->partition_root_state = PRS_ERROR; - spin_unlock_irq(&callback_lock); + old_prs = cs->partition_root_state; + if (old_prs != PRS_ERROR) { + spin_lock_irq(&callback_lock); + cs->partition_root_state = PRS_ERROR; + spin_unlock_irq(&callback_lock); + notify_partition_change(cs, old_prs, PRS_ERROR); + } } cpuset_force_rebuild(); } -- cgit v1.2.3-71-gd317 From 2d3a1e3615c5449a4583010f41a6f824a4ffa03e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 10 Aug 2021 16:05:37 -0700 Subject: bpf: Add rcu_read_lock in bpf_get_current_[ancestor_]cgroup_id() helpers Currently, if bpf_get_current_cgroup_id() or bpf_get_current_ancestor_cgroup_id() helper is called with sleepable programs e.g., sleepable fentry/fmod_ret/fexit/lsm programs, a rcu warning may appear. For example, if I added the following hack to test_progs/test_lsm sleepable fentry program test_sys_setdomainname: --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -168,6 +168,10 @@ int BPF_PROG(test_sys_setdomainname, struct pt_regs *regs) int buf = 0; long ret; + __u64 cg_id = bpf_get_current_cgroup_id(); + if (cg_id == 1000) + copy_test++; + ret = bpf_copy_from_user(&buf, sizeof(buf), ptr); if (len == -2 && ret == 0 && buf == 1234) copy_test++; I will hit the following rcu warning: include/linux/cgroup.h:481 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by test_progs/260: #0: ffffffffa5173360 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x0/0xa0 stack backtrace: CPU: 1 PID: 260 Comm: test_progs Tainted: G O 5.14.0-rc2+ #176 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x56/0x7b bpf_get_current_cgroup_id+0x9c/0xb1 bpf_prog_a29888d1c6706e09_test_sys_setdomainname+0x3e/0x89c bpf_trampoline_6442469132_0+0x2d/0x1000 __x64_sys_setdomainname+0x5/0x110 do_syscall_64+0x3a/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae I can get similar warning using bpf_get_current_ancestor_cgroup_id() helper. syzbot reported a similar issue in [1] for syscall program. Helper bpf_get_current_cgroup_id() or bpf_get_current_ancestor_cgroup_id() has the following callchain: task_dfl_cgroup task_css_set task_css_set_check and we have #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) Since cgroup_mutex/css_set_lock is not held and the task is not existing and rcu read_lock is not held, a warning will be issued. Note that bpf sleepable program is protected by rcu_read_lock_trace(). The above sleepable bpf programs are already protected by migrate_disable(). Adding rcu_read_lock() in these two helpers will silence the above warning. I marked the patch fixing 95b861a7935b ("bpf: Allow bpf_get_current_ancestor_cgroup_id for tracing") which added bpf_get_current_ancestor_cgroup_id() to tracing programs in 5.14. I think backporting 5.14 is probably good enough as sleepable progrems are not widely used. This patch should fix [1] as well since syscall program is a sleepable program protected with migrate_disable(). [1] https://lore.kernel.org/bpf/0000000000006d5cab05c7d9bb87@google.com/ Fixes: 95b861a7935b ("bpf: Allow bpf_get_current_ancestor_cgroup_id for tracing") Reported-by: syzbot+7ee5c2c09c284495371f@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210810230537.2864668-1-yhs@fb.com --- kernel/bpf/helpers.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7a97b2f4747d..55f83ea09dae 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -353,9 +353,15 @@ const struct bpf_func_proto bpf_jiffies64_proto = { #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { - struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *cgrp; + u64 cgrp_id; - return cgroup_id(cgrp); + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + cgrp_id = cgroup_id(cgrp); + rcu_read_unlock(); + + return cgrp_id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { @@ -366,13 +372,17 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) { - struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *cgrp; struct cgroup *ancestor; + u64 cgrp_id; + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); ancestor = cgroup_ancestor(cgrp, ancestor_level); - if (!ancestor) - return 0; - return cgroup_id(ancestor); + cgrp_id = ancestor ? cgroup_id(ancestor) : 0; + rcu_read_unlock(); + + return cgrp_id; } const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { -- cgit v1.2.3-71-gd317 From b4d8a58f8dcfcc890f296696cadb76e77be44b5f Mon Sep 17 00:00:00 2001 From: Hsuan-Chi Kuo Date: Thu, 4 Mar 2021 17:37:08 -0600 Subject: seccomp: Fix setting loaded filter count during TSYNC The desired behavior is to set the caller's filter count to thread's. This value is reported via /proc, so this fixes the inaccurate count exposed to userspace; it is not used for reference counting, etc. Signed-off-by: Hsuan-Chi Kuo Link: https://lore.kernel.org/r/20210304233708.420597-1-hsuanchikuo@gmail.com Co-developed-by: Wiktor Garbacz Signed-off-by: Wiktor Garbacz Link: https://lore.kernel.org/lkml/20210810125158.329849-1-wiktorg@google.com Signed-off-by: Kees Cook Cc: stable@vger.kernel.org Fixes: c818c03b661c ("seccomp: Report number of loaded filters in /proc/$pid/status") --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 057e17f3215d..6469eca8078c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -602,7 +602,7 @@ static inline void seccomp_sync_threads(unsigned long flags) smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, - atomic_read(&thread->seccomp.filter_count)); + atomic_read(&caller->seccomp.filter_count)); /* * Don't let an unprivileged task work around -- cgit v1.2.3-71-gd317 From 14c4c8e41511aa8fba7fb239b20b6539b5bce201 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Wed, 11 Aug 2021 08:59:14 -0700 Subject: cfi: Use rcu_read_{un}lock_sched_notrace If rcu_read_lock_sched tracing is enabled, the tracing subsystem can perform a jump which needs to be checked by CFI. For example, stm_ftrace source is enabled as a module and hooks into enabled ftrace events. This can cause an recursive loop where find_shadow_check_fn -> rcu_read_lock_sched -> (call to stm_ftrace generates cfi slowpath) -> find_shadow_check_fn -> rcu_read_lock_sched -> ... To avoid the recursion, either the ftrace codes needs to be marked with __no_cfi or CFI should not trace. Use the "_notrace" in CFI to avoid tracing so that CFI can guard ftrace. Signed-off-by: Elliot Berman Reviewed-by: Sami Tolvanen Cc: stable@vger.kernel.org Fixes: cf68fffb66d6 ("add support for Clang CFI") Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210811155914.19550-1-quic_eberman@quicinc.com --- kernel/cfi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cfi.c b/kernel/cfi.c index e17a56639766..9594cfd1cf2c 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -248,9 +248,9 @@ static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) { cfi_check_fn fn; - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr); - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); return fn; } @@ -269,11 +269,11 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr) cfi_check_fn fn = NULL; struct module *mod; - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); mod = __module_address(ptr); if (mod) fn = mod->cfi_check; - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); return fn; } -- cgit v1.2.3-71-gd317 From 80771c8228029daff4b3402e00883cde06e07d46 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:10 +0200 Subject: padata: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Steffen Klassert Cc: Daniel Jordan Cc: linux-crypto@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Acked-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 378c36080781..3258ab89c2a3 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -733,7 +733,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, struct cpumask *serial_mask, *parallel_mask; int err = -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&pinst->lock); switch (cpumask_type) { @@ -753,7 +753,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, out: mutex_unlock(&pinst->lock); - put_online_cpus(); + cpus_read_unlock(); return err; } @@ -992,7 +992,7 @@ struct padata_instance *padata_alloc(const char *name) if (!pinst->parallel_wq) goto err_free_inst; - get_online_cpus(); + cpus_read_lock(); pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 1, name); @@ -1026,7 +1026,7 @@ struct padata_instance *padata_alloc(const char *name) &pinst->cpu_dead_node); #endif - put_online_cpus(); + cpus_read_unlock(); return pinst; @@ -1036,7 +1036,7 @@ err_free_masks: err_free_serial_wq: destroy_workqueue(pinst->serial_wq); err_put_cpus: - put_online_cpus(); + cpus_read_unlock(); destroy_workqueue(pinst->parallel_wq); err_free_inst: kfree(pinst); @@ -1074,9 +1074,9 @@ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst) ps->pinst = pinst; - get_online_cpus(); + cpus_read_lock(); pd = padata_alloc_pd(ps); - put_online_cpus(); + cpus_read_unlock(); if (!pd) goto out_free_ps; -- cgit v1.2.3-71-gd317 From 2863643fb8b92291a7e97ba46e342f1163595fa8 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Wed, 28 Jul 2021 00:26:29 -0700 Subject: set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds in copy_process(): non root users but with capability CAP_SYS_RESOURCE or CAP_SYS_ADMIN will clean PF_NPROC_EXCEEDED flag even rlimit(RLIMIT_NPROC) exceeds. Add the same capability check logic here. Align the permission checks in copy_process() and set_user(). In copy_process() CAP_SYS_RESOURCE or CAP_SYS_ADMIN capable users will be able to circumvent and clear the PF_NPROC_EXCEEDED flag whereas they aren't able to the same in set_user(). There's no obvious logic to this and trying to unearth the reason in the thread didn't go anywhere. The gist seems to be that this code wants to make sure that a program can't successfully exec if it has gone through a set*id() transition while exceeding its RLIMIT_NPROC. A capable but non-INIT_USER caller getting PF_NPROC_EXCEEDED set during a set*id() transition wouldn't be able to exec right away if they still exceed their RLIMIT_NPROC at the time of exec. So their exec would fail in fs/exec.c: if ((current->flags & PF_NPROC_EXCEEDED) && is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } However, if the caller were to fork() right after the set*id() transition but before the exec while still exceeding their RLIMIT_NPROC then they would get PF_NPROC_EXCEEDED cleared (while the child would inherit it): retval = -EAGAIN; if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; which means a subsequent exec by the capable caller would now succeed even though they could still exceed their RLIMIT_NPROC limit. This seems inconsistent. Allow a CAP_SYS_ADMIN or CAP_SYS_RESOURCE capable user to avoid PF_NPROC_EXCEEDED as they already can in copy_process(). Cc: peterz@infradead.org, tglx@linutronix.de, linux-kernel@vger.kernel.org, Ran Xiaokai , , , Link: https://lore.kernel.org/r/20210728072629.530435-1-ran.xiaokai@zte.com.cn Cc: Neil Brown Cc: Kees Cook Cc: Thomas Gleixner Cc: James Morris Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ran Xiaokai Signed-off-by: Christian Brauner --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..72c7639e3c98 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -480,7 +480,8 @@ static int set_user(struct cred *new) * failure to the execve() stage. */ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && - new_user != INIT_USER) + new_user != INIT_USER && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; -- cgit v1.2.3-71-gd317 From d03721a6e7e8c04261873b3840daa3ce2c5b0543 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Sun, 18 Jul 2021 11:07:53 +0200 Subject: trace/osnoise: Add a header with PREEMPT_RT additional fields Some extra flags are printed to the trace header when using the PREEMPT_RT config. The extra flags are: need-resched-lazy, preempt-lazy-depth, and migrate-disable. Without printing these fields, the osnoise specific fields are shifted by three positions, for example: # tracer: osnoise # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth MAX # || / SINGLE Interference counters: # |||| RUNTIME NOISE %% OF CPU NOISE +-----------------------------+ # TASK-PID CPU# |||| TIMESTAMP IN US IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD # | | | |||| | | | | | | | | | | <...>-741 [000] ....... 1105.690909: 1000000 234 99.97660 36 21 0 1001 22 3 <...>-742 [001] ....... 1105.691923: 1000000 281 99.97190 197 7 0 1012 35 14 <...>-743 [002] ....... 1105.691958: 1000000 1324 99.86760 118 11 0 1016 155 143 <...>-744 [003] ....... 1105.691998: 1000000 109 99.98910 21 4 0 1004 33 7 <...>-745 [004] ....... 1105.692015: 1000000 2023 99.79770 97 37 0 1023 52 18 Add a new header for osnoise with the missing fields, to be used when the PREEMPT_RT is enabled. Link: https://lkml.kernel.org/r/1f03289d2a51fde5a58c2e7def063dc630820ad1.1626598844.git.bristot@kernel.org Cc: Tom Zanussi Cc: Namhyung Kim Cc: Masami Hiramatsu Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a7e3c24dee13..03ef720b491d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -253,10 +253,40 @@ static struct osnoise_data { */ static bool osnoise_busy; +#ifdef CONFIG_PREEMPT_RT /* * Print the osnoise header info. */ static void print_osnoise_headers(struct seq_file *s) +{ + if (osnoise_data.tainted) + seq_puts(s, "# osnoise is tainted!\n"); + + seq_puts(s, "# _-------=> irqs-off\n"); + seq_puts(s, "# / _------=> need-resched\n"); + seq_puts(s, "# | / _-----=> need-resched-lazy\n"); + seq_puts(s, "# || / _----=> hardirq/softirq\n"); + seq_puts(s, "# ||| / _---=> preempt-depth\n"); + seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n"); + seq_puts(s, "# ||||| / _-=> migrate-disable\n"); + + seq_puts(s, "# |||||| / "); + seq_puts(s, " MAX\n"); + + seq_puts(s, "# ||||| / "); + seq_puts(s, " SINGLE Interference counters:\n"); + + seq_puts(s, "# ||||||| RUNTIME "); + seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); + + seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP IN US "); + seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); + + seq_puts(s, "# | | | ||||||| | | "); + seq_puts(s, " | | | | | | | |\n"); +} +#else /* CONFIG_PREEMPT_RT */ +static void print_osnoise_headers(struct seq_file *s) { if (osnoise_data.tainted) seq_puts(s, "# osnoise is tainted!\n"); @@ -279,6 +309,7 @@ static void print_osnoise_headers(struct seq_file *s) seq_puts(s, "# | | | |||| | | "); seq_puts(s, " | | | | | | | |\n"); } +#endif /* CONFIG_PREEMPT_RT */ /* * osnoise_taint - report an osnoise error. -- cgit v1.2.3-71-gd317 From e1c4ad4a7f58417a6c483432b69c640670b6fe3d Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Sun, 18 Jul 2021 11:07:54 +0200 Subject: trace/timerlat: Add a header with PREEMPT_RT additional fields Some extra flags are printed to the trace header when using the PREEMPT_RT config. The extra flags are: need-resched-lazy, preempt-lazy-depth, and migrate-disable. Without printing these fields, the timerlat specific fields are shifted by three positions, for example: # tracer: timerlat # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # || / # |||| ACTIVATION # TASK-PID CPU# |||| TIMESTAMP ID CONTEXT LATENCY # | | | |||| | | | | -0 [000] d..h... 3279.798871: #1 context irq timer_latency 830 ns <...>-807 [000] ....... 3279.798881: #1 context thread timer_latency 11301 ns Add a new header for timerlat with the missing fields, to be used when the PREEMPT_RT is enabled. Link: https://lkml.kernel.org/r/babb83529a3211bd0805be0b8c21608230202c55.1626598844.git.bristot@kernel.org Cc: Tom Zanussi Cc: Namhyung Kim Cc: Masami Hiramatsu Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 03ef720b491d..518a5c190b2b 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -354,6 +354,24 @@ static void trace_osnoise_sample(struct osnoise_sample *sample) /* * Print the timerlat header info. */ +#ifdef CONFIG_PREEMPT_RT +static void print_timerlat_headers(struct seq_file *s) +{ + seq_puts(s, "# _-------=> irqs-off\n"); + seq_puts(s, "# / _------=> need-resched\n"); + seq_puts(s, "# | / _-----=> need-resched-lazy\n"); + seq_puts(s, "# || / _----=> hardirq/softirq\n"); + seq_puts(s, "# ||| / _---=> preempt-depth\n"); + seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n"); + seq_puts(s, "# ||||| / _-=> migrate-disable\n"); + seq_puts(s, "# |||||| /\n"); + seq_puts(s, "# ||||||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | ||||||| | | "); + seq_puts(s, " | |\n"); +} +#else /* CONFIG_PREEMPT_RT */ static void print_timerlat_headers(struct seq_file *s) { seq_puts(s, "# _-----=> irqs-off\n"); @@ -367,6 +385,7 @@ static void print_timerlat_headers(struct seq_file *s) seq_puts(s, "# | | | |||| | | "); seq_puts(s, " | |\n"); } +#endif /* CONFIG_PREEMPT_RT */ /* * Record an timerlat_sample into the tracer buffer. -- cgit v1.2.3-71-gd317 From 0e05ba498dd0a19fc12868a9506be0f86cf36912 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Sun, 18 Jul 2021 11:07:55 +0200 Subject: trace/osnoise: Print a stop tracing message When using osnoise/timerlat with stop tracing, sometimes it is not clear in which CPU the stop condition was hit, mainly when using some extra events. Print a message informing in which CPU the trace stopped, like in the example below: -0 [006] d.h. 2932.676616: #1672599 context irq timer_latency 34689 ns -0 [006] dNh. 2932.676618: irq_noise: local_timer:236 start 2932.676615639 duration 2391 ns -0 [006] dNh. 2932.676620: irq_noise: virtio0-output.0:47 start 2932.676620180 duration 86 ns -0 [003] d.h. 2932.676621: #1673374 context irq timer_latency 1200 ns -0 [006] d... 2932.676623: thread_noise: swapper/6:0 start 2932.676615964 duration 4339 ns -0 [003] dNh. 2932.676623: irq_noise: local_timer:236 start 2932.676620597 duration 1881 ns -0 [006] d... 2932.676623: sched_switch: prev_comm=swapper/6 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=timerlat/6 next_pid=852 next_prio=4 timerlat/6-852 [006] .... 2932.676623: #1672599 context thread timer_latency 41931 ns -0 [003] d... 2932.676623: thread_noise: swapper/3:0 start 2932.676620854 duration 880 ns -0 [003] d... 2932.676624: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=timerlat/3 next_pid=849 next_prio=4 timerlat/6-852 [006] .... 2932.676624: timerlat_main: stop tracing hit on cpu 6 timerlat/3-849 [003] .... 2932.676624: #1673374 context thread timer_latency 4310 ns Link: https://lkml.kernel.org/r/b30a0d7542adba019185f44ee648e60e14923b11.1626598844.git.bristot@kernel.org Cc: Tom Zanussi Cc: Namhyung Kim Cc: Masami Hiramatsu Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 518a5c190b2b..b61eefe5ccf5 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1075,9 +1075,13 @@ diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample * /* * osnoise_stop_tracing - Stop tracing and the tracer. */ -static void osnoise_stop_tracing(void) +static __always_inline void osnoise_stop_tracing(void) { struct trace_array *tr = osnoise_trace; + + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d\n", smp_processor_id()); + tracer_tracing_off(tr); } -- cgit v1.2.3-71-gd317 From 12f9951d3f311acb1d4ffe8e839bc2c07983546f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 6 Aug 2021 21:50:27 +0200 Subject: tracing: define needed config DYNAMIC_FTRACE_WITH_ARGS Commit 2860cd8a2353 ("livepatch: Use the default ftrace_ops instead of REGS when ARGS is available") intends to enable config LIVEPATCH when ftrace with ARGS is available. However, the chain of configs to enable LIVEPATCH is incomplete, as HAVE_DYNAMIC_FTRACE_WITH_ARGS is available, but the definition of DYNAMIC_FTRACE_WITH_ARGS, combining DYNAMIC_FTRACE and HAVE_DYNAMIC_FTRACE_WITH_ARGS, needed to enable LIVEPATCH, is missing in the commit. Fortunately, ./scripts/checkkconfigsymbols.py detects this and warns: DYNAMIC_FTRACE_WITH_ARGS Referencing files: kernel/livepatch/Kconfig So, define the config DYNAMIC_FTRACE_WITH_ARGS analogously to the already existing similar configs, DYNAMIC_FTRACE_WITH_REGS and DYNAMIC_FTRACE_WITH_DIRECT_CALLS, in ./kernel/trace/Kconfig to connect the chain of configs. Link: https://lore.kernel.org/kernel-janitors/CAKXUXMwT2zS9fgyQHKUUiqo8ynZBdx2UEUu1WnV_q0OCmknqhw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210806195027.16808-1-lukas.bulwahn@gmail.com Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Peter Zijlstra Cc: Miroslav Benes Cc: stable@vger.kernel.org Fixes: 2860cd8a2353 ("livepatch: Use the default ftrace_ops instead of REGS when ARGS is available") Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d567b1717c4c..3ee23f4d437f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -219,6 +219,11 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS depends on DYNAMIC_FTRACE_WITH_REGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +config DYNAMIC_FTRACE_WITH_ARGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER -- cgit v1.2.3-71-gd317 From 5acce0bff2a0420ce87d4591daeb867f47d552c2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 8 Aug 2021 00:30:11 -0400 Subject: tracing / histogram: Fix NULL pointer dereference on strcmp() on NULL event name The following commands: # echo 'read_max u64 size;' > synthetic_events # echo 'hist:keys=common_pid:count=count:onmax($count).trace(read_max,count)' > events/syscalls/sys_enter_read/trigger Causes: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 4 PID: 1763 Comm: bash Not tainted 5.14.0-rc2-test+ #155 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:strcmp+0xc/0x20 Code: 75 f7 31 c0 0f b6 0c 06 88 0c 02 48 83 c0 01 84 c9 75 f1 4c 89 c0 c3 0f 1f 80 00 00 00 00 31 c0 eb 08 48 83 c0 01 84 d2 74 0f <0f> b6 14 07 3a 14 06 74 ef 19 c0 83 c8 01 c3 31 c0 c3 66 90 48 89 RSP: 0018:ffffb5fdc0963ca8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffffffb3a4e040 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9714c0d0b640 RDI: 0000000000000000 RBP: 0000000000000000 R08: 00000022986b7cde R09: ffffffffb3a4dff8 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9714c50603c8 R13: 0000000000000000 R14: ffff97143fdf9e48 R15: ffff9714c01a2210 FS: 00007f1fa6785740(0000) GS:ffff9714da400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000002d863004 CR4: 00000000001706e0 Call Trace: __find_event_file+0x4e/0x80 action_create+0x6b7/0xeb0 ? kstrdup+0x44/0x60 event_hist_trigger_func+0x1a07/0x2130 trigger_process_regex+0xbd/0x110 event_trigger_write+0x71/0xd0 vfs_write+0xe9/0x310 ksys_write+0x68/0xe0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f1fa6879e87 The problem was the "trace(read_max,count)" where the "count" should be "$count" as "onmax()" only handles variables (although it really should be able to figure out that "count" is a field of sys_enter_read). But there's a path that does not find the variable and ends up passing a NULL for the event, which ends up getting passed to "strcmp()". Add a check for NULL to return and error on the command with: # cat error_log hist:syscalls:sys_enter_read: error: Couldn't create or find variable Command: hist:keys=common_pid:count=count:onmax($count).trace(read_max,count) ^ Link: https://lkml.kernel.org/r/20210808003011.4037f8d0@oasis.local.home Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 50450603ec9cb tracing: Add 'onmax' hist trigger action support Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 949ef09dc537..a48aa2a2875b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3430,6 +3430,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, event = data->match_data.event; } + if (!event) + goto free; /* * At this point, we're looking at a field on another * event. Because we can't modify a hist trigger on -- cgit v1.2.3-71-gd317 From 9482fd71dbb8f0d1a61821a83e467dc0a9d7b429 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Aug 2021 22:31:24 +0200 Subject: hrtimer: Use raw_cpu_ptr() in clock_was_set() clock_was_set() can be invoked from preemptible context. Use raw_cpu_ptr() to check whether high resolution mode is active or not. It does not matter whether the task migrates after acquiring the pointer. Fixes: e71a4153b7c2 ("hrtimer: Force clock_was_set() handling for the HIGHRES=n, NOHZ=y case") Reported-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/875ywacsmb.ffs@tglx --- kernel/time/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88aefc35f7d2..33b00e213e07 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -944,10 +944,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, */ void clock_was_set(unsigned int bases) { + struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; - if (!hrtimer_hres_active() && !tick_nohz_active) + if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -958,9 +959,9 @@ void clock_was_set(unsigned int bases) /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { - struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; + cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) -- cgit v1.2.3-71-gd317 From f80e21489590c00f46226d5802d900e6f66e5633 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Aug 2021 22:32:30 +0200 Subject: hrtimer: Unbreak hrtimer_force_reprogram() Since the recent consoliation of reprogramming functions, hrtimer_force_reprogram() is affected by a check whether the new expiry time is past the current expiry time. This breaks the NOHZ logic as that relies on the fact that the tick hrtimer is moved into the future. That means cpu_base->expires_next becomes stale and subsequent reprogramming attempts fail as well until the situation is cleaned up by an hrtimer interrupts. For some yet unknown reason this leads to a complete stall, so for now partially revert the offending commit to a known working state. The root cause for the stall is still investigated and will be fixed in a subsequent commit. Fixes: b14bca97c9f5 ("hrtimer: Consolidate reprogramming code") Reported-by: Mike Galbraith Reported-by: Marek Szyprowski Signed-off-by: Thomas Gleixner Tested-by: Mike Galbraith Link: https://lore.kernel.org/r/8735recskh.ffs@tglx --- kernel/time/hrtimer.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 33b00e213e07..0ea8702eb516 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -652,24 +652,10 @@ static inline int hrtimer_hres_active(void) return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } -static void -__hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal, - struct hrtimer *next_timer, ktime_t expires_next) +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *next_timer, + ktime_t expires_next) { - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) - return; - - if (expires_next > cpu_base->expires_next) - return; - - if (skip_equal && expires_next == cpu_base->expires_next) - return; - - cpu_base->next_timer = next_timer; cpu_base->expires_next = expires_next; /* @@ -707,8 +693,10 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) expires_next = hrtimer_update_next_event(cpu_base); - __hrtimer_reprogram(cpu_base, skip_equal, cpu_base->next_timer, - expires_next); + if (skip_equal && expires_next == cpu_base->expires_next) + return; + + __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ @@ -863,7 +851,19 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (base->cpu_base != cpu_base) return; - __hrtimer_reprogram(cpu_base, true, timer, expires); + if (expires >= cpu_base->expires_next) + return; + + /* + * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. + */ + if (cpu_base->in_hrtirq) + return; + + cpu_base->next_timer = timer; + + __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, -- cgit v1.2.3-71-gd317 From ee9707e8593dfb9a375cf4793c3fd03d4142b463 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 11 Aug 2021 15:57:07 -0400 Subject: cgroup/cpuset: Enable memory migration for cpuset v2 When a user changes cpuset.cpus, each task in a v2 cpuset will be moved to one of the new cpus if it is not there already. For memory, however, they won't be migrated to the new nodes when cpuset.mems changes. This is an inconsistency in behavior. In cpuset v1, there is a memory_migrate control file to enable such behavior by setting the CS_MEMORY_MIGRATE flag. Make it the default for cpuset v2 so that we have a consistent set of behavior for both cpus and memory. There is certainly a cost to make memory migration the default, but it is a one time cost that shouldn't really matter as long as cpuset.mems isn't changed frequenty. Update the cgroup-v2.rst file to document the new behavior and recommend against changing cpuset.mems frequently. Since there won't be any concurrent access to the newly allocated cpuset structure in cpuset_css_alloc(), we can use the cheaper non-atomic __set_bit() instead of the more expensive atomic set_bit(). Signed-off-by: Waiman Long Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 11 +++++++++++ kernel/cgroup/cpuset.c | 6 +++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5c7377b5bd3e..babbe04c8d37 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2056,6 +2056,17 @@ Cpuset Interface Files The value of "cpuset.mems" stays constant until the next update and won't be affected by any memory nodes hotplug events. + Setting a non-empty value to "cpuset.mems" causes memory of + tasks within the cgroup to be migrated to the designated nodes if + they are currently using memory outside of the designated nodes. + + There is a cost for this memory migration. The migration + may not be complete and some memory pages may be left behind. + So it is recommended that "cpuset.mems" should be set properly + before spawning new tasks into the cpuset. Even if there is + a need to change "cpuset.mems" with active tasks, it shouldn't + be done frequently. + cpuset.mems.effective A read-only multiple values file which exists on all cpuset-enabled cgroups. diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index fcc11f2d3b5b..44d234b0df5e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2761,12 +2761,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); } - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); nodes_clear(cs->mems_allowed); nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + /* Set CS_MEMORY_MIGRATE for default hierarchy */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + __set_bit(CS_MEMORY_MIGRATE, &cs->flags); + return &cs->css; } -- cgit v1.2.3-71-gd317 From 04c2721d3530f0723b4c922a8fa9f26b202a20de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Aug 2021 12:40:04 +0200 Subject: genirq: Fix kernel doc indentation Fixes: 61377ec14457 ("genirq: Clarify documentation for request_threaded_irq()") Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 34a66c4543a2..27667e82ecc9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2107,9 +2107,7 @@ const void *free_nmi(unsigned int irq, void *dev_id) * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level - * IRQF_ONESHOT Do not unmask interrupt line until - * thread_fn returns - * + * IRQF_ONESHOT Run thread_fn with interrupt line masked */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, -- cgit v1.2.3-71-gd317 From 45c709f8c71b525b51988e782febe84ce933e7e0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 12 Aug 2021 17:18:10 +0200 Subject: bpf: Clear zext_dst of dead insns "access skb fields ok" verifier test fails on s390 with the "verifier bug. zext_dst is set, but no reg is defined" message. The first insns of the test prog are ... 0: 61 01 00 00 00 00 00 00 ldxw %r0,[%r1+0] 8: 35 00 00 01 00 00 00 00 jge %r0,0,1 10: 61 01 00 08 00 00 00 00 ldxw %r0,[%r1+8] ... and the 3rd one is dead (this does not look intentional to me, but this is a separate topic). sanitize_dead_code() converts dead insns into "ja -1", but keeps zext_dst. When opt_subreg_zext_lo32_rnd_hi32() tries to parse such an insn, it sees this discrepancy and bails. This problem can be seen only with JITs whose bpf_jit_needs_zext() returns true. Fix by clearning dead insns' zext_dst. The commits that contributed to this problem are: 1. 5aa5bd14c5f8 ("bpf: add initial suite for selftests"), which introduced the test with the dead code. 2. 5327ed3d44b7 ("bpf: verifier: mark verified-insn with sub-register zext flag"), which introduced the zext_dst flag. 3. 83a2881903f3 ("bpf: Account for BPF_FETCH in insn_has_def32()"), which introduced the sanity check. 4. 9183671af6db ("bpf: Fix leakage under speculation on mispredicted branches"), which bisect points to. It's best to fix this on stable branches that contain the second one, since that's the point where the inconsistency was introduced. Fixes: 5327ed3d44b7 ("bpf: verifier: mark verified-insn with sub-register zext flag") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210812151811.184086-2-iii@linux.ibm.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f9bda5476ea5..381d3d6f24bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11663,6 +11663,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) if (aux_data[i].seen) continue; memcpy(insn + i, &trap, sizeof(trap)); + aux_data[i].zext_dst = false; } } -- cgit v1.2.3-71-gd317 From f1248dee954c2ddb0ece47a13591e5d55d422d22 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 13 Aug 2021 16:05:29 -0700 Subject: bpf: Allow bpf_get_netns_cookie in BPF_PROG_TYPE_CGROUP_SOCKOPT This is similar to existing BPF_PROG_TYPE_CGROUP_SOCK and BPF_PROG_TYPE_CGROUP_SOCK_ADDR. Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210813230530.333779-2-sdf@google.com --- kernel/bpf/cgroup.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b567ca46555c..9f6070369caa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1846,11 +1846,29 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; +#ifdef CONFIG_NET +BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx) +{ + const struct net *net = ctx ? sock_net(ctx->sk) : &init_net; + + return net->net_cookie; +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { + .func = bpf_get_netns_cookie_sockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; +#endif + static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #ifdef CONFIG_NET + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sockopt_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: -- cgit v1.2.3-71-gd317 From 2a047e0662aee1bd773e0415accd785ad26a9398 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 17:19:36 +0200 Subject: dma-mapping: return an unsigned int from dma_map_sg{,_attrs} These can only return 0 for failure or the number of entries, so turn the return value into an unsigned int. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe --- include/linux/dma-mapping.h | 9 +++++---- kernel/dma/mapping.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index daa1e360f0ee..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -105,8 +105,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs); +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); @@ -166,8 +166,9 @@ static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) +static inline unsigned int dma_map_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) { return 0; } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 967b62692102..7ee5284bff58 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -219,7 +219,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, * dma_unmap_sg_attrs() should be used to unmap the buffer with the * original sg and original nents (not the value returned by this funciton). */ -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { int ret; -- cgit v1.2.3-71-gd317 From 3478cfcfcddff0f3aad82891be2992e51c4f7936 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 14 Aug 2021 10:57:16 +0900 Subject: bpf: Support "%c" in bpf_bprintf_prepare(). /proc/net/unix uses "%c" to print a single-byte character to escape '\0' in the name of the abstract UNIX domain socket. The following selftest uses it, so this patch adds support for "%c". Note that it does not support wide character ("%lc" and "%llc") for simplicity. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210814015718.42704-3-kuniyu@amazon.co.jp --- kernel/bpf/helpers.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 32761be48143..4e8540716187 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -907,6 +907,20 @@ fmt_str: tmp_buf += err; num_spec++; + continue; + } else if (fmt[i] == 'c') { + if (!tmp_buf) + goto nocopy_fmt; + + if (tmp_buf_end == tmp_buf) { + err = -ENOSPC; + goto out; + } + + *tmp_buf = raw_args[num_spec]; + tmp_buf++; + num_spec++; + continue; } -- cgit v1.2.3-71-gd317 From 2a14c9ae15a38148484a128b84bff7e9ffd90d68 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:33 -0700 Subject: params: lift param_set_uint_minmax to common code It is a useful helper hence move it to common code so others can enjoy it. Suggested-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/moduleparam.h | 2 ++ kernel/params.c | 18 ++++++++++++++++++ net/sunrpc/xprtsock.c | 18 ------------------ 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index eed280fae433..962cd41a2cb5 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -431,6 +431,8 @@ extern int param_get_int(char *buffer, const struct kernel_param *kp); extern const struct kernel_param_ops param_ops_uint; extern int param_set_uint(const char *val, const struct kernel_param *kp); extern int param_get_uint(char *buffer, const struct kernel_param *kp); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max); #define param_check_uint(name, p) __param_check(name, p, unsigned int) extern const struct kernel_param_ops param_ops_long; diff --git a/kernel/params.c b/kernel/params.c index 2daa2780a92c..8299bd764e42 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -243,6 +243,24 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtouint(val, 0, &num); + if (ret) + return ret; + if (num < min || num > max) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +EXPORT_SYMBOL_GPL(param_set_uint_minmax); + int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e573dcecdd66..b7dbdcbdeb6c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3149,24 +3149,6 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_bc_tcp_transport); } -static int param_set_uint_minmax(const char *val, - const struct kernel_param *kp, - unsigned int min, unsigned int max) -{ - unsigned int num; - int ret; - - if (!val) - return -EINVAL; - ret = kstrtouint(val, 0, &num); - if (ret) - return ret; - if (num < min || num > max) - return -EINVAL; - *((unsigned int *)kp->arg) = num; - return 0; -} - static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, -- cgit v1.2.3-71-gd317 From 6c34df6f350df9579ce99d887a2b5fa14cc13b32 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Sat, 14 Aug 2021 11:45:38 +0800 Subject: tracing: Apply trace filters on all output channels The event filters are not applied on all of the output, which results in the flood of printk when using tp_printk. Unfolding event_trigger_unlock_commit_regs() into trace_event_buffer_commit(), so the filters can be applied on every output. Link: https://lkml.kernel.org/r/20210814034538.8428-1-kernelfans@gmail.com Cc: stable@vger.kernel.org Fixes: 0daa2302968c1 ("tracing: Add tp_printk cmdline to have tracepoints go to printk()") Signed-off-by: Pingfan Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 18 +++++++++++++++--- kernel/trace/trace.h | 32 -------------------------------- 2 files changed, 15 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 33899a71fdc1..a1adb29ef5c1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2897,14 +2897,26 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write, void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { + enum event_trigger_type tt = ETT_NONE; + struct trace_event_file *file = fbuffer->trace_file; + + if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event, + fbuffer->entry, &tt)) + goto discard; + if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); if (static_branch_unlikely(&trace_event_exports_enabled)) ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); - event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, - fbuffer->event, fbuffer->entry, - fbuffer->trace_ctx, fbuffer->regs); + + trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer, + fbuffer->event, fbuffer->trace_ctx, fbuffer->regs); + +discard: + if (tt) + event_triggers_post_call(file, tt); + } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a180abf76d4e..4a0e693000c6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1389,38 +1389,6 @@ event_trigger_unlock_commit(struct trace_event_file *file, event_triggers_post_call(file, tt); } -/** - * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer associated with the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @trace_ctx: The tracing context flags. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. - * - * Same as event_trigger_unlock_commit() but calls - * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). - */ -static inline void -event_trigger_unlock_commit_regs(struct trace_event_file *file, - struct trace_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned int trace_ctx, - struct pt_regs *regs) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit_regs(file->tr, buffer, event, - trace_ctx, regs); - - if (tt) - event_triggers_post_call(file, tt); -} - #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -- cgit v1.2.3-71-gd317 From 6fe7c745f2acb73e4cc961d7f91125eef5a8861f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:14 +0900 Subject: tracing/boot: Fix a hist trigger dependency for boot time tracing Fixes a build error when CONFIG_HIST_TRIGGERS=n with boot-time tracing. Since the trigger_process_regex() is defined only when CONFIG_HIST_TRIGGERS=y, if it is disabled, the 'actions' event option also must be disabled. Link: https://lkml.kernel.org/r/162856123376.203126.582144262622247352.stgit@devnote2 Fixes: 81a59555ff15 ("tracing/boot: Add per-event settings") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 94ef2d099e32..d713714cba67 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -205,12 +205,15 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, pr_err("Failed to apply filter: %s\n", buf); } - xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) - pr_err("action string is too long: %s\n", p); - else if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply an action: %s\n", buf); - } + if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { + xbc_node_for_each_array_value(enode, "actions", anode, p) { + if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + pr_err("action string is too long: %s\n", p); + else if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply an action: %s\n", buf); + } + } else if (xbc_node_find_value(enode, "actions", NULL)) + pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); if (xbc_node_find_value(enode, "enable", NULL)) { if (trace_event_enable_disable(file, 1, 0) < 0) -- cgit v1.2.3-71-gd317 From de9a48a360b70d5318061cf1237431d1869555e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Jul 2021 17:36:24 -0400 Subject: tracing: Add linear buckets to histogram logic There's been several times I wished the histogram logic had a "grouping" feature for the buckets. Currently, each bucket has a size of one. That is, if you trace the amount of requested allocations, each allocation is its own bucket, even if you are interested in what allocates 100 bytes or less, 100 to 200, 200 to 300, etc. Also, without grouping, it fills up the allocated histogram buckets quickly. If you are tracking latency, and don't care if something is 200 microseconds off, or 201 microseconds off, but want to track them by say 10 microseconds each. This can not currently be done. There is a log2 but that grouping get's too big too fast for a lot of cases. Introduce a "buckets=SIZE" command to each field where it will record in a rounded number. For example: ># echo 'hist:keys=bytes_req.buckets=100:sort=bytes_req' > events/kmem/kmalloc/trigger ># cat events/kmem/kmalloc/hist # event histogram # # trigger info: hist:keys=bytes_req.buckets=100:vals=hitcount:sort=bytes_req.buckets=100:size=2048 [active] # { bytes_req: ~ 0-99 } hitcount: 3149 { bytes_req: ~ 100-199 } hitcount: 1468 { bytes_req: ~ 200-299 } hitcount: 39 { bytes_req: ~ 300-399 } hitcount: 306 { bytes_req: ~ 400-499 } hitcount: 364 { bytes_req: ~ 500-599 } hitcount: 32 { bytes_req: ~ 600-699 } hitcount: 69 { bytes_req: ~ 700-799 } hitcount: 37 { bytes_req: ~ 1200-1299 } hitcount: 16 { bytes_req: ~ 1400-1499 } hitcount: 30 { bytes_req: ~ 2000-2099 } hitcount: 6 { bytes_req: ~ 4000-4099 } hitcount: 2168 { bytes_req: ~ 5000-5099 } hitcount: 6 Totals: Hits: 7690 Entries: 13 Dropped: 0 Link: https://lkml.kernel.org/r/20210707213921.980359719@goodmis.org Acked-by: Namhyung Kim Reviewed-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Tested-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 65 +++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a48aa2a2875b..8e87c4a429fd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -121,6 +121,7 @@ struct hist_field { unsigned int size; unsigned int offset; unsigned int is_signed; + unsigned long buckets; const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; @@ -219,6 +220,27 @@ static u64 hist_field_log2(struct hist_field *hist_field, return (u64) ilog2(roundup_pow_of_two(val)); } +static u64 hist_field_bucket(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand = hist_field->operands[0]; + unsigned long buckets = hist_field->buckets; + + u64 val = operand->fn(operand, elt, buffer, rbe, event); + + if (WARN_ON_ONCE(!buckets)) + return val; + + if (val >= LONG_MAX) + val = div64_ul(val, buckets); + else + val = (u64)((unsigned long)val / buckets); + return val * buckets; +} + static u64 hist_field_plus(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -318,6 +340,7 @@ enum hist_field_flags { HIST_FIELD_FL_VAR_REF = 1 << 14, HIST_FIELD_FL_CPU = 1 << 15, HIST_FIELD_FL_ALIAS = 1 << 16, + HIST_FIELD_FL_BUCKET = 1 << 17, }; struct var_defs { @@ -1109,7 +1132,8 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; else if (field->flags & HIST_FIELD_FL_LOG2 || - field->flags & HIST_FIELD_FL_ALIAS) + field->flags & HIST_FIELD_FL_ALIAS || + field->flags & HIST_FIELD_FL_BUCKET) field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) field_name = "common_cpu"; @@ -1470,6 +1494,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "syscall"; else if (hist_field->flags & HIST_FIELD_FL_LOG2) flags_str = "log2"; + else if (hist_field->flags & HIST_FIELD_FL_BUCKET) + flags_str = "buckets"; else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) flags_str = "usecs"; @@ -1658,9 +1684,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } - if (flags & HIST_FIELD_FL_LOG2) { - unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; - hist_field->fn = hist_field_log2; + if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) { + unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET); + hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 : + hist_field_bucket; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); @@ -1953,7 +1980,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, static struct ftrace_event_field * parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, - char *field_str, unsigned long *flags) + char *field_str, unsigned long *flags, unsigned long *buckets) { struct ftrace_event_field *field = NULL; char *field_name, *modifier, *str; @@ -1980,7 +2007,22 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_LOG2; else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; - else { + else if (strncmp(modifier, "bucket", 6) == 0) { + int ret; + + modifier += 6; + + if (*modifier == 's') + modifier++; + if (*modifier != '=') + goto error; + modifier++; + ret = kstrtoul(modifier, 0, buckets); + if (ret || !(*buckets)) + goto error; + *flags |= HIST_FIELD_FL_BUCKET; + } else { + error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; @@ -2049,6 +2091,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str; struct ftrace_event_field *field = NULL; struct hist_field *hist_field = NULL; + unsigned long buckets = 0; int ret = 0; s = strchr(str, '.'); @@ -2086,7 +2129,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, } else str = s; - field = parse_field(hist_data, file, str, flags); + field = parse_field(hist_data, file, str, flags, &buckets); if (IS_ERR(field)) { ret = PTR_ERR(field); goto out; @@ -2097,6 +2140,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, ret = -ENOMEM; goto out; } + hist_field->buckets = buckets; return hist_field; out: @@ -4698,6 +4742,11 @@ static void hist_trigger_print_key(struct seq_file *m, } else if (key_field->flags & HIST_FIELD_FL_LOG2) { seq_printf(m, "%s: ~ 2^%-2llu", field_name, *(u64 *)(key + key_field->offset)); + } else if (key_field->flags & HIST_FIELD_FL_BUCKET) { + unsigned long buckets = key_field->buckets; + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: ~ %llu-%llu", field_name, + uval, uval + buckets -1); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", field_name, (char *)(key + key_field->offset)); @@ -5137,6 +5186,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, ".%s", flags); } } + if (hist_field->buckets) + seq_printf(m, "=%ld", hist_field->buckets); } static int event_hist_trigger_print(struct seq_file *m, -- cgit v1.2.3-71-gd317 From 370364351926e4fcc7c1a486901bfaae0172b7d9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Jul 2021 17:36:25 -0400 Subject: tracing/histogram: Update the documentation for the buckets modifier Update both the tracefs README file as well as the histogram.rst to include an explanation of what the buckets modifier is and how to use it. Include an example with the wakeup_latency example for both log2 and the buckets modifiers as there was no existing log2 example. Link: https://lkml.kernel.org/r/20210707213922.167218794@goodmis.org Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.rst | 92 ++++++++++++++++++++++++++++++++++++--- kernel/trace/trace.c | 1 + 2 files changed, 87 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index f99be8062bc8..4e650671f245 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -77,6 +77,7 @@ Documentation written by Tom Zanussi .syscall display a syscall id as a system call name .execname display a common_pid as a program name .log2 display log2 value rather than raw number + .buckets=size display grouping of values rather than raw number .usecs display a common_timestamp in microseconds =========== ========================================== @@ -228,7 +229,7 @@ Extended error information that lists the total number of bytes requested for each function in the kernel that made one or more calls to kmalloc:: - # echo 'hist:key=call_site:val=bytes_req' > \ + # echo 'hist:key=call_site:val=bytes_req.buckets=32' > \ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger This tells the tracing system to create a 'hist' trigger using the @@ -1823,20 +1824,99 @@ and variables defined on other events (see Section 2.2.3 below on how that is done using hist trigger 'onmatch' action). Once that is done, the 'wakeup_latency' synthetic event instance is created. -A histogram can now be defined for the new synthetic event:: - - # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ - /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger - The new event is created under the tracing/events/synthetic/ directory and looks and behaves just like any other event:: # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency enable filter format hist id trigger +A histogram can now be defined for the new synthetic event:: + + # echo 'hist:keys=pid,prio,lat.log2:sort=lat' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger + +The above shows the latency "lat" in a power of 2 grouping. + Like any other event, once a histogram is enabled for the event, the output can be displayed by reading the event's 'hist' file. + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist + + # event histogram + # + # trigger info: hist:keys=pid,prio,lat.log2:vals=hitcount:sort=lat.log2:size=2048 [active] + # + + { pid: 2035, prio: 9, lat: ~ 2^2 } hitcount: 43 + { pid: 2034, prio: 9, lat: ~ 2^2 } hitcount: 60 + { pid: 2029, prio: 9, lat: ~ 2^2 } hitcount: 965 + { pid: 2034, prio: 120, lat: ~ 2^2 } hitcount: 9 + { pid: 2033, prio: 120, lat: ~ 2^2 } hitcount: 5 + { pid: 2030, prio: 9, lat: ~ 2^2 } hitcount: 335 + { pid: 2030, prio: 120, lat: ~ 2^2 } hitcount: 10 + { pid: 2032, prio: 120, lat: ~ 2^2 } hitcount: 1 + { pid: 2035, prio: 120, lat: ~ 2^2 } hitcount: 2 + { pid: 2031, prio: 9, lat: ~ 2^2 } hitcount: 176 + { pid: 2028, prio: 120, lat: ~ 2^2 } hitcount: 15 + { pid: 2033, prio: 9, lat: ~ 2^2 } hitcount: 91 + { pid: 2032, prio: 9, lat: ~ 2^2 } hitcount: 125 + { pid: 2029, prio: 120, lat: ~ 2^2 } hitcount: 4 + { pid: 2031, prio: 120, lat: ~ 2^2 } hitcount: 3 + { pid: 2029, prio: 120, lat: ~ 2^3 } hitcount: 2 + { pid: 2035, prio: 9, lat: ~ 2^3 } hitcount: 41 + { pid: 2030, prio: 120, lat: ~ 2^3 } hitcount: 1 + { pid: 2032, prio: 9, lat: ~ 2^3 } hitcount: 32 + { pid: 2031, prio: 9, lat: ~ 2^3 } hitcount: 44 + { pid: 2034, prio: 9, lat: ~ 2^3 } hitcount: 40 + { pid: 2030, prio: 9, lat: ~ 2^3 } hitcount: 29 + { pid: 2033, prio: 9, lat: ~ 2^3 } hitcount: 31 + { pid: 2029, prio: 9, lat: ~ 2^3 } hitcount: 31 + { pid: 2028, prio: 120, lat: ~ 2^3 } hitcount: 18 + { pid: 2031, prio: 120, lat: ~ 2^3 } hitcount: 2 + { pid: 2028, prio: 120, lat: ~ 2^4 } hitcount: 1 + { pid: 2029, prio: 9, lat: ~ 2^4 } hitcount: 4 + { pid: 2031, prio: 120, lat: ~ 2^7 } hitcount: 1 + { pid: 2032, prio: 120, lat: ~ 2^7 } hitcount: 1 + + Totals: + Hits: 2122 + Entries: 30 + Dropped: 0 + + +The latency values can also be grouped linearly by a given size with +the ".buckets" modifier and specify a size (in this case groups of 10). + + # echo 'hist:keys=pid,prio,lat.buckets=10:sort=lat' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger + + # event histogram + # + # trigger info: hist:keys=pid,prio,lat.buckets=10:vals=hitcount:sort=lat.buckets=10:size=2048 [active] + # + + { pid: 2067, prio: 9, lat: ~ 0-9 } hitcount: 220 + { pid: 2068, prio: 9, lat: ~ 0-9 } hitcount: 157 + { pid: 2070, prio: 9, lat: ~ 0-9 } hitcount: 100 + { pid: 2067, prio: 120, lat: ~ 0-9 } hitcount: 6 + { pid: 2065, prio: 120, lat: ~ 0-9 } hitcount: 2 + { pid: 2066, prio: 120, lat: ~ 0-9 } hitcount: 2 + { pid: 2069, prio: 9, lat: ~ 0-9 } hitcount: 122 + { pid: 2069, prio: 120, lat: ~ 0-9 } hitcount: 8 + { pid: 2070, prio: 120, lat: ~ 0-9 } hitcount: 1 + { pid: 2068, prio: 120, lat: ~ 0-9 } hitcount: 7 + { pid: 2066, prio: 9, lat: ~ 0-9 } hitcount: 365 + { pid: 2064, prio: 120, lat: ~ 0-9 } hitcount: 35 + { pid: 2065, prio: 9, lat: ~ 0-9 } hitcount: 998 + { pid: 2071, prio: 9, lat: ~ 0-9 } hitcount: 85 + { pid: 2065, prio: 9, lat: ~ 10-19 } hitcount: 2 + { pid: 2064, prio: 120, lat: ~ 10-19 } hitcount: 2 + + Totals: + Hits: 2112 + Entries: 16 + Dropped: 0 + 2.2.3 Hist trigger 'handlers' and 'actions' ------------------------------------------- diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a1adb29ef5c1..be0169594de5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5654,6 +5654,7 @@ static const char readme_msg[] = "\t .execname display a common_pid as a program name\n" "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" + "\t .buckets=size display values in groups of size rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" -- cgit v1.2.3-71-gd317 From 3347d80baa41c357cf263923f60aa8051a753d76 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jul 2021 10:27:06 -0400 Subject: tracing: Have histogram types be constant when possible Instead of kstrdup("const", GFP_KERNEL), have the hist_field type simply assign the constant hist_field->type = "const"; And when the value passed to it is a variable, use "kstrdup_const(var, GFP_KERNEL);" which will just copy the value if the variable is already a constant. This saves on having to allocate when not needed. All frees of the hist_field->type will need to use kfree_const(). Link: https://lkml.kernel.org/r/20210722142837.280718447@goodmis.org Suggested-by: Masami Hiramatsu Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8e87c4a429fd..bb466a82b938 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1616,7 +1616,9 @@ static void __destroy_hist_field(struct hist_field *hist_field) kfree(hist_field->var.name); kfree(hist_field->name); - kfree(hist_field->type); + + /* Can likely be a const */ + kfree_const(hist_field->type); kfree(hist_field->system); kfree(hist_field->event_name); @@ -1673,9 +1675,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; hist_field->size = sizeof(u64); - hist_field->type = kstrdup("u64", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "u64"; goto out; } @@ -1690,7 +1690,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field_bucket; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; - hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); + hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); if (!hist_field->type) goto free; goto out; @@ -1699,18 +1699,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_TIMESTAMP) { hist_field->fn = hist_field_timestamp; hist_field->size = sizeof(u64); - hist_field->type = kstrdup("u64", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CPU) { hist_field->fn = hist_field_cpu; hist_field->size = sizeof(int); - hist_field->type = kstrdup("unsigned int", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "unsigned int"; goto out; } @@ -1723,7 +1719,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_STRING; hist_field->size = MAX_FILTER_STR_VAL; - hist_field->type = kstrdup(field->type, GFP_KERNEL); + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); if (!hist_field->type) goto free; @@ -1736,7 +1732,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; - hist_field->type = kstrdup(field->type, GFP_KERNEL); + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); if (!hist_field->type) goto free; @@ -1822,7 +1818,7 @@ static int init_var_ref(struct hist_field *ref_field, } } - ref_field->type = kstrdup(var_field->type, GFP_KERNEL); + ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL); if (!ref_field->type) { err = -ENOMEM; goto free; @@ -2215,7 +2211,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, expr->operands[0] = operand1; expr->operator = FIELD_OP_UNARY_MINUS; expr->name = expr_str(expr, 0); - expr->type = kstrdup(operand1->type, GFP_KERNEL); + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; goto free; @@ -2355,7 +2351,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->operator = field_op; expr->name = expr_str(expr, 0); - expr->type = kstrdup(operand1->type, GFP_KERNEL); + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; goto free; @@ -2743,10 +2739,10 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data, var->var.hist_data = var->hist_data = hist_data; var->size = size; var->var.name = kstrdup(name, GFP_KERNEL); - var->type = kstrdup(type, GFP_KERNEL); + var->type = kstrdup_const(type, GFP_KERNEL); if (!var->var.name || !var->type) { + kfree_const(var->type); kfree(var->var.name); - kfree(var->type); kfree(var); var = ERR_PTR(-ENOMEM); } -- cgit v1.2.3-71-gd317 From ed2cf90735daf40ab8d938b4b6d3ca43c0f84466 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jul 2021 10:27:07 -0400 Subject: tracing: Allow execnames to be passed as args for synthetic events Allow common_pid.execname to be saved in a variable in one histogram to be passed to another histogram that can pass it as a parameter to a synthetic event. ># echo 'hist:keys=pid:__arg__1=common_timestamp.usecs:arg2=common_pid.execname' \ > events/sched/sched_waking/trigger ># echo 'wakeup_lat s32 pid; u64 delta; char wake_comm[]' > synthetic_events ># echo 'hist:keys=next_pid:pid=next_pid,delta=common_timestamp.usecs-$__arg__1,exec=$arg2'\ ':onmatch(sched.sched_waking).trace(wakeup_lat,$pid,$delta,$exec)' \ > events/sched/sched_switch/trigger The above is a wake up latency synthetic event setup that passes the execname of the common_pid that woke the task to the scheduling of that task, which triggers a synthetic event that passes the original execname as a parameter to display it. ># echo 1 > events/synthetic/enable ># cat trace -0 [006] d..4 186.863801: wakeup_lat: pid=1306 delta=65 wake_comm=kworker/u16:3 -0 [000] d..4 186.863858: wakeup_lat: pid=163 delta=27 wake_comm= -0 [001] d..4 186.863903: wakeup_lat: pid=1307 delta=36 wake_comm=kworker/u16:4 -0 [000] d..4 186.863927: wakeup_lat: pid=163 delta=5 wake_comm= -0 [006] d..4 186.863957: wakeup_lat: pid=1306 delta=24 wake_comm=kworker/u16:3 sshd-1306 [006] d..4 186.864051: wakeup_lat: pid=61 delta=62 wake_comm= -0 [000] d..4 186.965030: wakeup_lat: pid=609 delta=18 wake_comm= -0 [006] d..4 186.987582: wakeup_lat: pid=1306 delta=65 wake_comm=kworker/u16:3 -0 [000] d..4 186.987639: wakeup_lat: pid=163 delta=27 wake_comm= Link: https://lkml.kernel.org/r/20210722142837.458596338@goodmis.org Reviewed-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 46 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bb466a82b938..9d91b1c06957 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1420,17 +1420,17 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) struct hist_trigger_data *hist_data = elt->map->private_data; unsigned int size = TASK_COMM_LEN; struct hist_elt_data *elt_data; - struct hist_field *key_field; + struct hist_field *hist_field; unsigned int i, n_str; elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); if (!elt_data) return -ENOMEM; - for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; - if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + if (hist_field->flags & HIST_FIELD_FL_EXECNAME) { elt_data->comm = kzalloc(size, GFP_KERNEL); if (!elt_data->comm) { kfree(elt_data); @@ -3771,6 +3771,41 @@ static int create_val_field(struct hist_trigger_data *hist_data, return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); } +static const char *no_comm = "(no comm)"; + +static u64 hist_field_execname(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_elt_data *elt_data; + + if (WARN_ON_ONCE(!elt)) + return (u64)(unsigned long)no_comm; + + elt_data = elt->private_data; + + if (WARN_ON_ONCE(!elt_data->comm)) + return (u64)(unsigned long)no_comm; + + return (u64)(unsigned long)(elt_data->comm); +} + +/* Convert a var that points to common_pid.execname to a string */ +static void update_var_execname(struct hist_field *hist_field) +{ + hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR | + HIST_FIELD_FL_EXECNAME; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->is_signed = 0; + + kfree_const(hist_field->type); + hist_field->type = "char[]"; + + hist_field->fn = hist_field_execname; +} + static int create_var_field(struct hist_trigger_data *hist_data, unsigned int val_idx, struct trace_event_file *file, @@ -3795,6 +3830,9 @@ static int create_var_field(struct hist_trigger_data *hist_data, ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME) + update_var_execname(hist_data->fields[val_idx]); + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; -- cgit v1.2.3-71-gd317 From de32951b29be3d6cc7a92bfbf366f48a9f4c4407 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 31 Jul 2021 14:22:31 +0900 Subject: tracing: Simplify the Kconfig dependency of FTRACE The entire FTRACE block is surrounded by 'if TRACING_SUPPORT' ... 'endif'. Using 'depends on' is a simpler way to guard FTRACE. Link: https://lkml.kernel.org/r/20210731052233.4703-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3ee23f4d437f..420ff4bc67fd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -135,10 +135,9 @@ config TRACING_SUPPORT depends on STACKTRACE_SUPPORT default y -if TRACING_SUPPORT - menuconfig FTRACE bool "Tracers" + depends on TRACING_SUPPORT default y if DEBUG_KERNEL help Enable the kernel tracing infrastructure. @@ -1037,6 +1036,3 @@ config HIST_TRIGGERS_DEBUG If unsure, say N. endif # FTRACE - -endif # TRACING_SUPPORT - -- cgit v1.2.3-71-gd317 From e66ed86ca6c52488249e95f7b3a6a3d7d6ab5e1e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:21 +0900 Subject: tracing/boot: Add per-event histogram action options Add a hist-trigger action syntax support to boot-time tracing. Currently, boot-time tracing supports per-event actions as option strings. However, for the histogram action, it has a special syntax and usually needs a long action definition. To make it readable and fit to the bootconfig syntax, this introduces a new options for histogram. Here are the histogram action options for boot-time tracing. ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { keys = [,...] values = [,...] sort = [,...] size = name = var { = ... } pause|continue|clear onmax|onchange { var = ; [= ] } onmatch { event = ; [= ] } filter = } Where is one of below; trace = , [, ...] save = [, ...] snapshot Link: https://lkml.kernel.org/r/162856124106.203126.10501871028479029087.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d713714cba67..3d0e51368f51 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -171,6 +171,231 @@ trace_boot_add_synth_event(struct xbc_node *node, const char *event) } #endif +#ifdef CONFIG_HIST_TRIGGERS +static int __init __printf(3, 4) +append_printf(char **bufp, char *end, const char *fmt, ...) +{ + va_list args; + int ret; + + if (*bufp == end) + return -ENOSPC; + + va_start(args, fmt); + ret = vsnprintf(*bufp, end - *bufp, fmt, args); + if (ret < end - *bufp) { + *bufp += ret; + } else { + *bufp = end; + ret = -ERANGE; + } + va_end(args); + + return ret; +} + +static int __init +append_str_nospace(char **bufp, char *end, const char *str) +{ + char *p = *bufp; + int len; + + while (p < end - 1 && *str != '\0') { + if (!isspace(*str)) + *(p++) = *str; + str++; + } + *p = '\0'; + if (p == end - 1) { + *bufp = end; + return -ENOSPC; + } + len = p - *bufp; + *bufp = p; + return (int)len; +} + +static int __init +trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, + char *end, const char *key) +{ + struct xbc_node *knode, *anode; + const char *p; + char sep; + + knode = xbc_node_find_child(hnode, key); + if (knode) { + anode = xbc_node_get_child(knode); + if (!anode) { + pr_err("hist.%s requires value(s).\n", key); + return -EINVAL; + } + + append_printf(bufp, end, ":%s", key); + sep = '='; + xbc_array_for_each_value(anode, p) { + append_printf(bufp, end, "%c%s", sep, p); + if (sep == '=') + sep = ','; + } + } else + return -ENOENT; + + return 0; +} + +static int __init +trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, + char *end, const char *param) +{ + struct xbc_node *knode, *anode; + const char *p; + char sep; + + /* Compose 'handler' parameter */ + p = xbc_node_find_value(hnode, param, NULL); + if (!p) { + pr_err("hist.%s requires '%s' option.\n", + xbc_node_get_data(hnode), param); + return -EINVAL; + } + append_printf(bufp, end, ":%s(%s)", xbc_node_get_data(hnode), p); + + /* Compose 'action' parameter */ + knode = xbc_node_find_child(hnode, "trace"); + if (!knode) + knode = xbc_node_find_child(hnode, "save"); + + if (knode) { + anode = xbc_node_get_child(knode); + if (!anode || !xbc_node_is_value(anode)) { + pr_err("hist.%s.%s requires value(s).\n", + xbc_node_get_data(hnode), + xbc_node_get_data(knode)); + return -EINVAL; + } + + append_printf(bufp, end, ".%s", xbc_node_get_data(knode)); + sep = '('; + xbc_array_for_each_value(anode, p) { + append_printf(bufp, end, "%c%s", sep, p); + if (sep == '(') + sep = ','; + } + append_printf(bufp, end, ")"); + } else if (xbc_node_find_child(hnode, "snapshot")) { + append_printf(bufp, end, ".snapshot()"); + } else { + pr_err("hist.%s requires an action.\n", + xbc_node_get_data(hnode)); + return -EINVAL; + } + + return 0; +} + +/* + * Histogram boottime tracing syntax. + * + * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { + * keys = [,...] + * values = [,...] + * sort = [,...] + * size = + * name = + * var { = ... } + * pause|continue|clear + * onmax|onchange { var = ; [= ] } + * onmatch { event = ; [= ] } + * filter = + * } + * + * Where are; + * + * trace = , [, ...] + * save = [, ...] + * snapshot + */ +static int __init +trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +{ + struct xbc_node *node, *knode; + char *end = buf + size; + const char *p; + int ret = 0; + + append_printf(&buf, end, "hist"); + + ret = trace_boot_hist_add_array(hnode, &buf, end, "keys"); + if (ret < 0) { + if (ret == -ENOENT) + pr_err("hist requires keys.\n"); + return -EINVAL; + } + + ret = trace_boot_hist_add_array(hnode, &buf, end, "values"); + if (ret == -EINVAL) + return ret; + ret = trace_boot_hist_add_array(hnode, &buf, end, "sort"); + if (ret == -EINVAL) + return ret; + + p = xbc_node_find_value(hnode, "size", NULL); + if (p) + append_printf(&buf, end, ":size=%s", p); + + p = xbc_node_find_value(hnode, "name", NULL); + if (p) + append_printf(&buf, end, ":name=%s", p); + + node = xbc_node_find_child(hnode, "var"); + if (node) { + xbc_node_for_each_key_value(node, knode, p) { + /* Expression must not include spaces. */ + append_printf(&buf, end, ":%s=", + xbc_node_get_data(knode)); + append_str_nospace(&buf, end, p); + } + } + + /* Histogram control attributes (mutual exclusive) */ + if (xbc_node_find_child(hnode, "pause")) + append_printf(&buf, end, ":pause"); + else if (xbc_node_find_child(hnode, "continue")) + append_printf(&buf, end, ":continue"); + else if (xbc_node_find_child(hnode, "clear")) + append_printf(&buf, end, ":clear"); + + /* Histogram handler and actions */ + node = xbc_node_find_child(hnode, "onmax"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + return -EINVAL; + node = xbc_node_find_child(hnode, "onchange"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + return -EINVAL; + node = xbc_node_find_child(hnode, "onmatch"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "event") < 0) + return -EINVAL; + + p = xbc_node_find_value(hnode, "filter", NULL); + if (p) + append_printf(&buf, end, " if %s", p); + + if (buf == end) { + pr_err("hist exceeds the max command length.\n"); + return -E2BIG; + } + + return 0; +} +#else +static int __init +trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +{ + return -EOPNOTSUPP; +} +#endif + static void __init trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, struct xbc_node *enode) @@ -212,6 +437,12 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", buf); } + anode = xbc_node_find_child(enode, "hist"); + if (anode && + trace_boot_compose_hist_cmd(anode, buf, ARRAY_SIZE(buf)) == 0) { + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } } else if (xbc_node_find_value(enode, "actions", NULL)) pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); -- cgit v1.2.3-71-gd317 From 8993665abcce793f00815b3504a486dce70cc2b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:29 +0900 Subject: tracing/boot: Support multiple handlers for per-event histogram Support multiple handlers for per-event histogram in boot-time tracing. Since the histogram can register multiple same handler-actions with different parameters, this expands the syntax to support such cases. With this update, the 'onmax', 'onchange' and 'onmatch' handler subkeys under per-event histogram option will take a number subkeys optionally as below. (see [.N]) ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { onmax|onchange[.N] { var = ; [= ] } onmatch[.N] { event = ; [= ] } } The 'N' must be a digit (or digit started word). Thus user can add several handler-actions to the histogram, for example, ftrace.event.SOMEGROUP.SOMEEVENT.hist { keys = SOME_ID; lat = common_timestamp.usecs-$ts0 onmatch.1 { event = GROUP1.STARTEVENT1 trace = latency_event, SOME_ID, $lat } onmatch.2 { event = GROUP2.STARTEVENT2 trace = latency_event, SOME_ID, $lat } } Then, it can trace the elapsed time from GROUP1.STARTEVENT1 to SOMEGROUP.SOMEEVENT, and from GROUP2.STARTEVENT2 to SOMEGROUP.SOMEEVENT with SOME_ID key. Link: https://lkml.kernel.org/r/162856124905.203126.14913731908137885922.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3d0e51368f51..f024f27b3602 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -245,8 +245,9 @@ trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, } static int __init -trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, - char *end, const char *param) +trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, + char *end, const char *handler, + const char *param) { struct xbc_node *knode, *anode; const char *p; @@ -259,7 +260,7 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, xbc_node_get_data(hnode), param); return -EINVAL; } - append_printf(bufp, end, ":%s(%s)", xbc_node_get_data(hnode), p); + append_printf(bufp, end, ":%s(%s)", handler, p); /* Compose 'action' parameter */ knode = xbc_node_find_child(hnode, "trace"); @@ -294,6 +295,32 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, return 0; } +static int __init +trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, + char *end, const char *param) +{ + struct xbc_node *node; + const char *p, *handler; + int ret; + + handler = xbc_node_get_data(hnode); + + xbc_node_for_each_subkey(hnode, node) { + p = xbc_node_get_data(node); + if (!isdigit(p[0])) + continue; + /* All digit started node should be instances. */ + ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param); + if (ret < 0) + break; + } + + if (xbc_node_find_child(hnode, param)) + ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param); + + return ret; +} + /* * Histogram boottime tracing syntax. * @@ -305,8 +332,8 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, * name = * var { = ... } * pause|continue|clear - * onmax|onchange { var = ; [= ] } - * onmatch { event = ; [= ] } + * onmax|onchange[.N] { var = ; [= ] } + * onmatch[.N] { event = ; [= ] } * filter = * } * @@ -368,13 +395,13 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) /* Histogram handler and actions */ node = xbc_node_find_child(hnode, "onmax"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; node = xbc_node_find_child(hnode, "onchange"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; node = xbc_node_find_child(hnode, "onmatch"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "event") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0) return -EINVAL; p = xbc_node_find_value(hnode, "filter", NULL); -- cgit v1.2.3-71-gd317 From 17abd7c36c77c393fa65cde462059395d6437dba Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:36 +0900 Subject: tracing/boot: Support multiple histograms for each event Add multiple histograms support for each event. This allows user to set multiple histograms to an event. ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] { ... } The 'N' is a digit started string and it can be omitted for the default histogram. For example, multiple hist triggers example in the Documentation/trace/histogram.rst can be written as below; ftrace.event.net.netif_receive_skb.hist { 1 { keys = skbaddr.hex values = len filter = len < 0 } 2 { keys = skbaddr.hex values = len filter = len > 4096 } 3 { keys = skbaddr.hex values = len filter = len == 256 } 4 { keys = skbaddr.hex values = len } 5 { keys = len values = common_preempt_count } } Link: https://lkml.kernel.org/r/162856125628.203126.15846930277378572120.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index f024f27b3602..1a2b270e9cda 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -324,7 +324,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, /* * Histogram boottime tracing syntax. * - * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { + * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] { * keys = [,...] * values = [,...] * sort = [,...] @@ -415,11 +415,37 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) return 0; } + +static void __init +trace_boot_init_histograms(struct trace_event_file *file, + struct xbc_node *hnode, char *buf, size_t size) +{ + struct xbc_node *node; + const char *p; + + xbc_node_for_each_subkey(hnode, node) { + p = xbc_node_get_data(node); + if (!isdigit(p[0])) + continue; + /* All digit started node should be instances. */ + if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } + } + + if (xbc_node_find_child(hnode, "keys")) { + if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } +} #else -static int __init -trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +static void __init +trace_boot_init_histograms(struct trace_event_file *file, + struct xbc_node *hnode, char *buf, size_t size) { - return -EOPNOTSUPP; + /* do nothing */ } #endif @@ -465,11 +491,8 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, pr_err("Failed to apply an action: %s\n", buf); } anode = xbc_node_find_child(enode, "hist"); - if (anode && - trace_boot_compose_hist_cmd(anode, buf, ARRAY_SIZE(buf)) == 0) { - if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); - } + if (anode) + trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf)); } else if (xbc_node_find_value(enode, "actions", NULL)) pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); -- cgit v1.2.3-71-gd317 From 64dc7f6958ef56512137ed6ec228127cef7724e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:44 +0900 Subject: tracing/boot: Show correct histogram error command Since trigger_process_regex() modifies given trigger actions while parsing, the error message couldn't show what command was passed to the trigger_process_regex() when it returns an error. To fix that, show the backed up trigger action command instead of parsed buffer. Link: https://lkml.kernel.org/r/162856126413.203126.9465564928450701424.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1a2b270e9cda..1060b0446032 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -422,6 +422,7 @@ trace_boot_init_histograms(struct trace_event_file *file, { struct xbc_node *node; const char *p; + char *tmp; xbc_node_for_each_subkey(hnode, node) { p = xbc_node_get_data(node); @@ -429,15 +430,20 @@ trace_boot_init_histograms(struct trace_event_file *file, continue; /* All digit started node should be instances. */ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { + tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); + pr_err("Failed to apply hist trigger: %s\n", tmp); + kfree(tmp); } } if (xbc_node_find_child(hnode, "keys")) { - if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) + if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { + tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); + pr_err("Failed to apply hist trigger: %s\n", tmp); + kfree(tmp); + } } } #else @@ -488,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply an action: %s\n", buf); + pr_err("Failed to apply an action: %s\n", p); } anode = xbc_node_find_child(enode, "hist"); if (anode) -- cgit v1.2.3-71-gd317 From bd74095389b378f2292ae32397b23f4b97ff23c8 Mon Sep 17 00:00:00 2001 From: zhaoxiao Date: Mon, 16 Aug 2021 13:24:30 +0800 Subject: tracepoint: Fix kerneldoc comments Fix function name in tracepoint.c kernel-doc comment to remove a warning found by clang_w1. kernel/tracepoint.c:589: warning: expecting prototype for register_tracepoint_notifier(). Prototype was for register_tracepoint_module_notifier() instead kernel/tracepoint.c:613: warning: expecting prototype for unregister_tracepoint_notifier(). Prototype was for unregister_tracepoint_module_notifier() instead Link: https://lkml.kernel.org/r/20210816052430.16539-1-zhaoxiao@uniontech.com Acked-by: Mathieu Desnoyers Signed-off-by: zhaoxiao Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index efd14c79fab4..64ea283f2f86 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -577,7 +577,7 @@ bool trace_module_has_bad_taint(struct module *mod) static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); /** - * register_tracepoint_notifier - register tracepoint coming/going notifier + * register_tracepoint_module_notifier - register tracepoint coming/going notifier * @nb: notifier block * * Notifiers registered with this function are called on module @@ -603,7 +603,7 @@ end: EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); /** - * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier + * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier * @nb: notifier block * * The notifier block callback should expect a "struct tp_module" data -- cgit v1.2.3-71-gd317 From dbcfa7156f48ebb72dfc8f4e5d702af8ca1d4b3a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Aug 2021 18:44:42 -0700 Subject: PM: sleep: unmark 'state' functions as kernel-doc Fix kernel-doc warnings in kernel/power/main.c by unmarking the comment block as kernel-doc notation. This eliminates the following kernel-doc warnings: kernel/power/main.c:593: warning: expecting prototype for state(). Prototype was for state_show() instead kernel/power/main.c:593: warning: Function parameter or member 'kobj' not described in 'state_show' kernel/power/main.c:593: warning: Function parameter or member 'attr' not described in 'state_show' kernel/power/main.c:593: warning: Function parameter or member 'buf' not described in 'state_show' Signed-off-by: Randy Dunlap Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 12c7e1bb442f..44169f3081fd 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -577,7 +577,7 @@ static inline void pm_print_times_init(void) {} struct kobject *power_kobj; -/** +/* * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", -- cgit v1.2.3-71-gd317 From b2f6662ac08d0e7c25574ce53623c71bdae9dd78 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Aug 2021 21:14:31 +0100 Subject: PM: cpu: Make notifier chain use a raw_spinlock_t Invoking atomic_notifier_chain_notify() requires acquiring a spinlock_t, which can block under CONFIG_PREEMPT_RT. Notifications for members of the cpu_pm notification chain will be issued by the idle task, which can never block. Making *all* atomic_notifiers use a raw_spinlock is too big of a hammer, as only notifications issued by the idle task are problematic. Special-case cpu_pm_notifier_chain by kludging a raw_notifier and raw_spinlock_t together, matching the atomic_notifier behavior with a raw_spinlock_t. Fixes: 70d932985757 ("notifier: Fix broken error handling pattern") Signed-off-by: Valentin Schneider Acked-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- kernel/cpu_pm.c | 50 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index f7e1d0eccdbc..246efc74e3f3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -13,19 +13,32 @@ #include #include -static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain); +/* + * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT. + * Notifications for cpu_pm will be issued by the idle task itself, which can + * never block, IOW it requires using a raw_spinlock_t. + */ +static struct { + struct raw_notifier_head chain; + raw_spinlock_t lock; +} cpu_pm_notifier = { + .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain), + .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock), +}; static int cpu_pm_notify(enum cpu_pm_event event) { int ret; /* - * atomic_notifier_call_chain has a RCU read critical section, which - * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let - * RCU know this. + * This introduces a RCU read critical section, which could be + * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know + * this. */ rcu_irq_enter_irqson(); - ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL); + rcu_read_lock(); + ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); + rcu_read_unlock(); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -33,10 +46,13 @@ static int cpu_pm_notify(enum cpu_pm_event event) static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down) { + unsigned long flags; int ret; rcu_irq_enter_irqson(); - ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL); + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -49,12 +65,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev * Add a driver to a list of drivers that are notified about * CPU and CPU cluster low power entry and exit. * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_register. + * This function has the same return conditions as raw_notifier_chain_register. */ int cpu_pm_register_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); + return ret; } EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); @@ -64,12 +85,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); * * Remove a driver from the CPU PM notifier list. * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_unregister. + * This function has the same return conditions as raw_notifier_chain_unregister. */ int cpu_pm_unregister_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); + return ret; } EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); -- cgit v1.2.3-71-gd317 From 15538a20579fa4047912508b03b39bcdeec26b05 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Aug 2021 21:14:32 +0100 Subject: notifier: Remove atomic_notifier_call_chain_robust() This now has no more users, remove it. Suggested-by: Sebastian Andrzej Siewior Signed-off-by: Valentin Schneider Acked-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- include/linux/notifier.h | 2 -- kernel/notifier.c | 19 ------------------- 2 files changed, 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 2fb373a5c1ed..87069b8459af 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -168,8 +168,6 @@ extern int raw_notifier_call_chain(struct raw_notifier_head *nh, extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v); -extern int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, - unsigned long val_up, unsigned long val_down, void *v); extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, diff --git a/kernel/notifier.c b/kernel/notifier.c index 1b019cbca594..b8251dc0bc0f 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -172,25 +172,6 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); -int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, - unsigned long val_up, unsigned long val_down, void *v) -{ - unsigned long flags; - int ret; - - /* - * Musn't use RCU; because then the notifier list can - * change between the up and down traversal. - */ - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); - spin_unlock_irqrestore(&nh->lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust); -NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust); - /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain -- cgit v1.2.3-71-gd317 From fb7dd8bca0139fd73d3f4a6cd257b11731317ded Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:54 -0700 Subject: bpf: Refactor BPF_PROG_RUN into a function Turn BPF_PROG_RUN into a proper always inlined function. No functional and performance changes are intended, but it makes it much easier to understand what's going on with how BPF programs are actually get executed. It's more obvious what types and callbacks are expected. Also extra () around input parameters can be dropped, as well as `__` variable prefixes intended to avoid naming collisions, which makes the code simpler to read and write. This refactoring also highlighted one extra issue. BPF_PROG_RUN is both a macro and an enum value (BPF_PROG_RUN == BPF_PROG_TEST_RUN). Turning BPF_PROG_RUN into a function causes naming conflict compilation error. So rename BPF_PROG_RUN into lower-case bpf_prog_run(), similar to bpf_prog_run_xdp(), bpf_prog_run_pin_on_cpu(), etc. All existing callers of BPF_PROG_RUN, the macro, are switched to bpf_prog_run() explicitly. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-2-andrii@kernel.org --- Documentation/networking/filter.rst | 4 +-- drivers/media/rc/bpf-lirc.c | 2 +- drivers/net/ppp/ppp_generic.c | 8 ++--- drivers/net/team/team_mode_loadbalance.c | 2 +- include/linux/bpf.h | 2 +- include/linux/filter.h | 61 +++++++++++++++++++------------- kernel/bpf/bpf_iter.c | 2 +- kernel/bpf/cgroup.c | 16 ++++----- kernel/bpf/core.c | 2 +- kernel/bpf/trampoline.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/events/core.c | 2 +- kernel/trace/bpf_trace.c | 4 +-- lib/test_bpf.c | 2 +- net/bpf/test_run.c | 6 ++-- net/core/filter.c | 4 +-- net/core/ptp_classifier.c | 2 +- net/netfilter/xt_bpf.c | 2 +- net/sched/act_bpf.c | 4 +-- net/sched/cls_bpf.c | 4 +-- 20 files changed, 73 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 5f13905b12e0..ce2b8e8bb9ab 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -638,8 +638,8 @@ extension, PTP dissector/classifier, and much more. They are all internally converted by the kernel into the new instruction set representation and run in the eBPF interpreter. For in-kernel handlers, this all works transparently by using bpf_prog_create() for setting up the filter, resp. -bpf_prog_destroy() for destroying it. The macro -BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed +bpf_prog_destroy() for destroying it. The function +bpf_prog_run(filter, ctx) transparently invokes eBPF interpreter or JITed code to run the filter. 'filter' is a pointer to struct bpf_prog that we got from bpf_prog_create(), and 'ctx' the given context (e.g. skb pointer). All constraints and restrictions from bpf_check_classic() apply diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index afae0afe3f81..bb5a9dc78f1b 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -217,7 +217,7 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) raw->bpf_sample = sample; if (raw->progs) - BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN); + BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, bpf_prog_run); } /* diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index e9e81573f21e..fb52cd175b45 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1744,7 +1744,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) a four-byte PPP header on each packet */ *(u8 *)skb_push(skb, 2) = 1; if (ppp->pass_filter && - BPF_PROG_RUN(ppp->pass_filter, skb) == 0) { + bpf_prog_run(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "PPP: outbound frame " @@ -1754,7 +1754,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) } /* if this packet passes the active filter, record the time */ if (!(ppp->active_filter && - BPF_PROG_RUN(ppp->active_filter, skb) == 0)) + bpf_prog_run(ppp->active_filter, skb) == 0)) ppp->last_xmit = jiffies; skb_pull(skb, 2); #else @@ -2468,7 +2468,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) *(u8 *)skb_push(skb, 2) = 0; if (ppp->pass_filter && - BPF_PROG_RUN(ppp->pass_filter, skb) == 0) { + bpf_prog_run(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "PPP: inbound frame " @@ -2477,7 +2477,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) return; } if (!(ppp->active_filter && - BPF_PROG_RUN(ppp->active_filter, skb) == 0)) + bpf_prog_run(ppp->active_filter, skb) == 0)) ppp->last_recv = jiffies; __skb_pull(skb, 2); } else diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c index 32aef8ac4a14..b095a4b4957b 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -197,7 +197,7 @@ static unsigned int lb_get_skb_hash(struct lb_priv *lb_priv, fp = rcu_dereference_bh(lb_priv->fp); if (unlikely(!fp)) return 0; - lhash = BPF_PROG_RUN(fp, skb); + lhash = bpf_prog_run(fp, skb); c = (char *) &lhash; return c[0] ^ c[1] ^ c[2] ^ c[3]; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c8cc09013210..968fea98087a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1103,7 +1103,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, /* an array of programs to be executed under rcu_lock. * * Typical usage: - * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN); + * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. diff --git a/include/linux/filter.h b/include/linux/filter.h index 1797e8506929..954373db20e7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -600,25 +600,38 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); -#define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 __ret; \ - cant_migrate(); \ - if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *__stats; \ - u64 __start = sched_clock(); \ - __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - __stats = this_cpu_ptr(prog->stats); \ - u64_stats_update_begin(&__stats->syncp); \ - __stats->cnt++; \ - __stats->nsecs += sched_clock() - __start; \ - u64_stats_update_end(&__stats->syncp); \ - } else { \ - __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - } \ - __ret; }) - -#define BPF_PROG_RUN(prog, ctx) \ - __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) +typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, + const struct bpf_insn *insnsi, + unsigned int (*bpf_func)(const void *, + const struct bpf_insn *)); + +static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, + const void *ctx, + bpf_dispatcher_fn dfunc) +{ + u32 ret; + + cant_migrate(); + if (static_branch_unlikely(&bpf_stats_enabled_key)) { + struct bpf_prog_stats *stats; + u64 start = sched_clock(); + + ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + stats = this_cpu_ptr(prog->stats); + u64_stats_update_begin(&stats->syncp); + stats->cnt++; + stats->nsecs += sched_clock() - start; + u64_stats_update_end(&stats->syncp); + } else { + ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + } + return ret; +} + +static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) +{ + return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); +} /* * Use in preemptible and therefore migratable context to make sure that @@ -637,7 +650,7 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, u32 ret; migrate_disable(); - ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func); + ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } @@ -742,7 +755,7 @@ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, memset(cb_data, 0, sizeof(cb_saved)); } - res = BPF_PROG_RUN(prog, skb); + res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); @@ -787,7 +800,7 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ - u32 act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); + u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) @@ -1440,7 +1453,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, }; u32 act; - act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN); + act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; @@ -1478,7 +1491,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, }; u32 act; - act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN); + act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2e9d47bb40ff..b2ee45064e06 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -686,7 +686,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) rcu_read_lock(); migrate_disable(); - ret = BPF_PROG_RUN(prog, ctx); + ret = bpf_prog_run(prog, ctx); migrate_enable(); rcu_read_unlock(); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f6070369caa..16dc467adfa0 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1091,7 +1091,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, - BPF_PROG_RUN, flags); + bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; } @@ -1121,7 +1121,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int ret; ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, - BPF_PROG_RUN); + bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1140,7 +1140,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, - BPF_PROG_RUN); + bpf_prog_run); rcu_read_unlock(); return !allow; @@ -1271,7 +1271,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1386,7 +1386,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1496,7 +1496,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1557,7 +1557,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, */ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 82af6279992d..5ee2ec27c3d4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1879,7 +1879,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. - * The BPF program will be executed via BPF_PROG_RUN() macro. + * The BPF program will be executed via bpf_prog_run() function. * * Return: the &fp argument along with &err set to 0 for success or * a negative errno code on failure diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index b2535acfe9db..fe1e857324e6 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -548,7 +548,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) u64_stats_update_end(&stats->syncp); } -/* The logic is similar to BPF_PROG_RUN, but with an explicit +/* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into * call __bpf_prog_enter diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ea2238a6656..f5a0077c9981 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12383,7 +12383,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - /* BPF_PROG_RUN doesn't call subprogs directly, + /* bpf_prog_run() doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id * func[i]->stats will never be accessed and stays NULL diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cb1f9b8392e..7d20743b48e1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9913,7 +9913,7 @@ static void bpf_overflow_handler(struct perf_event *event, if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = BPF_PROG_RUN(event->prog, &ctx); + ret = bpf_prog_run(event->prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0da94e1d6af9..05a5a556671d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, bpf_prog_run); out: __this_cpu_dec(bpf_prog_active); @@ -1816,7 +1816,7 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); rcu_read_lock(); - (void) BPF_PROG_RUN(prog, args); + (void) bpf_prog_run(prog, args); rcu_read_unlock(); } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 77fe6fde56c5..830a18ecffc8 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -8616,7 +8616,7 @@ static int __run_one(const struct bpf_prog *fp, const void *data, start = ktime_get_ns(); for (i = 0; i < runs; i++) - ret = BPF_PROG_RUN(fp, data); + ret = bpf_prog_run(fp, data); finish = ktime_get_ns(); migrate_enable(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 4b855af267b1..2eb0e55ef54d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -116,7 +116,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else - *retval = BPF_PROG_RUN(prog, ctx); + *retval = bpf_prog_run(prog, ctx); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); @@ -327,7 +327,7 @@ __bpf_prog_test_run_raw_tp(void *data) struct bpf_raw_tp_test_run_info *info = data; rcu_read_lock(); - info->retval = BPF_PROG_RUN(info->prog, info->ctx); + info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); } @@ -989,7 +989,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; - retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); bpf_test_timer_leave(&t); diff --git a/net/core/filter.c b/net/core/filter.c index 3aca07c44fad..5cf38e8886f1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to BPF_PROG_RUN. It returns 0 if the packet should + * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -10115,7 +10115,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); - action = BPF_PROG_RUN(prog, &reuse_kern); + action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index e33fde06d528..dd4cf01d1e0a 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -103,7 +103,7 @@ static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return BPF_PROG_RUN(ptp_insns, skb); + return bpf_prog_run(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 13cf3f9b5938..849ac552a154 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; - return BPF_PROG_RUN(info->filter, skb); + return bpf_prog_run(info->filter, skb); } static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 040807aa15b9..5c36013339e1 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -47,11 +47,11 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, if (at_ingress) { __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); } if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 3b472bafdc9d..df19a847829e 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -96,11 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); } if (prog->exts_integrated) { -- cgit v1.2.3-71-gd317 From 7d08c2c9117113fee118487425ed55efa50cbfa9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:55 -0700 Subject: bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions Similar to BPF_PROG_RUN, turn BPF_PROG_RUN_ARRAY macros into proper functions with all the same readability and maintainability benefits. Making them into functions required shuffling around bpf_set_run_ctx/bpf_reset_run_ctx functions. Also, explicitly specifying the type of the BPF prog run callback required adjusting __bpf_prog_run_save_cb() to accept const void *, casted internally to const struct sk_buff. Further, split out a cgroup-specific BPF_PROG_RUN_ARRAY_CG and BPF_PROG_RUN_ARRAY_CG_FLAGS from the more generic BPF_PROG_RUN_ARRAY due to the differences in bpf_run_ctx used for those two different use cases. I think BPF_PROG_RUN_ARRAY_CG would benefit from further refactoring to accept struct cgroup and enum bpf_attach_type instead of bpf_prog_array, fetching cgrp->bpf.effective[type] and RCU-dereferencing it internally. But that required including include/linux/cgroup-defs.h, which I wasn't sure is ok with everyone. The remaining generic BPF_PROG_RUN_ARRAY function will be extended to pass-through user-provided context value in the next patch. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-3-andrii@kernel.org --- include/linux/bpf.h | 179 +++++++++++++++++++++++++++-------------------- include/linux/filter.h | 5 +- kernel/bpf/cgroup.c | 32 ++++----- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 124 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 968fea98087a..344e0d4d8ef6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1146,67 +1146,116 @@ struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; - struct bpf_prog_array_item *prog_item; + const struct bpf_prog_array_item *prog_item; }; +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx = NULL; + +#ifdef CONFIG_BPF_SYSCALL + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; +#endif + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ +#ifdef CONFIG_BPF_SYSCALL + current->bpf_ctx = old_ctx; +#endif +} + /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) -#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ - ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - struct bpf_run_ctx *old_run_ctx; \ - struct bpf_cg_run_ctx run_ctx; \ - u32 _ret = 1; \ - u32 func_ret; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - _item = &_array->items[0]; \ - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ - while ((_prog = READ_ONCE(_item->prog))) { \ - run_ctx.prog_item = _item; \ - func_ret = func(_prog, ctx); \ - _ret &= (func_ret & 1); \ - *(ret_flags) |= (func_ret >> 1); \ - _item++; \ - } \ - bpf_reset_run_ctx(old_run_ctx); \ - rcu_read_unlock(); \ - migrate_enable(); \ - _ret; \ - }) - -#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ - ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - struct bpf_run_ctx *old_run_ctx; \ - struct bpf_cg_run_ctx run_ctx; \ - u32 _ret = 1; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - if (unlikely(check_non_null && !_array))\ - goto _out; \ - _item = &_array->items[0]; \ - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ - while ((_prog = READ_ONCE(_item->prog))) { \ - run_ctx.prog_item = _item; \ - _ret &= func(_prog, ctx); \ - _item++; \ - } \ - bpf_reset_run_ctx(old_run_ctx); \ -_out: \ - rcu_read_unlock(); \ - migrate_enable(); \ - _ret; \ - }) +typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); + +static __always_inline u32 +BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog, + u32 *ret_flags) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 ret = 1; + u32 func_ret; + + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(array_rcu); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + func_ret = run_prog(prog, ctx); + ret &= (func_ret & 1); + *(ret_flags) |= (func_ret >> 1); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return ret; +} + +static __always_inline u32 +BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 ret = 1; + + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(array_rcu); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + ret &= run_prog(prog, ctx); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return ret; +} + +static __always_inline u32 +BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + u32 ret = 1; + + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(array_rcu); + if (unlikely(!array)) + goto out; + item = &array->items[0]; + while ((prog = READ_ONCE(item->prog))) { + ret &= run_prog(prog, ctx); + item++; + } +out: + rcu_read_unlock(); + migrate_enable(); + return ret; +} /* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs * so BPF programs can request cwr for TCP packets. @@ -1235,7 +1284,7 @@ _out: \ u32 _flags = 0; \ bool _cn; \ u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \ + _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ if (_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ @@ -1244,12 +1293,6 @@ _out: \ _ret; \ }) -#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true) - -#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false) - #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; @@ -1284,20 +1327,6 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } -static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) -{ - struct bpf_run_ctx *old_ctx; - - old_ctx = current->bpf_ctx; - current->bpf_ctx = new_ctx; - return old_ctx; -} - -static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) -{ - current->bpf_ctx = old_ctx; -} - extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/filter.h b/include/linux/filter.h index 954373db20e7..7d248941ecea 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -723,7 +723,7 @@ static inline void bpf_restore_data_end( cb->data_end = saved_data_end; } -static inline u8 *bpf_skb_cb(struct sk_buff *skb) +static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with @@ -744,8 +744,9 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb) /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, - struct sk_buff *skb) + const void *ctx) { + const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 16dc467adfa0..a1dedba4c174 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1012,8 +1012,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } bpf_restore_data_end(skb, saved_data_end); @@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1090,8 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, - bpf_prog_run, flags); + ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx, + bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; } @@ -1120,8 +1120,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, - bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops, + bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1139,8 +1139,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, - bpf_prog_run); + allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, + bpf_prog_run); rcu_read_unlock(); return !allow; @@ -1271,7 +1271,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1385,8 +1385,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1495,8 +1495,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1556,8 +1556,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 05a5a556671d..91867b14b222 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run); out: __this_cpu_dec(bpf_prog_active); -- cgit v1.2.3-71-gd317 From 652c1b17b85b9c195978c051aa283027529db1fe Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:56 -0700 Subject: bpf: Refactor perf_event_set_bpf_prog() to use struct bpf_prog input Make internal perf_event_set_bpf_prog() use struct bpf_prog pointer as an input argument, which makes it easier to re-use for other internal uses (coming up for BPF link in the next patch). BPF program FD is not as convenient and in some cases it's not available. So switch to struct bpf_prog, move out refcounting outside and let caller do bpf_prog_put() in case of an error. This follows the approach of most of the other BPF internal functions. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-4-andrii@kernel.org --- kernel/events/core.c | 61 ++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7d20743b48e1..2f07718bd41c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5574,7 +5574,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -5637,7 +5637,22 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: - return perf_event_set_bpf_prog(event, arg); + { + struct bpf_prog *prog; + int err; + + prog = bpf_prog_get(arg); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = perf_event_set_bpf_prog(event, prog); + if (err) { + bpf_prog_put(prog); + return err; + } + + return 0; + } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; @@ -9923,10 +9938,8 @@ out: event->orig_overflow_handler(event, data, regs); } -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) { - struct bpf_prog *prog; - if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; @@ -9934,9 +9947,8 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) if (event->prog) return -EEXIST; - prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); - if (IS_ERR(prog)) - return PTR_ERR(prog); + if (prog->type != BPF_PROG_TYPE_PERF_EVENT) + return -EINVAL; if (event->attr.precise_ip && prog->call_get_stack && @@ -9952,7 +9964,6 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ - bpf_prog_put(prog); return -EPROTO; } @@ -9974,7 +9985,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) { return -EOPNOTSUPP; } @@ -10002,14 +10013,12 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { bool is_kprobe, is_tracepoint, is_syscall_tp; - struct bpf_prog *prog; - int ret; if (!perf_event_is_tracing(event)) - return perf_event_set_bpf_handler(event, prog_fd); + return perf_event_set_bpf_handler(event, prog); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; @@ -10018,38 +10027,24 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - prog = bpf_prog_get(prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || - (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { - /* valid fd, but invalid bpf program type */ - bpf_prog_put(prog); + (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; - } /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) return -EINVAL; - } if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); - if (prog->aux->max_ctx_offset > off) { - bpf_prog_put(prog); + if (prog->aux->max_ctx_offset > off) return -EACCES; - } } - ret = perf_event_attach_bpf_prog(event, prog); - if (ret) - bpf_prog_put(prog); - return ret; + return perf_event_attach_bpf_prog(event, prog); } static void perf_event_free_bpf_prog(struct perf_event *event) @@ -10071,7 +10066,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -ENOENT; } -- cgit v1.2.3-71-gd317 From b89fbfbb854c9afc3047e8273cc3a694650b802e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:57 -0700 Subject: bpf: Implement minimal BPF perf link Introduce a new type of BPF link - BPF perf link. This brings perf_event-based BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into the common BPF link infrastructure, allowing to list all active perf_event based attachments, auto-detaching BPF program from perf_event when link's FD is closed, get generic BPF link fdinfo/get_info functionality. BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags are currently supported. Force-detaching and atomic BPF program updates are not yet implemented, but with perf_event-based BPF links we now have common framework for this without the need to extend ioctl()-based perf_event interface. One interesting consideration is a new value for bpf_attach_type, which BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to define a single BPF_PERF_EVENT attach type for all of them and adjust link_create()'s logic for checking correspondence between attach type and program type. The alternative would be to define three new attach types (e.g., BPF_KPROBE, BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by libbpf. I chose to not do this to avoid unnecessary proliferation of bpf_attach_type enum values and not have to deal with naming conflicts. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-5-andrii@kernel.org --- include/linux/bpf_types.h | 3 ++ include/linux/trace_events.h | 3 ++ include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 105 ++++++++++++++++++++++++++++++++++++++--- kernel/events/core.c | 10 ++-- tools/include/uapi/linux/bpf.h | 2 + 6 files changed, 112 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ae3ac3a2018c..9c81724e4b98 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -136,3 +136,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #endif +#ifdef CONFIG_PERF_EVENTS +BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) +#endif diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..8ac92560d3a3 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +void perf_event_free_bpf_prog(struct perf_event *event); + void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a2068e39d23..80c03bedd6e6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2906,6 +2906,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = { .fill_link_info = bpf_raw_tp_link_fill_link_info, }; +#ifdef CONFIG_PERF_EVENTS +struct bpf_perf_link { + struct bpf_link link; + struct file *perf_file; +}; + +static void bpf_perf_link_release(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + struct perf_event *event = perf_link->perf_file->private_data; + + perf_event_free_bpf_prog(event); + fput(perf_link->perf_file); +} + +static void bpf_perf_link_dealloc(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + + kfree(perf_link); +} + +static const struct bpf_link_ops bpf_perf_link_lops = { + .release = bpf_perf_link_release, + .dealloc = bpf_perf_link_dealloc, +}; + +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_perf_link *link; + struct perf_event *event; + struct file *perf_file; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + perf_file = perf_event_get(attr->link_create.target_fd); + if (IS_ERR(perf_file)) + return PTR_ERR(perf_file); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_file; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + link->perf_file = perf_file; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_file; + } + + event = perf_file->private_data; + err = perf_event_set_bpf_prog(event, prog); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_file; + } + /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out_put_file: + fput(perf_file); + return err; +} +#endif /* CONFIG_PERF_EVENTS */ + #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) @@ -4147,15 +4220,26 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (ret) goto out; - if (prog->type == BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_EXT: ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; - } - - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { - ret = -EINVAL; - goto out; + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + if (attr->link_create.attach_type != BPF_PERF_EVENT) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; + default: + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } + break; } switch (ptype) { @@ -4179,6 +4263,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; +#endif +#ifdef CONFIG_PERF_EVENTS + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_KPROBE: + ret = bpf_perf_link_attach(attr, prog); + break; #endif default: ret = -EINVAL; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2f07718bd41c..9fd65667bcb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4697,7 +4697,6 @@ errout: } static void perf_event_free_filter(struct perf_event *event); -static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -5574,7 +5573,6 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -10013,7 +10011,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { bool is_kprobe, is_tracepoint, is_syscall_tp; @@ -10047,7 +10045,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *pr return perf_event_attach_bpf_prog(event, prog); } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); @@ -10066,12 +10064,12 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -ENOENT; } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3-71-gd317 From 82e6b1eee6a8875ef4eacfd60711cce6965c6b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:58 -0700 Subject: bpf: Allow to specify user-provided bpf_cookie for BPF perf links Add ability for users to specify custom u64 value (bpf_cookie) when creating BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event, tracepoints). This is useful for cases when the same BPF program is used for attaching and processing invocation of different tracepoints/kprobes/uprobes in a generic fashion, but such that each invocation is distinguished from each other (e.g., BPF program can look up additional information associated with a specific kernel function without having to rely on function IP lookups). This enables new use cases to be implemented simply and efficiently that previously were possible only through code generation (and thus multiple instances of almost identical BPF program) or compilation at runtime (BCC-style) on target hosts (even more expensive resource-wise). For uprobes it is not even possible in some cases to know function IP before hand (e.g., when attaching to shared library without PID filtering, in which case base load address is not known for a library). This is done by storing u64 bpf_cookie in struct bpf_prog_array_item, corresponding to each attached and run BPF program. Given cgroup BPF programs already use two 8-byte pointers for their needs and cgroup BPF programs don't have (yet?) support for bpf_cookie, reuse that space through union of cgroup_storage and new bpf_cookie field. Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx. This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF program execution code, which luckily is now also split from BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper giving access to this user-provided cookie value from inside a BPF program. Generic perf_event BPF programs will access this value from perf_event itself through passed in BPF program context. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org --- drivers/media/rc/bpf-lirc.c | 4 ++-- include/linux/bpf.h | 16 +++++++++++++++- include/linux/perf_event.h | 1 + include/linux/trace_events.h | 6 +++--- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/core.c | 29 ++++++++++++++++++----------- kernel/bpf/syscall.c | 2 +- kernel/events/core.c | 21 ++++++++++++++------- kernel/trace/bpf_trace.c | 8 +++++--- tools/include/uapi/linux/bpf.h | 7 +++++++ 10 files changed, 73 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index bb5a9dc78f1b..3eff08d7b8e5 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -160,7 +160,7 @@ static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array); if (ret < 0) goto unlock; @@ -193,7 +193,7 @@ static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) } old_array = lirc_rcu_dereference(raw->progs); - ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array); /* * Do not use bpf_prog_array_delete_safe() as we would end up * with a dummy entry in the array, and the we would free the diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 344e0d4d8ef6..83c3cc5e90df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1114,7 +1114,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, */ struct bpf_prog_array_item { struct bpf_prog *prog; - struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + union { + struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + u64 bpf_cookie; + }; }; struct bpf_prog_array { @@ -1140,6 +1143,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; @@ -1149,6 +1153,11 @@ struct bpf_cg_run_ctx { const struct bpf_prog_array_item *prog_item; }; +struct bpf_trace_run_ctx { + struct bpf_run_ctx run_ctx; + u64 bpf_cookie; +}; + static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; @@ -1239,6 +1248,8 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; u32 ret = 1; migrate_disable(); @@ -1246,11 +1257,14 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, array = rcu_dereference(array_rcu); if (unlikely(!array)) goto out; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { + run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } + bpf_reset_run_ctx(old_run_ctx); out: rcu_read_unlock(); migrate_enable(); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2d510ad750ed..fe156a8170aa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -762,6 +762,7 @@ struct perf_event { #ifdef CONFIG_BPF_SYSCALL perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; + u64 bpf_cookie; #endif #ifdef CONFIG_EVENT_TRACING diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8ac92560d3a3..8e0631a4b046 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -675,7 +675,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); -int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); @@ -692,7 +692,7 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c } static inline int -perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -803,7 +803,7 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ee2ec27c3d4..91f24c7b38a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2119,13 +2119,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog_array_item *existing; + struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; - int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to * the new array. @@ -2162,20 +2162,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; + new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; - for (; existing->prog; existing++) - if (existing->prog != exclude_prog && - existing->prog != &dummy_bpf_prog.prog) { - array->items[new_prog_idx++].prog = - existing->prog; - } + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog || + existing->prog == &dummy_bpf_prog.prog) + continue; + + new->prog = existing->prog; + new->bpf_cookie = existing->bpf_cookie; + new++; + } } - if (include_prog) - array->items[new_prog_idx++].prog = include_prog; - array->items[new_prog_idx].prog = NULL; + if (include_prog) { + new->prog = include_prog; + new->bpf_cookie = bpf_cookie; + new++; + } + new->prog = NULL; *new_array = array; return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 80c03bedd6e6..7420e1334ab2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2963,7 +2963,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro } event = perf_file->private_data; - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); if (err) { bpf_link_cleanup(&link_primer); goto out_put_file; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9fd65667bcb2..2d1e63dd97f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5643,7 +5643,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -9936,7 +9936,9 @@ out: event->orig_overflow_handler(event, data, regs); } -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ @@ -9966,6 +9968,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog } event->prog = prog; + event->bpf_cookie = bpf_cookie; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; @@ -9983,7 +9986,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -10011,12 +10016,13 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) - return perf_event_set_bpf_handler(event, prog); + return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; @@ -10042,7 +10048,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) return -EACCES; } - return perf_event_attach_bpf_prog(event, prog); + return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) @@ -10064,7 +10070,8 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { return -ENOENT; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91867b14b222..57879d28f824 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1675,7 +1675,8 @@ static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 int perf_event_attach_bpf_prog(struct perf_event *event, - struct bpf_prog *prog) + struct bpf_prog *prog, + u64 bpf_cookie) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; @@ -1702,12 +1703,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event, goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); if (ret < 0) goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; + event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); @@ -1728,7 +1730,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); - ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret == -ENOENT) goto unlock; if (ret < 0) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; -- cgit v1.2.3-71-gd317 From 7adfc6c9b315e174cf8743b21b7b691c8766791b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:59 -0700 Subject: bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs to get access to a user-provided bpf_cookie value, specified during BPF program attachment (BPF link creation) time. Naming is hard, though. With the concept being named "BPF cookie", I've considered calling the helper: - bpf_get_cookie() -- seems too unspecific and easily mistaken with socket cookie; - bpf_get_bpf_cookie() -- too much tautology; - bpf_get_link_cookie() -- would be ok, but while we create a BPF link to attach BPF program to BPF hook, it's still an "attachment" and the bpf_cookie is associated with BPF program attachment to a hook, not a BPF link itself. Technically, we could support bpf_cookie with old-style cgroup programs.So I ultimately rejected it in favor of bpf_get_attach_cookie(). Currently all perf_event-backed BPF program types support bpf_get_attach_cookie() helper. Follow-up patches will add support for fentry/fexit programs as well. While at it, mark bpf_tracing_func_proto() as static to make it obvious that it's only used from within the kernel/trace/bpf_trace.c. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org --- include/linux/bpf.h | 3 --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 4 files changed, 66 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83c3cc5e90df..f4c16f19f83e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2102,9 +2102,6 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; -const struct bpf_func_proto *bpf_tracing_func_proto( - enum bpf_func_id func_id, const struct bpf_prog *prog); - const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 57879d28f824..cbc73c08c4a4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -975,7 +975,34 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .arg1_type = ARG_PTR_TO_CTX, }; -const struct bpf_func_proto * +BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { + .func = bpf_get_attach_cookie_trace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) +{ + return ctx->event->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { + .func = bpf_get_attach_cookie_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -1109,6 +1136,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_kprobe; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1219,6 +1248,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1326,6 +1357,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_pe; default: return bpf_tracing_func_proto(func_id, prog); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-71-gd317 From b41cda03765580caf7723b8c1b672d191c71013f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:38 +0200 Subject: locking/rtmutex: Set proper wait context for lockdep RT mutexes belong to the LD_WAIT_SLEEP class. Make them so. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.031014562@linutronix.de --- include/linux/rtmutex.h | 19 ++++++++++++------- kernel/locking/rtmutex.c | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index d1672de9ca89..87b325aec508 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -52,17 +52,22 @@ do { \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC -#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ - , .dep_map = { .name = #mutexname } +#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ + .dep_map = { \ + .name = #mutexname, \ + .wait_type_inner = LD_WAIT_SLEEP, \ + } #else #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) #endif -#define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .waiters = RB_ROOT_CACHED \ - , .owner = NULL \ - __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} +#define __RT_MUTEX_INITIALIZER(mutexname) \ +{ \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock), \ + .waiters = RB_ROOT_CACHED, \ + .owner = NULL, \ + __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ +} #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ad0db322ed3b..1a7e3f838077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1556,7 +1556,7 @@ void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) { debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); __rt_mutex_basic_init(lock); } -- cgit v1.2.3-71-gd317 From 43295d73adc8d3780e9f34206663e336678aaff8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:40 +0200 Subject: sched/wakeup: Split out the wakeup ->__state check RT kernels have a slightly more complicated handling of wakeups due to 'sleeping' spin/rwlocks. If a task is blocked on such a lock then the original state of the task is preserved over the blocking period, and any regular (non lock related) wakeup has to be targeted at the saved state to ensure that these wakeups are not lost. Once the task acquires the lock it restores the task state from the saved state. To avoid cluttering try_to_wake_up() with that logic, split the wakeup state check out into an inline helper and use it at both places where task::__state is checked against the state argument of try_to_wake_up(). No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.088945085@linutronix.de --- kernel/sched/core.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20ffcc044134..961991e06337 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3561,6 +3561,22 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) rq_unlock(rq, &rf); } +/* + * Invoked from try_to_wake_up() to check whether the task can be woken up. + * + * The caller holds p::pi_lock if p != current or has preemption + * disabled when p == current. + */ +static __always_inline +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) +{ + if (READ_ONCE(p->__state) & state) { + *success = 1; + return true; + } + return false; +} + /* * Notes on Program-Order guarantees on SMP systems. * @@ -3700,10 +3716,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto out; - success = 1; trace_sched_waking(p); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); @@ -3718,14 +3733,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto unlock; trace_sched_waking(p); - /* We're going to change ->state: */ - success = 1; - /* * Ensure we load p->on_rq _after_ p->state, otherwise it would * be possible to, falsely, observe p->on_rq == 0 and get stuck -- cgit v1.2.3-71-gd317 From 5f220be21418541422335288b6e2360a5ce0613c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:44 +0200 Subject: sched/wakeup: Prepare for RT sleeping spin/rwlocks Waiting for spinlocks and rwlocks on non RT enabled kernels is task::state preserving. Any wakeup which matches the state is valid. RT enabled kernels substitutes them with 'sleeping' spinlocks. This creates an issue vs. task::__state. In order to block on the lock, the task has to overwrite task::__state and a consecutive wakeup issued by the unlocker sets the state back to TASK_RUNNING. As a consequence the task loses the state which was set before the lock acquire and also any regular wakeup targeted at the task while it is blocked on the lock. To handle this gracefully, add a 'saved_state' member to task_struct which is used in the following way: 1) When a task blocks on a 'sleeping' spinlock, the current state is saved in task::saved_state before it is set to TASK_RTLOCK_WAIT. 2) When the task unblocks and after acquiring the lock, it restores the saved state. 3) When a regular wakeup happens for a task while it is blocked then the state change of that wakeup is redirected to operate on task::saved_state. This is also required when the task state is running because the task might have been woken up from the lock wait and has not yet restored the saved state. To make it complete, provide the necessary helpers to save and restore the saved state along with the necessary documentation how the RT lock blocking is supposed to work. For non-RT kernels there is no functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.258751046@linutronix.de --- include/linux/sched.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 33 ++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c72cf6aaabf..02714b9a3ff9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -143,9 +143,22 @@ struct task_group; current->task_state_change = _THIS_IP_; \ } while (0) +# define debug_rtlock_wait_set_state() \ + do { \ + current->saved_state_change = current->task_state_change;\ + current->task_state_change = _THIS_IP_; \ + } while (0) + +# define debug_rtlock_wait_restore_state() \ + do { \ + current->task_state_change = current->saved_state_change;\ + } while (0) + #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) +# define debug_rtlock_wait_set_state() do { } while (0) +# define debug_rtlock_wait_restore_state() do { } while (0) #endif /* @@ -213,6 +226,51 @@ struct task_group; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ } while (0) +/* + * PREEMPT_RT specific variants for "sleeping" spin/rwlocks + * + * RT's spin/rwlock substitutions are state preserving. The state of the + * task when blocking on the lock is saved in task_struct::saved_state and + * restored after the lock has been acquired. These operations are + * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT + * lock related wakeups while the task is blocked on the lock are + * redirected to operate on task_struct::saved_state to ensure that these + * are not dropped. On restore task_struct::saved_state is set to + * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. + * + * The lock operation looks like this: + * + * current_save_and_set_rtlock_wait_state(); + * for (;;) { + * if (try_lock()) + * break; + * raw_spin_unlock_irq(&lock->wait_lock); + * schedule_rtlock(); + * raw_spin_lock_irq(&lock->wait_lock); + * set_current_state(TASK_RTLOCK_WAIT); + * } + * current_restore_rtlock_saved_state(); + */ +#define current_save_and_set_rtlock_wait_state() \ + do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_lock(¤t->pi_lock); \ + current->saved_state = current->__state; \ + debug_rtlock_wait_set_state(); \ + WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ + raw_spin_unlock(¤t->pi_lock); \ + } while (0); + +#define current_restore_rtlock_saved_state() \ + do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_lock(¤t->pi_lock); \ + debug_rtlock_wait_restore_state(); \ + WRITE_ONCE(current->__state, current->saved_state); \ + current->saved_state = TASK_RUNNING; \ + raw_spin_unlock(¤t->pi_lock); \ + } while (0); + #define get_current_state() READ_ONCE(current->__state) /* Task command name length: */ @@ -668,6 +726,11 @@ struct task_struct { #endif unsigned int __state; +#ifdef CONFIG_PREEMPT_RT + /* saved state for "spinlock sleepers" */ + unsigned int saved_state; +#endif + /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. @@ -1357,6 +1420,9 @@ struct task_struct { struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; +# ifdef CONFIG_PREEMPT_RT + unsigned long saved_state_change; +# endif #endif int pagefault_disabled; #ifdef CONFIG_MMU diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 961991e06337..e407c6ac4a26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3566,14 +3566,47 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * * The caller holds p::pi_lock if p != current or has preemption * disabled when p == current. + * + * The rules of PREEMPT_RT saved_state: + * + * The related locking code always holds p::pi_lock when updating + * p::saved_state, which means the code is fully serialized in both cases. + * + * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other + * bits set. This allows to distinguish all wakeup scenarios. */ static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { + WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && + state != TASK_RTLOCK_WAIT); + } + if (READ_ONCE(p->__state) & state) { *success = 1; return true; } + +#ifdef CONFIG_PREEMPT_RT + /* + * Saved state preserves the task state across blocking on + * an RT lock. If the state matches, set p::saved_state to + * TASK_RUNNING, but do not wake the task because it waits + * for a lock wakeup. Also indicate success because from + * the regular waker's point of view this has succeeded. + * + * After acquiring the lock the task will restore p::__state + * from p::saved_state which ensures that the regular + * wakeup is not lost. The restore will also set + * p::saved_state to TASK_RUNNING so any further tests will + * not result in false positives vs. @success + */ + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + *success = 1; + } +#endif return false; } -- cgit v1.2.3-71-gd317 From b4bfa3fcfe3b827ddb8b16edd45896caac5a1194 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:46 +0200 Subject: sched/core: Rework the __schedule() preempt argument PREEMPT_RT needs to hand a special state into __schedule() when a task blocks on a 'sleeping' spin/rwlock. This is required to handle rcu_note_context_switch() correctly without having special casing in the RCU code. From an RCU point of view the blocking on the sleeping spinlock is equivalent to preemption, because the task might be in a read side critical section. schedule_debug() also has a check which would trigger with the !preempt case, but that could be handled differently. To avoid adding another argument and extra checks which cannot be optimized out by the compiler, the following solution has been chosen: - Replace the boolean 'preempt' argument with an unsigned integer 'sched_mode' argument and define constants to hand in: (0 == no preemption, 1 = preemption). - Add two masks to apply on that mode: one for the debug/rcu invocations, and one for the actual scheduling decision. For a non RT kernel these masks are UINT_MAX, i.e. all bits are set, which allows the compiler to optimize the AND operation out, because it is not masking out anything. IOW, it's not different from the boolean. RT enabled kernels will define these masks separately. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.315473019@linutronix.de --- kernel/sched/core.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e407c6ac4a26..ebc24e136222 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5819,6 +5819,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SCHED_CORE */ +/* + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a + * preemption from blocking on an 'sleeping' spin/rwlock. Note that + * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to + * optimize the AND operation out and just check for zero. + */ +#define SM_NONE 0x0 +#define SM_PREEMPT 0x1 +#define SM_MASK_PREEMPT (~0U) + /* * __schedule() is the main scheduler function. * @@ -5858,7 +5870,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(unsigned int sched_mode) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -5871,13 +5883,13 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, preempt); + schedule_debug(prev, !!sched_mode); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(preempt); + rcu_note_context_switch(!!sched_mode); /* * Make sure that signal_pending_state()->signal_pending() below @@ -5911,7 +5923,7 @@ static void __sched notrace __schedule(bool preempt) * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ prev_state = READ_ONCE(prev->__state); - if (!preempt && prev_state) { + if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { @@ -5977,7 +5989,7 @@ static void __sched notrace __schedule(bool preempt) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(preempt, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -5998,7 +6010,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; - __schedule(false); + __schedule(SM_NONE); BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -6059,7 +6071,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(false); + __schedule(SM_NONE); sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); @@ -6087,7 +6099,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->__state); do { - __schedule(false); + __schedule(SM_NONE); } while (need_resched()); } @@ -6140,7 +6152,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); - __schedule(true); + __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); @@ -6219,7 +6231,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(true); + __schedule(SM_PREEMPT); exception_exit(prev_ctx); preempt_latency_stop(1); @@ -6368,7 +6380,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_disable(); local_irq_enable(); - __schedule(true); + __schedule(SM_PREEMPT); local_irq_disable(); sched_preempt_enable_no_resched(); } while (need_resched()); -- cgit v1.2.3-71-gd317 From 6991436c2b5d91d5358d9914ae2df22b9a1d1dc9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:48 +0200 Subject: sched/core: Provide a scheduling point for RT locks RT enabled kernels substitute spin/rwlocks with 'sleeping' variants based on rtmutexes. Blocking on such a lock is similar to preemption versus: - I/O scheduling and worker handling, because these functions might block on another substituted lock, or come from a lock contention within these functions. - RCU considers this like a preemption, because the task might be in a read side critical section. Add a separate scheduling point for this, and hand a new scheduling mode argument to __schedule() which allows, along with separate mode masks, to handle this gracefully from within the scheduler, without proliferating that to other subsystems like RCU. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.372319055@linutronix.de --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 02714b9a3ff9..746dfc06a35c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -288,6 +288,9 @@ extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); +#ifdef CONFIG_PREEMPT_RT + extern void schedule_rtlock(void); +#endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ebc24e136222..c89c1d45dd0b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5829,7 +5829,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) */ #define SM_NONE 0x0 #define SM_PREEMPT 0x1 -#define SM_MASK_PREEMPT (~0U) +#define SM_RTLOCK_WAIT 0x2 + +#ifndef CONFIG_PREEMPT_RT +# define SM_MASK_PREEMPT (~0U) +#else +# define SM_MASK_PREEMPT SM_PREEMPT +#endif /* * __schedule() is the main scheduler function. @@ -6134,6 +6140,18 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +#ifdef CONFIG_PREEMPT_RT +void __sched notrace schedule_rtlock(void) +{ + do { + preempt_disable(); + __schedule(SM_RTLOCK_WAIT); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} +NOKPROBE_SYMBOL(schedule_rtlock); +#endif + static void __sched notrace preempt_schedule_common(void) { do { -- cgit v1.2.3-71-gd317 From 785159301bedea25fae9b20cae3d12377246e941 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 15 Aug 2021 23:27:54 +0200 Subject: locking/rtmutex: Convert macros to inlines Inlines are type-safe... Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.610830960@linutronix.de --- kernel/locking/rtmutex.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1a7e3f838077..5187added8bc 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -141,8 +141,19 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) -# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex *lock, + struct task_struct *old, + struct task_struct *new) +{ + return cmpxchg_acquire(&lock->owner, old, new) == old; +} + +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex *lock, + struct task_struct *old, + struct task_struct *new) +{ + return cmpxchg_release(&lock->owner, old, new) == old; +} /* * Callers must hold the ->wait_lock -- which is the whole purpose as we force @@ -201,8 +212,20 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_acquire(l,c,n) (0) -# define rt_mutex_cmpxchg_release(l,c,n) (0) +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex *lock, + struct task_struct *old, + struct task_struct *new) +{ + return false; + +} + +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex *lock, + struct task_struct *old, + struct task_struct *new) +{ + return false; +} static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { -- cgit v1.2.3-71-gd317 From 709e0b62869f625afd18edd79f190c38cb39dfb2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:55 +0200 Subject: locking/rtmutex: Switch to from cmpxchg_*() to try_cmpxchg_*() Allows the compiler to generate better code depending on the architecture. Suggested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.668958502@linutronix.de --- kernel/locking/rtmutex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5187added8bc..98f06c509f02 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -145,14 +145,14 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex *lock, struct task_struct *old, struct task_struct *new) { - return cmpxchg_acquire(&lock->owner, old, new) == old; + return try_cmpxchg_acquire(&lock->owner, &old, new); } static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex *lock, struct task_struct *old, struct task_struct *new) { - return cmpxchg_release(&lock->owner, old, new) == old; + return try_cmpxchg_release(&lock->owner, &old, new); } /* -- cgit v1.2.3-71-gd317 From 531ae4b06a737ed5539cd75dc6f6b9a28f900bba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:27:57 +0200 Subject: locking/rtmutex: Split API from implementation Prepare for reusing the inner functions of rtmutex for RT lock substitutions: introduce kernel/locking/rtmutex_api.c and move them there. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.726560996@linutronix.de --- kernel/locking/Makefile | 2 +- kernel/locking/rtmutex.c | 479 ++-------------------------------------- kernel/locking/rtmutex_api.c | 453 +++++++++++++++++++++++++++++++++++++ kernel/locking/rtmutex_common.h | 78 ++++--- 4 files changed, 514 insertions(+), 498 deletions(-) create mode 100644 kernel/locking/rtmutex_api.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 3572808223e4..269f55e1e431 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -24,7 +24,7 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 98f06c509f02..cd0e1a43b07a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -11,14 +11,12 @@ * * See Documentation/locking/rt-mutex-design.rst for details. */ -#include -#include +#include +#include +#include #include #include -#include #include -#include -#include #include "rtmutex_common.h" @@ -371,11 +369,6 @@ rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, return chwalk == RT_MUTEX_FULL_CHAINWALK; } -/* - * Max number of times we'll walk the boosting chain: - */ -int max_lock_depth = 1024; - static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) { return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; @@ -1112,42 +1105,6 @@ static void __sched remove_waiter(struct rt_mutex *lock, raw_spin_lock_irq(&lock->wait_lock); } -/* - * Recheck the pi chain, in case we got a priority setting - * - * Called from sched_setscheduler - */ -void __sched rt_mutex_adjust_pi(struct task_struct *task) -{ - struct rt_mutex_waiter *waiter; - struct rt_mutex *next_lock; - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return; - } - next_lock = waiter->lock; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); - - rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, - next_lock, NULL, task); -} - -void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - debug_rt_mutex_init_waiter(waiter); - RB_CLEAR_NODE(&waiter->pi_tree_entry); - RB_CLEAR_NODE(&waiter->tree_entry); - waiter->task = NULL; -} - /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take @@ -1274,6 +1231,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state, return ret; } +static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, + unsigned int state) +{ + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 0; + + return rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); +} + static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1316,21 +1282,16 @@ static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock) return ret; } -/* - * Performs the wakeup of the top-waiter and re-enables preemption. - */ -void __sched rt_mutex_postunlock(struct wake_q_head *wake_q) +static __always_inline int __rt_mutex_trylock(struct rt_mutex *lock) { - wake_up_q(wake_q); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 1; - /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ - preempt_enable(); + return rt_mutex_slowtrylock(lock); } /* * Slow path to release a rt-mutex. - * - * Return whether the current task needs to call rt_mutex_postunlock(). */ static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { @@ -1393,416 +1354,10 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) rt_mutex_postunlock(&wake_q); } -/* - * debug aware fast / slowpath lock,trylock,unlock - * - * The atomic acquire/release ops are compiled away, when either the - * architecture does not support cmpxchg or when debugging is enabled. - */ -static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, long state, - unsigned int subclass) -{ - int ret; - - might_sleep(); - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 0; - - ret = rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); - return ret; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -/** - * rt_mutex_lock_nested - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * @subclass: the lockdep subclass - */ -void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) -{ - __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); - -#else /* !CONFIG_DEBUG_LOCK_ALLOC */ - -/** - * rt_mutex_lock - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - */ -void __sched rt_mutex_lock(struct rt_mutex *lock) -{ - __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock); -#endif - -/** - * rt_mutex_lock_interruptible - lock a rt_mutex interruptible - * - * @lock: the rt_mutex to be locked - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) -{ - return __rt_mutex_lock(lock, TASK_INTERRUPTIBLE, 0); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); - -/** - * rt_mutex_trylock - try to lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * - * This function can only be called in thread context. It's safe to call it - * from atomic regions, but not from hard or soft interrupt context. - * - * Returns: - * 1 on success - * 0 on contention - */ -int __sched rt_mutex_trylock(struct rt_mutex *lock) +static __always_inline void __rt_mutex_unlock(struct rt_mutex *lock) { - int ret; - - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) - return 0; - - /* - * No lockdep annotation required because lockdep disables the fast - * path. - */ - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 1; - - ret = rt_mutex_slowtrylock(lock); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - - return ret; -} -EXPORT_SYMBOL_GPL(rt_mutex_trylock); - -/** - * rt_mutex_unlock - unlock a rt_mutex - * - * @lock: the rt_mutex to be unlocked - */ -void __sched rt_mutex_unlock(struct rt_mutex *lock) -{ - mutex_release(&lock->dep_map, _RET_IP_); if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) return; rt_mutex_slowunlock(lock); } -EXPORT_SYMBOL_GPL(rt_mutex_unlock); - -/* - * Futex variants, must not use fastpath. - */ -int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return rt_mutex_slowtrylock(lock); -} - -int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return __rt_mutex_slowtrylock(lock); -} - -/** - * __rt_mutex_futex_unlock - Futex variant, that since futex variants - * do not use the fast-path, can be simple and will not need to retry. - * - * @lock: The rt_mutex to be unlocked - * @wake_q: The wake queue head from which to get the next lock waiter - */ -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) -{ - lockdep_assert_held(&lock->wait_lock); - - debug_rt_mutex_unlock(lock); - - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - return false; /* done */ - } - - /* - * We've already deboosted, mark_wakeup_next_waiter() will - * retain preempt_disabled when we drop the wait_lock, to - * avoid inversion prior to the wakeup. preempt_disable() - * therein pairs with rt_mutex_postunlock(). - */ - mark_wakeup_next_waiter(wake_q, lock); - - return true; /* call postunlock() */ -} - -void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) -{ - DEFINE_WAKE_Q(wake_q); - unsigned long flags; - bool postunlock; - - raw_spin_lock_irqsave(&lock->wait_lock, flags); - postunlock = __rt_mutex_futex_unlock(lock, &wake_q); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - - if (postunlock) - rt_mutex_postunlock(&wake_q); -} - -/** - * __rt_mutex_init - initialize the rt_mutex - * - * @lock: The rt_mutex to be initialized - * @name: The lock name used for debugging - * @key: The lock class key used for debugging - * - * Initialize the rt_mutex to unlocked state. - * - * Initializing of a locked rt_mutex is not allowed - */ -void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, - struct lock_class_key *key) -{ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); - - __rt_mutex_basic_init(lock); -} -EXPORT_SYMBOL_GPL(__rt_mutex_init); - -/** - * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a - * proxy owner - * - * @lock: the rt_mutex to be locked - * @proxy_owner:the task to set as owner - * - * No locking. Caller has to do serializing itself - * - * Special API call for PI-futex support. This initializes the rtmutex and - * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not - * possible at this point because the pi_state which contains the rtmutex - * is not yet visible to other tasks. - */ -void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - __rt_mutex_basic_init(lock); - rt_mutex_set_owner(lock, proxy_owner); -} - -/** - * rt_mutex_proxy_unlock - release a lock on behalf of owner - * - * @lock: the rt_mutex to be locked - * - * No locking. Caller has to do serializing itself - * - * Special API call for PI-futex support. This merrily cleans up the rtmutex - * (debugging) state. Concurrent operations on this rt_mutex are not - * possible because it belongs to the pi_state which is about to be freed - * and it is not longer visible to other tasks. - */ -void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock) -{ - debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL); -} - -/** - * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock - * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. - * - * NOTE: does _NOT_ remove the @waiter on failure; must either call - * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for PI-futex support. - */ -int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) -{ - int ret; - - lockdep_assert_held(&lock->wait_lock); - - if (try_to_take_rt_mutex(lock, task, NULL)) - return 1; - - /* We enforce deadlock detection for futexes */ - ret = task_blocks_on_rt_mutex(lock, waiter, task, - RT_MUTEX_FULL_CHAINWALK); - - if (ret && !rt_mutex_owner(lock)) { - /* - * Reset the return value. We might have - * returned with -EDEADLK and the owner - * released the lock while we were walking the - * pi chain. Let the waiter sort it out. - */ - ret = 0; - } - - return ret; -} - -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock - * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. - * - * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter - * on failure. - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for PI-futex support. - */ -int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) -{ - int ret; - - raw_spin_lock_irq(&lock->wait_lock); - ret = __rt_mutex_start_proxy_lock(lock, waiter, task); - if (unlikely(ret)) - remove_waiter(lock, waiter); - raw_spin_unlock_irq(&lock->wait_lock); - - return ret; -} - -/** - * rt_mutex_wait_proxy_lock() - Wait for lock acquisition - * @lock: the rt_mutex we were woken on - * @to: the timeout, null if none. hrtimer should already have - * been started. - * @waiter: the pre-initialized rt_mutex_waiter - * - * Wait for the lock acquisition started on our behalf by - * rt_mutex_start_proxy_lock(). Upon failure, the caller must call - * rt_mutex_cleanup_proxy_lock(). - * - * Returns: - * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT - * - * Special API call for PI-futex support - */ -int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter) -{ - int ret; - - raw_spin_lock_irq(&lock->wait_lock); - /* sleep on the mutex */ - set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - raw_spin_unlock_irq(&lock->wait_lock); - - return ret; -} - -/** - * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition - * @lock: the rt_mutex we were woken on - * @waiter: the pre-initialized rt_mutex_waiter - * - * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or - * rt_mutex_wait_proxy_lock(). - * - * Unless we acquired the lock; we're still enqueued on the wait-list and can - * in fact still be granted ownership until we're removed. Therefore we can - * find we are in fact the owner and must disregard the - * rt_mutex_wait_proxy_lock() failure. - * - * Returns: - * true - did the cleanup, we done. - * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, - * caller should disregards its return value. - * - * Special API call for PI-futex support - */ -bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) -{ - bool cleanup = false; - - raw_spin_lock_irq(&lock->wait_lock); - /* - * Do an unconditional try-lock, this deals with the lock stealing - * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() - * sets a NULL owner. - * - * We're not interested in the return value, because the subsequent - * test on rt_mutex_owner() will infer that. If the trylock succeeded, - * we will own the lock and it will have removed the waiter. If we - * failed the trylock, we're still not owner and we need to remove - * ourselves. - */ - try_to_take_rt_mutex(lock, current, waiter); - /* - * Unless we're the owner; we're still enqueued on the wait_list. - * So check if we became owner, if not, take us off the wait_list. - */ - if (rt_mutex_owner(lock) != current) { - remove_waiter(lock, waiter); - cleanup = true; - } - /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock_irq(&lock->wait_lock); - - return cleanup; -} - -#ifdef CONFIG_DEBUG_RT_MUTEXES -void rt_mutex_debug_task_free(struct task_struct *task) -{ - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); - DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); -} -#endif diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c new file mode 100644 index 000000000000..fc1322f5b219 --- /dev/null +++ b/kernel/locking/rtmutex_api.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rtmutex API + */ +#include +#include + +#include "rtmutex.c" + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. + */ +static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, + unsigned int state, + unsigned int subclass) +{ + int ret; + + might_sleep(); + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = __rt_mutex_lock(lock, state); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/** + * rt_mutex_lock_nested - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * @subclass: the lockdep subclass + */ +void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); +#endif + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * This function can only be called in thread context. It's safe to call it + * from atomic regions, but not from hard or soft interrupt context. + * + * Returns: + * 1 on success + * 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(lock); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(lock); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/* + * Futex variants, must not use fastpath. + */ +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return rt_mutex_slowtrylock(lock); +} + +int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + +/** + * __rt_mutex_futex_unlock - Futex variant, that since futex variants + * do not use the fast-path, can be simple and will not need to retry. + * + * @lock: The rt_mutex to be unlocked + * @wake_q: The wake queue head from which to get the next lock waiter + */ +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wake_q) +{ + lockdep_assert_held(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + return false; /* done */ + } + + /* + * We've already deboosted, mark_wakeup_next_waiter() will + * retain preempt_disabled when we drop the wait_lock, to + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). + */ + mark_wakeup_next_waiter(wake_q, lock); + + return true; /* call postunlock() */ +} + +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +{ + DEFINE_WAKE_Q(wake_q); + unsigned long flags; + bool postunlock; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + postunlock = __rt_mutex_futex_unlock(lock, &wake_q); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (postunlock) + rt_mutex_postunlock(&wake_q); +} + +/** + * __rt_mutex_init - initialize the rt_mutex + * + * @lock: The rt_mutex to be initialized + * @name: The lock name used for debugging + * @key: The lock class key used for debugging + * + * Initialize the rt_mutex to unlocked state. + * + * Initializing of a locked rt_mutex is not allowed + */ +void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); + + __rt_mutex_basic_init(lock); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + * proxy owner + * + * @lock: the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. + */ +void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + __rt_mutex_basic_init(lock); + rt_mutex_set_owner(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: the rt_mutex to be locked + * + * No locking. Caller has to do serializing itself + * + * Special API call for PI-futex support. This just cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is not longer visible to other tasks. + */ +void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock) +{ + debug_rt_mutex_proxy_unlock(lock); + rt_mutex_set_owner(lock, NULL); +} + +/** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: does _NOT_ remove the @waiter on failure; must either call + * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ +int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + lockdep_assert_held(&lock->wait_lock); + + if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; + + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, + RT_MUTEX_FULL_CHAINWALK); + + if (ret && !rt_mutex_owner(lock)) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + + return ret; +} + +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter + * on failure. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ +int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + if (unlikely(ret)) + remove_waiter(lock, waiter); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * + * Wait for the lock acquisition started on our behalf by + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call + * rt_mutex_cleanup_proxy_lock(). + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT + * + * Special API call for PI-futex support + */ +int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition + * @lock: the rt_mutex we were woken on + * @waiter: the pre-initialized rt_mutex_waiter + * + * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or + * rt_mutex_wait_proxy_lock(). + * + * Unless we acquired the lock; we're still enqueued on the wait-list and can + * in fact still be granted ownership until we're removed. Therefore we can + * find we are in fact the owner and must disregard the + * rt_mutex_wait_proxy_lock() failure. + * + * Returns: + * true - did the cleanup, we done. + * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, + * caller should disregards its return value. + * + * Special API call for PI-futex support + */ +bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) +{ + bool cleanup = false; + + raw_spin_lock_irq(&lock->wait_lock); + /* + * Do an unconditional try-lock, this deals with the lock stealing + * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() + * sets a NULL owner. + * + * We're not interested in the return value, because the subsequent + * test on rt_mutex_owner() will infer that. If the trylock succeeded, + * we will own the lock and it will have removed the waiter. If we + * failed the trylock, we're still not owner and we need to remove + * ourselves. + */ + try_to_take_rt_mutex(lock, current, waiter); + /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { + remove_waiter(lock, waiter); + cleanup = true; + } + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + raw_spin_unlock_irq(&lock->wait_lock); + + return cleanup; +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void __sched rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + struct rt_mutex *next_lock; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + next_lock = waiter->lock; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); +} + +/* + * Performs the wakeup of the top-waiter and re-enables preemption. + */ +void __sched rt_mutex_postunlock(struct wake_q_head *wake_q) +{ + wake_up_q(wake_q); + + /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ + preempt_enable(); +} + +#ifdef CONFIG_DEBUG_RT_MUTEXES +void rt_mutex_debug_task_free(struct task_struct *task) +{ + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); +} +#endif diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index a90c22abdbca..0f314a21d6ca 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -37,6 +37,33 @@ struct rt_mutex_waiter { u64 deadline; }; +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter); +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); + +extern int rt_mutex_futex_trylock(struct rt_mutex *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex *l); + +extern void rt_mutex_futex_unlock(struct rt_mutex *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wake_q); + +extern void rt_mutex_postunlock(struct wake_q_head *wake_q); + /* * Must be guarded because this header is included from rcu/tree_plugin.h * unconditionally. @@ -78,13 +105,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } -#else /* CONFIG_RT_MUTEXES */ -/* Used in rcu/tree_plugin.h */ -static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) -{ - return NULL; -} -#endif /* !CONFIG_RT_MUTEXES */ /* * Constants for rt mutex functions which have a selectable deadlock @@ -108,34 +128,6 @@ static inline void __rt_mutex_basic_init(struct rt_mutex *lock) lock->waiters = RB_ROOT_CACHED; } -/* - * PI-futex support (proxy locking functions, etc.): - */ -extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); -extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task); -extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task); -extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter); -extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter); - -extern int rt_mutex_futex_trylock(struct rt_mutex *l); -extern int __rt_mutex_futex_trylock(struct rt_mutex *l); - -extern void rt_mutex_futex_unlock(struct rt_mutex *lock); -extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); - -extern void rt_mutex_postunlock(struct wake_q_head *wake_q); - /* Debug functions */ static inline void debug_rt_mutex_unlock(struct rt_mutex *lock) { @@ -161,4 +153,20 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) memset(waiter, 0x22, sizeof(*waiter)); } +static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->task = NULL; +} + +#else /* CONFIG_RT_MUTEXES */ +/* Used in rcu/tree_plugin.h */ +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return NULL; +} +#endif /* !CONFIG_RT_MUTEXES */ + #endif -- cgit v1.2.3-71-gd317 From 830e6acc8a1cafe153a0d88f9b2455965b396131 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:27:58 +0200 Subject: locking/rtmutex: Split out the inner parts of 'struct rtmutex' RT builds substitutions for rwsem, mutex, spinlock and rwlock around rtmutexes. Split the inner working out so each lock substitution can use them with the appropriate lockdep annotations. This avoids having an extra unused lockdep map in the wrapped rtmutex. No functional change. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.784739994@linutronix.de --- include/linux/rtmutex.h | 23 +++++++++++---- kernel/futex.c | 4 +-- kernel/locking/rtmutex.c | 64 +++++++++++++++++++++-------------------- kernel/locking/rtmutex_api.c | 41 ++++++++++++++------------ kernel/locking/rtmutex_common.h | 38 ++++++++++++------------ kernel/rcu/tree_plugin.h | 6 ++-- 6 files changed, 97 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index cb0f441fecb9..852740285d71 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -19,6 +19,21 @@ extern int max_lock_depth; /* for sysctl */ +struct rt_mutex_base { + raw_spinlock_t wait_lock; + struct rb_root_cached waiters; + struct task_struct *owner; +}; + +#define __RT_MUTEX_BASE_INITIALIZER(rtbasename) \ +{ \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(rtbasename.wait_lock), \ + .waiters = RB_ROOT_CACHED, \ + .owner = NULL \ +} + +extern void rt_mutex_base_init(struct rt_mutex_base *rtb); + /** * The rt_mutex structure * @@ -28,9 +43,7 @@ extern int max_lock_depth; /* for sysctl */ * @owner: the mutex owner */ struct rt_mutex { - raw_spinlock_t wait_lock; - struct rb_root_cached waiters; - struct task_struct *owner; + struct rt_mutex_base rtmutex; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -63,9 +76,7 @@ do { \ #define __RT_MUTEX_INITIALIZER(mutexname) \ { \ - .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock), \ - .waiters = RB_ROOT_CACHED, \ - .owner = NULL, \ + .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex), \ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ } diff --git a/kernel/futex.c b/kernel/futex.c index 2ecb07575055..6eab24764f28 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -179,7 +179,7 @@ struct futex_pi_state { /* * The PI object: */ - struct rt_mutex pi_mutex; + struct rt_mutex_base pi_mutex; struct task_struct *owner; refcount_t refcount; @@ -3254,7 +3254,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = ret < 0 ? ret : 0; } } else { - struct rt_mutex *pi_mutex; + struct rt_mutex_base *pi_mutex; /* * We have been woken up by futex_unlock_pi(), a timeout, or a diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index cd0e1a43b07a..b31f6cbe3a30 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -48,7 +48,7 @@ */ static __always_inline void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) +rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) { unsigned long val = (unsigned long)owner; @@ -58,13 +58,13 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) WRITE_ONCE(lock->owner, (struct task_struct *)val); } -static __always_inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } -static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -139,14 +139,14 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex *lock, +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) { return try_cmpxchg_acquire(&lock->owner, &old, new); } -static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex *lock, +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) { @@ -158,7 +158,7 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex *lock, * all future threads that attempt to [Rmw] the lock to the slowpath. As such * relaxed semantics suffice. */ -static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -174,7 +174,7 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, unsigned long flags) __releases(lock->wait_lock) { @@ -210,7 +210,7 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex *lock, +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) { @@ -218,14 +218,14 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex *lock, } -static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex *lock, +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) { return false; } -static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -234,7 +234,7 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, unsigned long flags) __releases(lock->wait_lock) { @@ -295,13 +295,13 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod } static __always_inline void -rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); } static __always_inline void -rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { if (RB_EMPTY_NODE(&waiter->tree_entry)) return; @@ -369,7 +369,7 @@ rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, return chwalk == RT_MUTEX_FULL_CHAINWALK; } -static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p) { return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; } @@ -439,15 +439,15 @@ static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct */ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, enum rtmutex_chainwalk chwalk, - struct rt_mutex *orig_lock, - struct rt_mutex *next_lock, + struct rt_mutex_base *orig_lock, + struct rt_mutex_base *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; struct rt_mutex_waiter *prerequeue_top_waiter; int ret = 0, depth = 0; - struct rt_mutex *lock; + struct rt_mutex_base *lock; bool detect_deadlock; bool requeue = true; @@ -795,7 +795,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * callsite called task_blocked_on_lock(), otherwise NULL */ static int __sched -try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, +try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); @@ -913,14 +913,14 @@ takeit: * * This must be called with lock->wait_lock held and interrupts disabled */ -static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock, +static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, enum rtmutex_chainwalk chwalk) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - struct rt_mutex *next_lock; + struct rt_mutex_base *next_lock; int chain_walk = 0, res; lockdep_assert_held(&lock->wait_lock); @@ -1003,7 +1003,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock, * Called with lock->wait_lock held and interrupts disabled. */ static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q, - struct rt_mutex *lock) + struct rt_mutex_base *lock) { struct rt_mutex_waiter *waiter; @@ -1052,12 +1052,12 @@ static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q, * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex(). */ -static void __sched remove_waiter(struct rt_mutex *lock, +static void __sched remove_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock; + struct rt_mutex_base *next_lock; lockdep_assert_held(&lock->wait_lock); @@ -1115,7 +1115,8 @@ static void __sched remove_waiter(struct rt_mutex *lock, * * Must be called with lock->wait_lock held and interrupts disabled */ -static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state, +static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, + unsigned int state, struct hrtimer_sleeper *timeout, struct rt_mutex_waiter *waiter) { @@ -1170,7 +1171,8 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, /* * Slow path lock function: */ -static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state, +static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, + unsigned int state, struct hrtimer_sleeper *timeout, enum rtmutex_chainwalk chwalk) { @@ -1231,7 +1233,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state, return ret; } -static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, +static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, unsigned int state) { if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) @@ -1240,7 +1242,7 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, return rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } -static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1256,7 +1258,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path try-lock function: */ -static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock) { unsigned long flags; int ret; @@ -1282,7 +1284,7 @@ static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock) return ret; } -static __always_inline int __rt_mutex_trylock(struct rt_mutex *lock) +static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock) { if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 1; @@ -1293,7 +1295,7 @@ static __always_inline int __rt_mutex_trylock(struct rt_mutex *lock) /* * Slow path to release a rt-mutex. */ -static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) +static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) { DEFINE_WAKE_Q(wake_q); unsigned long flags; @@ -1354,7 +1356,7 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) rt_mutex_postunlock(&wake_q); } -static __always_inline void __rt_mutex_unlock(struct rt_mutex *lock) +static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) { if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) return; diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index fc1322f5b219..38de4b137b9e 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -26,12 +26,18 @@ static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, might_sleep(); mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - ret = __rt_mutex_lock(lock, state); + ret = __rt_mutex_lock(&lock->rtmutex, state); if (ret) mutex_release(&lock->dep_map, _RET_IP_); return ret; } +void rt_mutex_base_init(struct rt_mutex_base *rtb) +{ + __rt_mutex_base_init(rtb); +} +EXPORT_SYMBOL(rt_mutex_base_init); + #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rt_mutex_lock_nested - lock a rt_mutex @@ -93,7 +99,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; - ret = __rt_mutex_trylock(lock); + ret = __rt_mutex_trylock(&lock->rtmutex); if (ret) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); @@ -109,19 +115,19 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); void __sched rt_mutex_unlock(struct rt_mutex *lock) { mutex_release(&lock->dep_map, _RET_IP_); - __rt_mutex_unlock(lock); + __rt_mutex_unlock(&lock->rtmutex); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); /* * Futex variants, must not use fastpath. */ -int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) +int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock) { return rt_mutex_slowtrylock(lock); } -int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock) { return __rt_mutex_slowtrylock(lock); } @@ -133,7 +139,7 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) * @lock: The rt_mutex to be unlocked * @wake_q: The wake queue head from which to get the next lock waiter */ -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, +bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, struct wake_q_head *wake_q) { lockdep_assert_held(&lock->wait_lock); @@ -156,7 +162,7 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, return true; /* call postunlock() */ } -void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock) { DEFINE_WAKE_Q(wake_q); unsigned long flags; @@ -182,12 +188,11 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) * Initializing of a locked rt_mutex is not allowed */ void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key) { debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + __rt_mutex_base_init(&lock->rtmutex); lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); - - __rt_mutex_basic_init(lock); } EXPORT_SYMBOL_GPL(__rt_mutex_init); @@ -205,10 +210,10 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); * possible at this point because the pi_state which contains the rtmutex * is not yet visible to other tasks. */ -void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock, +void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, struct task_struct *proxy_owner) { - __rt_mutex_basic_init(lock); + __rt_mutex_base_init(lock); rt_mutex_set_owner(lock, proxy_owner); } @@ -224,7 +229,7 @@ void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock, * possible because it belongs to the pi_state which is about to be freed * and it is not longer visible to other tasks. */ -void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock) +void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); @@ -249,7 +254,7 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock) * * Special API call for PI-futex support. */ -int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock, +int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { @@ -296,7 +301,7 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock, * * Special API call for PI-futex support. */ -int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock, +int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { @@ -328,7 +333,7 @@ int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock, * * Special API call for PI-futex support */ -int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock, +int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter) { @@ -368,7 +373,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock, * * Special API call for PI-futex support */ -bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, +bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { bool cleanup = false; @@ -413,7 +418,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, void __sched rt_mutex_adjust_pi(struct task_struct *task) { struct rt_mutex_waiter *waiter; - struct rt_mutex *next_lock; + struct rt_mutex_base *next_lock; unsigned long flags; raw_spin_lock_irqsave(&task->pi_lock, flags); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 0f314a21d6ca..548285a5ed19 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -32,7 +32,7 @@ struct rt_mutex_waiter { struct rb_node tree_entry; struct rb_node pi_tree_entry; struct task_struct *task; - struct rt_mutex *lock; + struct rt_mutex_base *lock; int prio; u64 deadline; }; @@ -40,26 +40,26 @@ struct rt_mutex_waiter { /* * PI-futex support (proxy locking functions, etc.): */ -extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, +extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, +extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); -extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); -extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, +extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter); -extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter); -extern int rt_mutex_futex_trylock(struct rt_mutex *l); -extern int __rt_mutex_futex_trylock(struct rt_mutex *l); +extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); -extern void rt_mutex_futex_unlock(struct rt_mutex *lock); -extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, +extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, struct wake_q_head *wake_q); extern void rt_mutex_postunlock(struct wake_q_head *wake_q); @@ -69,12 +69,12 @@ extern void rt_mutex_postunlock(struct wake_q_head *wake_q); * unconditionally. */ #ifdef CONFIG_RT_MUTEXES -static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } -static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex *lock) +static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; @@ -99,7 +99,7 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) #define RT_MUTEX_HAS_WAITERS 1UL -static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); @@ -121,21 +121,21 @@ enum rtmutex_chainwalk { RT_MUTEX_FULL_CHAINWALK, }; -static inline void __rt_mutex_basic_init(struct rt_mutex *lock) +static inline void __rt_mutex_base_init(struct rt_mutex_base *lock) { - lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT_CACHED; + lock->owner = NULL; } /* Debug functions */ -static inline void debug_rt_mutex_unlock(struct rt_mutex *lock) +static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock) { if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } -static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock) { if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); @@ -163,7 +163,7 @@ static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) #else /* CONFIG_RT_MUTEXES */ /* Used in rcu/tree_plugin.h */ -static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) { return NULL; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index de1dc3bb7f70..0ff5e4fb933e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -559,7 +559,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ - drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t; if (&t->rcu_node_entry == rnp->boost_tasks) WRITE_ONCE(rnp->boost_tasks, np); } @@ -586,7 +586,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) /* Unboost if we were boosted. */ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_futex_unlock(&rnp->boost_mtx); + rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); /* * If this was the last task on the expedited lists, @@ -1083,7 +1083,7 @@ static int rcu_boost(struct rcu_node *rnp) * section. */ t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); + rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); -- cgit v1.2.3-71-gd317 From ebbdc41e90ffce8b6bb3cbba1801ede2dd07a89b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:00 +0200 Subject: locking/rtmutex: Provide rt_mutex_slowlock_locked() Split the inner workings of rt_mutex_slowlock() out into a separate function, which can be reused by the upcoming RT lock substitutions, e.g. for rw_semaphores. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.841971086@linutronix.de --- kernel/locking/rtmutex.c | 100 +++++++++++++++++++++++++------------------ kernel/locking/rtmutex_api.c | 2 +- 2 files changed, 59 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b31f6cbe3a30..3d0b29cb5e63 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1106,7 +1106,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, } /** - * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE * or TASK_UNINTERRUPTIBLE) @@ -1115,10 +1115,10 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, * * Must be called with lock->wait_lock held and interrupts disabled */ -static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, - unsigned int state, - struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) +static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, + unsigned int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter) { int ret = 0; @@ -1168,52 +1168,37 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, } } -/* - * Slow path lock function: +/** + * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held + * @lock: The rtmutex to block lock + * @state: The task state for sleeping + * @chwalk: Indicator whether full or partial chainwalk is requested + * @waiter: Initializer waiter for blocking */ -static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, - unsigned int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk) +static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, + unsigned int state, + enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *waiter) { - struct rt_mutex_waiter waiter; - unsigned long flags; - int ret = 0; - - rt_mutex_init_waiter(&waiter); + int ret; - /* - * Technically we could use raw_spin_[un]lock_irq() here, but this can - * be called in early boot if the cmpxchg() fast path is disabled - * (debug, no architecture support). In this case we will acquire the - * rtmutex with lock->wait_lock held. But we cannot unconditionally - * enable interrupts in that early boot case. So we need to use the - * irqsave/restore variants. - */ - raw_spin_lock_irqsave(&lock->wait_lock, flags); + lockdep_assert_held(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + if (try_to_take_rt_mutex(lock, current, NULL)) return 0; - } set_current_state(state); - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - - ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); + ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk); if (likely(!ret)) - /* sleep on the mutex */ - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); + ret = rt_mutex_slowlock_block(lock, state, NULL, waiter); if (unlikely(ret)) { __set_current_state(TASK_RUNNING); - remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, chwalk, &waiter); + remove_waiter(lock, waiter); + rt_mutex_handle_deadlock(ret, chwalk, waiter); } /* @@ -1221,14 +1206,45 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, * unconditionally. We might have to fix that up. */ fixup_rt_mutex_waiters(lock); + return ret; +} - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); +static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, + unsigned int state) +{ + struct rt_mutex_waiter waiter; + int ret; + + rt_mutex_init_waiter(&waiter); - /* Remove pending timer: */ - if (unlikely(timeout)) - hrtimer_cancel(&timeout->timer); + ret = __rt_mutex_slowlock(lock, state, RT_MUTEX_MIN_CHAINWALK, &waiter); debug_rt_mutex_free_waiter(&waiter); + return ret; +} + +/* + * rt_mutex_slowlock - Locking slowpath invoked when fast path fails + * @lock: The rtmutex to block lock + * @state: The task state for sleeping + */ +static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, + unsigned int state) +{ + unsigned long flags; + int ret; + + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); + ret = __rt_mutex_slowlock_locked(lock, state); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -1239,7 +1255,7 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - return rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + return rt_mutex_slowlock(lock, state); } static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 38de4b137b9e..c5136f4998bb 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -342,7 +342,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + ret = rt_mutex_slowlock_block(lock, TASK_INTERRUPTIBLE, to, waiter); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. -- cgit v1.2.3-71-gd317 From 943f0edb754fac195043c620b44f920e4fb76ec8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:03 +0200 Subject: locking/rt: Add base code for RT rw_semaphore and rwlock On PREEMPT_RT, rw_semaphores and rwlocks are substituted with an rtmutex and a reader count. The implementation is writer unfair, as it is not feasible to do priority inheritance on multiple readers, but experience has shown that real-time workloads are not the typical workloads which are sensitive to writer starvation. The inner workings of rw_semaphores and rwlocks on RT are almost identical except for the task state and signal handling. rw_semaphores are not state preserving over a contention, they are expected to enter and leave with state == TASK_RUNNING. rwlocks have a mechanism to preserve the state of the task at entry and restore it after unblocking taking potential non-lock related wakeups into account. rw_semaphores can also be subject to signal handling interrupting a blocked state, while rwlocks ignore signals. To avoid code duplication, provide a shared implementation which takes the small difference vs. state and signals into account. The code is included into the relevant rw_semaphore/rwlock base code and compiled for each use case separately. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211302.957920571@linutronix.de --- include/linux/rwbase_rt.h | 39 +++++++ kernel/locking/rwbase_rt.c | 263 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 302 insertions(+) create mode 100644 include/linux/rwbase_rt.h create mode 100644 kernel/locking/rwbase_rt.c (limited to 'kernel') diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h new file mode 100644 index 000000000000..1d264dd08625 --- /dev/null +++ b/include/linux/rwbase_rt.h @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef _LINUX_RWBASE_RT_H +#define _LINUX_RWBASE_RT_H + +#include +#include + +#define READER_BIAS (1U << 31) +#define WRITER_BIAS (1U << 30) + +struct rwbase_rt { + atomic_t readers; + struct rt_mutex_base rtmutex; +}; + +#define __RWBASE_INITIALIZER(name) \ +{ \ + .readers = ATOMIC_INIT(READER_BIAS), \ + .rtmutex = __RT_MUTEX_BASE_INITIALIZER(name.rtmutex), \ +} + +#define init_rwbase_rt(rwbase) \ + do { \ + rt_mutex_base_init(&(rwbase)->rtmutex); \ + atomic_set(&(rwbase)->readers, READER_BIAS); \ + } while (0) + + +static __always_inline bool rw_base_is_locked(struct rwbase_rt *rwb) +{ + return atomic_read(&rwb->readers) != READER_BIAS; +} + +static __always_inline bool rw_base_is_contended(struct rwbase_rt *rwb) +{ + return atomic_read(&rwb->readers) > 0; +} + +#endif /* _LINUX_RWBASE_RT_H */ diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c new file mode 100644 index 000000000000..4ba15088e640 --- /dev/null +++ b/kernel/locking/rwbase_rt.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * RT-specific reader/writer semaphores and reader/writer locks + * + * down_write/write_lock() + * 1) Lock rtmutex + * 2) Remove the reader BIAS to force readers into the slow path + * 3) Wait until all readers have left the critical section + * 4) Mark it write locked + * + * up_write/write_unlock() + * 1) Remove the write locked marker + * 2) Set the reader BIAS, so readers can use the fast path again + * 3) Unlock rtmutex, to release blocked readers + * + * down_read/read_lock() + * 1) Try fast path acquisition (reader BIAS is set) + * 2) Take tmutex::wait_lock, which protects the writelocked flag + * 3) If !writelocked, acquire it for read + * 4) If writelocked, block on tmutex + * 5) unlock rtmutex, goto 1) + * + * up_read/read_unlock() + * 1) Try fast path release (reader count != 1) + * 2) Wake the writer waiting in down_write()/write_lock() #3 + * + * down_read/read_lock()#3 has the consequence, that rw semaphores and rw + * locks on RT are not writer fair, but writers, which should be avoided in + * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL + * inheritance mechanism. + * + * It's possible to make the rw primitives writer fair by keeping a list of + * active readers. A blocked writer would force all newly incoming readers + * to block on the rtmutex, but the rtmutex would have to be proxy locked + * for one reader after the other. We can't use multi-reader inheritance + * because there is no way to support that with SCHED_DEADLINE. + * Implementing the one by one reader boosting/handover mechanism is a + * major surgery for a very dubious value. + * + * The risk of writer starvation is there, but the pathological use cases + * which trigger it are not necessarily the typical RT workloads. + * + * Common code shared between RT rw_semaphore and rwlock + */ + +static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) +{ + int r; + + /* + * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is + * set. + */ + for (r = atomic_read(&rwb->readers); r < 0;) { + if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1))) + return 1; + } + return 0; +} + +static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + int ret; + + raw_spin_lock_irq(&rtm->wait_lock); + /* + * Allow readers, as long as the writer has not completely + * acquired the semaphore for write. + */ + if (atomic_read(&rwb->readers) != WRITER_BIAS) { + atomic_inc(&rwb->readers); + raw_spin_unlock_irq(&rtm->wait_lock); + return 0; + } + + /* + * Call into the slow lock path with the rtmutex->wait_lock + * held, so this can't result in the following race: + * + * Reader1 Reader2 Writer + * down_read() + * down_write() + * rtmutex_lock(m) + * wait() + * down_read() + * unlock(m->wait_lock) + * up_read() + * wake(Writer) + * lock(m->wait_lock) + * sem->writelocked=true + * unlock(m->wait_lock) + * + * up_write() + * sem->writelocked=false + * rtmutex_unlock(m) + * down_read() + * down_write() + * rtmutex_lock(m) + * wait() + * rtmutex_lock(m) + * + * That would put Reader1 behind the writer waiting on + * Reader2 to call up_read(), which might be unbound. + */ + + /* + * For rwlocks this returns 0 unconditionally, so the below + * !ret conditionals are optimized out. + */ + ret = rwbase_rtmutex_slowlock_locked(rtm, state); + + /* + * On success the rtmutex is held, so there can't be a writer + * active. Increment the reader count and immediately drop the + * rtmutex again. + * + * rtmutex->wait_lock has to be unlocked in any case of course. + */ + if (!ret) + atomic_inc(&rwb->readers); + raw_spin_unlock_irq(&rtm->wait_lock); + if (!ret) + rwbase_rtmutex_unlock(rtm); + return ret; +} + +static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + if (rwbase_read_trylock(rwb)) + return 0; + + return __rwbase_read_lock(rwb, state); +} + +static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + struct task_struct *owner; + + raw_spin_lock_irq(&rtm->wait_lock); + /* + * Wake the writer, i.e. the rtmutex owner. It might release the + * rtmutex concurrently in the fast path (due to a signal), but to + * clean up rwb->readers it needs to acquire rtm->wait_lock. The + * worst case which can happen is a spurious wakeup. + */ + owner = rt_mutex_owner(rtm); + if (owner) + wake_up_state(owner, state); + + raw_spin_unlock_irq(&rtm->wait_lock); +} + +static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, + unsigned int state) +{ + /* + * rwb->readers can only hit 0 when a writer is waiting for the + * active readers to leave the critical section. + */ + if (unlikely(atomic_dec_and_test(&rwb->readers))) + __rwbase_read_unlock(rwb, state); +} + +static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, + unsigned long flags) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + + atomic_add(READER_BIAS - bias, &rwb->readers); + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_rtmutex_unlock(rtm); +} + +static inline void rwbase_write_unlock(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + __rwbase_write_unlock(rwb, WRITER_BIAS, flags); +} + +static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + /* Release it and account current as reader */ + __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); +} + +static int __sched rwbase_write_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + /* Take the rtmutex as a first step */ + if (rwbase_rtmutex_lock_state(rtm, state)) + return -EINTR; + + /* Force readers into slow path */ + atomic_sub(READER_BIAS, &rwb->readers); + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + /* + * set_current_state() for rw_semaphore + * current_save_and_set_rtlock_wait_state() for rwlock + */ + rwbase_set_and_save_current_state(state); + + /* Block until all readers have left the critical section. */ + for (; atomic_read(&rwb->readers);) { + /* Optimized out for rwlocks */ + if (rwbase_signal_pending_state(state, current)) { + __set_current_state(TASK_RUNNING); + __rwbase_write_unlock(rwb, 0, flags); + return -EINTR; + } + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + + /* + * Schedule and wait for the readers to leave the critical + * section. The last reader leaving it wakes the waiter. + */ + if (atomic_read(&rwb->readers) != 0) + rwbase_schedule(); + set_current_state(state); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + } + + atomic_set(&rwb->readers, WRITER_BIAS); + rwbase_restore_current_state(); + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + return 0; +} + +static inline int rwbase_write_trylock(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + if (!rwbase_rtmutex_trylock(rtm)) + return 0; + + atomic_sub(READER_BIAS, &rwb->readers); + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + if (!atomic_read(&rwb->readers)) { + atomic_set(&rwb->readers, WRITER_BIAS); + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + return 1; + } + __rwbase_write_unlock(rwb, 0, flags); + return 0; +} -- cgit v1.2.3-71-gd317 From 42254105dfe871a0dc4f9d376106aeb010e54341 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:05 +0200 Subject: locking/rwsem: Add rtmutex based R/W semaphore implementation The RT specific R/W semaphore implementation used to restrict the number of readers to one, because a writer cannot block on multiple readers and inherit its priority or budget. The single reader restricting was painful in various ways: - Performance bottleneck for multi-threaded applications in the page fault path (mmap sem) - Progress blocker for drivers which are carefully crafted to avoid the potential reader/writer deadlock in mainline. The analysis of the writer code paths shows that properly written RT tasks should not take them. Syscalls like mmap(), file access which take mmap sem write locked have unbound latencies, which are completely unrelated to mmap sem. Other R/W sem users like graphics drivers are not suitable for RT tasks either. So there is little risk to hurt RT tasks when the RT rwsem implementation is done in the following way: - Allow concurrent readers - Make writers block until the last reader left the critical section. This blocking is not subject to priority/budget inheritance. - Readers blocked on a writer inherit their priority/budget in the normal way. There is a drawback with this scheme: R/W semaphores become writer unfair though the applications which have triggered writer starvation (mostly on mmap_sem) in the past are not really the typical workloads running on a RT system. So while it's unlikely to hit writer starvation, it's possible. If there are unexpected workloads on RT systems triggering it, the problem has to be revisited. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.016885947@linutronix.de --- include/linux/rwsem.h | 78 ++++++++++++++++++++++++++++++----- kernel/locking/rwsem.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index a66038d88878..426e98e0b675 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,6 +16,19 @@ #include #include #include + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_SLEEP, \ + }, +#else +# define __RWSEM_DEP_MAP_INIT(lockname) +#endif + +#ifndef CONFIG_PREEMPT_RT + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include #endif @@ -64,16 +77,6 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) /* Common initializer macros and functions */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) \ - .dep_map = { \ - .name = #lockname, \ - .wait_type_inner = LD_WAIT_SLEEP, \ - }, -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - #ifdef CONFIG_DEBUG_RWSEMS # define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else @@ -119,6 +122,61 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) return !list_empty(&sem->wait_list); } +#else /* !CONFIG_PREEMPT_RT */ + +#include + +struct rw_semaphore { + struct rwbase_rt rwbase; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __RWSEM_INITIALIZER(name) \ + { \ + .rwbase = __RWBASE_INITIALIZER(name), \ + __RWSEM_DEP_MAP_INIT(name) \ + } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key); +#else +static inline void __rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key) +{ +} +#endif + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + init_rwbase_rt(&(sem)->rwbase); \ + __rwsem_init((sem), #sem, &__key); \ +} while (0) + +static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return rw_base_is_locked(&sem->rwbase); +} + +static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) +{ + return rw_base_is_contended(&sem->rwbase); +} + +#endif /* CONFIG_PREEMPT_RT */ + +/* + * The functions below are the same for all rwsem implementations including + * the RT specific variant. + */ + /* * lock for reading */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 8a595b60ca9b..c017f9f2874b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,6 +28,7 @@ #include #include +#ifndef CONFIG_PREEMPT_RT #include "lock_events.h" /* @@ -1344,6 +1345,113 @@ static inline void __downgrade_write(struct rw_semaphore *sem) rwsem_downgrade_wake(sem); } +#else /* !CONFIG_PREEMPT_RT */ + +#include "rtmutex.c" + +#define rwbase_set_and_save_current_state(state) \ + set_current_state(state) + +#define rwbase_restore_current_state() \ + __set_current_state(TASK_RUNNING) + +#define rwbase_rtmutex_lock_state(rtm, state) \ + __rt_mutex_lock(rtm, state) + +#define rwbase_rtmutex_slowlock_locked(rtm, state) \ + __rt_mutex_slowlock_locked(rtm, state) + +#define rwbase_rtmutex_unlock(rtm) \ + __rt_mutex_unlock(rtm) + +#define rwbase_rtmutex_trylock(rtm) \ + __rt_mutex_trylock(rtm) + +#define rwbase_signal_pending_state(state, current) \ + signal_pending_state(state, current) + +#define rwbase_schedule() \ + schedule() + +#include "rwbase_rt.c" + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rwsem_init(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(__rwsem_init); +#endif + +static inline void __down_read(struct rw_semaphore *sem) +{ + rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); +} + +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE); +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + return rwbase_read_trylock(&sem->rwbase); +} + +static inline void __up_read(struct rw_semaphore *sem) +{ + rwbase_read_unlock(&sem->rwbase, TASK_NORMAL); +} + +static inline void __sched __down_write(struct rw_semaphore *sem) +{ + rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); +} + +static inline int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE); +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + return rwbase_write_trylock(&sem->rwbase); +} + +static inline void __up_write(struct rw_semaphore *sem) +{ + rwbase_write_unlock(&sem->rwbase); +} + +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + rwbase_write_downgrade(&sem->rwbase); +} + +/* Debug stubs for the common API */ +#define DEBUG_RWSEMS_WARN_ON(c, sem) + +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ +} + +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +{ + int count = atomic_read(&sem->rwbase.readers); + + return count < 0 && count != READER_BIAS; +} + +#endif /* CONFIG_PREEMPT_RT */ + /* * lock for reading */ -- cgit v1.2.3-71-gd317 From c014ef69b3acdb8c9e7fc412e96944f4d5c36fa0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:06 +0200 Subject: locking/rtmutex: Add wake_state to rt_mutex_waiter Regular sleeping locks like mutexes, rtmutexes and rw_semaphores are always entering and leaving a blocking section with task state == TASK_RUNNING. On a non-RT kernel spinlocks and rwlocks never affect the task state, but on RT kernels these locks are converted to rtmutex based 'sleeping' locks. So in case of contention the task goes to block, which requires to carefully preserve the task state, and restore it after acquiring the lock taking regular wakeups for the task into account, which happened while the task was blocked. This state preserving is achieved by having a separate task state for blocking on a RT spin/rwlock and a saved_state field in task_struct along with careful handling of these wakeup scenarios in try_to_wake_up(). To avoid conditionals in the rtmutex code, store the wake state which has to be used for waking a lock waiter in rt_mutex_waiter which allows to handle the regular and RT spin/rwlocks by handing it to wake_up_state(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.079800739@linutronix.de --- kernel/locking/rtmutex.c | 2 +- kernel/locking/rtmutex_common.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3d0b29cb5e63..c13b9b849a4b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -692,7 +692,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * to get the lock. */ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); + wake_up_state(waiter->task, waiter->wake_state); raw_spin_unlock_irq(&lock->wait_lock); return 0; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 548285a5ed19..fcc55de46b28 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -25,6 +25,7 @@ * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task * @lock: Pointer to the rt_mutex on which the waiter blocks + * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) * @prio: Priority of the waiter * @deadline: Deadline of the waiter if applicable */ @@ -33,6 +34,7 @@ struct rt_mutex_waiter { struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex_base *lock; + unsigned int wake_state; int prio; u64 deadline; }; @@ -158,9 +160,16 @@ static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) debug_rt_mutex_init_waiter(waiter); RB_CLEAR_NODE(&waiter->pi_tree_entry); RB_CLEAR_NODE(&waiter->tree_entry); + waiter->wake_state = TASK_NORMAL; waiter->task = NULL; } +static inline void rtlock_init_rtmutex_waiter(struct rt_mutex_waiter *waiter) +{ + rt_mutex_init_waiter(waiter); + waiter->wake_state = TASK_RTLOCK_WAIT; +} + #else /* CONFIG_RT_MUTEXES */ /* Used in rcu/tree_plugin.h */ static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -- cgit v1.2.3-71-gd317 From b576e640ce5e22673e12949cf14ae3cb18d9b859 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:08 +0200 Subject: locking/rtmutex: Provide rt_wake_q_head and helpers To handle the difference between wakeups for regular sleeping locks (mutex, rtmutex, rw_semaphore) and the wakeups for 'sleeping' spin/rwlocks on PREEMPT_RT enabled kernels correctly, it is required to provide a wake_q_head construct which allows to keep them separate. Provide a wrapper around wake_q_head and the required helpers, which will be extended with the state handling later. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.139337655@linutronix.de --- kernel/locking/rtmutex.c | 15 +++++++++++++++ kernel/locking/rtmutex_common.h | 14 ++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c13b9b849a4b..35f7685d7460 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -347,6 +347,21 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) rt_mutex_setprio(p, pi_task); } +/* RT mutex specific wake_q wrappers */ +static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, + struct rt_mutex_waiter *w) +{ + wake_q_add(&wqh->head, w->task); +} + +static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh) +{ + wake_up_q(&wqh->head); + + /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ + preempt_enable(); +} + /* * Deadlock detection is conditional: * diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index fcc55de46b28..9e2f1dbf6482 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -39,6 +39,20 @@ struct rt_mutex_waiter { u64 deadline; }; +/** + * rt_wake_q_head - Wrapper around regular wake_q_head to support + * "sleeping" spinlocks on RT + * @head: The regular wake_q_head for sleeping lock variants + */ +struct rt_wake_q_head { + struct wake_q_head head; +}; + +#define DEFINE_RT_WAKE_Q(name) \ + struct rt_wake_q_head name = { \ + .head = WAKE_Q_HEAD_INITIALIZER(name.head), \ + } + /* * PI-futex support (proxy locking functions, etc.): */ -- cgit v1.2.3-71-gd317 From 7980aa397cc0968ea3ffee7a985c31c92ad84f81 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:09 +0200 Subject: locking/rtmutex: Use rt_mutex_wake_q_head Prepare for the required state aware handling of waiter wakeups via wake_q and switch the rtmutex code over to the rtmutex specific wrapper. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.197113263@linutronix.de --- kernel/futex.c | 8 ++++---- kernel/locking/rtmutex.c | 12 ++++++------ kernel/locking/rtmutex_api.c | 19 ++++++++----------- kernel/locking/rtmutex_common.h | 4 ++-- 4 files changed, 20 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6eab24764f28..21625cb3e865 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1493,11 +1493,11 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) */ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - u32 curval, newval; struct rt_mutex_waiter *top_waiter; struct task_struct *new_owner; bool postunlock = false; - DEFINE_WAKE_Q(wake_q); + DEFINE_RT_WAKE_Q(wqh); + u32 curval, newval; int ret = 0; top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); @@ -1549,14 +1549,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ * not fail. */ pi_state_update_owner(pi_state, new_owner); - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); } out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); if (postunlock) - rt_mutex_postunlock(&wake_q); + rt_mutex_postunlock(&wqh); return ret; } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 35f7685d7460..5f0d0725ca32 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1017,7 +1017,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, * * Called with lock->wait_lock held and interrupts disabled. */ -static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q, +static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, struct rt_mutex_base *lock) { struct rt_mutex_waiter *waiter; @@ -1054,10 +1054,10 @@ static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q, * deboost but before waking our donor task, hence the preempt_disable() * before unlock. * - * Pairs with preempt_enable() in rt_mutex_postunlock(); + * Pairs with preempt_enable() in rt_mutex_wake_up_q(); */ preempt_disable(); - wake_q_add(wake_q, waiter->task); + rt_mutex_wake_q_add(wqh, waiter); raw_spin_unlock(¤t->pi_lock); } @@ -1328,7 +1328,7 @@ static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock) */ static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) { - DEFINE_WAKE_Q(wake_q); + DEFINE_RT_WAKE_Q(wqh); unsigned long flags; /* irqsave required to support early boot calls */ @@ -1381,10 +1381,10 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) * * Queue the next waiter for wakeup once we release the wait_lock. */ - mark_wakeup_next_waiter(&wake_q, lock); + mark_wakeup_next_waiter(&wqh, lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - rt_mutex_postunlock(&wake_q); + rt_mutex_wake_up_q(&wqh); } static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index c5136f4998bb..56403dc5c2fc 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -137,10 +137,10 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock) * do not use the fast-path, can be simple and will not need to retry. * * @lock: The rt_mutex to be unlocked - * @wake_q: The wake queue head from which to get the next lock waiter + * @wqh: The wake queue head from which to get the next lock waiter */ bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, - struct wake_q_head *wake_q) + struct rt_wake_q_head *wqh) { lockdep_assert_held(&lock->wait_lock); @@ -157,23 +157,23 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, * avoid inversion prior to the wakeup. preempt_disable() * therein pairs with rt_mutex_postunlock(). */ - mark_wakeup_next_waiter(wake_q, lock); + mark_wakeup_next_waiter(wqh, lock); return true; /* call postunlock() */ } void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock) { - DEFINE_WAKE_Q(wake_q); + DEFINE_RT_WAKE_Q(wqh); unsigned long flags; bool postunlock; raw_spin_lock_irqsave(&lock->wait_lock, flags); - postunlock = __rt_mutex_futex_unlock(lock, &wake_q); + postunlock = __rt_mutex_futex_unlock(lock, &wqh); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); if (postunlock) - rt_mutex_postunlock(&wake_q); + rt_mutex_postunlock(&wqh); } /** @@ -441,12 +441,9 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task) /* * Performs the wakeup of the top-waiter and re-enables preemption. */ -void __sched rt_mutex_postunlock(struct wake_q_head *wake_q) +void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh) { - wake_up_q(wake_q); - - /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ - preempt_enable(); + rt_mutex_wake_up_q(wqh); } #ifdef CONFIG_DEBUG_RT_MUTEXES diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 9e2f1dbf6482..ff36316003d8 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -76,9 +76,9 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, - struct wake_q_head *wake_q); + struct rt_wake_q_head *wqh); -extern void rt_mutex_postunlock(struct wake_q_head *wake_q); +extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh); /* * Must be guarded because this header is included from rcu/tree_plugin.h -- cgit v1.2.3-71-gd317 From 456cfbc65cd072f4f53936ee5a37eb1447a7d3ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:11 +0200 Subject: locking/rtmutex: Prepare RT rt_mutex_wake_q for RT locks Add an rtlock_task pointer to rt_mutex_wake_q, which allows to handle the RT specific wakeup for spin/rwlock waiters. The pointer is just consuming 4/8 bytes on the stack so it is provided unconditionaly to avoid #ifdeffery all over the place. This cannot use a regular wake_q, because a task can have concurrent wakeups which would make it miss either lock or the regular wakeups, depending on what gets queued first, unless task struct gains a separate wake_q_node for this, which would be overkill, because there can only be a single task which gets woken up in the spin/rw_lock unlock path. No functional change for non-RT enabled kernels. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.253614678@linutronix.de --- kernel/locking/rtmutex.c | 18 ++++++++++++++++-- kernel/locking/rtmutex_common.h | 5 ++++- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5f0d0725ca32..8b0d38dc4147 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -351,12 +351,26 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, struct rt_mutex_waiter *w) { - wake_q_add(&wqh->head, w->task); + if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) { + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(wqh->rtlock_task); + get_task_struct(w->task); + wqh->rtlock_task = w->task; + } else { + wake_q_add(&wqh->head, w->task); + } } static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh) { - wake_up_q(&wqh->head); + if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) { + wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT); + put_task_struct(wqh->rtlock_task); + wqh->rtlock_task = NULL; + } + + if (!wake_q_empty(&wqh->head)) + wake_up_q(&wqh->head); /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ preempt_enable(); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index ff36316003d8..424ee0f5e5a4 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -42,15 +42,18 @@ struct rt_mutex_waiter { /** * rt_wake_q_head - Wrapper around regular wake_q_head to support * "sleeping" spinlocks on RT - * @head: The regular wake_q_head for sleeping lock variants + * @head: The regular wake_q_head for sleeping lock variants + * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups */ struct rt_wake_q_head { struct wake_q_head head; + struct task_struct *rtlock_task; }; #define DEFINE_RT_WAKE_Q(name) \ struct rt_wake_q_head name = { \ .head = WAKE_Q_HEAD_INITIALIZER(name.head), \ + .rtlock_task = NULL, \ } /* -- cgit v1.2.3-71-gd317 From e17ba59b7e8e1f67e36d8fcc46daa13370efcf11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:12 +0200 Subject: locking/rtmutex: Guard regular sleeping locks specific functions Guard the regular sleeping lock specific functionality, which is used for rtmutex on non-RT enabled kernels and for mutex, rtmutex and semaphores on RT enabled kernels so the code can be reused for the RT specific implementation of spinlocks and rwlocks in a different compilation unit. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.311535693@linutronix.de --- kernel/locking/rtmutex.c | 254 ++++++++++++++++++++++--------------------- kernel/locking/rtmutex_api.c | 1 + kernel/locking/rwsem.c | 1 + 3 files changed, 133 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8b0d38dc4147..949781aa54b1 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1075,10 +1075,139 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, raw_spin_unlock(¤t->pi_lock); } +static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) +{ + int ret = try_to_take_rt_mutex(lock, current, NULL); + + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock) +{ + unsigned long flags; + int ret; + + /* + * If the lock already has an owner we fail to get the lock. + * This can be done without taking the @lock->wait_lock as + * it is only being read, and this is a trylock anyway. + */ + if (rt_mutex_owner(lock)) + return 0; + + /* + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); + + ret = __rt_mutex_slowtrylock(lock); + + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + return ret; +} + +static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock) +{ + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 1; + + return rt_mutex_slowtrylock(lock); +} + +/* + * Slow path to release a rt-mutex. + */ +static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) +{ + DEFINE_RT_WAKE_Q(wqh); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); + + debug_rt_mutex_unlock(lock); + + /* + * We must be careful here if the fast path is enabled. If we + * have no waiters queued we cannot set owner to NULL here + * because of: + * + * foo->lock->owner = NULL; + * rtmutex_lock(foo->lock); <- fast path + * free = atomic_dec_and_test(foo->refcnt); + * rtmutex_unlock(foo->lock); <- fast path + * if (free) + * kfree(foo); + * raw_spin_unlock(foo->lock->wait_lock); + * + * So for the fastpath enabled kernel: + * + * Nothing can set the waiters bit as long as we hold + * lock->wait_lock. So we do the following sequence: + * + * owner = rt_mutex_owner(lock); + * clear_rt_mutex_waiters(lock); + * raw_spin_unlock(&lock->wait_lock); + * if (cmpxchg(&lock->owner, owner, 0) == owner) + * return; + * goto retry; + * + * The fastpath disabled variant is simple as all access to + * lock->owner is serialized by lock->wait_lock: + * + * lock->owner = NULL; + * raw_spin_unlock(&lock->wait_lock); + */ + while (!rt_mutex_has_waiters(lock)) { + /* Drops lock->wait_lock ! */ + if (unlock_rt_mutex_safe(lock, flags) == true) + return; + /* Relock the rtmutex and try again */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); + } + + /* + * The wakeup next waiter path does not suffer from the above + * race. See the comments there. + * + * Queue the next waiter for wakeup once we release the wait_lock. + */ + mark_wakeup_next_waiter(&wqh, lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + rt_mutex_wake_up_q(&wqh); +} + +static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) +{ + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; + + rt_mutex_slowunlock(lock); +} + +#ifdef RT_MUTEX_BUILD_MUTEX +/* + * Functions required for: + * - rtmutex, futex on all kernels + * - mutex and rwsem substitutions on RT kernels + */ + /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and interrupts disabled. I must + * Must be called with lock->wait_lock held and interrupts disabled. It must * have just failed to try_to_take_rt_mutex(). */ static void __sched remove_waiter(struct rt_mutex_base *lock, @@ -1286,125 +1415,4 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, return rt_mutex_slowlock(lock, state); } - -static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) -{ - int ret = try_to_take_rt_mutex(lock, current, NULL); - - /* - * try_to_take_rt_mutex() sets the lock waiters bit - * unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); - - return ret; -} - -/* - * Slow path try-lock function: - */ -static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock) -{ - unsigned long flags; - int ret; - - /* - * If the lock already has an owner we fail to get the lock. - * This can be done without taking the @lock->wait_lock as - * it is only being read, and this is a trylock anyway. - */ - if (rt_mutex_owner(lock)) - return 0; - - /* - * The mutex has currently no owner. Lock the wait lock and try to - * acquire the lock. We use irqsave here to support early boot calls. - */ - raw_spin_lock_irqsave(&lock->wait_lock, flags); - - ret = __rt_mutex_slowtrylock(lock); - - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - - return ret; -} - -static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock) -{ - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 1; - - return rt_mutex_slowtrylock(lock); -} - -/* - * Slow path to release a rt-mutex. - */ -static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) -{ - DEFINE_RT_WAKE_Q(wqh); - unsigned long flags; - - /* irqsave required to support early boot calls */ - raw_spin_lock_irqsave(&lock->wait_lock, flags); - - debug_rt_mutex_unlock(lock); - - /* - * We must be careful here if the fast path is enabled. If we - * have no waiters queued we cannot set owner to NULL here - * because of: - * - * foo->lock->owner = NULL; - * rtmutex_lock(foo->lock); <- fast path - * free = atomic_dec_and_test(foo->refcnt); - * rtmutex_unlock(foo->lock); <- fast path - * if (free) - * kfree(foo); - * raw_spin_unlock(foo->lock->wait_lock); - * - * So for the fastpath enabled kernel: - * - * Nothing can set the waiters bit as long as we hold - * lock->wait_lock. So we do the following sequence: - * - * owner = rt_mutex_owner(lock); - * clear_rt_mutex_waiters(lock); - * raw_spin_unlock(&lock->wait_lock); - * if (cmpxchg(&lock->owner, owner, 0) == owner) - * return; - * goto retry; - * - * The fastpath disabled variant is simple as all access to - * lock->owner is serialized by lock->wait_lock: - * - * lock->owner = NULL; - * raw_spin_unlock(&lock->wait_lock); - */ - while (!rt_mutex_has_waiters(lock)) { - /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock, flags) == true) - return; - /* Relock the rtmutex and try again */ - raw_spin_lock_irqsave(&lock->wait_lock, flags); - } - - /* - * The wakeup next waiter path does not suffer from the above - * race. See the comments there. - * - * Queue the next waiter for wakeup once we release the wait_lock. - */ - mark_wakeup_next_waiter(&wqh, lock); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - - rt_mutex_wake_up_q(&wqh); -} - -static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) -{ - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) - return; - - rt_mutex_slowunlock(lock); -} +#endif /* RT_MUTEX_BUILD_MUTEX */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 56403dc5c2fc..7f3ac096250d 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -5,6 +5,7 @@ #include #include +#define RT_MUTEX_BUILD_MUTEX #include "rtmutex.c" /* diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c017f9f2874b..2847833d5583 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1347,6 +1347,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #else /* !CONFIG_PREEMPT_RT */ +#define RT_MUTEX_BUILD_MUTEX #include "rtmutex.c" #define rwbase_set_and_save_current_state(state) \ -- cgit v1.2.3-71-gd317 From 1c143c4b65da09081d644110e619decc49c9dee4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:25 +0200 Subject: locking/rtmutex: Provide the spin/rwlock core lock function A simplified version of the rtmutex slowlock function, which neither handles signals nor timeouts, and is careful about preserving the state of the blocked task across the lock operation. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.770228446@linutronix.de --- kernel/locking/rtmutex.c | 60 +++++++++++++++++++++++++++++++++++++++++ kernel/locking/rtmutex_common.h | 2 +- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 949781aa54b1..951bef073891 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1416,3 +1416,63 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, return rt_mutex_slowlock(lock, state); } #endif /* RT_MUTEX_BUILD_MUTEX */ + +#ifdef RT_MUTEX_BUILD_SPINLOCKS +/* + * Functions required for spin/rw_lock substitution on RT kernels + */ + +/** + * rtlock_slowlock_locked - Slow path lock acquisition for RT locks + * @lock: The underlying RT mutex + */ +static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) +{ + struct rt_mutex_waiter waiter; + + lockdep_assert_held(&lock->wait_lock); + + if (try_to_take_rt_mutex(lock, current, NULL)) + return; + + rt_mutex_init_rtlock_waiter(&waiter); + + /* Save current state and set state to TASK_RTLOCK_WAIT */ + current_save_and_set_rtlock_wait_state(); + + task_blocks_on_rt_mutex(lock, &waiter, current, RT_MUTEX_MIN_CHAINWALK); + + for (;;) { + /* Try to acquire the lock again */ + if (try_to_take_rt_mutex(lock, current, &waiter)) + break; + + raw_spin_unlock_irq(&lock->wait_lock); + + schedule_rtlock(); + + raw_spin_lock_irq(&lock->wait_lock); + set_current_state(TASK_RTLOCK_WAIT); + } + + /* Restore the task state */ + current_restore_rtlock_saved_state(); + + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. + * We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + debug_rt_mutex_free_waiter(&waiter); +} + +static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + rtlock_slowlock_locked(lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); +} + +#endif /* RT_MUTEX_BUILD_SPINLOCKS */ diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 424ee0f5e5a4..ccf0e36d6c31 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -181,7 +181,7 @@ static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) waiter->task = NULL; } -static inline void rtlock_init_rtmutex_waiter(struct rt_mutex_waiter *waiter) +static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter) { rt_mutex_init_waiter(waiter); waiter->wake_state = TASK_RTLOCK_WAIT; -- cgit v1.2.3-71-gd317 From a2071573d6346819cc4e5787b4206f2184985160 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 3 Aug 2021 12:59:36 +0200 Subject: sysctl: introduce new proc handler proc_dobool This is to let bool variable could be correctly displayed in big/little endian sysctl procfs. sizeof(bool) is arch dependent, proc_dobool should work in all arches. Suggested-by: Pan Xinhui Signed-off-by: Jia He [thuth: rebased the patch to the current kernel version] Signed-off-by: Thomas Huth Signed-off-by: Chuck Lever --- include/linux/sysctl.h | 2 ++ kernel/sysctl.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d99ca99837de..1fa2b69c6fc3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -48,6 +48,8 @@ typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..25e49b4d8049 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -536,6 +536,21 @@ static void proc_put_char(void **buf, size_t *size, char c) } } +static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *(bool *)valp = *lvalp; + } else { + int val = *(bool *)valp; + + *lvalp = (unsigned long)val; + *negp = false; + } + return 0; +} + static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) @@ -798,6 +813,26 @@ static int do_proc_douintvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +/** + * proc_dobool - read/write a bool + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dobool_conv, NULL); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1630,6 +1665,12 @@ int proc_dostring(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dobool(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3425,6 +3466,7 @@ int __init sysctl_init(void) * No sense putting this after each symbol definition, twice, * exception granted :-) */ +EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); -- cgit v1.2.3-71-gd317 From 0f383b6dc96e976dfbf2721b0bf10bd96103b341 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:27 +0200 Subject: locking/spinlock: Provide RT variant Provide the actual locking functions which make use of the general and spinlock specific rtmutex code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.826621464@linutronix.de --- kernel/locking/Makefile | 1 + kernel/locking/spinlock_rt.c | 129 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 kernel/locking/spinlock_rt.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 269f55e1e431..683f0b7fbacc 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o +obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c new file mode 100644 index 000000000000..edfa7b5776d7 --- /dev/null +++ b/kernel/locking/spinlock_rt.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PREEMPT_RT substitution for spin/rw_locks + * + * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to + * resemble the non RT semantics: + * + * - Contrary to plain rtmutexes, spinlocks and rwlocks are state + * preserving. The task state is saved before blocking on the underlying + * rtmutex, and restored when the lock has been acquired. Regular wakeups + * during that time are redirected to the saved state so no wake up is + * missed. + * + * - Non RT spin/rwlocks disable preemption and eventually interrupts. + * Disabling preemption has the side effect of disabling migration and + * preventing RCU grace periods. + * + * The RT substitutions explicitly disable migration and take + * rcu_read_lock() across the lock held section. + */ +#include +#include + +#define RT_MUTEX_BUILD_SPINLOCKS +#include "rtmutex.c" + +static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) +{ + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + rtlock_slowlock(rtm); +} + +static __always_inline void __rt_spin_lock(spinlock_t *lock) +{ + ___might_sleep(__FILE__, __LINE__, 0); + rtlock_lock(&lock->lock); + rcu_read_lock(); + migrate_disable(); +} + +void __sched rt_spin_lock(spinlock_t *lock) +{ + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +void __sched rt_spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_nest_lock); +#endif + +void __sched rt_spin_unlock(spinlock_t *lock) +{ + spin_release(&lock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + + if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL))) + rt_mutex_slowunlock(&lock->lock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __sched rt_spin_lock_unlock(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_unlock); + +static __always_inline int __rt_spin_trylock(spinlock_t *lock) +{ + int ret = 1; + + if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current))) + ret = rt_mutex_slowtrylock(&lock->lock); + + if (ret) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} + +int __sched rt_spin_trylock(spinlock_t *lock) +{ + return __rt_spin_trylock(lock); +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __sched rt_spin_trylock_bh(spinlock_t *lock) +{ + int ret; + + local_bh_disable(); + ret = __rt_spin_trylock(lock); + if (!ret) + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_bh); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rt_spin_lock_init(spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG); +} +EXPORT_SYMBOL(__rt_spin_lock_init); +#endif -- cgit v1.2.3-71-gd317 From 8282947f67345246b4a6344dbceb07484d3d4dad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:28 +0200 Subject: locking/rwlock: Provide RT variant Similar to rw_semaphores, on RT the rwlock substitution is not writer fair, because it's not feasible to have a writer inherit its priority to multiple readers. Readers blocked on a writer follow the normal rules of priority inheritance. Like RT spinlocks, RT rwlocks are state preserving across the slow lock operations (contended case). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.882793524@linutronix.de --- include/linux/rwlock_rt.h | 140 ++++++++++++++++++++++++++++++++++++++++ include/linux/rwlock_types.h | 49 ++++++++++---- include/linux/spinlock_rt.h | 2 + kernel/Kconfig.locks | 2 +- kernel/locking/spinlock.c | 7 ++ kernel/locking/spinlock_debug.c | 5 ++ kernel/locking/spinlock_rt.c | 131 +++++++++++++++++++++++++++++++++++++ 7 files changed, 323 insertions(+), 13 deletions(-) create mode 100644 include/linux/rwlock_rt.h (limited to 'kernel') diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h new file mode 100644 index 000000000000..49c1f3842ed5 --- /dev/null +++ b/include/linux/rwlock_rt.h @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __LINUX_RWLOCK_RT_H +#define __LINUX_RWLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_RT_H +#error Do not #include directly. Use . +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void __rt_rwlock_init(rwlock_t *rwlock, const char *name, + struct lock_class_key *key); +#else +static inline void __rt_rwlock_init(rwlock_t *rwlock, char *name, + struct lock_class_key *key) +{ +} +#endif + +#define rwlock_init(rwl) \ +do { \ + static struct lock_class_key __key; \ + \ + init_rwbase_rt(&(rwl)->rwbase); \ + __rt_rwlock_init(rwl, #rwl, &__key); \ +} while (0) + +extern void rt_read_lock(rwlock_t *rwlock); +extern int rt_read_trylock(rwlock_t *rwlock); +extern void rt_read_unlock(rwlock_t *rwlock); +extern void rt_write_lock(rwlock_t *rwlock); +extern int rt_write_trylock(rwlock_t *rwlock); +extern void rt_write_unlock(rwlock_t *rwlock); + +static __always_inline void read_lock(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); +} + +static __always_inline void read_lock_bh(rwlock_t *rwlock) +{ + local_bh_disable(); + rt_read_lock(rwlock); +} + +static __always_inline void read_lock_irq(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); +} + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + rt_read_lock(lock); \ + flags = 0; \ + } while (0) + +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) + +static __always_inline void read_unlock(rwlock_t *rwlock) +{ + rt_read_unlock(rwlock); +} + +static __always_inline void read_unlock_bh(rwlock_t *rwlock) +{ + rt_read_unlock(rwlock); + local_bh_enable(); +} + +static __always_inline void read_unlock_irq(rwlock_t *rwlock) +{ + rt_read_unlock(rwlock); +} + +static __always_inline void read_unlock_irqrestore(rwlock_t *rwlock, + unsigned long flags) +{ + rt_read_unlock(rwlock); +} + +static __always_inline void write_lock(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); +} + +static __always_inline void write_lock_bh(rwlock_t *rwlock) +{ + local_bh_disable(); + rt_write_lock(rwlock); +} + +static __always_inline void write_lock_irq(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); +} + +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + rt_write_lock(lock); \ + flags = 0; \ + } while (0) + +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) + +#define write_trylock_irqsave(lock, flags) \ +({ \ + int __locked; \ + \ + typecheck(unsigned long, flags); \ + flags = 0; \ + __locked = write_trylock(lock); \ + __locked; \ +}) + +static __always_inline void write_unlock(rwlock_t *rwlock) +{ + rt_write_unlock(rwlock); +} + +static __always_inline void write_unlock_bh(rwlock_t *rwlock) +{ + rt_write_unlock(rwlock); + local_bh_enable(); +} + +static __always_inline void write_unlock_irq(rwlock_t *rwlock) +{ + rt_write_unlock(rwlock); +} + +static __always_inline void write_unlock_irqrestore(rwlock_t *rwlock, + unsigned long flags) +{ + rt_write_unlock(rwlock); +} + +#define rwlock_is_contended(lock) (((void)(lock), 0)) + +#endif /* __LINUX_RWLOCK_RT_H */ diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index 0ad226b5d8fd..1948442e7750 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -5,9 +5,19 @@ # error "Do not include directly, include spinlock_types.h" #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_CONFIG, \ + } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + +#ifndef CONFIG_PREEMPT_RT /* - * include/linux/rwlock_types.h - generic rwlock type definitions - * and initializers + * generic rwlock type definitions and initializers * * portions Copyright 2005, Red Hat, Inc., Ingo Molnar * Released under the General Public License (GPL). @@ -25,16 +35,6 @@ typedef struct { #define RWLOCK_MAGIC 0xdeaf1eed -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define RW_DEP_MAP_INIT(lockname) \ - .dep_map = { \ - .name = #lockname, \ - .wait_type_inner = LD_WAIT_CONFIG, \ - } -#else -# define RW_DEP_MAP_INIT(lockname) -#endif - #ifdef CONFIG_DEBUG_SPINLOCK #define __RW_LOCK_UNLOCKED(lockname) \ (rwlock_t) { .raw_lock = __ARCH_RW_LOCK_UNLOCKED, \ @@ -50,4 +50,29 @@ typedef struct { #define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) +#else /* !CONFIG_PREEMPT_RT */ + +#include + +typedef struct { + struct rwbase_rt rwbase; + atomic_t readers; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#define __RWLOCK_RT_INITIALIZER(name) \ +{ \ + .rwbase = __RWBASE_INITIALIZER(name), \ + RW_DEP_MAP_INIT(name) \ +} + +#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name) + +#define DEFINE_RWLOCK(name) \ + rwlock_t name = __RW_LOCK_UNLOCKED(name) + +#endif /* CONFIG_PREEMPT_RT */ + #endif /* __LINUX_RWLOCK_TYPES_H */ diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 21228d3362f7..4fc72199cc9d 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -146,4 +146,6 @@ static inline int spin_is_locked(spinlock_t *lock) #define assert_spin_locked(lock) BUG_ON(!spin_is_locked(lock)) +#include + #endif diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 3de8fd11873b..4198f0273ecd 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS config QUEUED_RWLOCKS def_bool y if ARCH_USE_QUEUED_RWLOCKS - depends on SMP + depends on SMP && !PREEMPT_RT config ARCH_HAS_MMIOWB bool diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index c8d7ad9fb9b2..c5830cfa379a 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef CONFIG_PREEMPT_RT BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT + #ifndef CONFIG_INLINE_READ_TRYLOCK int __lockfunc _raw_read_trylock(rwlock_t *lock) { @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index b9d93087ee66..14235671a1a7 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, } EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_dump(raw_spinlock_t *lock, const char *msg) { @@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif /* !CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index edfa7b5776d7..c36648bd765d 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -127,3 +127,134 @@ void __rt_spin_lock_init(spinlock_t *lock, const char *name, } EXPORT_SYMBOL(__rt_spin_lock_init); #endif + +/* + * RT-specific reader/writer locks + */ +#define rwbase_set_and_save_current_state(state) \ + current_save_and_set_rtlock_wait_state() + +#define rwbase_restore_current_state() \ + current_restore_rtlock_saved_state() + +static __always_inline int +rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state) +{ + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + rtlock_slowlock(rtm); + return 0; +} + +static __always_inline int +rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state) +{ + rtlock_slowlock_locked(rtm); + return 0; +} + +static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm) +{ + if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL))) + return; + + rt_mutex_slowunlock(rtm); +} + +static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm) +{ + if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + return 1; + + return rt_mutex_slowtrylock(rtm); +} + +#define rwbase_signal_pending_state(state, current) (0) + +#define rwbase_schedule() \ + schedule_rtlock() + +#include "rwbase_rt.c" +/* + * The common functions which get wrapped into the rwlock API. + */ +int __sched rt_read_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rwbase_read_trylock(&rwlock->rwbase); + if (ret) { + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +int __sched rt_write_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rwbase_write_trylock(&rwlock->rwbase); + if (ret) { + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +void __sched rt_read_lock(rwlock_t *rwlock) +{ + ___might_sleep(__FILE__, __LINE__, 0); + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_read_lock); + +void __sched rt_write_lock(rwlock_t *rwlock) +{ + ___might_sleep(__FILE__, __LINE__, 0); + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock); + +void __sched rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT); +} +EXPORT_SYMBOL(rt_read_unlock); + +void __sched rt_write_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + rcu_read_unlock(); + migrate_enable(); + rwbase_write_unlock(&rwlock->rwbase); +} +EXPORT_SYMBOL(rt_write_unlock); + +int __sched rt_rwlock_is_contended(rwlock_t *rwlock) +{ + return rw_base_is_contended(&rwlock->rwbase); +} +EXPORT_SYMBOL(rt_rwlock_is_contended); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rt_rwlock_init(rwlock_t *rwlock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG); +} +EXPORT_SYMBOL(__rt_rwlock_init); +#endif -- cgit v1.2.3-71-gd317 From 715f7f9ece4685157bb59560f6c612340d730ab4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:30 +0200 Subject: locking/rtmutex: Squash !RT tasks to DEFAULT_PRIO Ensure all !RT tasks have the same prio such that they end up in FIFO order and aren't split up according to nice level. The reason why nice levels were taken into account so far is historical. In the early days of the rtmutex code it was done to give the PI boosting and deboosting a larger coverage. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.938676930@linutronix.de --- kernel/locking/rtmutex.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 951bef073891..ac8fb2f9c6f2 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -244,11 +244,28 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, } #endif +static __always_inline int __waiter_prio(struct task_struct *task) +{ + int prio = task->prio; + + if (!rt_prio(prio)) + return DEFAULT_PRIO; + + return prio; +} + +static __always_inline void +waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + waiter->prio = __waiter_prio(task); + waiter->deadline = task->dl.deadline; +} + /* * Only use with rt_mutex_waiter_{less,equal}() */ #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, struct rt_mutex_waiter *right) @@ -698,8 +715,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * serializes all pi_waiters access and rb_erase() does not care about * the values of the node being removed. */ - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); @@ -969,8 +985,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, raw_spin_lock(&task->pi_lock); waiter->task = task; waiter->lock = lock; - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) -- cgit v1.2.3-71-gd317 From a321fb9038b335f3c447d1810b97d5f7eec152ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Aug 2021 16:17:38 +0200 Subject: locking/mutex: Consolidate core headers, remove kernel/locking/mutex-debug.h Having two header files which contain just the non-debug and debug variants is mostly waste of disc space and has no real value. Stick the debug variants into the common mutex.h file as counterpart to the stubs for the non-debug case. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211303.995350521@linutronix.de --- kernel/locking/mutex-debug.c | 4 +--- kernel/locking/mutex-debug.h | 29 ----------------------------- kernel/locking/mutex.c | 4 ++-- kernel/locking/mutex.h | 37 +++++++++++++++++++++++-------------- 4 files changed, 26 insertions(+), 48 deletions(-) delete mode 100644 kernel/locking/mutex-debug.h (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index db9301591e3f..7ef5a36857e8 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -1,6 +1,4 @@ /* - * kernel/mutex-debug.c - * * Debugging code for mutexes * * Started by Ingo Molnar: @@ -22,7 +20,7 @@ #include #include -#include "mutex-debug.h" +#include "mutex.h" /* * Must be called with lock->wait_lock held. diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h deleted file mode 100644 index 53e631e1d76d..000000000000 --- a/kernel/locking/mutex-debug.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Mutexes: blocking mutual exclusion locks - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * This file contains mutex debugging related internal declarations, - * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. - * More details are in kernel/mutex-debug.c. - */ - -/* - * This must be called with lock->wait_lock held. - */ -extern void debug_mutex_lock_common(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_wake_waiter(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); -extern void debug_mutex_add_waiter(struct mutex *lock, - struct mutex_waiter *waiter, - struct task_struct *task); -extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct task_struct *task); -extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 21c9e5da1858..acbe43d92836 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -30,11 +30,11 @@ #include #include +#include "mutex.h" + #ifdef CONFIG_DEBUG_MUTEXES -# include "mutex-debug.h" # define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond) #else -# include "mutex.h" # define MUTEX_WARN_ON(cond) #endif diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index f0c710b1d192..586e4f1f6ebf 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -5,19 +5,28 @@ * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * This file contains mutex debugging related internal prototypes, for the - * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) -#define debug_mutex_free_waiter(waiter) do { } while (0) -#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) -#define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) -#define debug_mutex_unlock(lock) do { } while (0) -#define debug_mutex_init(lock, name, key) do { } while (0) - -static inline void -debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) -{ -} +#ifdef CONFIG_DEBUG_MUTEXES +extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_wake_waiter(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_add_waiter(struct mutex *lock, + struct mutex_waiter *waiter, + struct task_struct *task); +extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct task_struct *task); +extern void debug_mutex_unlock(struct mutex *lock); +extern void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); +#else /* CONFIG_DEBUG_MUTEXES */ +# define debug_mutex_lock_common(lock, waiter) do { } while (0) +# define debug_mutex_wake_waiter(lock, waiter) do { } while (0) +# define debug_mutex_free_waiter(waiter) do { } while (0) +# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) +# define debug_mutex_unlock(lock) do { } while (0) +# define debug_mutex_init(lock, name, key) do { } while (0) +#endif /* !CONFIG_DEBUG_MUTEXES */ -- cgit v1.2.3-71-gd317 From 43d2d52d704e025518d35c3079fcbff744623166 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:33 +0200 Subject: locking/mutex: Move the 'struct mutex_waiter' definition from to the internal header Move the mutex waiter declaration from the public header to the internal kernel/locking/mutex.h header. There is no reason to expose it outside of the core code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.054325923@linutronix.de --- include/linux/mutex.h | 13 ------------- kernel/locking/mutex.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index e19323521f9c..62bafee747e9 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -74,19 +74,6 @@ struct ww_mutex { #endif }; -/* - * This is the control structure for tasks blocked on mutex, - * which resides on the blocked task's kernel stack: - */ -struct mutex_waiter { - struct list_head list; - struct task_struct *task; - struct ww_acquire_ctx *ww_ctx; -#ifdef CONFIG_DEBUG_MUTEXES - void *magic; -#endif -}; - #ifdef CONFIG_DEBUG_MUTEXES #define __DEBUG_MUTEX_INITIALIZER(lockname) \ diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 586e4f1f6ebf..0b2a79c4013b 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -7,6 +7,19 @@ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar */ +/* + * This is the control structure for tasks blocked on mutex, which resides + * on the blocked task's kernel stack: + */ +struct mutex_waiter { + struct list_head list; + struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +}; + #ifdef CONFIG_DEBUG_MUTEXES extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); -- cgit v1.2.3-71-gd317 From ebf4c55c1ddbabaea120fe8d48ce25b4f5da93a1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:36 +0200 Subject: locking/mutex: Make mutex::wait_lock raw The wait_lock of mutex is really a low level lock. Convert it to a raw_spinlock like the wait_lock of rtmutex. [ mingo: backmerged the test_lockup.c build fix by bigeasy. ] Co-developed-by: Sebastian Andrzej Siewior Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.166863404@linutronix.de --- include/linux/mutex.h | 4 ++-- kernel/locking/mutex.c | 22 +++++++++++----------- lib/test_lockup.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index db3367586a06..0bbc872ba72b 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -50,7 +50,7 @@ */ struct mutex { atomic_long_t owner; - spinlock_t wait_lock; + raw_spinlock_t wait_lock; #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif @@ -105,7 +105,7 @@ do { \ #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ - , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ + , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index acbe43d92836..17c194b81b9e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -42,7 +42,7 @@ void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { atomic_long_set(&lock->owner, 0); - spin_lock_init(&lock->wait_lock); + raw_spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); @@ -486,9 +486,9 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * Uh oh, we raced in fastpath, check if any of the waiters need to * die or wound us. */ - spin_lock(&lock->base.wait_lock); + raw_spin_lock(&lock->base.wait_lock); __ww_mutex_check_waiters(&lock->base, ctx); - spin_unlock(&lock->base.wait_lock); + raw_spin_unlock(&lock->base.wait_lock); } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER @@ -966,7 +966,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas return 0; } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); /* * After waiting to acquire the wait_lock, try again. */ @@ -1032,7 +1032,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); first = __mutex_waiter_is_first(lock, &waiter); @@ -1047,9 +1047,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas (first && mutex_optimistic_spin(lock, ww_ctx, &waiter))) break; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); acquired: __set_current_state(TASK_RUNNING); @@ -1074,7 +1074,7 @@ skip_wait: if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); preempt_enable(); return 0; @@ -1082,7 +1082,7 @@ err: __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); preempt_enable(); @@ -1243,7 +1243,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne } } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -1260,7 +1260,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); wake_up_q(&wake_q); } diff --git a/lib/test_lockup.c b/lib/test_lockup.c index 864554e76973..4d93b02f81bb 100644 --- a/lib/test_lockup.c +++ b/lib/test_lockup.c @@ -502,7 +502,7 @@ static int __init test_lockup_init(void) offsetof(rwlock_t, magic), RWLOCK_MAGIC) || test_magic(lock_mutex_ptr, - offsetof(struct mutex, wait_lock.rlock.magic), + offsetof(struct mutex, wait_lock.magic), SPINLOCK_MAGIC) || test_magic(lock_rwsem_ptr, offsetof(struct rw_semaphore, wait_lock.magic), -- cgit v1.2.3-71-gd317 From cf702eddcd03dca3184947170930bf284aea27e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:38 +0200 Subject: locking/ww_mutex: Simplify lockdep annotations No functional change. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.222921634@linutronix.de --- kernel/locking/mutex.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 17c194b81b9e..73ad8623cc98 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -951,6 +951,10 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas */ if (ww_ctx->acquired == 0) ww_ctx->wounded = 0; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + nest_lock = &ww_ctx->dep_map; +#endif } preempt_disable(); @@ -1098,10 +1102,9 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, static int __sched __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx) + unsigned long ip, struct ww_acquire_ctx *ww_ctx) { - return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true); + return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1181,8 +1184,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx); + 0, _RET_IP_, ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1197,8 +1199,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx); + 0, _RET_IP_, ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1364,7 +1365,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock) static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL, + return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, _RET_IP_, ctx); } @@ -1372,7 +1373,7 @@ static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL, + return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, _RET_IP_, ctx); } -- cgit v1.2.3-71-gd317 From c0afb0ffc06e6b4e492a3b711f1fb32074f9949c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:39 +0200 Subject: locking/ww_mutex: Gather mutex_waiter initialization Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.281927514@linutronix.de --- kernel/locking/mutex-debug.c | 1 + kernel/locking/mutex.c | 12 +++--------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 7ef5a36857e8..bc8abb8549d2 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -30,6 +30,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; INIT_LIST_HEAD(&waiter->list); + waiter->ww_ctx = MUTEX_POISON_WW_CTX; } void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 73ad8623cc98..6cb27c51a6e0 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -982,17 +982,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } debug_mutex_lock_common(lock, &waiter); + waiter.task = current; + if (ww_ctx) + waiter.ww_ctx = ww_ctx; lock_contended(&lock->dep_map, ip); if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ __mutex_add_waiter(lock, &waiter, &lock->wait_list); - - -#ifdef CONFIG_DEBUG_MUTEXES - waiter.ww_ctx = MUTEX_POISON_WW_CTX; -#endif } else { /* * Add in stamp order, waking up waiters that must kill @@ -1001,12 +999,8 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); if (ret) goto err_early_kill; - - waiter.ww_ctx = ww_ctx; } - waiter.task = current; - set_current_state(state); for (;;) { bool first; -- cgit v1.2.3-71-gd317 From aaa77de10b7c86fa779b2108802fa9e785fbe2e9 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 17 Aug 2021 16:19:04 +0200 Subject: locking/ww_mutex: Split up ww_mutex_unlock() Split the ww related part out into a helper function so it can be reused for a rtmutex based ww_mutex implementation. [ mingo: Fixed bisection failure. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.340166556@linutronix.de --- kernel/locking/mutex.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 6cb27c51a6e0..070f6f1119cd 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -744,6 +744,20 @@ void __sched mutex_unlock(struct mutex *lock) } EXPORT_SYMBOL(mutex_unlock); +static void __ww_mutex_unlock(struct ww_mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ + if (lock->ctx) { + MUTEX_WARN_ON(!lock->ctx->acquired); + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } +} + /** * ww_mutex_unlock - release the w/w mutex * @lock: the mutex to be released @@ -757,17 +771,7 @@ EXPORT_SYMBOL(mutex_unlock); */ void __sched ww_mutex_unlock(struct ww_mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ - if (lock->ctx) { - MUTEX_WARN_ON(!lock->ctx->acquired); - if (lock->ctx->acquired > 0) - lock->ctx->acquired--; - lock->ctx = NULL; - } - + __ww_mutex_unlock(lock); mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); -- cgit v1.2.3-71-gd317 From 2674bd181f3338dc2c58a59caa766dc9d5779784 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 17 Aug 2021 16:31:54 +0200 Subject: locking/ww_mutex: Split out the W/W implementation logic into kernel/locking/ww_mutex.h Split the W/W mutex helper functions out into a separate header file, so they can be shared with a rtmutex based variant later. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.396893399@linutronix.de --- kernel/locking/mutex.c | 372 +--------------------------------------------- kernel/locking/ww_mutex.h | 369 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 370 insertions(+), 371 deletions(-) create mode 100644 kernel/locking/ww_mutex.h (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 070f6f1119cd..9906ca6cc912 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -281,215 +281,7 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -/* - * Wait-Die: - * The newer transactions are killed when: - * It (the new transaction) makes a request for a lock being held - * by an older transaction. - * - * Wound-Wait: - * The newer transactions are wounded when: - * An older transaction makes a request for a lock being held by - * the newer transaction. - */ - -/* - * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired - * it. - */ -static __always_inline void -ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; - ww->ctx = ww_ctx; -} - -/* - * Determine if context @a is 'after' context @b. IOW, @a is a younger - * transaction than @b and depending on algorithm either needs to wait for - * @b or die. - */ -static inline bool __sched -__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) -{ - - return (signed long)(a->stamp - b->stamp) > 0; -} - -/* - * Wait-Die; wake a younger waiter context (when locks held) such that it can - * die. - * - * Among waiters with context, only the first one can have other locks acquired - * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and - * __ww_mutex_check_kill() wake any but the earliest context. - */ -static bool __sched -__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ww_ctx) -{ - if (!ww_ctx->is_wait_die) - return false; - - if (waiter->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { - debug_mutex_wake_waiter(lock, waiter); - wake_up_process(waiter->task); - } - - return true; -} - -/* - * Wound-Wait; wound a younger @hold_ctx if it holds the lock. - * - * Wound the lock holder if there are waiters with older transactions than - * the lock holders. Even if multiple waiters may wound the lock holder, - * it's sufficient that only one does. - */ -static bool __ww_mutex_wound(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - struct ww_acquire_ctx *hold_ctx) -{ - struct task_struct *owner = __mutex_owner(lock); - - lockdep_assert_held(&lock->wait_lock); - - /* - * Possible through __ww_mutex_add_waiter() when we race with - * ww_mutex_set_context_fastpath(). In that case we'll get here again - * through __ww_mutex_check_waiters(). - */ - if (!hold_ctx) - return false; - - /* - * Can have !owner because of __mutex_unlock_slowpath(), but if owner, - * it cannot go away because we'll have FLAG_WAITERS set and hold - * wait_lock. - */ - if (!owner) - return false; - - if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { - hold_ctx->wounded = 1; - - /* - * wake_up_process() paired with set_current_state() - * inserts sufficient barriers to make sure @owner either sees - * it's wounded in __ww_mutex_check_kill() or has a - * wakeup pending to re-read the wounded state. - */ - if (owner != current) - wake_up_process(owner); - - return true; - } - - return false; -} - -/* - * We just acquired @lock under @ww_ctx, if there are later contexts waiting - * behind us on the wait-list, check if they need to die, or wound us. - * - * See __ww_mutex_add_waiter() for the list-order construction; basically the - * list is ordered by stamp, smallest (oldest) first. - * - * This relies on never mixing wait-die/wound-wait on the same wait-list; - * which is currently ensured by that being a ww_class property. - * - * The current task must not be on the wait list. - */ -static void __sched -__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - struct mutex_waiter *cur; - - lockdep_assert_held(&lock->wait_lock); - - list_for_each_entry(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - if (__ww_mutex_die(lock, cur, ww_ctx) || - __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) - break; - } -} - -/* - * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx - * and wake up any waiters so they can recheck. - */ -static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - ww_mutex_lock_acquired(lock, ctx); - - /* - * The lock->ctx update should be visible on all cores before - * the WAITERS check is done, otherwise contended waiters might be - * missed. The contended waiters will either see ww_ctx == NULL - * and keep spinning, or it will acquire wait_lock, add itself - * to waiter list and sleep. - */ - smp_mb(); /* See comments above and below. */ - - /* - * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS - * MB MB - * [R] MUTEX_FLAG_WAITERS [R] ww->ctx - * - * The memory barrier above pairs with the memory barrier in - * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx - * and/or !empty list. - */ - if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) - return; - - /* - * Uh oh, we raced in fastpath, check if any of the waiters need to - * die or wound us. - */ - raw_spin_lock(&lock->base.wait_lock); - __ww_mutex_check_waiters(&lock->base, ctx); - raw_spin_unlock(&lock->base.wait_lock); -} +#include "ww_mutex.h" #ifdef CONFIG_MUTEX_SPIN_ON_OWNER @@ -744,20 +536,6 @@ void __sched mutex_unlock(struct mutex *lock) } EXPORT_SYMBOL(mutex_unlock); -static void __ww_mutex_unlock(struct ww_mutex *lock) -{ - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ - if (lock->ctx) { - MUTEX_WARN_ON(!lock->ctx->acquired); - if (lock->ctx->acquired > 0) - lock->ctx->acquired--; - lock->ctx = NULL; - } -} - /** * ww_mutex_unlock - release the w/w mutex * @lock: the mutex to be released @@ -776,154 +554,6 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) } EXPORT_SYMBOL(ww_mutex_unlock); - -static __always_inline int __sched -__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); - ww_ctx->contending_lock = ww; -#endif - return -EDEADLK; - } - - return 0; -} - - -/* - * Check the wound condition for the current lock acquire. - * - * Wound-Wait: If we're wounded, kill ourself. - * - * Wait-Die: If we're trying to acquire a lock already held by an older - * context, kill ourselves. - * - * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to - * look at waiters before us in the wait-list. - */ -static inline int __sched -__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ctx) -{ - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); - struct mutex_waiter *cur; - - if (ctx->acquired == 0) - return 0; - - if (!ctx->is_wait_die) { - if (ctx->wounded) - return __ww_mutex_kill(lock, ctx); - - return 0; - } - - if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) - return __ww_mutex_kill(lock, ctx); - - /* - * If there is a waiter in front of us that has a context, then its - * stamp is earlier than ours and we must kill ourself. - */ - cur = waiter; - list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - return __ww_mutex_kill(lock, ctx); - } - - return 0; -} - -/* - * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest - * first. Such that older contexts are preferred to acquire the lock over - * younger contexts. - * - * Waiters without context are interspersed in FIFO order. - * - * Furthermore, for Wait-Die kill ourself immediately when possible (there are - * older contexts already waiting) to avoid unnecessary waiting and for - * Wound-Wait ensure we wound the owning context when it is younger. - */ -static inline int __sched -__ww_mutex_add_waiter(struct mutex_waiter *waiter, - struct mutex *lock, - struct ww_acquire_ctx *ww_ctx) -{ - struct mutex_waiter *cur; - struct list_head *pos; - bool is_wait_die; - - if (!ww_ctx) { - __mutex_add_waiter(lock, waiter, &lock->wait_list); - return 0; - } - - is_wait_die = ww_ctx->is_wait_die; - - /* - * Add the waiter before the first waiter with a higher stamp. - * Waiters without a context are skipped to avoid starving - * them. Wait-Die waiters may die here. Wound-Wait waiters - * never die here, but they are sorted in stamp order and - * may wound the lock holder. - */ - pos = &lock->wait_list; - list_for_each_entry_reverse(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { - /* - * Wait-Die: if we find an older context waiting, there - * is no point in queueing behind it, as we'd have to - * die the moment it would acquire the lock. - */ - if (is_wait_die) { - int ret = __ww_mutex_kill(lock, ww_ctx); - - if (ret) - return ret; - } - - break; - } - - pos = &cur->list; - - /* Wait-Die: ensure younger waiters die. */ - __ww_mutex_die(lock, cur, ww_ctx); - } - - __mutex_add_waiter(lock, waiter, pos); - - /* - * Wound-Wait: if we're blocking on a mutex owned by a younger context, - * wound that such that we might proceed. - */ - if (!is_wait_die) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - - /* - * See ww_mutex_set_context_fastpath(). Orders setting - * MUTEX_FLAG_WAITERS vs the ww->ctx load, - * such that either we or the fastpath will wound @ww->ctx. - */ - smp_mb(); - __ww_mutex_wound(lock, ww_ctx, ww->ctx); - } - - return 0; -} - /* * Lock a mutex (possibly interruptible), slowpath: */ diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h new file mode 100644 index 000000000000..dadc798dfdee --- /dev/null +++ b/kernel/locking/ww_mutex.h @@ -0,0 +1,369 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* + * Wait-Die: + * The newer transactions are killed when: + * It (the new transaction) makes a request for a lock being held + * by an older transaction. + * + * Wound-Wait: + * The newer transactions are wounded when: + * An older transaction makes a request for a lock being held by + * the newer transaction. + */ + +/* + * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired + * it. + */ +static __always_inline void +ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; + ww->ctx = ww_ctx; +} + +/* + * Determine if context @a is 'after' context @b. IOW, @a is a younger + * transaction than @b and depending on algorithm either needs to wait for + * @b or die. + */ +static inline bool __sched +__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +{ + + return (signed long)(a->stamp - b->stamp) > 0; +} + +/* + * Wait-Die; wake a younger waiter context (when locks held) such that it can + * die. + * + * Among waiters with context, only the first one can have other locks acquired + * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and + * __ww_mutex_check_kill() wake any but the earliest context. + */ +static bool __sched +__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + if (!ww_ctx->is_wait_die) + return false; + + if (waiter->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, waiter); + wake_up_process(waiter->task); + } + + return true; +} + +/* + * Wound-Wait; wound a younger @hold_ctx if it holds the lock. + * + * Wound the lock holder if there are waiters with older transactions than + * the lock holders. Even if multiple waiters may wound the lock holder, + * it's sufficient that only one does. + */ +static bool __ww_mutex_wound(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, + struct ww_acquire_ctx *hold_ctx) +{ + struct task_struct *owner = __mutex_owner(lock); + + lockdep_assert_held(&lock->wait_lock); + + /* + * Possible through __ww_mutex_add_waiter() when we race with + * ww_mutex_set_context_fastpath(). In that case we'll get here again + * through __ww_mutex_check_waiters(). + */ + if (!hold_ctx) + return false; + + /* + * Can have !owner because of __mutex_unlock_slowpath(), but if owner, + * it cannot go away because we'll have FLAG_WAITERS set and hold + * wait_lock. + */ + if (!owner) + return false; + + if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { + hold_ctx->wounded = 1; + + /* + * wake_up_process() paired with set_current_state() + * inserts sufficient barriers to make sure @owner either sees + * it's wounded in __ww_mutex_check_kill() or has a + * wakeup pending to re-read the wounded state. + */ + if (owner != current) + wake_up_process(owner); + + return true; + } + + return false; +} + +/* + * We just acquired @lock under @ww_ctx, if there are later contexts waiting + * behind us on the wait-list, check if they need to die, or wound us. + * + * See __ww_mutex_add_waiter() for the list-order construction; basically the + * list is ordered by stamp, smallest (oldest) first. + * + * This relies on never mixing wait-die/wound-wait on the same wait-list; + * which is currently ensured by that being a ww_class property. + * + * The current task must not be on the wait list. + */ +static void __sched +__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + + lockdep_assert_held(&lock->wait_lock); + + list_for_each_entry(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (__ww_mutex_die(lock, cur, ww_ctx) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) + break; + } +} + +/* + * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx + * and wake up any waiters so they can recheck. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + ww_mutex_lock_acquired(lock, ctx); + + /* + * The lock->ctx update should be visible on all cores before + * the WAITERS check is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* See comments above and below. */ + + /* + * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS + * MB MB + * [R] MUTEX_FLAG_WAITERS [R] ww->ctx + * + * The memory barrier above pairs with the memory barrier in + * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx + * and/or !empty list. + */ + if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) + return; + + /* + * Uh oh, we raced in fastpath, check if any of the waiters need to + * die or wound us. + */ + raw_spin_lock(&lock->base.wait_lock); + __ww_mutex_check_waiters(&lock->base, ctx); + raw_spin_unlock(&lock->base.wait_lock); +} + +static __always_inline int __sched +__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + if (ww_ctx->acquired > 0) { +#ifdef CONFIG_DEBUG_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + +/* + * Check the wound condition for the current lock acquire. + * + * Wound-Wait: If we're wounded, kill ourself. + * + * Wait-Die: If we're trying to acquire a lock already held by an older + * context, kill ourselves. + * + * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to + * look at waiters before us in the wait-list. + */ +static inline int __sched +__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + struct mutex_waiter *cur; + + if (ctx->acquired == 0) + return 0; + + if (!ctx->is_wait_die) { + if (ctx->wounded) + return __ww_mutex_kill(lock, ctx); + + return 0; + } + + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) + return __ww_mutex_kill(lock, ctx); + + /* + * If there is a waiter in front of us that has a context, then its + * stamp is earlier than ours and we must kill ourself. + */ + cur = waiter; + list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + return __ww_mutex_kill(lock, ctx); + } + + return 0; +} + +/* + * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest + * first. Such that older contexts are preferred to acquire the lock over + * younger contexts. + * + * Waiters without context are interspersed in FIFO order. + * + * Furthermore, for Wait-Die kill ourself immediately when possible (there are + * older contexts already waiting) to avoid unnecessary waiting and for + * Wound-Wait ensure we wound the owning context when it is younger. + */ +static inline int __sched +__ww_mutex_add_waiter(struct mutex_waiter *waiter, + struct mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + struct list_head *pos; + bool is_wait_die; + + if (!ww_ctx) { + __mutex_add_waiter(lock, waiter, &lock->wait_list); + return 0; + } + + is_wait_die = ww_ctx->is_wait_die; + + /* + * Add the waiter before the first waiter with a higher stamp. + * Waiters without a context are skipped to avoid starving + * them. Wait-Die waiters may die here. Wound-Wait waiters + * never die here, but they are sorted in stamp order and + * may wound the lock holder. + */ + pos = &lock->wait_list; + list_for_each_entry_reverse(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { + /* + * Wait-Die: if we find an older context waiting, there + * is no point in queueing behind it, as we'd have to + * die the moment it would acquire the lock. + */ + if (is_wait_die) { + int ret = __ww_mutex_kill(lock, ww_ctx); + + if (ret) + return ret; + } + + break; + } + + pos = &cur->list; + + /* Wait-Die: ensure younger waiters die. */ + __ww_mutex_die(lock, cur, ww_ctx); + } + + __mutex_add_waiter(lock, waiter, pos); + + /* + * Wound-Wait: if we're blocking on a mutex owned by a younger context, + * wound that such that we might proceed. + */ + if (!is_wait_die) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + + /* + * See ww_mutex_set_context_fastpath(). Orders setting + * MUTEX_FLAG_WAITERS vs the ww->ctx load, + * such that either we or the fastpath will wound @ww->ctx. + */ + smp_mb(); + __ww_mutex_wound(lock, ww_ctx, ww->ctx); + } + + return 0; +} + +static inline void __ww_mutex_unlock(struct ww_mutex *lock) +{ + if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } +} -- cgit v1.2.3-71-gd317 From 5297ccb2c50916c59294a63fae79fe01a7fbb79a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:44 +0200 Subject: locking/ww_mutex: Remove the __sched annotation from ww_mutex APIs None of these functions will be on the stack when blocking in schedule(), hence __sched is not needed. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.453235952@linutronix.de --- kernel/locking/ww_mutex.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index dadc798dfdee..6a98f3bb7e24 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -62,7 +62,7 @@ ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) * transaction than @b and depending on algorithm either needs to wait for * @b or die. */ -static inline bool __sched +static inline bool __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) { @@ -77,7 +77,7 @@ __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and * __ww_mutex_check_kill() wake any but the earliest context. */ -static bool __sched +static bool __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, struct ww_acquire_ctx *ww_ctx) { @@ -154,7 +154,7 @@ static bool __ww_mutex_wound(struct mutex *lock, * * The current task must not be on the wait list. */ -static void __sched +static void __ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) { struct mutex_waiter *cur; @@ -210,7 +210,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) raw_spin_unlock(&lock->base.wait_lock); } -static __always_inline int __sched +static __always_inline int __ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) { if (ww_ctx->acquired > 0) { @@ -238,7 +238,7 @@ __ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to * look at waiters before us in the wait-list. */ -static inline int __sched +static inline int __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, struct ww_acquire_ctx *ctx) { @@ -285,7 +285,7 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, * older contexts already waiting) to avoid unnecessary waiting and for * Wound-Wait ensure we wound the owning context when it is younger. */ -static inline int __sched +static inline int __ww_mutex_add_waiter(struct mutex_waiter *waiter, struct mutex *lock, struct ww_acquire_ctx *ww_ctx) -- cgit v1.2.3-71-gd317 From 23d599eb2377404100d0d1508e12b0a2c40b49b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:45 +0200 Subject: locking/ww_mutex: Abstract out the waiter iteration Split out the waiter iteration functions so they can be substituted for a rtmutex based ww_mutex later. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.509186185@linutronix.de --- kernel/locking/ww_mutex.h | 57 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 6a98f3bb7e24..1cd178c0c5c2 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -1,5 +1,49 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +static inline struct mutex_waiter * +__ww_waiter_first(struct mutex *lock) +{ + struct mutex_waiter *w; + + w = list_first_entry(&lock->wait_list, struct mutex_waiter, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) +{ + w = list_next_entry(w, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) +{ + w = list_prev_entry(w, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_last(struct mutex *lock) +{ + struct mutex_waiter *w; + + w = list_last_entry(&lock->wait_list, struct mutex_waiter, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + /* * Wait-Die: * The newer transactions are killed when: @@ -161,7 +205,9 @@ __ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) lockdep_assert_held(&lock->wait_lock); - list_for_each_entry(cur, &lock->wait_list, list) { + for (cur = __ww_waiter_first(lock); cur; + cur = __ww_waiter_next(lock, cur)) { + if (!cur->ww_ctx) continue; @@ -263,8 +309,9 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, * If there is a waiter in front of us that has a context, then its * stamp is earlier than ours and we must kill ourself. */ - cur = waiter; - list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { + for (cur = __ww_waiter_prev(lock, waiter); cur; + cur = __ww_waiter_prev(lock, cur)) { + if (!cur->ww_ctx) continue; @@ -309,7 +356,9 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, * may wound the lock holder. */ pos = &lock->wait_list; - list_for_each_entry_reverse(cur, &lock->wait_list, list) { + for (cur = __ww_waiter_last(lock); cur; + cur = __ww_waiter_prev(lock, cur)) { + if (!cur->ww_ctx) continue; -- cgit v1.2.3-71-gd317 From 843dac28f90ef80535b0aee0b78446f1770c8611 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:47 +0200 Subject: locking/ww_mutex: Abstract out waiter enqueueing The upcoming rtmutex based ww_mutex needs a different handling for enqueueing a waiter. Split it out into a helper function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.566318143@linutronix.de --- kernel/locking/ww_mutex.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 1cd178c0c5c2..f5aaf2f19370 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -44,6 +44,15 @@ __ww_waiter_last(struct mutex *lock) return w; } +static inline void +__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) +{ + struct list_head *p = &lock->wait_list; + if (pos) + p = &pos->list; + __mutex_add_waiter(lock, waiter, p); +} + /* * Wait-Die: * The newer transactions are killed when: @@ -337,12 +346,11 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, struct mutex *lock, struct ww_acquire_ctx *ww_ctx) { - struct mutex_waiter *cur; - struct list_head *pos; + struct mutex_waiter *cur, *pos = NULL; bool is_wait_die; if (!ww_ctx) { - __mutex_add_waiter(lock, waiter, &lock->wait_list); + __ww_waiter_add(lock, waiter, NULL); return 0; } @@ -355,7 +363,6 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, * never die here, but they are sorted in stamp order and * may wound the lock holder. */ - pos = &lock->wait_list; for (cur = __ww_waiter_last(lock); cur; cur = __ww_waiter_prev(lock, cur)) { @@ -378,13 +385,13 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, break; } - pos = &cur->list; + pos = cur; /* Wait-Die: ensure younger waiters die. */ __ww_mutex_die(lock, cur, ww_ctx); } - __mutex_add_waiter(lock, waiter, pos); + __ww_waiter_add(lock, waiter, pos); /* * Wound-Wait: if we're blocking on a mutex owned by a younger context, -- cgit v1.2.3-71-gd317 From 9934ccc75cec2bafac552c2130835630530c4f7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:49 +0200 Subject: locking/ww_mutex: Abstract out mutex accessors Move the mutex related access from various ww_mutex functions into helper functions so they can be substituted for rtmutex based ww_mutex later. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.622477030@linutronix.de --- kernel/locking/ww_mutex.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index f5aaf2f19370..842dbed0a8b2 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -53,6 +53,18 @@ __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_wa __mutex_add_waiter(lock, waiter, p); } +static inline struct task_struct * +__ww_mutex_owner(struct mutex *lock) +{ + return __mutex_owner(lock); +} + +static inline bool +__ww_mutex_has_waiters(struct mutex *lock) +{ + return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS; +} + /* * Wait-Die: * The newer transactions are killed when: @@ -157,7 +169,7 @@ static bool __ww_mutex_wound(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, struct ww_acquire_ctx *hold_ctx) { - struct task_struct *owner = __mutex_owner(lock); + struct task_struct *owner = __ww_mutex_owner(lock); lockdep_assert_held(&lock->wait_lock); @@ -253,7 +265,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx * and/or !empty list. */ - if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) + if (likely(!__ww_mutex_has_waiters(&lock->base))) return; /* -- cgit v1.2.3-71-gd317 From bdb189148ded4ffa826a1387074c795fda43b3ba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:50 +0200 Subject: locking/ww_mutex: Abstract out mutex types Some ww_mutex helper functions use pointers for the underlying mutex and mutex_waiter. The upcoming rtmutex based implementation needs to share these functions. Add and use defines for the types and replace the direct types in the affected functions. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.678720245@linutronix.de --- kernel/locking/ww_mutex.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 842dbed0a8b2..31b075f03660 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +#define MUTEX mutex +#define MUTEX_WAITER mutex_waiter + static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) { @@ -143,7 +146,7 @@ __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) * __ww_mutex_check_kill() wake any but the earliest context. */ static bool -__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, +__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, struct ww_acquire_ctx *ww_ctx) { if (!ww_ctx->is_wait_die) @@ -165,7 +168,7 @@ __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, * the lock holders. Even if multiple waiters may wound the lock holder, * it's sufficient that only one does. */ -static bool __ww_mutex_wound(struct mutex *lock, +static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct ww_acquire_ctx *hold_ctx) { @@ -220,9 +223,9 @@ static bool __ww_mutex_wound(struct mutex *lock, * The current task must not be on the wait list. */ static void -__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) { - struct mutex_waiter *cur; + struct MUTEX_WAITER *cur; lockdep_assert_held(&lock->wait_lock); @@ -278,7 +281,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } static __always_inline int -__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) { if (ww_ctx->acquired > 0) { #ifdef CONFIG_DEBUG_MUTEXES @@ -306,12 +309,12 @@ __ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) * look at waiters before us in the wait-list. */ static inline int -__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, +__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); - struct mutex_waiter *cur; + struct MUTEX_WAITER *cur; if (ctx->acquired == 0) return 0; @@ -354,11 +357,11 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, * Wound-Wait ensure we wound the owning context when it is younger. */ static inline int -__ww_mutex_add_waiter(struct mutex_waiter *waiter, - struct mutex *lock, +__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, + struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) { - struct mutex_waiter *cur, *pos = NULL; + struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; if (!ww_ctx) { -- cgit v1.2.3-71-gd317 From 653a5b0bd9b405db999d5f4bfe08d34691e2c55a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:28:52 +0200 Subject: locking/ww_mutex: Abstract out internal lock accesses Accessing the internal wait_lock of mutex and rtmutex is slightly different. Provide helper functions for that. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.734635961@linutronix.de --- include/linux/ww_mutex.h | 13 +++++++++---- kernel/locking/ww_mutex.h | 23 +++++++++++++++++++---- 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 590aaa207757..3438e302a52a 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -19,6 +19,11 @@ #include +#define WW_MUTEX_BASE mutex +#define ww_mutex_base_init(l,n,k) __mutex_init(l,n,k) +#define ww_mutex_base_trylock(l) mutex_trylock(l) +#define ww_mutex_base_is_locked(b) mutex_is_locked((b)) + struct ww_class { atomic_long_t stamp; struct lock_class_key acquire_key; @@ -29,7 +34,7 @@ struct ww_class { }; struct ww_mutex { - struct mutex base; + struct WW_MUTEX_BASE base; struct ww_acquire_ctx *ctx; #ifdef CONFIG_DEBUG_MUTEXES struct ww_class *ww_class; @@ -82,7 +87,7 @@ struct ww_acquire_ctx { static inline void ww_mutex_init(struct ww_mutex *lock, struct ww_class *ww_class) { - __mutex_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key); + ww_mutex_base_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key); lock->ctx = NULL; #ifdef CONFIG_DEBUG_MUTEXES lock->ww_class = ww_class; @@ -330,7 +335,7 @@ extern void ww_mutex_unlock(struct ww_mutex *lock); */ static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock) { - return mutex_trylock(&lock->base); + return ww_mutex_base_trylock(&lock->base); } /*** @@ -354,7 +359,7 @@ static inline void ww_mutex_destroy(struct ww_mutex *lock) */ static inline bool ww_mutex_is_locked(struct ww_mutex *lock) { - return mutex_is_locked(&lock->base); + return ww_mutex_base_is_locked(&lock->base); } #endif diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 31b075f03660..309f3e4d814a 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -68,6 +68,21 @@ __ww_mutex_has_waiters(struct mutex *lock) return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS; } +static inline void lock_wait_lock(struct mutex *lock) +{ + raw_spin_lock(&lock->wait_lock); +} + +static inline void unlock_wait_lock(struct mutex *lock) +{ + raw_spin_unlock(&lock->wait_lock); +} + +static inline void lockdep_assert_wait_lock_held(struct mutex *lock) +{ + lockdep_assert_held(&lock->wait_lock); +} + /* * Wait-Die: * The newer transactions are killed when: @@ -174,7 +189,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, { struct task_struct *owner = __ww_mutex_owner(lock); - lockdep_assert_held(&lock->wait_lock); + lockdep_assert_wait_lock_held(lock); /* * Possible through __ww_mutex_add_waiter() when we race with @@ -227,7 +242,7 @@ __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) { struct MUTEX_WAITER *cur; - lockdep_assert_held(&lock->wait_lock); + lockdep_assert_wait_lock_held(lock); for (cur = __ww_waiter_first(lock); cur; cur = __ww_waiter_next(lock, cur)) { @@ -275,9 +290,9 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * Uh oh, we raced in fastpath, check if any of the waiters need to * die or wound us. */ - raw_spin_lock(&lock->base.wait_lock); + lock_wait_lock(&lock->base); __ww_mutex_check_waiters(&lock->base, ctx); - raw_spin_unlock(&lock->base.wait_lock); + unlock_wait_lock(&lock->base); } static __always_inline int -- cgit v1.2.3-71-gd317 From dc4564f5dc2d4b11f3f3c8d3ac94012b1c7347d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:53 +0200 Subject: locking/ww_mutex: Implement rt_mutex accessors Provide the type defines and the helper inlines for rtmutex based ww_mutexes. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.790760545@linutronix.de --- kernel/locking/ww_mutex.h | 80 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 309f3e4d814a..7da98904b3ea 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef WW_RT + #define MUTEX mutex #define MUTEX_WAITER mutex_waiter @@ -83,6 +85,82 @@ static inline void lockdep_assert_wait_lock_held(struct mutex *lock) lockdep_assert_held(&lock->wait_lock); } +#else /* WW_RT */ + +#define MUTEX rt_mutex +#define MUTEX_WAITER rt_mutex_waiter + +static inline struct rt_mutex_waiter * +__ww_waiter_first(struct rt_mutex *lock) +{ + struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree_entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) +{ + struct rb_node *n = rb_next(&w->tree_entry); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree_entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) +{ + struct rb_node *n = rb_prev(&w->tree_entry); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree_entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_last(struct rt_mutex *lock) +{ + struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree_entry); +} + +static inline void +__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos) +{ + /* RT unconditionally adds the waiter first and then removes it on error */ +} + +static inline struct task_struct * +__ww_mutex_owner(struct rt_mutex *lock) +{ + return rt_mutex_owner(&lock->rtmutex); +} + +static inline bool +__ww_mutex_has_waiters(struct rt_mutex *lock) +{ + return rt_mutex_has_waiters(&lock->rtmutex); +} + +static inline void lock_wait_lock(struct rt_mutex *lock) +{ + raw_spin_lock(&lock->rtmutex.wait_lock); +} + +static inline void unlock_wait_lock(struct rt_mutex *lock) +{ + raw_spin_unlock(&lock->rtmutex.wait_lock); +} + +static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) +{ + lockdep_assert_held(&lock->rtmutex.wait_lock); +} + +#endif /* WW_RT */ + /* * Wait-Die: * The newer transactions are killed when: @@ -169,7 +247,9 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, if (waiter->ww_ctx->acquired > 0 && __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { +#ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); +#endif wake_up_process(waiter->task); } -- cgit v1.2.3-71-gd317 From 8850d773703f8114d7c8a2421fd20bde8a558f96 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:55 +0200 Subject: locking/ww_mutex: Add RT priority to W/W order RT mutex based ww_mutexes cannot order based on timestamps. They have to order based on priority. Add the necessary decision logic. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.847536630@linutronix.de --- kernel/locking/ww_mutex.h | 64 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 7da98904b3ea..2dce4f0b0d1c 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -219,19 +219,54 @@ ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) } /* - * Determine if context @a is 'after' context @b. IOW, @a is a younger - * transaction than @b and depending on algorithm either needs to wait for - * @b or die. + * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task + * or, when of equal priority, a younger transaction than @b. + * + * Depending on the algorithm, @a will either need to wait for @b, or die. */ static inline bool -__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) { +/* + * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI, + * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and + * isn't affected by this. + */ +#ifdef WW_RT + /* kernel prio; less is more */ + int a_prio = a->task->prio; + int b_prio = b->task->prio; + + if (rt_prio(a_prio) || rt_prio(b_prio)) { + + if (a_prio > b_prio) + return true; + + if (a_prio < b_prio) + return false; + + /* equal static prio */ + + if (dl_prio(a_prio)) { + if (dl_time_before(b->task->dl.deadline, + a->task->dl.deadline)) + return true; + + if (dl_time_before(a->task->dl.deadline, + b->task->dl.deadline)) + return false; + } + + /* equal prio */ + } +#endif + /* FIFO order tie break -- bigger is younger */ return (signed long)(a->stamp - b->stamp) > 0; } /* - * Wait-Die; wake a younger waiter context (when locks held) such that it can + * Wait-Die; wake a lesser waiter context (when locks held) such that it can * die. * * Among waiters with context, only the first one can have other locks acquired @@ -245,8 +280,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, if (!ww_ctx->is_wait_die) return false; - if (waiter->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { + if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) { #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); #endif @@ -257,10 +291,10 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, } /* - * Wound-Wait; wound a younger @hold_ctx if it holds the lock. + * Wound-Wait; wound a lesser @hold_ctx if it holds the lock. * - * Wound the lock holder if there are waiters with older transactions than - * the lock holders. Even if multiple waiters may wound the lock holder, + * Wound the lock holder if there are waiters with more important transactions + * than the lock holders. Even if multiple waiters may wound the lock holder, * it's sufficient that only one does. */ static bool __ww_mutex_wound(struct MUTEX *lock, @@ -287,7 +321,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, if (!owner) return false; - if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { + if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) { hold_ctx->wounded = 1; /* @@ -306,8 +340,8 @@ static bool __ww_mutex_wound(struct MUTEX *lock, } /* - * We just acquired @lock under @ww_ctx, if there are later contexts waiting - * behind us on the wait-list, check if they need to die, or wound us. + * We just acquired @lock under @ww_ctx, if there are more important contexts + * waiting behind us on the wait-list, check if they need to die, or wound us. * * See __ww_mutex_add_waiter() for the list-order construction; basically the * list is ordered by stamp, smallest (oldest) first. @@ -421,7 +455,7 @@ __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, return 0; } - if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) + if (hold_ctx && __ww_ctx_less(ctx, hold_ctx)) return __ww_mutex_kill(lock, ctx); /* @@ -479,7 +513,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, if (!cur->ww_ctx) continue; - if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { + if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) { /* * Wait-Die: if we find an older context waiting, there * is no point in queueing behind it, as we'd have to -- cgit v1.2.3-71-gd317 From 2408f7a3782a6bfa69a573f5408b3a9666db78ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:56 +0200 Subject: locking/ww_mutex: Add rt_mutex based lock type and accessors Provide the defines for RT mutex based ww_mutexes and fix up the debug logic so it's either enabled by DEBUG_MUTEXES or DEBUG_RT_MUTEXES on RT kernels. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.908012566@linutronix.de --- include/linux/ww_mutex.h | 33 ++++++++++++++++++++++++--------- kernel/locking/ww_mutex.h | 6 +++--- 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 3438e302a52a..29db736af86d 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -18,11 +18,24 @@ #define __LINUX_WW_MUTEX_H #include +#include +#if defined(CONFIG_DEBUG_MUTEXES) || \ + (defined(CONFIG_PREEMPT_RT) && defined(CONFIG_DEBUG_RT_MUTEXES)) +#define DEBUG_WW_MUTEXES +#endif + +#ifndef CONFIG_PREEMPT_RT #define WW_MUTEX_BASE mutex #define ww_mutex_base_init(l,n,k) __mutex_init(l,n,k) #define ww_mutex_base_trylock(l) mutex_trylock(l) #define ww_mutex_base_is_locked(b) mutex_is_locked((b)) +#else +#define WW_MUTEX_BASE rt_mutex +#define ww_mutex_base_init(l,n,k) __rt_mutex_init(l,n,k) +#define ww_mutex_base_trylock(l) rt_mutex_trylock(l) +#define ww_mutex_base_is_locked(b) rt_mutex_base_is_locked(&(b)->rtmutex) +#endif struct ww_class { atomic_long_t stamp; @@ -36,7 +49,7 @@ struct ww_class { struct ww_mutex { struct WW_MUTEX_BASE base; struct ww_acquire_ctx *ctx; -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES struct ww_class *ww_class; #endif }; @@ -47,10 +60,10 @@ struct ww_acquire_ctx { unsigned int acquired; unsigned short wounded; unsigned short is_wait_die; -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES unsigned int done_acquire; struct ww_class *ww_class; - struct ww_mutex *contending_lock; + void *contending_lock; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -89,7 +102,7 @@ static inline void ww_mutex_init(struct ww_mutex *lock, { ww_mutex_base_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key); lock->ctx = NULL; -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES lock->ww_class = ww_class; #endif } @@ -126,7 +139,7 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, ctx->acquired = 0; ctx->wounded = false; ctx->is_wait_die = ww_class->is_wait_die; -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES ctx->ww_class = ww_class; ctx->done_acquire = 0; ctx->contending_lock = NULL; @@ -156,7 +169,7 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, */ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) { -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES lockdep_assert_held(ctx); DEBUG_LOCKS_WARN_ON(ctx->done_acquire); @@ -176,7 +189,7 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) #ifdef CONFIG_DEBUG_LOCK_ALLOC mutex_release(&ctx->dep_map, _THIS_IP_); #endif -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES DEBUG_LOCKS_WARN_ON(ctx->acquired); if (!IS_ENABLED(CONFIG_PROVE_LOCKING)) /* @@ -282,7 +295,7 @@ static inline void ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES DEBUG_LOCKS_WARN_ON(!ctx->contending_lock); #endif ret = ww_mutex_lock(lock, ctx); @@ -318,7 +331,7 @@ static inline int __must_check ww_mutex_lock_slow_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES DEBUG_LOCKS_WARN_ON(!ctx->contending_lock); #endif return ww_mutex_lock_interruptible(lock, ctx); @@ -348,7 +361,9 @@ static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock) */ static inline void ww_mutex_destroy(struct ww_mutex *lock) { +#ifndef CONFIG_PREEMPT_RT mutex_destroy(&lock->base); +#endif } /** diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 2dce4f0b0d1c..56f139201f24 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -180,7 +180,7 @@ static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES /* * If this WARN_ON triggers, you used ww_mutex_lock to acquire, * but released with a normal mutex_unlock in this call. @@ -413,7 +413,7 @@ static __always_inline int __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) { if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -559,7 +559,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, static inline void __ww_mutex_unlock(struct ww_mutex *lock) { if (lock->ctx) { -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); #endif if (lock->ctx->acquired > 0) -- cgit v1.2.3-71-gd317 From add461325ec5bc39aa619a1bfcde7245e5f31ac7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:28:58 +0200 Subject: locking/rtmutex: Extend the rtmutex core to support ww_mutex Add a ww acquire context pointer to the waiter and various functions and add the ww_mutex related invocations to the proper spots in the locking code, similar to the mutex based variant. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211304.966139174@linutronix.de --- kernel/locking/rtmutex.c | 121 ++++++++++++++++++++++++++++++++++++---- kernel/locking/rtmutex_api.c | 4 +- kernel/locking/rtmutex_common.h | 2 + kernel/locking/rwsem.c | 2 +- 4 files changed, 115 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ac8fb2f9c6f2..af7e3af4d313 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -17,9 +17,44 @@ #include #include #include +#include #include "rtmutex_common.h" +#ifndef WW_RT +# define build_ww_mutex() (false) +# define ww_container_of(rtm) NULL + +static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter, + struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + return 0; +} + +static inline void __ww_mutex_check_waiters(struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ +} + +static inline void ww_mutex_lock_acquired(struct ww_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ +} + +static inline int __ww_mutex_check_kill(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + return 0; +} + +#else +# define build_ww_mutex() (true) +# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base) +# include "ww_mutex.h" +#endif + /* * lock->owner state tracking: * @@ -308,7 +343,28 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) { - return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b)); + struct rt_mutex_waiter *aw = __node_2_waiter(a); + struct rt_mutex_waiter *bw = __node_2_waiter(b); + + if (rt_mutex_waiter_less(aw, bw)) + return 1; + + if (!build_ww_mutex()) + return 0; + + if (rt_mutex_waiter_less(bw, aw)) + return 0; + + /* NOTE: relies on waiter->ww_ctx being set before insertion */ + if (aw->ww_ctx) { + if (!bw->ww_ctx) + return 1; + + return (signed long)(aw->ww_ctx->stamp - + bw->ww_ctx->stamp) < 0; + } + + return 0; } static __always_inline void @@ -961,6 +1017,7 @@ takeit: static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, + struct ww_acquire_ctx *ww_ctx, enum rtmutex_chainwalk chwalk) { struct task_struct *owner = rt_mutex_owner(lock); @@ -996,6 +1053,16 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, raw_spin_unlock(&task->pi_lock); + if (build_ww_mutex() && ww_ctx) { + struct rt_mutex *rtm; + + /* Check whether the waiter should back out immediately */ + rtm = container_of(lock, struct rt_mutex, rtmutex); + res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx); + if (res) + return res; + } + if (!owner) return 0; @@ -1281,6 +1348,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, /** * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take + * @ww_ctx: WW mutex context pointer * @state: the state the task should block in (TASK_INTERRUPTIBLE * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none @@ -1289,10 +1357,12 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, unsigned int state, struct hrtimer_sleeper *timeout, struct rt_mutex_waiter *waiter) { + struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); int ret = 0; for (;;) { @@ -1309,6 +1379,12 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, break; } + if (build_ww_mutex() && ww_ctx) { + ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx); + if (ret) + break; + } + raw_spin_unlock_irq(&lock->wait_lock); schedule(); @@ -1331,6 +1407,9 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, if (res != -EDEADLOCK || detect_deadlock) return; + if (build_ww_mutex() && w->ww_ctx) + return; + /* * Yell loudly and stop the task right here. */ @@ -1344,31 +1423,46 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, /** * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held * @lock: The rtmutex to block lock + * @ww_ctx: WW mutex context pointer * @state: The task state for sleeping * @chwalk: Indicator whether full or partial chainwalk is requested * @waiter: Initializer waiter for blocking */ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, unsigned int state, enum rtmutex_chainwalk chwalk, struct rt_mutex_waiter *waiter) { + struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); + struct ww_mutex *ww = ww_container_of(rtm); int ret; lockdep_assert_held(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock, current, NULL)) + if (try_to_take_rt_mutex(lock, current, NULL)) { + if (build_ww_mutex() && ww_ctx) { + __ww_mutex_check_waiters(rtm, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); + } return 0; + } set_current_state(state); - ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk); - + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); if (likely(!ret)) - ret = rt_mutex_slowlock_block(lock, state, NULL, waiter); - - if (unlikely(ret)) { + ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); + + if (likely(!ret)) { + /* acquired the lock */ + if (build_ww_mutex() && ww_ctx) { + if (!ww_ctx->is_wait_die) + __ww_mutex_check_waiters(rtm, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); + } + } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); rt_mutex_handle_deadlock(ret, chwalk, waiter); @@ -1383,14 +1477,17 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, } static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, unsigned int state) { struct rt_mutex_waiter waiter; int ret; rt_mutex_init_waiter(&waiter); + waiter.ww_ctx = ww_ctx; - ret = __rt_mutex_slowlock(lock, state, RT_MUTEX_MIN_CHAINWALK, &waiter); + ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK, + &waiter); debug_rt_mutex_free_waiter(&waiter); return ret; @@ -1399,9 +1496,11 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, /* * rt_mutex_slowlock - Locking slowpath invoked when fast path fails * @lock: The rtmutex to block lock + * @ww_ctx: WW mutex context pointer * @state: The task state for sleeping */ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, unsigned int state) { unsigned long flags; @@ -1416,7 +1515,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, * irqsave/restore variants. */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = __rt_mutex_slowlock_locked(lock, state); + ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; @@ -1428,7 +1527,7 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - return rt_mutex_slowlock(lock, state); + return rt_mutex_slowlock(lock, NULL, state); } #endif /* RT_MUTEX_BUILD_MUTEX */ @@ -1455,7 +1554,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) /* Save current state and set state to TASK_RTLOCK_WAIT */ current_save_and_set_rtlock_wait_state(); - task_blocks_on_rt_mutex(lock, &waiter, current, RT_MUTEX_MIN_CHAINWALK); + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); for (;;) { /* Try to acquire the lock again */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 7f3ac096250d..16126fcb55ef 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -267,7 +267,7 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, return 1; /* We enforce deadlock detection for futexes */ - ret = task_blocks_on_rt_mutex(lock, waiter, task, + ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL, RT_MUTEX_FULL_CHAINWALK); if (ret && !rt_mutex_owner(lock)) { @@ -343,7 +343,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ set_current_state(TASK_INTERRUPTIBLE); - ret = rt_mutex_slowlock_block(lock, TASK_INTERRUPTIBLE, to, waiter); + ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index ccf0e36d6c31..61256de5bd66 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -28,6 +28,7 @@ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) * @prio: Priority of the waiter * @deadline: Deadline of the waiter if applicable + * @ww_ctx: WW context pointer */ struct rt_mutex_waiter { struct rb_node tree_entry; @@ -37,6 +38,7 @@ struct rt_mutex_waiter { unsigned int wake_state; int prio; u64 deadline; + struct ww_acquire_ctx *ww_ctx; }; /** diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2847833d5583..9215b4d6a9de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1360,7 +1360,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) __rt_mutex_lock(rtm, state) #define rwbase_rtmutex_slowlock_locked(rtm, state) \ - __rt_mutex_slowlock_locked(rtm, state) + __rt_mutex_slowlock_locked(rtm, NULL, state) #define rwbase_rtmutex_unlock(rtm) \ __rt_mutex_unlock(rtm) -- cgit v1.2.3-71-gd317 From f8635d509d807c0a9deb273e19bc5a8a19c52895 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 15 Aug 2021 23:29:00 +0200 Subject: locking/ww_mutex: Implement rtmutex based ww_mutex API functions Add the actual ww_mutex API functions which replace the mutex based variant on RT enabled kernels. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.024057938@linutronix.de --- kernel/locking/Makefile | 2 +- kernel/locking/ww_rt_mutex.c | 76 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 kernel/locking/ww_rt_mutex.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 683f0b7fbacc..d51cabf28f38 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o -obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o +obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c new file mode 100644 index 000000000000..3f1fff7d2780 --- /dev/null +++ b/kernel/locking/ww_rt_mutex.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rtmutex API + */ +#include +#include + +#define RT_MUTEX_BUILD_MUTEX +#define WW_RT +#include "rtmutex.c" + +static int __sched +__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, + unsigned int state, unsigned long ip) +{ + struct lockdep_map __maybe_unused *nest_lock = NULL; + struct rt_mutex *rtm = &lock->base; + int ret; + + might_sleep(); + + if (ww_ctx) { + if (unlikely(ww_ctx == READ_ONCE(lock->ctx))) + return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + nest_lock = &ww_ctx->dep_map; +#endif + } + mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); + + if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) { + if (ww_ctx) + ww_mutex_set_context_fastpath(lock, ww_ctx); + return 0; + } + + ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state); + + if (ret) + mutex_release(&rtm->dep_map, ip); + return ret; +} + +int __sched +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_); +} +EXPORT_SYMBOL(ww_mutex_lock); + +int __sched +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_); +} +EXPORT_SYMBOL(ww_mutex_lock_interruptible); + +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + struct rt_mutex *rtm = &lock->base; + + __ww_mutex_unlock(lock); + + mutex_release(&rtm->dep_map, _RET_IP_); + __rt_mutex_unlock(&rtm->rtmutex); +} +EXPORT_SYMBOL(ww_mutex_unlock); -- cgit v1.2.3-71-gd317 From bb630f9f7a7d43869e4e7f5e4c002207396aea59 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:01 +0200 Subject: locking/rtmutex: Add mutex variant for RT Add the necessary defines, helpers and API functions for replacing struct mutex on a PREEMPT_RT enabled kernel with an rtmutex based variant. No functional change when CONFIG_PREEMPT_RT=n Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.081517417@linutronix.de --- include/linux/mutex.h | 66 +++++++++++++++++++---- kernel/locking/mutex.c | 4 +- kernel/locking/rtmutex_api.c | 122 +++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 11 ++-- 4 files changed, 187 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 0bbc872ba72b..8f226d460f51 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -20,6 +20,18 @@ #include #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_SLEEP, \ + } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifndef CONFIG_PREEMPT_RT + /* * Simple, straightforward mutexes with strict semantics: * @@ -93,16 +105,6 @@ do { \ __mutex_init((mutex), #mutex, &__key); \ } while (0) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { \ - .name = #lockname, \ - .wait_type_inner = LD_WAIT_SLEEP, \ - } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -124,6 +126,50 @@ extern void __mutex_init(struct mutex *lock, const char *name, */ extern bool mutex_is_locked(struct mutex *lock); +#else /* !CONFIG_PREEMPT_RT */ +/* + * Preempt-RT variant based on rtmutexes. + */ +#include + +struct mutex { + struct rt_mutex_base rtmutex; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __MUTEX_INITIALIZER(mutexname) \ +{ \ + .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ +} + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void __mutex_rt_init(struct mutex *lock, const char *name, + struct lock_class_key *key); +extern int mutex_trylock(struct mutex *lock); + +static inline void mutex_destroy(struct mutex *lock) { } + +#define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) + +#define __mutex_init(mutex, name, key) \ +do { \ + rt_mutex_base_init(&(mutex)->rtmutex); \ + __mutex_rt_init((mutex), name, key); \ +} while (0) + +#define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key); \ +} while (0) +#endif /* CONFIG_PREEMPT_RT */ + /* * See kernel/locking/mutex.c for detailed documentation of these APIs. * Also see Documentation/locking/mutex-design.rst. diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9906ca6cc912..3a65bf4bacfd 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -30,6 +30,7 @@ #include #include +#ifndef CONFIG_PREEMPT_RT #include "mutex.h" #ifdef CONFIG_DEBUG_MUTEXES @@ -1066,7 +1067,8 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } EXPORT_SYMBOL(ww_mutex_lock_interruptible); -#endif +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* !CONFIG_PREEMPT_RT */ /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 16126fcb55ef..92b7d289a8af 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -454,3 +454,125 @@ void rt_mutex_debug_task_free(struct task_struct *task) DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } #endif + +#ifdef CONFIG_PREEMPT_RT +/* Mutexes */ +void __mutex_rt_init(struct mutex *mutex, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(__mutex_rt_init); + +static __always_inline int __mutex_lock_common(struct mutex *lock, + unsigned int state, + unsigned int subclass, + struct lockdep_map *nest_lock, + unsigned long ip) +{ + int ret; + + might_sleep(); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, ip); + else + lock_acquired(&lock->dep_map, ip); + return ret; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_nested); + +void __sched _mutex_lock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_); +} +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + +int __sched mutex_lock_interruptible_nested(struct mutex *lock, + unsigned int subclass) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +int __sched mutex_lock_killable_nested(struct mutex *lock, + unsigned int subclass) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); + +void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) +{ + int token; + + might_sleep(); + + token = io_schedule_prepare(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io_nested); + +#else /* CONFIG_DEBUG_LOCK_ALLOC */ + +void __sched mutex_lock(struct mutex *lock) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock); + +int __sched mutex_lock_interruptible(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock_interruptible); + +int __sched mutex_lock_killable(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock_killable); + +void __sched mutex_lock_io(struct mutex *lock) +{ + int token = io_schedule_prepare(); + + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + io_schedule_finish(token); +} +EXPORT_SYMBOL(mutex_lock_io); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + +int __sched mutex_trylock(struct mutex *lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(mutex_trylock); + +void __sched mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->rtmutex); +} +EXPORT_SYMBOL(mutex_unlock); + +#endif /* CONFIG_PREEMPT_RT */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5ddd575159fb..e5cdf98f50c2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1235,7 +1235,7 @@ config PROVE_LOCKING depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT select LOCKDEP select DEBUG_SPINLOCK - select DEBUG_MUTEXES + select DEBUG_MUTEXES if !PREEMPT_RT select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_RWSEMS select DEBUG_WW_MUTEX_SLOWPATH @@ -1299,7 +1299,7 @@ config LOCK_STAT depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT select LOCKDEP select DEBUG_SPINLOCK - select DEBUG_MUTEXES + select DEBUG_MUTEXES if !PREEMPT_RT select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC default n @@ -1335,7 +1335,7 @@ config DEBUG_SPINLOCK config DEBUG_MUTEXES bool "Mutex debugging: basic checks" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT help This feature allows mutex semantics violations to be detected and reported. @@ -1345,7 +1345,8 @@ config DEBUG_WW_MUTEX_SLOWPATH depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT select DEBUG_LOCK_ALLOC select DEBUG_SPINLOCK - select DEBUG_MUTEXES + select DEBUG_MUTEXES if !PREEMPT_RT + select DEBUG_RT_MUTEXES if PREEMPT_RT help This feature enables slowpath testing for w/w mutex users by injecting additional -EDEADLK wound/backoff cases. Together with @@ -1368,7 +1369,7 @@ config DEBUG_LOCK_ALLOC bool "Lock debugging: detect incorrect freeing of live locks" depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT select DEBUG_SPINLOCK - select DEBUG_MUTEXES + select DEBUG_MUTEXES if !PREEMPT_RT select DEBUG_RT_MUTEXES if RT_MUTEXES select LOCKDEP help -- cgit v1.2.3-71-gd317 From dc7109aaa233d83b573f75763a9f1ae207042a53 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:04 +0200 Subject: futex: Validate waiter correctly in futex_proxy_trylock_atomic() The loop in futex_requeue() has a sanity check for the waiter, which is missing in futex_proxy_trylock_atomic(). In theory the key2 check is sufficient, but futexes are cursed so add it for completeness and paranoia sake. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.193767519@linutronix.de --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 21625cb3e865..a1f27fd55a23 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1879,6 +1879,13 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, if (!top_waiter) return 0; + /* + * Ensure that this is a waiter sitting in futex_wait_requeue_pi() + * and waiting on the 'waitqueue' futex which is always !PI. + */ + if (!top_waiter->rt_waiter || top_waiter->pi_state) + ret = -EINVAL; + /* Ensure we requeue to the expected futex. */ if (!match_futex(top_waiter->requeue_pi_key, key2)) return -EINVAL; -- cgit v1.2.3-71-gd317 From c363b7ed79253d5b53494197f6ae625cff64694f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:06 +0200 Subject: futex: Clean up stale comments The futex key reference mechanism is long gone. Clean up the stale comments which still mention it. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.249178312@linutronix.de --- kernel/futex.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a1f27fd55a23..bc5395e66e97 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1354,7 +1354,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * - 1 - acquired the lock; * - <0 - error * - * The hb->lock and futex_key refs shall be held by the caller. + * The hb->lock must be held by the caller. * * @exiting is only set when the return value is -EBUSY. If so, this holds * a refcount on the exiting task on return and the caller needs to drop it @@ -2621,8 +2621,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. - * Return with the hb lock held and a q.key reference on success, and unlocked - * with no q.key reference on failure. + * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; @@ -2700,8 +2699,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, current->timer_slack_ns); retry: /* - * Prepare to wait on uaddr. On success, holds hb lock and increments - * q.key refs. + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) @@ -2712,7 +2711,6 @@ retry: /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; - /* unqueue_me() drops q.key ref */ if (!unqueue_me(&q)) goto out; ret = -ETIMEDOUT; @@ -3205,8 +3203,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, q.requeue_pi_key = &key2; /* - * Prepare to wait on uaddr. On success, increments q.key (key1) ref - * count. + * Prepare to wait on uaddr. On success, it holds hb->lock and q + * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) @@ -3235,9 +3233,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * In order for us to be here, we know our q.key == key2, and since * we took the hb->lock above, we also know that futex_requeue() has * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. + * race with the atomic proxy lock acquisition by the requeue code. */ /* -- cgit v1.2.3-71-gd317 From f6f4ec00b57a2c950235435bff8e888daafad5af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:07 +0200 Subject: futex: Clarify futex_requeue() PI handling When requeuing to a PI futex, then the requeue code tries to trylock the PI futex on behalf of the topmost waiter on the inner 'waitqueue' futex. If that succeeds, then PI state has to be allocated in order to requeue further waiters to the PI futex. The comment and the code are confusing, as the PI state allocation uses lookup_pi_state(), which either attaches to an existing waiter or to the owner. As the PI futex was just acquired, there cannot be a waiter on the PI futex because the hash bucket lock is held. Clarify the comment and use attach_to_pi_owner() directly. As the task on which behalf the PI futex has been acquired is guaranteed to be alive and not exiting, this call must succeed. Add a WARN_ON() in case that fails. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.305142462@linutronix.de --- kernel/futex.c | 61 ++++++++++++++++++++++------------------------------------ 1 file changed, 23 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bc5395e66e97..c39d5f17946b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1299,27 +1299,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, return 0; } -static int lookup_pi_state(u32 __user *uaddr, u32 uval, - struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps, - struct task_struct **exiting) -{ - struct futex_q *top_waiter = futex_top_waiter(hb, key); - - /* - * If there is a waiter on that futex, validate it and - * attach to the pi_state when the validation succeeds. - */ - if (top_waiter) - return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); - - /* - * We are the first waiter - try to look up the owner based on - * @uval and attach to it. - */ - return attach_to_pi_owner(uaddr, uval, key, ps, exiting); -} - static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { int err; @@ -2038,8 +2017,8 @@ retry_private: * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, ret contains the - * vpid of the top waiter task. + * reference to it. If the lock was taken, @ret contains the + * VPID of the top waiter task. * If the lock was not taken, we have pi_state and an initial * refcount on it. In case of an error we have nothing. */ @@ -2047,19 +2026,25 @@ retry_private: WARN_ON(pi_state); task_count++; /* - * If we acquired the lock, then the user space value - * of uaddr2 should be vpid. It cannot be changed by - * the top waiter as it is blocked on hb2 lock if it - * tries to do so. If something fiddled with it behind - * our back the pi state lookup might unearth it. So - * we rather use the known value than rereading and - * handing potential crap to lookup_pi_state. + * If futex_proxy_trylock_atomic() acquired the + * user space futex, then the user space value + * @uaddr2 has been set to the @hb1's top waiter + * task VPID. This task is guaranteed to be alive + * and cannot be exiting because it is either + * sleeping or blocked on @hb2 lock. + * + * The @uaddr2 futex cannot have waiters either as + * otherwise futex_proxy_trylock_atomic() would not + * have succeeded. * - * If that call succeeds then we have pi_state and an - * initial refcount on it. + * In order to requeue waiters to @hb2, pi state is + * required. Hand in the VPID value (@ret) and + * allocate PI state with an initial refcount on + * it. */ - ret = lookup_pi_state(uaddr2, ret, hb2, &key2, - &pi_state, &exiting); + ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state, + &exiting); + WARN_ON(ret); } switch (ret) { @@ -2183,9 +2168,9 @@ retry_private: } /* - * We took an extra initial reference to the pi_state either - * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We - * need to drop it here again. + * We took an extra initial reference to the pi_state either in + * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need + * to drop it here again. */ put_pi_state(pi_state); @@ -2364,7 +2349,7 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * Modifying pi_state _before_ the user space value would leave the * pi_state in an inconsistent state when we fault here, because we * need to drop the locks to handle the fault. This might be observed - * in the PID check in lookup_pi_state. + * in the PID checks when attaching to PI state . */ retry: if (!argowner) { -- cgit v1.2.3-71-gd317 From 8e74633dcefb280f2cefb49b7201d99650243d96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:09 +0200 Subject: futex: Remove bogus condition for requeue PI For requeue PI it's required to establish PI state for the PI futex to which waiters are requeued. This either acquires the user space futex on behalf of the top most waiter on the inner 'waitqueue' futex, or attaches to the PI state of an existing waiter, or creates on attached to the owner of the futex. This code can retry in case of failure, but retry can never happen when the pi state was successfully created. The condition to run this code is: (task_count - nr_wake) < nr_requeue which is always true because: task_count = 0 nr_wake = 1 nr_requeue >= 0 Remove it completely. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.362730187@linutronix.de --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c39d5f17946b..8ddc87cae25c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2000,7 +2000,7 @@ retry_private: } } - if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + if (requeue_pi) { struct task_struct *exiting = NULL; /* -- cgit v1.2.3-71-gd317 From 59c7ecf1544e1841b5be8847e81bc9842f838e7e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:10 +0200 Subject: futex: Correct the number of requeued waiters for PI The accounting is wrong when either the PI sanity check or the requeue PI operation fails. Adjust it in the failure path. Will be simplified in the next step. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.416427548@linutronix.de --- kernel/futex.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8ddc87cae25c..543974285676 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2116,6 +2116,8 @@ retry_private: /* Ensure we requeue to the expected futex for requeue_pi. */ if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { + /* Don't account for it */ + task_count--; ret = -EINVAL; break; } @@ -2157,6 +2159,8 @@ retry_private: */ this->pi_state = NULL; put_pi_state(pi_state); + /* Don't account for it */ + task_count--; /* * We stop queueing more waiters and let user * space deal with the mess. -- cgit v1.2.3-71-gd317 From 64b7b715f7f92ae3233446b4a4cdda3524fcd4b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:12 +0200 Subject: futex: Restructure futex_requeue() No point in taking two more 'requeue_pi' conditionals just to get to the requeue. Same for the requeue_pi case just the other way round. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.468835790@linutronix.de --- kernel/futex.c | 90 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 41 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 543974285676..6cb6b5db3e24 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2104,20 +2104,17 @@ retry_private: break; } - /* - * Wake nr_wake waiters. For requeue_pi, if we acquired the - * lock, we already woke the top_waiter. If not, it will be - * woken by futex_unlock_pi(). - */ - if (++task_count <= nr_wake && !requeue_pi) { - mark_wake_futex(&wake_q, this); + /* Plain futexes just wake or requeue and are done */ + if (!requeue_pi) { + if (++task_count <= nr_wake) + mark_wake_futex(&wake_q, this); + else + requeue_futex(this, hb1, hb2, &key2); continue; } /* Ensure we requeue to the expected futex for requeue_pi. */ - if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { - /* Don't account for it */ - task_count--; + if (!match_futex(this->requeue_pi_key, &key2)) { ret = -EINVAL; break; } @@ -2125,50 +2122,45 @@ retry_private: /* * Requeue nr_requeue waiters and possibly one more in the case * of requeue_pi if we couldn't acquire the lock atomically. + * + * Prepare the waiter to take the rt_mutex. Take a refcount + * on the pi_state and store the pointer in the futex_q + * object of the waiter. */ - if (requeue_pi) { + get_pi_state(pi_state); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, this->task); + if (ret == 1) { /* - * Prepare the waiter to take the rt_mutex. Take a - * refcount on the pi_state and store the pointer in - * the futex_q object of the waiter. + * We got the lock. We do neither drop the refcount + * on pi_state nor clear this->pi_state because the + * waiter needs the pi_state for cleaning up the + * user space value. It will drop the refcount + * after doing so. */ - get_pi_state(pi_state); - this->pi_state = pi_state; - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); - if (ret == 1) { - /* - * We got the lock. We do neither drop the - * refcount on pi_state nor clear - * this->pi_state because the waiter needs the - * pi_state for cleaning up the user space - * value. It will drop the refcount after - * doing so. - */ - requeue_pi_wake_futex(this, &key2, hb2); - continue; - } else if (ret) { - /* - * rt_mutex_start_proxy_lock() detected a - * potential deadlock when we tried to queue - * that waiter. Drop the pi_state reference - * which we took above and remove the pointer - * to the state from the waiters futex_q - * object. - */ - this->pi_state = NULL; - put_pi_state(pi_state); - /* Don't account for it */ - task_count--; - /* - * We stop queueing more waiters and let user - * space deal with the mess. - */ - break; - } + requeue_pi_wake_futex(this, &key2, hb2); + task_count++; + continue; + } else if (ret) { + /* + * rt_mutex_start_proxy_lock() detected a potential + * deadlock when we tried to queue that waiter. + * Drop the pi_state reference which we took above + * and remove the pointer to the state from the + * waiters futex_q object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + /* + * We stop queueing more waiters and let user space + * deal with the mess. + */ + break; } + /* Waiter is queued, move it to hb2 */ requeue_futex(this, hb1, hb2, &key2); + task_count++; } /* -- cgit v1.2.3-71-gd317 From c18eaa3aca43688a3aee199d85ce4227686a29b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:14 +0200 Subject: futex: Clarify comment in futex_requeue() The comment about the restriction of the number of waiters to wake for the REQUEUE_PI case is confusing at best. Rewrite it. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.524990421@linutronix.de --- kernel/futex.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6cb6b5db3e24..8d8bad5e07bb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1939,15 +1939,27 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, */ if (refill_pi_state_cache()) return -ENOMEM; + /* - * requeue_pi must wake as many tasks as it can, up to nr_wake - * + nr_requeue, since it acquires the rt_mutex prior to - * returning to userspace, so as to not leave the rt_mutex with - * waiters and no owner. However, second and third wake-ups - * cannot be predicted as they involve race conditions with the - * first wake and a fault while looking up the pi_state. Both - * pthread_cond_signal() and pthread_cond_broadcast() should - * use nr_wake=1. + * futex_requeue() allows the caller to define the number + * of waiters to wake up via the @nr_wake argument. With + * REQUEUE_PI, waking up more than one waiter is creating + * more problems than it solves. Waking up a waiter makes + * only sense if the PI futex @uaddr2 is uncontended as + * this allows the requeue code to acquire the futex + * @uaddr2 before waking the waiter. The waiter can then + * return to user space without further action. A secondary + * wakeup would just make the futex_wait_requeue_pi() + * handling more complex, because that code would have to + * look up pi_state and do more or less all the handling + * which the requeue code has to do for the to be requeued + * waiters. So restrict the number of waiters to wake to + * one, and only wake it up when the PI futex is + * uncontended. Otherwise requeue it and let the unlock of + * the PI futex handle the wakeup. + * + * All REQUEUE_PI users, e.g. pthread_cond_signal() and + * pthread_cond_broadcast() must use nr_wake=1. */ if (nr_wake != 1) return -EINVAL; -- cgit v1.2.3-71-gd317 From d69cba5c719b0c551f6380ec5da4ed8c20a3815a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:15 +0200 Subject: futex: Reorder sanity checks in futex_requeue() No point in allocating memory when the input parameters are bogus. Validate all parameters before proceeding. Suggested-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.581789253@linutronix.de --- kernel/futex.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8d8bad5e07bb..a5232f62de75 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1933,13 +1933,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, if (uaddr1 == uaddr2) return -EINVAL; - /* - * requeue_pi requires a pi_state, try to allocate it now - * without any locks in case it fails. - */ - if (refill_pi_state_cache()) - return -ENOMEM; - /* * futex_requeue() allows the caller to define the number * of waiters to wake up via the @nr_wake argument. With @@ -1963,6 +1956,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, */ if (nr_wake != 1) return -EINVAL; + + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; } retry: -- cgit v1.2.3-71-gd317 From 6231acbd0802e76580c71ceb52c09646d42170fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:17 +0200 Subject: futex: Simplify handle_early_requeue_pi_wakeup() Move the futex key match out of handle_early_requeue_pi_wakeup() which allows to simplify that function. The upcoming state machine for requeue_pi() will make that go away. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.638938670@linutronix.de --- kernel/futex.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a5232f62de75..82660b9b837a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3070,27 +3070,22 @@ pi_faulted: } /** - * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex * @hb: the hash_bucket futex_q was original enqueued on * @q: the futex_q woken while waiting to be requeued - * @key2: the futex_key of the requeue target futex * @timeout: the timeout associated with the wait (NULL if none) * - * Detect if the task was woken on the initial futex as opposed to the requeue - * target futex. If so, determine if it was a timeout or a signal that caused - * the wakeup and return the appropriate error code to the caller. Must be - * called with the hb lock held. + * Determine the cause for the early wakeup. * * Return: - * - 0 = no early wakeup detected; - * - <0 = -ETIMEDOUT or -ERESTARTNOINTR + * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, - struct futex_q *q, union futex_key *key2, + struct futex_q *q, struct hrtimer_sleeper *timeout) { - int ret = 0; + int ret; /* * With the hb lock held, we avoid races while we process the wakeup. @@ -3099,22 +3094,21 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * It can't be requeued from uaddr2 to something else since we don't * support a PI aware source futex for requeue. */ - if (!match_futex(&q->key, key2)) { - WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); - /* - * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. - */ - plist_del(&q->list, &hb->chain); - hb_waiters_dec(hb); + WARN_ON_ONCE(&hb->lock != q->lock_ptr); - /* Handle spurious wakeups gracefully */ - ret = -EWOULDBLOCK; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - else if (signal_pending(current)) - ret = -ERESTARTNOINTR; - } + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); + + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else if (signal_pending(current)) + ret = -ERESTARTNOINTR; return ret; } @@ -3217,7 +3211,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, futex_wait_queue_me(hb, &q, to); spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + /* Is @q still queued on uaddr1? */ + if (!match_futex(&q->key, key2)) + ret = handle_early_requeue_pi_wakeup(hb, &q, to); spin_unlock(&hb->lock); if (ret) goto out; -- cgit v1.2.3-71-gd317 From 07d91ef510fb16a2e0ca7453222105835b7ba3b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:18 +0200 Subject: futex: Prevent requeue_pi() lock nesting issue on RT The requeue_pi() operation on RT kernels creates a problem versus the task::pi_blocked_on state when a waiter is woken early (signal, timeout) and that early wake up interleaves with the requeue_pi() operation. When the requeue manages to block the waiter on the rtmutex which is associated to the second futex, then a concurrent early wakeup of that waiter faces the problem that it has to acquire the hash bucket spinlock, which is not an issue on non-RT kernels, but on RT kernels spinlocks are substituted by 'sleeping' spinlocks based on rtmutex. If the hash bucket lock is contended then blocking on that spinlock would result in a impossible situation: blocking on two locks at the same time (the hash bucket lock and the rtmutex representing the PI futex). It was considered to make the hash bucket locks raw_spinlocks, but especially requeue operations with a large amount of waiters can introduce significant latencies, so that's not an option for RT. The RT tree carried a solution which (ab)used task::pi_blocked_on to store the information about an ongoing requeue and an early wakeup which worked, but required to add checks for these special states all over the place. The distangling of an early wakeup of a waiter for a requeue_pi() operation is already looking at quite some different states and the task::pi_blocked_on magic just expanded that to a hard to understand 'state machine'. This can be avoided by keeping track of the waiter/requeue state in the futex_q object itself. Add a requeue_state field to struct futex_q with the following possible states: Q_REQUEUE_PI_NONE Q_REQUEUE_PI_IGNORE Q_REQUEUE_PI_IN_PROGRESS Q_REQUEUE_PI_WAIT Q_REQUEUE_PI_DONE Q_REQUEUE_PI_LOCKED The waiter starts with state = NONE and the following state transitions are valid: On the waiter side: Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT On the requeue side: Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this signals that the waiter is already on the way out. It also means that the waiter is still on the 'wait' futex, i.e. uaddr1. The waiter side signals early wakeup to the requeue side either through setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, which means the wakeup is interleaving with a requeue in progress it has to wait for the requeue side to change the state. Either to DONE/LOCKED or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by the requeue side when the requeue attempt failed via deadlock detection and therefore the waiter's futex_q is still on the uaddr1 futex. While this is not strictly required on !RT making this unconditional has the benefit of common code and it also allows the waiter to avoid taking the hash bucket lock on the way out in certain cases, which reduces contention. Add the required helpers required for the state transitions, invoke them at the right places and restructure the futex_wait_requeue_pi() code to handle the return from wait (early or not) based on the state machine values. On !RT enabled kernels the waiter spin waits for the state going from Q_REQUEUE_PI_WAIT to some other state, on RT enabled kernels this is handled by rcuwait_wait_event() and the corresponding wake up on the requeue side. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.693317658@linutronix.de --- kernel/futex.c | 308 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 259 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 82660b9b837a..e7b4c6121da4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -197,6 +197,8 @@ struct futex_pi_state { * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup + * @requeue_state: State field for futex_requeue_pi() + * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). @@ -219,12 +221,68 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; + atomic_t requeue_state; +#ifdef CONFIG_PREEMPT_RT + struct rcuwait requeue_wait; +#endif } __randomize_layout; +/* + * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an + * underlying rtmutex. The task which is about to be requeued could have + * just woken up (timeout, signal). After the wake up the task has to + * acquire hash bucket lock, which is held by the requeue code. As a task + * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking + * and the hash bucket lock blocking would collide and corrupt state. + * + * On !PREEMPT_RT this is not a problem and everything could be serialized + * on hash bucket lock, but aside of having the benefit of common code, + * this allows to avoid doing the requeue when the task is already on the + * way out and taking the hash bucket lock of the original uaddr1 when the + * requeue has been completed. + * + * The following state transitions are valid: + * + * On the waiter side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT + * + * On the requeue side: + * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED + * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) + * + * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this + * signals that the waiter is already on the way out. It also means that + * the waiter is still on the 'wait' futex, i.e. uaddr1. + * + * The waiter side signals early wakeup to the requeue side either through + * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending + * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately + * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, + * which means the wakeup is interleaving with a requeue in progress it has + * to wait for the requeue side to change the state. Either to DONE/LOCKED + * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex + * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by + * the requeue side when the requeue attempt failed via deadlock detection + * and therefore the waiter q is still on the uaddr1 futex. + */ +enum { + Q_REQUEUE_PI_NONE = 0, + Q_REQUEUE_PI_IGNORE, + Q_REQUEUE_PI_IN_PROGRESS, + Q_REQUEUE_PI_WAIT, + Q_REQUEUE_PI_DONE, + Q_REQUEUE_PI_LOCKED, +}; + static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ - .key = FUTEX_KEY_INIT, - .bitset = FUTEX_BITSET_MATCH_ANY + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY, + .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), }; /* @@ -1772,6 +1830,108 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, q->key = *key2; } +static inline bool futex_requeue_pi_prepare(struct futex_q *q, + struct futex_pi_state *pi_state) +{ + int old, new; + + /* + * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has + * already set Q_REQUEUE_PI_IGNORE to signal that requeue should + * ignore the waiter. + */ + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return false; + + /* + * futex_proxy_trylock_atomic() might have set it to + * IN_PROGRESS and a interleaved early wake to WAIT. + * + * It was considered to have an extra state for that + * trylock, but that would just add more conditionals + * all over the place for a dubious value. + */ + if (old != Q_REQUEUE_PI_NONE) + break; + + new = Q_REQUEUE_PI_IN_PROGRESS; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + q->pi_state = pi_state; + return true; +} + +static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + if (old == Q_REQUEUE_PI_IGNORE) + return; + + if (locked >= 0) { + /* Requeue succeeded. Set DONE or LOCKED */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && + old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_DONE + locked; + } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { + /* Deadlock, no early wakeup interleave */ + new = Q_REQUEUE_PI_NONE; + } else { + /* Deadlock, early wakeup interleave. */ + WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); + new = Q_REQUEUE_PI_IGNORE; + } + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + +#ifdef CONFIG_PREEMPT_RT + /* If the waiter interleaved with the requeue let it know */ + if (unlikely(old == Q_REQUEUE_PI_WAIT)) + rcuwait_wake_up(&q->requeue_wait); +#endif +} + +static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) +{ + int old, new; + + old = atomic_read_acquire(&q->requeue_state); + do { + /* Is requeue done already? */ + if (old >= Q_REQUEUE_PI_DONE) + return old; + + /* + * If not done, then tell the requeue code to either ignore + * the waiter or to wake it up once the requeue is done. + */ + new = Q_REQUEUE_PI_WAIT; + if (old == Q_REQUEUE_PI_NONE) + new = Q_REQUEUE_PI_IGNORE; + } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); + + /* If the requeue was in progress, wait for it to complete */ + if (old == Q_REQUEUE_PI_IN_PROGRESS) { +#ifdef CONFIG_PREEMPT_RT + rcuwait_wait_event(&q->requeue_wait, + atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, + TASK_UNINTERRUPTIBLE); +#else + (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); +#endif + } + + /* + * Requeue is now either prohibited or complete. Reread state + * because during the wait above it might have changed. Nothing + * will modify q->requeue_state after this point. + */ + return atomic_read(&q->requeue_state); +} + /** * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * @q: the futex_q @@ -1799,6 +1959,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->lock_ptr = &hb->lock; + /* Signal locked state to the waiter */ + futex_requeue_pi_complete(q, 1); wake_up_state(q->task, TASK_NORMAL); } @@ -1869,6 +2031,10 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, if (!match_futex(top_waiter->requeue_pi_key, key2)) return -EINVAL; + /* Ensure that this does not race against an early wakeup */ + if (!futex_requeue_pi_prepare(top_waiter, NULL)) + return -EAGAIN; + /* * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in * the contended case or if set_waiters is 1. The pi_state is returned @@ -1878,8 +2044,22 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { + /* Dequeue, wake up and update top_waiter::requeue_state */ requeue_pi_wake_futex(top_waiter, key2, hb2); return vpid; + } else if (ret < 0) { + /* Rewind top_waiter::requeue_state */ + futex_requeue_pi_complete(top_waiter, ret); + } else { + /* + * futex_lock_pi_atomic() did not acquire the user space + * futex, but managed to establish the proxy lock and pi + * state. top_waiter::requeue_state cannot be fixed up here + * because the waiter is not enqueued on the rtmutex + * yet. This is handled at the callsite depending on the + * result of rt_mutex_start_proxy_lock() which is + * guaranteed to be reached with this function returning 0. + */ } return ret; } @@ -2020,6 +2200,8 @@ retry_private: * intend to requeue waiters, force setting the FUTEX_WAITERS * bit. We force this here where we are able to easily handle * faults rather in the requeue loop below. + * + * Updates topwaiter::requeue_state if a top waiter exists. */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, &key2, &pi_state, @@ -2033,6 +2215,24 @@ retry_private: * VPID of the top waiter task. * If the lock was not taken, we have pi_state and an initial * refcount on it. In case of an error we have nothing. + * + * The top waiter's requeue_state is up to date: + * + * - If the lock was acquired atomically (ret > 0), then + * the state is Q_REQUEUE_PI_LOCKED. + * + * - If the trylock failed with an error (ret < 0) then + * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing + * happened", or Q_REQUEUE_PI_IGNORE when there was an + * interleaved early wakeup. + * + * - If the trylock did not succeed (ret == 0) then the + * state is either Q_REQUEUE_PI_IN_PROGRESS or + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. + * This will be cleaned up in the loop below, which + * cannot fail because futex_proxy_trylock_atomic() did + * the same sanity checks for requeue_pi as the loop + * below does. */ if (ret > 0) { WARN_ON(pi_state); @@ -2064,7 +2264,10 @@ retry_private: /* We hold a reference on the pi state. */ break; - /* If the above failed, then pi_state is NULL */ + /* + * If the above failed, then pi_state is NULL and + * waiter::requeue_state is correct. + */ case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -2140,21 +2343,39 @@ retry_private: * object of the waiter. */ get_pi_state(pi_state); - this->pi_state = pi_state; + + /* Don't requeue when the waiter is already on the way out. */ + if (!futex_requeue_pi_prepare(this, pi_state)) { + /* + * Early woken waiter signaled that it is on the + * way out. Drop the pi_state reference and try the + * next waiter. @this->pi_state is still NULL. + */ + put_pi_state(pi_state); + continue; + } + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, this->task); + this->rt_waiter, + this->task); + if (ret == 1) { /* * We got the lock. We do neither drop the refcount * on pi_state nor clear this->pi_state because the * waiter needs the pi_state for cleaning up the * user space value. It will drop the refcount - * after doing so. + * after doing so. this::requeue_state is updated + * in the wakeup as well. */ requeue_pi_wake_futex(this, &key2, hb2); task_count++; - continue; - } else if (ret) { + } else if (!ret) { + /* Waiter is queued, move it to hb2 */ + requeue_futex(this, hb1, hb2, &key2); + futex_requeue_pi_complete(this, 0); + task_count++; + } else { /* * rt_mutex_start_proxy_lock() detected a potential * deadlock when we tried to queue that waiter. @@ -2164,15 +2385,13 @@ retry_private: */ this->pi_state = NULL; put_pi_state(pi_state); + futex_requeue_pi_complete(this, ret); /* * We stop queueing more waiters and let user space * deal with the mess. */ break; } - /* Waiter is queued, move it to hb2 */ - requeue_futex(this, hb1, hb2, &key2); - task_count++; } /* @@ -3161,6 +3380,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; + struct rt_mutex_base *pi_mutex; int res, ret; if (!IS_ENABLED(CONFIG_FUTEX_PI)) @@ -3210,32 +3430,22 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); - spin_lock(&hb->lock); - /* Is @q still queued on uaddr1? */ - if (!match_futex(&q->key, key2)) + switch (futex_requeue_pi_wakeup_sync(&q)) { + case Q_REQUEUE_PI_IGNORE: + /* The waiter is still on uaddr1 */ + spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, to); - spin_unlock(&hb->lock); - if (ret) - goto out; - - /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. - */ + spin_unlock(&hb->lock); + break; - /* - * Check if the requeue code acquired the second futex for us and do - * any pertinent fixup. - */ - if (!q.rt_waiter) { + case Q_REQUEUE_PI_LOCKED: + /* The requeue acquired the lock */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_owner(uaddr2, &q, true); /* - * Drop the reference to the pi state which - * the requeue_pi() code acquired for us. + * Drop the reference to the pi state which the + * requeue_pi() code acquired for us. */ put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); @@ -3245,18 +3455,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = ret < 0 ? ret : 0; } - } else { - struct rt_mutex_base *pi_mutex; + break; - /* - * We have been woken up by futex_unlock_pi(), a timeout, or a - * signal. futex_unlock_pi() will not destroy the lock_ptr nor - * the pi_state. - */ - WARN_ON(!q.pi_state); + case Q_REQUEUE_PI_DONE: + /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + /* Current is not longer pi_blocked_on */ spin_lock(q.lock_ptr); if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; @@ -3276,17 +3482,21 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, unqueue_me_pi(&q); spin_unlock(q.lock_ptr); - } - if (ret == -EINTR) { - /* - * We've already been requeued, but cannot restart by calling - * futex_lock_pi() directly. We could restart this syscall, but - * it would detect that the user space "val" changed and return - * -EWOULDBLOCK. Save the overhead of the restart and return - * -EWOULDBLOCK directly. - */ - ret = -EWOULDBLOCK; + if (ret == -EINTR) { + /* + * We've already been requeued, but cannot restart + * by calling futex_lock_pi() directly. We could + * restart this syscall, but it would detect that + * the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart + * and return -EWOULDBLOCK directly. + */ + ret = -EWOULDBLOCK; + } + break; + default: + BUG(); } out: -- cgit v1.2.3-71-gd317 From 51711e825a6d1b2fe7ca46bb06d08c25d97656ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:20 +0200 Subject: locking/rtmutex: Prevent lockdep false positive with PI futexes On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping' and rtmutex based. That causes a lockdep false positive because some of the futex functions invoke spin_unlock(&hb->lock) with the wait_lock of the rtmutex associated to the pi_futex held. spin_unlock() in turn takes wait_lock of the rtmutex on which the spinlock is based which makes lockdep notice a lock recursion. Give the futex/rtmutex wait_lock a separate key. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.750701219@linutronix.de --- kernel/locking/rtmutex_api.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 92b7d289a8af..5c9299aaabae 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -214,7 +214,19 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, struct task_struct *proxy_owner) { + static struct lock_class_key pi_futex_key; + __rt_mutex_base_init(lock); + /* + * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping' + * and rtmutex based. That causes a lockdep false positive, because + * some of the futex functions invoke spin_unlock(&hb->lock) with + * the wait_lock of the rtmutex associated to the pi_futex held. + * spin_unlock() in turn takes wait_lock of the rtmutex on which + * the spinlock is based, which makes lockdep notice a lock + * recursion. Give the futex/rtmutex wait_lock a separate key. + */ + lockdep_set_class(&lock->wait_lock, &pi_futex_key); rt_mutex_set_owner(lock, proxy_owner); } -- cgit v1.2.3-71-gd317 From 48eb3f4fcfd35495a8357459aa6fe437aa430b00 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Sun, 15 Aug 2021 23:29:23 +0200 Subject: locking/rtmutex: Implement equal priority lock stealing The current logic only allows lock stealing to occur if the current task is of higher priority than the pending owner. Significant throughput improvements can be gained by allowing the lock stealing to include tasks of equal priority when the contended lock is a spin_lock or a rw_lock and the tasks are not in a RT scheduling task. The assumption was that the system will make faster progress by allowing the task already on the CPU to take the lock rather than waiting for the system to wake up a different task. This does add a degree of unfairness, but in reality no negative side effects have been observed in the many years that this has been used in the RT kernel. [ tglx: Refactored and rewritten several times by Steve Rostedt, Sebastian Siewior and myself ] Signed-off-by: Gregory Haskins Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.857240222@linutronix.de --- kernel/locking/rtmutex.c | 52 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index af7e3af4d313..3eaf636606fd 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -338,6 +338,26 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, return 1; } +static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, + struct rt_mutex_waiter *top_waiter) +{ + if (rt_mutex_waiter_less(waiter, top_waiter)) + return true; + +#ifdef RT_MUTEX_BUILD_SPINLOCKS + /* + * Note that RT tasks are excluded from same priority (lateral) + * steals to prevent the introduction of an unbounded latency. + */ + if (rt_prio(waiter->prio) || dl_prio(waiter->prio)) + return false; + + return rt_mutex_waiter_equal(waiter, top_waiter); +#else + return false; +#endif +} + #define __node_2_waiter(node) \ rb_entry((node), struct rt_mutex_waiter, tree_entry) @@ -932,19 +952,21 @@ try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, * trylock attempt. */ if (waiter) { - /* - * If waiter is not the highest priority waiter of - * @lock, give up. - */ - if (waiter != rt_mutex_top_waiter(lock)) - return 0; + struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); /* - * We can acquire the lock. Remove the waiter from the - * lock waiters tree. + * If waiter is the highest priority waiter of @lock, + * or allowed to steal it, take it over. */ - rt_mutex_dequeue(lock, waiter); - + if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) { + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters tree. + */ + rt_mutex_dequeue(lock, waiter); + } else { + return 0; + } } else { /* * If the lock has waiters already we check whether @task is @@ -955,13 +977,9 @@ try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, * not need to be dequeued. */ if (rt_mutex_has_waiters(lock)) { - /* - * If @task->prio is greater than or equal to - * the top waiter priority (kernel view), - * @task lost. - */ - if (!rt_mutex_waiter_less(task_to_waiter(task), - rt_mutex_top_waiter(lock))) + /* Check whether the trylock can steal it. */ + if (!rt_mutex_steal(task_to_waiter(task), + rt_mutex_top_waiter(lock))) return 0; /* -- cgit v1.2.3-71-gd317 From 992caf7f17243d736fc996770bac6566103778f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 15 Aug 2021 23:29:25 +0200 Subject: locking/rtmutex: Add adaptive spinwait mechanism Going to sleep when locks are contended can be quite inefficient when the contention time is short and the lock owner is running on a different CPU. The MCS mechanism cannot be used because MCS is strictly FIFO ordered while for rtmutex based locks the waiter ordering is priority based. Provide a simple adaptive spinwait mechanism which currently restricts the spinning to the top priority waiter. [ tglx: Provide a contemporary changelog, extended it to all rtmutex based locks and updated it to match the other spin on owner implementations ] Originally-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.912050691@linutronix.de --- kernel/locking/rtmutex.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3eaf636606fd..8aaa352d0c17 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,6 +8,11 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt * * See Documentation/locking/rt-mutex-design.rst for details. */ @@ -1297,6 +1302,52 @@ static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) rt_mutex_slowunlock(lock); } +#ifdef CONFIG_SMP +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) +{ + bool res = true; + + rcu_read_lock(); + for (;;) { + /* If owner changed, trylock again. */ + if (owner != rt_mutex_owner(lock)) + break; + /* + * Ensure that @owner is dereferenced after checking that + * the lock owner still matches @owner. If that fails, + * @owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + /* + * Stop spinning when: + * - the lock owner has been scheduled out + * - current is not longer the top waiter + * - current is requested to reschedule (redundant + * for CONFIG_PREEMPT_RCU=y) + * - the VCPU on which owner runs is preempted + */ + if (!owner->on_cpu || waiter != rt_mutex_top_waiter(lock) || + need_resched() || vcpu_is_preempted(task_cpu(owner))) { + res = false; + break; + } + cpu_relax(); + } + rcu_read_unlock(); + return res; +} +#else +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) +{ + return false; +} +#endif + #ifdef RT_MUTEX_BUILD_MUTEX /* * Functions required for: @@ -1381,6 +1432,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); + struct task_struct *owner; int ret = 0; for (;;) { @@ -1403,9 +1455,14 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, break; } + if (waiter == rt_mutex_top_waiter(lock)) + owner = rt_mutex_owner(lock); + else + owner = NULL; raw_spin_unlock_irq(&lock->wait_lock); - schedule(); + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + schedule(); raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1561,6 +1618,7 @@ static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) { struct rt_mutex_waiter waiter; + struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); @@ -1579,9 +1637,14 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) if (try_to_take_rt_mutex(lock, current, &waiter)) break; + if (&waiter == rt_mutex_top_waiter(lock)) + owner = rt_mutex_owner(lock); + else + owner = NULL; raw_spin_unlock_irq(&lock->wait_lock); - schedule_rtlock(); + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + schedule_rtlock(); raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_RTLOCK_WAIT); -- cgit v1.2.3-71-gd317 From 31552385f8e9d0869117014bf8e55ba0497e3ec8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Aug 2021 23:29:27 +0200 Subject: locking/spinlock/rt: Prepare for RT local_lock Add the static and runtime initializer mechanics to support the RT variant of local_lock, which requires the lock type in the lockdep map to be set to LD_LOCK_PERCPU. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210815211305.967526724@linutronix.de --- include/linux/spinlock_rt.h | 24 ++++++++++++++++-------- include/linux/spinlock_types.h | 6 ++++++ include/linux/spinlock_types_raw.h | 8 ++++++++ kernel/locking/spinlock_rt.c | 7 +++++-- 4 files changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 4fc72199cc9d..835aedaf68ac 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -8,20 +8,28 @@ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void __rt_spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key); + struct lock_class_key *key, bool percpu); #else static inline void __rt_spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, bool percpu) { } #endif -#define spin_lock_init(slock) \ -do { \ - static struct lock_class_key __key; \ - \ - rt_mutex_base_init(&(slock)->lock); \ - __rt_spin_lock_init(slock, #slock, &__key); \ +#define spin_lock_init(slock) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_base_init(&(slock)->lock); \ + __rt_spin_lock_init(slock, #slock, &__key, false); \ +} while (0) + +#define local_spin_lock_init(slock) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_base_init(&(slock)->lock); \ + __rt_spin_lock_init(slock, #slock, &__key, true); \ } while (0) extern void rt_spin_lock(spinlock_t *lock); diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 8a9aadbaf293..2dfa35ffec76 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -60,6 +60,12 @@ typedef struct spinlock { SPIN_DEP_MAP_INIT(name) \ } +#define __LOCAL_SPIN_LOCK_UNLOCKED(name) \ + { \ + .lock = __RT_MUTEX_BASE_INITIALIZER(name.lock), \ + LOCAL_SPIN_DEP_MAP_INIT(name) \ + } + #define DEFINE_SPINLOCK(name) \ spinlock_t name = __SPIN_LOCK_UNLOCKED(name) diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h index a8a4330d7b78..91cb36b65a17 100644 --- a/include/linux/spinlock_types_raw.h +++ b/include/linux/spinlock_types_raw.h @@ -37,9 +37,17 @@ typedef struct raw_spinlock { .name = #lockname, \ .wait_type_inner = LD_WAIT_CONFIG, \ } + +# define LOCAL_SPIN_DEP_MAP_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_CONFIG, \ + .lock_type = LD_LOCK_PERCPU, \ + } #else # define RAW_SPIN_DEP_MAP_INIT(lockname) # define SPIN_DEP_MAP_INIT(lockname) +# define LOCAL_SPIN_DEP_MAP_INIT(lockname) #endif #ifdef CONFIG_DEBUG_SPINLOCK diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index c36648bd765d..d2912e44d61f 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -120,10 +120,13 @@ EXPORT_SYMBOL(rt_spin_trylock_bh); #ifdef CONFIG_DEBUG_LOCK_ALLOC void __rt_spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, bool percpu) { + u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL; + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG); + lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG, + LD_WAIT_INV, type); } EXPORT_SYMBOL(__rt_spin_lock_init); #endif -- cgit v1.2.3-71-gd317 From f97a4a1a3f8769e3452885967955e21c88f3f263 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:34 +0800 Subject: workqueue: Rename "delayed" (delayed by active management) to "inactive" There are two kinds of "delayed" work items in workqueue subsystem. One is for timer-delayed work items which are visible to workqueue users. The other kind is for work items delayed by active management which can not be directly visible to workqueue users. We mixed the word "delayed" for both kinds and caused somewhat ambiguity. This patch renames the later one (delayed by active management) to "inactive", because it is used for workqueue active management and most of its related symbols are named with "active" or "activate". All "delayed" and "DELAYED" are carefully checked and renamed one by one to avoid accidentally changing the name of the other kind for timer-delayed. No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 58 +++++++++++++++++++++++------------------------ 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5fcf3d048a5a..3d4edd072a9b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -29,7 +29,7 @@ void delayed_work_timer_fn(struct timer_list *t); enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ - WORK_STRUCT_DELAYED_BIT = 1, /* work item is delayed */ + WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -42,7 +42,7 @@ enum { WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, - WORK_STRUCT_DELAYED = 1 << WORK_STRUCT_DELAYED_BIT, + WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9bce39dba297..9a00ba096032 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -207,7 +207,7 @@ struct pool_workqueue { /* L: nr of in_flight works */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ - struct list_head delayed_works; /* L: delayed works */ + struct list_head inactive_works; /* L: inactive works */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -1136,7 +1136,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) } } -static void pwq_activate_delayed_work(struct work_struct *work) +static void pwq_activate_inactive_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -1144,16 +1144,16 @@ static void pwq_activate_delayed_work(struct work_struct *work) if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); pwq->nr_active++; } -static void pwq_activate_first_delayed(struct pool_workqueue *pwq) +static void pwq_activate_first_inactive(struct pool_workqueue *pwq) { - struct work_struct *work = list_first_entry(&pwq->delayed_works, + struct work_struct *work = list_first_entry(&pwq->inactive_works, struct work_struct, entry); - pwq_activate_delayed_work(work); + pwq_activate_inactive_work(work); } /** @@ -1176,10 +1176,10 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) pwq->nr_in_flight[color]--; pwq->nr_active--; - if (!list_empty(&pwq->delayed_works)) { - /* one down, submit a delayed one */ + if (!list_empty(&pwq->inactive_works)) { + /* one down, submit an inactive one */ if (pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + pwq_activate_first_inactive(pwq); } /* is flush in progress and are we at the flushing tip? */ @@ -1281,14 +1281,14 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, debug_work_deactivate(work); /* - * A delayed work item cannot be grabbed directly because + * An inactive work item cannot be grabbed directly because * it might have linked NO_COLOR work items which, if left - * on the delayed_list, will confuse pwq->nr_active + * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. */ - if (*work_data_bits(work) & WORK_STRUCT_DELAYED) - pwq_activate_delayed_work(work); + if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) + pwq_activate_inactive_work(work); list_del_init(&work->entry); pwq_dec_nr_in_flight(pwq, get_work_color(work)); @@ -1490,8 +1490,8 @@ retry: if (list_empty(worklist)) pwq->pool->watchdog_ts = jiffies; } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &pwq->delayed_works; + work_flags |= WORK_STRUCT_INACTIVE; + worklist = &pwq->inactive_works; } debug_work_activate(work); @@ -2530,7 +2530,7 @@ repeat: /* * The above execution of rescued work items could * have created more to rescue through - * pwq_activate_first_delayed() or chained + * pwq_activate_first_inactive() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't @@ -2956,7 +2956,7 @@ reflush: bool drained; raw_spin_lock_irq(&pwq->pool->lock); - drained = !pwq->nr_active && list_empty(&pwq->delayed_works); + drained = !pwq->nr_active && list_empty(&pwq->inactive_works); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) @@ -3712,7 +3712,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * @pwq: target pool_workqueue * * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate delayed work items + * workqueue's saved_max_active and activate inactive work items * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ static void pwq_adjust_max_active(struct pool_workqueue *pwq) @@ -3741,9 +3741,9 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = wq->saved_max_active; - while (!list_empty(&pwq->delayed_works) && + while (!list_empty(&pwq->inactive_works) && pwq->nr_active < pwq->max_active) { - pwq_activate_first_delayed(pwq); + pwq_activate_first_inactive(pwq); kick = true; } @@ -3774,7 +3774,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; - INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); @@ -4361,7 +4361,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) return true; - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) return true; return false; @@ -4557,7 +4557,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - ret = !list_empty(&pwq->delayed_works); + ret = !list_empty(&pwq->inactive_works); preempt_enable(); rcu_read_unlock(); @@ -4753,11 +4753,11 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont("\n"); } - if (!list_empty(&pwq->delayed_works)) { + if (!list_empty(&pwq->inactive_works)) { bool comma = false; - pr_info(" delayed:"); - list_for_each_entry(work, &pwq->delayed_works, entry) { + pr_info(" inactive:"); + list_for_each_entry(work, &pwq->inactive_works, entry) { pr_cont_work(comma, work); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } @@ -4787,7 +4787,7 @@ void show_workqueue_state(void) bool idle = true; for_each_pwq(pwq, wq) { - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { idle = false; break; } @@ -4799,7 +4799,7 @@ void show_workqueue_state(void) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) show_pwq(pwq); raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* @@ -5182,7 +5182,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their delayed_works list instead of + * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: -- cgit v1.2.3-71-gd317 From c4560c2c88a4c809800ba8e76faabaf80bf6ee89 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:35 +0800 Subject: workqueue: Change arguement of pwq_dec_nr_in_flight() Make pwq_dec_nr_in_flight() use work_data rather just work_color. Prepare for later patch to get WORK_STRUCT_INACTIVE bit from work_data in pwq_dec_nr_in_flight(). No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a00ba096032..41796000f3eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -579,9 +579,9 @@ static unsigned int work_color_to_flags(int color) return color << WORK_STRUCT_COLOR_SHIFT; } -static int get_work_color(struct work_struct *work) +static int get_work_color(unsigned long work_data) { - return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } @@ -1159,7 +1159,7 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest - * @color: color of work which left the queue + * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. @@ -1167,8 +1167,10 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) * CONTEXT: * raw_spin_lock_irq(pool->lock). */ -static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) +static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { + int color = get_work_color(work_data); + /* uncolored work items don't participate in flushing or nr_active */ if (color == WORK_NO_COLOR) goto out_put; @@ -1291,7 +1293,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, pwq_activate_inactive_work(work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(pwq, get_work_color(work)); + pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); @@ -2172,7 +2174,7 @@ __acquires(&pool->lock) struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; - int work_color; + unsigned long work_data; struct worker *collision; #ifdef CONFIG_LOCKDEP /* @@ -2208,7 +2210,7 @@ __acquires(&pool->lock) worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; - work_color = get_work_color(work); + work_data = *work_data_bits(work); /* * Record wq name for cmdline and debug reporting, may get @@ -2314,7 +2316,7 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; - pwq_dec_nr_in_flight(pwq, work_color); + pwq_dec_nr_in_flight(pwq, work_data); } /** -- cgit v1.2.3-71-gd317 From d21cece0dbb424ad3ff9e49bde6954632b8efede Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:36 +0800 Subject: workqueue: Change the code of calculating work_flags in insert_wq_barrier() Add a local var @work_flags to calculate work_flags step by step, so that we don't need to squeeze several flags in only the last line of code. Parepare for next patch to add a bit to barrier work item's flag. Not squshing this to next patch makes it clear that what it will have changed. No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41796000f3eb..84fd2a8f56aa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2659,8 +2659,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { + unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR); struct list_head *head; - unsigned int linked = 0; /* * debugobject calls are safe here even with pool->lock locked @@ -2686,13 +2686,12 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, head = target->entry.next; /* there can already be other linked works, inherit and set */ - linked = *bits & WORK_STRUCT_LINKED; + work_flags |= *bits & WORK_STRUCT_LINKED; __set_bit(WORK_STRUCT_LINKED_BIT, bits); } debug_work_activate(&barr->work); - insert_work(pwq, &barr->work, head, - work_color_to_flags(WORK_NO_COLOR) | linked); + insert_work(pwq, &barr->work, head, work_flags); } /** -- cgit v1.2.3-71-gd317 From 018f3a13dd6300701103f268b6bfec0a56beea57 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:37 +0800 Subject: workqueue: Mark barrier work with WORK_STRUCT_INACTIVE Currently, WORK_NO_COLOR has two meanings: Not participate in flushing Not participate in nr_active And only non-barrier work items are marked with WORK_STRUCT_INACTIVE when they are in inactive_works list. The barrier work items are not marked INACTIVE even linked in inactive_works list since these tail items are always moved together with the head work item. These definitions are simple, clean and practical. (Except a small blemish that only the first meaning of WORK_NO_COLOR is documented in include/linux/workqueue.h while both meanings are in workqueue.c) But dual-purpose WORK_NO_COLOR used for barrier work items has proven to be problematical[1]. Only the second purpose is obligatory. So we plan to make barrier work items participate in flushing but keep them still not participating in nr_active. So the plan is to mark barrier work items inactive without using WORK_NO_COLOR in this patch so that we can assign a flushing color to them in next patch. The reasonable way is to add or reuse a bit in work data of the work item. But adding a bit will double the size of pool_workqueue. Currently, WORK_STRUCT_INACTIVE is only used in try_to_grab_pending() for user-queued work items and try_to_grab_pending() can't work for barrier work items. So we extend WORK_STRUCT_INACTIVE to also mark barrier work items no matter which list they are in because we don't need to determind which list a barrier work item is in. So the meaning of WORK_STRUCT_INACTIVE becomes just "the work items don't participate in nr_active" (no matter whether it is a barrier work item or a user-queued work item). And WORK_STRUCT_INACTIVE for user-queued work items means they are in inactive_works list. This patch does it by setting WORK_STRUCT_INACTIVE for barrier work items in insert_wq_barrier() and checking WORK_STRUCT_INACTIVE first in pwq_dec_nr_in_flight(). And the meaning of WORK_NO_COLOR is reduced to only "not participating in flushing". There is no functionality change intended in this patch. Because WORK_NO_COLOR+WORK_STRUCT_INACTIVE represents the previous WORK_NO_COLOR in meaning and try_to_grab_pending() doesn't use for barrier work items and avoids being confused by this extended WORK_STRUCT_INACTIVE. A bunch of comment for nr_active & WORK_STRUCT_INACTIVE is also added for documenting how WORK_STRUCT_INACTIVE works in nr_active management. [1]: https://lore.kernel.org/lkml/20210812083814.32453-1-lizhe.67@bytedance.com/ Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 84fd2a8f56aa..6657bcbd2064 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -205,6 +205,23 @@ struct pool_workqueue { int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ + + /* + * nr_active management and WORK_STRUCT_INACTIVE: + * + * When pwq->nr_active >= max_active, new work item is queued to + * pwq->inactive_works instead of pool->worklist and marked with + * WORK_STRUCT_INACTIVE. + * + * All work items marked with WORK_STRUCT_INACTIVE do not participate + * in pwq->nr_active and all work items in pwq->inactive_works are + * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE + * work items are in pwq->inactive_works. Some of them are ready to + * run in pool->worklist or worker->scheduled. Those work itmes are + * only struct wq_barrier which is used for flush_work() and should + * not participate in pwq->nr_active. For non-barrier work item, it + * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head inactive_works; /* L: inactive works */ @@ -1171,19 +1188,21 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ { int color = get_work_color(work_data); - /* uncolored work items don't participate in flushing or nr_active */ + if (!(work_data & WORK_STRUCT_INACTIVE)) { + pwq->nr_active--; + if (!list_empty(&pwq->inactive_works)) { + /* one down, submit an inactive one */ + if (pwq->nr_active < pwq->max_active) + pwq_activate_first_inactive(pwq); + } + } + + /* uncolored work items don't participate in flushing */ if (color == WORK_NO_COLOR) goto out_put; pwq->nr_in_flight[color]--; - pwq->nr_active--; - if (!list_empty(&pwq->inactive_works)) { - /* one down, submit an inactive one */ - if (pwq->nr_active < pwq->max_active) - pwq_activate_first_inactive(pwq); - } - /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; @@ -1283,6 +1302,10 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, debug_work_deactivate(work); /* + * A cancelable inactive work item must be in the + * pwq->inactive_works since a queued barrier can't be + * canceled (see the comments in insert_wq_barrier()). + * * An inactive work item cannot be grabbed directly because * it might have linked NO_COLOR work items which, if left * on the inactive_works list, will confuse pwq->nr_active @@ -2675,6 +2698,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, barr->task = current; + /* The barrier work item does not participate in pwq->nr_active. */ + work_flags |= WORK_STRUCT_INACTIVE; + /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. -- cgit v1.2.3-71-gd317 From d812796eb3906424cd3c0eea530983961e2f88bd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:38 +0800 Subject: workqueue: Assign a color to barrier work items There was no strong reason to or not to flush barrier work items in flush_workqueue(). And we have to make barrier work items not participate in nr_active so we had been using WORK_NO_COLOR for them which also makes them can't be flushed by flush_workqueue(). And the users of flush_workqueue() often do not intend to wait barrier work items issued by flush_work(). That made the choice sound perfect. But barrier work items have reference to internal structure (pool_workqueue) and the worker thread[s] is/are still busy for the workqueue user when the barrrier work items are not done. So it is reasonable to make flush_workqueue() also watch for flush_work() to make it more robust. And a problem[1] reported by Li Zhe shows that we need such robustness. The warning logs are listed below: WARNING: CPU: 0 PID: 19336 at kernel/workqueue.c:4430 destroy_workqueue+0x11a/0x2f0 ***** destroy_workqueue: test_workqueue9 has the following busy pwq pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=0/1 refcnt=2 in-flight: 5658:wq_barrier_func Showing busy workqueues and worker pools: ***** It shows that even after drain_workqueue() returns, the barrier work item is still in flight and the pwq (and a worker) is still busy on it. The problem is caused by flush_workqueue() not watching flush_work(): Thread A Worker /* normal work item with linked */ process_scheduled_works() destroy_workqueue() process_one_work() drain_workqueue() /* run normal work item */ /-- pwq_dec_nr_in_flight() flush_workqueue() <---/ /* the last normal work item is done */ sanity_check process_one_work() /-- raw_spin_unlock_irq(&pool->lock) raw_spin_lock_irq(&pool->lock) <-/ /* maybe preempt */ *WARNING* wq_barrier_func() /* maybe preempt by cond_resched() */ Thread A can get the pool lock after the Worker unlocks the pool lock before running wq_barrier_func(). And if there is any preemption happen around wq_barrier_func(), destroy_workqueue()'s sanity check is more likely to get the lock and catch it. (Note: preemption is not necessary to cause the bug, the unlocking is enough to possibly trigger the WARNING.) A simple solution might be just executing all linked barrier work items once without releasing pool lock after the head work item's pwq_dec_nr_in_flight(). But this solution has two problems: 1) the head work item might also be barrier work item when the user-queued work item is cancelled. For example: thread 1: thread 2: queue_work(wq, &my_work) flush_work(&my_work) cancel_work_sync(&my_work); /* Neiter my_work nor the barrier work is scheduled. */ destroy_workqueue(wq); /* This is an easier way to catch the WARNING. */ 2) there might be too much linked barrier work items and running them all once without releasing pool lock just causes trouble. The only solution is to make flush_workqueue() aslo watch barrier work items. So we have to assign a color to these barrier work items which is the color of the head (user-queued) work item. Assigning a color doesn't cause any problem in ative management, because the prvious patch made barrier work items not participate in nr_active via WORK_STRUCT_INACTIVE rather than reliance on the (old) WORK_NO_COLOR. [1]: https://lore.kernel.org/lkml/20210812083814.32453-1-lizhe.67@bytedance.com/ Reported-by: Li Zhe Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 20 ++++++++++++-------- kernel/workqueue_internal.h | 3 ++- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6657bcbd2064..33a6b4a2443d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1197,10 +1197,6 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ } } - /* uncolored work items don't participate in flushing */ - if (color == WORK_NO_COLOR) - goto out_put; - pwq->nr_in_flight[color]--; /* is flush in progress and are we at the flushing tip? */ @@ -1307,7 +1303,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * canceled (see the comments in insert_wq_barrier()). * * An inactive work item cannot be grabbed directly because - * it might have linked NO_COLOR work items which, if left + * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. @@ -2234,6 +2230,7 @@ __acquires(&pool->lock) worker->current_func = work->func; worker->current_pwq = pwq; work_data = *work_data_bits(work); + worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get @@ -2339,6 +2336,7 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; + worker->current_color = INT_MAX; pwq_dec_nr_in_flight(pwq, work_data); } @@ -2682,7 +2680,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { - unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR); + unsigned int work_flags = 0; + unsigned int work_color; struct list_head *head; /* @@ -2705,17 +2704,22 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. */ - if (worker) + if (worker) { head = worker->scheduled.next; - else { + work_color = worker->current_color; + } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ work_flags |= *bits & WORK_STRUCT_LINKED; + work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } + pwq->nr_in_flight[work_color]++; + work_flags |= work_color_to_flags(work_color); + debug_work_activate(&barr->work); insert_work(pwq, &barr->work, head, work_flags); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 498de0e909a4..e00b1204a8e9 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -30,7 +30,8 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ - struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + unsigned int current_color; /* L: current_work's color */ struct list_head scheduled; /* L: scheduled works */ /* 64 bytes boundary on 64bit, 32 on 32bit */ -- cgit v1.2.3-71-gd317 From 99c37d1a63eafcd3673302a7953df760b46d0f6f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:19 +0200 Subject: tracing: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Link: https://lkml.kernel.org/r/20210803141621.780504-37-bigeasy@linutronix.de Cc: Peter Zijlstra Cc: Ingo Molnar Acked-by: Daniel Bristot de Oliveira Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace_hwlat.c | 28 ++++++++++++++-------------- kernel/trace/trace_osnoise.c | 16 ++++++++-------- 3 files changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e592d1df6f88..c5a3fbf19617 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2111,7 +2111,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } } - get_online_cpus(); + cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2143,7 +2143,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - put_online_cpus(); + cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2171,7 +2171,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - get_online_cpus(); + cpus_read_lock(); /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) @@ -2183,7 +2183,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - put_online_cpus(); + cpus_read_unlock(); } out: diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 14f46aae1981..1b83d75eb103 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -325,10 +325,10 @@ static void move_to_next_cpu(void) if (!cpumask_equal(current_mask, current->cpus_ptr)) goto change_mode; - get_online_cpus(); + cpus_read_lock(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); - put_online_cpus(); + cpus_read_unlock(); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(current_mask); @@ -398,7 +398,7 @@ static void stop_single_kthread(void) struct hwlat_kthread_data *kdata = get_cpu_data(); struct task_struct *kthread; - get_online_cpus(); + cpus_read_lock(); kthread = kdata->kthread; if (!kthread) @@ -408,7 +408,7 @@ static void stop_single_kthread(void) kdata->kthread = NULL; out_put_cpus: - put_online_cpus(); + cpus_read_unlock(); } @@ -425,14 +425,14 @@ static int start_single_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; - get_online_cpus(); + cpus_read_lock(); if (kdata->kthread) goto out_put_cpus; kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - put_online_cpus(); + cpus_read_unlock(); return -ENOMEM; } @@ -452,7 +452,7 @@ static int start_single_kthread(struct trace_array *tr) wake_up_process(kthread); out_put_cpus: - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -479,10 +479,10 @@ static void stop_per_cpu_kthreads(void) { unsigned int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) stop_cpu_kthread(cpu); - put_online_cpus(); + cpus_read_unlock(); } /* @@ -515,7 +515,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) mutex_lock(&trace_types_lock); mutex_lock(&hwlat_data.lock); - get_online_cpus(); + cpus_read_lock(); if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; @@ -526,7 +526,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) start_cpu_kthread(cpu); out_unlock: - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&hwlat_data.lock); mutex_unlock(&trace_types_lock); } @@ -582,7 +582,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) unsigned int cpu; int retval; - get_online_cpus(); + cpus_read_lock(); /* * Run only on CPUs in which hwlat is allowed to run. */ @@ -596,12 +596,12 @@ static int start_per_cpu_kthreads(struct trace_array *tr) if (retval) goto out_error; } - put_online_cpus(); + cpus_read_unlock(); return 0; out_error: - put_online_cpus(); + cpus_read_unlock(); stop_per_cpu_kthreads(); return retval; } diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index b61eefe5ccf5..65b08b8e5bf8 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1498,12 +1498,12 @@ static void stop_per_cpu_kthreads(void) { int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) stop_kthread(cpu); - put_online_cpus(); + cpus_read_unlock(); } /* @@ -1551,7 +1551,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) int retval; int cpu; - get_online_cpus(); + cpus_read_lock(); /* * Run only on CPUs in which trace and osnoise are allowed to run. */ @@ -1572,7 +1572,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) } } - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -1590,7 +1590,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) goto out_unlock_trace; mutex_lock(&interface_lock); - get_online_cpus(); + cpus_read_lock(); if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) goto out_unlock; @@ -1601,7 +1601,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) start_kthread(cpu); out_unlock: - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&interface_lock); out_unlock_trace: mutex_unlock(&trace_types_lock); @@ -1743,11 +1743,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, /* * osnoise_cpumask is read by CPU hotplug operations. */ - get_online_cpus(); + cpus_read_lock(); cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&interface_lock); if (running) -- cgit v1.2.3-71-gd317 From 8cacfc85b615cc0bae01241593c4b25da6570efc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Aug 2021 18:08:42 +0100 Subject: bpf: Remove redundant initialization of variable allow The variable allow is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817170842.495440-1-colin.king@canonical.com --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a1dedba4c174..9f35928bab0a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1135,7 +1135,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, .major = major, .minor = minor, }; - int allow = 1; + int allow; rcu_read_lock(); cgrp = task_dfl_cgroup(current); -- cgit v1.2.3-71-gd317 From faf4ef823ac5f3b6a34a73b76c52895dee3dce55 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jun 2021 14:21:16 +0200 Subject: dma-direct: add support for dma_coherent_default_memory Add an option to allocate uncached memory for dma_alloc_coherent from the global dma_coherent_default_memory. This will allow to move arm-nommu (and eventually other platforms) to use generic code for allocating uncached memory from a pre-populated pool. Note that this is a different pool from the one that platforms that can remap at runtime use for GFP_ATOMIC allocations for now, although there might be opportunities to eventually end up with a common codebase for the two use cases. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- kernel/dma/Kconfig | 4 ++++ kernel/dma/direct.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 77b405508743..725cfd51762b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -93,6 +93,10 @@ config DMA_COHERENT_POOL select GENERIC_ALLOCATOR bool +config DMA_GLOBAL_POOL + select DMA_DECLARE_COHERENT + bool + config DMA_REMAP bool depends on MMU diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f33ceb68aef2..8dca4f97d12d 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -156,9 +156,14 @@ void *dma_direct_alloc(struct device *dev, size_t size, if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && !dev_is_dma_coherent(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + !dev_is_dma_coherent(dev)) + return dma_alloc_from_global_coherent(dev, size, dma_handle); + /* * Remapping or decrypting memory may block. If either is required and * we can't block, allocate the memory from the atomic pools. @@ -255,11 +260,19 @@ void dma_direct_free(struct device *dev, size_t size, if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && !dev_is_dma_coherent(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } + if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + !dev_is_dma_coherent(dev)) { + if (!dma_release_from_global_coherent(page_order, cpu_addr)) + WARN_ON_ONCE(1); + return; + } + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) @@ -462,6 +475,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; + if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret)) + return ret; if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) return -ENXIO; -- cgit v1.2.3-71-gd317 From 70d6aa0ecfed253a2b14659a6c77359af6d9b3ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 19:38:19 +0200 Subject: dma-mapping: allow using the global coherent pool for !ARM Switch an ifdef so that the global coherent pool is initialized for any architecture that selects the DMA_GLOBAL_POOL symbol insted of hardcoding ARM. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- kernel/dma/coherent.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 794e76b03b34..67b126afac5a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -361,7 +361,9 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem) pr_err("Reserved memory: regions without no-map are not yet supported\n"); return -EINVAL; } +#endif +#ifdef CONFIG_DMA_GLOBAL_POOL if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) { WARN(dma_reserved_default_memory, "Reserved memory: region for default DMA coherent area is redefined\n"); -- cgit v1.2.3-71-gd317 From a6933571f34a9aee843fff2aa4b96949c57d6274 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jun 2021 16:00:04 +0200 Subject: dma-mapping: simplify dma_init_coherent_memory Return the allocated dma_coherent_mem structure, set the use_dma_pfn_offset and print the failure warning inside of dma_init_coherent_memory instead of leaving that to the callers. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- kernel/dma/coherent.c | 78 ++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 67b126afac5a..ab397ebfd5ad 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -37,51 +37,44 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, return mem->device_base; } -static int dma_init_coherent_memory(phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, - struct dma_coherent_mem **mem) +static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset) { - struct dma_coherent_mem *dma_mem = NULL; - void *mem_base = NULL; + struct dma_coherent_mem *dma_mem; int pages = size >> PAGE_SHIFT; int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - int ret; + void *mem_base; - if (!size) { - ret = -EINVAL; - goto out; - } + if (!size) + return ERR_PTR(-EINVAL); mem_base = memremap(phys_addr, size, MEMREMAP_WC); - if (!mem_base) { - ret = -EINVAL; - goto out; - } + if (!mem_base) + return ERR_PTR(-EINVAL); + dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dma_mem) { - ret = -ENOMEM; - goto out; - } + if (!dma_mem) + goto out_unmap_membase; dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dma_mem->bitmap) { - ret = -ENOMEM; - goto out; - } + if (!dma_mem->bitmap) + goto out_free_dma_mem; dma_mem->virt_base = mem_base; dma_mem->device_base = device_addr; dma_mem->pfn_base = PFN_DOWN(phys_addr); dma_mem->size = pages; + dma_mem->use_dev_dma_pfn_offset = use_dma_pfn_offset; spin_lock_init(&dma_mem->spinlock); - *mem = dma_mem; - return 0; + return dma_mem; -out: +out_free_dma_mem: kfree(dma_mem); - if (mem_base) - memunmap(mem_base); - return ret; +out_unmap_membase: + memunmap(mem_base); + pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %zd MiB\n", + &phys_addr, size / SZ_1M); + return ERR_PTR(-ENOMEM); } static void dma_release_coherent_memory(struct dma_coherent_mem *mem) @@ -130,9 +123,9 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, struct dma_coherent_mem *mem; int ret; - ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem); - if (ret) - return ret; + mem = dma_init_coherent_memory(phys_addr, device_addr, size, false); + if (IS_ERR(mem)) + return PTR_ERR(mem); ret = dma_assign_coherent_memory(dev, mem); if (ret) @@ -319,21 +312,16 @@ static struct reserved_mem *dma_reserved_default_memory __initdata; static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { - struct dma_coherent_mem *mem = rmem->priv; - int ret; - - if (!mem) { - ret = dma_init_coherent_memory(rmem->base, rmem->base, - rmem->size, &mem); - if (ret) { - pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - return ret; - } + if (!rmem->priv) { + struct dma_coherent_mem *mem; + + mem = dma_init_coherent_memory(rmem->base, rmem->base, + rmem->size, true); + if (IS_ERR(mem)) + return PTR_ERR(mem); + rmem->priv = mem; } - mem->use_dev_dma_pfn_offset = true; - rmem->priv = mem; - dma_assign_coherent_memory(dev, mem); + dma_assign_coherent_memory(dev, rmem->priv); return 0; } -- cgit v1.2.3-71-gd317 From 39a2d3506b2d53c569a6db13d65b2f3728c4feec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jun 2021 16:05:05 +0200 Subject: dma-mapping: add a dma_init_global_coherent helper Add a new helper to initialize the global coherent pool. This both cleans up the existing initialization which indirects through the reserved_mem_ops that are normally only used for struct device, and also allows using the global pool for non-devicetree architectures. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- include/linux/dma-map-ops.h | 2 +- kernel/dma/coherent.c | 32 ++++++++++++++------------------ 2 files changed, 15 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 2f842498c448..068f1b11a6a4 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -177,7 +177,7 @@ void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, int dma_release_from_global_coherent(int order, void *vaddr); int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); - +int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); #else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index ab397ebfd5ad..160d4e246ecb 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -300,6 +300,18 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, vaddr, size, ret); } +int dma_init_global_coherent(phys_addr_t phys_addr, size_t size) +{ + struct dma_coherent_mem *mem; + + mem = dma_init_coherent_memory(phys_addr, phys_addr, size, true); + if (IS_ERR(mem)) + return PTR_ERR(mem); + dma_coherent_default_memory = mem; + pr_info("DMA: default coherent area is set\n"); + return 0; +} + /* * Support for reserved memory regions defined in device tree */ @@ -367,26 +379,10 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem) static int __init dma_init_reserved_memory(void) { - const struct reserved_mem_ops *ops; - int ret; - if (!dma_reserved_default_memory) return -ENOMEM; - - ops = dma_reserved_default_memory->ops; - - /* - * We rely on rmem_dma_device_init() does not propagate error of - * dma_assign_coherent_memory() for "NULL" device. - */ - ret = ops->device_init(dma_reserved_default_memory, NULL); - - if (!ret) { - dma_coherent_default_memory = dma_reserved_default_memory->priv; - pr_info("DMA: default coherent area is set\n"); - } - - return ret; + return dma_init_global_coherent(dma_reserved_default_memory->base, + dma_reserved_default_memory->size); } core_initcall(dma_init_reserved_memory); -- cgit v1.2.3-71-gd317 From 8b0e6c744fef6462382041b30878c91f15069fc6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:56 -0400 Subject: tracing: Add DYNAMIC flag for dynamic events To differentiate between static and dynamic events, add a new flag DYNAMIC to the event flags that all dynamic events have set. This will allow to differentiate when attaching to a dynamic event from a static event. Static events have a mod pointer that references the module they were created in (or NULL for core kernel). This can be incremented when the event has something attached to it. But there exists no such mechanism for dynamic events. This is dangerous as the dynamic events may now disappear without the "attachment" knowing that it no longer exists. To enforce the dynamic flag, change dyn_event_add() to pass the event that is being created such that it can set the DYNAMIC flag of the event. This helps make sure that no location that creates a dynamic event misses setting this flag. Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035026.936958254@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 3 +++ kernel/trace/trace_dynevent.h | 4 +++- kernel/trace/trace_events_synth.c | 2 +- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..53c9dffd87fd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -310,6 +310,7 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, }; @@ -321,6 +322,7 @@ enum { * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * TRACEPOINT - Event is a tracepoint + * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe */ @@ -330,6 +332,7 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), }; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 7754936b57ee..936477a111d3 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -76,13 +76,15 @@ int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops) return 0; } -static inline int dyn_event_add(struct dyn_event *ev) +static inline int dyn_event_add(struct dyn_event *ev, + struct trace_event_call *call) { lockdep_assert_held(&event_mutex); if (!ev || !ev->ops) return -EINVAL; + call->flags |= TRACE_EVENT_FL_DYNAMIC; list_add_tail(&ev->list, &dyn_event_list); return 0; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 9315fc03e303..f4f5489e1e28 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1298,7 +1298,7 @@ static int __create_synth_event(const char *name, const char *raw_fields) } ret = register_synth_event(event); if (!ret) - dyn_event_add(&event->devent); + dyn_event_add(&event->devent, &event->call); else free_synth_event(event); out: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ea6178cb5e33..bfef43bfce37 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -618,7 +618,7 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) if (ret) trace_probe_unlink(&tk->tp); else - dyn_event_add(&tk->devent); + dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); return ret; } @@ -661,7 +661,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (ret < 0) unregister_kprobe_event(tk); else - dyn_event_add(&tk->devent); + dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); end: mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9b50869a5ddb..50eca53b8d22 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -455,7 +455,7 @@ static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) /* Append to existing event */ ret = trace_probe_append(&tu->tp, &to->tp); if (!ret) - dyn_event_add(&tu->devent); + dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); return ret; } @@ -518,7 +518,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) goto end; } - dyn_event_add(&tu->devent); + dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); end: mutex_unlock(&event_mutex); -- cgit v1.2.3-71-gd317 From 1d18538e6a09265003a0a94ca779d7a6127cb76c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:57 -0400 Subject: tracing: Have dynamic events have a ref counter As dynamic events are not created by modules, if something is attached to one, calling "try_module_get()" on its "mod" field, is not going to keep the dynamic event from going away. Since dynamic events do not need the "mod" pointer of the event structure, make a union out of it in order to save memory (there's one structure for each of the thousand+ events in the kernel), and have any event with the DYNAMIC flag set to use a ref counter instead. Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.174869074@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 45 ++++++++++++++++++++++++++++++++++++- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_dynevent.c | 38 +++++++++++++++++++++++++++++++ kernel/trace/trace_event_perf.c | 6 ++--- kernel/trace/trace_events.c | 22 +++++++++++------- kernel/trace/trace_events_synth.c | 19 ++++++++++------ kernel/trace/trace_events_trigger.c | 6 ++--- kernel/trace/trace_kprobe.c | 4 ++++ kernel/trace/trace_uprobe.c | 4 ++++ 9 files changed, 124 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 53c9dffd87fd..9564c4d9a3b6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -350,7 +350,14 @@ struct trace_event_call { struct trace_event event; char *print_fmt; struct event_filter *filter; - void *mod; + /* + * Static events can disappear with modules, + * where as dynamic ones need their own ref count. + */ + union { + void *module; + atomic_t refcnt; + }; void *data; /* See the TRACE_EVENT_FL_* flags above */ @@ -366,6 +373,42 @@ struct trace_event_call { #endif }; +#ifdef CONFIG_DYNAMIC_EVENTS +bool trace_event_dyn_try_get_ref(struct trace_event_call *call); +void trace_event_dyn_put_ref(struct trace_event_call *call); +bool trace_event_dyn_busy(struct trace_event_call *call); +#else +static inline bool trace_event_dyn_try_get_ref(struct trace_event_call *call) +{ + /* Without DYNAMIC_EVENTS configured, nothing should be calling this */ + return false; +} +static inline void trace_event_dyn_put_ref(struct trace_event_call *call) +{ +} +static inline bool trace_event_dyn_busy(struct trace_event_call *call) +{ + /* Nothing should call this without DYNAIMIC_EVENTS configured. */ + return true; +} +#endif + +static inline bool trace_event_try_get_ref(struct trace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + return trace_event_dyn_try_get_ref(call); + else + return try_module_get(call->module); +} + +static inline void trace_event_put_ref(struct trace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + trace_event_dyn_put_ref(call); + else + module_put(call->module); +} + #ifdef CONFIG_PERF_EVENTS static inline bool bpf_prog_array_valid(struct trace_event_call *call) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be0169594de5..8425c3d70895 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,11 +3697,11 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str) return false; event = container_of(trace_event, struct trace_event_call, event); - if (!event->mod) + if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module) return false; /* Would rather have rodata, but this will suffice */ - if (within_module_core(addr, event->mod)) + if (within_module_core(addr, event->module)) return true; return false; diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index e57cc0870892..1110112e55bd 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -13,11 +13,49 @@ #include #include "trace.h" +#include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" static DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); +bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) +{ + struct trace_event_call *call; + bool ret = false; + + if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC))) + return false; + + down_read(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { + if (call == dyn_call) { + atomic_inc(&dyn_call->refcnt); + ret = true; + } + } + up_read(&trace_event_sem); + return ret; +} + +void trace_event_dyn_put_ref(struct trace_event_call *call) +{ + if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC))) + return; + + if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) { + atomic_set(&call->refcnt, 0); + return; + } + + atomic_dec(&call->refcnt); +} + +bool trace_event_dyn_busy(struct trace_event_call *call) +{ + return atomic_read(&call->refcnt) != 0; +} + int dyn_event_register(struct dyn_event_operations *ops) { if (!ops || !ops->create || !ops->show || !ops->is_busy || diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 03be4435d103..6aed10e2f7ce 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -177,7 +177,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) } } out: - module_put(tp_event->mod); + trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -224,10 +224,10 @@ int perf_trace_init(struct perf_event *p_event) list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && tp_event->class && tp_event->class->reg && - try_module_get(tp_event->mod)) { + trace_event_try_get_ref(tp_event)) { ret = perf_trace_event_init(tp_event, p_event); if (ret) - module_put(tp_event->mod); + trace_event_put_ref(tp_event); break; } } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 80e96989770e..1349b6de5eeb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2525,7 +2525,10 @@ __register_event(struct trace_event_call *call, struct module *mod) return ret; list_add(&call->list, &ftrace_events); - call->mod = mod; + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + atomic_set(&call->refcnt, 0); + else + call->module = mod; return 0; } @@ -2839,7 +2842,9 @@ static void trace_module_remove_events(struct module *mod) down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) + if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module) + continue; + if (call->module == mod) __trace_remove_event_call(call); } up_write(&trace_event_sem); @@ -2982,7 +2987,7 @@ struct trace_event_file *trace_get_event_file(const char *instance, } /* Don't let event modules unload while in use */ - ret = try_module_get(file->event_call->mod); + ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); ret = -EBUSY; @@ -3012,7 +3017,7 @@ EXPORT_SYMBOL_GPL(trace_get_event_file); void trace_put_event_file(struct trace_event_file *file) { mutex_lock(&event_mutex); - module_put(file->event_call->mod); + trace_event_put_ref(file->event_call); mutex_unlock(&event_mutex); trace_array_put(file->tr); @@ -3147,7 +3152,7 @@ static int free_probe_data(void *data) if (!edata->ref) { /* Remove the SOFT_MODE flag */ __ftrace_event_enable_disable(edata->file, 0, 1); - module_put(edata->file->event_call->mod); + trace_event_put_ref(edata->file->event_call); kfree(edata); } return 0; @@ -3280,7 +3285,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, out_reg: /* Don't let event modules unload while probe registered */ - ret = try_module_get(file->event_call->mod); + ret = trace_event_try_get_ref(file->event_call); if (!ret) { ret = -EBUSY; goto out_free; @@ -3310,7 +3315,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: - module_put(file->event_call->mod); + trace_event_put_ref(file->event_call); out_free: kfree(data); goto out; @@ -3376,7 +3381,8 @@ void __trace_early_add_events(struct trace_array *tr) list_for_each_entry(call, &ftrace_events, list) { /* Early boot up should not have any modules loaded */ - if (WARN_ON_ONCE(call->mod)) + if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) && + WARN_ON_ONCE(call->module)) continue; ret = __trace_early_add_new_event(call, tr); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index f4f5489e1e28..d54094b7a9d7 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1369,13 +1369,15 @@ static int destroy_synth_event(struct synth_event *se) int ret; if (se->ref) - ret = -EBUSY; - else { - ret = unregister_synth_event(se); - if (!ret) { - dyn_event_remove(&se->devent); - free_synth_event(se); - } + return -EBUSY; + + if (trace_event_dyn_busy(&se->call)) + return -EBUSY; + + ret = unregister_synth_event(se); + if (!ret) { + dyn_event_remove(&se->devent); + free_synth_event(se); } return ret; @@ -2102,6 +2104,9 @@ static int synth_event_release(struct dyn_event *ev) if (event->ref) return -EBUSY; + if (trace_event_dyn_busy(&event->call)) + return -EBUSY; + ret = unregister_synth_event(event); if (ret) return ret; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cf84d0f6583a..6b11e335a62e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1334,7 +1334,7 @@ void event_enable_trigger_free(struct event_trigger_ops *ops, if (!data->ref) { /* Remove the SOFT_MODE flag */ trace_event_enable_disable(enable_data->file, 0, 1); - module_put(enable_data->file->event_call->mod); + trace_event_put_ref(enable_data->file->event_call); trigger_data_free(data); kfree(enable_data); } @@ -1481,7 +1481,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_reg: /* Don't let event modules unload while probe registered */ - ret = try_module_get(event_enable_file->event_call->mod); + ret = trace_event_try_get_ref(event_enable_file->event_call); if (!ret) { ret = -EBUSY; goto out_free; @@ -1510,7 +1510,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_disable: trace_event_enable_disable(event_enable_file, 0, 1); out_put: - module_put(event_enable_file->event_call->mod); + trace_event_put_ref(event_enable_file->event_call); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bfef43bfce37..82c3b86013b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -543,6 +543,10 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk) if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tk->tp))) + return -EBUSY; + /* Will fail if probe is being used by ftrace or perf */ if (unregister_kprobe_event(tk)) return -EBUSY; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 50eca53b8d22..1e2a92e7607d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -393,6 +393,10 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) if (trace_probe_has_sibling(&tu->tp)) goto unreg; + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp))) + return -EBUSY; + ret = unregister_uprobe_event(tu); if (ret) return ret; -- cgit v1.2.3-71-gd317 From fcd9db51df8e219e3a61b14e9b8c5ee67d39d37c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:58 -0400 Subject: tracing/probe: Have traceprobe_parse_probe_arg() take a const arg The two places that call traceprobe_parse_probe_arg() allocate a temporary buffer to copy the argv[i] into, because argv[i] is constant and the traceprobe_parse_probe_arg() will modify it to do the parsing. These two places allocate this buffer and then free it right after calling this function, leaving the onus of this allocation to the caller. As there's about to be a third user of this function that will have to do the same thing, instead of having the caller allocate the temporary buffer, simply move that allocation into the traceprobe_parse_probe_arg() itself, which will simplify the code of the callers. Link: https://lkml.kernel.org/r/20210817035027.385422828@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +-------- kernel/trace/trace_probe.c | 47 +++++++++++++++++++++++++++------------------ kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_uprobe.c | 9 +-------- 4 files changed, 31 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 82c3b86013b2..ed1e3c2087ab 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -873,15 +873,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto error; - } - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); - kfree(tmp); + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); if (ret) goto error; /* This can be -ENOMEM */ } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 15413ad7cef2..ef717b373443 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -540,26 +540,34 @@ static int __parse_bitfield_probe_arg(const char *bf, } /* String length checking wrapper */ -static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, +static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2, *t3; + char *arg; int ret, len; + arg = kstrdup(argv, GFP_KERNEL); + if (!arg) + return -ENOMEM; + + ret = -EINVAL; len = strlen(arg); if (len > MAX_ARGSTR_LEN) { trace_probe_log_err(offset, ARG_TOO_LONG); - return -EINVAL; + goto out; } else if (len == 0) { trace_probe_log_err(offset, NO_ARG_BODY); - return -EINVAL; + goto out; } + ret = -ENOMEM; parg->comm = kstrdup(arg, GFP_KERNEL); if (!parg->comm) - return -ENOMEM; + goto out; + ret = -EINVAL; t = strchr(arg, ':'); if (t) { *t = '\0'; @@ -571,22 +579,22 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, offset += t2 + strlen(t2) - arg; trace_probe_log_err(offset, ARRAY_NO_CLOSE); - return -EINVAL; + goto out; } else if (t3[1] != '\0') { trace_probe_log_err(offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); - return -EINVAL; + goto out; } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { trace_probe_log_err(offset + t2 - arg, BAD_ARRAY_NUM); - return -EINVAL; + goto out; } if (parg->count > MAX_ARRAY_LEN) { trace_probe_log_err(offset + t2 - arg, ARRAY_TOO_BIG); - return -EINVAL; + goto out; } } } @@ -598,29 +606,30 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) - return -EINVAL; + goto out; parg->type = find_fetch_type("string"); } else parg->type = find_fetch_type(t); if (!parg->type) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); - return -EINVAL; + goto out; } parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + ret = -ENOMEM; if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); if (!parg->fmt) - return -ENOMEM; + goto out; snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, parg->count); } code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) - return -ENOMEM; + goto out; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], @@ -628,6 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (ret) goto fail; + ret = -EINVAL; /* Store operation */ if (!strcmp(parg->type->name, "string") || !strcmp(parg->type->name, "ustring")) { @@ -636,7 +646,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op != FETCH_OP_DATA) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); - ret = -EINVAL; goto fail; } if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || @@ -650,7 +659,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } } @@ -672,7 +680,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -687,6 +694,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, goto fail; } } + ret = -EINVAL; /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && @@ -694,13 +702,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, scode->op != FETCH_OP_ST_USTRING) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); - ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -709,6 +715,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; code->op = FETCH_OP_END; + ret = 0; /* Shrink down the code buffer */ parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) @@ -724,6 +731,8 @@ fail: kfree(code->data); } kfree(tmp); +out: + kfree(arg); return ret; } @@ -745,11 +754,11 @@ static int traceprobe_conflict_field_name(const char *name, return 0; } -int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, +int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, unsigned int flags) { struct probe_arg *parg = &tp->args[i]; - char *body; + const char *body; /* Increment count for freeing args in error case */ tp->nr_args++; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 227d518e5ba5..42aa084902fa 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -354,7 +354,7 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_MASK GENMASK(2, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, - char *arg, unsigned int flags); + const char *argv, unsigned int flags); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1e2a92e7607d..93ff96541971 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -684,16 +684,9 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto error; - } - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], is_return ? TPARG_FL_RETURN : 0); - kfree(tmp); if (ret) goto error; } -- cgit v1.2.3-71-gd317 From bc1b973455fd5d84dac4a094da44202f2d8a98ef Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:59 -0400 Subject: tracing/probes: Allow for dot delimiter as well as slash for system names Kprobe and uprobe events can add a "system" to the events that are created via the kprobe_events and uprobe_events files respectively. If they do not include a "system" in the name, then the default "kprobes" or "uprobes" is used. The current notation to specify a system for one of these probe events is to add a '/' delimiter in the name, where the content before the '/' will be the system to use, and the content after will be the event name. echo 'p:my_system/my_event' > kprobe_events But this is inconsistent with the way histogram triggers separate their system / event names. The histogram triggers use a '.' delimiter, which can be confusing. To allow this to be more consistent, as well as keep backward compatibility, allow the kprobe and uprobe events to denote a system name with either a '/' or a '.'. That is: echo 'p:my_system/my_event' > kprobe_events is equivalent to: echo 'p:my_system.my_event' > kprobe_events Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.580493202@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ef717b373443..0916a9964719 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -233,6 +233,9 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, int len; slash = strchr(event, '/'); + if (!slash) + slash = strchr(event, '.'); + if (slash) { if (slash == event) { trace_probe_log_err(offset, NO_GROUP_NAME); -- cgit v1.2.3-71-gd317 From 845cbf3e11acc263ec7a46a89097d88e7e50a9ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:43:00 -0400 Subject: tracing/probes: Use struct_size() instead of defining custom macros Remove SIZEOF_TRACE_KPROBE() and SIZEOF_TRACE_UPROBE() and use struct_size() as that's what it is made for. No need to have custom macros. Especially since struct_size() has some extra memory checks for correctness. Link: https://lkml.kernel.org/r/20210817035027.795000217@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +----- kernel/trace/trace_uprobe.c | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ed1e3c2087ab..ca726c9d0859 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -80,10 +80,6 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev) for_each_dyn_event(dpos) \ if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos))) -#define SIZEOF_TRACE_KPROBE(n) \ - (offsetof(struct trace_kprobe, tp.args) + \ - (sizeof(struct probe_arg) * (n))) - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -265,7 +261,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, struct trace_kprobe *tk; int ret = -ENOMEM; - tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL); if (!tk) return ERR_PTR(ret); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 93ff96541971..590bb9a02f8d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -83,10 +83,6 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) for_each_dyn_event(dpos) \ if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos))) -#define SIZEOF_TRACE_UPROBE(n) \ - (offsetof(struct trace_uprobe, tp.args) + \ - (sizeof(struct probe_arg) * (n))) - static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); @@ -340,7 +336,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) struct trace_uprobe *tu; int ret; - tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); + tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL); if (!tu) return ERR_PTR(-ENOMEM); -- cgit v1.2.3-71-gd317 From 39f75da7bcc829ddc4d40bb60d0e95520de7898b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 2 Aug 2021 23:40:31 +0300 Subject: isystem: trim/fixup stdarg.h and other headers Delete/fixup few includes in anticipation of global -isystem compile option removal. Note: crypto/aegis128-neon-inner.c keeps due to redefinition of uintptr_t error (one definition comes from , another from ). Signed-off-by: Alexey Dobriyan Signed-off-by: Masahiro Yamada --- arch/arm/kernel/process.c | 2 -- arch/arm/mach-bcm/bcm_kona_smc.c | 2 -- arch/arm64/kernel/process.c | 3 --- arch/openrisc/kernel/process.c | 2 -- arch/parisc/kernel/process.c | 3 --- arch/powerpc/kernel/prom.c | 1 - arch/sparc/kernel/process_32.c | 3 --- arch/sparc/kernel/process_64.c | 3 --- arch/um/drivers/rtc_user.c | 1 + arch/um/drivers/vector_user.c | 1 + arch/um/include/shared/irq_user.h | 1 - arch/um/include/shared/os.h | 1 - arch/um/os-Linux/signal.c | 2 +- arch/um/os-Linux/util.c | 1 + drivers/block/xen-blkback/xenbus.c | 1 - drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 1 - drivers/gpu/drm/msm/disp/msm_disp_snapshot.h | 1 - drivers/macintosh/macio-adb.c | 1 - drivers/macintosh/via-macii.c | 2 -- drivers/net/wireless/intersil/orinoco/hermes.c | 1 - drivers/net/wwan/iosm/iosm_ipc_imem.h | 1 - drivers/pinctrl/aspeed/pinmux-aspeed.h | 1 - drivers/scsi/elx/efct/efct_driver.h | 1 - drivers/staging/media/atomisp/pci/hive_isp_css_common/host/isp_local.h | 2 -- drivers/xen/xen-scsiback.c | 2 -- include/linux/filter.h | 2 -- include/linux/mISDNif.h | 1 - kernel/debug/kdb/kdb_support.c | 1 - sound/aoa/codecs/onyx.h | 1 - sound/aoa/codecs/tas.c | 1 - sound/core/info.c | 1 - 31 files changed, 4 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index fc9e8b37eaa8..bb5ad8a6a4c3 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -5,8 +5,6 @@ * Copyright (C) 1996-2000 Russell King - Converted to ARM. * Original Copyright (C) 1995 Linus Torvalds */ -#include - #include #include #include diff --git a/arch/arm/mach-bcm/bcm_kona_smc.c b/arch/arm/mach-bcm/bcm_kona_smc.c index 43a16f922b53..43829e49ad93 100644 --- a/arch/arm/mach-bcm/bcm_kona_smc.c +++ b/arch/arm/mach-bcm/bcm_kona_smc.c @@ -10,8 +10,6 @@ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - -#include #include #include #include diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c8989b999250..5f7ac9a0f9a3 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -6,9 +6,6 @@ * Copyright (C) 1996-2000 Russell King - Converted to ARM. * Copyright (C) 2012 ARM Ltd. */ - -#include - #include #include #include diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index eb62429681fc..b0698d9ce14f 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -14,8 +14,6 @@ */ #define __KERNEL_SYSCALLS__ -#include - #include #include #include diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 184ec3c1eae4..38ec4ae81239 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -17,9 +17,6 @@ * Copyright (C) 2001-2014 Helge Deller * Copyright (C) 2002 Randolph Chung */ - -#include - #include #include #include diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f620e04dc9bf..a1e7ba0fad09 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -11,7 +11,6 @@ #undef DEBUG -#include #include #include #include diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 93983d6d431d..bbbe0cfef746 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -8,9 +8,6 @@ /* * This file handles the architecture-dependent parts of process handling.. */ - -#include - #include #include #include diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index d33c58a58d4f..0cabcdfb23fd 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -9,9 +9,6 @@ /* * This file handles the architecture-dependent parts of process handling.. */ - -#include - #include #include #include diff --git a/arch/um/drivers/rtc_user.c b/arch/um/drivers/rtc_user.c index 4016bc1d577e..7c3cec4c68cf 100644 --- a/arch/um/drivers/rtc_user.c +++ b/arch/um/drivers/rtc_user.c @@ -3,6 +3,7 @@ * Copyright (C) 2020 Intel Corporation * Author: Johannes Berg */ +#include #include #include #include diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index bae53220ce26..e4ffeb9a1fa4 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -3,6 +3,7 @@ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ +#include #include #include #include diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index 065829f443ae..86a8a573b65c 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -7,7 +7,6 @@ #define __IRQ_USER_H__ #include -#include enum um_irq_type { IRQ_READ, diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 60b84edc8a68..96d400387c93 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -8,7 +8,6 @@ #ifndef __OS_H__ #define __OS_H__ -#include #include #include #include diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 6de99bb16113..6cf098c23a39 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -67,7 +67,7 @@ int signals_enabled; #ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT static int signals_blocked; #else -#define signals_blocked false +#define signals_blocked 0 #endif static unsigned int signals_pending; static unsigned int signals_active = 0; diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 07327425d06e..41297ec404bf 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -3,6 +3,7 @@ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ +#include #include #include #include diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 125b22205d38..33eba3df4dd9 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -8,7 +8,6 @@ #define pr_fmt(fmt) "xen-blkback: " fmt -#include #include #include #include diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index 7c4734f905d9..68fd451aca23 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -39,7 +39,6 @@ #include #include #include -#include #include "atomfirmware.h" diff --git a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.h b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.h index c92a9508c8d3..0f9a5364cd86 100644 --- a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.h +++ b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.h @@ -25,7 +25,6 @@ #include #include #include -#include #include "msm_kms.h" #define MSM_DISP_SNAPSHOT_MAX_BLKS 10 diff --git a/drivers/macintosh/macio-adb.c b/drivers/macintosh/macio-adb.c index d4759db002c6..dc634c2932fd 100644 --- a/drivers/macintosh/macio-adb.c +++ b/drivers/macintosh/macio-adb.c @@ -2,7 +2,6 @@ /* * Driver for the ADB controller in the Mac I/O (Hydra) chip. */ -#include #include #include #include diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c index 060e03f2264b..db9270da5b8e 100644 --- a/drivers/macintosh/via-macii.c +++ b/drivers/macintosh/via-macii.c @@ -23,8 +23,6 @@ * Apple's "ADB Analyzer" bus sniffer is invaluable: * ftp://ftp.apple.com/developer/Tool_Chest/Devices_-_Hardware/Apple_Desktop_Bus/ */ - -#include #include #include #include diff --git a/drivers/net/wireless/intersil/orinoco/hermes.c b/drivers/net/wireless/intersil/orinoco/hermes.c index 6d4b7f64efcf..256946552742 100644 --- a/drivers/net/wireless/intersil/orinoco/hermes.c +++ b/drivers/net/wireless/intersil/orinoco/hermes.c @@ -79,7 +79,6 @@ #undef HERMES_DEBUG #ifdef HERMES_DEBUG -#include #define DEBUG(lvl, stuff...) if ((lvl) <= HERMES_DEBUG) DMSG(stuff) diff --git a/drivers/net/wwan/iosm/iosm_ipc_imem.h b/drivers/net/wwan/iosm/iosm_ipc_imem.h index 0d2f10e4cbc8..dc65b0712261 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_imem.h +++ b/drivers/net/wwan/iosm/iosm_ipc_imem.h @@ -7,7 +7,6 @@ #define IOSM_IPC_IMEM_H #include -#include #include "iosm_ipc_mmio.h" #include "iosm_ipc_pcie.h" diff --git a/drivers/pinctrl/aspeed/pinmux-aspeed.h b/drivers/pinctrl/aspeed/pinmux-aspeed.h index b69ba6b360a2..4d7548686f39 100644 --- a/drivers/pinctrl/aspeed/pinmux-aspeed.h +++ b/drivers/pinctrl/aspeed/pinmux-aspeed.h @@ -5,7 +5,6 @@ #define ASPEED_PINMUX_H #include -#include /* * The ASPEED SoCs provide typically more than 200 pins for GPIO and other diff --git a/drivers/scsi/elx/efct/efct_driver.h b/drivers/scsi/elx/efct/efct_driver.h index dab8eac4f243..0e3c931db7c2 100644 --- a/drivers/scsi/elx/efct/efct_driver.h +++ b/drivers/scsi/elx/efct/efct_driver.h @@ -10,7 +10,6 @@ /*************************************************************************** * OS specific includes */ -#include #include #include #include diff --git a/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/isp_local.h b/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/isp_local.h index eceeb5d160ad..4dbec4063b3d 100644 --- a/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/isp_local.h +++ b/drivers/staging/media/atomisp/pci/hive_isp_css_common/host/isp_local.h @@ -16,8 +16,6 @@ #ifndef __ISP_LOCAL_H_INCLUDED__ #define __ISP_LOCAL_H_INCLUDED__ -#include - #include "isp_global.h" #include diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index 61ce0d142eea..0c5e565aa8cf 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -33,8 +33,6 @@ #define pr_fmt(fmt) "xen-pvscsi: " fmt -#include - #include #include #include diff --git a/include/linux/filter.h b/include/linux/filter.h index 83b896044e79..c1711c9f9439 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -5,8 +5,6 @@ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ -#include - #include #include #include diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index a7330eb3ec64..7dd1f01ec4f9 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -18,7 +18,6 @@ #ifndef mISDNIF_H #define mISDNIF_H -#include #include #include #include diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 9f50d22d68e6..4f9950678e7b 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -10,7 +10,6 @@ * 03/02/13 added new 2.5 kallsyms */ -#include #include #include #include diff --git a/sound/aoa/codecs/onyx.h b/sound/aoa/codecs/onyx.h index 8a32c3c3d716..6c31b7373b78 100644 --- a/sound/aoa/codecs/onyx.h +++ b/sound/aoa/codecs/onyx.h @@ -6,7 +6,6 @@ */ #ifndef __SND_AOA_CODEC_ONYX_H #define __SND_AOA_CODEC_ONYX_H -#include #include #include #include diff --git a/sound/aoa/codecs/tas.c b/sound/aoa/codecs/tas.c index ac246dd3ab49..ab19a37e2a68 100644 --- a/sound/aoa/codecs/tas.c +++ b/sound/aoa/codecs/tas.c @@ -58,7 +58,6 @@ * and up to the hardware designer to not wire * them up in some weird unusable way. */ -#include #include #include #include diff --git a/sound/core/info.c b/sound/core/info.c index 9fec3070f8ba..a451b24199c3 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -16,7 +16,6 @@ #include #include #include -#include int snd_info_check_reserved_words(const char *str) { -- cgit v1.2.3-71-gd317 From 22f9feb49950885cdb6e37513f134d154175e743 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 19:37:00 +0200 Subject: dma-mapping: make the global coherent pool conditional Only build the code to support the global coherent pool if support for it is enabled. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- include/linux/dma-map-ops.h | 18 +++++++++-------- kernel/dma/coherent.c | 49 +++++++++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 068f1b11a6a4..0d5b06b3a4a6 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -171,13 +171,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); - -void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle); -int dma_release_from_global_coherent(int order, void *vaddr); -int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, - size_t size, int *ret); -int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); #else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) @@ -187,7 +180,16 @@ static inline int dma_declare_coherent_memory(struct device *dev, #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) #define dma_release_from_dev_coherent(dev, order, vaddr) (0) #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) +#endif /* CONFIG_DMA_DECLARE_COHERENT */ +#ifdef CONFIG_DMA_GLOBAL_POOL +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle); +int dma_release_from_global_coherent(int order, void *vaddr); +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, + size_t size, int *ret); +int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); +#else static inline void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle) { @@ -202,7 +204,7 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, { return 0; } -#endif /* CONFIG_DMA_DECLARE_COHERENT */ +#endif /* CONFIG_DMA_GLOBAL_POOL */ /* * This is the actual return value from the ->alloc_noncontiguous method. diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 160d4e246ecb..25fc85a7aebe 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -20,8 +20,6 @@ struct dma_coherent_mem { bool use_dev_dma_pfn_offset; }; -static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; - static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev) { if (dev && dev->dma_mem) @@ -191,16 +189,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, return 1; } -void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle) -{ - if (!dma_coherent_default_memory) - return NULL; - - return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size, - dma_handle); -} - static int __dma_release_from_coherent(struct dma_coherent_mem *mem, int order, void *vaddr) { @@ -236,15 +224,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) return __dma_release_from_coherent(mem, order, vaddr); } -int dma_release_from_global_coherent(int order, void *vaddr) -{ - if (!dma_coherent_default_memory) - return 0; - - return __dma_release_from_coherent(dma_coherent_default_memory, order, - vaddr); -} - static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) { @@ -290,6 +269,28 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); } +#ifdef CONFIG_DMA_GLOBAL_POOL +static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; + +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle) +{ + if (!dma_coherent_default_memory) + return NULL; + + return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size, + dma_handle); +} + +int dma_release_from_global_coherent(int order, void *vaddr) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_release_from_coherent(dma_coherent_default_memory, order, + vaddr); +} + int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) { @@ -311,6 +312,7 @@ int dma_init_global_coherent(phys_addr_t phys_addr, size_t size) pr_info("DMA: default coherent area is set\n"); return 0; } +#endif /* CONFIG_DMA_GLOBAL_POOL */ /* * Support for reserved memory regions defined in device tree @@ -320,7 +322,9 @@ int dma_init_global_coherent(phys_addr_t phys_addr, size_t size) #include #include +#ifdef CONFIG_DMA_GLOBAL_POOL static struct reserved_mem *dma_reserved_default_memory __initdata; +#endif static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { @@ -377,6 +381,7 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem) return 0; } +#ifdef CONFIG_DMA_GLOBAL_POOL static int __init dma_init_reserved_memory(void) { if (!dma_reserved_default_memory) @@ -384,8 +389,8 @@ static int __init dma_init_reserved_memory(void) return dma_init_global_coherent(dma_reserved_default_memory->base, dma_reserved_default_memory->size); } - core_initcall(dma_init_reserved_memory); +#endif /* CONFIG_DMA_GLOBAL_POOL */ RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); #endif -- cgit v1.2.3-71-gd317 From 1daf08a066cfe500587affd3fa3be8c13b8ff007 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:09 +0200 Subject: livepatch: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: live-patching@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/transition.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 3a4beb9395c4..291b857a6e20 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -411,7 +411,7 @@ void klp_try_complete_transition(void) /* * Ditto for the idle "swapper" tasks. */ - get_online_cpus(); + cpus_read_lock(); for_each_possible_cpu(cpu) { task = idle_task(cpu); if (cpu_online(cpu)) { @@ -423,7 +423,7 @@ void klp_try_complete_transition(void) task->patch_state = klp_target_state; } } - put_online_cpus(); + cpus_read_unlock(); if (!complete) { if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT)) -- cgit v1.2.3-71-gd317 From 007517a01995fb24f2f4effc9cf34814361a9d10 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 00:13:27 -0400 Subject: tracing/probe: Change traceprobe_set_print_fmt() to take a type Instead of a boolean "is_return" have traceprobe_set_print_fmt() take a type (currently just PROBE_PRINT_NORMAL and PROBE_PRINT_RETURN). This will simplify adding different types. For example, the development of the event_probe, will need its own type as it prints an event, and not an IP. Link: https://lkml.kernel.org/r/20210819041842.104626301@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +++++++-- kernel/trace/trace_probe.c | 18 ++++++++++++------ kernel/trace/trace_probe.h | 7 ++++++- kernel/trace/trace_uprobe.c | 8 ++++++-- 4 files changed, 31 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ca726c9d0859..c6fe7a6e3f35 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -742,6 +742,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) bool is_return = false; char *symbol = NULL, *tmp = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + enum probe_print_type ptype; int maxactive = 0; long offset = 0; void *addr = NULL; @@ -875,7 +876,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) goto error; /* This can be -ENOMEM */ } - ret = traceprobe_set_print_fmt(&tk->tp, is_return); + ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tk->tp, ptype); if (ret < 0) goto error; @@ -1799,6 +1801,7 @@ struct trace_event_call * create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return) { + enum probe_print_type ptype; struct trace_kprobe *tk; int ret; char *event; @@ -1822,7 +1825,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, init_trace_event_call(tk); - if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ptype = trace_kprobe_is_return(tk) ? + PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 0916a9964719..9c9c83a063b2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -851,19 +851,25 @@ int traceprobe_update_arg(struct probe_arg *arg) /* When len=0, we just calculate the needed length */ #define LEN_OR_ZERO (len ? len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, - bool is_return) + enum probe_print_type ptype) { struct probe_arg *parg; int i, j; int pos = 0; const char *fmt, *arg; - if (!is_return) { + switch (ptype) { + case PROBE_PRINT_NORMAL: fmt = "(%lx)"; arg = "REC->" FIELD_STRING_IP; - } else { + break; + case PROBE_PRINT_RETURN: fmt = "(%lx <- %lx)"; arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + break; + default: + WARN_ON_ONCE(1); + return 0; } pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); @@ -912,20 +918,20 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, } #undef LEN_OR_ZERO -int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) +int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype) { struct trace_event_call *call = trace_probe_event_call(tp); int len; char *print_fmt; /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0, is_return); + len = __set_print_fmt(tp, NULL, 0, ptype); print_fmt = kmalloc(len + 1, GFP_KERNEL); if (!print_fmt) return -ENOMEM; /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1, is_return); + __set_print_fmt(tp, print_fmt, len + 1, ptype); call->print_fmt = print_fmt; return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 42aa084902fa..8adf5f3542a6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -363,7 +363,12 @@ extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset); -extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); +enum probe_print_type { + PROBE_PRINT_NORMAL, + PROBE_PRINT_RETURN, +}; + +extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); #ifdef CONFIG_PERF_EVENTS extern struct trace_event_call * diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 590bb9a02f8d..09f8ca7f7ba0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -536,6 +536,7 @@ static int __trace_uprobe_create(int argc, const char **argv) const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; bool is_return = false; @@ -687,7 +688,8 @@ static int __trace_uprobe_create(int argc, const char **argv) goto error; } - ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)); + ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) goto error; @@ -1578,6 +1580,7 @@ struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, unsigned long ref_ctr_offset, bool is_return) { + enum probe_print_type ptype; struct trace_uprobe *tu; struct path path; int ret; @@ -1612,7 +1615,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu); - if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) { ret = -ENOMEM; goto error; } -- cgit v1.2.3-71-gd317 From 8565a45d0858078b63c7d84074a21a42ba9ebf01 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 00:13:28 -0400 Subject: tracing/probes: Have process_fetch_insn() take a void * instead of pt_regs In preparation to allow event probes to use the process_fetch_insn() callback in trace_probe_tmpl.h, change the data passed to it from a pointer to pt_regs, as the event probe will not be using regs, and make it a void pointer instead. Update the process_fetch_insn() callers for kprobe and uprobe events to have the regs defined in the function and just typecast the void pointer parameter. Link: https://lkml.kernel.org/r/20210819041842.291622924@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 3 ++- kernel/trace/trace_probe_tmpl.h | 6 +++--- kernel/trace/trace_uprobe.c | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c6fe7a6e3f35..4b013d24f5a9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1325,9 +1325,10 @@ probe_mem_read(void *dest, void *src, size_t size) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { + struct pt_regs *regs = rec; unsigned long val; retry: diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index f003c5d02a3a..b3bdb8ddb862 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf) * If dest is NULL, don't store result and return required dynamic data size. */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base); static nokprobe_inline int fetch_store_strlen(unsigned long addr); static nokprobe_inline int @@ -188,7 +188,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) /* Store the value of each argument */ static nokprobe_inline void -store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, +store_trace_args(void *data, struct trace_probe *tp, void *rec, int header_size, int maxlen) { struct probe_arg *arg; @@ -203,7 +203,7 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, /* Point the dynamic data area if needed */ if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); - ret = process_fetch_insn(arg->code, regs, dl, base); + ret = process_fetch_insn(arg->code, rec, dl, base); if (unlikely(ret < 0 && arg->dynamic)) { *dl = make_data_loc(0, dyndata - base); } else { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 09f8ca7f7ba0..d219ba50efbd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -213,9 +213,10 @@ static unsigned long translate_user_vaddr(unsigned long file_offset) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { + struct pt_regs *regs = rec; unsigned long val; /* 1st stage: get value from context */ -- cgit v1.2.3-71-gd317 From 8e242060c6a4947e8ae7d29794af6a581db08841 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 19 Aug 2021 19:26:02 +0900 Subject: tracing/probes: Reject events which have the same name of existing one Since kprobe_events and uprobe_events only check whether the other same-type probe event has the same name or not, if the user gives the same name of the existing tracepoint event (or the other type of probe events), it silently fails to create the tracefs entry (but registered.) as below. /sys/kernel/tracing # ls events/task/task_rename enable filter format hist id trigger /sys/kernel/tracing # echo p:task/task_rename vfs_read >> kprobe_events [ 113.048508] Could not create tracefs 'task_rename' directory /sys/kernel/tracing # cat kprobe_events p:task/task_rename vfs_read To fix this issue, check whether the existing events have the same name or not in trace_probe_register_event_call(). If exists, it rejects to register the new event. Link: https://lkml.kernel.org/r/162936876189.187130.17558311387542061930.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +++++- kernel/trace/trace_probe.c | 25 +++++++++++++++++++++++++ kernel/trace/trace_probe.h | 1 + kernel/trace/trace_uprobe.c | 6 +++++- 4 files changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4b013d24f5a9..882c27044029 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -647,7 +647,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9c9c83a063b2..782c00eb6859 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1047,11 +1047,36 @@ error: return ret; } +static struct trace_event_call * +find_trace_event_call(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + return tp_event; + } + + return NULL; +} + int trace_probe_register_event_call(struct trace_probe *tp) { struct trace_event_call *call = trace_probe_event_call(tp); int ret; + lockdep_assert_held(&event_mutex); + + if (find_trace_event_call(trace_probe_group_name(tp), + trace_probe_name(tp))) + return -EEXIST; + ret = register_trace_event(&call->event); if (!ret) return -ENODEV; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8adf5f3542a6..66701a92d186 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -404,6 +404,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_NAME, "Event name is not specified"), \ C(EVENT_TOO_LONG, "Event name is too long"), \ C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d219ba50efbd..225ce569bf8f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -515,7 +515,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } -- cgit v1.2.3-71-gd317 From f9dabe016b63c9629e152bf876c126c29de223cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 19 Aug 2021 15:59:33 +0200 Subject: bpf: Undo off-by-one in interpreter tail call count limit The BPF interpreter as well as x86-64 BPF JIT were both in line by allowing up to 33 tail calls (however odd that number may be!). Recently, this was changed for the interpreter to reduce it down to 32 with the assumption that this should have been the actual limit "which is in line with the behavior of the x86 JITs" according to b61a28cf11d61 ("bpf: Fix off-by-one in tail call count limiting"). Paul recently reported: I'm a bit surprised by this because I had previously tested the tail call limit of several JIT compilers and found it to be 33 (i.e., allowing chains of up to 34 programs). I've just extended a test program I had to validate this again on the x86-64 JIT, and found a limit of 33 tail calls again [1]. Also note we had previously changed the RISC-V and MIPS JITs to allow up to 33 tail calls [2, 3], for consistency with other JITs and with the interpreter. We had decided to increase these two to 33 rather than decrease the other JITs to 32 for backward compatibility, though that probably doesn't matter much as I'd expect few people to actually use 33 tail calls. [1] https://github.com/pchaigno/tail-call-bench/commit/ae7887482985b4b1745c9b2ef7ff9ae506c82886 [2] 96bc4432f5ad ("bpf, riscv: Limit to 33 tail calls") [3] e49e6f6db04e ("bpf, mips: Limit to 33 tail calls") Therefore, revert b61a28cf11d61 to re-align interpreter to limit a maximum of 33 tail calls. While it is unlikely to hit the limit for the vast majority, programs in the wild could one way or another depend on this, so lets rather be a bit more conservative, and lets align the small remainder of JITs to 33. If needed in future, this limit could be slightly increased, but not decreased. Fixes: b61a28cf11d61 ("bpf: Fix off-by-one in tail call count limiting") Reported-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Johan Almbladh Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/CAO5pjwTWrC0_dzTbTHFPSqDwA56aVH+4KFGVqdq8=ASs0MqZGQ@mail.gmail.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 91f24c7b38a1..9f4636d021b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1564,7 +1564,7 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) + if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; -- cgit v1.2.3-71-gd317 From 594286b7574c6e8217b1c233cc0d0650f2268a77 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 19 Aug 2021 08:52:09 -0700 Subject: bpf: Fix NULL event->prog pointer access in bpf_overflow_handler Andrii reported that libbpf CI hit the following oops when running selftest send_signal: [ 1243.160719] BUG: kernel NULL pointer dereference, address: 0000000000000030 [ 1243.161066] #PF: supervisor read access in kernel mode [ 1243.161066] #PF: error_code(0x0000) - not-present page [ 1243.161066] PGD 0 P4D 0 [ 1243.161066] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 1243.161066] CPU: 1 PID: 882 Comm: new_name Tainted: G O 5.14.0-rc5 #1 [ 1243.161066] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 1243.161066] RIP: 0010:bpf_overflow_handler+0x9a/0x1e0 [ 1243.161066] Code: 5a 84 c0 0f 84 06 01 00 00 be 66 02 00 00 48 c7 c7 6d 96 07 82 48 8b ab 18 05 00 00 e8 df 55 eb ff 66 90 48 8d 75 48 48 89 e7 55 30 41 89 c4 e8 fb c1 f0 ff 84 c0 0f 84 94 00 00 00 e8 6e 0f [ 1243.161066] RSP: 0018:ffffc900000c0d80 EFLAGS: 00000046 [ 1243.161066] RAX: 0000000000000002 RBX: ffff8881002e0dd0 RCX: 00000000b4b47cf8 [ 1243.161066] RDX: ffffffff811dcb06 RSI: 0000000000000048 RDI: ffffc900000c0d80 [ 1243.161066] RBP: 0000000000000000 R08: 0000000000000000 R09: 1a9d56bb00000000 [ 1243.161066] R10: 0000000000000001 R11: 0000000000080000 R12: 0000000000000000 [ 1243.161066] R13: ffffc900000c0e00 R14: ffffc900001c3c68 R15: 0000000000000082 [ 1243.161066] FS: 00007fc0be2d3380(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 [ 1243.161066] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1243.161066] CR2: 0000000000000030 CR3: 0000000104f8e000 CR4: 00000000000006e0 [ 1243.161066] Call Trace: [ 1243.161066] [ 1243.161066] __perf_event_overflow+0x4f/0xf0 [ 1243.161066] perf_swevent_hrtimer+0x116/0x130 [ 1243.161066] ? __lock_acquire+0x378/0x2730 [ 1243.161066] ? __lock_acquire+0x372/0x2730 [ 1243.161066] ? lock_is_held_type+0xd5/0x130 [ 1243.161066] ? find_held_lock+0x2b/0x80 [ 1243.161066] ? lock_is_held_type+0xd5/0x130 [ 1243.161066] ? perf_event_groups_first+0x80/0x80 [ 1243.161066] ? perf_event_groups_first+0x80/0x80 [ 1243.161066] __hrtimer_run_queues+0x1a3/0x460 [ 1243.161066] hrtimer_interrupt+0x110/0x220 [ 1243.161066] __sysvec_apic_timer_interrupt+0x8a/0x260 [ 1243.161066] sysvec_apic_timer_interrupt+0x89/0xc0 [ 1243.161066] [ 1243.161066] asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 1243.161066] RIP: 0010:finish_task_switch+0xaf/0x250 [ 1243.161066] Code: 31 f6 68 90 2a 09 81 49 8d 7c 24 18 e8 aa d6 03 00 4c 89 e7 e8 12 ff ff ff 4c 89 e7 e8 ca 9c 80 00 e8 35 af 0d 00 fb 4d 85 f6 <58> 74 1d 65 48 8b 04 25 c0 6d 01 00 4c 3b b0 a0 04 00 00 74 37 f0 [ 1243.161066] RSP: 0018:ffffc900001c3d18 EFLAGS: 00000282 [ 1243.161066] RAX: 000000000000031f RBX: ffff888104cf4980 RCX: 0000000000000000 [ 1243.161066] RDX: 0000000000000000 RSI: ffffffff82095460 RDI: ffffffff820adc4e [ 1243.161066] RBP: ffffc900001c3d58 R08: 0000000000000001 R09: 0000000000000001 [ 1243.161066] R10: 0000000000000001 R11: 0000000000080000 R12: ffff88813bd2bc80 [ 1243.161066] R13: ffff8881002e8000 R14: ffff88810022ad80 R15: 0000000000000000 [ 1243.161066] ? finish_task_switch+0xab/0x250 [ 1243.161066] ? finish_task_switch+0x70/0x250 [ 1243.161066] __schedule+0x36b/0xbb0 [ 1243.161066] ? _raw_spin_unlock_irqrestore+0x2d/0x50 [ 1243.161066] ? lockdep_hardirqs_on+0x79/0x100 [ 1243.161066] schedule+0x43/0xe0 [ 1243.161066] pipe_read+0x30b/0x450 [ 1243.161066] ? wait_woken+0x80/0x80 [ 1243.161066] new_sync_read+0x164/0x170 [ 1243.161066] vfs_read+0x122/0x1b0 [ 1243.161066] ksys_read+0x93/0xd0 [ 1243.161066] do_syscall_64+0x35/0x80 [ 1243.161066] entry_SYSCALL_64_after_hwframe+0x44/0xae The oops can also be reproduced with the following steps: ./vmtest.sh -s # at qemu shell cd /root/bpf && while true; do ./test_progs -t send_signal Further analysis showed that the failure is introduced with commit b89fbfbb854c ("bpf: Implement minimal BPF perf link"). With the above commit, the following scenario becomes possible: cpu1 cpu2 hrtimer_interrupt -> bpf_overflow_handler (due to closing link_fd) bpf_perf_link_release -> perf_event_free_bpf_prog -> perf_event_free_bpf_handler -> WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler) event->prog = NULL bpf_prog_run(event->prog, &ctx) In the above case, the event->prog is NULL for bpf_prog_run, hence causing oops. To fix the issue, check whether event->prog is NULL or not. If it is, do not call bpf_prog_run. This seems working as the above reproducible step runs more than one hour and I didn't see any failures. Fixes: b89fbfbb854c ("bpf: Implement minimal BPF perf link") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210819155209.1927994-1-yhs@fb.com --- kernel/events/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d1e63dd97f2..011cc5069b7b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9920,13 +9920,16 @@ static void bpf_overflow_handler(struct perf_event *event, .data = data, .event = event, }; + struct bpf_prog *prog; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = bpf_prog_run(event->prog, &ctx); + prog = READ_ONCE(event->prog); + if (prog) + ret = bpf_prog_run(prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); -- cgit v1.2.3-71-gd317 From f0dce1d9b7c81fc3dc9d0cc0bc7ef9b3eae22584 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 18 Aug 2021 16:52:15 -0700 Subject: bpf: Use kvmalloc for map values in syscall Use kvmalloc/kvfree for temporary value when manipulating a map via syscall. kmalloc might not be sufficient for percpu maps where the value is big (and further multiplied by hundreds of CPUs). Can be reproduced with netcnt test on qemu with "-smp 255". Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818235216.1159202-1-sdf@google.com --- kernel/bpf/syscall.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7420e1334ab2..075f650d297a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1076,7 +1076,7 @@ static int map_lookup_elem(union bpf_attr *attr) value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1091,7 +1091,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = 0; free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: @@ -1137,16 +1137,10 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) - value_size = round_up(map->value_size, 8) * num_possible_cpus(); - else - value_size = map->value_size; + value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1157,7 +1151,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) err = bpf_map_update_value(map, f, key, value, attr->flags); free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: @@ -1367,7 +1361,7 @@ int generic_map_update_batch(struct bpf_map *map, if (!key) return -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { kfree(key); return -ENOMEM; @@ -1390,7 +1384,7 @@ int generic_map_update_batch(struct bpf_map *map, if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; - kfree(value); + kvfree(value); kfree(key); return err; } @@ -1429,7 +1423,7 @@ int generic_map_lookup_batch(struct bpf_map *map, if (!buf_prevkey) return -ENOMEM; - buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); + buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { kfree(buf_prevkey); return -ENOMEM; @@ -1492,7 +1486,7 @@ int generic_map_lookup_batch(struct bpf_map *map, free_buf: kfree(buf_prevkey); - kfree(buf); + kvfree(buf); return err; } @@ -1547,7 +1541,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1579,7 +1573,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) err = 0; free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: -- cgit v1.2.3-71-gd317 From 44779a4b85abd1d1dab9e5b90bd5e6adcfc8143a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 18 Aug 2021 16:52:16 -0700 Subject: bpf: Use kvmalloc for map keys in syscalls Same as previous patch but for the keys. memdup_bpfptr is renamed to kvmemdup_bpfptr (and converted to kvmalloc). Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818235216.1159202-2-sdf@google.com --- include/linux/bpfptr.h | 12 ++++++++++-- kernel/bpf/syscall.c | 34 +++++++++++++++++----------------- 2 files changed, 27 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 5cdeab497cb3..546e27fc6d46 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -62,9 +62,17 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size); } -static inline void *memdup_bpfptr(bpfptr_t src, size_t len) +static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len) { - return memdup_sockptr((sockptr_t) src, len); + void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN); + + if (!p) + return ERR_PTR(-ENOMEM); + if (copy_from_bpfptr(p, src, len)) { + kvfree(p); + return ERR_PTR(-EFAULT); + } + return p; } static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 075f650d297a..4e50c0bfdb7d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1013,7 +1013,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) static void *__bpf_copy_key(void __user *ukey, u64 key_size) { if (key_size) - return memdup_user(ukey, key_size); + return vmemdup_user(ukey, key_size); if (ukey) return ERR_PTR(-EINVAL); @@ -1024,7 +1024,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) { if (key_size) - return memdup_bpfptr(ukey, key_size); + return kvmemdup_bpfptr(ukey, key_size); if (!bpfptr_is_null(ukey)) return ERR_PTR(-EINVAL); @@ -1093,7 +1093,7 @@ static int map_lookup_elem(union bpf_attr *attr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1153,7 +1153,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1205,7 +1205,7 @@ static int map_delete_elem(union bpf_attr *attr) bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); out: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1247,7 +1247,7 @@ static int map_get_next_key(union bpf_attr *attr) } err = -ENOMEM; - next_key = kmalloc(map->key_size, GFP_USER); + next_key = kvmalloc(map->key_size, GFP_USER); if (!next_key) goto free_key; @@ -1270,9 +1270,9 @@ out: err = 0; free_next_key: - kfree(next_key); + kvfree(next_key); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1299,7 +1299,7 @@ int generic_map_delete_batch(struct bpf_map *map, if (!max_count) return 0; - key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; @@ -1326,7 +1326,7 @@ int generic_map_delete_batch(struct bpf_map *map, if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; - kfree(key); + kvfree(key); return err; } @@ -1357,13 +1357,13 @@ int generic_map_update_batch(struct bpf_map *map, if (!max_count) return 0; - key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { - kfree(key); + kvfree(key); return -ENOMEM; } @@ -1385,7 +1385,7 @@ int generic_map_update_batch(struct bpf_map *map, err = -EFAULT; kvfree(value); - kfree(key); + kvfree(key); return err; } @@ -1419,13 +1419,13 @@ int generic_map_lookup_batch(struct bpf_map *map, if (put_user(0, &uattr->batch.count)) return -EFAULT; - buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!buf_prevkey) return -ENOMEM; buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { - kfree(buf_prevkey); + kvfree(buf_prevkey); return -ENOMEM; } @@ -1485,7 +1485,7 @@ int generic_map_lookup_batch(struct bpf_map *map, err = -EFAULT; free_buf: - kfree(buf_prevkey); + kvfree(buf_prevkey); kvfree(buf); return err; } @@ -1575,7 +1575,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; -- cgit v1.2.3-71-gd317 From 2c531639deb5e3ddfd6e8123b82052b2d9fbc6e5 Mon Sep 17 00:00:00 2001 From: Prankur Gupta Date: Tue, 17 Aug 2021 15:42:20 -0700 Subject: bpf: Add support for {set|get} socket options from setsockopt BPF Add logic to call bpf_setsockopt() and bpf_getsockopt() from setsockopt BPF programs. An example use case is when the user sets the IPV6_TCLASS socket option, we would also like to change the tcp-cc for that socket. We don't have any use case for calling bpf_setsockopt() from supposedly read- only sys_getsockopt(), so it is made available to BPF_CGROUP_SETSOCKOPT only at this point. Signed-off-by: Prankur Gupta Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210817224221.3257826-2-prankgup@fb.com --- kernel/bpf/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f35928bab0a..8e9d99e2ade4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1873,6 +1873,14 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) + return &bpf_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) + return &bpf_sk_getsockopt_proto; + return NULL; #endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: -- cgit v1.2.3-71-gd317 From b857174e68e26f9c4f0796971e11eb63ad5a3eb6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 19 Aug 2021 21:30:30 +0200 Subject: locking/ww_mutex: Initialize waiter.ww_ctx properly The consolidation of the debug code for mutex waiter intialization sets waiter::ww_ctx to a poison value unconditionally. For regular mutexes this is intended to catch the case where waiter_ww_ctx is dereferenced accidentally. For ww_mutex the poison value has to be overwritten either with a context pointer or NULL for ww_mutexes without context. The rework broke this as it made the store conditional on the context pointer instead of the argument which signals whether ww_mutex code should be compiled in or optiized out. As a result waiter::ww_ctx ends up with the poison pointer for contextless ww_mutexes which causes a later dereference of the poison pointer because it is != NULL. Use the build argument instead so for ww_mutex the poison value is always overwritten. Fixes: c0afb0ffc06e6 ("locking/ww_mutex: Gather mutex_waiter initialization") Reported-by: Guenter Roeck Suggested-by: Peter Zijlstra Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210819193030.zpwrpvvrmy7xxxiy@linutronix.de --- kernel/locking/mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 3a65bf4bacfd..d456579d0952 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -618,7 +618,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas debug_mutex_lock_common(lock, &waiter); waiter.task = current; - if (ww_ctx) + if (use_ww_ctx) waiter.ww_ctx = ww_ctx; lock_contended(&lock->dep_map, ip); -- cgit v1.2.3-71-gd317 From 3c474b3239f12fe0b00d7e82481f36a1f31e79ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Aug 2021 13:09:17 +0200 Subject: sched: Fix Core-wide rq->lock for uninitialized CPUs Eugene tripped over the case where rq_lock(), as called in a for_each_possible_cpu() loop came apart because rq->core hadn't been setup yet. This is a somewhat unusual, but valid case. Rework things such that rq->core is initialized to point at itself. IOW initialize each CPU as a single threaded Core. CPU online will then join the new CPU (thread) to an existing Core where needed. For completeness sake, have CPU offline fully undo the state so as to not presume the topology will match the next time it comes online. Fixes: 9edeaea1bc45 ("sched: Core-wide rq->lock") Reported-by: Eugene Syromiatnikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josh Don Tested-by: Eugene Syromiatnikov Link: https://lkml.kernel.org/r/YR473ZGeKqMs6kw+@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 143 +++++++++++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 2 +- 2 files changed, 118 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20ffcc044134..f3b27c6c5153 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -237,9 +237,30 @@ static DEFINE_MUTEX(sched_core_mutex); static atomic_t sched_core_count; static struct cpumask sched_core_mask; +static void sched_core_lock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t, i = 0; + + local_irq_save(*flags); + for_each_cpu(t, smt_mask) + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); +} + +static void sched_core_unlock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_restore(*flags); +} + static void __sched_core_flip(bool enabled) { - int cpu, t, i; + unsigned long flags; + int cpu, t; cpus_read_lock(); @@ -250,19 +271,12 @@ static void __sched_core_flip(bool enabled) for_each_cpu(cpu, &sched_core_mask) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - i = 0; - local_irq_disable(); - for_each_cpu(t, smt_mask) { - /* supports up to SMT8 */ - raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); - } + sched_core_lock(cpu, &flags); for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; - for_each_cpu(t, smt_mask) - raw_spin_unlock(&cpu_rq(t)->__lock); - local_irq_enable(); + sched_core_unlock(cpu, &flags); cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); } @@ -5736,35 +5750,109 @@ void queue_core_balance(struct rq *rq) queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); } -static inline void sched_core_cpu_starting(unsigned int cpu) +static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - struct rq *rq, *core_rq = NULL; - int i; + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; - core_rq = cpu_rq(cpu)->core; + sched_core_lock(cpu, &flags); - if (!core_rq) { - for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); - if (rq->core && rq->core == rq) - core_rq = rq; + WARN_ON_ONCE(rq->core != rq); + + /* if we're the first, we'll be our own leader */ + if (cpumask_weight(smt_mask) == 1) + goto unlock; + + /* find the leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + rq = cpu_rq(t); + if (rq->core == rq) { + core_rq = rq; + break; } + } - if (!core_rq) - core_rq = cpu_rq(cpu); + if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ + goto unlock; - for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); + /* install and validate core_rq */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); - WARN_ON_ONCE(rq->core && rq->core != core_rq); + if (t == cpu) rq->core = core_rq; - } + + WARN_ON_ONCE(rq->core != core_rq); } + +unlock: + sched_core_unlock(cpu, &flags); } + +static void sched_core_cpu_deactivate(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; + + sched_core_lock(cpu, &flags); + + /* if we're the last man standing, nothing to do */ + if (cpumask_weight(smt_mask) == 1) { + WARN_ON_ONCE(rq->core != rq); + goto unlock; + } + + /* if we're not the leader, nothing to do */ + if (rq->core != rq) + goto unlock; + + /* find a new leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + core_rq = cpu_rq(t); + break; + } + + if (WARN_ON_ONCE(!core_rq)) /* impossible */ + goto unlock; + + /* copy the shared state to the new leader */ + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle = rq->core_forceidle; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + + /* install new leader */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + rq->core = core_rq; + } + +unlock: + sched_core_unlock(cpu, &flags); +} + +static inline void sched_core_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->core != rq) + rq->core = rq; +} + #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cpu_deactivate(unsigned int cpu) {} +static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) @@ -8707,6 +8795,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); + + sched_core_cpu_deactivate(cpu); #endif if (!sched_smp_initialized) @@ -8811,6 +8901,7 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); hrtick_clear(rq); + sched_core_cpu_dying(cpu); return 0; } #endif @@ -9022,7 +9113,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); #ifdef CONFIG_SCHED_CORE - rq->core = NULL; + rq->core = rq; rq->core_pick = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14a41a243f7b..da4295f5ab77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1093,7 +1093,7 @@ struct rq { unsigned int core_sched_seq; struct rb_root core_tree; - /* shared state */ + /* shared state -- careful with sched_core_cpu_deactivate() */ unsigned int core_task_seq; unsigned int core_pick_seq; unsigned long core_cookie; -- cgit v1.2.3-71-gd317 From 0083242c93759dde353a963a90cb351c5c283379 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 18 Aug 2021 13:13:33 +0530 Subject: sched/topology: Skip updating masks for non-online nodes The scheduler currently expects NUMA node distances to be stable from init onwards, and as a consequence builds the related data structures once-and-for-all at init (see sched_init_numa()). Unfortunately, on some architectures node distance is unreliable for offline nodes and may very well change upon onlining. Skip over offline nodes during sched_init_numa(). Track nodes that have been onlined at least once, and trigger a build of a node's NUMA masks when it is first onlined post-init. Reported-by: Geetika Moolchandani Signed-off-by: Srikar Dronamraju Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210818074333.48645-1-srikar@linux.vnet.ibm.com --- kernel/sched/topology.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b77ad49dc14f..4e8698e62f07 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1482,6 +1482,8 @@ int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; + +static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif /* @@ -1833,6 +1835,16 @@ void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + /* + * Distance information can be unreliable for + * offline nodes, defer building the node + * masks to its bringup. + * This relies on all unique distance values + * still being visible at init time. + */ + if (!node_online(j)) + continue; + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); @@ -1886,6 +1898,53 @@ void sched_init_numa(void) sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); + + sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); + if (!sched_numa_onlined_nodes) + return; + + bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); + for_each_online_node(i) + bitmap_set(sched_numa_onlined_nodes, i, 1); +} + +static void __sched_domains_numa_masks_set(unsigned int node) +{ + int i, j; + + /* + * NUMA masks are not built for offline nodes in sched_init_numa(). + * Thus, when a CPU of a never-onlined-before node gets plugged in, + * adding that new CPU to the right NUMA masks is not sufficient: the + * masks of that CPU's node must also be updated. + */ + if (test_bit(node, sched_numa_onlined_nodes)) + return; + + bitmap_set(sched_numa_onlined_nodes, node, 1); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j) || node == j) + continue; + + if (node_distance(j, node) > sched_domains_numa_distance[i]) + continue; + + /* Add remote nodes in our masks */ + cpumask_or(sched_domains_numa_masks[i][node], + sched_domains_numa_masks[i][node], + sched_domains_numa_masks[0][j]); + } + } + + /* + * A new node has been brought up, potentially changing the topology + * classification. + * + * Note that this is racy vs any use of sched_numa_topology_type :/ + */ + init_numa_topology_type(); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu) int node = cpu_to_node(cpu); int i, j; + __sched_domains_numa_masks_set(node); + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j)) + continue; + + /* Set ourselves in the remote node's masks */ if (node_distance(j, node) <= sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } -- cgit v1.2.3-71-gd317 From 304000390f88d049c85e9a0958ac5567f38816ee Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 29 Jul 2021 19:00:18 -0700 Subject: sched: Cgroup SCHED_IDLE support This extends SCHED_IDLE to cgroups. Interface: cgroup/cpu.idle. 0: default behavior 1: SCHED_IDLE Extending SCHED_IDLE to cgroups means that we incorporate the existing aspects of SCHED_IDLE; a SCHED_IDLE cgroup will count all of its descendant threads towards the idle_h_nr_running count of all of its ancestor cgroups. Thus, sched_idle_rq() will work properly. Additionally, SCHED_IDLE cgroups are configured with minimum weight. There are two key differences between the per-task and per-cgroup SCHED_IDLE interface: - The cgroup interface allows tasks within a SCHED_IDLE hierarchy to maintain their relative weights. The entity that is "idle" is the cgroup, not the tasks themselves. - Since the idle entity is the cgroup, our SCHED_IDLE wakeup preemption decision is not made by comparing the current task with the woken task, but rather by comparing their matching sched_entity. A typical use-case for this is a user that creates an idle and a non-idle subtree. The non-idle subtree will dominate competition vs the idle subtree, but the idle subtree will still be high priority vs other users on the system. The latter is accomplished via comparing matching sched_entity in the waken preemption path (this could also be improved by making the sched_idle_rq() decision dependent on the perspective of a specific task). For now, we maintain the existing SCHED_IDLE semantics. Future patches may make improvements that extend how we treat SCHED_IDLE entities. The per-task_group idle field is an integer that currently only holds either a 0 or a 1. This is explicitly typed as an integer to allow for further extensions to this API. For example, a negative value may indicate a highly latency-sensitive cgroup that should be preferred for preemption/placement/etc. Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20210730020019.1487127-2-joshdon@google.com --- kernel/sched/core.c | 25 +++++++ kernel/sched/debug.c | 3 + kernel/sched/fair.c | 197 ++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 8 +++ 4 files changed, 208 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d67c5dd51c16..7fa6ce74f40e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10135,6 +10135,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) +{ + return sched_group_set_idle(css_tg(css), idle); +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -10142,6 +10156,11 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, + { + .name = "idle", + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -10349,6 +10368,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, + { + .name = "idle", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7e08e3d947c2..49716228efb4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -607,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", + cfs_rq->idle_h_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47a0fbf224b2..6cd05f1d77ef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } +static int tg_is_idle(struct task_group *tg) +{ + return tg->idle > 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return cfs_rq->idle > 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + if (entity_is_task(se)) + return task_has_idle_policy(task_of(se)); + return cfs_rq_is_idle(group_cfs_rq(se)); +} + #else /* !CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } +static int tg_is_idle(struct task_group *tg) +{ + return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + return 0; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline @@ -4812,6 +4844,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4831,6 +4866,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) update_load_avg(qcfs_rq, se, 0); se_update_runnable(se); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; } @@ -4875,39 +4913,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (se->on_rq) break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; } for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(qcfs_rq, se, UPDATE_TG); se_update_runnable(se); - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; /* * One parent has been throttled and cfs_rq removed from the * list. Add it back to not break the leaf list. */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (throttled_hierarchy(qcfs_rq)) + list_add_leaf_cfs_rq(qcfs_rq); } /* At this point se is NULL and we are at root level*/ @@ -4920,9 +4964,9 @@ unthrottle_throttle: * assertion below. */ for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (list_add_leaf_cfs_rq(cfs_rq)) + if (list_add_leaf_cfs_rq(qcfs_rq)) break; } @@ -5545,6 +5589,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5562,6 +5609,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5639,6 +5689,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -5668,6 +5721,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -7010,24 +7066,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->last = se; } } static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->next = se; } } @@ -7048,6 +7102,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; + int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) return; @@ -7092,8 +7147,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - update_curr(cfs_rq_of(se)); BUG_ON(!pse); + + cse_is_idle = se_is_idle(se); + pse_is_idle = se_is_idle(pse); + + /* + * Preempt an idle group in favor of a non-idle group (and don't preempt + * in the inverse case). + */ + if (cse_is_idle && !pse_is_idle) + goto preempt; + if (cse_is_idle != pse_is_idle) + return; + + update_curr(cfs_rq_of(se)); if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -11387,10 +11455,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + lockdep_assert_held(&shares_mutex); + /* * We can't change the weight of the root cgroup. */ @@ -11399,9 +11469,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - mutex_lock(&shares_mutex); if (tg->shares == shares) - goto done; + return 0; tg->shares = shares; for_each_possible_cpu(i) { @@ -11419,10 +11488,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_unlock_irqrestore(rq, &rf); } -done: + return 0; +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int ret; + + mutex_lock(&shares_mutex); + if (tg_is_idle(tg)) + ret = -EINVAL; + else + ret = __sched_group_set_shares(tg, shares); + mutex_unlock(&shares_mutex); + + return ret; +} + +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int i; + + if (tg == &root_task_group) + return -EINVAL; + + if (idle < 0 || idle > 1) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->idle == idle) { + mutex_unlock(&shares_mutex); + return 0; + } + + tg->idle = idle; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se = tg->se[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + bool was_idle = cfs_rq_is_idle(grp_cfs_rq); + long idle_task_delta; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + + grp_cfs_rq->idle = idle; + if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) + goto next_cpu; + + idle_task_delta = grp_cfs_rq->h_nr_running - + grp_cfs_rq->idle_h_nr_running; + if (!cfs_rq_is_idle(grp_cfs_rq)) + idle_task_delta *= -1; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!se->on_rq) + break; + + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* Already accounted at parent level and above. */ + if (cfs_rq_is_idle(cfs_rq)) + break; + } + +next_cpu: + rq_unlock_irqrestore(rq, &rf); + } + + /* Idle groups have minimum weight. */ + if (tg_is_idle(tg)) + __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); + else + __sched_group_set_shares(tg, NICE_0_LOAD); + mutex_unlock(&shares_mutex); return 0; } + #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 75a5c120e7d9..5fa02902c143 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -396,6 +396,9 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; + #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put @@ -505,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_idle(struct task_group *tg, long idle); + #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); @@ -601,6 +606,9 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + /* Locally cached copy of our task_group's idle value */ + int idle; + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; -- cgit v1.2.3-71-gd317 From 9ae606bc74dd0e58d4de894e3c5cbb9d45599267 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:28 +0100 Subject: sched: Introduce task_cpu_possible_mask() to limit fallback rq selection Asymmetric systems may not offer the same level of userspace ISA support across all CPUs, meaning that some applications cannot be executed by some CPUs. As a concrete example, upcoming arm64 big.LITTLE designs do not feature support for 32-bit applications on both clusters. On such a system, we must take care not to migrate a task to an unsupported CPU when forcefully moving tasks in select_fallback_rq() in response to a CPU hot-unplug operation. Introduce a task_cpu_possible_mask() hook which, given a task argument, allows an architecture to return a cpumask of CPUs that are capable of executing that task. The default implementation returns the cpu_possible_mask, since sane machines do not suffer from per-cpu ISA limitations that affect scheduling. The new mask is used when selecting the fallback runqueue as a last resort before forcing a migration to the first active CPU. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Quentin Perret Link: https://lore.kernel.org/r/20210730112443.23245-2-will@kernel.org --- include/linux/mmu_context.h | 14 ++++++++++++++ kernel/sched/core.c | 9 +++------ 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index 03dee12d2b61..b9b970f7ab45 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -14,4 +14,18 @@ static inline void leave_mm(int cpu) { } #endif +/* + * CPUs that are capable of running user task @p. Must contain at least one + * active CPU. It is assumed that the kernel can run on all CPUs, so calling + * this for a kernel thread is pointless. + * + * By default, we assume a sane, homogeneous system. + */ +#ifndef task_cpu_possible_mask +# define task_cpu_possible_mask(p) cpu_possible_mask +# define task_cpu_possible(cpu, p) true +#else +# define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p)) +#endif + #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7fa6ce74f40e..6f31267c4beb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2173,7 +2173,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) /* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) - return cpu_active(cpu); + return cpu_active(cpu) && task_cpu_possible(cpu, p); /* KTHREAD_IS_PER_CPU is always allowed. */ if (kthread_is_per_cpu(p)) @@ -3124,9 +3124,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) + if (is_cpu_allowed(p, dest_cpu)) return dest_cpu; } } @@ -3156,10 +3154,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, cpu_possible_mask); + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); state = fail; break; - case fail: BUG(); break; -- cgit v1.2.3-71-gd317 From d4b96fb92ae7fe7533e11e662504d96161928575 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:29 +0100 Subject: cpuset: Don't use the cpu_possible_mask as a last resort for cgroup v1 If the scheduler cannot find an allowed CPU for a task, cpuset_cpus_allowed_fallback() will widen the affinity to cpu_possible_mask if cgroup v1 is in use. In preparation for allowing architectures to provide their own fallback mask, just return early if we're either using cgroup v1 or we're using cgroup v2 with a mask that contains invalid CPUs. This will allow select_fallback_rq() to figure out the mask by itself. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Quentin Perret Link: https://lkml.kernel.org/r/20210730112443.23245-3-will@kernel.org --- include/linux/cpuset.h | 1 + kernel/cgroup/cpuset.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 04c20de66afc..ed6ec677dd6b 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #ifdef CONFIG_CPUSETS diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index adb5190c4429..a8693783f385 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3322,9 +3322,13 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + const struct cpumask *cs_mask; + rcu_read_lock(); - do_set_cpus_allowed(tsk, is_in_v2_mode() ? - task_cs(tsk)->cpus_allowed : cpu_possible_mask); + cs_mask = task_cs(tsk)->cpus_allowed; + if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) + do_set_cpus_allowed(tsk, cs_mask); rcu_read_unlock(); /* -- cgit v1.2.3-71-gd317 From 431c69fac05baa7477d61a44f2708e069f2bed6c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:30 +0100 Subject: cpuset: Honour task_cpu_possible_mask() in guarantee_online_cpus() Asymmetric systems may not offer the same level of userspace ISA support across all CPUs, meaning that some applications cannot be executed by some CPUs. As a concrete example, upcoming arm64 big.LITTLE designs do not feature support for 32-bit applications on both clusters. Modify guarantee_online_cpus() to take task_cpu_possible_mask() into account when trying to find a suitable set of online CPUs for a given task. This will avoid passing an invalid mask to set_cpus_allowed_ptr() during ->attach() and will subsequently allow the cpuset hierarchy to be taken into account when forcefully overriding the affinity mask for a task which requires migration to a compatible CPU. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210730112443.23245-4-will@kernel.org --- include/linux/cpuset.h | 2 +- kernel/cgroup/cpuset.c | 43 ++++++++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index ed6ec677dd6b..414a8e694413 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -185,7 +185,7 @@ static inline void cpuset_read_unlock(void) { } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { - cpumask_copy(mask, cpu_possible_mask); + cpumask_copy(mask, task_cpu_possible_mask(p)); } static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a8693783f385..391813245cb2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -372,18 +372,29 @@ static inline bool is_in_v2_mode(void) } /* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. + * Return in pmask the portion of a task's cpusets's cpus_allowed that + * are online and are capable of running the task. If none are found, + * walk up the cpuset hierarchy until we find one that does have some + * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) +static void guarantee_online_cpus(struct task_struct *tsk, + struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + struct cpuset *cs; + + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) + cpumask_copy(pmask, cpu_online_mask); + + rcu_read_lock(); + cs = task_cs(tsk); + + while (!cpumask_intersects(cs->effective_cpus, pmask)) { cs = parent_cs(cs); if (unlikely(!cs)) { /* @@ -393,11 +404,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ - cpumask_copy(pmask, cpu_online_mask); - return; + goto out_unlock; } } - cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); + cpumask_and(pmask, pmask, cs->effective_cpus); + +out_unlock: + rcu_read_unlock(); } /* @@ -2199,15 +2212,13 @@ static void cpuset_attach(struct cgroup_taskset *tset) percpu_down_write(&cpuset_rwsem); - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) { + if (cs != &top_cpuset) + guarantee_online_cpus(task, cpus_attach); + else + cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -3302,9 +3313,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); - guarantee_online_cpus(task_cs(tsk), pmask); - rcu_read_unlock(); + guarantee_online_cpus(tsk, pmask); spin_unlock_irqrestore(&callback_lock, flags); } -- cgit v1.2.3-71-gd317 From 97c0054dbe2c3c59d1156fd233f2d44e91981c8e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:31 +0100 Subject: cpuset: Cleanup cpuset_cpus_allowed_fallback() use in select_fallback_rq() select_fallback_rq() only needs to recheck for an allowed CPU if the affinity mask of the task has changed since the last check. Return a 'bool' from cpuset_cpus_allowed_fallback() to indicate whether the affinity mask was updated, and use this to elide the allowed check when the mask has been left alone. No functional change. Suggested-by: Valentin Schneider Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20210730112443.23245-5-will@kernel.org --- include/linux/cpuset.h | 5 +++-- kernel/cgroup/cpuset.c | 10 ++++++++-- kernel/sched/core.c | 3 +-- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 414a8e694413..d2b9c41c8edf 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -59,7 +59,7 @@ extern void cpuset_wait_for_hotplug(void); extern void cpuset_read_lock(void); extern void cpuset_read_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_fallback(struct task_struct *p); +extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -188,8 +188,9 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, task_cpu_possible_mask(p)); } -static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) +static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { + return false; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 391813245cb2..6500cbe0ce16 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3327,17 +3327,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * which will not contain a sane cpumask during cases such as cpu hotplugging. * This is the absolute last resort for the scheduler and it is only used if * _every_ other avenue has been traveled. + * + * Returns true if the affinity of @tsk was changed, false otherwise. **/ -void cpuset_cpus_allowed_fallback(struct task_struct *tsk) +bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); const struct cpumask *cs_mask; + bool changed = false; rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; - if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) + if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { do_set_cpus_allowed(tsk, cs_mask); + changed = true; + } rcu_read_unlock(); /* @@ -3357,6 +3362,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) * select_fallback_rq() will fix things ups and set cpu_possible_mask * if required. */ + return changed; } void __init cpuset_init_current_mems_allowed(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6f31267c4beb..b9d4bae922a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3141,8 +3141,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - if (IS_ENABLED(CONFIG_CPUSETS)) { - cpuset_cpus_allowed_fallback(p); + if (cpuset_cpus_allowed_fallback(p)) { state = possible; break; } -- cgit v1.2.3-71-gd317 From 234a503e670be01f72841be9fcf68dfb89a1fa8b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:32 +0100 Subject: sched: Reject CPU affinity changes based on task_cpu_possible_mask() Reject explicit requests to change the affinity mask of a task via set_cpus_allowed_ptr() if the requested mask is not a subset of the mask returned by task_cpu_possible_mask(). This ensures that the 'cpus_mask' for a given task cannot contain CPUs which are incapable of executing it, except in cases where the affinity is forced. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Quentin Perret Link: https://lore.kernel.org/r/20210730112443.23245-6-will@kernel.org --- kernel/sched/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b9d4bae922a8..8cec0d24c88c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2709,7 +2709,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool kthread = p->flags & PF_KTHREAD; unsigned int dest_cpu; struct rq_flags rf; struct rq *rq; @@ -2718,7 +2720,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { + if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, * however, during cpu-hot-unplug, even these might get pushed @@ -2732,6 +2734,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, cpu_valid_mask = cpu_online_mask; } + if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; + } + /* * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. -- cgit v1.2.3-71-gd317 From b90ca8badbd11488e5f762346b028666808164e7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:33 +0100 Subject: sched: Introduce task_struct::user_cpus_ptr to track requested affinity In preparation for saving and restoring the user-requested CPU affinity mask of a task, add a new cpumask_t pointer to 'struct task_struct'. If the pointer is non-NULL, then the mask is copied across fork() and freed on task exit. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20210730112443.23245-7-will@kernel.org --- include/linux/sched.h | 13 +++++++++++++ init/init_task.c | 1 + kernel/fork.c | 2 ++ kernel/sched/core.c | 20 ++++++++++++++++++++ 4 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 50db9496c99d..2c5d638daaad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -748,6 +748,7 @@ struct task_struct { unsigned int policy; int nr_cpus_allowed; const cpumask_t *cpus_ptr; + cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; #ifdef CONFIG_SMP @@ -1706,6 +1707,8 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_ #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); +extern void release_user_cpus_ptr(struct task_struct *p); #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { @@ -1716,6 +1719,16 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma return -EINVAL; return 0; } +static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) +{ + if (src->user_cpus_ptr) + return -EINVAL; + return 0; +} +static inline void release_user_cpus_ptr(struct task_struct *p) +{ + WARN_ON(p->user_cpus_ptr); +} #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/init/init_task.c b/init/init_task.c index 562f2ef8d157..2d024066e27b 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -80,6 +80,7 @@ struct task_struct init_task .normal_prio = MAX_PRIO - 20, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, .cpus_mask = CPU_MASK_ALL, .nr_cpus_allowed= NR_CPUS, .mm = NULL, diff --git a/kernel/fork.c b/kernel/fork.c index 1a9af73b47c1..5d7addf0c41a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + release_user_cpus_ptr(tsk); scs_release(tsk); #ifndef CONFIG_THREAD_INFO_IN_TASK @@ -919,6 +920,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; + dup_user_cpus_ptr(tsk, orig, node); /* * One for the user space visible state that goes away when reaped. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8cec0d24c88c..360a3ec6d03b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2480,6 +2480,26 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) __do_set_cpus_allowed(p, new_mask, 0); } +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, + int node) +{ + if (!src->user_cpus_ptr) + return 0; + + dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); + if (!dst->user_cpus_ptr) + return -ENOMEM; + + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + return 0; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ + kfree(p->user_cpus_ptr); + p->user_cpus_ptr = NULL; +} + /* * This function is wildly self concurrent; here be dragons. * -- cgit v1.2.3-71-gd317 From db3b02ae896e88b6bb7a95c1373602e87e0de84c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:34 +0100 Subject: sched: Split the guts of sched_setaffinity() into a helper function In preparation for replaying user affinity requests using a saved mask, split sched_setaffinity() up so that the initial task lookup and security checks are only performed when the request is coming directly from userspace. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20210730112443.23245-8-will@kernel.org --- kernel/sched/core.c | 105 ++++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 360a3ec6d03b..672d0fcbf2ef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7594,53 +7594,22 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) { - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; int retval; + cpumask_var_t cpus_allowed, new_mask; - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); - return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { retval = -ENOMEM; goto out_free_cpus_allowed; } - retval = -EPERM; - if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } - - retval = security_task_setscheduler(p); - if (retval) - goto out_free_new_mask; - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); + cpumask_and(new_mask, mask, cpus_allowed); /* * Since bandwidth control happens on root_domain basis, @@ -7661,23 +7630,63 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) #endif again: retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + if (retval) + goto out_free_new_mask; - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; } + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + struct task_struct *p; + int retval; + + rcu_read_lock(); + + p = find_process_by_pid(pid); + if (!p) { + rcu_read_unlock(); + return -ESRCH; + } + + /* Prevent p going away */ + get_task_struct(p); + rcu_read_unlock(); + + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } + + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + retval = -EPERM; + goto out_put_task; + } + rcu_read_unlock(); + } + + retval = security_task_setscheduler(p); + if (retval) + goto out_put_task; + + retval = __sched_setaffinity(p, in_mask); out_put_task: put_task_struct(p); return retval; -- cgit v1.2.3-71-gd317 From 07ec77a1d4e82526e1588979fff2f024f8e96df2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:35 +0100 Subject: sched: Allow task CPU affinity to be restricted on asymmetric systems Asymmetric systems may not offer the same level of userspace ISA support across all CPUs, meaning that some applications cannot be executed by some CPUs. As a concrete example, upcoming arm64 big.LITTLE designs do not feature support for 32-bit applications on both clusters. Although userspace can carefully manage the affinity masks for such tasks, one place where it is particularly problematic is execve() because the CPU on which the execve() is occurring may be incompatible with the new application image. In such a situation, it is desirable to restrict the affinity mask of the task and ensure that the new image is entered on a compatible CPU. From userspace's point of view, this looks the same as if the incompatible CPUs have been hotplugged off in the task's affinity mask. Similarly, if a subsequent execve() reverts to a compatible image, then the old affinity is restored if it is still valid. In preparation for restricting the affinity mask for compat tasks on arm64 systems without uniform support for 32-bit applications, introduce {force,relax}_compatible_cpus_allowed_ptr(), which respectively restrict and restore the affinity mask for a task based on the compatible CPUs. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Quentin Perret Link: https://lore.kernel.org/r/20210730112443.23245-9-will@kernel.org --- include/linux/sched.h | 2 + kernel/sched/core.c | 198 +++++++++++++++++++++++++++++++++++++++++++++----- kernel/sched/sched.h | 1 + 3 files changed, 183 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c5d638daaad..ce2d5cfc331e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1709,6 +1709,8 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); +extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); +extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 672d0fcbf2ef..6ee197049c9c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2494,10 +2494,18 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, return 0; } +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = NULL; + + swap(p->user_cpus_ptr, user_mask); + + return user_mask; +} + void release_user_cpus_ptr(struct task_struct *p) { - kfree(p->user_cpus_ptr); - p->user_cpus_ptr = NULL; + kfree(clear_user_cpus_ptr(p)); } /* @@ -2717,27 +2725,23 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Called with both p->pi_lock and rq->lock held; drops both before returning. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags, + struct rq *rq, + struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) { const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; bool kthread = p->flags & PF_KTHREAD; + struct cpumask *user_mask = NULL; unsigned int dest_cpu; - struct rq_flags rf; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &rf); update_rq_clock(rq); if (kthread || is_migration_disabled(p)) { @@ -2793,20 +2797,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, __do_set_cpus_allowed(p, new_mask, flags); - return affine_move_task(rq, p, &rf, dest_cpu, flags); + if (flags & SCA_USER) + user_mask = clear_user_cpus_ptr(p); + + ret = affine_move_task(rq, p, rf, dest_cpu, flags); + + kfree(user_mask); + + return ret; out: - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); return ret; } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, u32 flags) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); +} + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask + * and pointing @p->user_cpus_ptr to a copy of the old mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ + struct cpumask *user_mask = NULL; + struct rq_flags rf; + struct rq *rq; + int err; + + if (!p->user_cpus_ptr) { + user_mask = kmalloc(cpumask_size(), GFP_KERNEL); + if (!user_mask) + return -ENOMEM; + } + + rq = task_rq_lock(p, &rf); + + /* + * Forcefully restricting the affinity of a deadline task is + * likely to cause problems, so fail and noisily override the + * mask entirely. + */ + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + err = -EPERM; + goto err_unlock; + } + + if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + err = -EINVAL; + goto err_unlock; + } + + /* + * We're about to butcher the task affinity, so keep track of what + * the user asked for in case we're able to restore it later on. + */ + if (user_mask) { + cpumask_copy(user_mask, p->cpus_ptr); + p->user_cpus_ptr = user_mask; + } + + return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + +err_unlock: + task_rq_unlock(rq, p, &rf); + kfree(user_mask); + return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + cpumask_var_t new_mask; + const struct cpumask *override_mask = task_cpu_possible_mask(p); + + alloc_cpumask_var(&new_mask, GFP_KERNEL); + + /* + * __migrate_task() can fail silently in the face of concurrent + * offlining of the chosen destination CPU, so take the hotplug + * lock to ensure that the migration succeeds. + */ + cpus_read_lock(); + if (!cpumask_available(new_mask)) + goto out_set_mask; + + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) + goto out_free_mask; + + /* + * We failed to find a valid subset of the affinity mask for the + * task, so override it based on its cpuset hierarchy. + */ + cpuset_cpus_allowed(p, new_mask); + override_mask = new_mask; + +out_set_mask: + if (printk_ratelimit()) { + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, + cpumask_pr_args(override_mask)); + } + + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: + cpus_read_unlock(); + free_cpumask_var(new_mask); +} + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) + * @p->user_cpus_ptr. + * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = p->user_cpus_ptr; + unsigned long flags; + + /* + * Try to restore the old affinity mask. If this fails, then + * we free the mask explicitly to avoid it being inherited across + * a subsequent fork(). + */ + if (!user_mask || !__sched_setaffinity(p, user_mask)) + return; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + user_mask = clear_user_cpus_ptr(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + kfree(user_mask); +} + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -7629,7 +7791,7 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); if (retval) goto out_free_new_mask; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5fa02902c143..e7e2bba5b520 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2244,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 #define SCA_MIGRATE_ENABLE 0x04 +#define SCA_USER 0x08 #ifdef CONFIG_SMP -- cgit v1.2.3-71-gd317 From 234b8ab6476c5edd5262e2ff563de9498d60044a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:24:36 +0100 Subject: sched: Introduce dl_task_check_affinity() to check proposed affinity In preparation for restricting the affinity of a task during execve() on arm64, introduce a new dl_task_check_affinity() helper function to give an indication as to whether the restricted mask is admissible for a deadline task. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Link: https://lore.kernel.org/r/20210730112443.23245-10-will@kernel.org --- include/linux/sched.h | 6 ++++++ kernel/sched/core.c | 46 +++++++++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index ce2d5cfc331e..3bb9fecfdaa1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1709,6 +1709,7 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); +extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); #else @@ -1731,6 +1732,11 @@ static inline void release_user_cpus_ptr(struct task_struct *p) { WARN_ON(p->user_cpus_ptr); } + +static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + return 0; +} #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6ee197049c9c..a22cc3c156ce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7756,6 +7756,32 @@ out_unlock: return retval; } +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + int ret = 0; + + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. + */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + ret = -EBUSY; + rcu_read_unlock(); + return ret; +} +#endif + static int __sched_setaffinity(struct task_struct *p, const struct cpumask *mask) { @@ -7773,23 +7799,9 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask) cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, mask, cpus_allowed); - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ -#ifdef CONFIG_SMP - if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { - rcu_read_lock(); - if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { - retval = -EBUSY; - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } -#endif + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; again: retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); if (retval) -- cgit v1.2.3-71-gd317 From 99409b935c9ac5ea36ab5218954115c52449234d Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Mon, 9 Aug 2021 10:12:15 +0800 Subject: locking/semaphore: Add might_sleep() to down_*() family Semaphore is sleeping lock. Add might_sleep() to down*() family (with exception of down_trylock()) to detect atomic context sleep. Signed-off-by: Xiaoming Ni Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210809021215.19991-1-nixiaoming@huawei.com --- kernel/locking/semaphore.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 9aa855a96c4a..9ee381e4d2a4 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -54,6 +54,7 @@ void down(struct semaphore *sem) { unsigned long flags; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -77,6 +78,7 @@ int down_interruptible(struct semaphore *sem) unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -103,6 +105,7 @@ int down_killable(struct semaphore *sem) unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -157,6 +160,7 @@ int down_timeout(struct semaphore *sem, long timeout) unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; -- cgit v1.2.3-71-gd317 From 7491e2c442781a1860181adb5ab472a52075f393 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Aug 2021 11:26:06 -0400 Subject: tracing: Add a probe that attaches to trace events A new dynamic event is introduced: event probe. The event is attached to an existing tracepoint and uses its fields as arguments. The user can specify custom format string of the new event, select what tracepoint arguments will be printed and how to print them. An event probe is created by writing configuration string in 'dynamic_events' ftrace file: e[:[SNAME/]ENAME] SYSTEM/EVENT [FETCHARGS] - Set an event probe -:SNAME/ENAME - Delete an event probe Where: SNAME - System name, if omitted 'eprobes' is used. ENAME - Name of the new event in SNAME, if omitted the SYSTEM_EVENT is used. SYSTEM - Name of the system, where the tracepoint is defined, mandatory. EVENT - Name of the tracepoint event in SYSTEM, mandatory. FETCHARGS - Arguments: =$[:TYPE] - Fetch given filed of the tracepoint and print it as given TYPE with given name. Supported types are: (u8/u16/u32/u64/s8/s16/s32/s64), basic type (x8/x16/x32/x64), hexadecimal types "string", "ustring" and bitfield. Example, attach an event probe on openat system call and print name of the file that will be opened: echo "e:esys/eopen syscalls/sys_enter_openat file=\$filename:string" >> dynamic_events A new dynamic event is created in events/esys/eopen/ directory. It can be deleted with: echo "-:esys/eopen" >> dynamic_events Filters, triggers and histograms can be attached to the new event, it can be matched in synthetic events. There is one limitation - an event probe can not be attached to kprobe, uprobe or another event probe. Link: https://lkml.kernel.org/r/20210812145805.2292326-1-tz.stoyanov@gmail.com Link: https://lkml.kernel.org/r/20210819152825.142428383@goodmis.org Acked-by: Masami Hiramatsu Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 4 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 5 +- kernel/trace/trace.h | 18 + kernel/trace/trace_eprobe.c | 903 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 14 +- kernel/trace/trace_kprobe.c | 8 - kernel/trace/trace_probe.c | 16 +- kernel/trace/trace_probe.h | 6 +- 9 files changed, 962 insertions(+), 13 deletions(-) create mode 100644 kernel/trace/trace_eprobe.c (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9564c4d9a3b6..0a0144580bbd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -313,6 +313,7 @@ enum { TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, + TRACE_EVENT_FL_EPROBE_BIT, }; /* @@ -325,6 +326,7 @@ enum { * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe + * EPROBE - Event is an event probe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -335,6 +337,7 @@ enum { TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), + TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) @@ -680,6 +683,7 @@ enum event_trigger_type { ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), ETT_HIST_ENABLE = (1 << 5), + ETT_EVENT_EPROBE = (1 << 6), }; extern int filter_match_preds(struct event_filter *filter, void *rec); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b1c47ccf4f73..6de5d4d63165 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8425c3d70895..489924cde4f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5543,6 +5543,7 @@ static const char readme_msg[] = #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif + "\t e[:[/]] . []\n" "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" @@ -5552,7 +5553,7 @@ static const char readme_msg[] = " place (uprobe): :[%return][(ref_ctr_offset)]\n" #endif "\t args: =fetcharg[:type]\n" - "\t fetcharg: %, @
, @[+|-],\n" + "\t fetcharg: (%|$), @
, @[+|-],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API "\t $stack, $stack, $retval, $comm, $arg,\n" #else @@ -5567,6 +5568,8 @@ static const char readme_msg[] = "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" "\t [unsigned] char/int/long\n" #endif + "\t efield: For event probes ('e' types), the field is on of the fields\n" + "\t of the /.\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a0e693000c6..b7c0f8e160fb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -126,6 +126,11 @@ struct kprobe_trace_entry_head { unsigned long ip; }; +struct eprobe_trace_entry_head { + struct trace_entry ent; + unsigned int type; +}; + struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; @@ -1508,9 +1513,14 @@ static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); +enum { + EVENT_TRIGGER_FL_PROBE = BIT(0), +}; + struct event_trigger_data { unsigned long count; int ref; + int flags; struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; @@ -1918,6 +1928,14 @@ static inline bool is_good_name(const char *name) return true; } +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + /* * This is a generic way to read and write a u64 value from a file in tracefs. * diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c new file mode 100644 index 000000000000..56a96e9750cf --- /dev/null +++ b/kernel/trace/trace_eprobe.c @@ -0,0 +1,903 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * event probes + * + * Part of this code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu + * + * Copyright (C) 2021, VMware Inc, Steven Rostedt + * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> + * + */ +#include +#include +#include + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_tmpl.h" + +#define EPROBE_EVENT_SYSTEM "eprobes" + +struct trace_eprobe { + /* tracepoint system */ + const char *event_system; + + /* tracepoint event */ + const char *event_name; + + struct trace_event_call *event; + + struct dyn_event devent; + struct trace_probe tp; +}; + +struct eprobe_data { + struct trace_event_file *file; + struct trace_eprobe *ep; +}; + +static int __trace_eprobe_create(int argc, const char *argv[]); + +static void trace_event_probe_cleanup(struct trace_eprobe *ep) +{ + if (!ep) + return; + trace_probe_cleanup(&ep->tp); + kfree(ep->event_name); + kfree(ep->event_system); + if (ep->event) + trace_event_put_ref(ep->event); + kfree(ep); +} + +static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_eprobe, devent); +} + +static int eprobe_dyn_event_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_eprobe_create); +} + +static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int i; + + seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp), + trace_probe_name(&ep->tp)); + seq_printf(m, " %s.%s", ep->event_system, ep->event_name); + + for (i = 0; i < ep->tp.nr_args; i++) + seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +static int unregister_trace_eprobe(struct trace_eprobe *ep) +{ + /* If other probes are on the event, just unregister eprobe */ + if (trace_probe_has_sibling(&ep->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&ep->tp)) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (trace_probe_unregister_event_call(&ep->tp)) + return -EBUSY; + +unreg: + dyn_event_remove(&ep->devent); + trace_probe_unlink(&ep->tp); + + return 0; +} + +static int eprobe_dyn_event_release(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int ret = unregister_trace_eprobe(ep); + + if (!ret) + trace_event_probe_cleanup(ep); + return ret; +} + +static bool eprobe_dyn_event_is_busy(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return trace_probe_is_enabled(&ep->tp); +} + +static bool eprobe_dyn_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return strcmp(trace_probe_name(&ep->tp), event) == 0 && + (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) && + trace_probe_match_command_args(&ep->tp, argc, argv); +} + +static struct dyn_event_operations eprobe_dyn_event_ops = { + .create = eprobe_dyn_event_create, + .show = eprobe_dyn_event_show, + .is_busy = eprobe_dyn_event_is_busy, + .free = eprobe_dyn_event_release, + .match = eprobe_dyn_event_match, +}; + +static struct trace_eprobe *alloc_event_probe(const char *group, + const char *this_event, + struct trace_event_call *event, + int nargs) +{ + struct trace_eprobe *ep; + const char *event_name; + const char *sys_name; + int ret = -ENOMEM; + + if (!event) + return ERR_PTR(-ENODEV); + + sys_name = event->class->system; + event_name = trace_event_name(event); + + ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); + if (!ep) { + trace_event_put_ref(ep->event); + goto error; + } + ep->event = event; + ep->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ep->event_name) + goto error; + ep->event_system = kstrdup(sys_name, GFP_KERNEL); + if (!ep->event_system) + goto error; + + ret = trace_probe_init(&ep->tp, this_event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); + return ep; +error: + trace_event_probe_cleanup(ep); + return ERR_PTR(ret); +} + +static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) +{ + struct probe_arg *parg = &ep->tp.args[i]; + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ep->event); + list_for_each_entry(field, head, link) { + if (!strcmp(parg->code->data, field->name)) { + kfree(parg->code->data); + parg->code->data = field; + return 0; + } + } + kfree(parg->code->data); + parg->code->data = NULL; + return -ENOENT; +} + +static int eprobe_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct eprobe_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_fields eprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = eprobe_event_define_fields }, + {} +}; + +/* Event entry printers */ +static enum print_line_t +print_eprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct eprobe_trace_entry_head *field; + struct trace_event_call *pevent; + struct trace_event *probed_event; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct eprobe_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + probed_event = ftrace_find_event(field->type); + if (probed_event) { + pevent = container_of(probed_event, struct trace_event_call, event); + trace_seq_printf(s, "%s.%s", pevent->class->system, + trace_event_name(pevent)); + } else { + trace_seq_printf(s, "%u", field->type); + } + + trace_seq_putc(s, ')'); + + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static unsigned long get_event_field(struct fetch_insn *code, void *rec) +{ + struct ftrace_event_field *field = code->data; + unsigned long val; + void *addr; + + addr = rec + field->offset; + + switch (field->size) { + case 1: + if (field->is_signed) + val = *(char *)addr; + else + val = *(unsigned char *)addr; + break; + case 2: + if (field->is_signed) + val = *(short *)addr; + else + val = *(unsigned short *)addr; + break; + case 4: + if (field->is_signed) + val = *(int *)addr; + else + val = *(unsigned int *)addr; + break; + default: + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + return val; +} + +static int get_eprobe_size(struct trace_probe *tp, void *rec) +{ + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + unsigned long val; + + val = get_event_field(arg->code, rec); + len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Kprobe specific fetch functions */ + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + unsigned long val; + + val = get_event_field(code, rec); + return process_fetch_insn_bottom(code + 1, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + return (ret < 0) ? ret : len; +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + const void __user *uaddr = (__force const void __user *)src; + + return copy_from_user_nofault(dest, uaddr, size); +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return copy_from_kernel_nofault(dest, src, size); +} + +/* eprobe handler */ +static inline void +__eprobe_trace_func(struct eprobe_data *edata, void *rec) +{ + struct eprobe_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != edata->file->event_call)) + return; + + if (trace_trigger_soft_disabled(edata->file)) + return; + + fbuffer.trace_ctx = tracing_gen_ctx(); + fbuffer.trace_file = edata->file; + + dsize = get_eprobe_size(&edata->ep->tp, rec); + fbuffer.regs = NULL; + + fbuffer.event = + trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file, + call->event.type, + sizeof(*entry) + edata->ep->tp.size + dsize, + fbuffer.trace_ctx); + if (!fbuffer.event) + return; + + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + if (edata->ep->event) + entry->type = edata->ep->event->event.type; + else + entry->type = 0; + store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +/* + * The event probe implementation uses event triggers to get access to + * the event it is attached to, but is not an actual trigger. The below + * functions are just stubs to fulfill what is needed to use the trigger + * infrastructure. + */ +static int eprobe_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return 0; +} + +static void eprobe_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + +} + +static int eprobe_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + /* Do not print eprobe event triggers */ + return 0; +} + +static void eprobe_trigger_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe) +{ + struct eprobe_data *edata = data->private_data; + + __eprobe_trace_func(edata, rec); +} + +static struct event_trigger_ops eprobe_trigger_ops = { + .func = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, +}; + +static int eprobe_trigger_cmd_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + return -1; +} + +static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + return -1; +} + +static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + +} + +static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) +{ + return &eprobe_trigger_ops; +} + +static struct event_command event_trigger_cmd = { + .name = "eprobe", + .trigger_type = ETT_EVENT_EPROBE, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = eprobe_trigger_cmd_func, + .reg = eprobe_trigger_reg_func, + .unreg = eprobe_trigger_unreg_func, + .unreg_all = NULL, + .get_trigger_ops = eprobe_trigger_get_ops, + .set_filter = NULL, +}; + +static struct event_trigger_data * +new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) +{ + struct event_trigger_data *trigger; + struct eprobe_data *edata; + + edata = kzalloc(sizeof(*edata), GFP_KERNEL); + trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); + if (!trigger || !edata) { + kfree(edata); + kfree(trigger); + return ERR_PTR(-ENOMEM); + } + + trigger->flags = EVENT_TRIGGER_FL_PROBE; + trigger->count = -1; + trigger->ops = &eprobe_trigger_ops; + + /* + * EVENT PROBE triggers are not registered as commands with + * register_event_command(), as they are not controlled by the user + * from the trigger file + */ + trigger->cmd_ops = &event_trigger_cmd; + + INIT_LIST_HEAD(&trigger->list); + RCU_INIT_POINTER(trigger->filter, NULL); + + edata->file = file; + edata->ep = ep; + trigger->private_data = edata; + + return trigger; +} + +static int enable_eprobe(struct trace_eprobe *ep, + struct trace_event_file *eprobe_file) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct trace_array *tr = eprobe_file->tr; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + trigger = new_eprobe_trigger(ep, eprobe_file); + if (IS_ERR(trigger)) + return PTR_ERR(trigger); + + list_add_tail_rcu(&trigger->list, &file->triggers); + + trace_event_trigger_enable_disable(file, 1); + update_cond_flag(file); + + return 0; +} + +static struct trace_event_functions eprobe_funcs = { + .trace = print_eprobe_event +}; + +static int disable_eprobe(struct trace_eprobe *ep, + struct trace_array *tr) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct eprobe_data *edata; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + + list_for_each_entry(trigger, &file->triggers, list) { + if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE)) + continue; + edata = trigger->private_data; + if (edata->ep == ep) + break; + } + if (list_entry_is_head(trigger, &file->triggers, list)) + return -ENODEV; + + list_del_rcu(&trigger->list); + + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + return 0; +} + +static int enable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *pos, *tp; + struct trace_eprobe *ep; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (enabled) + return 0; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + ret = enable_eprobe(ep, file); + if (ret) + break; + enabled = true; + } + + if (ret) { + /* Failed to enable one of them. Roll back all */ + if (enabled) + disable_eprobe(ep, file->tr); + if (file) + trace_probe_remove_file(tp, file); + else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + } + + return ret; +} + +static int disable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *pos, *tp; + struct trace_eprobe *ep; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + disable_eprobe(ep, file->tr); + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +static int eprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_eprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_eprobe(event, file); +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +static inline void init_trace_eprobe_call(struct trace_eprobe *ep) +{ + struct trace_event_call *call = trace_probe_event_call(&ep->tp); + + call->flags = TRACE_EVENT_FL_EPROBE; + call->event.funcs = &eprobe_funcs; + call->class->fields_array = eprobe_fields_array; + call->class->reg = eprobe_register; +} + +static struct trace_event_call * +find_and_get_event(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + /* Skip other probes and ftrace events */ + if (tp_event->flags & + (TRACE_EVENT_FL_IGNORE_ENABLE | + TRACE_EVENT_FL_KPROBE | + TRACE_EVENT_FL_UPROBE | + TRACE_EVENT_FL_EPROBE)) + continue; + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + if (!trace_event_try_get_ref(tp_event)) { + return NULL; + break; + } + return tp_event; + break; + } + return NULL; +} + +static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) +{ + unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT; + int ret; + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); + if (ret) + return ret; + + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) + ret = trace_eprobe_tp_arg_update(ep, i); + + return ret; +} + +static int __trace_eprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * Fetch args: + * =$[:TYPE] + */ + const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; + const char *sys_event = NULL, *sys_name = NULL; + struct trace_event_call *event_call; + struct trace_eprobe *ep = NULL; + char buf1[MAX_EVENT_NAME_LEN]; + char buf2[MAX_EVENT_NAME_LEN]; + int ret = 0; + int i; + + if (argc < 2 || argv[0][0] != 'e') + return -ECANCELED; + + trace_probe_log_init("event_probe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) { + event++; + ret = traceprobe_parse_event_name(&event, &group, buf1, + event - argv[0]); + if (ret) + goto parse_error; + } else { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } + if (!is_good_name(event) || !is_good_name(group)) + goto parse_error; + + sys_event = argv[1]; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, + sys_event - argv[1]); + if (ret || !sys_name) + goto parse_error; + if (!is_good_name(sys_event) || !is_good_name(sys_name)) + goto parse_error; + + mutex_lock(&event_mutex); + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + mutex_unlock(&event_mutex); + + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto error; /* We know ep is not allocated */ + } + + argc -= 2; argv += 2; + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ret = trace_eprobe_tp_update_arg(ep, argv, i); + if (ret) + goto error; + } + ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); + if (ret < 0) + goto error; + init_trace_eprobe_call(ep); + mutex_lock(&event_mutex); + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + mutex_unlock(&event_mutex); + goto error; + } + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + mutex_unlock(&event_mutex); + return ret; +parse_error: + ret = -EINVAL; +error: + trace_event_probe_cleanup(ep); + return ret; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup eprobe + * events in postcore_initcall without tracefs. + */ +static __init int trace_events_eprobe_init_early(void) +{ + int err = 0; + + err = dyn_event_register(&eprobe_dyn_event_ops); + if (err) + pr_warn("Could not register eprobe_dyn_event_ops\n"); + + return err; +} +core_initcall(trace_events_eprobe_init_early); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 6b11e335a62e..3d5c07239a2a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -124,6 +124,18 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) return seq_list_next(t, &event_file->triggers, pos); } +static bool check_user_trigger(struct trace_event_file *file) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->flags & EVENT_TRIGGER_FL_PROBE) + continue; + return true; + } + return false; +} + static void *trigger_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *event_file; @@ -134,7 +146,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos) if (unlikely(!event_file)) return ERR_PTR(-ENODEV); - if (list_empty(&event_file->triggers)) + if (list_empty(&event_file->triggers) || !check_user_trigger(event_file)) return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; return seq_list_start(&event_file->triggers, *pos); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 882c27044029..3a64ba4bbad6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -707,14 +707,6 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; -/* Convert certain expected symbols into '_' when generating event names */ -static inline void sanitize_event_name(char *name) -{ - while (*name++ != '\0') - if (*name == ':' || *name == '.') - *name = '_'; -} - static int __trace_kprobe_create(int argc, const char *argv[]) { /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 782c00eb6859..3ed2a3f37297 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -319,6 +319,13 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif + } else if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -646,13 +653,14 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, !strcmp(parg->type->name, "ustring")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && - code->op != FETCH_OP_DATA) { + code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || - code->op == FETCH_OP_DATA) || parg->count) { + code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || + parg->count) { /* * IMM, DATA and COMM is pointing actual address, those * must be kept, and if parg->count != 0, this is an @@ -867,6 +875,10 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, fmt = "(%lx <- %lx)"; arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; break; + case PROBE_PRINT_EVENT: + fmt = "(%u)"; + arg = "REC->" FIELD_STRING_TYPE; + break; default: WARN_ON_ONCE(1); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 66701a92d186..99e7a5df025e 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,6 +38,7 @@ #define FIELD_STRING_IP "__probe_ip" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" +#define FIELD_STRING_TYPE "__probe_type" #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ @@ -102,6 +103,7 @@ enum fetch_op { FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ + FETCH_OP_TP_ARG, /* Trace Point argument */ FETCH_OP_END, FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ }; @@ -351,7 +353,8 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -#define TPARG_FL_MASK GENMASK(2, 0) +#define TPARG_FL_TPOINT BIT(3) +#define TPARG_FL_MASK GENMASK(3, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, unsigned int flags); @@ -366,6 +369,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, enum probe_print_type { PROBE_PRINT_NORMAL, PROBE_PRINT_RETURN, + PROBE_PRINT_EVENT, }; extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); -- cgit v1.2.3-71-gd317 From 131d326ba969847daa43d708ac11c27978d78566 Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Mon, 23 Aug 2021 13:34:39 +0530 Subject: irqdomain: Export irq_domain_disconnect_hierarchy() Export irq_domain_disconnect_hierarchy() so irqchip module drivers can use it. Signed-off-by: Maulik Shah Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1629705880-27877-2-git-send-email-mkshah@codeaurora.org --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 51c483ce2447..62be16135e7c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1215,6 +1215,7 @@ int irq_domain_disconnect_hierarchy(struct irq_domain *domain, irqd->chip = ERR_PTR(-ENOTCONN); return 0; } +EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy); static int irq_domain_trim_hierarchy(unsigned int virq) { -- cgit v1.2.3-71-gd317 From f552a27afe67f05c47bb0c33b92af2a23b684c31 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 12 Aug 2021 12:14:35 +0800 Subject: io_uring: remove files pointer in cancellation functions When doing cancellation, we use a parameter to indicate where it's from do_exit or exec. So a boolean value is good enough for this, remove the struct files* as it is not necessary. Signed-off-by: Hao Xu [axboe: fixup io_uring_files_cancel for !CONFIG_IO_URING] Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- include/linux/io_uring.h | 10 +++++----- kernel/exit.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4f5a00707644..7626cad93f60 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9213,9 +9213,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } } -void __io_uring_cancel(struct files_struct *files) +void __io_uring_cancel(bool cancel_all) { - io_uring_cancel_generic(!files, NULL); + io_uring_cancel_generic(cancel_all, NULL); } static void *io_uring_validate_mmap_request(struct file *file, diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index ed13304e764c..649a4d7c241b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -7,18 +7,18 @@ #if defined(CONFIG_IO_URING) struct sock *io_uring_get_socket(struct file *file); -void __io_uring_cancel(struct files_struct *files); +void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); -static inline void io_uring_files_cancel(struct files_struct *files) +static inline void io_uring_files_cancel(void) { if (current->io_uring) - __io_uring_cancel(files); + __io_uring_cancel(false); } static inline void io_uring_task_cancel(void) { if (current->io_uring) - __io_uring_cancel(NULL); + __io_uring_cancel(true); } static inline void io_uring_free(struct task_struct *tsk) { @@ -33,7 +33,7 @@ static inline struct sock *io_uring_get_socket(struct file *file) static inline void io_uring_task_cancel(void) { } -static inline void io_uring_files_cancel(struct files_struct *files) +static inline void io_uring_files_cancel(void) { } static inline void io_uring_free(struct task_struct *tsk) diff --git a/kernel/exit.c b/kernel/exit.c index 9a89e7f36acb..91a43e57a32e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -777,7 +777,7 @@ void __noreturn do_exit(long code) schedule(); } - io_uring_files_cancel(tsk->files); + io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ -- cgit v1.2.3-71-gd317 From 5b029a32cfe4600f5e10e36b41778506b90fd4de Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 23 Aug 2021 21:02:09 +0200 Subject: bpf: Fix ringbuf helper function compatibility Commit 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") extended check_map_func_compatibility() by enforcing map -> helper function match, but not helper -> map type match. Due to this all of the bpf_ringbuf_*() helper functions could be used with a wrong map type such as array or hash map, leading to invalid access due to type confusion. Also, both BPF_FUNC_ringbuf_{submit,discard} have ARG_PTR_TO_ALLOC_MEM as argument and not a BPF map. Therefore, their check_map_func_compatibility() presence is incorrect since it's only for map type checking. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Ryota Shiga (Flatt Security) Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 381d3d6f24bc..49f07e2bf23b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5150,8 +5150,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output && func_id != BPF_FUNC_ringbuf_reserve && - func_id != BPF_FUNC_ringbuf_submit && - func_id != BPF_FUNC_ringbuf_discard && func_id != BPF_FUNC_ringbuf_query) goto error; break; @@ -5260,6 +5258,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; + case BPF_FUNC_ringbuf_output: + case BPF_FUNC_ringbuf_reserve: + case BPF_FUNC_ringbuf_query: + if (map->map_type != BPF_MAP_TYPE_RINGBUF) + goto error; + break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; -- cgit v1.2.3-71-gd317 From 5ddf994fa22f78ae3742d72520a8c3e8521d96cd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Aug 2021 11:12:17 -0500 Subject: ucounts: Fix regression preventing increasing of rlimits in init_user_ns "Ma, XinjianX" reported: > When lkp team run kernel selftests, we found after these series of patches, testcase mqueue: mq_perf_tests > in kselftest failed with following message. > > # selftests: mqueue: mq_perf_tests > # > # Initial system state: > # Using queue path: /mq_perf_tests > # RLIMIT_MSGQUEUE(soft): 819200 > # RLIMIT_MSGQUEUE(hard): 819200 > # Maximum Message Size: 8192 > # Maximum Queue Size: 10 > # Nice value: 0 > # > # Adjusted system state for testing: > # RLIMIT_MSGQUEUE(soft): (unlimited) > # RLIMIT_MSGQUEUE(hard): (unlimited) > # Maximum Message Size: 16777216 > # Maximum Queue Size: 65530 > # Nice value: -20 > # Continuous mode: (disabled) > # CPUs to pin: 3 > # ./mq_perf_tests: mq_open() at 296: Too many open files > not ok 2 selftests: mqueue: mq_perf_tests # exit=1 > ``` > > Test env: > rootfs: debian-10 > gcc version: 9 After investigation the problem turned out to be that ucount_max for the rlimits in init_user_ns was being set to the initial rlimit value. The practical problem is that ucount_max provides a limit that applications inside the user namespace can not exceed. Which means in practice that rlimits that have been converted to use the ucount infrastructure were not able to exceend their initial rlimits. Solve this by setting the relevant values of ucount_max to RLIM_INIFINITY. A limit in init_user_ns is pointless so the code should allow the values to grow as large as possible without riscking an underflow or an overflow. As the ltp test case was a bit of a pain I have reproduced the rlimit failure and tested the fix with the following little C program: > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > > int main(int argc, char **argv) > { > struct mq_attr mq_attr; > struct rlimit rlim; > mqd_t mqd; > int ret; > > ret = getrlimit(RLIMIT_MSGQUEUE, &rlim); > if (ret != 0) { > fprintf(stderr, "getrlimit(RLIMIT_MSGQUEUE) failed: %s\n", strerror(errno)); > exit(EXIT_FAILURE); > } > printf("RLIMIT_MSGQUEUE %lu %lu\n", > rlim.rlim_cur, rlim.rlim_max); > rlim.rlim_cur = RLIM_INFINITY; > rlim.rlim_max = RLIM_INFINITY; > ret = setrlimit(RLIMIT_MSGQUEUE, &rlim); > if (ret != 0) { > fprintf(stderr, "setrlimit(RLIMIT_MSGQUEUE, RLIM_INFINITY) failed: %s\n", strerror(errno)); > exit(EXIT_FAILURE); > } > > memset(&mq_attr, 0, sizeof(struct mq_attr)); > mq_attr.mq_maxmsg = 65536 - 1; > mq_attr.mq_msgsize = 16*1024*1024 - 1; > > mqd = mq_open("/mq_rlimit_test", O_RDONLY|O_CREAT, 0600, &mq_attr); > if (mqd == (mqd_t)-1) { > fprintf(stderr, "mq_open failed: %s\n", strerror(errno)); > exit(EXIT_FAILURE); > } > ret = mq_close(mqd); > if (ret) { > fprintf(stderr, "mq_close failed; %s\n", strerror(errno)); > exit(EXIT_FAILURE); > } > > return EXIT_SUCCESS; > } Fixes: 6e52a9f0532f ("Reimplement RLIMIT_MSGQUEUE on top of ucounts") Fixes: d7c9e99aee48 ("Reimplement RLIMIT_MEMLOCK on top of ucounts") Fixes: d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts") Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Reported-by: kernel test robot lkp@intel.com Acked-by: Alexey Gladkov Link: https://lkml.kernel.org/r/87eeajswfc.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bc94b2cc5995..44f4c2d83763 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -828,10 +828,10 @@ void __init fork_init(void) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC)); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE)); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING)); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", -- cgit v1.2.3-71-gd317 From bbb6d0f3e1feb43d663af089c7dedb23be6a04fb Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 23 Aug 2021 18:16:33 +0200 Subject: ucounts: Increase ucounts reference counter before the security hook We need to increment the ucounts reference counter befor security_prepare_creds() because this function may fail and abort_creds() will try to decrement this reference. [ 96.465056][ T8641] FAULT_INJECTION: forcing a failure. [ 96.465056][ T8641] name fail_page_alloc, interval 1, probability 0, space 0, times 0 [ 96.478453][ T8641] CPU: 1 PID: 8641 Comm: syz-executor668 Not tainted 5.14.0-rc6-syzkaller #0 [ 96.487215][ T8641] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 [ 96.497254][ T8641] Call Trace: [ 96.500517][ T8641] dump_stack_lvl+0x1d3/0x29f [ 96.505758][ T8641] ? show_regs_print_info+0x12/0x12 [ 96.510944][ T8641] ? log_buf_vmcoreinfo_setup+0x498/0x498 [ 96.516652][ T8641] should_fail+0x384/0x4b0 [ 96.521141][ T8641] prepare_alloc_pages+0x1d1/0x5a0 [ 96.526236][ T8641] __alloc_pages+0x14d/0x5f0 [ 96.530808][ T8641] ? __rmqueue_pcplist+0x2030/0x2030 [ 96.536073][ T8641] ? lockdep_hardirqs_on_prepare+0x3e2/0x750 [ 96.542056][ T8641] ? alloc_pages+0x3f3/0x500 [ 96.546635][ T8641] allocate_slab+0xf1/0x540 [ 96.551120][ T8641] ___slab_alloc+0x1cf/0x350 [ 96.555689][ T8641] ? kzalloc+0x1d/0x30 [ 96.559740][ T8641] __kmalloc+0x2e7/0x390 [ 96.563980][ T8641] ? kzalloc+0x1d/0x30 [ 96.568029][ T8641] kzalloc+0x1d/0x30 [ 96.571903][ T8641] security_prepare_creds+0x46/0x220 [ 96.577174][ T8641] prepare_creds+0x411/0x640 [ 96.581747][ T8641] __sys_setfsuid+0xe2/0x3a0 [ 96.586333][ T8641] do_syscall_64+0x3d/0xb0 [ 96.590739][ T8641] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 96.596611][ T8641] RIP: 0033:0x445a69 [ 96.600483][ T8641] Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 [ 96.620152][ T8641] RSP: 002b:00007f1054173318 EFLAGS: 00000246 ORIG_RAX: 000000000000007a [ 96.628543][ T8641] RAX: ffffffffffffffda RBX: 00000000004ca4c8 RCX: 0000000000445a69 [ 96.636600][ T8641] RDX: 0000000000000010 RSI: 00007f10541732f0 RDI: 0000000000000000 [ 96.644550][ T8641] RBP: 00000000004ca4c0 R08: 0000000000000001 R09: 0000000000000000 [ 96.652500][ T8641] R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004ca4cc [ 96.660631][ T8641] R13: 00007fffffe0b62f R14: 00007f1054173400 R15: 0000000000022000 Fixes: 905ae01c4ae2 ("Add a reference to ucounts for each cred") Reported-by: syzbot+01985d7909f9468f013c@syzkaller.appspotmail.com Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/97433b1742c3331f02ad92de5a4f07d673c90613.1629735352.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/cred.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index e6fd2b3fc31f..f784e08c2fbd 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -286,13 +286,13 @@ struct cred *prepare_creds(void) new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) - goto error; - new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) + goto error; + validate_creds(new); return new; @@ -753,13 +753,13 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) #ifdef CONFIG_SECURITY new->security = NULL; #endif - if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) - goto error; - new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; + if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) + goto error; + put_cred(old); validate_creds(new); return new; -- cgit v1.2.3-71-gd317 From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 19 Aug 2021 02:24:20 -0700 Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type. Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots. As a result struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE. bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info. To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com --- include/linux/bpf-cgroup.h | 182 ++++++++++++++++++++++++++++------------- include/uapi/linux/bpf.h | 2 +- kernel/bpf/cgroup.c | 156 +++++++++++++++++++++-------------- net/ipv4/af_inet.c | 6 +- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 6 +- net/ipv6/udp.c | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 8 files changed, 226 insertions(+), 132 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a74cd1c3bd87..2746fd804216 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,9 +23,73 @@ struct ctl_table_header; struct task_struct; #ifdef CONFIG_CGROUP_BPF +enum cgroup_bpf_attach_type { + CGROUP_BPF_ATTACH_TYPE_INVALID = -1, + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS, + CGROUP_INET_SOCK_CREATE, + CGROUP_SOCK_OPS, + CGROUP_DEVICE, + CGROUP_INET4_BIND, + CGROUP_INET6_BIND, + CGROUP_INET4_CONNECT, + CGROUP_INET6_CONNECT, + CGROUP_INET4_POST_BIND, + CGROUP_INET6_POST_BIND, + CGROUP_UDP4_SENDMSG, + CGROUP_UDP6_SENDMSG, + CGROUP_SYSCTL, + CGROUP_UDP4_RECVMSG, + CGROUP_UDP6_RECVMSG, + CGROUP_GETSOCKOPT, + CGROUP_SETSOCKOPT, + CGROUP_INET4_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, + CGROUP_INET4_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, + CGROUP_INET_SOCK_RELEASE, + MAX_CGROUP_BPF_ATTACH_TYPE +}; + +#define CGROUP_ATYPE(type) \ + case BPF_##type: return type + +static inline enum cgroup_bpf_attach_type +to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + CGROUP_ATYPE(CGROUP_INET_INGRESS); + CGROUP_ATYPE(CGROUP_INET_EGRESS); + CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); + CGROUP_ATYPE(CGROUP_SOCK_OPS); + CGROUP_ATYPE(CGROUP_DEVICE); + CGROUP_ATYPE(CGROUP_INET4_BIND); + CGROUP_ATYPE(CGROUP_INET6_BIND); + CGROUP_ATYPE(CGROUP_INET4_CONNECT); + CGROUP_ATYPE(CGROUP_INET6_CONNECT); + CGROUP_ATYPE(CGROUP_INET4_POST_BIND); + CGROUP_ATYPE(CGROUP_INET6_POST_BIND); + CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); + CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); + CGROUP_ATYPE(CGROUP_SYSCTL); + CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); + CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); + CGROUP_ATYPE(CGROUP_GETSOCKOPT); + CGROUP_ATYPE(CGROUP_SETSOCKOPT); + CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); + default: + return CGROUP_BPF_ATTACH_TYPE_INVALID; + } +} + +#undef CGROUP_ATYPE -extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; -#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype]) #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -67,15 +131,15 @@ struct bpf_prog_array; struct cgroup_bpf { /* array of effective progs in this cgroup */ - struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; /* attached progs to this cgroup and attach flags * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will * have either zero or one element * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct list_head progs[MAX_BPF_ATTACH_TYPE]; - u32 flags[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; /* list of cgroup shared storages */ struct list_head storages; @@ -115,28 +179,28 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type); + short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, char __user *optval, @@ -179,9 +243,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ - BPF_CGROUP_INET_INGRESS); \ + CGROUP_INET_INGRESS); \ \ __ret; \ }) @@ -189,54 +253,54 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ - BPF_CGROUP_INET_EGRESS); \ + CGROUP_INET_EGRESS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, type); \ + if (cgroup_bpf_enabled(atype)) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) -#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + if (cgroup_bpf_enabled(atype)) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, \ &__unused_flags); \ __ret; \ }) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ t_ctx, \ &__unused_flags); \ release_sock(sk); \ @@ -249,13 +313,13 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). */ -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags) \ +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags) \ ({ \ u32 __flags = 0; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ @@ -265,33 +329,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ - ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ - cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL) /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by @@ -311,33 +375,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ - __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ + __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ - BPF_CGROUP_DEVICE); \ + CGROUP_DEVICE); \ \ __ret; \ }) @@ -346,10 +410,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ + if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ - BPF_CGROUP_SYSCTL); \ + CGROUP_SYSCTL); \ __ret; \ }) @@ -357,7 +421,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -368,7 +432,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -377,7 +441,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ @@ -392,7 +456,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ @@ -451,14 +515,14 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled(type) (0) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) +#define cgroup_bpf_enabled(atype) (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) @@ -470,7 +534,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8e9d99e2ade4..03145d45e3d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work) struct list_head *storages = &cgrp->bpf.storages; struct bpf_cgroup_storage *storage, *stmp; - unsigned int type; + unsigned int atype; mutex_lock(&cgroup_mutex); - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { - struct list_head *progs = &cgrp->bpf.progs[type]; + for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { + struct list_head *progs = &cgrp->bpf.progs[atype]; struct bpf_prog_list *pl, *pltmp; list_for_each_entry_safe(pl, pltmp, progs, node) { @@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); } old_array = rcu_dereference_protected( - cgrp->bpf.effective[type], + cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } @@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *p; @@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (!p) return true; do { - u32 flags = p->bpf.flags[type]; + u32 flags = p->bpf.flags[atype]; u32 cnt; if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[type]); + cnt = prog_list_length(&p->bpf.progs[atype]); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, * to programs in this cgroup */ static int compute_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array **array) { struct bpf_prog_array_item *item; @@ -236,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp, /* count number of effective programs by walking parents */ do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[type]); + if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[atype]); p = cgroup_parent(p); } while (p); @@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[type], node) { + list_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp, } static void activate_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array *old_array) { - old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, + old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array @@ -328,7 +328,7 @@ cleanup: } static int update_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup_subsys_state *css; int err; @@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp, if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue; - err = compute_effective_progs(desc, type, &desc->bpf.inactive); + err = compute_effective_progs(desc, atype, &desc->bpf.inactive); if (err) goto cleanup; } @@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp, continue; } - activate_effective_progs(desc, type, desc->bpf.inactive); + activate_effective_progs(desc, atype, desc->bpf.inactive); desc->bpf.inactive = NULL; } @@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); - struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; + struct list_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type)) + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + + if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) + if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); - cgrp->bpf.flags[type] = saved_flags; + cgrp->bpf.flags[atype] = saved_flags; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key[type]); + static_branch_inc(&cgroup_bpf_enabled_key[atype]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -520,7 +527,7 @@ cleanup: * all descendant cgroups. This function is guaranteed to succeed. */ static void replace_effective_prog(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_cgroup_link *link) { struct bpf_prog_array_item *item; @@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp, /* find position of link in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { - if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - head = &cg->bpf.progs[type]; + head = &cg->bpf.progs[atype]; list_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; @@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp, found: BUG_ON(!cg); progs = rcu_dereference_protected( - desc->bpf.effective[type], + desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); item = &progs->items[pos]; WRITE_ONCE(item->prog, link->link.prog); @@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, struct bpf_prog *new_prog) { - struct list_head *progs = &cgrp->bpf.progs[link->type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; + struct list_head *progs; bool found = false; + atype = to_cgroup_bpf_attach_type(link->type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + if (link->link.prog->type != new_prog->type) return -EINVAL; @@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, return -ENOENT; old_prog = xchg(&link->link.prog, new_prog); - replace_effective_prog(cgrp, link->type, link); + replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); return 0; } @@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type) { - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog_list *pl; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + struct list_head *progs; + u32 flags; int err; + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; + if (prog && link) /* only one of prog or link can be specified */ return -EINVAL; @@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; @@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ - cgrp->bpf.flags[type] = 0; + cgrp->bpf.flags[atype] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; cleanup: @@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; + struct list_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; + u32 flags; + + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; - effective = rcu_dereference_protected(cgrp->bpf.effective[type], + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) @@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_cgroup; } - err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, - BPF_F_ALLOW_MULTI); + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, + link->type, BPF_F_ALLOW_MULTI); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; @@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; @@ -1008,11 +1038,11 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - if (type == BPF_CGROUP_INET_EGRESS) { + if (atype == CGROUP_INET_EGRESS) { ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } @@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); * and if it returned != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1065,7 +1095,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags) { @@ -1090,7 +1120,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx, + ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; @@ -1115,19 +1145,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); */ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type) + short access, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp; struct bpf_cgroup_dev_ctx ctx = { @@ -1139,7 +1169,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, + allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); @@ -1231,7 +1261,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct bpf_sysctl_kern ctx = { .head = head, @@ -1271,7 +1301,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1289,7 +1319,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum bpf_attach_type attach_type) + enum cgroup_bpf_attach_type attach_type) { struct bpf_prog_array *prog_array; bool empty; @@ -1364,7 +1394,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1385,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1460,7 +1490,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; @@ -1495,7 +1525,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1556,7 +1586,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0e4d758c2585..1d816a5fd3eb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET4_BIND, &flags); + CGROUP_INET4_BIND, &flags); if (err) return err; @@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETPEERNAME, + CGROUP_INET4_GETPEERNAME, NULL); } else { __be32 addr = inet->inet_rcv_saddr; @@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETSOCKNAME, + CGROUP_INET4_GETSOCKNAME, NULL); } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1a742b710e54..8851c9463b4b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1143,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d92c90d97763..b5878bb8e419 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -455,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET6_BIND, &flags); + CGROUP_INET6_BIND, &flags); if (err) return err; @@ -532,7 +532,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -541,7 +541,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, NULL); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c5e15e94bb00..ea53847b5b7e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1475,7 +1475,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { -- cgit v1.2.3-71-gd317 From 88ffe2d0a55a165e55cedad1693f239d47e3e17e Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 16 Aug 2021 14:48:17 +0100 Subject: genirq/cpuhotplug: Demote debug printk to KERN_DEBUG This sort of information is only generally useful when debugging. No need to have these sprinkled through the kernel log otherwise. Real world problem: During pre-release testing these have an affect on performance on real products. To the point where so much logging builds up, that it sets off the watchdog(s) on some high profile consumer devices. Signed-off-by: Lee Jones Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210816134817.1503661-1-lee.jones@linaro.org --- kernel/irq/cpuhotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 02236b13b359..39a41c56ad4f 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -166,7 +166,7 @@ void irq_migrate_all_off_this_cpu(void) raw_spin_unlock(&desc->lock); if (affinity_broken) { - pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n", + pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n", irq, smp_processor_id()); } } -- cgit v1.2.3-71-gd317 From 2f170814bdd26289e9daaa4ae359290f854e5dcf Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 13 Aug 2021 15:56:27 +1200 Subject: genirq/msi: Move MSI sysfs handling from PCI to MSI core Move PCI's MSI sysfs code to the irq core so that other busses such as platform can reuse it. Signed-off-by: Barry Song Signed-off-by: Thomas Gleixner Acked-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20210813035628.6844-2-21cnbao@gmail.com --- drivers/pci/msi.c | 125 ++++-------------------------------------------- include/linux/msi.h | 4 ++ kernel/irq/msi.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index ce841f327ff6..6eb0ae39deae 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -363,9 +363,7 @@ static void free_msi_irqs(struct pci_dev *dev) { struct list_head *msi_list = dev_to_msi_list(&dev->dev); struct msi_desc *entry, *tmp; - struct attribute **msi_attrs; - struct device_attribute *dev_attr; - int i, count = 0; + int i; for_each_pci_msi_entry(entry, dev) if (entry->irq) @@ -385,18 +383,7 @@ static void free_msi_irqs(struct pci_dev *dev) } if (dev->msi_irq_groups) { - sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups); - msi_attrs = dev->msi_irq_groups[0]->attrs; - while (msi_attrs[count]) { - dev_attr = container_of(msi_attrs[count], - struct device_attribute, attr); - kfree(dev_attr->attr.name); - kfree(dev_attr); - ++count; - } - kfree(msi_attrs); - kfree(dev->msi_irq_groups[0]); - kfree(dev->msi_irq_groups); + msi_destroy_sysfs(&dev->dev, dev->msi_irq_groups); dev->msi_irq_groups = NULL; } } @@ -476,102 +463,6 @@ void pci_restore_msi_state(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_restore_msi_state); -static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct msi_desc *entry; - unsigned long irq; - int retval; - - retval = kstrtoul(attr->attr.name, 10, &irq); - if (retval) - return retval; - - entry = irq_get_msi_desc(irq); - if (!entry) - return -ENODEV; - - return sysfs_emit(buf, "%s\n", - entry->msi_attrib.is_msix ? "msix" : "msi"); -} - -static int populate_msi_sysfs(struct pci_dev *pdev) -{ - struct attribute **msi_attrs; - struct attribute *msi_attr; - struct device_attribute *msi_dev_attr; - struct attribute_group *msi_irq_group; - const struct attribute_group **msi_irq_groups; - struct msi_desc *entry; - int ret = -ENOMEM; - int num_msi = 0; - int count = 0; - int i; - - /* Determine how many msi entries we have */ - for_each_pci_msi_entry(entry, pdev) - num_msi += entry->nvec_used; - if (!num_msi) - return 0; - - /* Dynamically create the MSI attributes for the PCI device */ - msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); - if (!msi_attrs) - return -ENOMEM; - for_each_pci_msi_entry(entry, pdev) { - for (i = 0; i < entry->nvec_used; i++) { - msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); - if (!msi_dev_attr) - goto error_attrs; - msi_attrs[count] = &msi_dev_attr->attr; - - sysfs_attr_init(&msi_dev_attr->attr); - msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", - entry->irq + i); - if (!msi_dev_attr->attr.name) - goto error_attrs; - msi_dev_attr->attr.mode = S_IRUGO; - msi_dev_attr->show = msi_mode_show; - ++count; - } - } - - msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); - if (!msi_irq_group) - goto error_attrs; - msi_irq_group->name = "msi_irqs"; - msi_irq_group->attrs = msi_attrs; - - msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); - if (!msi_irq_groups) - goto error_irq_group; - msi_irq_groups[0] = msi_irq_group; - - ret = sysfs_create_groups(&pdev->dev.kobj, msi_irq_groups); - if (ret) - goto error_irq_groups; - pdev->msi_irq_groups = msi_irq_groups; - - return 0; - -error_irq_groups: - kfree(msi_irq_groups); -error_irq_group: - kfree(msi_irq_group); -error_attrs: - count = 0; - msi_attr = msi_attrs[count]; - while (msi_attr) { - msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); - kfree(msi_attr->name); - kfree(msi_dev_attr); - ++count; - msi_attr = msi_attrs[count]; - } - kfree(msi_attrs); - return ret; -} - static struct msi_desc * msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd) { @@ -667,9 +558,11 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, if (ret) goto err; - ret = populate_msi_sysfs(dev); - if (ret) + dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(dev->msi_irq_groups)) { + ret = PTR_ERR(dev->msi_irq_groups); goto err; + } /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); @@ -834,9 +727,11 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, msix_update_entries(dev, entries); - ret = populate_msi_sysfs(dev); - if (ret) + dev->msi_irq_groups = msi_populate_sysfs(&dev->dev); + if (IS_ERR(dev->msi_irq_groups)) { + ret = PTR_ERR(dev->msi_irq_groups); goto out_free; + } /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); diff --git a/include/linux/msi.h b/include/linux/msi.h index a20dc66b9946..49cf6eb222e7 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -239,6 +239,10 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); +const struct attribute_group **msi_populate_sysfs(struct device *dev); +void msi_destroy_sysfs(struct device *dev, + const struct attribute_group **msi_irq_groups); + /* * The arch hooks to setup up msi irqs. Default functions are implemented * as weak symbols so that they /can/ be overriden by architecture specific diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index bb18040efbe8..48ef144649e3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internals.h" @@ -71,6 +72,139 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) } EXPORT_SYMBOL_GPL(get_cached_msi_msg); +static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct msi_desc *entry; + bool is_msix = false; + unsigned long irq; + int retval; + + retval = kstrtoul(attr->attr.name, 10, &irq); + if (retval) + return retval; + + entry = irq_get_msi_desc(irq); + if (!entry) + return -ENODEV; + + if (dev_is_pci(dev)) + is_msix = entry->msi_attrib.is_msix; + + return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); +} + +/** + * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices + * @dev: The device(PCI, platform etc) who will get sysfs entries + * + * Return attribute_group ** so that specific bus MSI can save it to + * somewhere during initilizing msi irqs. If devices has no MSI irq, + * return NULL; if it fails to populate sysfs, return ERR_PTR + */ +const struct attribute_group **msi_populate_sysfs(struct device *dev) +{ + const struct attribute_group **msi_irq_groups; + struct attribute **msi_attrs, *msi_attr; + struct device_attribute *msi_dev_attr; + struct attribute_group *msi_irq_group; + struct msi_desc *entry; + int ret = -ENOMEM; + int num_msi = 0; + int count = 0; + int i; + + /* Determine how many msi entries we have */ + for_each_msi_entry(entry, dev) + num_msi += entry->nvec_used; + if (!num_msi) + return NULL; + + /* Dynamically create the MSI attributes for the device */ + msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); + if (!msi_attrs) + return ERR_PTR(-ENOMEM); + + for_each_msi_entry(entry, dev) { + for (i = 0; i < entry->nvec_used; i++) { + msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); + if (!msi_dev_attr) + goto error_attrs; + msi_attrs[count] = &msi_dev_attr->attr; + + sysfs_attr_init(&msi_dev_attr->attr); + msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", + entry->irq + i); + if (!msi_dev_attr->attr.name) + goto error_attrs; + msi_dev_attr->attr.mode = 0444; + msi_dev_attr->show = msi_mode_show; + ++count; + } + } + + msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); + if (!msi_irq_group) + goto error_attrs; + msi_irq_group->name = "msi_irqs"; + msi_irq_group->attrs = msi_attrs; + + msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); + if (!msi_irq_groups) + goto error_irq_group; + msi_irq_groups[0] = msi_irq_group; + + ret = sysfs_create_groups(&dev->kobj, msi_irq_groups); + if (ret) + goto error_irq_groups; + + return msi_irq_groups; + +error_irq_groups: + kfree(msi_irq_groups); +error_irq_group: + kfree(msi_irq_group); +error_attrs: + count = 0; + msi_attr = msi_attrs[count]; + while (msi_attr) { + msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); + kfree(msi_attr->name); + kfree(msi_dev_attr); + ++count; + msi_attr = msi_attrs[count]; + } + kfree(msi_attrs); + return ERR_PTR(ret); +} + +/** + * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices + * @dev: The device(PCI, platform etc) who will remove sysfs entries + * @msi_irq_groups: attribute_group for device msi_irqs entries + */ +void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) +{ + struct device_attribute *dev_attr; + struct attribute **msi_attrs; + int count = 0; + + if (msi_irq_groups) { + sysfs_remove_groups(&dev->kobj, msi_irq_groups); + msi_attrs = msi_irq_groups[0]->attrs; + while (msi_attrs[count]) { + dev_attr = container_of(msi_attrs[count], + struct device_attribute, attr); + kfree(dev_attr->attr.name); + kfree(dev_attr); + ++count; + } + kfree(msi_attrs); + kfree(msi_irq_groups[0]); + kfree(msi_irq_groups); + } +} + #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN static inline void irq_chip_write_msi_msg(struct irq_data *data, struct msi_msg *msg) -- cgit v1.2.3-71-gd317 From d7af7e497f0308bc97809cc48b58e8e0f13887e1 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 20 Aug 2021 09:39:35 -0700 Subject: bpf: Fix possible out of bound write in narrow load handling Fix a verifier bug found by smatch static checker in [0]. This problem has never been seen in prod to my best knowledge. Fixing it still seems to be a good idea since it's hard to say for sure whether it's possible or not to have a scenario where a combination of convert_ctx_access() and a narrow load would lead to an out of bound write. When narrow load is handled, one or two new instructions are added to insn_buf array, but before it was only checked that cnt >= ARRAY_SIZE(insn_buf) And it's safe to add a new instruction to insn_buf[cnt++] only once. The second try will lead to out of bound write. And this is what can happen if `shift` is set. Fix it by making sure that if the BPF_RSH instruction has to be added in addition to BPF_AND then there is enough space for two more instructions in insn_buf. The full report [0] is below: kernel/bpf/verifier.c:12304 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array kernel/bpf/verifier.c:12311 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array kernel/bpf/verifier.c 12282 12283 insn->off = off & ~(size_default - 1); 12284 insn->code = BPF_LDX | BPF_MEM | size_code; 12285 } 12286 12287 target_size = 0; 12288 cnt = convert_ctx_access(type, insn, insn_buf, env->prog, 12289 &target_size); 12290 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Bounds check. 12291 (ctx_field_size && !target_size)) { 12292 verbose(env, "bpf verifier is misconfigured\n"); 12293 return -EINVAL; 12294 } 12295 12296 if (is_narrower_load && size < target_size) { 12297 u8 shift = bpf_ctx_narrow_access_offset( 12298 off, size, size_default) * 8; 12299 if (ctx_field_size <= 4) { 12300 if (shift) 12301 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, ^^^^^ increment beyond end of array 12302 insn->dst_reg, 12303 shift); --> 12304 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, ^^^^^ out of bounds write 12305 (1 << size * 8) - 1); 12306 } else { 12307 if (shift) 12308 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, 12309 insn->dst_reg, 12310 shift); 12311 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, ^^^^^^^^^^^^^^^ Same. 12312 (1ULL << size * 8) - 1); 12313 } 12314 } 12315 12316 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 12317 if (!new_prog) 12318 return -ENOMEM; 12319 12320 delta += cnt - 1; 12321 12322 /* keep walking new program and skip insns we just inserted */ 12323 env->prog = new_prog; 12324 insn = new_prog->insnsi + i + delta; 12325 } 12326 12327 return 0; 12328 } [0] https://lore.kernel.org/bpf/20210817050843.GA21456@kili/ v1->v2: - clarify that problem was only seen by static checker but not in prod; Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0") Reported-by: Dan Carpenter Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820163935.1902398-1-rdna@fb.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f5a0077c9981..206c221453cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12295,6 +12295,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; + if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier narrow ctx load misconfigured\n"); + return -EINVAL; + } if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, -- cgit v1.2.3-71-gd317 From 67d69e9d1a6c889d98951c1d74b19332ce0565af Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 23 Aug 2021 22:04:09 -0400 Subject: audit: move put_tree() to avoid trim_trees refcount underflow and UAF AUDIT_TRIM is expected to be idempotent, but multiple executions resulted in a refcount underflow and use-after-free. git bisect fingered commit fb041bb7c0a9 ("locking/refcount: Consolidate implementations of refcount_t") but this patch with its more thorough checking that wasn't in the x86 assembly code merely exposed a previously existing tree refcount imbalance in the case of tree trimming code that was refactored with prune_one() to remove a tree introduced in commit 8432c7006297 ("audit: Simplify locking around untag_chunk()") Move the put_tree() to cover only the prune_one() case. Passes audit-testsuite and 3 passes of "auditctl -t" with at least one directory watch. Cc: Jan Kara Cc: Will Deacon Cc: Alexander Viro Cc: Seiji Nishikawa Cc: stable@vger.kernel.org Fixes: 8432c7006297 ("audit: Simplify locking around untag_chunk()") Signed-off-by: Richard Guy Briggs Reviewed-by: Jan Kara [PM: reformatted/cleaned-up the commit description] Signed-off-by: Paul Moore --- kernel/audit_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b2be4e978ba3..2cd7b5694422 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -593,7 +593,6 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged) spin_lock(&hash_lock); } spin_unlock(&hash_lock); - put_tree(victim); } /* @@ -602,6 +601,7 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged) static void prune_one(struct audit_tree *victim) { prune_tree_chunks(victim, false); + put_tree(victim); } /* trim the uncommitted chunks from tree */ -- cgit v1.2.3-71-gd317 From c3123c431447da99db160264506de9897c003513 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Aug 2021 12:33:12 +0200 Subject: locking/rtmutex: Dont dereference waiter lockless The new rt_mutex_spin_on_onwer() loop checks whether the spinning waiter is still the top waiter on the lock by utilizing rt_mutex_top_waiter(), which is broken because that function contains a sanity check which dereferences the top waiter pointer to check whether the waiter belongs to the lock. That's wrong in the lockless spinwait case: CPU 0 CPU 1 rt_mutex_lock(lock) rt_mutex_lock(lock); queue(waiter0) waiter0 == rt_mutex_top_waiter(lock) rt_mutex_spin_on_onwer(lock, waiter0) { queue(waiter1) waiter1 == rt_mutex_top_waiter(lock) ... top_waiter = rt_mutex_top_waiter(lock) leftmost = rb_first_cached(&lock->waiters); -> signal dequeue(waiter1) destroy(waiter1) w = rb_entry(leftmost, ....) BUG_ON(w->lock != lock) <- UAF The BUG_ON() is correct for the case where the caller holds lock->wait_lock which guarantees that the leftmost waiter entry cannot vanish. For the lockless spinwait case it's broken. Create a new helper function which avoids the pointer dereference and just compares the leftmost entry pointer with current's waiter pointer to validate that currrent is still elegible for spinning. Fixes: 992caf7f1724 ("locking/rtmutex: Add adaptive spinwait mechanism") Reported-by: Sebastian Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210825102453.981720644@linutronix.de --- kernel/locking/rtmutex.c | 5 +++-- kernel/locking/rtmutex_common.h | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8aaa352d0c17..b3c09611ef6a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1329,8 +1329,9 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * for CONFIG_PREEMPT_RCU=y) * - the VCPU on which owner runs is preempted */ - if (!owner->on_cpu || waiter != rt_mutex_top_waiter(lock) || - need_resched() || vcpu_is_preempted(task_cpu(owner))) { + if (!owner->on_cpu || need_resched() || + rt_mutex_waiter_is_top_waiter(lock, waiter) || + vcpu_is_preempted(task_cpu(owner))) { res = false; break; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 61256de5bd66..c47e8361bfb5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -95,6 +95,19 @@ static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } +/* + * Lockless speculative check whether @waiter is still the top waiter on + * @lock. This is solely comparing pointers and not derefencing the + * leftmost entry which might be about to vanish. + */ +static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) +{ + struct rb_node *leftmost = rb_first_cached(&lock->waiters); + + return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter; +} + static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); -- cgit v1.2.3-71-gd317 From 37e8abff2bebbf9947d6b784f5c75ed48a717089 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Aug 2021 12:33:14 +0200 Subject: locking/rtmutex: Dequeue waiter on ww_mutex deadlock The rt_mutex based ww_mutex variant queues the new waiter first in the lock's rbtree before evaluating the ww_mutex specific conditions which might decide that the waiter should back out. This check and conditional exit happens before the waiter is enqueued into the PI chain. The failure handling at the call site assumes that the waiter, if it is the top most waiter on the lock, is queued in the PI chain and then proceeds to adjust the unmodified PI chain, which results in RB tree corruption. Dequeue the waiter from the lock waiter list in the ww_mutex error exit path to prevent this. Fixes: add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex") Reported-by: Sebastian Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210825102454.042280541@linutronix.de --- kernel/locking/rtmutex.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b3c09611ef6a..c8fe74ef8db9 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1082,8 +1082,13 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx); - if (res) + if (res) { + raw_spin_lock(&task->pi_lock); + rt_mutex_dequeue(lock, waiter); + task->pi_blocked_on = NULL; + raw_spin_unlock(&task->pi_lock); return res; + } } if (!owner) -- cgit v1.2.3-71-gd317 From 9f72daf7edfa8f7e86ce8940d52266b5e931dcb0 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 25 Aug 2021 12:54:15 +0200 Subject: cgroup/cpuset: Avoid memory migration when nodemasks match With the introduction of ee9707e8593d ("cgroup/cpuset: Enable memory migration for cpuset v2") attaching a process to a different cgroup will trigger a memory migration regardless of whether it's really needed. Memory migration is an expensive operation, so bypass it if the nodemasks passed to cpuset_migrate_mm() are equal. Note that we're not only avoiding the migration work itself, but also a call to lru_cache_disable(), which triggers and flushes an LRU drain work on every online CPU. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 44d234b0df5e..d497a65c4f04 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1634,6 +1634,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, { struct cpuset_migrate_mm_work *mwork; + if (nodes_equal(*from, *to)) { + mmput(mm); + return; + } + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); if (mwork) { mwork->mm = mm; -- cgit v1.2.3-71-gd317 From 33c5cb36015ac1034b50b823fae367e908d05147 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:47 -0700 Subject: bpf: Consolidate task_struct BTF_ID declarations No need to have it defined 5 times. Once is enough. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/6dcefa5bed26fe1226f26683f36819bb53ec19a2.1629772842.git.dxu@dxuuu.xyz --- include/linux/btf_ids.h | 2 ++ kernel/bpf/bpf_task_storage.c | 6 ++---- kernel/bpf/stackmap.c | 4 +--- kernel/bpf/task_iter.c | 11 +++++------ kernel/trace/bpf_trace.c | 4 ++-- 5 files changed, 12 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 6d1395030616..93d881ab0d48 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -188,4 +188,6 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif +extern u32 btf_task_struct_ids[]; + #endif diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 3ce75758d394..ebfa8bc90892 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -317,15 +317,13 @@ const struct bpf_map_ops task_storage_map_ops = { .map_owner_storage_ptr = task_storage_ptr, }; -BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) - const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg2_btf_id = &btf_task_struct_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -336,5 +334,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg2_btf_id = &btf_task_struct_ids[0], }; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6fbc2abe9c91..e8eefdf8cf3e 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -530,14 +530,12 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return res; } -BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) - const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &bpf_get_task_stack_btf_ids[0], + .arg1_btf_id = &btf_task_struct_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index b68cb5d6d6eb..b48750bfba5a 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -525,7 +525,6 @@ static const struct seq_operations task_vma_seq_ops = { }; BTF_ID_LIST(btf_task_file_ids) -BTF_ID(struct, task_struct) BTF_ID(struct, file) BTF_ID(struct, vm_area_struct) @@ -591,19 +590,19 @@ static int __init task_iter_init(void) { int ret; - task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; - task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; - task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0]; ret = bpf_iter_reg_target(&task_file_reg_info); if (ret) return ret; - task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; - task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2]; + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cbc73c08c4a4..50d055fc2327 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -714,13 +714,13 @@ BPF_CALL_0(bpf_get_current_task_btf) return (unsigned long) current; } -BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) +BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) static const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID, - .ret_btf_id = &bpf_get_current_btf_ids[0], + .ret_btf_id = &btf_task_struct_ids[0], }; BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) -- cgit v1.2.3-71-gd317 From a396eda5517ac958fb4eb7358f4708eb829058c4 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:48 -0700 Subject: bpf: Extend bpf_base_func_proto helpers with bpf_get_current_task_btf() bpf_get_current_task() is already supported so it's natural to also include the _btf() variant for btf-powered helpers. This is required for non-tracing progs to use bpf_task_pt_regs() in the next commit. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/f99870ed5f834c9803d73b3476f8272b1bb987c0.1629772842.git.dxu@dxuuu.xyz --- kernel/bpf/helpers.c | 3 +++ kernel/trace/bpf_trace.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4e8540716187..609674f409ed 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1322,6 +1322,7 @@ out: } const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; @@ -1407,6 +1408,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 50d055fc2327..4e54f3dc209f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -716,7 +716,7 @@ BPF_CALL_0(bpf_get_current_task_btf) BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) -static const struct bpf_func_proto bpf_get_current_task_btf_proto = { +const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID, -- cgit v1.2.3-71-gd317 From dd6e10fbd9fb86a571d925602c8a24bb4d09a2a7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:49 -0700 Subject: bpf: Add bpf_task_pt_regs() helper The motivation behind this helper is to access userspace pt_regs in a kprobe handler. uprobe's ctx is the userspace pt_regs. kprobe's ctx is the kernelspace pt_regs. bpf_task_pt_regs() allows accessing userspace pt_regs in a kprobe handler. The final case (kernelspace pt_regs in uprobe) is pretty rare (usermode helper) so I think that can be solved later if necessary. More concretely, this helper is useful in doing BPF-based DWARF stack unwinding. Currently the kernel can only do framepointer based stack unwinds for userspace code. This is because the DWARF state machines are too fragile to be computed in kernelspace [0]. The idea behind DWARF-based stack unwinds w/ BPF is to copy a chunk of the userspace stack (while in prog context) and send it up to userspace for unwinding (probably with libunwind) [1]. This would effectively enable profiling applications with -fomit-frame-pointer using kprobes and uprobes. [0]: https://lkml.org/lkml/2012/2/10/356 [1]: https://github.com/danobi/bpf-dwarf-walk Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/e2718ced2d51ef4268590ab8562962438ab82815.1629772842.git.dxu@dxuuu.xyz --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/helpers.c | 3 +++ kernel/trace/bpf_trace.c | 19 +++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 609674f409ed..c227b7d4f56c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1327,6 +1327,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -1424,6 +1425,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e54f3dc209f..580e14ee7ff9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -723,6 +723,23 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = { .ret_btf_id = &btf_task_struct_ids[0], }; +BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) +{ + return (unsigned long) task_pt_regs(task); +} + +BTF_ID_LIST(bpf_task_pt_regs_ids) +BTF_ID(struct, pt_regs) + +const struct bpf_func_proto bpf_task_pt_regs_proto = { + .func = bpf_task_pt_regs, + .gpl_only = true, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_task_struct_ids[0], + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_task_pt_regs_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1032,6 +1049,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3-71-gd317 From eb18b49ea758ec052ac2a12c6bb204e1e877ec31 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 24 Aug 2021 10:30:07 -0700 Subject: bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt This patch allows the bpf-tcp-cc to call bpf_setsockopt. One use case is to allow a bpf-tcp-cc switching to another cc during init(). For example, when the tcp flow is not ecn ready, the bpf_dctcp can switch to another cc by calling setsockopt(TCP_CONGESTION). During setsockopt(TCP_CONGESTION), the new tcp-cc's init() will be called and this could cause a recursion but it is stopped by the current trampoline's logic (in the prog->active counter). While retiring a bpf-tcp-cc (e.g. in tcp_v[46]_destroy_sock()), the tcp stack calls bpf-tcp-cc's release(). To avoid the retiring bpf-tcp-cc making further changes to the sk, bpf_setsockopt is not available to the bpf-tcp-cc's release(). This will avoid release() making setsockopt() call that will potentially allocate new resources. Although the bpf-tcp-cc already has a more powerful way to read tcp_sock from the PTR_TO_BTF_ID, it is usually expected that bpf_getsockopt and bpf_setsockopt are available together. Thus, bpf_getsockopt() is also added to all tcp_congestion_ops except release(). When the old bpf-tcp-cc is calling setsockopt(TCP_CONGESTION) to switch to a new cc, the old bpf-tcp-cc will be released by bpf_struct_ops_put(). Thus, this patch also puts the bpf_struct_ops_map after a rcu grace period because the trampoline's image cannot be freed while the old bpf-tcp-cc is still running. bpf-tcp-cc can only access icsk_ca_priv as SCALAR. All kernel's tcp-cc is also accessing the icsk_ca_priv as SCALAR. The size of icsk_ca_priv has already been raised a few times to avoid extra kmalloc and memory referencing. The only exception is the kernel's tcp_cdg.c that stores a kmalloc()-ed pointer in icsk_ca_priv. To avoid the old bpf-tcp-cc accidentally overriding this tcp_cdg's pointer value stored in icsk_ca_priv after switching and without over-complicating the bpf's verifier for this one exception in tcp_cdg, this patch does not allow switching to tcp_cdg. If there is a need, bpf_tcp_cdg can be implemented and then use the bpf_sk_storage as the extended storage. bpf_sk_setsockopt proto has only been recently added and used in bpf-sockopt and bpf-iter-tcp, so impose the tcp_cdg limitation in the same proto instead of adding a new proto specifically for bpf-tcp-cc. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210824173007.3976921-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 22 +++++++++++++++++++++- net/core/filter.c | 6 ++++++ net/ipv4/bpf_tcp_ca.c | 41 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 70f6fd4fa305..d6731c32864e 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -28,6 +28,7 @@ struct bpf_struct_ops_value { struct bpf_struct_ops_map { struct bpf_map map; + struct rcu_head rcu; const struct bpf_struct_ops *st_ops; /* protect map_update */ struct mutex lock; @@ -622,6 +623,14 @@ bool bpf_struct_ops_get(const void *kdata) return refcount_inc_not_zero(&kvalue->refcnt); } +static void bpf_struct_ops_put_rcu(struct rcu_head *head) +{ + struct bpf_struct_ops_map *st_map; + + st_map = container_of(head, struct bpf_struct_ops_map, rcu); + bpf_map_put(&st_map->map); +} + void bpf_struct_ops_put(const void *kdata) { struct bpf_struct_ops_value *kvalue; @@ -632,6 +641,17 @@ void bpf_struct_ops_put(const void *kdata) st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); - bpf_map_put(&st_map->map); + /* The struct_ops's function may switch to another struct_ops. + * + * For example, bpf_tcp_cc_x->init() may switch to + * another tcp_cc_y by calling + * setsockopt(TCP_CONGESTION, "tcp_cc_y"). + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called + * and its map->refcnt may reach 0 which then free its + * trampoline image while tcp_cc_x is still running. + * + * Thus, a rcu grace period is needed here. + */ + call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); } } diff --git a/net/core/filter.c b/net/core/filter.c index cfbd01167eb5..2e32cee2c469 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5051,6 +5051,12 @@ err_clear: BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { + if (level == SOL_TCP && optname == TCP_CONGESTION) { + if (optlen >= sizeof("cdg") - 1 && + !strncmp("cdg", optval, optlen)) + return -ENOTSUPP; + } + return _bpf_setsockopt(sk, level, optname, optval, optlen); } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 9e41eff4a685..0dcee9df1326 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -10,6 +10,9 @@ #include #include +/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, init), offsetof(struct tcp_congestion_ops, release), @@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 prog_ops_moff(const struct bpf_prog *prog) +{ + const struct btf_member *m; + const struct btf_type *t; + u32 midx; + + midx = prog->expected_attach_type; + t = bpf_tcp_congestion_ops.type; + m = &btf_type_member(t)[midx]; + + return btf_member_bit_offset(t, m) / 8; +} + static const struct bpf_func_proto * bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + /* Does not allow release() to call setsockopt. + * release() is called when the current bpf-tcp-cc + * is retiring. It is not allowed to call + * setsockopt() to make further changes which + * may potentially allocate new resources. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + /* Since get/setsockopt is usually expected to + * be available together, disable getsockopt for + * release also to avoid usage surprise. + * The bpf-tcp-cc already has a more powerful way + * to read tcp_sock from the PTR_TO_BTF_ID. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_getsockopt_proto; + return NULL; default: return bpf_base_func_proto(func_id); } @@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata) tcp_unregister_congestion_control(kdata); } -/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ -extern struct bpf_struct_ops bpf_tcp_congestion_ops; - struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, -- cgit v1.2.3-71-gd317 From eb529c5b10b9401a0f2d1f469e82c6a0ba98082c Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 25 Aug 2021 18:48:31 -0700 Subject: bpf: Fix bpf-next builds without CONFIG_BPF_EVENTS This commit fixes linker errors along the lines of: s390-linux-ld: task_iter.c:(.init.text+0xa4): undefined reference to `btf_task_struct_ids'` Fix by defining btf_task_struct_ids unconditionally in kernel/bpf/btf.c since there exists code that unconditionally uses btf_task_struct_ids. Reported-by: kernel test robot Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/05d94748d9f4b3eecedc4fddd6875418a396e23c.1629942444.git.dxu@dxuuu.xyz --- include/linux/btf_ids.h | 1 + kernel/bpf/btf.c | 2 ++ kernel/trace/bpf_trace.c | 2 -- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93d881ab0d48..47d9abfbdb55 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -151,6 +151,7 @@ extern struct btf_id_set name; #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c395024610ed..dfe61df4f974 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6213,3 +6213,5 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; + +BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 580e14ee7ff9..8e2eb950aa82 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -714,8 +714,6 @@ BPF_CALL_0(bpf_get_current_task_btf) return (unsigned long) current; } -BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) - const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, -- cgit v1.2.3-71-gd317 From ffec09f9c7d7b21b0aff29dd5c3972f4631c0b6b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:15:54 +0200 Subject: perf/hw_breakpoint: Replace deprecated CPU-hotplug functions The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210803141621.780504-12-bigeasy@linutronix.de --- kernel/events/hw_breakpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 835973444a1e..f32320ac02fd 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -568,7 +568,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, if (!cpu_events) return (void __percpu __force *)ERR_PTR(-ENOMEM); - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered, context); @@ -579,7 +579,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, per_cpu(*cpu_events, cpu) = bp; } - put_online_cpus(); + cpus_read_unlock(); if (likely(!err)) return cpu_events; -- cgit v1.2.3-71-gd317 From 366e7ad6ba5f4cb2ffd0b7316e404d6ee9c0f401 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Aug 2021 10:47:09 +0200 Subject: sched/fair: Mark tg_is_idle() an inline in the !CONFIG_FAIR_GROUP_SCHED case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not actually used in the !CONFIG_FAIR_GROUP_SCHED case: kernel/sched/fair.c:488:12: warning: ‘tg_is_idle’ defined but not used [-Wunused-function] Keep around a placeholder nevertheless, for API completeness. Mark it inline, so the compiler doesn't think it must be used. Fixes: 304000390f88: ("sched: Cgroup SCHED_IDLE support") Signed-off-by: Ingo Molnar Cc: Josh Don Cc: Peter Zijlstra (Intel) Cc: Vincent Guittot --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6cd05f1d77ef..7b3e85912a1f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -485,7 +485,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } -static int tg_is_idle(struct task_group *tg) +static inline int tg_is_idle(struct task_group *tg) { return 0; } -- cgit v1.2.3-71-gd317 From 307d522f5eb86cd6ac8c905f5b0577dedac54ec5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Jun 2021 16:44:32 -0500 Subject: signal/seccomp: Refactor seccomp signal and coredump generation Factor out force_sig_seccomp from the seccomp signal generation and place it in kernel/signal.c. The function force_sig_seccomp takes a parameter force_coredump to indicate that the sigaction field should be reset to SIGDFL so that a coredump will be generated when the signal is delivered. force_sig_seccomp is then used to replace both seccomp_send_sigsys and seccomp_init_siginfo. force_sig_info_to_task gains an extra parameter to force using the default signal action. With this change seccomp is no longer a special case and there becomes exactly one place do_coredump is called from. Further it no longer becomes necessary for __seccomp_filter to call do_group_exit. Acked-by: Kees Cook Link: https://lkml.kernel.org/r/87r1gr6qc4.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 1 + kernel/seccomp.c | 40 ++++++---------------------------------- kernel/signal.c | 30 ++++++++++++++++++++++++++---- 3 files changed, 33 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b9126fe06c3f..e3c933facd86 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -329,6 +329,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey); int force_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); +int force_sig_seccomp(int syscall, int reason, bool force_coredump); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 057e17f3215d..abcbd3d2ba54 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -922,30 +922,6 @@ void get_seccomp_filter(struct task_struct *tsk) refcount_inc(&orig->users); } -static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) -{ - clear_siginfo(info); - info->si_signo = SIGSYS; - info->si_code = SYS_SECCOMP; - info->si_call_addr = (void __user *)KSTK_EIP(current); - info->si_errno = reason; - info->si_arch = syscall_get_arch(current); - info->si_syscall = syscall; -} - -/** - * seccomp_send_sigsys - signals the task to allow in-process syscall emulation - * @syscall: syscall number to send to userland - * @reason: filter-supplied reason code to send to userland (via si_errno) - * - * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. - */ -static void seccomp_send_sigsys(int syscall, int reason) -{ - struct kernel_siginfo info; - seccomp_init_siginfo(&info, syscall, reason); - force_sig_info(&info); -} #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ @@ -1218,7 +1194,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Show the handler the original registers. */ syscall_rollback(current, current_pt_regs()); /* Let the filter pass back 16 bits of data. */ - seccomp_send_sigsys(this_syscall, data); + force_sig_seccomp(this_syscall, data, false); goto skip; case SECCOMP_RET_TRACE: @@ -1289,18 +1265,14 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || get_nr_threads(current) == 1) { - kernel_siginfo_t info; - /* Show the original registers in the dump. */ syscall_rollback(current, current_pt_regs()); - /* Trigger a manual coredump since do_exit skips it. */ - seccomp_init_siginfo(&info, this_syscall, data); - do_coredump(&info); - } - if (action == SECCOMP_RET_KILL_THREAD) + /* Trigger a coredump with SIGSYS */ + force_sig_seccomp(this_syscall, data, true); + } else { do_exit(SIGSYS); - else - do_group_exit(SIGSYS); + } + return -1; /* skip the syscall go directly to signal handling */ } unreachable(); diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..fbf941b80dd4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -54,6 +54,7 @@ #include #include #include +#include /* for syscall_get_* */ /* * SLAB caches for signal bits. @@ -1322,7 +1323,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p * that is why we also clear SIGNAL_UNKILLABLE. */ static int -force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) +force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool sigdfl) { unsigned long int flags; int ret, blocked, ignored; @@ -1333,7 +1334,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); - if (blocked || ignored) { + if (blocked || ignored || sigdfl) { action->sa.sa_handler = SIG_DFL; if (blocked) { sigdelset(&t->blocked, sig); @@ -1354,7 +1355,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) int force_sig_info(struct kernel_siginfo *info) { - return force_sig_info_to_task(info, current); + return force_sig_info_to_task(info, current, false); } /* @@ -1685,7 +1686,7 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_flags = flags; info.si_isr = isr; #endif - return force_sig_info_to_task(&info, t); + return force_sig_info_to_task(&info, t, false); } int force_sig_fault(int sig, int code, void __user *addr @@ -1793,6 +1794,27 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) return force_sig_info(&info); } +/** + * force_sig_seccomp - signals the task to allow in-process syscall emulation + * @syscall: syscall number to send to userland + * @reason: filter-supplied reason code to send to userland (via si_errno) + * + * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. + */ +int force_sig_seccomp(int syscall, int reason, bool force_coredump) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSYS; + info.si_code = SYS_SECCOMP; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = reason; + info.si_arch = syscall_get_arch(current); + info.si_syscall = syscall; + return force_sig_info_to_task(&info, current, force_coredump); +} + /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. */ -- cgit v1.2.3-71-gd317 From e681dcbaa4b284454fecd09617f8b24231448446 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 26 Aug 2021 15:37:38 +0200 Subject: sched: Fix get_push_task() vs migrate_disable() push_rt_task() attempts to move the currently running task away if the next runnable task has migration disabled and therefore is pinned on the current CPU. The current task is retrieved via get_push_task() which only checks for nr_cpus_allowed == 1, but does not check whether the task has migration disabled and therefore cannot be moved either. The consequence is a pointless invocation of the migration thread which correctly observes that the task cannot be moved. Return NULL if the task has migration disabled and cannot be moved to another CPU. Fixes: a7c81556ec4d3 ("sched: Fix migrate_disable() vs rt/dl balancing") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210826133738.yiotqbtdaxzjsnfj@linutronix.de --- kernel/sched/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index da4295f5ab77..ddefb0419d7a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2255,6 +2255,9 @@ static inline struct task_struct *get_push_task(struct rq *rq) if (p->nr_cpus_allowed == 1) return NULL; + if (p->migration_disabled) + return NULL; + rq->push_busy = true; return get_task_struct(p); } -- cgit v1.2.3-71-gd317 From d21918e5a94a862ccb297b9f2be38574c865fda0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Jun 2021 16:51:49 -0500 Subject: signal/seccomp: Dump core when there is only one live thread Replace get_nr_threads with atomic_read(¤t->signal->live) as that is a more accurate number that is decremented sooner. Acked-by: Kees Cook Link: https://lkml.kernel.org/r/87lf6z6qbd.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index abcbd3d2ba54..afa4db097068 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1264,7 +1264,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || - get_nr_threads(current) == 1) { + (atomic_read(¤t->signal->live) == 1)) { /* Show the original registers in the dump. */ syscall_rollback(current, current_pt_regs()); /* Trigger a coredump with SIGSYS */ -- cgit v1.2.3-71-gd317 From cedcf527d59bcca5f87f52ea34a157bbc6e7a3a8 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Sun, 22 Aug 2021 10:27:34 +0800 Subject: padata: Remove repeated verbose license text remove it because SPDX-License-Identifier is already used Signed-off-by: Cai Huoqing Acked-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 3258ab89c2a3..18d3a5c699d8 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -9,19 +9,6 @@ * * Copyright (c) 2020 Oracle and/or its affiliates. * Author: Daniel Jordan - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #include -- cgit v1.2.3-71-gd317 From 6467822b8cc96e5feda98c7bf5c6329c6a896c91 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Aug 2021 09:36:53 +0200 Subject: locking/rtmutex: Prevent spurious EDEADLK return caused by ww_mutexes rtmutex based ww_mutexes can legitimately create a cycle in the lock graph which can be observed by a blocker which didn't cause the problem: P1: A, ww_A, ww_B P2: ww_B, ww_A P3: A P3 might therefore be trapped in the ww_mutex induced cycle and run into the lock depth limitation of rt_mutex_adjust_prio_chain() which returns -EDEADLK to the caller. Disable the deadlock detection walk when the chain walk observes a ww_mutex to prevent this looping. [ tglx: Split it apart and added changelog ] Reported-by: Sebastian Siewior Fixes: add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YSeWjCHoK4v5OcOt@hirez.programming.kicks-ass.net --- kernel/locking/rtmutex.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c8fe74ef8db9..3c1ba7b9a326 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -656,6 +656,31 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, if (next_lock != waiter->lock) goto out_unlock_pi; + /* + * There could be 'spurious' loops in the lock graph due to ww_mutex, + * consider: + * + * P1: A, ww_A, ww_B + * P2: ww_B, ww_A + * P3: A + * + * P3 should not return -EDEADLK because it gets trapped in the cycle + * created by P1 and P2 (which will resolve -- and runs into + * max_lock_depth above). Therefore disable detect_deadlock such that + * the below termination condition can trigger once all relevant tasks + * are boosted. + * + * Even when we start with ww_mutex we can disable deadlock detection, + * since we would supress a ww_mutex induced deadlock at [6] anyway. + * Supressing it here however is not sufficient since we might still + * hit [6] due to adjustment driven iteration. + * + * NOTE: if someone were to create a deadlock between 2 ww_classes we'd + * utterly fail to report it; lockdep should. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock) + detect_deadlock = false; + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting -- cgit v1.2.3-71-gd317 From a055fcc132d4c25b96d1115aea514258810dc6fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Aug 2021 10:48:18 +0200 Subject: locking/rtmutex: Return success on deadlock for ww_mutex waiters ww_mutexes can legitimately cause a deadlock situation in the lock graph which is resolved afterwards by the wait/wound mechanics. The rtmutex chain walk can detect such a deadlock and returns EDEADLK which in turn skips the wait/wound mechanism and returns EDEADLK to the caller. That's wrong because both lock chains might get EDEADLK or the wrong waiter would back out. Detect that situation and return 'success' in case that the waiter which initiated the chain walk is a ww_mutex with context. This allows the wait/wound mechanics to resolve the situation according to the rules. [ tglx: Split it apart and added changelog ] Reported-by: Sebastian Siewior Fixes: add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YSeWjCHoK4v5OcOt@hirez.programming.kicks-ass.net --- kernel/locking/rtmutex.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3c1ba7b9a326..8eabdc79602b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -742,8 +742,21 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; + + /* + * When the deadlock is due to ww_mutex; also see above. Don't + * report the deadlock and instead let the ww_mutex wound/die + * logic pick which of the contending threads gets -EDEADLK. + * + * NOTE: assumes the cycle only contains a single ww_class; any + * other configuration and we fail to report; also, see + * lockdep. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx) + ret = 0; + + raw_spin_unlock(&lock->wait_lock); goto out_unlock_pi; } -- cgit v1.2.3-71-gd317 From bc17bed5fd73ef1a9aed39f3b0ea26936dad60b8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 4 Aug 2021 21:01:05 +0800 Subject: printk/index: Fix -Wunused-function warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_MODULES is n, we got this: kernel/printk/index.c:146:13: warning: ‘pi_remove_file’ defined but not used [-Wunused-function] Move it inside #ifdef block to fix this warning. Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210804130105.18732-1-yuehaibing@huawei.com --- kernel/printk/index.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/index.c b/kernel/printk/index.c index 58d27272f992..d3709408debe 100644 --- a/kernel/printk/index.c +++ b/kernel/printk/index.c @@ -143,12 +143,12 @@ static void pi_create_file(struct module *mod) mod, &dfs_index_fops); } +#ifdef CONFIG_MODULES static void pi_remove_file(struct module *mod) { debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index)); } -#ifdef CONFIG_MODULES static int pi_module_notify(struct notifier_block *nb, unsigned long op, void *data) { -- cgit v1.2.3-71-gd317 From d25a025201ed98f4b93775e0999a3f2135702106 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Aug 2021 09:31:28 -0700 Subject: clocksource: Make clocksource watchdog test safe for slow-HZ systems The clocksource watchdog test sets a local JIFFIES_SHIFT macro and assumes that HZ is >= 100. For smaller HZ values this shift value is too large and causes undefined behaviour. Move the HZ-based definitions of JIFFIES_SHIFT from kernel/time/jiffies.c to kernel/time/tick-internal.h so the clocksource watchdog test can utilize them, which makes it work correctly with all HZ values. [ tglx: Resolved conflicts and massaged changelog ] Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/lkml/20210812000133.GA402890@paulmck-ThinkPad-P17-Gen-1/ --- kernel/time/clocksource-wdtest.c | 5 ++--- kernel/time/jiffies.c | 21 +-------------------- kernel/time/tick-internal.h | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 01df12395c0e..df922f49d171 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -19,6 +19,8 @@ #include #include +#include "tick-internal.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -34,9 +36,6 @@ static u64 wdtest_jiffies_read(struct clocksource *cs) return (u64)jiffies; } -/* Assume HZ > 100. */ -#define JIFFIES_SHIFT 8 - static struct clocksource clocksource_wdtest_jiffies = { .name = "wdtest-jiffies", .rating = 1, /* lowest valid rating*/ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 01935aafdb46..bc4db9e5ab70 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -10,28 +10,9 @@ #include #include "timekeeping.h" +#include "tick-internal.h" -/* Since jiffies uses a simple TICK_NSEC multiplier - * conversion, the .shift value could be zero. However - * this would make NTP adjustments impossible as they are - * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to - * shift both the nominator and denominator the same - * amount, and give ntp adjustments in units of 1/2^8 - * - * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. TICK_NSEC grows as HZ - * shrinks, so values greater than 8 overflow 32bits when - * HZ=100. - */ -#if HZ < 34 -#define JIFFIES_SHIFT 6 -#elif HZ < 67 -#define JIFFIES_SHIFT 7 -#else -#define JIFFIES_SHIFT 8 -#endif - static u64 jiffies_read(struct clocksource *cs) { return (u64) jiffies; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 3548f0829e6d..649f2b48e8f0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -177,3 +177,23 @@ void clock_was_set(unsigned int bases); void clock_was_set_delayed(void); void hrtimers_resume_local(void); + +/* Since jiffies uses a simple TICK_NSEC multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when + * HZ=100. + */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else +#define JIFFIES_SHIFT 8 +#endif -- cgit v1.2.3-71-gd317 From d20d30ebb199354729c64c86945ed25c66ff4991 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Aug 2021 17:02:55 -0700 Subject: cgroup: Avoid compiler warnings with no subsystems As done before in commit cb4a31675270 ("cgroup: use bitmask to filter for_each_subsys"), avoid compiler warnings for the pathological case of having no subsystems (i.e. CGROUP_SUBSYS_COUNT == 0). This condition is hit for the arm multi_v7_defconfig config under -Wzero-length-bounds: In file included from ./arch/arm/include/generated/asm/rwonce.h:1, from include/linux/compiler.h:264, from include/uapi/linux/swab.h:6, from include/linux/swab.h:5, from arch/arm/include/asm/opcodes.h:86, from arch/arm/include/asm/bug.h:7, from include/linux/bug.h:5, from include/linux/thread_info.h:13, from include/asm-generic/current.h:5, from ./arch/arm/include/generated/asm/current.h:1, from include/linux/sched.h:12, from include/linux/cgroup.h:12, from kernel/cgroup/cgroup-internal.h:5, from kernel/cgroup/cgroup.c:31: kernel/cgroup/cgroup.c: In function 'of_css': kernel/cgroup/cgroup.c:651:42: warning: array subscript '' is outside the bounds of an interior zero-length array 'struct cgroup_subsys_state *[0]' [-Wzero-length-bounds] 651 | return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); Reported-by: Stephen Rothwell Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d0725c1a8db5..881ce1470beb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -67,6 +67,14 @@ /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) +/* + * To avoid confusing the compiler (and generating warnings) with code + * that attempts to access what would be a 0-element array (i.e. sized + * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this + * constant expression can be added. + */ +#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -248,7 +256,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ bool cgroup_ssid_enabled(int ssid) { - if (CGROUP_SUBSYS_COUNT == 0) + if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); @@ -472,7 +480,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - if (ss) + if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else @@ -550,6 +558,9 @@ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, { struct cgroup_subsys_state *css; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + do { css = cgroup_css(cgrp, ss); @@ -577,6 +588,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, { struct cgroup_subsys_state *css; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + rcu_read_lock(); do { @@ -647,7 +661,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ - if (cft->ss) + if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; @@ -695,7 +709,7 @@ EXPORT_SYMBOL_GPL(of_css); */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ - if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ + if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ @@ -2372,7 +2386,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; - while (&cset->mg_node != tset->csets) { + while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); @@ -4643,7 +4657,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, it->ss = css->ss; it->flags = flags; - if (it->ss) + if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; -- cgit v1.2.3-71-gd317 From f3c4b1341e8320e63f197a554fc5a25686a11d22 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Fri, 27 Aug 2021 11:48:02 +0800 Subject: swiotlb: use depends on for DMA_RESTRICTED_POOL Use depends on instead of select for DMA_RESTRICTED_POOL; otherwise it will make SWIOTLB user configurable and cause compile errors for some arch (e.g. mips). Fixes: 0b84e4f8b793 ("swiotlb: Add restricted DMA pool initialization") Acked-by: Will Deacon Reported-by: Guenter Roeck Signed-off-by: Claire Chang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 3e961dc39634..d19f9fef4ab9 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -82,8 +82,7 @@ config SWIOTLB config DMA_RESTRICTED_POOL bool "DMA Restricted Pool" - depends on OF && OF_RESERVED_MEM - select SWIOTLB + depends on OF && OF_RESERVED_MEM && SWIOTLB help This enables support for restricted DMA pools which provide a level of DMA memory protection on systems with limited hardware protection -- cgit v1.2.3-71-gd317 From 49ca6153208f6efc409c1deb82dd5bcbb519d7e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Aug 2021 09:39:31 +0200 Subject: bpf: Relicense disassembler as GPL-2.0-only OR BSD-2-Clause Some time ago we dual-licensed both libbpf and bpftool through commits 1bc38b8ff6cc ("libbpf: relicense libbpf as LGPL-2.1 OR BSD-2-Clause") and 907b22365115 ("tools: bpftool: dual license all files"). The latter missed the disasm.{c,h} which we pull in via kernel/bpf/ such that we have a single source for verifier as well as bpftool asm dumping, see also f4ac7e0b5cc8 ("bpf: move instruction printing into a separate file"). It is currently GPL-2.0-only and missed the conversion in 907b22365115, therefore relicense the two as GPL-2.0-only OR BSD-2-Clause as well. Spotted-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Brendan Jackman Acked-by: Jakub Kicinski Acked-by: Jiri Olsa Acked-by: Simon Horman Acked-by: Martin KaFai Lau Acked-by: Xu Kuohai Acked-by: Edward Cree --- kernel/bpf/disasm.c | 2 +- kernel/bpf/disasm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ca3cd9aaa6ce..7b4afb7d96db 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e546b18d27da..a4b040793f44 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ -- cgit v1.2.3-71-gd317 From 15eb7c888e749fbd1cc0370f3d38de08ad903700 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 31 Aug 2021 08:38:19 +0200 Subject: locking/rwsem: Add missing __init_rwsem() for PREEMPT_RT 730633f0b7f95 became the first direct caller of __init_rwsem() vs the usual init_rwsem(), exposing PREEMPT_RT's lack thereof. Add it. [ tglx: Move it out of line ] Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/50a936b7d8f12277d6ec7ed2ef0421a381056909.camel@gmx.de --- include/linux/rwsem.h | 12 ++---------- kernel/locking/rwsem.c | 10 ++++++---- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 426e98e0b675..352c6127cb90 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -142,22 +142,14 @@ struct rw_semaphore { #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, +extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); -#else -static inline void __rwsem_init(struct rw_semaphore *rwsem, const char *name, - struct lock_class_key *key) -{ -} -#endif #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - init_rwbase_rt(&(sem)->rwbase); \ - __rwsem_init((sem), #sem, &__key); \ + __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9215b4d6a9de..000e8d5a2884 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1376,15 +1376,17 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #include "rwbase_rt.c" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __rwsem_init(struct rw_semaphore *sem, const char *name, +void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key) { + init_rwbase_rt(&(sem)->rwbase); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); -} -EXPORT_SYMBOL(__rwsem_init); #endif +} +EXPORT_SYMBOL(__init_rwsem); static inline void __down_read(struct rw_semaphore *sem) { -- cgit v1.2.3-71-gd317 From a974b54036f79dd5e395e9f6c80c3decb4661a14 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 18 Aug 2021 14:18:40 +0100 Subject: futex: Return error code instead of assigning it without effect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check on the rt_waiter and top_waiter->pi_state is assigning an error return code to ret but this later gets re-assigned, hence the check is ineffective. Return -EINVAL rather than assigning it to ret which was the original intent. Fixes: dc7109aaa233 ("futex: Validate waiter correctly in futex_proxy_trylock_atomic()") Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Reviewed-by: André Almeida Link: https://lore.kernel.org/r/20210818131840.34262-1-colin.king@canonical.com --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e7b4c6121da4..30e7daebaec8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2025,7 +2025,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * and waiting on the 'waitqueue' futex which is always !PI. */ if (!top_waiter->rt_waiter || top_waiter->pi_state) - ret = -EINVAL; + return -EINVAL; /* Ensure we requeue to the expected futex. */ if (!match_futex(top_waiter->requeue_pi_key, key2)) -- cgit v1.2.3-71-gd317 From 4f07ec0d76f242d4ca0f0c0c6f7293c28254a554 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:48 +0200 Subject: futex: Prevent inconsistent state and exit race The recent rework of the requeue PI code introduced a possibility for going back to user space in inconsistent state: CPU 0 CPU 1 requeue_futex() if (lock_pifutex_user()) { dequeue_waiter(); wake_waiter(task); sched_in(task); return_from_futex_syscall(); ---> Inconsistent state because PI state is not established It becomes worse if the woken up task immediately exits: sys_exit(); attach_pistate(vpid); <--- FAIL Attach the pi state before dequeuing and waking the waiter. If the waiter gets a spurious wakeup before the dequeue operation it will wait in futex_requeue_pi_wakeup_sync() and therefore cannot return and exit. Fixes: 07d91ef510fb ("futex: Prevent requeue_pi() lock nesting issue on RT") Reported-by: syzbot+4d1bd0725ef09168e1a0@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.558914045@linutronix.de --- kernel/futex.c | 98 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 30e7daebaec8..04164d440d98 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1454,8 +1454,23 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, newval |= FUTEX_WAITERS; ret = lock_pi_update_atomic(uaddr, uval, newval); - /* If the take over worked, return 1 */ - return ret < 0 ? ret : 1; + if (ret) + return ret; + + /* + * If the waiter bit was requested the caller also needs PI + * state attached to the new owner of the user space futex. + * + * @task is guaranteed to be alive and it cannot be exiting + * because it is either sleeping or waiting in + * futex_requeue_pi_wakeup_sync(). + */ + if (set_waiters) { + ret = attach_to_pi_owner(uaddr, newval, key, ps, + exiting); + WARN_ON(ret); + } + return 1; } /* @@ -2036,17 +2051,24 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, return -EAGAIN; /* - * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in - * the contended case or if set_waiters is 1. The pi_state is returned - * in ps in contended cases. + * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit + * in the contended case or if @set_waiters is true. + * + * In the contended case PI state is attached to the lock owner. If + * the user space lock can be acquired then PI state is attached to + * the new owner (@top_waiter->task) when @set_waiters is true. */ vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { - /* Dequeue, wake up and update top_waiter::requeue_state */ + /* + * Lock was acquired in user space and PI state was + * attached to @top_waiter->task. That means state is fully + * consistent and the waiter can return to user space + * immediately after the wakeup. + */ requeue_pi_wake_futex(top_waiter, key2, hb2); - return vpid; } else if (ret < 0) { /* Rewind top_waiter::requeue_state */ futex_requeue_pi_complete(top_waiter, ret); @@ -2208,19 +2230,26 @@ retry_private: &exiting, nr_requeue); /* - * At this point the top_waiter has either taken uaddr2 or is - * waiting on it. If the former, then the pi_state will not - * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, @ret contains the - * VPID of the top waiter task. - * If the lock was not taken, we have pi_state and an initial - * refcount on it. In case of an error we have nothing. + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. * * The top waiter's requeue_state is up to date: * - * - If the lock was acquired atomically (ret > 0), then + * - If the lock was acquired atomically (ret == 1), then * the state is Q_REQUEUE_PI_LOCKED. * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * * - If the trylock failed with an error (ret < 0) then * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing * happened", or Q_REQUEUE_PI_IGNORE when there was an @@ -2234,36 +2263,20 @@ retry_private: * the same sanity checks for requeue_pi as the loop * below does. */ - if (ret > 0) { - WARN_ON(pi_state); - task_count++; - /* - * If futex_proxy_trylock_atomic() acquired the - * user space futex, then the user space value - * @uaddr2 has been set to the @hb1's top waiter - * task VPID. This task is guaranteed to be alive - * and cannot be exiting because it is either - * sleeping or blocked on @hb2 lock. - * - * The @uaddr2 futex cannot have waiters either as - * otherwise futex_proxy_trylock_atomic() would not - * have succeeded. - * - * In order to requeue waiters to @hb2, pi state is - * required. Hand in the VPID value (@ret) and - * allocate PI state with an initial refcount on - * it. - */ - ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state, - &exiting); - WARN_ON(ret); - } - switch (ret) { case 0: /* We hold a reference on the pi state. */ break; + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + /* * If the above failed, then pi_state is NULL and * waiter::requeue_state is correct. @@ -2395,9 +2408,8 @@ retry_private: } /* - * We took an extra initial reference to the pi_state either in - * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need - * to drop it here again. + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. */ put_pi_state(pi_state); -- cgit v1.2.3-71-gd317 From 249955e51c8136189b3c66f54e212981a1350a0f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:50 +0200 Subject: futex: Clarify comment for requeue_pi_wake_futex() It's slightly confusing. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.618613025@linutronix.de --- kernel/futex.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 04164d440d98..82cd270f25a0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1954,12 +1954,26 @@ static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the - * target futex if it is uncontended or via a lock steal. Set the futex_q key - * to the requeue target futex so the waiter can detect the wakeup on the right - * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock - * to protect access to the pi_state to fixup the owner later. Must be called - * with both q->lock_ptr and hb->lock held. + * target futex if it is uncontended or via a lock steal. + * + * 1) Set @q::key to the requeue target futex key so the waiter can detect + * the wakeup on the right futex. + * + * 2) Dequeue @q from the hash bucket. + * + * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock + * acquisition. + * + * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that + * the waiter has to fixup the pi state. + * + * 5) Complete the requeue state so the waiter can make progress. After + * this point the waiter task can return from the syscall immediately in + * case that the pi state does not have to be fixed up. + * + * 6) Wake the waiter task. + * + * Must be called with both q->lock_ptr and hb->lock held. */ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, -- cgit v1.2.3-71-gd317 From 340576590dac4bb58d532a8ad5bfa806d8ab473c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:51 +0200 Subject: futex: Avoid redundant task lookup No need to do the full VPID based task lookup and validation of the top waiter when the user space futex was acquired on it's behalf during the requeue_pi operation. The task is known already and it cannot go away before requeue_pi_wake_futex() has been invoked. Split out the actual attach code from attach_pi_state_owner() and use that instead of the full blown variant. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.676104881@linutronix.de --- kernel/futex.c | 67 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 82cd270f25a0..a316dce74c3d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1263,6 +1263,36 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, return -ESRCH; } +static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, + struct futex_pi_state **ps) +{ + /* + * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. + */ + struct futex_pi_state *pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make @p + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = *key; + + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ + pi_state->owner = p; + + *ps = pi_state; +} /* * Lookup the task for the TID provided from user space and attach to * it after doing proper sanity checks. @@ -1272,7 +1302,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, struct task_struct **exiting) { pid_t pid = uval & FUTEX_TID_MASK; - struct futex_pi_state *pi_state; struct task_struct *p; /* @@ -1324,36 +1353,11 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, return ret; } - /* - * No existing pi state. First waiter. [2] - * - * This creates pi_state, we have hb->lock held, this means nothing can - * observe this state, wait_lock is irrelevant. - */ - pi_state = alloc_pi_state(); - - /* - * Initialize the pi_mutex in locked state and make @p - * the owner of it: - */ - rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); - - /* Store the key for possible exit cleanups: */ - pi_state->key = *key; - - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); - /* - * Assignment without holding pi_state->pi_mutex.wait_lock is safe - * because there is no concurrency as the object is not published yet. - */ - pi_state->owner = p; + __attach_to_pi_owner(p, key, ps); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); - *ps = pi_state; - return 0; } @@ -1464,11 +1468,14 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * @task is guaranteed to be alive and it cannot be exiting * because it is either sleeping or waiting in * futex_requeue_pi_wakeup_sync(). + * + * No need to do the full attach_to_pi_owner() exercise + * because @task is known and valid. */ if (set_waiters) { - ret = attach_to_pi_owner(uaddr, newval, key, ps, - exiting); - WARN_ON(ret); + raw_spin_lock_irq(&task->pi_lock); + __attach_to_pi_owner(task, key, ps); + raw_spin_unlock_irq(&task->pi_lock); } return 1; } -- cgit v1.2.3-71-gd317 From 35d7bdc86031a2c1ae05ac27dfa93b2acdcbaecc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:20:25 +0200 Subject: kernel/fork: factor out replacing the current MM exe_file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's factor the main logic out into replace_mm_exe_file(), such that all mm->exe_file logic is contained in kernel/fork.c. While at it, perform some simple cleanups that are possible now that we're simplifying the individual functions. Acked-by: Christian Brauner Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mm.h | 1 + kernel/fork.c | 44 +++++++++++++++++++++++++++++++++++++++++--- kernel/sys.c | 33 +-------------------------------- 3 files changed, 43 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..48c6fa9ab792 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2581,6 +2581,7 @@ extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 44f4c2d83763..f4ac883c4a1e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1148,9 +1148,7 @@ void mmput_async(struct mm_struct *mm) * * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve task is single - * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the - * mm->exe_file, but does so without using set_mm_exe_file() in order - * to avoid the need for any locks. + * threaded. */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { @@ -1170,6 +1168,46 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) fput(old_exe_file); } +/** + * replace_mm_exe_file - replace a reference to the mm's executable file + * + * This changes mm's executable file (shown as symlink /proc/[pid]/exe), + * dealing with concurrent invocation and without grabbing the mmap lock in + * write mode. + * + * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE). + */ +int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + struct vm_area_struct *vma; + struct file *old_exe_file; + int ret = 0; + + /* Forbid mm->exe_file change if old file still mapped. */ + old_exe_file = get_mm_exe_file(mm); + if (old_exe_file) { + mmap_read_lock(mm); + for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (path_equal(&vma->vm_file->f_path, + &old_exe_file->f_path)) + ret = -EBUSY; + } + mmap_read_unlock(mm); + fput(old_exe_file); + if (ret) + return ret; + } + + /* set the new file, lockless */ + get_file(new_exe_file); + old_exe_file = xchg(&mm->exe_file, new_exe_file); + if (old_exe_file) + fput(old_exe_file); + return 0; +} + /** * get_mm_exe_file - acquire a reference to the mm's executable file * diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..30c12e54585a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1846,7 +1846,6 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; - struct file *old_exe, *exe_file; struct inode *inode; int err; @@ -1869,40 +1868,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; - /* - * Forbid mm->exe_file change if old file still mapped. - */ - exe_file = get_mm_exe_file(mm); - err = -EBUSY; - if (exe_file) { - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_err; - } - - mmap_read_unlock(mm); - fput(exe_file); - } - - err = 0; - /* set the new file, lockless */ - get_file(exe.file); - old_exe = xchg(&mm->exe_file, exe.file); - if (old_exe) - fput(old_exe); + err = replace_mm_exe_file(mm, exe.file); exit: fdput(exe); return err; -exit_err: - mmap_read_unlock(mm); - fput(exe_file); - goto exit; } /* -- cgit v1.2.3-71-gd317 From fe69d560b5bd9ec77b5d5749bd7027344daef47e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:29:59 +0200 Subject: kernel/fork: always deny write access to current MM exe_file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to remove VM_DENYWRITE only currently only used when mapping the executable during exec. During exec, we already deny_write_access() the executable, however, after exec completes the VMAs mapped with VM_DENYWRITE effectively keeps write access denied via deny_write_access(). Let's deny write access when setting or replacing the MM exe_file. With this change, we can remove VM_DENYWRITE for mapping executables. Make set_mm_exe_file() return an error in case deny_write_access() fails; note that this should never happen, because exec code does a deny_write_access() early and keeps write access denied when calling set_mm_exe_file. However, it makes the code easier to read and makes set_mm_exe_file() and replace_mm_exe_file() look more similar. This represents a minor user space visible change: sys_prctl(PR_SET_MM_MAP/EXE_FILE) can now fail if the file is already opened writable. Also, after sys_prctl(PR_SET_MM_MAP/EXE_FILE) the file cannot be opened writable. Note that we can already fail with -EACCES if the file doesn't have execute permissions. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- fs/exec.c | 4 +++- include/linux/mm.h | 2 +- kernel/fork.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 48 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 38f63451b928..9294049f5487 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1270,7 +1270,9 @@ int begin_new_exec(struct linux_binprm * bprm) * not visibile until then. This also enables the update * to be lockless. */ - set_mm_exe_file(bprm->mm, bprm->file); + retval = set_mm_exe_file(bprm->mm, bprm->file); + if (retval) + goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); diff --git a/include/linux/mm.h b/include/linux/mm.h index 48c6fa9ab792..56b1cd41db61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2580,7 +2580,7 @@ static inline int check_data_rlimit(unsigned long rlim, extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index f4ac883c4a1e..7677f897ecb6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -470,6 +470,20 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +{ + struct file *exe_file; + + exe_file = get_mm_exe_file(oldmm); + RCU_INIT_POINTER(mm->exe_file, exe_file); + /* + * We depend on the oldmm having properly denied write access to the + * exe_file already. + */ + if (exe_file && deny_write_access(exe_file)) + pr_warn_once("deny_write_access() failed in %s\n", __func__); +} + #ifdef CONFIG_MMU static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) @@ -493,7 +507,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; @@ -639,7 +653,7 @@ static inline void mm_free_pgd(struct mm_struct *mm) static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { mmap_write_lock(oldmm); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + dup_mm_exe_file(mm, oldmm); mmap_write_unlock(oldmm); return 0; } @@ -1149,8 +1163,10 @@ void mmput_async(struct mm_struct *mm) * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve task is single * threaded. + * + * Can only fail if new_exe_file != NULL. */ -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct file *old_exe_file; @@ -1161,11 +1177,21 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) */ old_exe_file = rcu_dereference_raw(mm->exe_file); - if (new_exe_file) + if (new_exe_file) { + /* + * We expect the caller (i.e., sys_execve) to already denied + * write access, so this is unlikely to fail. + */ + if (unlikely(deny_write_access(new_exe_file))) + return -EACCES; get_file(new_exe_file); + } rcu_assign_pointer(mm->exe_file, new_exe_file); - if (old_exe_file) + if (old_exe_file) { + allow_write_access(old_exe_file); fput(old_exe_file); + } + return 0; } /** @@ -1201,10 +1227,22 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) } /* set the new file, lockless */ + ret = deny_write_access(new_exe_file); + if (ret) + return -EACCES; get_file(new_exe_file); + old_exe_file = xchg(&mm->exe_file, new_exe_file); - if (old_exe_file) + if (old_exe_file) { + /* + * Don't race with dup_mmap() getting the file and disallowing + * write access while someone might open the file writable. + */ + mmap_read_lock(mm); + allow_write_access(old_exe_file); fput(old_exe_file); + mmap_read_unlock(mm); + } return 0; } -- cgit v1.2.3-71-gd317 From 8d0920bde5eb8ec7e567939b85e65a0596c8580d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 22 Apr 2021 12:08:20 +0200 Subject: mm: remove VM_DENYWRITE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All in-tree users of MAP_DENYWRITE are gone. MAP_DENYWRITE cannot be set from user space, so all users are gone; let's remove it. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- fs/proc/task_mmu.c | 1 - include/linux/mm.h | 1 - include/linux/mman.h | 1 - include/trace/events/mmflags.h | 1 - kernel/events/core.c | 2 -- kernel/fork.c | 3 --- lib/test_printf.c | 5 ++--- mm/mmap.c | 27 +++------------------------ 8 files changed, 5 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index eb97468dfe4c..cf25be3e0321 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -619,7 +619,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", - [ilog2(VM_DENYWRITE)] = "dw", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", diff --git a/include/linux/mm.h b/include/linux/mm.h index 56b1cd41db61..257995f62e83 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,7 +281,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 diff --git a/include/linux/mman.h b/include/linux/mman.h index ebb09a964272..bd9aadda047b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -153,7 +153,6 @@ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | - _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | arch_calc_vm_flag_bits(flags); diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index f160484afc5c..0b53e855c4ac 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -165,7 +165,6 @@ IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison") {VM_UFFD_MISSING, "uffd_missing" }, \ IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ - {VM_DENYWRITE, "denywrite" }, \ {VM_UFFD_WP, "uffd_wp" }, \ {VM_LOCKED, "locked" }, \ {VM_IO, "io" }, \ diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cb1f9b8392e..19767bb9933c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8307,8 +8307,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) else flags = MAP_PRIVATE; - if (vma->vm_flags & VM_DENYWRITE) - flags |= MAP_DENYWRITE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) diff --git a/kernel/fork.c b/kernel/fork.c index 7677f897ecb6..feef1057081d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -570,12 +570,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); file = tmp->vm_file; if (file) { - struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); diff --git a/lib/test_printf.c b/lib/test_printf.c index 8ac71aee46af..8a48b61c3763 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -675,9 +675,8 @@ flags(void) "uptodate|dirty|lru|active|swapbacked", cmp_buffer); - flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC - | VM_DENYWRITE; - test("read|exec|mayread|maywrite|mayexec|denywrite", "%pGv", &flags); + flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + test("read|exec|mayread|maywrite|mayexec", "%pGv", &flags); gfp = GFP_TRANSHUGE; test("GFP_TRANSHUGE", "%pGg", &gfp); diff --git a/mm/mmap.c b/mm/mmap.c index ca54d36d203a..589dc1dc13db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -148,8 +148,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { - if (vma->vm_flags & VM_DENYWRITE) - allow_write_access(file); if (vma->vm_flags & VM_SHARED) mapping_unmap_writable(mapping); @@ -666,8 +664,6 @@ static void __vma_link_file(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; - if (vma->vm_flags & VM_DENYWRITE) - put_write_access(file_inode(file)); if (vma->vm_flags & VM_SHARED) mapping_allow_writable(mapping); @@ -1788,22 +1784,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma->vm_pgoff = pgoff; if (file) { - if (vm_flags & VM_DENYWRITE) { - error = deny_write_access(file); - if (error) - goto free_vma; - } if (vm_flags & VM_SHARED) { error = mapping_map_writable(file->f_mapping); if (error) - goto allow_write_and_free_vma; + goto free_vma; } - /* ->mmap() can change vma->vm_file, but must guarantee that - * vma_link() below can deny write-access if VM_DENYWRITE is set - * and map writably if VM_SHARED is set. This usually means the - * new file must not have been exposed to user-space, yet. - */ vma->vm_file = get_file(file); error = call_mmap(file, vma); if (error) @@ -1860,13 +1846,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (file) { unmap_writable: - if (vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); - } + if (file && vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); file = vma->vm_file; out: perf_event_mmap(vma); @@ -1906,9 +1888,6 @@ unmap_and_free_vma: charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); -allow_write_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); free_vma: vm_area_free(vma); unacct_error: -- cgit v1.2.3-71-gd317 From fab827dbee8c2e06ca4ba000fa6c48bcf9054aba Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:54:57 -0700 Subject: memcg: enable accounting for pids in nested pid namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5d097056c9a0 ("kmemcg: account certain kmem allocations to memcg") enabled memcg accounting for pids allocated from init_pid_ns.pid_cachep, but forgot to adjust the setting for nested pid namespaces. As a result, pid memory is not accounted exactly where it is really needed, inside memcg-limited containers with their own pid namespaces. Pid was one the first kernel objects enabled for memcg accounting. init_pid_ns.pid_cachep marked by SLAB_ACCOUNT and we can expect that any new pids in the system are memcg-accounted. Though recently I've noticed that it is wrong. nested pid namespaces creates own slab caches for pid objects, nested pids have increased size because contain id both for all parent and for own pid namespaces. The problem is that these slab caches are _NOT_ marked by SLAB_ACCOUNT, as a result any pids allocated in nested pid namespaces are not memcg-accounted. Pid struct in nested pid namespace consumes up to 500 bytes memory, 100000 such objects gives us up to ~50Mb unaccounted memory, this allow container to exceed assigned memcg limits. Link: https://lkml.kernel.org/r/8b6de616-fd1a-02c6-cbdb-976ecdcfa604@virtuozzo.com Fixes: 5d097056c9a0 ("kmemcg: account certain kmem allocations to memcg") Cc: stable@vger.kernel.org Signed-off-by: Vasily Averin Reviewed-by: Michal Koutný Reviewed-by: Shakeel Butt Acked-by: Christian Brauner Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca43239a255a..cb5a25a8a0cc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) - *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); + *pkc = kmem_cache_create(name, len, 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); -- cgit v1.2.3-71-gd317 From 30acd0bdfb86548172168a0cc71d455944de0683 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:27 -0700 Subject: memcg: enable accounting for new namesapces and struct nsproxy Container admin can create new namespaces and force kernel to allocate up to several pages of memory for the namespaces and its associated structures. Net and uts namespaces have enabled accounting for such allocations. It makes sense to account for rest ones to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/5525bcbf-533e-da27-79b7-158686c64e13@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Serge Hallyn Acked-by: Christian Brauner Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 4 ++-- kernel/user_namespace.c | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index e51b63ae233b..94a9817851cc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3307,7 +3307,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); diff --git a/ipc/namespace.c b/ipc/namespace.c index 7bd0766ddc3b..ae83f0f2651b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index f5e8828c109c..0d5c29879a50 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) struct cgroup_namespace *new_ns; int ret; - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index abc01fcad8c7..eec72ca962e2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -568,6 +568,6 @@ out: int __init nsproxy_cache_init(void) { - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); + nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT); return 0; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index cb5a25a8a0cc..a46a3723bc66 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -450,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = { static __init int pid_namespaces_init(void) { - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 12eab0d2ae28..aec832801c26 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL); + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ef82d401dde8..6b2e3ca7ee99 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1385,7 +1385,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { - user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); -- cgit v1.2.3-71-gd317 From 5f58c39819ff78ca5ddbba2b3cd8ff4779b19bb5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:35 -0700 Subject: memcg: enable accounting for signals When a user send a signal to any another processes it forces the kernel to allocate memory for 'struct sigqueue' objects. The number of signals is limited by RLIMIT_SIGPENDING resource limit, but even the default settings allow each user to consume up to several megabytes of memory. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/e34e958c-e785-712e-a62a-2c7b66c646c7@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..8921c4a8419f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4663,7 +4663,7 @@ void __init signals_init(void) { siginfo_buildtime_checks(); - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB -- cgit v1.2.3-71-gd317 From c509723ec27e925bb91a20682c448e95d4bc8c9f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:39 -0700 Subject: memcg: enable accounting for posix_timers_cache slab A program may create multiple interval timers using timer_create(). For each timer the kernel preallocates a "queued real-time signal", Consequently, the number of timers is limited by the RLIMIT_SIGPENDING resource limit. The allocated object is quite small, ~250 bytes, but even the default signal limits allow to consume up to 100 megabytes per user. It makes sense to account for them to limit the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/57795560-025c-267c-6b1a-dea852d95530@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Thomas Gleixner Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/posix-timers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..7363f81dc31a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); -- cgit v1.2.3-71-gd317 From 65d759c8f9f57b96c199f3fe5cfb93ac7da095e9 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Thu, 2 Sep 2021 14:59:59 -0700 Subject: mm: compaction: support triggering of proactive compaction by user The proactive compaction[1] gets triggered for every 500msec and run compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9) pages based on the value set to sysctl.compaction_proactiveness. Triggering the compaction for every 500msec in search of COMPACTION_HPAGE_ORDER pages is not needed for all applications, especially on the embedded system usecases which may have few MB's of RAM. Enabling the proactive compaction in its state will endup in running almost always on such systems. Other side, proactive compaction can still be very much useful for getting a set of higher order pages in some controllable manner(controlled by using the sysctl.compaction_proactiveness). So, on systems where enabling the proactive compaction always may proove not required, can trigger the same from user space on write to its sysctl interface. As an example, say app launcher decide to launch the memory heavy application which can be launched fast if it gets more higher order pages thus launcher can prepare the system in advance by triggering the proactive compaction from userspace. This triggering of proactive compaction is done on a write to sysctl.compaction_proactiveness by user. [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a [akpm@linux-foundation.org: tweak vm.rst, per Mike] Link: https://lkml.kernel.org/r/1627653207-12317-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: Rafael Aquini Cc: Mike Rapoport Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Mel Gorman Cc: Nitin Gupta Cc: Jonathan Corbet Cc: Khalid Aziz Cc: David Rientjes Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 3 ++- include/linux/compaction.h | 2 ++ include/linux/mmzone.h | 1 + kernel/sysctl.c | 2 +- mm/compaction.c | 38 +++++++++++++++++++++++++++++++-- 5 files changed, 42 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 003d5cc3751b..5e795202111f 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -118,7 +118,8 @@ compaction_proactiveness This tunable takes a value in the range [0, 100] with a default value of 20. This tunable determines how aggressively compaction is done in the -background. Setting it to 0 disables proactive compaction. +background. Write of a non zero value to this tunable will immediately +trigger the proactive compaction. Setting it to 0 disables proactive compaction. Note that compaction has a non-trivial system-wide impact as pages belonging to different processes are moved around, which could also lead diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c24098c7acca..34bce35c808d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order) extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 59bad25ce78e..1bd5f5955f9a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -846,6 +846,7 @@ typedef struct pglist_data { enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; + bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..297f0b3966bd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2871,7 +2871,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_compaction_proactiveness, .maxlen = sizeof(sysctl_compaction_proactiveness), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, diff --git a/mm/compaction.c b/mm/compaction.c index 4ee0d40d93f2..fa9b2b598eab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2706,6 +2706,30 @@ static void compact_nodes(void) */ unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc, nid; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write && sysctl_compaction_proactiveness) { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->proactive_compact_trigger) + continue; + + pgdat->proactive_compact_trigger = true; + wake_up_interruptible(&pgdat->kcompactd_wait); + } + } + + return 0; +} + /* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory @@ -2750,7 +2774,8 @@ void compaction_unregister_node(struct node *node) static inline bool kcompactd_work_requested(pg_data_t *pgdat) { - return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); + return pgdat->kcompactd_max_order > 0 || kthread_should_stop() || + pgdat->proactive_compact_trigger; } static bool kcompactd_node_suitable(pg_data_t *pgdat) @@ -2901,9 +2926,16 @@ static int kcompactd(void *p) while (!kthread_should_stop()) { unsigned long pflags; + /* + * Avoid the unnecessary wakeup for proactive compaction + * when it is disabled. + */ + if (!sysctl_compaction_proactiveness) + timeout = MAX_SCHEDULE_TIMEOUT; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); if (wait_event_freezable_timeout(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat), timeout)) { + kcompactd_work_requested(pgdat), timeout) && + !pgdat->proactive_compact_trigger) { psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); @@ -2938,6 +2970,8 @@ static int kcompactd(void *p) timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT; } + if (unlikely(pgdat->proactive_compact_trigger)) + pgdat->proactive_compact_trigger = false; } return 0; -- cgit v1.2.3-71-gd317 From dce49103962840dd61423d7627748d6c558d58c5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 15:00:33 -0700 Subject: mm: wire up syscall process_mrelease Split off from prev patch in the series that implements the syscall. Link: https://lkml.kernel.org/r/20210809185259.405936-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 2 ++ arch/arm/tools/syscall.tbl | 2 ++ arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 2 ++ arch/m68k/kernel/syscalls/syscall.tbl | 2 ++ arch/microblaze/kernel/syscalls/syscall.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n32.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n64.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_o32.tbl | 2 ++ arch/parisc/kernel/syscalls/syscall.tbl | 2 ++ arch/powerpc/kernel/syscalls/syscall.tbl | 2 ++ arch/s390/kernel/syscalls/syscall.tbl | 2 ++ arch/sh/kernel/syscalls/syscall.tbl | 2 ++ arch/sparc/kernel/syscalls/syscall.tbl | 2 ++ arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 2 ++ include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 1 + 21 files changed, 38 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a17687ed4b51..605645eae04c 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -486,3 +486,5 @@ 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self +# 557 reserved for memfd_secret +558 common process_mrelease sys_process_mrelease diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c5df1179fc5d..2f32eb8beca8 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -460,3 +460,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 727bfc3be99b..3cb206aea3db 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 447 +#define __NR_compat_syscalls 449 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 99ffcafc736c..0f49cdb180dd 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -901,6 +901,8 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 6d07742c57b8..9bf45f2be966 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -367,3 +367,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 541bc1b3a8f9..f1f98ee6c82d 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -446,3 +446,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a176faca2927..da49ddd4bb54 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -452,3 +452,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index c2d2e19abea8..56c8d3cf42ed 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -385,3 +385,5 @@ 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n32 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index ac653d08b1ea..1ca7bc337932 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -361,3 +361,5 @@ 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n64 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 253f2cd70b6b..fd3a9df60ec2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -434,3 +434,5 @@ 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 o32 process_mrelease sys_process_mrelease diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index e26187b9ab87..040df1b7a589 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -444,3 +444,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index aef2a290e71a..d8ebd7d37c0f 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -526,3 +526,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 64d51ab5a8b4..57233ace30cb 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease sys_process_mrelease diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index e0a70be77d84..2f6e95eb4690 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 603f5a821502..42fc2906215d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -492,3 +492,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ce763a12311c..661a03bcfbd1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -452,3 +452,4 @@ 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret +448 i386 process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f6b57799c1ea..807b6a1de8e8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -369,6 +369,7 @@ 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 235d67d6ceb4..f4384951f393 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -417,3 +417,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..00bc170a50f0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index a9d6fcd95f42..14c8fe863c6d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -877,9 +877,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #define __NR_memfd_secret 447 __SYSCALL(__NR_memfd_secret, sys_memfd_secret) #endif +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) #undef __NR_syscalls -#define __NR_syscalls 448 +#define __NR_syscalls 449 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a9..18a9c2cde767 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -289,6 +289,7 @@ COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); +COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); -- cgit v1.2.3-71-gd317 From d66e3edee7af87fe212df611ab9846b987a5070f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Sep 2021 22:47:06 +0200 Subject: futex: Remove unused variable 'vpid' in futex_proxy_trylock_atomic() The recent bug fix left the variable 'vpid' and an assignment to it around, but the variable is otherwise unused. clang dose not complain even with W=1, but gcc exposed this. Fixes: 4f07ec0d76f2 ("futex: Prevent inconsistent state and exit race") Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a316dce74c3d..c15ad276fd15 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2034,7 +2034,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, { struct futex_q *top_waiter = NULL; u32 curval; - int ret, vpid; + int ret; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -2079,7 +2079,6 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * the user space lock can be acquired then PI state is attached to * the new owner (@top_waiter->task) when @set_waiters is true. */ - vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { -- cgit v1.2.3-71-gd317 From 54357f0c9149c871e5e4b83ad385a6f2ad3a749f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Aug 2021 15:26:25 +0200 Subject: tracing: Add migrate-disabled counter to tracing output. migrate_disable() forbids task migration to another CPU. It is available since v5.11 and has already users such as highmem or BPF. It is useful to observe this task state in tracing which already has other states like the preemption counter. Instead of adding the migrate disable counter as a new entry to struct trace_entry, which would extend the whole struct by four bytes, it is squashed into the preempt-disable counter. The lower four bits represent the preemption counter, the upper four bits represent the migrate disable counter. Both counter shouldn't exceed 15 but if they do, there is a safety net which caps the value at 15. Add the migrate-disable counter to the trace entry so it shows up in the trace. Due to the users mentioned above, it is already possible to observe it: | bash-1108 [000] ...21 73.950578: rss_stat: mm_id=2213312838 curr=0 type=MM_ANONPAGES size=8192B | bash-1108 [000] d..31 73.951222: irq_disable: caller=flush_tlb_mm_range+0x115/0x130 parent=ptep_clear_flush+0x42/0x50 | bash-1108 [000] d..31 73.951222: tlb_flush: pages:1 reason:local mm shootdown (3) The last value is the migrate-disable counter. Things that popped up: - trace_print_lat_context() does not print the migrate counter. Not sure if it should. It is used in "verbose" mode and uses 8 digits and I'm not sure ther is something processing the value. - trace_define_common_fields() now defines a different variable. This probably breaks things. No ide what to do in order to preserve the old behaviour. Since this is used as a filter it should be split somehow to be able to match both nibbles here. Link: https://lkml.kernel.org/r/20210810132625.ylssabmsrkygokuv@linutronix.de Signed-off-by: Thomas Gleixner [bigeasy: patch description.] Signed-off-by: Sebastian Andrzej Siewior [ SDR: Removed change to common_preempt_count field name ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 26 +++++++++++++++++++------- kernel/trace/trace_events.c | 1 + kernel/trace/trace_output.c | 11 ++++++++--- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 489924cde4f8..b554e18924f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2603,6 +2603,15 @@ enum print_line_t trace_handle_return(struct trace_seq *s) } EXPORT_SYMBOL_GPL(trace_handle_return); +static unsigned short migration_disable_value(void) +{ +#if defined(CONFIG_SMP) + return current->migration_disabled; +#else + return 0; +#endif +} + unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) { unsigned int trace_flags = irqs_status; @@ -2621,7 +2630,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; - return (trace_flags << 16) | (pc & 0xff); + return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } struct ring_buffer_event * @@ -4189,9 +4199,10 @@ static void print_lat_help_header(struct seq_file *m) "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" - "# |||| / delay \n" - "# cmd pid ||||| time | caller \n" - "# \\ / ||||| \\ | / \n"); + "# |||| / _-=> migrate-disable \n" + "# ||||| / delay \n" + "# cmd pid |||||| time | caller \n" + "# \\ / |||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -4229,9 +4240,10 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); + seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); } void diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1349b6de5eeb..830b3b9940f4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -181,6 +181,7 @@ static int trace_define_common_fields(void) __common_field(unsigned short, type); __common_field(unsigned char, flags); + /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a0bf446bb034..c2ca40e8595b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -492,8 +492,13 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) trace_seq_printf(s, "%c%c%c", irqs_off, need_resched, hardsoft_irq); - if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); + if (entry->preempt_count & 0xf) + trace_seq_printf(s, "%x", entry->preempt_count & 0xf); + else + trace_seq_putc(s, '.'); + + if (entry->preempt_count & 0xf0) + trace_seq_printf(s, "%x", entry->preempt_count >> 4); else trace_seq_putc(s, '.'); @@ -656,7 +661,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_seq_printf( s, "%16s %7d %3d %d %08x %08lx ", comm, entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx); + entry->preempt_count & 0xf, iter->idx); } else { lat_print_generic(s, entry, iter->cpu); } -- cgit v1.2.3-71-gd317 From a61cb6017df0a9be072a35259e6e9ae7aa0ef6b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Sep 2021 14:40:00 +0200 Subject: dma-mapping: fix the kerneldoc for dma_map_sg_attrs Add the missing description for the nents parameter, and fix a trivial misalignment. Fixes: fffe3cc8c219 ("dma-mapping: allow map_sg() ops to return negative error codes") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 7ee5284bff58..06fec5547e7c 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -206,7 +206,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, /** * dma_map_sg_attrs - Map the given buffer for DMA * @dev: The device for which to perform the DMA operation - * @sg: The sg_table object describing the buffer + * @sg: The sg_table object describing the buffer + * @nents: Number of entries to map * @dir: DMA direction * @attrs: Optional DMA attributes for the map operation * -- cgit v1.2.3-71-gd317 From f8416aa29185468e0d914ba4b2a330fd53ee263f Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Mon, 6 Sep 2021 19:23:02 +0800 Subject: kernel: debug: Convert to SPDX identifier use SPDX-License-Identifier instead of a verbose license text Signed-off-by: Cai Huoqing Link: https://lore.kernel.org/r/20210906112302.937-1-caihuoqing@baidu.com Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 5 +---- kernel/debug/gdbstub.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b4aa6bb6b2bd..da06a5553835 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel Debug Core * @@ -22,10 +23,6 @@ * * Original KGDB stub: David Grothe , * Tigran Aivazian - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. */ #define pr_fmt(fmt) "KGDB: " fmt diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index b6f28fad4307..9d34d2364b5a 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel Debug Core * @@ -22,10 +23,6 @@ * * Original KGDB stub: David Grothe , * Tigran Aivazian - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. */ #include -- cgit v1.2.3-71-gd317 From 5615e088b43d564aec08ccfaecb21ba484a1b4d6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Aug 2021 14:51:50 +0300 Subject: tracing: Fix some alloc_event_probe() error handling bugs There are two bugs in this code. First, if the kzalloc() fails it leads to a NULL dereference of "ep" on the next line. Second, if the alloc_event_probe() function returns an error then it leads to an error pointer dereference in the caller. Link: https://lkml.kernel.org/r/20210824115150.GI31143@kili Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_eprobe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 56a96e9750cf..3044b762cbd7 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -151,7 +151,7 @@ static struct trace_eprobe *alloc_event_probe(const char *group, ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); if (!ep) { - trace_event_put_ref(ep->event); + trace_event_put_ref(event); goto error; } ep->event = event; @@ -851,7 +851,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) ret = PTR_ERR(ep); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto error; /* We know ep is not allocated */ + ep = NULL; + goto error; } argc -= 2; argv += 2; -- cgit v1.2.3-71-gd317 From 3c91dda97eea704ac257ddb138d1154adab8db62 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 7 Sep 2021 19:58:18 -0700 Subject: kernel/acct.c: use dedicated helper to access rlimit values Use rlimit() helper instead of manually writing whole chain from task to rlimit value. See patch "posix-cpu-timers: Use dedicated helper to access rlimit values". Link: https://lkml.kernel.org/r/20210728030822.524789-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Randy Dunlap Cc: sh_def@163.com Cc: Yang Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a64102be2bb0..23a7ab8e6cbc 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -478,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) /* * Accounting records are not subject to resource limits. */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + flim = rlimit(RLIMIT_FSIZE); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ orig_cred = override_creds(file->f_cred); -- cgit v1.2.3-71-gd317 From 2d186afd04d669fe9c48b994c41a7405a3c9f16d Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 7 Sep 2021 19:58:21 -0700 Subject: profiling: fix shift-out-of-bounds bugs Syzbot reported shift-out-of-bounds bug in profile_init(). The problem was in incorrect prof_shift. Since prof_shift value comes from userspace we need to clamp this value into [0, BITS_PER_LONG -1] boundaries. Second possible shiht-out-of-bounds was found by Tetsuo: sample_step local variable in read_profile() had "unsigned int" type, but prof_shift allows to make a BITS_PER_LONG shift. So, to prevent possible shiht-out-of-bounds sample_step type was changed to "unsigned long". Also, "unsigned short int" will be sufficient for storing [0, BITS_PER_LONG] value, that's why there is no need for "unsigned long" prof_shift. Link: https://lkml.kernel.org/r/20210813140022.5011-1-paskripkin@gmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+e68c89a9510c159d9684@syzkaller.appspotmail.com Suggested-by: Tetsuo Handa Signed-off-by: Pavel Skripkin Cc: Thomas Gleixner Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index c2ebddb5e974..eb9c7f0f5ac5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -41,7 +41,8 @@ struct profile_hit { #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) static atomic_t *prof_buffer; -static unsigned long prof_len, prof_shift; +static unsigned long prof_len; +static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); @@ -67,8 +68,8 @@ int profile_setup(char *str) if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel sleep profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel sleep profiling enabled (shift: %u)\n", prof_shift); #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); @@ -78,21 +79,21 @@ int profile_setup(char *str) if (str[strlen(schedstr)] == ',') str += strlen(schedstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel schedule profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel schedule profiling enabled (shift: %u)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; if (str[strlen(kvmstr)] == ',') str += strlen(kvmstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel KVM profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel KVM profiling enabled (shift: %u)\n", prof_shift); } else if (get_option(&str, &par)) { - prof_shift = par; + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; - pr_info("kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } return 1; @@ -468,7 +469,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long p = *ppos; ssize_t read; char *pnt; - unsigned int sample_step = 1 << prof_shift; + unsigned long sample_step = 1UL << prof_shift; profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) -- cgit v1.2.3-71-gd317 From 1e1c15839df084f4011825fee922aa976c9159dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Sep 2021 20:00:00 -0700 Subject: fs/epoll: use a per-cpu counter for user's watches count This counter tracks the number of watches a user has, to compare against the 'max_user_watches' limit. This causes a scalability bottleneck on SPECjbb2015 on large systems as there is only one user. Changing to a per-cpu counter increases throughput of the benchmark by about 30% on a 16-socket, > 1000 thread system. [rdunlap@infradead.org: fix build errors in kernel/user.c when CONFIG_EPOLL=n] [npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message] Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none [akpm@linux-foundation.org: tweak user_epoll_alloc(), per Guenter] Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reported-by: Anton Blanchard Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 18 ++++++++++-------- include/linux/sched/user.h | 3 ++- kernel/user.c | 25 +++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e596e1d0bba..648ed77f4164 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) */ call_rcu(&epi->rcu, epi_rcu_free); - atomic_long_dec(&ep->user->epoll_watches); + percpu_counter_dec(&ep->user->epoll_watches); return 0; } @@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, { int error, pwake = 0; __poll_t revents; - long user_watches; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; @@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, lockdep_assert_irqs_enabled(); - user_watches = atomic_long_read(&ep->user->epoll_watches); - if (unlikely(user_watches >= max_user_watches)) + if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, + max_user_watches) >= 0)) return -ENOSPC; - if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) + percpu_counter_inc(&ep->user->epoll_watches); + + if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { + percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; + } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); @@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { - kmem_cache_free(epi_cache, epi); if (tep) mutex_unlock(&tep->mtx); + kmem_cache_free(epi_cache, epi); + percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); - atomic_long_inc(&ep->user->epoll_watches); - /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 2462f7d07695..00ed419dd464 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,7 +14,7 @@ struct user_struct { refcount_t __count; /* reference count */ #ifdef CONFIG_EPOLL - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ + struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */ #endif unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ diff --git a/kernel/user.c b/kernel/user.c index c82399c1618a..e2cf8c22b539 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) return NULL; } +static int user_epoll_alloc(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); +#else + return 0; +#endif +} + +static void user_epoll_free(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + percpu_counter_destroy(&up->epoll_watches); +#endif +} + /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. @@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + user_epoll_free(up); kmem_cache_free(uid_cachep, up); } @@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; refcount_set(&new->__count, 1); + if (user_epoll_alloc(new)) { + kmem_cache_free(uid_cachep, new); + return NULL; + } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); @@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -216,6 +238,9 @@ static int __init uid_cache_init(void) for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); + if (user_epoll_alloc(&root_user)) + panic("root_user epoll percpu counter alloc failed"); + /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); -- cgit v1.2.3-71-gd317 From 05da8113c9ba63a8913e6c73dc06ed01cae55f68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2021 20:00:35 -0700 Subject: kernel/fork.c: unexport get_{mm,task}_exe_file Only used by core code and the tomoyo which can't be a module either. Link: https://lkml.kernel.org/r/20210820095430.445242-1-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 44f4c2d83763..df7296474ecd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1187,7 +1187,6 @@ struct file *get_mm_exe_file(struct mm_struct *mm) rcu_read_unlock(); return exe_file; } -EXPORT_SYMBOL(get_mm_exe_file); /** * get_task_exe_file - acquire a reference to the task's executable file @@ -1210,7 +1209,6 @@ struct file *get_task_exe_file(struct task_struct *task) task_unlock(task); return exe_file; } -EXPORT_SYMBOL(get_task_exe_file); /** * get_task_mm - acquire a reference to the task's mm -- cgit v1.2.3-71-gd317 From e1fbbd073137a9d63279f6bf363151a938347640 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 7 Sep 2021 20:00:41 -0700 Subject: prctl: allow to setup brk for et_dyn executables Keno Fischer reported that when a binray loaded via ld-linux-x the prctl(PR_SET_MM_MAP) doesn't allow to setup brk value because it lays before mm:end_data. For example a test program shows | # ~/t | | start_code 401000 | end_code 401a15 | start_stack 7ffce4577dd0 | start_data 403e10 | end_data 40408c | start_brk b5b000 | sbrk(0) b5b000 and when executed via ld-linux | # /lib64/ld-linux-x86-64.so.2 ~/t | | start_code 7fc25b0a4000 | end_code 7fc25b0c4524 | start_stack 7fffcc6b2400 | start_data 7fc25b0ce4c0 | end_data 7fc25b0cff98 | start_brk 55555710c000 | sbrk(0) 55555710c000 This of course prevent criu from restoring such programs. Looking into how kernel operates with brk/start_brk inside brk() syscall I don't see any problem if we allow to setup brk/start_brk without checking for end_data. Even if someone pass some weird address here on a purpose then the worst possible result will be an unexpected unmapping of existing vma (own vma, since prctl works with the callers memory) but test for RLIMIT_DATA is still valid and a user won't be able to gain more memory in case of expanding VMAs via new values shipped with prctl call. Link: https://lkml.kernel.org/r/20210121221207.GB2174@grain Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Signed-off-by: Cyrill Gorcunov Reported-by: Keno Fischer Acked-by: Andrey Vagin Tested-by: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Kirill Tkhai Cc: Eric W. Biederman Cc: Pavel Tikhomirov Cc: Alexander Mikhalitsyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..6ec50924b517 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1959,13 +1959,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) error = -EINVAL; - /* - * @brk should be after @end_data in traditional maps. - */ - if (prctl_map->start_brk <= prctl_map->end_data || - prctl_map->brk <= prctl_map->end_data) - goto out; - /* * Neither we should allow to override limits if they set. */ -- cgit v1.2.3-71-gd317 From 4b6b08f2e45edda4c067ac40833e3c1f84383c0b Mon Sep 17 00:00:00 2001 From: "Qiang.Zhang" Date: Tue, 31 Aug 2021 10:29:19 +0800 Subject: tracing/osnoise: Fix missed cpus_read_unlock() in start_per_cpu_kthreads() When start_kthread() return error, the cpus_read_unlock() need to be called. Link: https://lkml.kernel.org/r/20210831022919.27630-1-qiang.zhang@windriver.com Cc: Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Acked-by: Daniel Bristot de Oliveira Signed-off-by: Qiang.Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 65b08b8e5bf8..ce053619f289 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1548,7 +1548,7 @@ static int start_kthread(unsigned int cpu) static int start_per_cpu_kthreads(struct trace_array *tr) { struct cpumask *current_mask = &save_cpumask; - int retval; + int retval = 0; int cpu; cpus_read_lock(); @@ -1568,13 +1568,13 @@ static int start_per_cpu_kthreads(struct trace_array *tr) retval = start_kthread(cpu); if (retval) { stop_per_cpu_kthreads(); - return retval; + break; } } cpus_read_unlock(); - return 0; + return retval; } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3-71-gd317 From 0be083cee42ec78ba6f9148f5b12a00aa2fb8bd3 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 1 Sep 2021 16:55:13 +0300 Subject: tracing: synth events: increase max fields count Sometimes it is useful to construct larger synthetic trace events. Increase 'SYNTH_FIELDS_MAX' (maximum number of fields in a synthetic event) from 32 to 64. Link: https://lkml.kernel.org/r/20210901135513.3087062-1-dedekind1@gmail.com Signed-off-by: Artem Bityutskiy Acked-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_synth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index 4007fe95cf42..b29595fe3ac5 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -5,7 +5,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 32 +#define SYNTH_FIELDS_MAX 64 #define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */ -- cgit v1.2.3-71-gd317 From c910db943d35d4ac4b77570ece76e0799af24233 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 2 Sep 2021 15:57:12 -0500 Subject: tracing: Dynamically allocate the per-elt hist_elt_data array Setting the hist_elt_data.field_var_str[] array unconditionally to a size of SYNTH_FIELD_MAX elements wastes space unnecessarily. The actual number of elements needed can be calculated at run-time instead. In most cases, this will save a lot of space since it's a per-elt array which isn't normally close to being full. It also allows us to increase SYNTH_FIELD_MAX without worrying about even more wastage when we do that. Link: https://lkml.kernel.org/r/d52ae0ad5e1b59af7c4f54faf3fc098461fd82b3.camel@kernel.org Signed-off-by: Tom Zanussi Tested-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9d91b1c06957..a6061a69aa84 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -508,7 +508,8 @@ struct track_data { struct hist_elt_data { char *comm; u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; + char **field_var_str; + int n_field_var_str; }; struct snapshot_context { @@ -1401,9 +1402,11 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data) { unsigned int i; - for (i = 0; i < SYNTH_FIELDS_MAX; i++) + for (i = 0; i < elt_data->n_field_var_str; i++) kfree(elt_data->field_var_str[i]); + kfree(elt_data->field_var_str); + kfree(elt_data->comm); kfree(elt_data); } @@ -1451,6 +1454,13 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) size = STR_VAR_LEN_MAX; + elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL); + if (!elt_data->field_var_str) { + hist_elt_data_free(elt_data); + return -EINVAL; + } + elt_data->n_field_var_str = n_str; + for (i = 0; i < n_str; i++) { elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL); if (!elt_data->field_var_str[i]) { -- cgit v1.2.3-71-gd317 From cfd799837dbc48499abb05d1891b3d9992354d3a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 04:38:03 +0900 Subject: tracing/boot: Fix to loop on only subkeys Since the commit e5efaeb8a8f5 ("bootconfig: Support mixing a value and subkeys under a key") allows to co-exist a value node and key nodes under a node, xbc_node_for_each_child() is not only returning key node but also a value node. In the boot-time tracing using xbc_node_for_each_child() to iterate the events, groups and instances, but those must be key nodes. Thus it must use xbc_node_for_each_subkey(). Link: https://lkml.kernel.org/r/163112988361.74896.2267026262061819145.stgit@devnote2 Fixes: e5efaeb8a8f5 ("bootconfig: Support mixing a value and subkeys under a key") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1060b0446032..388e65d05978 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -522,14 +522,14 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) if (!node) return; /* per-event key starts with "event.GROUP.EVENT" */ - xbc_node_for_each_child(node, gnode) { + xbc_node_for_each_subkey(node, gnode) { data = xbc_node_get_data(gnode); if (!strcmp(data, "enable")) { enable_all = true; continue; } enable = false; - xbc_node_for_each_child(gnode, enode) { + xbc_node_for_each_subkey(gnode, enode) { data = xbc_node_get_data(enode); if (!strcmp(data, "enable")) { enable = true; @@ -625,7 +625,7 @@ trace_boot_init_instances(struct xbc_node *node) if (!node) return; - xbc_node_for_each_child(node, inode) { + xbc_node_for_each_subkey(node, inode) { p = xbc_node_get_data(inode); if (!p || *p == '\0') continue; -- cgit v1.2.3-71-gd317 From 4b692e861619353ce069e547a67c8d0e32d9ef3d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:10 -0700 Subject: kexec: move locking into do_kexec_load Patch series "compat: remove compat_alloc_user_space", v5. Going through compat_alloc_user_space() to convert indirect system call arguments tends to add complexity compared to handling the native and compat logic in the same code. This patch (of 6): The locking is the same between the native and compat version of sys_kexec_load(), so it can be done in the common implementation to reduce duplication. Link: https://lkml.kernel.org/r/20210727144859.4150043-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20210727144859.4150043-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Co-developed-by: Eric Biederman Co-developed-by: Christoph Hellwig Acked-by: "Eric W. Biederman" Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Al Viro Cc: Feng Tang Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c82c6c06f051..9c7aef8f4bb6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -110,6 +110,17 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, unsigned long i; int ret; + /* + * Because we write directly to the reserved memory region when loading + * crash kernels we need a mutex here to prevent multiple crash kernels + * from attempting to load simultaneously, and to prevent a crash kernel + * from loading over the top of a in use crash kernel. + * + * KISS: always take the mutex. + */ + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) @@ -121,7 +132,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (nr_segments == 0) { /* Uninstall image */ kimage_free(xchg(dest_image, NULL)); - return 0; + ret = 0; + goto out_unlock; } if (flags & KEXEC_ON_CRASH) { /* @@ -134,7 +146,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); if (ret) - return ret; + goto out_unlock; if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; @@ -171,6 +183,8 @@ out: arch_kexec_protect_crashkres(); kimage_free(image); +out_unlock: + mutex_unlock(&kexec_mutex); return ret; } @@ -247,21 +261,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. - */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - result = do_kexec_load(entry, nr_segments, segments, flags); - mutex_unlock(&kexec_mutex); - return result; } @@ -301,21 +302,8 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EFAULT; } - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. - */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - result = do_kexec_load(entry, nr_segments, ksegments, flags); - mutex_unlock(&kexec_mutex); - return result; } #endif -- cgit v1.2.3-71-gd317 From 5d700a0fd71ded7096b97f01d276efc1a6579613 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:13 -0700 Subject: kexec: avoid compat_alloc_user_space kimage_alloc_init() expects a __user pointer, so compat_sys_kexec_load() uses compat_alloc_user_space() to convert the layout and put it back onto the user space caller stack. Moving the user space access into the syscall handler directly actually makes the code simpler, as the conversion for compat mode can now be done on kernel memory. Link: https://lkml.kernel.org/r/20210727144859.4150043-3-arnd@kernel.org Link: https://lore.kernel.org/lkml/YPbtsU4GX6PL7%2F42@infradead.org/ Link: https://lore.kernel.org/lkml/m1y2cbzmnw.fsf@fess.ebiederm.org/ Signed-off-by: Arnd Bergmann Co-developed-by: Eric Biederman Co-developed-by: Christoph Hellwig Acked-by: "Eric W. Biederman" Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 61 ++++++++++++++++++++++++---------------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c7aef8f4bb6..b5e40f069768 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -19,26 +19,9 @@ #include "kexec_internal.h" -static int copy_user_segment_list(struct kimage *image, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int ret; - size_t segment_bytes; - - /* Read in the segments */ - image->nr_segments = nr_segments; - segment_bytes = nr_segments * sizeof(*segments); - ret = copy_from_user(image->segment, segments, segment_bytes); - if (ret) - ret = -EFAULT; - - return ret; -} - static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, + struct kexec_segment *segments, unsigned long flags) { int ret; @@ -58,10 +41,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, return -ENOMEM; image->start = entry; - - ret = copy_user_segment_list(image, nr_segments, segments); - if (ret) - goto out_free_image; + image->nr_segments = nr_segments; + memcpy(image->segment, segments, nr_segments * sizeof(*segments)); if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ @@ -104,7 +85,7 @@ out_free_image: } static int do_kexec_load(unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, unsigned long flags) + struct kexec_segment *segments, unsigned long flags) { struct kimage **dest_image, *image; unsigned long i; @@ -250,7 +231,8 @@ static inline int kexec_load_check(unsigned long nr_segments, SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - int result; + struct kexec_segment *ksegments; + unsigned long result; result = kexec_load_check(nr_segments, flags); if (result) @@ -261,7 +243,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - result = do_kexec_load(entry, nr_segments, segments, flags); + ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0])); + if (IS_ERR(ksegments)) + return PTR_ERR(ksegments); + + result = do_kexec_load(entry, nr_segments, ksegments, flags); + kfree(ksegments); return result; } @@ -273,7 +260,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, flags) { struct compat_kexec_segment in; - struct kexec_segment out, __user *ksegments; + struct kexec_segment *ksegments; unsigned long i, result; result = kexec_load_check(nr_segments, flags); @@ -286,24 +273,26 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); + ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]), + GFP_KERNEL); + if (!ksegments) + return -ENOMEM; + for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) - return -EFAULT; + goto fail; - out.buf = compat_ptr(in.buf); - out.bufsz = in.bufsz; - out.mem = in.mem; - out.memsz = in.memsz; - - result = copy_to_user(&ksegments[i], &out, sizeof(out)); - if (result) - return -EFAULT; + ksegments[i].buf = compat_ptr(in.buf); + ksegments[i].bufsz = in.bufsz; + ksegments[i].mem = in.mem; + ksegments[i].memsz = in.memsz; } result = do_kexec_load(entry, nr_segments, ksegments, flags); +fail: + kfree(ksegments); return result; } #endif -- cgit v1.2.3-71-gd317 From 59ab844eed9c6b01d32dcb27b57accc23771b324 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:25 -0700 Subject: compat: remove some compat entry points These are all handled correctly when calling the native system call entry point, so remove the special cases. Link: https://lkml.kernel.org/r/20210727144859.4150043-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/unistd32.h | 10 ++++----- arch/mips/kernel/syscalls/syscall_n32.tbl | 10 ++++----- arch/mips/kernel/syscalls/syscall_o32.tbl | 10 ++++----- arch/parisc/kernel/syscalls/syscall.tbl | 8 +++---- arch/powerpc/kernel/syscalls/syscall.tbl | 10 ++++----- arch/s390/kernel/syscalls/syscall.tbl | 10 ++++----- arch/sparc/kernel/syscalls/syscall.tbl | 10 ++++----- arch/x86/entry/syscalls/syscall_32.tbl | 4 ++-- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- include/linux/compat.h | 20 ----------------- include/uapi/asm-generic/unistd.h | 10 ++++----- kernel/sys_ni.c | 5 ----- mm/mempolicy.c | 37 ------------------------------- mm/migrate.c | 13 ----------- 14 files changed, 42 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 4e99e4b912ef..844f6ae58662 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -649,11 +649,11 @@ __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) #define __NR_inotify_rm_watch 318 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) #define __NR_mbind 319 -__SYSCALL(__NR_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 320 -__SYSCALL(__NR_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define __NR_set_mempolicy 321 -__SYSCALL(__NR_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_openat 322 __SYSCALL(__NR_openat, compat_sys_openat) #define __NR_mkdirat 323 @@ -699,7 +699,7 @@ __SYSCALL(__NR_tee, sys_tee) #define __NR_vmsplice 343 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 344 -__SYSCALL(__NR_move_pages, compat_sys_move_pages) +__SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_getcpu 345 __SYSCALL(__NR_getcpu, sys_getcpu) #define __NR_epoll_pwait 346 @@ -811,7 +811,7 @@ __SYSCALL(__NR_rseq, sys_rseq) #define __NR_io_pgetevents 399 __SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents) #define __NR_migrate_pages 400 -__SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) #define __NR_kexec_file_load 401 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) /* 402 is unused */ diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 56c8d3cf42ed..70e32de2bcaa 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -239,9 +239,9 @@ 228 n32 clock_nanosleep sys_clock_nanosleep_time32 229 n32 tgkill sys_tgkill 230 n32 utimes sys_utimes_time32 -231 n32 mbind compat_sys_mbind -232 n32 get_mempolicy compat_sys_get_mempolicy -233 n32 set_mempolicy compat_sys_set_mempolicy +231 n32 mbind sys_mbind +232 n32 get_mempolicy sys_get_mempolicy +233 n32 set_mempolicy sys_set_mempolicy 234 n32 mq_open compat_sys_mq_open 235 n32 mq_unlink sys_mq_unlink 236 n32 mq_timedsend sys_mq_timedsend_time32 @@ -258,7 +258,7 @@ 247 n32 inotify_init sys_inotify_init 248 n32 inotify_add_watch sys_inotify_add_watch 249 n32 inotify_rm_watch sys_inotify_rm_watch -250 n32 migrate_pages compat_sys_migrate_pages +250 n32 migrate_pages sys_migrate_pages 251 n32 openat sys_openat 252 n32 mkdirat sys_mkdirat 253 n32 mknodat sys_mknodat @@ -279,7 +279,7 @@ 268 n32 sync_file_range sys_sync_file_range 269 n32 tee sys_tee 270 n32 vmsplice sys_vmsplice -271 n32 move_pages compat_sys_move_pages +271 n32 move_pages sys_move_pages 272 n32 set_robust_list compat_sys_set_robust_list 273 n32 get_robust_list compat_sys_get_robust_list 274 n32 kexec_load compat_sys_kexec_load diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 201237fd0f43..a61c35edaa74 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -279,9 +279,9 @@ 265 o32 clock_nanosleep sys_clock_nanosleep_time32 266 o32 tgkill sys_tgkill 267 o32 utimes sys_utimes_time32 -268 o32 mbind sys_mbind compat_sys_mbind -269 o32 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 o32 set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 o32 mbind sys_mbind +269 o32 get_mempolicy sys_get_mempolicy +270 o32 set_mempolicy sys_set_mempolicy 271 o32 mq_open sys_mq_open compat_sys_mq_open 272 o32 mq_unlink sys_mq_unlink 273 o32 mq_timedsend sys_mq_timedsend_time32 @@ -298,7 +298,7 @@ 284 o32 inotify_init sys_inotify_init 285 o32 inotify_add_watch sys_inotify_add_watch 286 o32 inotify_rm_watch sys_inotify_rm_watch -287 o32 migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 o32 migrate_pages sys_migrate_pages 288 o32 openat sys_openat compat_sys_openat 289 o32 mkdirat sys_mkdirat 290 o32 mknodat sys_mknodat @@ -319,7 +319,7 @@ 305 o32 sync_file_range sys_sync_file_range sys32_sync_file_range 306 o32 tee sys_tee 307 o32 vmsplice sys_vmsplice -308 o32 move_pages sys_move_pages compat_sys_move_pages +308 o32 move_pages sys_move_pages 309 o32 set_robust_list sys_set_robust_list compat_sys_set_robust_list 310 o32 get_robust_list sys_get_robust_list compat_sys_get_robust_list 311 o32 kexec_load sys_kexec_load compat_sys_kexec_load diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 0bf854b70612..bf751e0732b7 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -292,9 +292,9 @@ 258 32 clock_nanosleep sys_clock_nanosleep_time32 258 64 clock_nanosleep sys_clock_nanosleep 259 common tgkill sys_tgkill -260 common mbind sys_mbind compat_sys_mbind -261 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -262 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +260 common mbind sys_mbind +261 common get_mempolicy sys_get_mempolicy +262 common set_mempolicy sys_set_mempolicy # 263 was vserver 264 common add_key sys_add_key 265 common request_key sys_request_key @@ -331,7 +331,7 @@ 292 64 sync_file_range sys_sync_file_range 293 common tee sys_tee 294 common vmsplice sys_vmsplice -295 common move_pages sys_move_pages compat_sys_move_pages +295 common move_pages sys_move_pages 296 common getcpu sys_getcpu 297 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 298 common statfs64 sys_statfs64 compat_sys_statfs64 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 29b55e2e035c..7bef917cc84e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -330,10 +330,10 @@ 256 64 sys_debug_setcontext sys_ni_syscall 256 spu sys_debug_setcontext sys_ni_syscall # 257 reserved for vserver -258 nospu migrate_pages sys_migrate_pages compat_sys_migrate_pages -259 nospu mbind sys_mbind compat_sys_mbind -260 nospu get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +258 nospu migrate_pages sys_migrate_pages +259 nospu mbind sys_mbind +260 nospu get_mempolicy sys_get_mempolicy +261 nospu set_mempolicy sys_set_mempolicy 262 nospu mq_open sys_mq_open compat_sys_mq_open 263 nospu mq_unlink sys_mq_unlink 264 32 mq_timedsend sys_mq_timedsend_time32 @@ -381,7 +381,7 @@ 298 common faccessat sys_faccessat 299 common get_robust_list sys_get_robust_list compat_sys_get_robust_list 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -301 common move_pages sys_move_pages compat_sys_move_pages +301 common move_pages sys_move_pages 302 common getcpu sys_getcpu 303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 304 32 utimensat sys_utimensat_time32 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index aa9d68b8ee14..df5261e5cfe1 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -274,9 +274,9 @@ 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind compat_sys_mbind -269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 common mbind sys_mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open 272 common mq_unlink sys_mq_unlink sys_mq_unlink 273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 @@ -293,7 +293,7 @@ 284 common inotify_init sys_inotify_init sys_inotify_init 285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch 286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 common migrate_pages sys_migrate_pages sys_migrate_pages 288 common openat sys_openat compat_sys_openat 289 common mkdirat sys_mkdirat sys_mkdirat 290 common mknodat sys_mknodat sys_mknodat @@ -317,7 +317,7 @@ 307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range 308 common tee sys_tee sys_tee 309 common vmsplice sys_vmsplice sys_vmsplice -310 common move_pages sys_move_pages compat_sys_move_pages +310 common move_pages sys_move_pages sys_move_pages 311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 313 common utimes sys_utimes sys_utimes_time32 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 7893104718c2..c37764dc764d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -365,12 +365,12 @@ 299 common unshare sys_unshare 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 301 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -302 common migrate_pages sys_migrate_pages compat_sys_migrate_pages -303 common mbind sys_mbind compat_sys_mbind -304 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -305 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +302 common migrate_pages sys_migrate_pages +303 common mbind sys_mbind +304 common get_mempolicy sys_get_mempolicy +305 common set_mempolicy sys_set_mempolicy 306 common kexec_load sys_kexec_load compat_sys_kexec_load -307 common move_pages sys_move_pages compat_sys_move_pages +307 common move_pages sys_move_pages 308 common getcpu sys_getcpu 309 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 310 32 utimensat sys_utimensat_time32 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 61f18b72552b..960a021d543e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -286,7 +286,7 @@ 272 i386 fadvise64_64 sys_ia32_fadvise64_64 273 i386 vserver 274 i386 mbind sys_mbind -275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +275 i386 get_mempolicy sys_get_mempolicy 276 i386 set_mempolicy sys_set_mempolicy 277 i386 mq_open sys_mq_open compat_sys_mq_open 278 i386 mq_unlink sys_mq_unlink @@ -328,7 +328,7 @@ 314 i386 sync_file_range sys_ia32_sync_file_range 315 i386 tee sys_tee 316 i386 vmsplice sys_vmsplice -317 i386 move_pages sys_move_pages compat_sys_move_pages +317 i386 move_pages sys_move_pages 318 i386 getcpu sys_getcpu 319 i386 epoll_pwait sys_epoll_pwait 320 i386 utimensat sys_utimensat_time32 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 807b6a1de8e8..18b5500ea8bf 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -398,7 +398,7 @@ 530 x32 set_robust_list compat_sys_set_robust_list 531 x32 get_robust_list compat_sys_get_robust_list 532 x32 vmsplice sys_vmsplice -533 x32 move_pages compat_sys_move_pages +533 x32 move_pages sys_move_pages 534 x32 preadv compat_sys_preadv64 535 x32 pwritev compat_sys_pwritev64 536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo diff --git a/include/linux/compat.h b/include/linux/compat.h index 3a2ac5afee30..2d42cebd1fb8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -799,26 +799,6 @@ asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr /* mm/fadvise.c: No generic prototype for fadvise64_64 */ /* mm/, CONFIG_MMU only */ -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags); -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, - compat_ulong_t flags); -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode); -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes); -asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, - __u32 __user *pages, - const int __user *nodes, - int __user *status, - int flags); - asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 14c8fe863c6d..1c5fb86d455a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -673,15 +673,15 @@ __SYSCALL(__NR_madvise, sys_madvise) #define __NR_remap_file_pages 234 __SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) #define __NR_mbind 235 -__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 236 -__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define __NR_set_mempolicy 237 -__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_migrate_pages 238 -__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) #define __NR_move_pages 239 -__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) +__SYSCALL(__NR_move_pages, sys_move_pages) #endif #define __NR_rt_tgsigqueueinfo 240 diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 64578adfe115..f43d89d92860 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -292,15 +292,10 @@ COND_SYSCALL(process_madvise); COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); -COND_SYSCALL_COMPAT(mbind); COND_SYSCALL(get_mempolicy); -COND_SYSCALL_COMPAT(get_mempolicy); COND_SYSCALL(set_mempolicy); -COND_SYSCALL_COMPAT(set_mempolicy); COND_SYSCALL(migrate_pages); -COND_SYSCALL_COMPAT(migrate_pages); COND_SYSCALL(move_pages); -COND_SYSCALL_COMPAT(move_pages); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb95578f5997..8d14240896a8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1649,43 +1649,6 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, - compat_ulong_t, addr, compat_ulong_t, flags) -{ - return kernel_get_mempolicy(policy, (unsigned long __user *)nmask, - maxnode, addr, flags); -} - -COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode) -{ - return kernel_set_mempolicy(mode, (unsigned long __user *)nmask, maxnode); -} - -COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, - compat_ulong_t, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, compat_ulong_t, flags) -{ - return kernel_mbind(start, len, mode, (unsigned long __user *)nmask, - maxnode, flags); -} - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - return kernel_migrate_pages(pid, maxnode, - (const unsigned long __user *)old_nodes, - (const unsigned long __user *)new_nodes); -} - -#endif /* CONFIG_COMPAT */ - bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) diff --git a/mm/migrate.c b/mm/migrate.c index 2bc494875cea..a6a7743ee98f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2047,19 +2047,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - return kernel_move_pages(pid, nr_pages, - (const void __user *__user *)pages, - nodes, status, flags); -} -#endif /* CONFIG_COMPAT */ - #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA -- cgit v1.2.3-71-gd317 From a7a08b275a8bbade798c4bdaad07ade68fe7003c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:29 -0700 Subject: arch: remove compat_alloc_user_space All users of compat_alloc_user_space() and copy_in_user() have been removed from the kernel, only a few functions in sparc remain that can be changed to calling arch_copy_in_user() instead. Link: https://lkml.kernel.org/r/20210727144859.4150043-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/compat.h | 5 --- arch/arm64/include/asm/uaccess.h | 11 ----- arch/arm64/lib/Makefile | 2 +- arch/arm64/lib/copy_in_user.S | 77 --------------------------------- arch/mips/cavium-octeon/octeon-memcpy.S | 2 - arch/mips/include/asm/compat.h | 8 ---- arch/mips/include/asm/uaccess.h | 26 ----------- arch/mips/lib/memcpy.S | 11 ----- arch/parisc/include/asm/compat.h | 6 --- arch/parisc/include/asm/uaccess.h | 2 - arch/parisc/lib/memcpy.c | 9 ---- arch/powerpc/include/asm/compat.h | 16 ------- arch/s390/include/asm/compat.h | 10 ----- arch/s390/include/asm/uaccess.h | 3 -- arch/s390/lib/uaccess.c | 63 --------------------------- arch/sparc/include/asm/compat.h | 19 -------- arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/signal32.c | 12 ++--- arch/sparc/kernel/signal_64.c | 8 ++-- arch/x86/include/asm/compat.h | 13 ------ arch/x86/include/asm/uaccess_64.h | 7 --- include/linux/compat.h | 2 - include/linux/uaccess.h | 10 ----- kernel/compat.c | 21 --------- 24 files changed, 12 insertions(+), 333 deletions(-) delete mode 100644 arch/arm64/lib/copy_in_user.S (limited to 'kernel') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 79c1a750e357..eaa6ca062d89 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -107,11 +107,6 @@ struct compat_statfs { #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current))) #define COMPAT_MINSIGSTKSZ 2048 -static inline void __user *arch_compat_alloc_user_space(long len) -{ - return (void __user *)compat_user_stack_pointer() - len; -} - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index b5f08621fa29..190b494e22ab 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -430,17 +430,6 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi __actu_ret; \ }) -extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); -#define raw_copy_in_user(to, from, n) \ -({ \ - unsigned long __aciu_ret; \ - uaccess_ttbr0_enable(); \ - __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), \ - __uaccess_mask_ptr(from), (n)); \ - uaccess_ttbr0_disable(); \ - __aciu_ret; \ -}) - #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 6dd56a49790a..0941180a86d3 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ - copy_to_user.o copy_in_user.o copy_page.o \ + copy_to_user.o copy_page.o \ clear_page.o csum.o insn.o memchr.o memcpy.o \ memset.o memcmp.o strcmp.o strncmp.o strlen.o \ strnlen.o strchr.o strrchr.o tishift.o diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S deleted file mode 100644 index dbea3799c3ef..000000000000 --- a/arch/arm64/lib/copy_in_user.S +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copy from user space to user space - * - * Copyright (C) 2012 ARM Ltd. - */ - -#include - -#include -#include -#include - -/* - * Copy from user space to user space (alignment handled by the hardware) - * - * Parameters: - * x0 - to - * x1 - from - * x2 - n - * Returns: - * x0 - bytes not copied - */ - .macro ldrb1 reg, ptr, val - user_ldst 9998f, ldtrb, \reg, \ptr, \val - .endm - - .macro strb1 reg, ptr, val - user_ldst 9998f, sttrb, \reg, \ptr, \val - .endm - - .macro ldrh1 reg, ptr, val - user_ldst 9997f, ldtrh, \reg, \ptr, \val - .endm - - .macro strh1 reg, ptr, val - user_ldst 9997f, sttrh, \reg, \ptr, \val - .endm - - .macro ldr1 reg, ptr, val - user_ldst 9997f, ldtr, \reg, \ptr, \val - .endm - - .macro str1 reg, ptr, val - user_ldst 9997f, sttr, \reg, \ptr, \val - .endm - - .macro ldp1 reg1, reg2, ptr, val - user_ldp 9997f, \reg1, \reg2, \ptr, \val - .endm - - .macro stp1 reg1, reg2, ptr, val - user_stp 9997f, \reg1, \reg2, \ptr, \val - .endm - -end .req x5 -srcin .req x15 -SYM_FUNC_START(__arch_copy_in_user) - add end, x0, x2 - mov srcin, x1 -#include "copy_template.S" - mov x0, #0 - ret -SYM_FUNC_END(__arch_copy_in_user) -EXPORT_SYMBOL(__arch_copy_in_user) - - .section .fixup,"ax" - .align 2 -9997: cmp dst, dstin - b.ne 9998f - // Before being absolutely sure we couldn't copy anything, try harder -USER(9998f, ldtrb tmp1w, [srcin]) -USER(9998f, sttrb tmp1w, [dst]) - add dst, dst, #1 -9998: sub x0, end, dst // bytes not copied - ret - .previous diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S index 600d018cf354..0a515cde1c18 100644 --- a/arch/mips/cavium-octeon/octeon-memcpy.S +++ b/arch/mips/cavium-octeon/octeon-memcpy.S @@ -154,8 +154,6 @@ FEXPORT(__raw_copy_from_user) EXPORT_SYMBOL(__raw_copy_from_user) FEXPORT(__raw_copy_to_user) EXPORT_SYMBOL(__raw_copy_to_user) -FEXPORT(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) /* * Note: dst & src may be unaligned, len may be 0 * Temps diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 53f015a1b0a7..bbb3bc5a42fd 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -96,14 +96,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = (struct pt_regs *) - ((unsigned long) current_thread_info() + THREAD_SIZE - 32) - 1; - - return (void __user *) (regs->regs[29] - len); -} - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 783fecce65c8..f8f74f9f5883 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -428,7 +428,6 @@ do { \ extern size_t __raw_copy_from_user(void *__to, const void *__from, size_t __n); extern size_t __raw_copy_to_user(void *__to, const void *__from, size_t __n); -extern size_t __raw_copy_in_user(void *__to, const void *__from, size_t __n); static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) @@ -480,31 +479,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER -static inline unsigned long -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - register void __user *__cu_to_r __asm__("$4"); - register const void __user *__cu_from_r __asm__("$5"); - register long __cu_len_r __asm__("$6"); - - __cu_to_r = to; - __cu_from_r = from; - __cu_len_r = n; - - __asm__ __volatile__( - ".set\tnoreorder\n\t" - __MODULE_JAL(__raw_copy_in_user) - ".set\tnoat\n\t" - __UA_ADDU "\t$1, %1, %2\n\t" - ".set\tat\n\t" - ".set\treorder" - : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r) - : - : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31", - DADDI_SCRATCH, "memory"); - return __cu_len_r; -} - extern __kernel_size_t __bzero(void __user *addr, __kernel_size_t size); /* diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S index e19fb98b5d38..277c32296636 100644 --- a/arch/mips/lib/memcpy.S +++ b/arch/mips/lib/memcpy.S @@ -666,8 +666,6 @@ FEXPORT(__raw_copy_from_user) EXPORT_SYMBOL(__raw_copy_from_user) FEXPORT(__raw_copy_to_user) EXPORT_SYMBOL(__raw_copy_to_user) -FEXPORT(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) #endif /* Legacy Mode, user <-> user */ __BUILD_COPY_USER LEGACY_MODE USEROP USEROP @@ -703,13 +701,4 @@ EXPORT_SYMBOL(__raw_copy_to_user) __BUILD_COPY_USER EVA_MODE KERNELOP USEROP END(__raw_copy_to_user) -/* - * __copy_in_user (EVA) - */ - -LEAF(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) -__BUILD_COPY_USER EVA_MODE USEROP USEROP -END(__raw_copy_in_user) - #endif diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index b5d90e82b65d..c04f5a637c39 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -163,12 +163,6 @@ struct compat_shmid64_ds { #define COMPAT_ELF_NGREG 80 typedef compat_ulong_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; -static __inline__ void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = ¤t->thread.regs; - return (void __user *)regs->gr[30]; -} - static inline int __is_compat_task(struct task_struct *t) { return test_tsk_thread_flag(t, TIF_32BIT); diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ed2cd4fb479b..7c13314aae4a 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -215,8 +215,6 @@ unsigned long __must_check raw_copy_to_user(void __user *dst, const void *src, unsigned long len); unsigned long __must_check raw_copy_from_user(void *dst, const void __user *src, unsigned long len); -unsigned long __must_check raw_copy_in_user(void __user *dst, const void __user *src, - unsigned long len); #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 4b75388190b4..ea70a0e08321 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -38,14 +38,6 @@ unsigned long raw_copy_from_user(void *dst, const void __user *src, } EXPORT_SYMBOL(raw_copy_from_user); -unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long len) -{ - mtsp(get_user_space(), 1); - mtsp(get_user_space(), 2); - return pa_memcpy((void __force *)dst, (void __force *)src, len); -} - - void * memcpy(void * dst,const void *src, size_t count) { mtsp(get_kernel_space(), 1); @@ -54,7 +46,6 @@ void * memcpy(void * dst,const void *src, size_t count) return dst; } -EXPORT_SYMBOL(raw_copy_in_user); EXPORT_SYMBOL(memcpy); bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index e33dcf134cdd..7afc96fb6524 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -83,22 +83,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = current->thread.regs; - unsigned long usp = regs->gpr[1]; - - /* - * We can't access below the stack pointer in the 32bit ABI and - * can access 288 bytes in the 64bit big-endian ABI, - * or 512 bytes with the new ELFv2 little-endian ABI. - */ - if (!is_32bit_task()) - usp -= USER_REDZONE_SIZE; - - return (void __user *) (usp - len); -} - /* * ipc64_perm is actually 32/64bit clean but since the compat layer refers to * it we may as well define it. diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 8d49505b4a43..cdc7ae72529d 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -176,16 +176,6 @@ static inline int is_compat_task(void) return test_thread_flag(TIF_31BIT); } -static inline void __user *arch_compat_alloc_user_space(long len) -{ - unsigned long stack; - - stack = KSTK_ESP(current); - if (is_compat_task()) - stack &= 0x7fffffffUL; - return (void __user *) (stack - len); -} - #endif struct compat_ipc64_perm { diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9ed9aa37e836..ce550d06abc3 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -227,9 +227,6 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s __get_user(x, ptr); \ }) -unsigned long __must_check -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); - /* * Copy a null terminated string from userspace. */ diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 94ca99bde59d..a596e69d3c47 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -204,69 +204,6 @@ unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long } EXPORT_SYMBOL(raw_copy_to_user); -static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, - unsigned long size) -{ - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - /* FIXME: copy with reduced length. */ - asm volatile( - " lgr 0,%[spec]\n" - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - " jz 2f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2:slgr %0,%0\n" - "3: \n" - EX_TABLE(0b,3b) - : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) - : [spec] "d" (0x810081UL) - : "cc", "memory", "0"); - return size; -} - -static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from, - unsigned long size) -{ - unsigned long tmp1; - - asm volatile( - " sacf 256\n" - " aghi %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - "0: aghi %0,257\n" - "1: mvc 0(1,%1),0(%2)\n" - " la %1,1(%1)\n" - " la %2,1(%2)\n" - " aghi %0,-1\n" - " jnz 1b\n" - " j 5f\n" - "2: mvc 0(256,%1),0(%2)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "3: aghi %0,-256\n" - " jnm 2b\n" - "4: ex %0,1b-0b(%3)\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) - : : "cc", "memory"); - return size; -} - -unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - if (copy_with_mvcos()) - return copy_in_user_mvcos(to, from, n); - return copy_in_user_mvc(to, from, n); -} -EXPORT_SYMBOL(raw_copy_in_user); - static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { unsigned long tmp1, tmp2; diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 8b63410e830f..bd949fcf9d63 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -116,25 +116,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -#ifdef CONFIG_COMPAT -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = current_thread_info()->kregs; - unsigned long usp = regs->u_regs[UREG_I6]; - - if (test_thread_64bit_stack(usp)) - usp += STACK_BIAS; - - if (test_thread_flag(TIF_32BIT)) - usp &= 0xffffffffUL; - - usp -= len; - usp &= ~0x7UL; - - return (void __user *) usp; -} -#endif - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 093849bfda50..d1cc410d2f64 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -455,7 +455,7 @@ static unsigned long clone_stackframe(unsigned long csp, unsigned long psp) distance = fp - psp; rval = (csp - distance); - if (copy_in_user((void __user *) rval, (void __user *) psp, distance)) + if (raw_copy_in_user((void __user *)rval, (void __user *)psp, distance)) rval = 0; else if (!stack_64bit) { if (put_user(((u32)csp), diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 4276b9e003ca..6cc124a3bb98 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -435,9 +435,9 @@ static int setup_frame32(struct ksignal *ksig, struct pt_regs *regs, (_COMPAT_NSIG_WORDS - 1) * sizeof(unsigned int)); if (!wsaved) { - err |= copy_in_user((u32 __user *)sf, - (u32 __user *)(regs->u_regs[UREG_FP]), - sizeof(struct reg_window32)); + err |= raw_copy_in_user((u32 __user *)sf, + (u32 __user *)(regs->u_regs[UREG_FP]), + sizeof(struct reg_window32)); } else { struct reg_window *rp; @@ -567,9 +567,9 @@ static int setup_rt_frame32(struct ksignal *ksig, struct pt_regs *regs, err |= put_compat_sigset(&sf->mask, oldset, sizeof(compat_sigset_t)); if (!wsaved) { - err |= copy_in_user((u32 __user *)sf, - (u32 __user *)(regs->u_regs[UREG_FP]), - sizeof(struct reg_window32)); + err |= raw_copy_in_user((u32 __user *)sf, + (u32 __user *)(regs->u_regs[UREG_FP]), + sizeof(struct reg_window32)); } else { struct reg_window *rp; diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index cea23cf95600..2a78d2af1265 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -406,10 +406,10 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) err |= copy_to_user(&sf->mask, sigmask_to_save(), sizeof(sigset_t)); if (!wsaved) { - err |= copy_in_user((u64 __user *)sf, - (u64 __user *)(regs->u_regs[UREG_FP] + - STACK_BIAS), - sizeof(struct reg_window)); + err |= raw_copy_in_user((u64 __user *)sf, + (u64 __user *)(regs->u_regs[UREG_FP] + + STACK_BIAS), + sizeof(struct reg_window)); } else { struct reg_window *rp; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 4ae01cdb99de..7516e4199b3c 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -156,19 +156,6 @@ struct compat_shmid64_ds { (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif -static inline void __user *arch_compat_alloc_user_space(long len) -{ - compat_uptr_t sp = task_pt_regs(current)->sp; - - /* - * -128 for the x32 ABI redzone. For IA32, it is not strictly - * necessary, but not harmful. - */ - sp -= 128; - - return (void __user *)round_down(sp - len, 16); -} - static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index e7265a552f4f..45697e04d771 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -58,13 +58,6 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -static __always_inline __must_check -unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size) -{ - return copy_user_generic((__force void *)dst, - (__force void *)src, size); -} - extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); diff --git a/include/linux/compat.h b/include/linux/compat.h index 2d42cebd1fb8..1c758b0e0359 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -511,8 +511,6 @@ extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, struct epoll_event; /* fortunately, this one is fixed-layout */ -extern void __user *compat_alloc_user_space(unsigned long len); - int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define unsafe_compat_save_altstack(uss, sp, label) do { \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index c05e903cef02..ac0394087f7d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -200,16 +200,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n) n = _copy_to_user(to, from, n); return n; } -#ifdef CONFIG_COMPAT -static __always_inline unsigned long __must_check -copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - might_fault(); - if (access_ok(to, n) && access_ok(from, n)) - n = raw_copy_in_user(to, from, n); - return n; -} -#endif #ifndef copy_mc_to_kernel /* diff --git a/kernel/compat.c b/kernel/compat.c index 05adfd6fa8bf..55551989d9da 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return 0; } EXPORT_SYMBOL_GPL(get_compat_sigset); - -/* - * Allocate user-space memory for the duration of a single system call, - * in order to marshall parameters inside a compat thunk. - */ -void __user *compat_alloc_user_space(unsigned long len) -{ - void __user *ptr; - - /* If len would occupy more than half of the entire compat space... */ - if (unlikely(len > (((compat_uptr_t)~0) >> 1))) - return NULL; - - ptr = arch_compat_alloc_user_space(len); - - if (unlikely(!access_ok(ptr, len))) - return NULL; - - return ptr; -} -EXPORT_SYMBOL_GPL(compat_alloc_user_space); -- cgit v1.2.3-71-gd317 From 13db8c50477d83ad3e3b9b0ae247e5cd833a7ae4 Mon Sep 17 00:00:00 2001 From: Liu Zixian Date: Wed, 8 Sep 2021 18:10:05 -0700 Subject: mm/hugetlb: initialize hugetlb_usage in mm_init After fork, the child process will get incorrect (2x) hugetlb_usage. If a process uses 5 2MB hugetlb pages in an anonymous mapping, HugetlbPages: 10240 kB and then forks, the child will show, HugetlbPages: 20480 kB The reason for double the amount is because hugetlb_usage will be copied from the parent and then increased when we copy page tables from parent to child. Child will have 2x actual usage. Fix this by adding hugetlb_count_init in mm_init. Link: https://lkml.kernel.org/r/20210826071742.877-1-liuzixian4@huawei.com Fixes: 5d317b2b6536 ("mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status") Signed-off-by: Liu Zixian Reviewed-by: Naoya Horiguchi Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 9 +++++++++ kernel/fork.c | 1 + 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f7ca1a3870ea..1faebe1cd0ed 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -858,6 +858,11 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); +static inline void hugetlb_count_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->hugetlb_usage, 0); +} + static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); @@ -1042,6 +1047,10 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, return &mm->page_table_lock; } +static inline void hugetlb_count_init(struct mm_struct *mm) +{ +} + static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } diff --git a/kernel/fork.c b/kernel/fork.c index ff5be23800af..38681ad44c76 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1063,6 +1063,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); + hugetlb_count_init(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; -- cgit v1.2.3-71-gd317 From e5480572706da1b2c2dc2c6484eab64f92b9263b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Sep 2021 11:44:11 +0200 Subject: locking/rtmutex: Fix ww_mutex deadlock check Dan reported that rt_mutex_adjust_prio_chain() can be called with .orig_waiter == NULL however commit a055fcc132d4 ("locking/rtmutex: Return success on deadlock for ww_mutex waiters") unconditionally dereferences it. Since both call-sites that have .orig_waiter == NULL don't care for the return value, simply disable the deadlock squash by adding the NULL check. Notably, both callers use the deadlock condition as a termination condition for the iteration; once detected, it is sure that (de)boosting is done. Arguably step [3] would be a more natural termination point, but it's dubious whether adding a third deadlock detection state would improve the code. Fixes: a055fcc132d4 ("locking/rtmutex: Return success on deadlock for ww_mutex waiters") Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/YS9La56fHMiCCo75@hirez.programming.kicks-ass.net --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8eabdc79602b..6bb116c559b4 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -753,7 +753,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * other configuration and we fail to report; also, see * lockdep. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx) + if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx) ret = 0; raw_spin_unlock(&lock->wait_lock); -- cgit v1.2.3-71-gd317 From 9848417926353daa59d2b05eb26e185063dbac6e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 6 Sep 2021 13:30:34 +0200 Subject: sched/idle: Make the idle timer expire in hard interrupt context The intel powerclamp driver will setup a per-CPU worker with RT priority. The worker will then invoke play_idle() in which it remains in the idle poll loop until it is stopped by the timer it started earlier. That timer needs to expire in hard interrupt context on PREEMPT_RT. Otherwise the timer will expire in ksoftirqd as a SOFT timer but that task won't be scheduled on the CPU because its priority is lower than the priority of the worker which is in the idle loop. Always expire the idle timer in hard interrupt context. Reported-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210906113034.jgfxrjdvxnjqgtmc@linutronix.de --- kernel/sched/idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 912b47aa99d8..d17b0a5ce6ac 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -379,10 +379,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); it.timer.function = idle_inject_timer_fn; hrtimer_start(&it.timer, ns_to_ktime(duration_ns), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); while (!READ_ONCE(it.done)) do_idle(); -- cgit v1.2.3-71-gd317 From 868ad33bfa3bf39960982682ad3a0f8ebda1656e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 28 Aug 2021 15:55:52 +0200 Subject: sched: Prevent balance_push() on remote runqueues sched_setscheduler() and rt_mutex_setprio() invoke the run-queue balance callback after changing priorities or the scheduling class of a task. The run-queue for which the callback is invoked can be local or remote. That's not a problem for the regular rq::push_work which is serialized with a busy flag in the run-queue struct, but for the balance_push() work which is only valid to be invoked on the outgoing CPU that's wrong. It not only triggers the debug warning, but also leaves the per CPU variable push_work unprotected, which can result in double enqueues on the stop machine list. Remove the warning and validate that the function is invoked on the outgoing CPU. Fixes: ae7927023243 ("sched: Optimize finish_lock_switch()") Reported-by: Sebastian Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87zgt1hdw7.ffs@tglx --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3b27c6c5153..b21a1857b75a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8523,7 +8523,6 @@ static void balance_push(struct rq *rq) struct task_struct *push_task = rq->curr; lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->cpu != smp_processor_id()); /* * Ensure the thing is persistent until balance_push_set(.on = false); @@ -8531,9 +8530,10 @@ static void balance_push(struct rq *rq) rq->balance_callback = &balance_push_callback; /* - * Only active while going offline. + * Only active while going offline and when invoked on the outgoing + * CPU. */ - if (!cpu_dying(rq->cpu)) + if (!cpu_dying(rq->cpu) || rq != this_rq()) return; /* -- cgit v1.2.3-71-gd317 From a3928f877e7b153dbc243d6329b3777ac75b6004 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 22:36:23 +0900 Subject: tracing/boot: Fix trace_boot_hist_add_array() to check array is value trace_boot_hist_add_array() uses the combination of xbc_node_find_child() and xbc_node_get_child() to get the child node of the key node. But since it missed to check the child node is data node or not, user can pass the subkey node for the array node (anode). To avoid this issue, check the array node is a data node. Actually, there is xbc_node_find_value(node, key, vnode), which ensures the @vnode is a value node, so use it in trace_boot_hist_add_array() to fix this issue. Link: https://lkml.kernel.org/r/163119458308.161018.1516455973625940212.stgit@devnote2 Fixes: e66ed86ca6c5 ("tracing/boot: Add per-event histogram action options") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 388e65d05978..a6be48b24774 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -219,13 +219,12 @@ static int __init trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, char *end, const char *key) { - struct xbc_node *knode, *anode; + struct xbc_node *anode; const char *p; char sep; - knode = xbc_node_find_child(hnode, key); - if (knode) { - anode = xbc_node_get_child(knode); + p = xbc_node_find_value(hnode, key, &anode); + if (p) { if (!anode) { pr_err("hist.%s requires value(s).\n", key); return -EINVAL; -- cgit v1.2.3-71-gd317 From 5f8895b27da2656e5e2be029acfb68c016f03326 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 22:36:30 +0900 Subject: tracing/boot: Fix to check the histogram control param is a leaf node Since xbc_node_find_child() doesn't ensure the returned node is a leaf node (key-value pair or do not have subkeys), use xbc_node_find_value to ensure the histogram control parameter is a leaf node in trace_boot_compose_hist_cmd(). Link: https://lkml.kernel.org/r/163119459059.161018.18341288218424528962.stgit@devnote2 Fixes: e66ed86ca6c5 ("tracing/boot: Add per-event histogram action options") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a6be48b24774..db6ee372dc6d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -385,11 +385,11 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) } /* Histogram control attributes (mutual exclusive) */ - if (xbc_node_find_child(hnode, "pause")) + if (xbc_node_find_value(hnode, "pause", NULL)) append_printf(&buf, end, ":pause"); - else if (xbc_node_find_child(hnode, "continue")) + else if (xbc_node_find_value(hnode, "continue", NULL)) append_printf(&buf, end, ":continue"); - else if (xbc_node_find_child(hnode, "clear")) + else if (xbc_node_find_value(hnode, "clear", NULL)) append_printf(&buf, end, ":clear"); /* Histogram handler and actions */ -- cgit v1.2.3-71-gd317 From 5dfe50b05588010f347cb2f436434bf22b7a84ed Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 22:36:38 +0900 Subject: bootconfig: Rename xbc_node_find_child() to xbc_node_find_subkey() Rename xbc_node_find_child() to xbc_node_find_subkey() for clarifying that function returns a key node (no value node). Since there are xbc_node_for_each_child() (loop on all child nodes) and xbc_node_for_each_subkey() (loop on only subkey nodes), this name distinction is necessary to avoid confusing users. Link: https://lkml.kernel.org/r/163119459826.161018.11200274779483115300.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 4 ++-- kernel/trace/trace_boot.c | 24 ++++++++++++------------ lib/bootconfig.c | 8 ++++---- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index abe089c27529..537e1b991f11 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -110,7 +110,7 @@ static inline __init bool xbc_node_is_leaf(struct xbc_node *node) } /* Tree-based key-value access APIs */ -struct xbc_node * __init xbc_node_find_child(struct xbc_node *parent, +struct xbc_node * __init xbc_node_find_subkey(struct xbc_node *parent, const char *key); const char * __init xbc_node_find_value(struct xbc_node *parent, @@ -148,7 +148,7 @@ xbc_find_value(const char *key, struct xbc_node **vnode) */ static inline struct xbc_node * __init xbc_find_node(const char *key) { - return xbc_node_find_child(NULL, key); + return xbc_node_find_subkey(NULL, key); } /** diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index db6ee372dc6d..8d252f63cd78 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -262,9 +262,9 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, append_printf(bufp, end, ":%s(%s)", handler, p); /* Compose 'action' parameter */ - knode = xbc_node_find_child(hnode, "trace"); + knode = xbc_node_find_subkey(hnode, "trace"); if (!knode) - knode = xbc_node_find_child(hnode, "save"); + knode = xbc_node_find_subkey(hnode, "save"); if (knode) { anode = xbc_node_get_child(knode); @@ -283,7 +283,7 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, sep = ','; } append_printf(bufp, end, ")"); - } else if (xbc_node_find_child(hnode, "snapshot")) { + } else if (xbc_node_find_subkey(hnode, "snapshot")) { append_printf(bufp, end, ".snapshot()"); } else { pr_err("hist.%s requires an action.\n", @@ -314,7 +314,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, break; } - if (xbc_node_find_child(hnode, param)) + if (xbc_node_find_subkey(hnode, param)) ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param); return ret; @@ -374,7 +374,7 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) if (p) append_printf(&buf, end, ":name=%s", p); - node = xbc_node_find_child(hnode, "var"); + node = xbc_node_find_subkey(hnode, "var"); if (node) { xbc_node_for_each_key_value(node, knode, p) { /* Expression must not include spaces. */ @@ -393,13 +393,13 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) append_printf(&buf, end, ":clear"); /* Histogram handler and actions */ - node = xbc_node_find_child(hnode, "onmax"); + node = xbc_node_find_subkey(hnode, "onmax"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; - node = xbc_node_find_child(hnode, "onchange"); + node = xbc_node_find_subkey(hnode, "onchange"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; - node = xbc_node_find_child(hnode, "onmatch"); + node = xbc_node_find_subkey(hnode, "onmatch"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0) return -EINVAL; @@ -436,7 +436,7 @@ trace_boot_init_histograms(struct trace_event_file *file, } } - if (xbc_node_find_child(hnode, "keys")) { + if (xbc_node_find_subkey(hnode, "keys")) { if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) @@ -495,7 +495,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); } - anode = xbc_node_find_child(enode, "hist"); + anode = xbc_node_find_subkey(enode, "hist"); if (anode) trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf)); } else if (xbc_node_find_value(enode, "actions", NULL)) @@ -517,7 +517,7 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) bool enable, enable_all = false; const char *data; - node = xbc_node_find_child(node, "event"); + node = xbc_node_find_subkey(node, "event"); if (!node) return; /* per-event key starts with "event.GROUP.EVENT" */ @@ -620,7 +620,7 @@ trace_boot_init_instances(struct xbc_node *node) struct trace_array *tr; const char *p; - node = xbc_node_find_child(node, "instance"); + node = xbc_node_find_subkey(node, "instance"); if (!node) return; diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 927017431fb6..f8419cff1147 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -142,16 +142,16 @@ xbc_node_match_prefix(struct xbc_node *node, const char **prefix) } /** - * xbc_node_find_child() - Find a child node which matches given key + * xbc_node_find_subkey() - Find a subkey node which matches given key * @parent: An XBC node. * @key: A key string. * - * Search a node under @parent which matches @key. The @key can contain + * Search a key node under @parent which matches @key. The @key can contain * several words jointed with '.'. If @parent is NULL, this searches the * node from whole tree. Return NULL if no node is matched. */ struct xbc_node * __init -xbc_node_find_child(struct xbc_node *parent, const char *key) +xbc_node_find_subkey(struct xbc_node *parent, const char *key) { struct xbc_node *node; @@ -191,7 +191,7 @@ const char * __init xbc_node_find_value(struct xbc_node *parent, const char *key, struct xbc_node **vnode) { - struct xbc_node *node = xbc_node_find_child(parent, key); + struct xbc_node *node = xbc_node_find_subkey(parent, key); if (!node || !xbc_node_is_key(node)) return NULL; -- cgit v1.2.3-71-gd317 From 2f1aaf3ea666b737ad717b3d88667225aca23149 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Sep 2021 08:49:59 -0700 Subject: bpf, mm: Fix lockdep warning triggered by stack_map_get_build_id_offset() Currently the bpf selftest "get_stack_raw_tp" triggered the warning: [ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0 [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod] [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G W O 5.14.0+ #53 [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0 [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8 [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246 [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000 [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0 [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0 [ 1411.304501] FS: 00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000 [ 1411.304504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0 [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.304512] Call Trace: [ 1411.304517] stack_map_get_build_id_offset+0x17c/0x260 [ 1411.304528] __bpf_get_stack+0x18f/0x230 [ 1411.304541] bpf_get_stack_raw_tp+0x5a/0x70 [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000 [ 1411.305756] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04 [ 1411.306978] bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0 [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108 [ 1411.309074] bpf_trace_run2+0x8f/0x1a0 [ 1411.309891] FS: 00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000 [ 1411.309896] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.311221] syscall_trace_enter.isra.20+0x161/0x1f0 [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0 [ 1411.312291] do_syscall_64+0x15/0x80 [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.313803] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.315082] RIP: 0033:0x7f00ad80a0e0 [ 1411.315626] Call Trace: [ 1411.315632] stack_map_get_build_id_offset+0x17c/0x260 To reproduce, first build `test_progs` binary: make -C tools/testing/selftests/bpf -j60 and then run the binary at tools/testing/selftests/bpf directory: ./test_progs -t get_stack_raw_tp The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") which added mmap_assert_locked() in find_vma() function. The mmap_assert_locked() function asserts that mm->mmap_lock needs to be held. But this is not the case for bpf_get_stack() or bpf_get_stackid() helper (kernel/bpf/stackmap.c), which uses mmap_read_trylock_non_owner() instead. Since mm->mmap_lock is not held in bpf_get_stack[id]() use case, the above warning is emitted during test run. This patch fixed the issue by (1). using mmap_read_trylock() instead of mmap_read_trylock_non_owner() to satisfy lockdep checking in find_vma(), and (2). droping lockdep for mmap_lock right before the irq_work_queue(). The function mmap_read_trylock_non_owner() is also removed since after this patch nobody calls it any more. Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") Suggested-by: Jason Gunthorpe Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Reviewed-by: Liam R. Howlett Cc: Luigi Rizzo Cc: Jason Gunthorpe Cc: linux-mm@kvack.org Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com --- include/linux/mmap_lock.h | 9 --------- kernel/bpf/stackmap.c | 10 ++++++++-- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0540f0156f58..3af8f7fb067d 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -144,15 +144,6 @@ static inline void mmap_read_unlock(struct mm_struct *mm) __mmap_lock_trace_released(mm, false); } -static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) -{ - if (mmap_read_trylock(mm)) { - rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); - return true; - } - return false; -} - static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { up_read_non_owner(&mm->mmap_lock); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e8eefdf8cf3e..09a3fd97d329 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -179,7 +179,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock_non_owner(current->mm)) { + !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -204,9 +204,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock_non_owner(current->mm); + mmap_read_unlock(current->mm); } else { work->mm = current->mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } -- cgit v1.2.3-71-gd317 From 510e1a724ab1bf38150be2c1acabb303f98d0047 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Fri, 10 Sep 2021 19:53:37 -0400 Subject: dma-debug: prevent an error message from causing runtime problems For some drivers, that use the DMA API. This error message can be reached several millions of times per second, causing spam to the kernel's printk buffer and bringing the CPU usage up to 100% (so, it should be rate limited). However, since there is at least one driver that is in the mainline and suffers from the error condition, it is more useful to err_printk() here instead of just rate limiting the error message (in hopes that it will make it easier for other drivers that suffer from this issue to be spotted). Link: https://lkml.kernel.org/r/fd67fbac-64bf-f0ea-01e1-5938ccfab9d0@arm.com Reported-by: Jeremy Linton Signed-off-by: Hamza Mahfooz Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 6c90c69e5311..95445bd6eb72 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -567,7 +567,8 @@ static void add_dma_entry(struct dma_debug_entry *entry) pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; } else if (rc == -EEXIST) { - pr_err("cacheline tracking EEXIST, overlapping mappings aren't supported\n"); + err_printk(entry->dev, entry, + "cacheline tracking EEXIST, overlapping mappings aren't supported\n"); } } -- cgit v1.2.3-71-gd317 From 0e6491b559704da720f6da09dd0a52c4df44c514 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Sat, 11 Sep 2021 08:55:57 +0800 Subject: bpf: Add oversize check before call kvcalloc() Commit 7661809d493b ("mm: don't allow oversized kvmalloc() calls") add the oversize check. When the allocation is larger than what kmalloc() supports, the following warning triggered: WARNING: CPU: 0 PID: 8408 at mm/util.c:597 kvmalloc_node+0x108/0x110 mm/util.c:597 Modules linked in: CPU: 0 PID: 8408 Comm: syz-executor221 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:kvmalloc_node+0x108/0x110 mm/util.c:597 Call Trace: kvmalloc include/linux/mm.h:806 [inline] kvmalloc_array include/linux/mm.h:824 [inline] kvcalloc include/linux/mm.h:829 [inline] check_btf_line kernel/bpf/verifier.c:9925 [inline] check_btf_info kernel/bpf/verifier.c:10049 [inline] bpf_check+0xd634/0x150d0 kernel/bpf/verifier.c:13759 bpf_prog_load kernel/bpf/syscall.c:2301 [inline] __sys_bpf+0x11181/0x126e0 kernel/bpf/syscall.c:4587 __do_sys_bpf kernel/bpf/syscall.c:4691 [inline] __se_sys_bpf kernel/bpf/syscall.c:4689 [inline] __x64_sys_bpf+0x78/0x90 kernel/bpf/syscall.c:4689 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+f3e749d4c662818ae439@syzkaller.appspotmail.com Signed-off-by: Bixuan Cui Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210911005557.45518-1-cuibixuan@huawei.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 047ac4b4703b..e76b55917905 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9912,6 +9912,8 @@ static int check_btf_line(struct bpf_verifier_env *env, nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || -- cgit v1.2.3-71-gd317 From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Sep 2021 01:07:57 +0200 Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d671. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. [0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net --- include/linux/cgroup-defs.h | 107 +++++++++++-------------------------------- include/linux/cgroup.h | 22 +-------- kernel/cgroup/cgroup.c | 50 ++++---------------- net/core/netclassid_cgroup.c | 7 +-- net/core/netprio_cgroup.c | 10 +--- 5 files changed, 41 insertions(+), 155 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e1c705fdfa7c..db2e147e069f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * - * On legacy hierarchies, net_prio and net_cls controllers directly set - * attributes on each sock which can then be tested by the network layer. - * On the default hierarchy, each sock is associated with the cgroup it was - * created in and the networking layer can match the cgroup directly. - * - * To avoid carrying all three cgroup related fields separately in sock, - * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. - * On boot, sock_cgroup_data records the cgroup that the sock was created - * in so that cgroup2 matches can be made; however, once either net_prio or - * net_cls starts being used, the area is overridden to carry prioidx and/or - * classid. The two modes are distinguished by whether the lowest bit is - * set. Clear bit indicates cgroup pointer while set bit prioidx and - * classid. - * - * While userland may start using net_prio or net_cls at any time, once - * either is used, cgroup2 matching no longer works. There is no reason to - * mix the two and this is in line with how legacy and v2 compatibility is - * handled. On mode switch, cgroup references which are already being - * pointed to by socks may be leaked. While this can be remedied by adding - * synchronization around sock_cgroup_data, given that the number of leaked - * cgroups is bound and highly unlikely to be high, this seems to be the - * better trade-off. + * On legacy hierarchies, net_prio and net_cls controllers directly + * set attributes on each sock which can then be tested by the network + * layer. On the default hierarchy, each sock is associated with the + * cgroup it was created in and the networking layer can match the + * cgroup directly. */ struct sock_cgroup_data { - union { -#ifdef __LITTLE_ENDIAN - struct { - u8 is_data : 1; - u8 no_refcnt : 1; - u8 unused : 6; - u8 padding; - u16 prioidx; - u32 classid; - } __packed; -#else - struct { - u32 classid; - u16 prioidx; - u8 padding; - u8 unused : 6; - u8 no_refcnt : 1; - u8 is_data : 1; - } __packed; + struct cgroup *cgroup; /* v2 */ +#ifdef CONFIG_CGROUP_NET_CLASSID + u32 classid; /* v1 */ +#endif +#ifdef CONFIG_CGROUP_NET_PRIO + u16 prioidx; /* v1 */ #endif - u64 val; - }; }; -/* - * There's a theoretical window where the following accessors race with - * updaters and return part of the previous pointer as the prioidx or - * classid. Such races are short-lived and the result isn't critical. - */ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { - /* fallback to 1 which is always the ID of the root cgroup */ - return (skcd->is_data & 1) ? skcd->prioidx : 1; +#ifdef CONFIG_CGROUP_NET_PRIO + return READ_ONCE(skcd->prioidx); +#else + return 1; +#endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { - /* fallback to 0 which is the unconfigured default classid */ - return (skcd->is_data & 1) ? skcd->classid : 0; +#ifdef CONFIG_CGROUP_NET_CLASSID + return READ_ONCE(skcd->classid); +#else + return 0; +#endif } -/* - * If invoked concurrently, the updaters may clobber each other. The - * caller is responsible for synchronization. - */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_prioidx(&skcd_buf) == prioidx) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.prioidx = prioidx; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_PRIO + WRITE_ONCE(skcd->prioidx, prioidx); +#endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_classid(&skcd_buf) == classid) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.classid = classid; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_CLASSID + WRITE_ONCE(skcd->classid, classid); +#endif } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7bf60454a313..75c151413fda 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task, */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) -extern spinlock_t cgroup_sk_update_lock; -#endif - -void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - unsigned long v; - - /* - * @skcd->val is 64bit but the following is safe on 32bit too as we - * just need the lower ulong to be written and read atomically. - */ - v = READ_ONCE(skcd->val); - - if (v & 3) - return &cgrp_dfl_root.cgrp; - - return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; -#else - return (struct cgroup *)(unsigned long)skcd->val; -#endif + return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 881ce1470beb..8afa8690d288 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) return; rcu_read_lock(); - while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; + skcd->cgroup = cset->dfl_cgrp; cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); } - rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index b49c57d35a88..1a6a86693b74 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + if (sock) sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); - spin_unlock(&cgroup_sk_update_lock); - } if (--ctx->batch == 0) { ctx->batch = UPDATE_CLASSID_BATCH; return n + 1; @@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, struct css_task_iter it; struct task_struct *p; - cgroup_sk_alloc_disable(); - cs->classid = (u32)value; css_task_iter_start(css, 0, &it); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 99a431c56f23..8456dfbe2eb4 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; - cgroup_sk_alloc_disable(); - rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of, static int update_netprio(const void *v, struct file *file, unsigned n) { struct socket *sock = sock_from_file(file); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + + if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); - spin_unlock(&cgroup_sk_update_lock); - } return 0; } @@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct task_struct *p; struct cgroup_subsys_state *css; - cgroup_sk_alloc_disable(); - cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; -- cgit v1.2.3-71-gd317 From 77e02cf57b6cff9919949defb7fd9b8ac16399a2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Sep 2021 13:23:22 -0700 Subject: memblock: introduce saner 'memblock_free_ptr()' interface The boot-time allocation interface for memblock is a mess, with 'memblock_alloc()' returning a virtual pointer, but then you are supposed to free it with 'memblock_free()' that takes a _physical_ address. Not only is that all kinds of strange and illogical, but it actually causes bugs, when people then use it like a normal allocation function, and it fails spectacularly on a NULL pointer: https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/ or just random memory corruption if the debug checks don't catch it: https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/ I really don't want to apply patches that treat the symptoms, when the fundamental cause is this horribly confusing interface. I started out looking at just automating a sane replacement sequence, but because of this mix or virtual and physical addresses, and because people have used the "__pa()" macro that can take either a regular kernel pointer, or just the raw "unsigned long" address, it's all quite messy. So this just introduces a new saner interface for freeing a virtual address that was allocated using 'memblock_alloc()', and that was kept as a regular kernel pointer. And then it converts a couple of users that are obvious and easy to test, including the 'xbc_nodes' case in lib/bootconfig.c that caused problems. Reported-by: kernel test robot Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Steven Rostedt Cc: Mike Rapoport Cc: Andrew Morton Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Vlastimil Babka Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/mm/kasan_init_64.c | 6 ++---- arch/x86/mm/numa.c | 2 +- arch/x86/mm/numa_emulation.c | 3 +-- drivers/base/arch_numa.c | 2 +- drivers/macintosh/smu.c | 2 +- include/linux/memblock.h | 1 + init/main.c | 2 +- kernel/printk/printk.c | 4 ++-- lib/bootconfig.c | 2 +- mm/memblock.c | 16 +++++++++++++++- 11 files changed, 27 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 78a32b956e81..5afd98559193 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_free_ptr(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 1a50434c8a4d..ef885370719a 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -49,8 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; - else if (p) - memblock_free(__pa(p), PMD_SIZE); + memblock_free_ptr(p, PMD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); @@ -86,8 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; - else if (p) - memblock_free(__pa(p), PUD_SIZE); + memblock_free_ptr(p, PUD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a1b5c71099e6..1e9b93b088db 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -355,7 +355,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free(__pa(numa_distance), size); + memblock_free_ptr(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 737491b13728..e801e30089c4 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -517,8 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } /* free the copied physical distance table */ - if (phys_dist) - memblock_free(__pa(phys_dist), phys_size); + memblock_free_ptr(phys_dist, phys_size); return; no_emu: diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 46c503486e96..00fb4120a5b3 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -264,7 +264,7 @@ void __init numa_free_distance(void) size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - memblock_free(__pa(numa_distance), size); + memblock_free_ptr(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; } diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 94fb63a7b357..fe63d5ee201b 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -570,7 +570,7 @@ fail_msg_node: fail_db_node: of_node_put(smu->db_node); fail_bootmem: - memblock_free(__pa(smu), sizeof(struct smu_device)); + memblock_free_ptr(smu, sizeof(struct smu_device)); smu = NULL; fail_np: of_node_put(np); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b066024c62e3..34de69b3b8ba 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -118,6 +118,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); void memblock_free_all(void); +void memblock_free_ptr(void *ptr, size_t size); void reset_node_managed_pages(pg_data_t *pgdat); void reset_all_zones_managed_pages(void); diff --git a/init/main.c b/init/main.c index 5c9a48df90e1..3f7216934441 100644 --- a/init/main.c +++ b/init/main.c @@ -924,7 +924,7 @@ static void __init print_unknown_bootoptions(void) end += sprintf(end, " %s", *p); pr_notice("Unknown command line parameters:%s\n", unknown_options); - memblock_free(__pa(unknown_options), len); + memblock_free_ptr(unknown_options, len); } asmlinkage __visible void __init __no_sanitize_address start_kernel(void) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 825277e1e742..a8d0a58deebc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early) return; err_free_descs: - memblock_free(__pa(new_descs), new_descs_size); + memblock_free_ptr(new_descs, new_descs_size); err_free_log_buf: - memblock_free(__pa(new_log_buf), new_log_buf_len); + memblock_free_ptr(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel; diff --git a/lib/bootconfig.c b/lib/bootconfig.c index f8419cff1147..5ae248b29373 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -792,7 +792,7 @@ void __init xbc_destroy_all(void) xbc_data = NULL; xbc_data_size = 0; xbc_node_num = 0; - memblock_free(__pa(xbc_nodes), sizeof(struct xbc_node) * XBC_NODE_MAX); + memblock_free_ptr(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); xbc_nodes = NULL; brace_index = 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 0ab5a749bfa6..184dcd2e5d99 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -472,7 +472,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) - memblock_free(__pa(old_array), old_alloc_size); + memblock_free_ptr(old_array, old_alloc_size); /* * Reserve the new array if that comes from the memblock. Otherwise, we @@ -795,6 +795,20 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.memory, base, size); } +/** + * memblock_free_ptr - free boot memory allocation + * @ptr: starting address of the boot memory allocation + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. + */ +void __init_memblock memblock_free_ptr(void *ptr, size_t size) +{ + if (ptr) + memblock_free(__pa(ptr), size); +} + /** * memblock_free - free boot memory block * @base: phys starting address of the boot memory block -- cgit v1.2.3-71-gd317 From b89a05b21f46150ac10a962aa50109250b56b03b Mon Sep 17 00:00:00 2001 From: Baptiste Lepers Date: Mon, 6 Sep 2021 11:53:10 +1000 Subject: events: Reuse value read using READ_ONCE instead of re-reading it In perf_event_addr_filters_apply, the task associated with the event (event->ctx->task) is read using READ_ONCE at the beginning of the function, checked, and then re-read from event->ctx->task, voiding all guarantees of the checks. Reuse the value that was read by READ_ONCE to ensure the consistency of the task struct throughout the function. Fixes: 375637bc52495 ("perf/core: Introduce address range filtering") Signed-off-by: Baptiste Lepers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210906015310.12802-1-baptiste.lepers@gmail.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 744e8726c5b2..0c000cb01eeb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10193,7 +10193,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) return; if (ifh->nr_file_filters) { - mm = get_task_mm(event->ctx->task); + mm = get_task_mm(task); if (!mm) goto restart; -- cgit v1.2.3-71-gd317 From 7687201e37fabf2b7cf2b828f7ca46bf30e2948f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Sep 2021 12:59:17 +0200 Subject: locking/rwbase: Properly match set_and_save_state() to restore_state() Noticed while looking at the readers race. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20210909110203.828203010@infradead.org --- kernel/locking/rwbase_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 4ba15088e640..542b0170e4f5 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -220,7 +220,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, for (; atomic_read(&rwb->readers);) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { - __set_current_state(TASK_RUNNING); + rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); return -EINTR; } -- cgit v1.2.3-71-gd317 From 616be87eac9fa2ab2dca1069712f7236e50f3bf6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Sep 2021 12:59:18 +0200 Subject: locking/rwbase: Extract __rwbase_write_trylock() The code in rwbase_write_lock() is a little non-obvious vs the read+set 'trylock', extract the sequence into a helper function to clarify the code. This also provides a single site to fix fast-path ordering. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/YUCq3L+u44NDieEJ@hirez.programming.kicks-ass.net --- kernel/locking/rwbase_rt.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 542b0170e4f5..48fbbccdce3f 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -196,6 +196,19 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); } +static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb) +{ + /* Can do without CAS because we're serialized by wait_lock. */ + lockdep_assert_held(&rwb->rtmutex.wait_lock); + + if (!atomic_read(&rwb->readers)) { + atomic_set(&rwb->readers, WRITER_BIAS); + return 1; + } + + return 0; +} + static int __sched rwbase_write_lock(struct rwbase_rt *rwb, unsigned int state) { @@ -210,34 +223,30 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, atomic_sub(READER_BIAS, &rwb->readers); raw_spin_lock_irqsave(&rtm->wait_lock, flags); - /* - * set_current_state() for rw_semaphore - * current_save_and_set_rtlock_wait_state() for rwlock - */ - rwbase_set_and_save_current_state(state); + if (__rwbase_write_trylock(rwb)) + goto out_unlock; - /* Block until all readers have left the critical section. */ - for (; atomic_read(&rwb->readers);) { + rwbase_set_and_save_current_state(state); + for (;;) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); return -EINTR; } + + if (__rwbase_write_trylock(rwb)) + break; + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); - /* - * Schedule and wait for the readers to leave the critical - * section. The last reader leaving it wakes the waiter. - */ - if (atomic_read(&rwb->readers) != 0) - rwbase_schedule(); set_current_state(state); - raw_spin_lock_irqsave(&rtm->wait_lock, flags); } - - atomic_set(&rwb->readers, WRITER_BIAS); rwbase_restore_current_state(); + +out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); return 0; } @@ -253,8 +262,7 @@ static inline int rwbase_write_trylock(struct rwbase_rt *rwb) atomic_sub(READER_BIAS, &rwb->readers); raw_spin_lock_irqsave(&rtm->wait_lock, flags); - if (!atomic_read(&rwb->readers)) { - atomic_set(&rwb->readers, WRITER_BIAS); + if (__rwbase_write_trylock(rwb)) { raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); return 1; } -- cgit v1.2.3-71-gd317 From 81121524f1c798c9481bd7900450b72ee7ac2eef Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 9 Sep 2021 12:59:19 +0200 Subject: locking/rwbase: Take care of ordering guarantee for fastpath reader Readers of rwbase can lock and unlock without taking any inner lock, if that happens, we need the ordering provided by atomic operations to satisfy the ordering semantics of lock/unlock. Without that, considering the follow case: { X = 0 initially } CPU 0 CPU 1 ===== ===== rt_write_lock(); X = 1 rt_write_unlock(): atomic_add(READER_BIAS - WRITER_BIAS, ->readers); // ->readers is READER_BIAS. rt_read_lock(): if ((r = atomic_read(->readers)) < 0) // True atomic_try_cmpxchg(->readers, r, r + 1); // succeed. r1 = X; // r1 may be 0, because nothing prevent the reordering // of "X=1" and atomic_add() on CPU 1. Therefore audit every usage of atomic operations that may happen in a fast path, and add necessary barriers. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210909110203.953991276@infradead.org --- kernel/locking/rwbase_rt.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 48fbbccdce3f..88191f6e252c 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -41,6 +41,12 @@ * The risk of writer starvation is there, but the pathological use cases * which trigger it are not necessarily the typical RT workloads. * + * Fast-path orderings: + * The lock/unlock of readers can run in fast paths: lock and unlock are only + * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE + * semantics of rwbase_rt. Atomic ops should thus provide _acquire() + * and _release() (or stronger). + * * Common code shared between RT rw_semaphore and rwlock */ @@ -53,6 +59,7 @@ static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) * set. */ for (r = atomic_read(&rwb->readers); r < 0;) { + /* Fully-ordered if cmpxchg() succeeds, provides ACQUIRE */ if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1))) return 1; } @@ -162,6 +169,8 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, /* * rwb->readers can only hit 0 when a writer is waiting for the * active readers to leave the critical section. + * + * dec_and_test() is fully ordered, provides RELEASE. */ if (unlikely(atomic_dec_and_test(&rwb->readers))) __rwbase_read_unlock(rwb, state); @@ -172,7 +181,11 @@ static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, { struct rt_mutex_base *rtm = &rwb->rtmutex; - atomic_add(READER_BIAS - bias, &rwb->readers); + /* + * _release() is needed in case that reader is in fast path, pairing + * with atomic_try_cmpxchg() in rwbase_read_trylock(), provides RELEASE + */ + (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); rwbase_rtmutex_unlock(rtm); } @@ -201,7 +214,11 @@ static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb) /* Can do without CAS because we're serialized by wait_lock. */ lockdep_assert_held(&rwb->rtmutex.wait_lock); - if (!atomic_read(&rwb->readers)) { + /* + * _acquire is needed in case the reader is in the fast path, pairing + * with rwbase_read_unlock(), provides ACQUIRE. + */ + if (!atomic_read_acquire(&rwb->readers)) { atomic_set(&rwb->readers, WRITER_BIAS); return 1; } -- cgit v1.2.3-71-gd317