From 350a5c4dd2452ea999cc5e1d4a8dbf12de2f97ef Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 7 Mar 2021 14:52:48 -0800 Subject: bpf: Dont allow vmlinux BTF to be used in map_create and prog_load. The syzbot got FD of vmlinux BTF and passed it into map_create which caused crash in btf_type_id_size() when it tried to access resolved_ids. The vmlinux BTF doesn't have 'resolved_ids' and 'resolved_sizes' initialized to save memory. To avoid such issues disallow using vmlinux BTF in prog_load and map_create commands. Fixes: 5329722057d4 ("bpf: Assign ID to vmlinux BTF and return extra info for BTF in GET_OBJ_INFO") Reported-by: syzbot+8bab8ed346746e7540e8@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210307225248.79031-1-alexei.starovoitov@gmail.com --- kernel/bpf/syscall.c | 5 +++++ kernel/bpf/verifier.c | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c859bc46d06c..250503482cda 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -854,6 +854,11 @@ static int map_create(union bpf_attr *attr) err = PTR_ERR(btf); goto free_map; } + if (btf_is_kernel(btf)) { + btf_put(btf); + err = -EACCES; + goto free_map; + } map->btf = btf; if (attr->btf_value_type_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c56e3fcb5f1a..4192a9e56654 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9056,6 +9056,10 @@ static int check_btf_info(struct bpf_verifier_env *env, btf = btf_get_by_fd(attr->prog_btf_fd); if (IS_ERR(btf)) return PTR_ERR(btf); + if (btf_is_kernel(btf)) { + btf_put(btf); + return -EACCES; + } env->prog->aux->btf = btf; err = check_btf_func(env, attr, uattr); -- cgit v1.2.3-71-gd317 From 769c18b254ca191b45047e1fcb3b2ce56fada0b6 Mon Sep 17 00:00:00 2001 From: Tal Lossos Date: Sun, 7 Mar 2021 14:09:48 +0200 Subject: bpf: Change inode_storage's lookup_elem return value from NULL to -EBADF bpf_fd_inode_storage_lookup_elem() returned NULL when getting a bad FD, which caused -ENOENT in bpf_map_copy_value. -EBADF error is better than -ENOENT for a bad FD behaviour. The patch was partially contributed by CyberArk Software, Inc. Fixes: 8ea636848aca ("bpf: Implement bpf_local_storage for inodes") Signed-off-by: Tal Lossos Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210307120948.61414-1-tallossos@gmail.com --- kernel/bpf/bpf_inode_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6639640523c0..b58b2efb9b43 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -109,7 +109,7 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) fd = *(int *)key; f = fget_raw(fd); if (!f) - return NULL; + return ERR_PTR(-EBADF); sdata = inode_storage_lookup(f->f_inode, map, true); fput(f); -- cgit v1.2.3-71-gd317 From 15b2219facadec583c24523eed40fa45865f859f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Mar 2021 20:20:42 -0700 Subject: kernel: freezer should treat PF_IO_WORKER like PF_KTHREAD for freezing Don't send fake signals to PF_IO_WORKER threads, they don't accept signals. Just treat them like kthreads in this regard, all they need is a wakeup as no forced kernel/user transition is needed. 
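As a rough sketch (not part of the patch; the helper name below is invented purely for illustration), the predicate that freeze_task() effectively applies after this change is:

  /* Illustrative only: mirrors the freeze_task() test in the diff below. */
  static inline bool freezer_wants_fake_signal(struct task_struct *p)
  {
  	/* Kernel threads and io_uring workers never return to user mode,
  	 * so a plain wake-up is enough; only regular user tasks get the
  	 * fake freezer signal. */
  	return !(p->flags & (PF_KTHREAD | PF_IO_WORKER));
  }
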
Suggested-by: Linus Torvalds Signed-off-by: Jens Axboe --- kernel/freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index dc520f01f99d..1a2d57d1327c 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -134,7 +134,7 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_KTHREAD)) + if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) fake_signal_wake_up(p); else wake_up_state(p, TASK_INTERRUPTIBLE); -- cgit v1.2.3-71-gd317 From 16efa4fce3b7af17bb45d635c3e89992d721e0f3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Mar 2021 20:26:13 -0700 Subject: io_uring: allow IO worker threads to be frozen With the freezer using the proper signaling to notify us of when it's time to freeze a thread, we can re-enable normal freezer usage for the IO threads. Ensure that SQPOLL, io-wq, and the io-wq manager call try_to_freeze() appropriately, and remove the default setting of PF_NOFREEZE from create_io_thread(). Signed-off-by: Jens Axboe --- fs/io-wq.c | 6 +++++- fs/io_uring.c | 1 + kernel/fork.c | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/io-wq.c b/fs/io-wq.c index 0ae9ecadf295..e05f996d088f 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -488,6 +488,8 @@ static int io_wqe_worker(void *data) set_task_comm(current, buf); while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + long ret; + set_current_state(TASK_INTERRUPTIBLE); loop: raw_spin_lock_irq(&wqe->lock); @@ -498,7 +500,8 @@ loop: __io_worker_idle(wqe, worker); raw_spin_unlock_irq(&wqe->lock); io_flush_signals(); - if (schedule_timeout(WORKER_IDLE_TIMEOUT)) + ret = schedule_timeout(WORKER_IDLE_TIMEOUT); + if (try_to_freeze() || ret) continue; if (fatal_signal_pending(current)) break; @@ -709,6 +712,7 @@ static int io_wq_manager(void *data) set_current_state(TASK_INTERRUPTIBLE); io_wq_check_workers(wq); schedule_timeout(HZ); + try_to_freeze(); if (fatal_signal_pending(current)) set_bit(IO_WQ_BIT_EXIT, &wq->state); } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); diff --git a/fs/io_uring.c b/fs/io_uring.c index a4bce17af506..05adc4887ef3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6752,6 +6752,7 @@ static int io_sq_thread(void *data) up_read(&sqd->rw_lock); schedule(); + try_to_freeze(); down_read(&sqd->rw_lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); diff --git a/kernel/fork.c b/kernel/fork.c index 72e444cd0ffe..d3171e8e88e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2436,7 +2436,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) if (!IS_ERR(tsk)) { sigfillset(&tsk->blocked); sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); - tsk->flags |= PF_NOFREEZE; } return tsk; } -- cgit v1.2.3-71-gd317 From ef4cb70a4c22bf301cd757dcc838dc8ca9526477 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 2 Mar 2021 18:14:53 +0200 Subject: genirq/irq_sim: Fix typos in kernel doc (fnode -> fwnode) Fix typos in kernel doc, otherwise validation script complains: .../irq_sim.c:170: warning: Function parameter or member 'fwnode' not described in 'irq_domain_create_sim' .../irq_sim.c:170: warning: Excess function parameter 'fnode' description in 'irq_domain_create_sim' .../irq_sim.c:240: warning: Function parameter or member 'fwnode' not described in 'devm_irq_domain_create_sim' .../irq_sim.c:240: warning: Excess function parameter 'fnode' description in 'devm_irq_domain_create_sim' Signed-off-by: Andy Shevchenko Signed-off-by: 
Thomas Gleixner Link: https://lore.kernel.org/r/20210302161453.28540-1-andriy.shevchenko@linux.intel.com --- kernel/irq/irq_sim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 48006608baf0..40880c350b95 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -159,7 +159,7 @@ static const struct irq_domain_ops irq_sim_domain_ops = { * irq_domain_create_sim - Create a new interrupt simulator irq_domain and * allocate a range of dummy interrupts. * - * @fnode: struct fwnode_handle to be associated with this domain. + * @fwnode: struct fwnode_handle to be associated with this domain. * @num_irqs: Number of interrupts to allocate. * * On success: return a new irq_domain object. @@ -228,7 +228,7 @@ static void devm_irq_domain_release_sim(struct device *dev, void *res) * a managed device. * * @dev: Device to initialize the simulator object for. - * @fnode: struct fwnode_handle to be associated with this domain. + * @fwnode: struct fwnode_handle to be associated with this domain. * @num_irqs: Number of interrupts to allocate * * On success: return a new irq_domain object. -- cgit v1.2.3-71-gd317 From 5abbe51a526253b9f003e9a0a195638dc882d660 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Feb 2021 18:46:41 +0100 Subject: kernel, fs: Introduce and use set_restart_fn() and arch_set_restart_data() Preparation for fixing get_nr_restart_syscall() on X86 for COMPAT. Add a new helper which sets restart_block->fn and calls a dummy arch_set_restart_data() helper. Fixes: 609c19a385c8 ("x86/ptrace: Stop setting TS_COMPAT in ptrace code") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210201174641.GA17871@redhat.com --- fs/select.c | 10 ++++------ include/linux/thread_info.h | 13 +++++++++++++ kernel/futex.c | 3 +-- kernel/time/alarmtimer.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 2 +- 6 files changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/fs/select.c b/fs/select.c index 37aaa8317f3a..945896d0ac9e 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1055,10 +1055,9 @@ static long do_restart_poll(struct restart_block *restart_block) ret = do_sys_poll(ufds, nfds, to); - if (ret == -ERESTARTNOHAND) { - restart_block->fn = do_restart_poll; - ret = -ERESTART_RESTARTBLOCK; - } + if (ret == -ERESTARTNOHAND) + ret = set_restart_fn(restart_block, do_restart_poll); + return ret; } @@ -1080,7 +1079,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, struct restart_block *restart_block; restart_block = ¤t->restart_block; - restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; @@ -1091,7 +1089,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, } else restart_block->poll.has_timeout = 0; - ret = -ERESTART_RESTARTBLOCK; + ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 9b2158c69275..157762db9d4b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -59,6 +60,18 @@ enum syscall_work_bit { #ifdef __KERNEL__ +#ifndef arch_set_restart_data +#define arch_set_restart_data(restart) do { } while (0) +#endif + +static inline long set_restart_fn(struct restart_block *restart, + long 
(*fn)(struct restart_block *)) +{ + restart->fn = fn; + arch_set_restart_data(restart); + return -ERESTART_RESTARTBLOCK; +} + #ifndef THREAD_ALIGN #define THREAD_ALIGN THREAD_SIZE #endif diff --git a/kernel/futex.c b/kernel/futex.c index e68db7745039..00febd6dea9c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2728,14 +2728,13 @@ retry: goto out; restart = ¤t->restart_block; - restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = *abs_time; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; - ret = -ERESTART_RESTARTBLOCK; + ret = set_restart_fn(restart, futex_wait_restart); out: if (to) { diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 98d7a15e8cf6..4d94e2b5499d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, if (flags == TIMER_ABSTIME) return -ERESTARTNOHAND; - restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp; + set_restart_fn(restart, alarm_timer_nsleep_restart); return ret; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 788b9d137de4..5c9d968187ae 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1957,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, } restart = ¤t->restart_block; - restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a71758e34e45..9abe15255bc4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; - restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; + set_restart_fn(restart_block, posix_cpu_nsleep_restart); } return error; } -- cgit v1.2.3-71-gd317 From 8a141dd7f7060d1e64c14a5257e0babae20ac99b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Mar 2021 12:58:15 -0700 Subject: ftrace: Fix modify_ftrace_direct. The following sequence of commands: register_ftrace_direct(ip, addr1); modify_ftrace_direct(ip, addr1, addr2); unregister_ftrace_direct(ip, addr2); will cause the kernel to warn: [ 30.179191] WARNING: CPU: 2 PID: 1961 at kernel/trace/ftrace.c:5223 unregister_ftrace_direct+0x130/0x150 [ 30.180556] CPU: 2 PID: 1961 Comm: test_progs W O 5.12.0-rc2-00378-g86bc10a0a711-dirty #3246 [ 30.182453] RIP: 0010:unregister_ftrace_direct+0x130/0x150 When modify_ftrace_direct() changes the addr from old to new it should update the addr stored in ftrace_direct_funcs. Otherwise the final unregister_ftrace_direct() won't find the address and will cause the splat. 
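Written out as code, the sequence from the changelog that triggers the warning looks roughly like this (a sketch only; ip, addr1 and addr2 stand in for a real traced location and two direct trampoline addresses, and error handling is omitted):

  static void direct_call_update_demo(unsigned long ip,
  				    unsigned long addr1, unsigned long addr2)
  {
  	register_ftrace_direct(ip, addr1);
  	modify_ftrace_direct(ip, addr1, addr2);
  	/* Before this fix, ftrace_direct_funcs still recorded addr1 at this
  	 * point, so the lookup for addr2 failed and the final unregister
  	 * emitted the WARN shown above. */
  	unregister_ftrace_direct(ip, addr2);
  }
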
Fixes: 0567d6809182 ("ftrace: Add modify_ftrace_direct()") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/bpf/20210316195815.34714-1-alexei.starovoitov@gmail.com --- kernel/trace/ftrace.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d8e35575549..b7e29db127fa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5045,6 +5045,20 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) return NULL; } +static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr) +{ + struct ftrace_direct_func *direct; + + direct = kmalloc(sizeof(*direct), GFP_KERNEL); + if (!direct) + return NULL; + direct->addr = addr; + direct->count = 0; + list_add_rcu(&direct->next, &ftrace_direct_funcs); + ftrace_direct_func_count++; + return direct; +} + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5120,15 +5134,11 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) direct = ftrace_find_direct_func(addr); if (!direct) { - direct = kmalloc(sizeof(*direct), GFP_KERNEL); + direct = ftrace_alloc_direct_func(addr); if (!direct) { kfree(entry); goto out_unlock; } - direct->addr = addr; - direct->count = 0; - list_add_rcu(&direct->next, &ftrace_direct_funcs); - ftrace_direct_func_count++; } entry->ip = ip; @@ -5329,6 +5339,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { + struct ftrace_direct_func *direct, *new_direct = NULL; struct ftrace_func_entry *entry; struct dyn_ftrace *rec; int ret = -ENODEV; @@ -5344,6 +5355,20 @@ int modify_ftrace_direct(unsigned long ip, if (entry->direct != old_addr) goto out_unlock; + direct = ftrace_find_direct_func(old_addr); + if (WARN_ON(!direct)) + goto out_unlock; + if (direct->count > 1) { + ret = -ENOMEM; + new_direct = ftrace_alloc_direct_func(new_addr); + if (!new_direct) + goto out_unlock; + direct->count--; + new_direct->count++; + } else { + direct->addr = new_addr; + } + /* * If there's no other ftrace callback on the rec->ip location, * then it can be changed directly by the architecture. @@ -5357,6 +5382,14 @@ int modify_ftrace_direct(unsigned long ip, ret = 0; } + if (unlikely(ret && new_direct)) { + direct->count++; + list_del_rcu(&new_direct->next); + synchronize_rcu_tasks(); + kfree(new_direct); + ftrace_direct_func_count--; + } + out_unlock: mutex_unlock(&ftrace_lock); mutex_unlock(&direct_mutex); -- cgit v1.2.3-71-gd317 From f232326f6966cf2a1d1db7bc917a4ce5f9f55f76 Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 09:47:02 +0100 Subject: bpf: Prohibit alu ops for pointer types not defining ptr_limit The purpose of this patch is to streamline error propagation and in particular to propagate retrieve_ptr_limit() errors for pointer types that are not defining a ptr_limit such that register-based alu ops against these types can be rejected. The main rationale is that a gap has been identified by Piotr in the existing protection against speculatively out-of-bounds loads, for example, in case of ctx pointers, unprivileged programs can still perform pointer arithmetic. 
This can be abused to execute speculatively out-of-bounds loads without restrictions and thus extract contents of kernel memory. Fix this by rejecting unprivileged programs that attempt any pointer arithmetic on unprotected pointer types. The two affected ones are pointer to ctx as well as pointer to map. Field access to a modified ctx' pointer is rejected at a later point in time in the verifier, and 7c6967326267 ("bpf: Permit map_ptr arithmetic with opcode add and offset 0") only relevant for root-only use cases. Risk of unprivileged program breakage is considered very low. Fixes: 7c6967326267 ("bpf: Permit map_ptr arithmetic with opcode add and offset 0") Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4192a9e56654..1c8cbef7cc14 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5934,6 +5934,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, u32 alu_state, alu_limit; struct bpf_reg_state tmp; bool ret; + int err; if (can_skip_alu_sanitation(env, insn)) return 0; @@ -5949,10 +5950,13 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; - if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) - return 0; - if (update_alu_sanitation_state(aux, alu_state, alu_limit)) - return -EACCES; + err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg); + if (err < 0) + return err; + + err = update_alu_sanitation_state(aux, alu_state, alu_limit); + if (err < 0) + return err; do_sim: /* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of @@ -6103,7 +6107,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_ADD: ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); if (ret < 0) { - verbose(env, "R%d tried to add from different maps or paths\n", dst); + verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst); return ret; } /* We can take a fixed offset as long as it doesn't overflow @@ -6158,7 +6162,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_SUB: ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); if (ret < 0) { - verbose(env, "R%d tried to sub from different maps or paths\n", dst); + verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst); return ret; } if (dst_reg == off_reg) { -- cgit v1.2.3-71-gd317 From 10d2bb2e6b1d8c4576c56a748f697dbeb8388899 Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 08:20:16 +0100 Subject: bpf: Fix off-by-one for area size in creating mask to left retrieve_ptr_limit() computes the ptr_limit for registers with stack and map_value type. ptr_limit is the size of the memory area that is still valid / in-bounds from the point of the current position and direction of the operation (add / sub). This size will later be used for masking the operation such that attempting out-of-bounds access in the speculative domain is redirected to remain within the bounds of the current map value. 
When masking to the right the size is correct, however, when masking to the left, the size is off-by-one which would lead to an incorrect mask and thus incorrect arithmetic operation in the non-speculative domain. Piotr found that if the resulting alu_limit value is zero, then the BPF_MOV32_IMM() from the fixup_bpf_calls() rewrite will end up loading 0xffffffff into AX instead of sign-extending to the full 64 bit range, and as a result, this allows abuse for executing speculatively out-of- bounds loads against 4GB window of address space and thus extracting the contents of kernel memory via side-channel. Fixes: 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1c8cbef7cc14..56c63658b702 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5870,13 +5870,13 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) - *ptr_limit = MAX_BPF_STACK + off; + *ptr_limit = MAX_BPF_STACK + off + 1; else *ptr_limit = -off; return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { - *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + *ptr_limit = ptr_reg->umax_value + ptr_reg->off + 1; } else { off = ptr_reg->smin_value + ptr_reg->off; *ptr_limit = ptr_reg->map_ptr->value_size - off; -- cgit v1.2.3-71-gd317 From b5871dca250cd391885218b99cc015aca1a51aea Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 08:26:25 +0100 Subject: bpf: Simplify alu_limit masking for pointer arithmetic Instead of having the mov32 with aux->alu_limit - 1 immediate, move this operation to retrieve_ptr_limit() instead to simplify the logic and to allow for subsequent sanity boundary checks inside retrieve_ptr_limit(). This avoids in future that at the time of the verifier masking rewrite we'd run into an underflow which would not sign extend due to the nature of mov32 instruction. Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56c63658b702..d16722a99b61 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5870,16 +5870,16 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) - *ptr_limit = MAX_BPF_STACK + off + 1; + *ptr_limit = MAX_BPF_STACK + off; else - *ptr_limit = -off; + *ptr_limit = -off - 1; return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { - *ptr_limit = ptr_reg->umax_value + ptr_reg->off + 1; + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; } else { off = ptr_reg->smin_value + ptr_reg->off; - *ptr_limit = ptr_reg->map_ptr->value_size - off; + *ptr_limit = ptr_reg->map_ptr->value_size - off - 1; } return 0; default: @@ -11668,7 +11668,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) off_reg = issrc ? 
insn->src_reg : insn->dst_reg; if (isneg) *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); -- cgit v1.2.3-71-gd317 From 1b1597e64e1a610c7a96710fc4717158e98a08b3 Mon Sep 17 00:00:00 2001 From: Piotr Krysiuk Date: Tue, 16 Mar 2021 09:47:02 +0100 Subject: bpf: Add sanity check for upper ptr_limit Given we know the max possible value of ptr_limit at the time of retrieving the latter, add basic assertions, so that the verifier can bail out if anything looks odd and reject the program. Nothing triggered this so far, but it also does not hurt to have these. Signed-off-by: Piotr Krysiuk Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d16722a99b61..44e4ec1640f1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5861,10 +5861,14 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, { bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || (opcode == BPF_SUB && !off_is_neg); - u32 off; + u32 off, max; switch (ptr_reg->type) { case PTR_TO_STACK: + /* Offset 0 is out-of-bounds, but acceptable start for the + * left direction, see BPF_REG_FP. + */ + max = MAX_BPF_STACK + mask_to_left; /* Indirect variable offset stack access is prohibited in * unprivileged mode so it's not handled here. */ @@ -5873,15 +5877,16 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, *ptr_limit = MAX_BPF_STACK + off; else *ptr_limit = -off - 1; - return 0; + return *ptr_limit >= max ? -ERANGE : 0; case PTR_TO_MAP_VALUE: + max = ptr_reg->map_ptr->value_size; if (mask_to_left) { *ptr_limit = ptr_reg->umax_value + ptr_reg->off; } else { off = ptr_reg->smin_value + ptr_reg->off; *ptr_limit = ptr_reg->map_ptr->value_size - off - 1; } - return 0; + return *ptr_limit >= max ? -ERANGE : 0; default: return -EINVAL; } -- cgit v1.2.3-71-gd317 From e21aa341785c679dd409c8cb71f864c00fe6c463 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Mar 2021 14:00:07 -0700 Subject: bpf: Fix fexit trampoline. The fexit/fmod_ret programs can be attached to kernel functions that can sleep. The synchronize_rcu_tasks() will not wait for such tasks to complete. In such case the trampoline image will be freed and when the task wakes up the return IP will point to freed memory causing the crash. Solve this by adding percpu_ref_get/put for the duration of trampoline and separate trampoline vs its image life times. The "half page" optimization has to be removed, since first_half->second_half->first_half transition cannot be guaranteed to complete in deterministic time. Every trampoline update becomes a new image. The image with fmod_ret or fexit progs will be freed via percpu_ref_kill and call_rcu_tasks. Together they will wait for the original function and trampoline asm to complete. The trampoline is patched from nop to jmp to skip fexit progs. They are freed independently from the trampoline. The image with fentry progs only will be freed via call_rcu_tasks_trace+call_rcu_tasks which will wait for both sleepable and non-sleepable progs to complete. 
Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Reported-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney # for RCU Link: https://lore.kernel.org/bpf/20210316210007.38949-1-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 26 +++++- include/linux/bpf.h | 24 ++++- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 4 +- kernel/bpf/trampoline.c | 218 ++++++++++++++++++++++++++++++++++---------- 5 files changed, 213 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 747bba0a584a..72b5a57e9e31 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1936,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1975,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1993,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2003,6 +2011,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + emit_nops(&prog, 5); } if (fmod_ret->nr_progs) { @@ -2033,9 +2043,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. 
*/ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d7e0f479a5b0..3625f019767d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -556,7 +557,8 @@ struct bpf_tramp_progs { * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +struct bpf_tramp_image; +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call); @@ -565,6 +567,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); +void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); +void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); struct bpf_ksym { unsigned long start; @@ -583,6 +587,18 @@ enum bpf_tramp_prog_type { BPF_TRAMP_REPLACE, /* more than MAX */ }; +struct bpf_tramp_image { + void *image; + struct bpf_ksym ksym; + struct percpu_ref pcref; + void *ip_after_call; + void *ip_epilogue; + union { + struct rcu_head rcu; + struct work_struct work; + }; +}; + struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; @@ -605,9 +621,8 @@ struct bpf_trampoline { /* Number of attached programs. A counter per kind. 
*/ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ - void *image; + struct bpf_tramp_image *cur_image; u64 selector; - struct bpf_ksym ksym; }; struct bpf_attach_target_info { @@ -691,6 +706,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); +int bpf_jit_charge_modmem(u32 pages); +void bpf_jit_uncharge_modmem(u32 pages); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) @@ -787,7 +804,6 @@ struct bpf_prog_aux { bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; - enum bpf_tramp_prog_type trampoline_prog_type; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 1a666a975416..70f6fd4fa305 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; - err = arch_prepare_bpf_trampoline(image, + err = arch_prepare_bpf_trampoline(NULL, image, st_map->image + PAGE_SIZE, &st_ops->func_models[i], 0, tprogs, NULL); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3a283bf97f2f..75244ecb2389 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -static int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { @@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages) return 0; } -static void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 pages) { atomic_long_sub(pages, &bpf_jit_current); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7bc3b3209224..1f3a4be4b175 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -57,19 +57,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } -static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) -{ - struct bpf_ksym *ksym = &tr->ksym; - - snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); - bpf_image_ksym_add(tr->image, ksym); -} - static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; - void *image; int i; mutex_lock(&trampoline_mutex); @@ -84,14 +75,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) if (!tr) goto out; - /* is_root was checked earlier. 
No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec_page(); - if (!image) { - kfree(tr); - tr = NULL; - goto out; - } - tr->key = key; INIT_HLIST_NODE(&tr->hlist); hlist_add_head(&tr->hlist, head); @@ -99,9 +82,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); - tr->image = image; - INIT_LIST_HEAD_RCU(&tr->ksym.lnode); - bpf_trampoline_ksym_add(tr); out: mutex_unlock(&trampoline_mutex); return tr; @@ -185,10 +165,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) return tprogs; } +static void __bpf_tramp_image_put_deferred(struct work_struct *work) +{ + struct bpf_tramp_image *im; + + im = container_of(work, struct bpf_tramp_image, work); + bpf_image_ksym_del(&im->ksym); + bpf_jit_free_exec(im->image); + bpf_jit_uncharge_modmem(1); + percpu_ref_exit(&im->pcref); + kfree_rcu(im, rcu); +} + +/* callback, fexit step 3 or fentry step 2 */ +static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) +{ + struct bpf_tramp_image *im; + + im = container_of(rcu, struct bpf_tramp_image, rcu); + INIT_WORK(&im->work, __bpf_tramp_image_put_deferred); + schedule_work(&im->work); +} + +/* callback, fexit step 2. Called after percpu_ref_kill confirms. */ +static void __bpf_tramp_image_release(struct percpu_ref *pcref) +{ + struct bpf_tramp_image *im; + + im = container_of(pcref, struct bpf_tramp_image, pcref); + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); +} + +/* callback, fexit or fentry step 1 */ +static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu) +{ + struct bpf_tramp_image *im; + + im = container_of(rcu, struct bpf_tramp_image, rcu); + if (im->ip_after_call) + /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */ + percpu_ref_kill(&im->pcref); + else + /* the case of fentry trampoline */ + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); +} + +static void bpf_tramp_image_put(struct bpf_tramp_image *im) +{ + /* The trampoline image that calls original function is using: + * rcu_read_lock_trace to protect sleepable bpf progs + * rcu_read_lock to protect normal bpf progs + * percpu_ref to protect trampoline itself + * rcu tasks to protect trampoline asm not covered by percpu_ref + * (which are few asm insns before __bpf_tramp_enter and + * after __bpf_tramp_exit) + * + * The trampoline is unreachable before bpf_tramp_image_put(). + * + * First, patch the trampoline to avoid calling into fexit progs. + * The progs will be freed even if the original function is still + * executing or sleeping. + * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on + * first few asm instructions to execute and call into + * __bpf_tramp_enter->percpu_ref_get. + * Then use percpu_ref_kill to wait for the trampoline and the original + * function to finish. + * Then use call_rcu_tasks() to make sure few asm insns in + * the trampoline epilogue are done as well. + * + * In !PREEMPT case the task that got interrupted in the first asm + * insns won't go through an RCU quiescent state which the + * percpu_ref_kill will be waiting for. Hence the first + * call_rcu_tasks() is not necessary. 
+ */ + if (im->ip_after_call) { + int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, + NULL, im->ip_epilogue); + WARN_ON(err); + if (IS_ENABLED(CONFIG_PREEMPTION)) + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); + else + percpu_ref_kill(&im->pcref); + return; + } + + /* The trampoline without fexit and fmod_ret progs doesn't call original + * function and doesn't use percpu_ref. + * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. + * Then use call_rcu_tasks() to wait for the rest of trampoline asm + * and normal progs. + */ + call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); +} + +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) +{ + struct bpf_tramp_image *im; + struct bpf_ksym *ksym; + void *image; + int err = -ENOMEM; + + im = kzalloc(sizeof(*im), GFP_KERNEL); + if (!im) + goto out; + + err = bpf_jit_charge_modmem(1); + if (err) + goto out_free_im; + + err = -ENOMEM; + im->image = image = bpf_jit_alloc_exec_page(); + if (!image) + goto out_uncharge; + + err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); + if (err) + goto out_free_image; + + ksym = &im->ksym; + INIT_LIST_HEAD_RCU(&ksym->lnode); + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx); + bpf_image_ksym_add(image, ksym); + return im; + +out_free_image: + bpf_jit_free_exec(im->image); +out_uncharge: + bpf_jit_uncharge_modmem(1); +out_free_im: + kfree(im); +out: + return ERR_PTR(err); +} + static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_tramp_image *im; struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; int err, total; @@ -198,41 +310,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) return PTR_ERR(tprogs); if (total == 0) { - err = unregister_fentry(tr, old_image); + err = unregister_fentry(tr, tr->cur_image->image); + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = NULL; tr->selector = 0; goto out; } + im = bpf_tramp_image_alloc(tr->key, tr->selector); + if (IS_ERR(im)) { + err = PTR_ERR(im); + goto out; + } + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - /* Though the second half of trampoline page is unused a task could be - * preempted in the middle of the first half of trampoline and two - * updates to trampoline would change the code from underneath the - * preempted task. Hence wait for tasks to voluntarily schedule or go - * to userspace. - * The same trampoline can hold both sleepable and non-sleepable progs. - * synchronize_rcu_tasks_trace() is needed to make sure all sleepable - * programs finish executing. - * Wait for these two grace periods together. 
- */ - synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); - - err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) goto out; - if (tr->selector) + WARN_ON(tr->cur_image && tr->selector == 0); + WARN_ON(!tr->cur_image && tr->selector); + if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, old_image, new_image); + err = modify_fentry(tr, tr->cur_image->image, im->image); else /* first time registering */ - err = register_fentry(tr, new_image); + err = register_fentry(tr, im->image); if (err) goto out; + if (tr->cur_image) + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = im; tr->selector++; out: kfree(tprogs); @@ -364,17 +477,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; - bpf_image_ksym_del(&tr->ksym); - /* This code will be executed when all bpf progs (both sleepable and - * non-sleepable) went through - * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). - * Hence no need for another synchronize_rcu_tasks_trace() here, - * but synchronize_rcu_tasks() is still needed, since trampoline - * may not have had any sleepable programs and we need to wait - * for tasks to get out of trampoline code before freeing it. + /* This code will be executed even when the last bpf_tramp_image + * is alive. All progs are detached from the trampoline and the + * trampoline image is patched with jmp into epilogue to skip + * fexit progs. The fentry-only trampoline will be freed via + * multiple rcu callbacks. */ - synchronize_rcu_tasks(); - bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); out: @@ -478,8 +586,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) rcu_read_unlock_trace(); } +void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) +{ + percpu_ref_get(&tr->pcref); +} + +void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) +{ + percpu_ref_put(&tr->pcref); +} + int __weak -arch_prepare_bpf_trampoline(void *image, void *image_end, +arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) -- cgit v1.2.3-71-gd317 From 9d3fcb28f9b9750b474811a2964ce022df56336e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 16 Mar 2021 22:17:48 -0400 Subject: Revert "PM: ACPI: reboot: Use S5 for reboot" This reverts commit d60cd06331a3566d3305b3c7b566e79edf4e2095. This patch causes a panic when rebooting my Dell Poweredge r440. I do not have the full panic log as it's lost at that stage of the reboot and I do not have a serial console. Reverting this patch makes my system able to reboot again. Signed-off-by: Josef Bacik Signed-off-by: Rafael J. 
Wysocki --- kernel/reboot.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index eb1b15850761..a6ad5eb2fa73 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -244,8 +244,6 @@ void migrate_to_reboot_cpu(void) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - if (pm_power_off_prepare) - pm_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) -- cgit v1.2.3-71-gd317 From 68b1eddd421d2b16c6655eceb48918a1e896bbbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Mar 2021 11:27:19 +0100 Subject: static_call: Fix static_call_set_init() It turns out that static_call_set_init() does not preserve the other flags; IOW. it clears TAIL if it was set. Fixes: 9183c3f9ed710 ("static_call: Add inline static call infrastructure") Reported-by: Sumit Garg Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jarkko Sakkinen Tested-by: Sumit Garg Link: https://lkml.kernel.org/r/20210318113610.519406371@infradead.org --- kernel/static_call.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/static_call.c b/kernel/static_call.c index ae825295cf68..080c8a9ddfaf 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -35,27 +35,30 @@ static inline void *static_call_addr(struct static_call_site *site) return (void *)((long)site->addr + (long)&site->addr); } +static inline unsigned long __static_call_key(const struct static_call_site *site) +{ + return (long)site->key + (long)&site->key; +} static inline struct static_call_key *static_call_key(const struct static_call_site *site) { - return (struct static_call_key *) - (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS); + return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS); } /* These assume the key is word-aligned. */ static inline bool static_call_is_init(struct static_call_site *site) { - return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT; + return __static_call_key(site) & STATIC_CALL_SITE_INIT; } static inline bool static_call_is_tail(struct static_call_site *site) { - return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL; + return __static_call_key(site) & STATIC_CALL_SITE_TAIL; } static inline void static_call_set_init(struct static_call_site *site) { - site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) - + site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) - (long)&site->key; } @@ -190,7 +193,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) } arch_static_call_transform(site_addr, NULL, func, - static_call_is_tail(site)); + static_call_is_tail(site)); } } @@ -349,7 +352,7 @@ static int static_call_add_module(struct module *mod) struct static_call_site *site; for (site = start; site != stop; site++) { - unsigned long s_key = (long)site->key + (long)&site->key; + unsigned long s_key = __static_call_key(site); unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; unsigned long key; -- cgit v1.2.3-71-gd317 From 698bacefe993ad2922c9d3b1380591ad489355e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Mar 2021 11:29:56 +0100 Subject: static_call: Align static_call_is_init() patching condition The intent is to avoid writing init code after init (because the text might have been freed). The code is needlessly different between jump_label and static_call and not obviously correct. 
The existing code relies on the fact that the module loader clears the init layout, such that within_module_init() always fails, while jump_label relies on the module state which is more obvious and matches the kernel logic. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jarkko Sakkinen Tested-by: Sumit Garg Link: https://lkml.kernel.org/r/20210318113610.636651340@infradead.org --- kernel/static_call.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/static_call.c b/kernel/static_call.c index 080c8a9ddfaf..fc2259047be2 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -149,6 +149,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) }; for (site_mod = &first; site_mod; site_mod = site_mod->next) { + bool init = system_state < SYSTEM_RUNNING; struct module *mod = site_mod->mod; if (!site_mod->sites) { @@ -168,6 +169,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) if (mod) { stop = mod->static_call_sites + mod->num_static_call_sites; + init = mod->state == MODULE_STATE_COMING; } #endif @@ -175,16 +177,8 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) site < stop && static_call_key(site) == key; site++) { void *site_addr = static_call_addr(site); - if (static_call_is_init(site)) { - /* - * Don't write to call sites which were in - * initmem and have since been freed. - */ - if (!mod && system_state >= SYSTEM_RUNNING) - continue; - if (mod && !within_module_init((unsigned long)site_addr, mod)) - continue; - } + if (!init && static_call_is_init(site)) + continue; if (!kernel_text_address((unsigned long)site_addr)) { WARN_ONCE(1, "can't patch static call site at %pS", -- cgit v1.2.3-71-gd317 From 38c93587375053c5b9ef093f4a5ea754538cba32 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Mar 2021 11:31:51 +0100 Subject: static_call: Fix static_call_update() sanity check Sites that match init_section_contains() get marked as INIT. For built-in code init_sections contains both __init and __exit text. OTOH kernel_text_address() only explicitly includes __init text (and there are no __exit text markers). Match what jump_label already does and ignore the warning for INIT sites. Also see the excellent changelog for commit: 8f35eaa5f2de ("jump_label: Don't warn on __exit jump entries") Fixes: 9183c3f9ed710 ("static_call: Add inline static call infrastructure") Reported-by: Sumit Garg Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jarkko Sakkinen Tested-by: Sumit Garg Link: https://lkml.kernel.org/r/20210318113610.739542434@infradead.org --- kernel/jump_label.c | 8 ++++++++ kernel/static_call.c | 11 ++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index c6a39d662935..ba39fbb1f8e7 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -407,6 +407,14 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init) return false; if (!kernel_text_address(jump_entry_code(entry))) { + /* + * This skips patching built-in __exit, which + * is part of init_section_contains() but is + * not part of kernel_text_address(). + * + * Skipping built-in __exit is fine since it + * will never be executed. 
+ */ WARN_ONCE(!jump_entry_is_init(entry), "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); diff --git a/kernel/static_call.c b/kernel/static_call.c index fc2259047be2..2c5950b0b90e 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -181,7 +181,16 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) continue; if (!kernel_text_address((unsigned long)site_addr)) { - WARN_ONCE(1, "can't patch static call site at %pS", + /* + * This skips patching built-in __exit, which + * is part of init_section_contains() but is + * not part of kernel_text_address(). + * + * Skipping built-in __exit is fine since it + * will never be executed. + */ + WARN_ONCE(!static_call_is_init(site), + "can't patch static call site at %pS", site_addr); continue; } -- cgit v1.2.3-71-gd317 From f60a85cad677c4f9bb4cadd764f1d106c38c7cf8 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 17 Mar 2021 11:09:15 +0800 Subject: bpf: Fix umd memory leak in copy_process() The syzbot reported a memleak as follows: BUG: memory leak unreferenced object 0xffff888101b41d00 (size 120): comm "kworker/u4:0", pid 8, jiffies 4294944270 (age 12.780s) backtrace: [] alloc_pid+0x66/0x560 [] copy_process+0x1465/0x25e0 [] kernel_clone+0xf3/0x670 [] kernel_thread+0x61/0x80 [] call_usermodehelper_exec_work [] call_usermodehelper_exec_work+0xc4/0x120 [] process_one_work+0x2c9/0x600 [] worker_thread+0x59/0x5d0 [] kthread+0x178/0x1b0 [] ret_from_fork+0x1f/0x30 unreferenced object 0xffff888110ef5c00 (size 232): comm "kworker/u4:0", pid 8414, jiffies 4294944270 (age 12.780s) backtrace: [] kmem_cache_zalloc [] __alloc_file+0x1f/0xf0 [] alloc_empty_file+0x69/0x120 [] alloc_file+0x33/0x1b0 [] alloc_file_pseudo+0xb2/0x140 [] create_pipe_files+0x138/0x2e0 [] umd_setup+0x33/0x220 [] call_usermodehelper_exec_async+0xb4/0x1b0 [] ret_from_fork+0x1f/0x30 After the UMD process exits, the pipe_to_umh/pipe_from_umh and tgid need to be released. 
Fixes: d71fa5c9763c ("bpf: Add kernel module with user mode driver that populates bpffs.") Reported-by: syzbot+44908bb56d2bfe56b28e@syzkaller.appspotmail.com Signed-off-by: Zqiang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210317030915.2865-1-qiang.zhang@windriver.com --- include/linux/usermode_driver.h | 1 + kernel/bpf/preload/bpf_preload_kern.c | 19 +++++++++++++++---- kernel/usermode_driver.c | 21 +++++++++++++++------ 3 files changed, 31 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h index 073a9e0ec07d..ad970416260d 100644 --- a/include/linux/usermode_driver.h +++ b/include/linux/usermode_driver.h @@ -14,5 +14,6 @@ struct umd_info { int umd_load_blob(struct umd_info *info, const void *data, size_t len); int umd_unload_blob(struct umd_info *info); int fork_usermode_driver(struct umd_info *info); +void umd_cleanup_helper(struct umd_info *info); #endif /* __LINUX_USERMODE_DRIVER_H__ */ diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 79c5772465f1..53736e52c1df 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -60,9 +60,12 @@ static int finish(void) &magic, sizeof(magic), &pos); if (n != sizeof(magic)) return -EPIPE; + tgid = umd_ops.info.tgid; - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_ops.info.tgid = NULL; + if (tgid) { + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); + umd_cleanup_helper(&umd_ops.info); + } return 0; } @@ -80,10 +83,18 @@ static int __init load_umd(void) static void __exit fini_umd(void) { + struct pid *tgid; + bpf_preload_ops = NULL; + /* kill UMD in case it's still there due to earlier error */ - kill_pid(umd_ops.info.tgid, SIGKILL, 1); - umd_ops.info.tgid = NULL; + tgid = umd_ops.info.tgid; + if (tgid) { + kill_pid(tgid, SIGKILL, 1); + + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); + umd_cleanup_helper(&umd_ops.info); + } umd_unload_blob(&umd_ops.info); } late_initcall(load_umd); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 0b35212ffc3d..bb7bb3b478ab 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -139,13 +139,22 @@ static void umd_cleanup(struct subprocess_info *info) struct umd_info *umd_info = info->data; /* cleanup if umh_setup() was successful but exec failed */ - if (info->retval) { - fput(umd_info->pipe_to_umh); - fput(umd_info->pipe_from_umh); - put_pid(umd_info->tgid); - umd_info->tgid = NULL; - } + if (info->retval) + umd_cleanup_helper(umd_info); +} + +/** + * umd_cleanup_helper - release the resources which were allocated in umd_setup + * @info: information about usermode driver + */ +void umd_cleanup_helper(struct umd_info *info) +{ + fput(info->pipe_to_umh); + fput(info->pipe_from_umh); + put_pid(info->tgid); + info->tgid = NULL; } +EXPORT_SYMBOL_GPL(umd_cleanup_helper); /** * fork_usermode_driver - fork a usermode driver -- cgit v1.2.3-71-gd317 From 81e2073c175b887398e5bca6c004efa89983f58d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Mar 2021 15:38:52 +0100 Subject: genirq: Disable interrupts for force threaded handlers With interrupt force threading all device interrupt handlers are invoked from kernel threads. Contrary to hard interrupt context the invocation only disables bottom halfs, but not interrupts. 
This was an oversight back then because any code like this will have an issue: thread(irq_A) irq_handler(A) spin_lock(&foo->lock); interrupt(irq_B) irq_handler(B) spin_lock(&foo->lock); This has been triggered with networking (NAPI vs. hrtimers) and console drivers where printk() happens from an interrupt which interrupted the force threaded handler. Now people noticed and started to change the spin_lock() in the handler to spin_lock_irqsave() which affects performance or add IRQF_NOTHREAD to the interrupt request which in turn breaks RT. Fix the root cause and not the symptom and disable interrupts before invoking the force threaded handler which preserves the regular semantics and the usefulness of the interrupt force threading as a general debugging tool. For not RT this is not changing much, except that during the execution of the threaded handler interrupts are delayed until the handler returns. Vs. scheduling and softirq processing there is no difference. For RT kernels there is no issue. Fixes: 8d32a307e4fa ("genirq: Provide forced interrupt threading") Reported-by: Johan Hovold Signed-off-by: Thomas Gleixner Reviewed-by: Johan Hovold Acked-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20210317143859.513307808@linutronix.de --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dec3f73e8db9..21ea370fccda 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1142,11 +1142,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) irqreturn_t ret; local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); local_bh_enable(); return ret; } -- cgit v1.2.3-71-gd317 From 5be28c8f85ce99ed2d329d2ad8bdd18ea19473a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 Mar 2021 19:25:13 -0600 Subject: signal: don't allow sending any signals to PF_IO_WORKER threads They don't take signals individually, and even if they share signals with the parent task, don't allow them to be delivered through the worker thread. Linux does allow this kind of behavior for regular threads, but it's really a compatability thing that we need not care about for the IO threads. Reported-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- kernel/signal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ba4d1ef39a9e..11cabcf20e7a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -833,6 +833,9 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, if (!valid_signal(sig)) return -EINVAL; + /* PF_IO_WORKER threads don't take any signals */ + if (t->flags & PF_IO_WORKER) + return -ESRCH; if (!si_fromuser(info)) return 0; -- cgit v1.2.3-71-gd317 From 4db4b1a0d1779dc159f7b87feb97030ec0b12597 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 21 Mar 2021 09:37:48 -0600 Subject: signal: don't allow STOP on PF_IO_WORKER threads Just like we don't allow normal signals to IO threads, don't deliver a STOP to a task that has PF_IO_WORKER set. The IO threads don't take signals in general, and have no means of flushing out a stop either. Longer term, we may want to look into allowing stop of these threads, as it relates to eg process freezing. 
For now, this prevents a spin issue if a SIGSTOP is delivered to the parent task.

Reported-by: Stefan Metzmacher
Signed-off-by: Jens Axboe
Signed-off-by: "Eric W. Biederman"
---
 kernel/signal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 11cabcf20e7a..f2a1b898da29 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -288,7 +288,8 @@ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
 			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
 	BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
 
-	if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+	if (unlikely(fatal_signal_pending(task) ||
+		     (task->flags & (PF_EXITING | PF_IO_WORKER))))
 		return false;
 
 	if (mask & JOBCTL_STOP_SIGMASK)
-- cgit v1.2.3-71-gd317


From fb9d62b27ab1e07d625591549c314b7d406d21df Mon Sep 17 00:00:00 2001
From: Lukasz Luba
Date: Tue, 23 Mar 2021 14:56:08 +0000
Subject: PM: EM: postpone creating the debugfs dir till fs_initcall

The debugfs directory '/sys/kernel/debug/energy_model' is needed before the Energy Model registration can happen. With the recent change in the debugfs subsystem it's not allowed to create this directory at an early stage (core_initcall). Thus creating this directory would fail.

Postpone the creation of the EM debug dir to a later stage: fs_initcall. It should be safe since all clients (CPUFreq and Devfreq drivers) will be initialized in later stages.

The custom debug log below prints the time of creation of the EM debug dir at fs_initcall and the successful registration of EMs at later stages.

  [ 1.505717] energy_model: creating rootdir
  [ 3.698307] cpu cpu0: EM: created perf domain
  [ 3.709022] cpu cpu1: EM: created perf domain

Fixes: 56348560d495 ("debugfs: do not attempt to create a new file before the filesystem is initalized")
Reported-by: Ionela Voinescu
Signed-off-by: Lukasz Luba
Reviewed-by: Greg Kroah-Hartman
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/energy_model.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 1358fa4abfa8..0f4530b3a8cd 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -98,7 +98,7 @@ static int __init em_debug_init(void)
 
 	return 0;
 }
-core_initcall(em_debug_init);
+fs_initcall(em_debug_init);
 #else /* CONFIG_DEBUG_FS */
 static void em_debug_create_pd(struct device *dev) {}
 static void em_debug_remove_pd(struct device *dev) {}
-- cgit v1.2.3-71-gd317
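
For other subsystems with the same debugfs dependency, the resulting pattern looks roughly like the sketch below (a generic illustration, not taken from energy_model.c; the names are hypothetical). fs_initcall() runs after all core_initcalls, so debugfs itself is guaranteed to be initialized by the time the directory is created:

#include <linux/debugfs.h>
#include <linux/init.h>

static struct dentry *example_debug_dir;	/* hypothetical root directory */

static int __init example_debug_init(void)
{
	/* Safe here: debugfs is set up before any fs_initcall runs. */
	example_debug_dir = debugfs_create_dir("example_model", NULL);
	return 0;
}
/* core_initcall() would be too early after commit 56348560d495. */
fs_initcall(example_debug_init);
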
From 60bcf728ee7c60ac2a1f9a0eaceb3a7b3954cd2b Mon Sep 17 00:00:00 2001
From: Nick Desaulniers
Date: Wed, 24 Mar 2021 21:37:44 -0700
Subject: gcov: fix clang-11+ support

LLVM changed the expected function signatures for llvm_gcda_start_file() and llvm_gcda_emit_function() in the clang-11 release. Users of clang-11 or newer may have noticed their kernels failing to boot due to a panic when enabling CONFIG_GCOV_KERNEL=y + CONFIG_GCOV_PROFILE_ALL=y. Fix up the function signatures so calling these functions doesn't panic the kernel.

Link: https://reviews.llvm.org/rGcdd683b516d147925212724b09ec6fb792a40041
Link: https://reviews.llvm.org/rG13a633b438b6500ecad9e4f936ebadf3411d0f44
Link: https://lkml.kernel.org/r/20210312224132.3413602-2-ndesaulniers@google.com
Signed-off-by: Nick Desaulniers
Reported-by: Prasad Sodagudi
Suggested-by: Nathan Chancellor
Reviewed-by: Fangrui Song
Tested-by: Nathan Chancellor
Acked-by: Peter Oberparleiter
Reviewed-by: Nathan Chancellor
Cc: [5.4+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/gcov/clang.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

(limited to 'kernel')

diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index c94b820a1b62..8743150db2ac 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -75,7 +75,9 @@ struct gcov_fn_info {
 	u32 num_counters;
 	u64 *counters;
+#if CONFIG_CLANG_VERSION < 110000
 	const char *function_name;
+#endif
 };
 
 static struct gcov_info *current_info;
@@ -105,6 +107,7 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
 }
 EXPORT_SYMBOL(llvm_gcov_init);
 
+#if CONFIG_CLANG_VERSION < 110000
 void llvm_gcda_start_file(const char *orig_filename, const char version[4],
 		u32 checksum)
 {
@@ -113,7 +116,17 @@ void llvm_gcda_start_file(const char *orig_filename, const char version[4],
 	current_info->checksum = checksum;
 }
 EXPORT_SYMBOL(llvm_gcda_start_file);
+#else
+void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
+{
+	current_info->filename = orig_filename;
+	current_info->version = version;
+	current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+#endif
 
+#if CONFIG_CLANG_VERSION < 110000
 void llvm_gcda_emit_function(u32 ident, const char *function_name,
 		u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
 {
@@ -133,6 +146,24 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name,
 	list_add_tail(&info->head, &current_info->functions);
 }
 EXPORT_SYMBOL(llvm_gcda_emit_function);
+#else
+void llvm_gcda_emit_function(u32 ident, u32 func_checksum,
+		u8 use_extra_checksum, u32 cfg_checksum)
+{
+	struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return;
+
+	INIT_LIST_HEAD(&info->head);
+	info->ident = ident;
+	info->checksum = func_checksum;
+	info->use_extra_checksum = use_extra_checksum;
+	info->cfg_checksum = cfg_checksum;
+	list_add_tail(&info->head, &current_info->functions);
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+#endif
 
 void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
 {
@@ -295,6 +326,7 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
 	}
 }
 
+#if CONFIG_CLANG_VERSION < 110000
 static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
 {
 	size_t cv_size; /* counter values size */
@@ -322,6 +354,28 @@ err_name:
 	kfree(fn_dup);
 	return NULL;
 }
+#else
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+	size_t cv_size; /* counter values size */
+	struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+			GFP_KERNEL);
+	if (!fn_dup)
+		return NULL;
+	INIT_LIST_HEAD(&fn_dup->head);
+
+	cv_size = fn->num_counters * sizeof(fn->counters[0]);
+	fn_dup->counters = vmalloc(cv_size);
+	if (!fn_dup->counters) {
+		kfree(fn_dup);
+		return NULL;
+	}
+
+	memcpy(fn_dup->counters, fn->counters, cv_size);
+
+	return fn_dup;
+}
+#endif
 
 /**
  * gcov_info_dup - duplicate profiling data set
@@ -362,6 +416,7 @@ err:
  * gcov_info_free - release memory for profiling data set duplicate
  * @info: profiling data set duplicate to free
  */
+#if CONFIG_CLANG_VERSION < 110000
 void gcov_info_free(struct gcov_info *info)
 {
 	struct gcov_fn_info *fn, *tmp;
@@ -375,6 +430,20 @@ void gcov_info_free(struct gcov_info *info)
 	kfree(info->filename);
 	kfree(info);
 }
+#else
+void gcov_info_free(struct gcov_info *info)
+{
+	struct gcov_fn_info *fn, *tmp;
+
+	list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+		vfree(fn->counters);
+		list_del(&fn->head);
+		kfree(fn);
+	}
+	kfree(info->filename);
+	kfree(info);
+}
+#endif
 
 #define ITER_STRIDE	PAGE_SIZE
-- cgit v1.2.3-71-gd317


From 10442994ba195efef6fdcc0c3699e4633cb5161b Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 26 Mar 2021 08:57:10 -0600
Subject: kernel: don't call do_exit() for PF_IO_WORKER threads

Right now we're never calling get_signal() from PF_IO_WORKER threads, but in preparation for doing so, don't handle a fatal signal for them. The workers have state they need to clean up when exiting, so just return instead of calling do_exit() on their behalf. The threads themselves will detect a fatal signal and do proper shutdown.

Signed-off-by: Jens Axboe
---
 kernel/signal.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f2a1b898da29..d22177d37b21 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2771,6 +2771,14 @@ relock:
 		do_coredump(&ksig->info);
 	}
 
+	/*
+	 * PF_IO_WORKER threads will catch and exit on fatal signals
+	 * themselves. They have cleanup that must be performed, so
+	 * we cannot call do_exit() on their behalf.
+	 */
+	if (current->flags & PF_IO_WORKER)
+		goto out;
+
 	/*
 	 * Death signals, no core dump.
 	 */
@@ -2778,7 +2786,7 @@ relock:
 		/* NOTREACHED */
 	}
 	spin_unlock_irq(&sighand->siglock);
-
+out:
 	ksig->sig = signr;
 
 	if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
-- cgit v1.2.3-71-gd317
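
The self-shutdown mentioned in the message above happens in the worker's own loop rather than in get_signal(); a simplified sketch of that pattern (illustrative only, not the actual fs/io-wq.c code; the example_* helpers are hypothetical):

#include <linux/sched.h>
#include <linux/sched/signal.h>

static int example_io_worker(void *data)
{
	while (!example_should_stop(data)) {	/* hypothetical helper */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);

		/*
		 * The worker observes the fatal signal itself and breaks
		 * out to release its own state, instead of get_signal()
		 * calling do_exit() on its behalf.
		 */
		if (fatal_signal_pending(current))
			break;
	}

	example_release_worker_state(data);	/* hypothetical cleanup */
	return 0;
}
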
From b16b3855d89fba640996fefdd3a113c0aa0e380d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 26 Mar 2021 09:05:22 -0600
Subject: kernel: stop masking signals in create_io_thread()

This is racy - move the blocking into when the task is created and we're marking it as PF_IO_WORKER anyway. The IO threads are now prepared to handle signals like SIGSTOP as well, so clear that from the mask to allow proper stopping of IO threads.

Acked-by: "Eric W. Biederman"
Reported-by: Oleg Nesterov
Signed-off-by: Jens Axboe
---
 kernel/fork.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d3171e8e88e5..ddaa15227071 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1940,8 +1940,14 @@ static __latent_entropy struct task_struct *copy_process(
 	p = dup_task_struct(current, node);
 	if (!p)
 		goto fork_out;
-	if (args->io_thread)
+	if (args->io_thread) {
+		/*
+		 * Mark us an IO worker, and block any signal that isn't
+		 * fatal or STOP
+		 */
 		p->flags |= PF_IO_WORKER;
+		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
+	}
 
 	/*
 	 * This _must_ happen before we call free_task(), i.e. before we jump
@@ -2430,14 +2436,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 		.stack_size	= (unsigned long)arg,
 		.io_thread	= 1,
 	};
-	struct task_struct *tsk;
 
-	tsk = copy_process(NULL, 0, node, &args);
-	if (!IS_ERR(tsk)) {
-		sigfillset(&tsk->blocked);
-		sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
-	}
-	return tsk;
+	return copy_process(NULL, 0, node, &args);
 }
 
 /*
-- cgit v1.2.3-71-gd317


From 5a842a7448bbfa9bda0a74ca4f239c1b02bb98d8 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 25 Mar 2021 18:18:15 -0600
Subject: Revert "signal: don't allow sending any signals to PF_IO_WORKER threads"

This reverts commit 5be28c8f85ce99ed2d329d2ad8bdd18ea19473a5.

IO threads now take signals just fine, so there's no reason to limit them specifically. Revert the change that prevented that from happening.

Signed-off-by: Jens Axboe
---
 kernel/signal.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index d22177d37b21..9e172b9341f4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -834,9 +834,6 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
 	if (!valid_signal(sig))
 		return -EINVAL;
 
-	/* PF_IO_WORKER threads don't take any signals */
-	if (t->flags & PF_IO_WORKER)
-		return -ESRCH;
 	if (!si_fromuser(info))
 		return 0;
-- cgit v1.2.3-71-gd317
From e8b33b8cfafcfcef287ae4c0f23a173bfcf617f3 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 25 Mar 2021 18:18:59 -0600
Subject: Revert "kernel: treat PF_IO_WORKER like PF_KTHREAD for ptrace/signals"

This reverts commit 6fb8f43cede0e4bd3ead847de78d531424a96be9.

The IO threads do allow signals now, including SIGSTOP, and we can allow ptrace attach. Attaching won't reveal anything interesting for the IO threads, but it will allow eg gdb to attach to a task with io_urings and IO threads without complaining. And once attached, it will allow the usual introspection into regular threads.

Signed-off-by: Jens Axboe
---
 kernel/ptrace.c | 2 +-
 kernel/signal.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 821cf1723814..61db50f7ca86 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request,
 	audit_ptrace(task);
 
 	retval = -EPERM;
-	if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER)))
+	if (unlikely(task->flags & PF_KTHREAD))
 		goto out;
 	if (same_thread_group(task, current))
 		goto out;
diff --git a/kernel/signal.c b/kernel/signal.c
index 9e172b9341f4..dd86841cce94 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
 		return true;
 
 	/* Only allow kernel generated signals to this kthread */
-	if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
+	if (unlikely((t->flags & PF_KTHREAD) &&
 		     (handler == SIG_KTHREAD_KERNEL) && !force))
 		return true;
 
@@ -1097,7 +1097,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
 	/*
 	 * Skip useless siginfo allocation for SIGKILL and kernel threads.
 	 */
-	if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER)))
+	if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
 		goto out_set;
 
 	/*
-- cgit v1.2.3-71-gd317


From d3dc04cd81e0eaf50b2d09ab051a13300e587439 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 25 Mar 2021 18:22:11 -0600
Subject: Revert "kernel: freezer should treat PF_IO_WORKER like PF_KTHREAD for freezing"

This reverts commit 15b2219facadec583c24523eed40fa45865f859f.

Before IO threads accepted signals, the freezer using fake signals to wake up an IO thread would cause them to loop without any way to clear the pending signal. That is no longer the case, so stop special casing PF_IO_WORKER in the freezer.

Signed-off-by: Jens Axboe
---
 kernel/freezer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index 1a2d57d1327c..dc520f01f99d 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -134,7 +134,7 @@ bool freeze_task(struct task_struct *p)
 		return false;
 	}
 
-	if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))
+	if (!(p->flags & PF_KTHREAD))
 		fake_signal_wake_up(p);
 	else
 		wake_up_state(p, TASK_INTERRUPTIBLE);
-- cgit v1.2.3-71-gd317


From 1e4cf0d3d072173ee70757ee4aec11b2839705f9 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 25 Mar 2021 18:23:44 -0600
Subject: Revert "signal: don't allow STOP on PF_IO_WORKER threads"

This reverts commit 4db4b1a0d1779dc159f7b87feb97030ec0b12597.

The IO threads allow and handle SIGSTOP now, so don't special case them anymore in task_set_jobctl_pending().

Signed-off-by: Jens Axboe
---
 kernel/signal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index dd86841cce94..f2718350bf4b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -288,8 +288,7 @@ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
 			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
 	BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
 
-	if (unlikely(fatal_signal_pending(task) ||
-		     (task->flags & (PF_EXITING | PF_IO_WORKER))))
+	if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
 		return false;
 
 	if (mask & JOBCTL_STOP_SIGMASK)
-- cgit v1.2.3-71-gd317
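
Taken together, the reverts above leave io_uring's IO worker threads handling SIGSTOP, ptrace attach and the freezer like ordinary threads. A rough userspace check of that end state could look like the following (a hypothetical test using liburing, built with -luring; io-wq workers are only spawned once the ring actually has work to do or IORING_SETUP_SQPOLL is used):

#include <liburing.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;

	if (io_uring_queue_init(8, &ring, 0) < 0) {
		perror("io_uring_queue_init");
		return 1;
	}

	/*
	 * With the series above applied, `kill -STOP <pid>` stops the task
	 * together with any io-wq workers, and `gdb -p <pid>` can attach
	 * without complaining about the worker threads.
	 */
	printf("pid %d: ring ready, send SIGSTOP or attach gdb\n", getpid());
	pause();

	io_uring_queue_exit(&ring);
	return 0;
}
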