From 5d5e4522a7f404d1a96fd6c703989d32a9c9568d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 7 Nov 2021 14:51:16 +1000 Subject: printk: restore flushing of NMI buffers on remote CPUs after NMI backtraces printk from NMI context relies on irq work being raised on the local CPU to print to console. This can be a problem if the NMI was raised by a lockup detector to print lockup stack and regs, because the CPU may not enable irqs (because it is locked up). Introduce printk_trigger_flush() that can be called another CPU to try to get those messages to the console, call that where printk_safe_flush was previously called. Fixes: 93d102f094be ("printk: remove safe buffers") Cc: stable@vger.kernel.org # 5.15 Signed-off-by: Nicholas Piggin Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211107045116.1754411-1-npiggin@gmail.com --- kernel/printk/printk.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 65fffa6368c9..eabe23b0a982 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3261,6 +3261,11 @@ void defer_console_output(void) preempt_enable(); } +void printk_trigger_flush(void) +{ + defer_console_output(); +} + int vprintk_deferred(const char *fmt, va_list args) { int r; -- cgit v1.2.3-71-gd317 From 34d11a440c6167133201b7374065b59f259730d7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 10 Nov 2021 09:25:56 -0800 Subject: bpf: Fix inner map state pruning regression. Introduction of map_uid made two lookups from outer map to be distinct. That distinction is only necessary when inner map has an embedded timer. Otherwise it will make the verifier state pruning to be conservative which will cause complex programs to hit 1M insn_processed limit. Tighten map_uid logic to apply to inner maps with timers only. Fixes: 3e8ce29850f1 ("bpf: Prevent pointer mismatch in bpf_timer_init.") Reported-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/CACAyw99hVEJFoiBH_ZGyy=+oO-jyydoz6v1DeKPKs2HVsUH28w@mail.gmail.com Link: https://lore.kernel.org/bpf/20211110172556.20754-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 890b3ec375a3..aab7482ed1c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1151,7 +1151,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - reg->map_uid = reg->id; + if (map_value_has_timer(map->inner_map_meta)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || -- cgit v1.2.3-71-gd317 From 938aa33f14657c9ed9deea348b7d6f14b6d69cb7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 14 Nov 2021 13:28:34 -0500 Subject: tracing: Add length protection to histogram string copies The string copies to the histogram storage has a max size of 256 bytes (defined by MAX_FILTER_STR_VAL). Only the string size of the event field needs to be copied to the event storage, but no more than what is in the event storage. Although nothing should be bigger than 256 bytes, there's no protection against overwriting of the storage if one day there is. Copy no more than the destination size, and enforce it. Also had to turn MAX_FILTER_STR_VAL into an unsigned int, to keep the min() comparison of the string sizes of comparable types. Link: https://lore.kernel.org/all/CAHk-=wjREUihCGrtRBwfX47y_KrLCGjiq3t6QtoNJpmVrAEb1w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211114132834.183429a4@rorschach.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Tom Zanussi Reported-by: Linus Torvalds Reviewed-by: Masami Hiramatsu Fixes: 63f84ae6b82b ("tracing/histogram: Do not copy the fixed-size char array field over the field size") Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 +- kernel/trace/trace_events_hist.c | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 50453b287615..2d167ac3452c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,7 +673,7 @@ struct trace_event_file { #define PERF_MAX_TRACE_SIZE 8192 -#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ +#define MAX_FILTER_STR_VAL 256U /* Should handle KSYM_SYMBOL_LEN */ enum event_trigger_type { ETT_NONE = (0), diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1475d7347fe0..34afcaebd0e5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3026,8 +3026,10 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, if (val->flags & HIST_FIELD_FL_STRING) { char *str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; + unsigned int size; - strscpy(str, val_str, val->size); + size = min(val->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); @@ -4914,6 +4916,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, if (hist_field->flags & HIST_FIELD_FL_STRING) { unsigned int str_start, var_str_idx, idx; char *str, *val_str; + unsigned int size; str_start = hist_data->n_field_var_str + hist_data->n_save_var_str; @@ -4922,7 +4925,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, str = elt_data->field_var_str[idx]; val_str = (char *)(uintptr_t)hist_val; - strscpy(str, val_str, hist_field->size); + + size = min(hist_field->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); hist_val = (u64)(uintptr_t)str; } -- cgit v1.2.3-71-gd317 From 5e0bc3082e2e403ac0753e099c2b01446bb35578 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Nov 2021 18:22:26 +0400 Subject: bpf: Forbid bpf_ktime_get_coarse_ns and bpf_timer_* in tracing progs Use of bpf_ktime_get_coarse_ns() and bpf_timer_* helpers in tracing progs may result in locking issues. bpf_ktime_get_coarse_ns() uses ktime_get_coarse_ns() time accessor that isn't safe for any context: ====================================================== WARNING: possible circular locking dependency detected 5.15.0-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.4/14877 is trying to acquire lock: ffffffff8cb30008 (tk_core.seq.seqcount){----}-{0:0}, at: ktime_get_coarse_ts64+0x25/0x110 kernel/time/timekeeping.c:2255 but task is already holding lock: ffffffff90dbf200 (&obj_hash[i].lock){-.-.}-{2:2}, at: debug_object_deactivate+0x61/0x400 lib/debugobjects.c:735 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&obj_hash[i].lock){-.-.}-{2:2}: lock_acquire+0x19f/0x4d0 kernel/locking/lockdep.c:5625 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0xd1/0x120 kernel/locking/spinlock.c:162 __debug_object_init+0xd9/0x1860 lib/debugobjects.c:569 debug_hrtimer_init kernel/time/hrtimer.c:414 [inline] debug_init kernel/time/hrtimer.c:468 [inline] hrtimer_init+0x20/0x40 kernel/time/hrtimer.c:1592 ntp_init_cmos_sync kernel/time/ntp.c:676 [inline] ntp_init+0xa1/0xad kernel/time/ntp.c:1095 timekeeping_init+0x512/0x6bf kernel/time/timekeeping.c:1639 start_kernel+0x267/0x56e init/main.c:1030 secondary_startup_64_no_verify+0xb1/0xbb -> #0 (tk_core.seq.seqcount){----}-{0:0}: check_prev_add kernel/locking/lockdep.c:3051 [inline] check_prevs_add kernel/locking/lockdep.c:3174 [inline] validate_chain+0x1dfb/0x8240 kernel/locking/lockdep.c:3789 __lock_acquire+0x1382/0x2b00 kernel/locking/lockdep.c:5015 lock_acquire+0x19f/0x4d0 kernel/locking/lockdep.c:5625 seqcount_lockdep_reader_access+0xfe/0x230 include/linux/seqlock.h:103 ktime_get_coarse_ts64+0x25/0x110 kernel/time/timekeeping.c:2255 ktime_get_coarse include/linux/timekeeping.h:120 [inline] ktime_get_coarse_ns include/linux/timekeeping.h:126 [inline] ____bpf_ktime_get_coarse_ns kernel/bpf/helpers.c:173 [inline] bpf_ktime_get_coarse_ns+0x7e/0x130 kernel/bpf/helpers.c:171 bpf_prog_a99735ebafdda2f1+0x10/0xb50 bpf_dispatcher_nop_func include/linux/bpf.h:721 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] BPF_PROG_RUN_ARRAY include/linux/bpf.h:1294 [inline] trace_call_bpf+0x2cf/0x5d0 kernel/trace/bpf_trace.c:127 perf_trace_run_bpf_submit+0x7b/0x1d0 kernel/events/core.c:9708 perf_trace_lock+0x37c/0x440 include/trace/events/lock.h:39 trace_lock_release+0x128/0x150 include/trace/events/lock.h:58 lock_release+0x82/0x810 kernel/locking/lockdep.c:5636 __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:149 [inline] _raw_spin_unlock_irqrestore+0x75/0x130 kernel/locking/spinlock.c:194 debug_hrtimer_deactivate kernel/time/hrtimer.c:425 [inline] debug_deactivate kernel/time/hrtimer.c:481 [inline] __run_hrtimer kernel/time/hrtimer.c:1653 [inline] __hrtimer_run_queues+0x2f9/0xa60 kernel/time/hrtimer.c:1749 hrtimer_interrupt+0x3b3/0x1040 kernel/time/hrtimer.c:1811 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1086 [inline] __sysvec_apic_timer_interrupt+0xf9/0x270 arch/x86/kernel/apic/apic.c:1103 sysvec_apic_timer_interrupt+0x8c/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline] _raw_spin_unlock_irqrestore+0xd4/0x130 kernel/locking/spinlock.c:194 try_to_wake_up+0x702/0xd20 kernel/sched/core.c:4118 wake_up_process kernel/sched/core.c:4200 [inline] wake_up_q+0x9a/0xf0 kernel/sched/core.c:953 futex_wake+0x50f/0x5b0 kernel/futex/waitwake.c:184 do_futex+0x367/0x560 kernel/futex/syscalls.c:127 __do_sys_futex kernel/futex/syscalls.c:199 [inline] __se_sys_futex+0x401/0x4b0 kernel/futex/syscalls.c:180 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae There is a possible deadlock with bpf_timer_* set of helpers: hrtimer_start() lock_base(); trace_hrtimer...() perf_event() bpf_run() bpf_timer_start() hrtimer_start() lock_base() <- DEADLOCK Forbid use of bpf_ktime_get_coarse_ns() and bpf_timer_* helpers in BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_PERF_EVENT and BPF_PROG_TYPE_RAW_TRACEPOINT prog types. Fixes: d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper") Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Reported-by: syzbot+43fd005b5a1b4d10781e@syzkaller.appspotmail.com Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211113142227.566439-2-me@ubique.spb.ru --- kernel/bpf/cgroup.c | 2 ++ kernel/bpf/helpers.c | 2 -- kernel/bpf/verifier.c | 7 +++++++ kernel/trace/bpf_trace.c | 2 -- net/core/filter.c | 6 ++++++ net/ipv4/bpf_tcp_ca.c | 2 ++ 6 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 2ca643af9a54..43eb3501721b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1809,6 +1809,8 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_new_value_proto; case BPF_FUNC_sysctl_set_new_value: return &bpf_sysctl_set_new_value_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return cgroup_base_func_proto(func_id, prog); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ffd469c217f..649f07623df6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1364,8 +1364,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_ktime_get_coarse_ns: - return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aab7482ed1c3..65d2f93b7030 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11632,6 +11632,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (map_value_has_timer(map)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_timer yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7396488793ff..ae9755037b7e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1111,8 +1111,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_ktime_get_coarse_ns: - return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/net/core/filter.c b/net/core/filter.c index e471c9b09670..6102f093d59a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7162,6 +7162,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } @@ -10327,6 +10329,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, return &sk_reuseport_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } @@ -10833,6 +10837,8 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 2cf02b4d77fb..4bb9401b0a3f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -205,6 +205,8 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, offsetof(struct tcp_congestion_ops, release)) return &bpf_sk_getsockopt_proto; return NULL; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-71-gd317 From 353050be4c19e102178ccc05988101887c25ae53 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Nov 2021 18:48:08 +0000 Subject: bpf: Fix toctou on read-only map's constant scalar tracking Commit a23740ec43ba ("bpf: Track contents of read-only maps as scalars") is checking whether maps are read-only both from BPF program side and user space side, and then, given their content is constant, reading out their data via map->ops->map_direct_value_addr() which is then subsequently used as known scalar value for the register, that is, it is marked as __mark_reg_known() with the read value at verification time. Before a23740ec43ba, the register content was marked as an unknown scalar so the verifier could not make any assumptions about the map content. The current implementation however is prone to a TOCTOU race, meaning, the value read as known scalar for the register is not guaranteed to be exactly the same at a later point when the program is executed, and as such, the prior made assumptions of the verifier with regards to the program will be invalid which can cause issues such as OOB access, etc. While the BPF_F_RDONLY_PROG map flag is always fixed and required to be specified at map creation time, the map->frozen property is initially set to false for the map given the map value needs to be populated, e.g. for global data sections. Once complete, the loader "freezes" the map from user space such that no subsequent updates/deletes are possible anymore. For the rest of the lifetime of the map, this freeze one-time trigger cannot be undone anymore after a successful BPF_MAP_FREEZE cmd return. Meaning, any new BPF_* cmd calls which would update/delete map entries will be rejected with -EPERM since map_get_sys_perms() removes the FMODE_CAN_WRITE permission. This also means that pending update/delete map entries must still complete before this guarantee is given. This corner case is not an issue for loaders since they create and prepare such program private map in successive steps. However, a malicious user is able to trigger this TOCTOU race in two different ways: i) via userfaultfd, and ii) via batched updates. For i) userfaultfd is used to expand the competition interval, so that map_update_elem() can modify the contents of the map after map_freeze() and bpf_prog_load() were executed. This works, because userfaultfd halts the parallel thread which triggered a map_update_elem() at the time where we copy key/value from the user buffer and this already passed the FMODE_CAN_WRITE capability test given at that time the map was not "frozen". Then, the main thread performs the map_freeze() and bpf_prog_load(), and once that had completed successfully, the other thread is woken up to complete the pending map_update_elem() which then changes the map content. For ii) the idea of the batched update is similar, meaning, when there are a large number of updates to be processed, it can increase the competition interval between the two. It is therefore possible in practice to modify the contents of the map after executing map_freeze() and bpf_prog_load(). One way to fix both i) and ii) at the same time is to expand the use of the map's map->writecnt. The latter was introduced in fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") and further refined in 1f6cb19be2e2 ("bpf: Prevent re-mmap()'ing BPF map as writable for initially r/o mapping") with the rationale to make a writable mmap()'ing of a map mutually exclusive with read-only freezing. The counter indicates writable mmap() mappings and then prevents/fails the freeze operation. Its semantics can be expanded beyond just mmap() by generally indicating ongoing write phases. This would essentially span any parallel regular and batched flavor of update/delete operation and then also have map_freeze() fail with -EBUSY. For the check_mem_access() in the verifier we expand upon the bpf_map_is_rdonly() check ensuring that all last pending writes have completed via bpf_map_write_active() test. Once the map->frozen is set and bpf_map_write_active() indicates a map->writecnt of 0 only then we are really guaranteed to use the map's data as known constants. For map->frozen being set and pending writes in process of still being completed we fall back to marking that register as unknown scalar so we don't end up making assumptions about it. With this, both TOCTOU reproducers from i) and ii) are fixed. Note that the map->writecnt has been converted into a atomic64 in the fix in order to avoid a double freeze_mutex mutex_{un,}lock() pair when updating map->writecnt in the various map update/delete BPF_* cmd flavors. Spanning the freeze_mutex over entire map update/delete operations in syscall side would not be possible due to then causing everything to be serialized. Similarly, something like synchronize_rcu() after setting map->frozen to wait for update/deletes to complete is not possible either since it would also have to span the user copy which can sleep. On the libbpf side, this won't break d66562fba1ce ("libbpf: Add BPF object skeleton support") as the anonymous mmap()-ed "map initialization image" is remapped as a BPF map-backed mmap()-ed memory where for .rodata it's non-writable. Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: w1tcher.bupt@gmail.com Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++++++++++------------------- kernel/bpf/verifier.c | 17 ++++++++++++++- 3 files changed, 54 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f715e8863f4d..e7a163a3146b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -193,7 +193,7 @@ struct bpf_map { atomic64_t usercnt; struct work_struct work; struct mutex freeze_mutex; - u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ + atomic64_t writecnt; }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -1419,6 +1419,7 @@ void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); +bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50f96ea4452a..1033ee8c0caf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -132,6 +132,21 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } +static void bpf_map_write_active_inc(struct bpf_map *map) +{ + atomic64_inc(&map->writecnt); +} + +static void bpf_map_write_active_dec(struct bpf_map *map) +{ + atomic64_dec(&map->writecnt); +} + +bool bpf_map_write_active(const struct bpf_map *map) +{ + return atomic64_read(&map->writecnt) != 0; +} + static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -601,11 +616,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) { - mutex_lock(&map->freeze_mutex); - map->writecnt++; - mutex_unlock(&map->freeze_mutex); - } + if (vma->vm_flags & VM_MAYWRITE) + bpf_map_write_active_inc(map); } /* called for all unmapped memory region (including initial) */ @@ -613,11 +625,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) { - mutex_lock(&map->freeze_mutex); - map->writecnt--; - mutex_unlock(&map->freeze_mutex); - } + if (vma->vm_flags & VM_MAYWRITE) + bpf_map_write_active_dec(map); } static const struct vm_operations_struct bpf_map_default_vmops = { @@ -668,7 +677,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) goto out; if (vma->vm_flags & VM_MAYWRITE) - map->writecnt++; + bpf_map_write_active_inc(map); out: mutex_unlock(&map->freeze_mutex); return err; @@ -1139,6 +1148,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1174,6 +1184,7 @@ free_value: free_key: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1196,6 +1207,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1226,6 +1238,7 @@ static int map_delete_elem(union bpf_attr *attr) out: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1533,6 +1546,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; @@ -1597,6 +1611,7 @@ free_value: free_key: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1624,8 +1639,7 @@ static int map_freeze(const union bpf_attr *attr) } mutex_lock(&map->freeze_mutex); - - if (map->writecnt) { + if (bpf_map_write_active(map)) { err = -EBUSY; goto err_put; } @@ -4171,6 +4185,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr, union bpf_attr __user *uattr, int cmd) { + bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; + bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; int err, ufd; struct fd f; @@ -4183,16 +4200,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if ((cmd == BPF_MAP_LOOKUP_BATCH || - cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && - !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + if (has_write) + bpf_map_write_active_inc(map); + if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } - - if (cmd != BPF_MAP_LOOKUP_BATCH && - !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -4205,8 +4219,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr, BPF_DO_BATCH(map->ops->map_update_batch); else BPF_DO_BATCH(map->ops->map_delete_batch); - err_put: + if (has_write) + bpf_map_write_active_dec(map); fdput(f); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 65d2f93b7030..50efda51515b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4056,7 +4056,22 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static bool bpf_map_is_rdonly(const struct bpf_map *map) { - return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; + /* A map is considered read-only if the following condition are true: + * + * 1) BPF program side cannot change any of the map content. The + * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map + * and was set at map creation time. + * 2) The map value(s) have been initialized from user space by a + * loader and then "frozen", such that no new map update/delete + * operations from syscall side are possible for the rest of + * the map's lifetime from that point onwards. + * 3) Any parallel/pending map update/delete operations from syscall + * side have been completed. Only after that point, it's safe to + * assume that map value(s) are immutable. + */ + return (map->map_flags & BPF_F_RDONLY_PROG) && + READ_ONCE(map->frozen) && + !bpf_map_write_active(map); } static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) -- cgit v1.2.3-71-gd317 From f86b0aaad741c45aba5a84a27277dd56a96808ba Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 17 Nov 2021 17:15:42 -0800 Subject: tracing/histogram: Fix UAF in destroy_hist_field() Calling destroy_hist_field() on an expression will recursively free any operands associated with the expression. If during expression parsing the operands of the expression are already set when an error is encountered, there is no need to explicity free the operands. Doing so will result in destroy_hist_field() being called twice for the operands and lead to a use-after-free (UAF) error. If the operands are associated with the expression, only call destroy_hist_field() on the expression since the operands will be recursively freed. Link: https://lore.kernel.org/all/CAHk-=wgcrEbFgkw9720H3tW-AhHOoEKhYwZinYJw4FpzSaJ6_Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211118011542.1420131-1-kaleshsingh@google.com Suggested-by: Linus Torvalds Signed-off-by: Kalesh Singh Fixes: 8b5d46fd7a38 ("tracing/histogram: Optimize division by constants") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 41 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5ea2c9ec54a6..9555b8e1d1e3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2576,28 +2576,27 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, /* Split the expression string at the root operator */ if (!sep) - goto free; + return ERR_PTR(-EINVAL); + *sep = '\0'; operand1_str = str; str = sep+1; /* Binary operator requires both operands */ if (*operand1_str == '\0' || *str == '\0') - goto free; + return ERR_PTR(-EINVAL); operand_flags = 0; /* LHS of string is an expression e.g. a+b in a+b+c */ operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs); - if (IS_ERR(operand1)) { - ret = PTR_ERR(operand1); - operand1 = NULL; - goto free; - } + if (IS_ERR(operand1)) + return ERR_CAST(operand1); + if (operand1->flags & HIST_FIELD_FL_STRING) { hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str)); ret = -EINVAL; - goto free; + goto free_op1; } /* RHS of string is another expression e.g. c in a+b+c */ @@ -2605,13 +2604,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand2)) { ret = PTR_ERR(operand2); - operand2 = NULL; - goto free; + goto free_op1; } if (operand2->flags & HIST_FIELD_FL_STRING) { hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str)); ret = -EINVAL; - goto free; + goto free_operands; } switch (field_op) { @@ -2629,12 +2627,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, break; default: ret = -EINVAL; - goto free; + goto free_operands; } ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2); if (ret) - goto free; + goto free_operands; operand_flags = var1 ? var1->flags : operand1->flags; operand2_flags = var2 ? var2->flags : operand2->flags; @@ -2653,12 +2651,13 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr = create_hist_field(hist_data, NULL, flags, var_name); if (!expr) { ret = -ENOMEM; - goto free; + goto free_operands; } operand1->read_once = true; operand2->read_once = true; + /* The operands are now owned and free'd by 'expr' */ expr->operands[0] = operand1; expr->operands[1] = operand2; @@ -2669,7 +2668,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, if (!divisor) { hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str)); ret = -EDOM; - goto free; + goto free_expr; } /* @@ -2709,18 +2708,22 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; - goto free; + goto free_expr; } expr->name = expr_str(expr, 0); } return expr; -free: - destroy_hist_field(operand1, 0); + +free_operands: destroy_hist_field(operand2, 0); - destroy_hist_field(expr, 0); +free_op1: + destroy_hist_field(operand1, 0); + return ERR_PTR(ret); +free_expr: + destroy_hist_field(expr, 0); return ERR_PTR(ret); } -- cgit v1.2.3-71-gd317 From c4c1dbcc09e723295969a62aff401815b7ee15f4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Nov 2021 12:22:17 -0800 Subject: tracing: Use memset_startat() to zero struct trace_iterator In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memset(), avoid intentionally writing across neighboring fields. Use memset_startat() to avoid confusing memset() about writing beyond the target struct member. Link: https://lkml.kernel.org/r/20211118202217.1285588-1-keescook@chromium.org Signed-off-by: Kees Cook Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f9139dc1262c..e3c80cfd4eec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6706,9 +6706,7 @@ waitagain: cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - memset(&iter->seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + memset_startat(iter, 0, seq); cpumask_clear(iter->started); trace_seq_init(&iter->seq); iter->pos = -1; -- cgit v1.2.3-71-gd317 From 2ef75e9bd2c998f1c6f6f23a3744136105ddefd5 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Thu, 18 Nov 2021 17:55:16 +0300 Subject: tracing: Don't use out-of-sync va_list in event printing If trace_seq becomes full, trace_seq_vprintf() no longer consumes arguments from va_list, making va_list out of sync with format processing by trace_check_vprintf(). This causes va_arg() in trace_check_vprintf() to return wrong positional argument, which results into a WARN_ON_ONCE() hit. ftrace_stress_test from LTP triggers this situation. Fix it by explicitly avoiding further use if va_list at the point when it's consistency can no longer be guaranteed. Link: https://lkml.kernel.org/r/20211118145516.13219-1-nikita.yushchenko@virtuozzo.com Signed-off-by: Nikita Yushchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3c80cfd4eec..88de94da596b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3812,6 +3812,18 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, iter->fmt[i] = '\0'; trace_seq_vprintf(&iter->seq, iter->fmt, ap); + /* + * If iter->seq is full, the above call no longer guarantees + * that ap is in sync with fmt processing, and further calls + * to va_arg() can return wrong positional arguments. + * + * Ensure that ap is no longer used in this case. + */ + if (iter->seq.full) { + p = ""; + break; + } + if (star) len = va_arg(ap, int); -- cgit v1.2.3-71-gd317 From e349d945fac76bddc78ae1cb92a0145b427a87ce Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Nov 2021 11:11:13 -0600 Subject: signal: Don't always set SA_IMMUTABLE for forced signals Recently to prevent issues with SECCOMP_RET_KILL and similar signals being changed before they are delivered SA_IMMUTABLE was added. Unfortunately this broke debuggers[1][2] which reasonably expect to be able to trap synchronous SIGTRAP and SIGSEGV even when the target process is not configured to handle those signals. Update force_sig_to_task to support both the case when we can allow the debugger to intercept and possibly ignore the signal and the case when it is not safe to let userspace know about the signal until the process has exited. Suggested-by: Linus Torvalds Reported-by: Kyle Huey Reported-by: kernel test robot Cc: stable@vger.kernel.org [1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com [2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020 Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Link: https://lkml.kernel.org/r/877dd5qfw5.fsf_-_@email.froward.int.ebiederm.org Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7c4b7ae714d4..7815e1bbeddc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1298,6 +1298,12 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p return ret; } +enum sig_handler { + HANDLER_CURRENT, /* If reachable use the current handler */ + HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */ + HANDLER_EXIT, /* Only visible as the process exit code */ +}; + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1310,7 +1316,8 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p * that is why we also clear SIGNAL_UNKILLABLE. */ static int -force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool sigdfl) +force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, + enum sig_handler handler) { unsigned long int flags; int ret, blocked, ignored; @@ -1321,9 +1328,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); - if (blocked || ignored || sigdfl) { + if (blocked || ignored || (handler != HANDLER_CURRENT)) { action->sa.sa_handler = SIG_DFL; - action->sa.sa_flags |= SA_IMMUTABLE; + if (handler == HANDLER_EXIT) + action->sa.sa_flags |= SA_IMMUTABLE; if (blocked) { sigdelset(&t->blocked, sig); recalc_sigpending_and_wake(t); @@ -1343,7 +1351,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool int force_sig_info(struct kernel_siginfo *info) { - return force_sig_info_to_task(info, current, false); + return force_sig_info_to_task(info, current, HANDLER_CURRENT); } /* @@ -1660,7 +1668,7 @@ void force_fatal_sig(int sig) info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; - force_sig_info_to_task(&info, current, true); + force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } /* @@ -1693,7 +1701,7 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_flags = flags; info.si_isr = isr; #endif - return force_sig_info_to_task(&info, t, false); + return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } int force_sig_fault(int sig, int code, void __user *addr @@ -1813,7 +1821,8 @@ int force_sig_seccomp(int syscall, int reason, bool force_coredump) info.si_errno = reason; info.si_arch = syscall_get_arch(current); info.si_syscall = syscall; - return force_sig_info_to_task(&info, current, force_coredump); + return force_sig_info_to_task(&info, current, + force_coredump ? HANDLER_EXIT : HANDLER_CURRENT); } /* For the crazy architectures that include trap information in -- cgit v1.2.3-71-gd317 From fcb116bc43c8c37c052530ead79872f8b2615711 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Nov 2021 14:23:21 -0600 Subject: signal: Replace force_fatal_sig with force_exit_sig when in doubt Recently to prevent issues with SECCOMP_RET_KILL and similar signals being changed before they are delivered SA_IMMUTABLE was added. Unfortunately this broke debuggers[1][2] which reasonably expect to be able to trap synchronous SIGTRAP and SIGSEGV even when the target process is not configured to handle those signals. Add force_exit_sig and use it instead of force_fatal_sig where historically the code has directly called do_exit. This has the implementation benefits of going through the signal exit path (including generating core dumps) without the danger of allowing userspace to ignore or change these signals. This avoids userspace regressions as older kernels exited with do_exit which debuggers also can not intercept. In the future is should be possible to improve the quality of implementation of the kernel by changing some of these force_exit_sig calls to force_fatal_sig. That can be done where it matters on a case-by-case basis with careful analysis. Reported-by: Kyle Huey Reported-by: kernel test robot [1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com [2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020 Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Fixes: a3616a3c0272 ("signal/m68k: Use force_sigsegv(SIGSEGV) in fpsp040_die") Fixes: 83a1f27ad773 ("signal/powerpc: On swapcontext failure force SIGSEGV") Fixes: 9bc508cf0791 ("signal/s390: Use force_sigsegv in default_trap_handler") Fixes: 086ec444f866 ("signal/sparc32: In setup_rt_frame and setup_fram use force_fatal_sig") Fixes: c317d306d550 ("signal/sparc32: Exit with a fatal signal when try_to_clear_window_buffer fails") Fixes: 695dd0d634df ("signal/x86: In emulate_vsyscall force a signal instead of calling do_exit") Fixes: 1fbd60df8a85 ("signal/vm86_32: Properly send SIGSEGV when the vm86 state cannot be saved.") Fixes: 941edc5bf174 ("exit/syscall_user_dispatch: Send ordinary signals on failure") Link: https://lkml.kernel.org/r/871r3dqfv8.fsf_-_@email.froward.int.ebiederm.org Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: "Eric W. Biederman" --- arch/m68k/kernel/traps.c | 2 +- arch/powerpc/kernel/signal_32.c | 2 +- arch/powerpc/kernel/signal_64.c | 4 ++-- arch/s390/kernel/traps.c | 2 +- arch/sparc/kernel/signal_32.c | 4 ++-- arch/sparc/kernel/windows.c | 2 +- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- include/linux/sched/signal.h | 1 + kernel/entry/syscall_user_dispatch.c | 4 ++-- kernel/signal.c | 13 +++++++++++++ 11 files changed, 26 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 99058a6da956..34d6458340b0 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -1145,7 +1145,7 @@ asmlinkage void set_esp0(unsigned long ssp) */ asmlinkage void fpsp040_die(void) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); } #ifdef CONFIG_M68KFPU_EMU diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 00a9c9cd6d42..3e053e2fd6b6 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1063,7 +1063,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, * We kill the task with a SIGSEGV in this situation. */ if (do_setcontext(new_ctx, regs, 0)) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index ef518535d436..d1e1fc0acbea 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -704,7 +704,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, */ if (__get_user_sigset(&set, &new_ctx->uc_sigmask)) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } set_current_blocked(&set); @@ -713,7 +713,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, return -EFAULT; if (__unsafe_restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) { user_read_access_end(); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } user_read_access_end(); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 035705c9f23e..2b780786fc68 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -84,7 +84,7 @@ static void default_trap_handler(struct pt_regs *regs) { if (user_mode(regs)) { report_user_fault(regs, SIGSEGV, 0); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); } else die(regs, "Unknown program exception"); } diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index cd677bc564a7..ffab16369bea 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -244,7 +244,7 @@ static int setup_frame(struct ksignal *ksig, struct pt_regs *regs, get_sigframe(ksig, regs, sigframe_size); if (invalid_frame_pointer(sf, sigframe_size)) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return -EINVAL; } @@ -336,7 +336,7 @@ static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs, sf = (struct rt_signal_frame __user *) get_sigframe(ksig, regs, sigframe_size); if (invalid_frame_pointer(sf, sigframe_size)) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return -EINVAL; } diff --git a/arch/sparc/kernel/windows.c b/arch/sparc/kernel/windows.c index bbbd40cc6b28..8f20862ccc83 100644 --- a/arch/sparc/kernel/windows.c +++ b/arch/sparc/kernel/windows.c @@ -122,7 +122,7 @@ void try_to_clear_window_buffer(struct pt_regs *regs, int who) if ((sp & 7) || copy_to_user((char __user *) sp, &tp->reg_window[window], sizeof(struct reg_window32))) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return; } } diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 0b6b277ee050..fd2ee9408e91 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -226,7 +226,7 @@ bool emulate_vsyscall(unsigned long error_code, if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); - force_fatal_sig(SIGSYS); + force_exit_sig(SIGSYS); return true; } regs->orig_ax = -1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index cce1c89cb7df..c21bcd668284 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -160,7 +160,7 @@ Efault_end: user_access_end(); Efault: pr_alert("could not access userspace vm86 info\n"); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); goto exit_vm86; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23505394ef70..33a50642cf41 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -352,6 +352,7 @@ extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); +extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 4508201847d2..0b6379adff6b 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -48,7 +48,7 @@ bool syscall_user_dispatch(struct pt_regs *regs) * the selector is loaded by userspace. */ if (unlikely(__get_user(state, sd->selector))) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return true; } @@ -56,7 +56,7 @@ bool syscall_user_dispatch(struct pt_regs *regs) return false; if (state != SYSCALL_DISPATCH_FILTER_BLOCK) { - force_fatal_sig(SIGSYS); + force_exit_sig(SIGSYS); return true; } } diff --git a/kernel/signal.c b/kernel/signal.c index 7815e1bbeddc..a629b11bf3e0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1671,6 +1671,19 @@ void force_fatal_sig(int sig) force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } +void force_exit_sig(int sig) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + force_sig_info_to_task(&info, current, HANDLER_EXIT); +} + /* * When things go south during signal handling, we * will force a SIGSEGV. And if the signal that caused -- cgit v1.2.3-71-gd317 From d257cc8cb8d5355ffc43a96bab94db7b5a324803 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 Nov 2021 20:29:12 -0500 Subject: locking/rwsem: Make handoff bit handling more consistent There are some inconsistency in the way that the handoff bit is being handled in readers and writers that lead to a race condition. Firstly, when a queue head writer set the handoff bit, it will clear it when the writer is being killed or interrupted on its way out without acquiring the lock. That is not the case for a queue head reader. The handoff bit will simply be inherited by the next waiter. Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both the waiter and handoff bits are cleared if the wait queue becomes empty. For rwsem_down_write_slowpath(), however, the handoff bit is not checked and cleared if the wait queue is empty. This can potentially make the handoff bit set with empty wait queue. Worse, the situation in rwsem_down_write_slowpath() relies on wstate, a variable set outside of the critical section containing the ->count manipulation, this leads to race condition where RWSEM_FLAG_HANDOFF can be double subtracted, corrupting ->count. To make the handoff bit handling more consistent and robust, extract out handoff bit clearing code into the new rwsem_del_waiter() helper function. Also, completely eradicate wstate; always evaluate everything inside the same critical section. The common function will only use atomic_long_andnot() to clear bits when the wait queue is empty to avoid possible race condition. If the first waiter with handoff bit set is killed or interrupted to exit the slowpath without acquiring the lock, the next waiter will inherit the handoff bit. While at it, simplify the trylock for loop in rwsem_down_write_slowpath() to make it easier to read. Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation") Reported-by: Zhenhua Ma Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com --- kernel/locking/rwsem.c | 171 ++++++++++++++++++++++++------------------------- 1 file changed, 85 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c51387a43265..e039cf1605af 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,9 +105,9 @@ * atomic_long_cmpxchg() will be used to obtain writer lock. * * There are three places where the lock handoff bit may be set or cleared. - * 1) rwsem_mark_wake() for readers. - * 2) rwsem_try_write_lock() for writers. - * 3) Error path of rwsem_down_write_slowpath(). + * 1) rwsem_mark_wake() for readers -- set, clear + * 2) rwsem_try_write_lock() for writers -- set, clear + * 3) rwsem_del_waiter() -- clear * * For all the above cases, wait_lock will be held. A writer must also * be the first one in the wait_list to be eligible for setting the handoff @@ -334,6 +334,9 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; + + /* Writer only, not initialized in reader */ + bool handoff_set; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -344,12 +347,6 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; -enum writer_wait_state { - WRITER_NOT_FIRST, /* Writer is not first in wait list */ - WRITER_FIRST, /* Writer is first in wait list */ - WRITER_HANDOFF /* Writer is first & handoff needed */ -}; - /* * The typical HZ value is either 250 or 1000. So set the minimum waiting * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait @@ -365,6 +362,31 @@ enum writer_wait_state { */ #define MAX_READERS_WAKEUP 0x100 +static inline void +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_add_tail(&waiter->list, &sem->wait_list); + /* caller will set RWSEM_FLAG_WAITERS */ +} + +/* + * Remove a waiter from the wait_list and clear flags. + * + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of + * this function. Modify with care. + */ +static inline void +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_del(&waiter->list); + if (likely(!list_empty(&sem->wait_list))) + return; + + atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -376,6 +398,8 @@ enum writer_wait_state { * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false + * + * Implies rwsem_del_waiter() for all woken readers. */ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, @@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); + + oldcount = atomic_long_read(&sem->count); if (list_empty(&sem->wait_list)) { - /* hit end of list above */ + /* + * Combined with list_move_tail() above, this implies + * rwsem_del_waiter(). + */ adjustment -= RWSEM_FLAG_WAITERS; + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } else if (woken) { + /* + * When we've woken a reader, we no longer need to force + * writers to give up the lock and we can clear HANDOFF. + */ + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; } - /* - * When we've woken a reader, we no longer need to force writers - * to give up the lock and we can clear HANDOFF. - */ - if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) - adjustment -= RWSEM_FLAG_HANDOFF; - if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. * - * If wstate is WRITER_HANDOFF, it will make sure that either the handoff - * bit is set or the lock is acquired with handoff bit cleared. + * Implies rwsem_del_waiter() on success. */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, - enum writer_wait_state wstate) + struct rwsem_waiter *waiter) { + bool first = rwsem_first_waiter(sem) == waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (has_handoff && wstate == WRITER_NOT_FIRST) - return false; + if (has_handoff) { + if (!first) + return false; + + /* First waiter inherits a previously set handoff bit */ + waiter->handoff_set = true; + } new = count; if (count & RWSEM_LOCK_MASK) { - if (has_handoff || (wstate != WRITER_HANDOFF)) + if (has_handoff || (!rt_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) return false; new |= RWSEM_FLAG_HANDOFF; @@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * We have either acquired the lock with handoff bit cleared or * set the handoff bit. */ - if (new & RWSEM_FLAG_HANDOFF) + if (new & RWSEM_FLAG_HANDOFF) { + waiter->handoff_set = true; + lockevent_inc(rwsem_wlock_handoff); return false; + } + /* + * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on + * success. + */ + list_del(&waiter->list); rwsem_set_owner(sem); return true; } @@ -956,7 +1001,7 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1002,11 +1047,7 @@ queue: return sem; out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) { - atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, - &sem->count); - } + rwsem_del_waiter(sem, &waiter); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -1020,9 +1061,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - enum writer_wait_state wstate; struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; - - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ - if (wstate == WRITER_NOT_FIRST) { + if (rwsem_first_waiter(sem) != &waiter) { count = atomic_long_read(&sem->count); /* @@ -1083,13 +1119,16 @@ wait: /* wait until we successfully acquire the lock */ set_current_state(state); for (;;) { - if (rwsem_try_write_lock(sem, wstate)) { + if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ break; } raw_spin_unlock_irq(&sem->wait_lock); + if (signal_pending_state(state, current)) + goto out_nolock; + /* * After setting the handoff bit and failing to acquire * the lock, attempt to spin on owner to accelerate lock @@ -1098,7 +1137,7 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF) { + if (waiter.handoff_set) { enum owner_state owner_state; preempt_disable(); @@ -1109,66 +1148,26 @@ wait: goto trylock_again; } - /* Block until there are no active lockers. */ - for (;;) { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - /* - * If HANDOFF bit is set, unconditionally do - * a trylock. - */ - if (wstate == WRITER_HANDOFF) - break; - - if ((wstate == WRITER_NOT_FIRST) && - (rwsem_first_waiter(sem) == &waiter)) - wstate = WRITER_FIRST; - - count = atomic_long_read(&sem->count); - if (!(count & RWSEM_LOCK_MASK)) - break; - - /* - * The setting of the handoff bit is deferred - * until rwsem_try_write_lock() is called. - */ - if ((wstate == WRITER_FIRST) && (rt_task(current) || - time_after(jiffies, waiter.timeout))) { - wstate = WRITER_HANDOFF; - lockevent_inc(rwsem_wlock_handoff); - break; - } - } + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); - list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); - - return ret; + return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - - if (unlikely(wstate == WRITER_HANDOFF)) - atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); - - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else + rwsem_del_waiter(sem, &waiter); + if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); lockevent_inc(rwsem_wlock_fail); - return ERR_PTR(-EINTR); } -- cgit v1.2.3-71-gd317 From 14c24048841151548a3f4d9e218510c844c1b737 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 18 Nov 2021 17:44:55 +0800 Subject: locking/rwsem: Optimize down_read_trylock() under highly contended case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found that a process with 10 thousnads threads has been encountered a regression problem from Linux-v4.14 to Linux-v5.4. It is a kind of workload which will concurrently allocate lots of memory in different threads sometimes. In this case, we will see the down_read_trylock() with a high hotspot. Therefore, we suppose that rwsem has a regression at least since Linux-v5.4. In order to easily debug this problem, we write a simply benchmark to create the similar situation lile the following. ```c++ #include #include #include #include #include #include #include #include #include volatile int mutex; void trigger(int cpu, char* ptr, std::size_t sz) { cpu_set_t set; CPU_ZERO(&set); CPU_SET(cpu, &set); assert(pthread_setaffinity_np(pthread_self(), sizeof(set), &set) == 0); while (mutex); for (std::size_t i = 0; i < sz; i += 4096) { *ptr = '\0'; ptr += 4096; } } int main(int argc, char* argv[]) { std::size_t sz = 100; if (argc > 1) sz = atoi(argv[1]); auto nproc = std::thread::hardware_concurrency(); std::vector thr; sz <<= 30; auto* ptr = mmap(nullptr, sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); assert(ptr != MAP_FAILED); char* cptr = static_cast(ptr); auto run = sz / nproc; run = (run >> 12) << 12; mutex = 1; for (auto i = 0U; i < nproc; ++i) { thr.emplace_back(std::thread([i, cptr, run]() { trigger(i, cptr, run); })); cptr += run; } rusage usage_start; getrusage(RUSAGE_SELF, &usage_start); auto start = std::chrono::system_clock::now(); mutex = 0; for (auto& t : thr) t.join(); rusage usage_end; getrusage(RUSAGE_SELF, &usage_end); auto end = std::chrono::system_clock::now(); timeval utime; timeval stime; timersub(&usage_end.ru_utime, &usage_start.ru_utime, &utime); timersub(&usage_end.ru_stime, &usage_start.ru_stime, &stime); printf("usr: %ld.%06ld\n", utime.tv_sec, utime.tv_usec); printf("sys: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); printf("real: %lu\n", std::chrono::duration_cast(end - start).count()); return 0; } ``` The functionality of above program is simply which creates `nproc` threads and each of them are trying to touch memory (trigger page fault) on different CPU. Then we will see the similar profile by `perf top`. 25.55% [kernel] [k] down_read_trylock 14.78% [kernel] [k] handle_mm_fault 13.45% [kernel] [k] up_read 8.61% [kernel] [k] clear_page_erms 3.89% [kernel] [k] __do_page_fault The highest hot instruction, which accounts for about 92%, in down_read_trylock() is cmpxchg like the following. 91.89 │ lock cmpxchg %rdx,(%rdi) Sice the problem is found by migrating from Linux-v4.14 to Linux-v5.4, so we easily found that the commit ddb20d1d3aed ("locking/rwsem: Optimize down_read_trylock()") caused the regression. The reason is that the commit assumes the rwsem is not contended at all. But it is not always true for mmap lock which could be contended with thousands threads. So most threads almost need to run at least 2 times of "cmpxchg" to acquire the lock. The overhead of atomic operation is higher than non-atomic instructions, which caused the regression. By using the above benchmark, the real executing time on a x86-64 system before and after the patch were: Before Patch After Patch # of Threads real real reduced by ------------ ------ ------ ---------- 1 65,373 65,206 ~0.0% 4 15,467 15,378 ~0.5% 40 6,214 5,528 ~11.0% For the uncontended case, the new down_read_trylock() is the same as before. For the contended cases, the new down_read_trylock() is faster than before. The more contended, the more fast. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20211118094455.9068-1-songmuchun@bytedance.com --- kernel/locking/rwsem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e039cf1605af..04a74d040a6d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1248,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - /* - * Optimize for the case when the rwsem is not locked at all. - */ - tmp = RWSEM_UNLOCKED_VALUE; - do { + tmp = atomic_long_read(&sem->count); + while (!(tmp & RWSEM_READ_FAILED_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_READER_BIAS)) { + tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); return 1; } - } while (!(tmp & RWSEM_READ_FAILED_MASK)); + } return 0; } -- cgit v1.2.3-71-gd317 From 73743c3b092277febbf69b250ce8ebbca0525aa2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 9 Nov 2021 13:22:32 +0100 Subject: perf: Ignore sigtrap for tracepoints destined for other tasks syzbot reported that the warning in perf_sigtrap() fires, saying that the event's task does not match current: | WARNING: CPU: 0 PID: 9090 at kernel/events/core.c:6446 perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 | Modules linked in: | CPU: 0 PID: 9090 Comm: syz-executor.1 Not tainted 5.15.0-syzkaller #0 | Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 | RIP: 0010:perf_sigtrap kernel/events/core.c:6446 [inline] | RIP: 0010:perf_pending_event_disable kernel/events/core.c:6470 [inline] | RIP: 0010:perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 | ... | Call Trace: | | irq_work_single+0x106/0x220 kernel/irq_work.c:211 | irq_work_run_list+0x6a/0x90 kernel/irq_work.c:242 | irq_work_run+0x4f/0xd0 kernel/irq_work.c:251 | __sysvec_irq_work+0x95/0x3d0 arch/x86/kernel/irq_work.c:22 | sysvec_irq_work+0x8e/0xc0 arch/x86/kernel/irq_work.c:17 | | | asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:664 | RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline] | RIP: 0010:_raw_spin_unlock_irqrestore+0x38/0x70 kernel/locking/spinlock.c:194 | ... | coredump_task_exit kernel/exit.c:371 [inline] | do_exit+0x1865/0x25c0 kernel/exit.c:771 | do_group_exit+0xe7/0x290 kernel/exit.c:929 | get_signal+0x3b0/0x1ce0 kernel/signal.c:2820 | arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868 | handle_signal_work kernel/entry/common.c:148 [inline] | exit_to_user_mode_loop kernel/entry/common.c:172 [inline] | exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207 | __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] | syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300 | do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 | entry_SYSCALL_64_after_hwframe+0x44/0xae On x86 this shouldn't happen, which has arch_irq_work_raise(). The test program sets up a perf event with sigtrap set to fire on the 'sched_wakeup' tracepoint, which fired in ttwu_do_wakeup(). This happened because the 'sched_wakeup' tracepoint also takes a task argument passed on to perf_tp_event(), which is used to deliver the event to that other task. Since we cannot deliver synchronous signals to other tasks, skip an event if perf_tp_event() is targeted at another task and perf_event_attr::sigtrap is set, which will avoid ever entering perf_sigtrap() for such events. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: syzbot+663359e32ce6f1a305ad@syzkaller.appspotmail.com Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YYpoCOBmC/kJWfmI@elver.google.com --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..30d94f68c5bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9759,6 +9759,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, continue; if (event->attr.config != entry->type) continue; + /* Cannot deliver synchronous signal to other task. */ + if (event->attr.sigtrap) + continue; if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } -- cgit v1.2.3-71-gd317 From 1880ed71ce863318c1ce93bf324876fb5f92854f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 23 Nov 2021 15:28:01 +0100 Subject: tracing/uprobe: Fix uprobe_perf_open probes iteration Add missing 'tu' variable initialization in the probes loop, otherwise the head 'tu' is used instead of added probes. Link: https://lkml.kernel.org/r/20211123142801.182530-1-jolsa@kernel.org Cc: stable@vger.kernel.org Fixes: 99c9a923e97a ("tracing/uprobe: Fix double perf_event linking on multiprobe uprobe") Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0a5c0db3137e..f5f0039d31e5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1313,6 +1313,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); -- cgit v1.2.3-71-gd317 From dce1ca0525bfdc8a69a9343bc714fbc19a2f04b3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 23 Nov 2021 11:40:47 +0000 Subject: sched/scs: Reset task stack state in bringup_cpu() To hot unplug a CPU, the idle task on that CPU calls a few layers of C code before finally leaving the kernel. When KASAN is in use, poisoned shadow is left around for each of the active stack frames, and when shadow call stacks are in use. When shadow call stacks (SCS) are in use the task's saved SCS SP is left pointing at an arbitrary point within the task's shadow call stack. When a CPU is offlined than onlined back into the kernel, this stale state can adversely affect execution. Stale KASAN shadow can alias new stackframes and result in bogus KASAN warnings. A stale SCS SP is effectively a memory leak, and prevents a portion of the shadow call stack being used. Across a number of hotplug cycles the idle task's entire shadow call stack can become unusable. We previously fixed the KASAN issue in commit: e1b77c92981a5222 ("sched/kasan: remove stale KASAN poison after hotplug") ... by removing any stale KASAN stack poison immediately prior to onlining a CPU. Subsequently in commit: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") ... the refactoring left the KASAN and SCS cleanup in one-time idle thread initialization code rather than something invoked prior to each CPU being onlined, breaking both as above. We fixed SCS (but not KASAN) in commit: 63acd42c0d4942f7 ("sched/scs: Reset the shadow stack when idle_task_exit") ... but as this runs in the context of the idle task being offlined it's potentially fragile. To fix these consistently and more robustly, reset the SCS SP and KASAN shadow of a CPU's idle task immediately before we online that CPU in bringup_cpu(). This ensures the idle task always has a consistent state when it is running, and removes the need to so so when exiting an idle task. Whenever any thread is created, dup_task_struct() will give the task a stack which is free of KASAN shadow, and initialize the task's SCS SP, so there's no need to specially initialize either for idle thread within init_idle(), as this was only necessary to handle hotplug cycles. I've tested this on arm64 with: * gcc 11.1.0, defconfig +KASAN_INLINE, KASAN_STACK * clang 12.0.0, defconfig +KASAN_INLINE, KASAN_STACK, SHADOW_CALL_STACK ... offlining and onlining CPUS with: | while true; do | for C in /sys/devices/system/cpu/cpu*/online; do | echo 0 > $C; | echo 1 > $C; | done | done Fixes: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") Reported-by: Qian Cai Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Qian Cai Link: https://lore.kernel.org/lkml/20211115113310.35693-1-mark.rutland@arm.com/ --- kernel/cpu.c | 7 +++++++ kernel/sched/core.c | 4 ---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 192e43a87407..407a2568f35e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -587,6 +588,12 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + /* + * Reset stale stack state from the last time this CPU was online. + */ + scs_task_reset(idle); + kasan_unpoison_task_stack(idle); + /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c9b0fda64ac..76f9deeaa942 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8619,9 +8619,6 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); - #ifdef CONFIG_SMP /* * It's possible that init_idle() gets called multiple times on a task, @@ -8777,7 +8774,6 @@ void idle_task_exit(void) finish_arch_post_lock_switch(); } - scs_task_reset(current); /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } -- cgit v1.2.3-71-gd317 From cefcf24b4d351daf70ecd945324e200d3736821e Mon Sep 17 00:00:00 2001 From: Thomas Zeitlhofer Date: Tue, 23 Nov 2021 20:18:43 +0100 Subject: PM: hibernate: use correct mode for swsusp_close() Commit 39fbef4b0f77 ("PM: hibernate: Get block device exclusively in swsusp_check()") changed the opening mode of the block device to (FMODE_READ | FMODE_EXCL). In the corresponding calls to swsusp_close(), the mode is still just FMODE_READ which triggers the warning in blkdev_flush_mapping() on resume from hibernate. So, use the mode (FMODE_READ | FMODE_EXCL) also when closing the device. Fixes: 39fbef4b0f77 ("PM: hibernate: Get block device exclusively in swsusp_check()") Signed-off-by: Thomas Zeitlhofer Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 9ed9b744876c..e6af502c2fd7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -693,7 +693,7 @@ static int load_image_and_restore(void) goto Unlock; error = swsusp_read(&flags); - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -983,7 +983,7 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; } @@ -1018,7 +1018,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); goto Finish; } -- cgit v1.2.3-71-gd317 From 88a5045f176b78c33a269a30a7b146e99c550bd9 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 29 Oct 2021 12:24:22 -0700 Subject: PM: hibernate: Fix snapshot partial write lengths snapshot_write() is inappropriately limiting the amount of data that can be written in cases where a partial page has already been written. For example, one would expect to be able to write 1 byte, then 4095 bytes to the snapshot device, and have both of those complete fully (since now we're aligned to a page again). But what ends up happening is we write 1 byte, then 4094/4095 bytes complete successfully. The reason is that simple_write_to_buffer()'s second argument is the total size of the buffer, not the size of the buffer minus the offset. Since simple_write_to_buffer() accounts for the offset in its implementation, snapshot_write() can just pass the full page size directly down. Signed-off-by: Evan Green Signed-off-by: Rafael J. Wysocki --- kernel/power/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 740723bb3885..ad241b4ff64c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -177,7 +177,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res <= 0) goto unlock; } else { - res = PAGE_SIZE - pg_offp; + res = PAGE_SIZE; } if (!data_of(data->handle)) { -- cgit v1.2.3-71-gd317 From 6cb206508b621a9a0a2c35b60540e399225c8243 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 Nov 2021 13:35:26 -0500 Subject: tracing: Check pid filtering when creating events When pid filtering is activated in an instance, all of the events trace files for that instance has the PID_FILTER flag set. This determines whether or not pid filtering needs to be done on the event, otherwise the event is executed as normal. If pid filtering is enabled when an event is created (via a dynamic event or modules), its flag is not updated to reflect the current state, and the events are not filtered properly. Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4021b9a79f93..f8965fd50d3b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2678,12 +2678,24 @@ static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; struct trace_event_file *file; + unsigned int first; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return NULL; + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + + if (!trace_pid_list_first(pid_list, &first) || + !trace_pid_list_first(pid_list, &first)) + file->flags |= EVENT_FILE_FL_PID_FILTER; + file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); -- cgit v1.2.3-71-gd317 From a55f224ff5f238013de8762c4287117e47b86e22 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 Nov 2021 17:34:42 -0500 Subject: tracing: Fix pid filtering when triggers are attached If a event is filtered by pid and a trigger that requires processing of the event to happen is a attached to the event, the discard portion does not take the pid filtering into account, and the event will then be recorded when it should not have been. Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6b60ab9475ed..38715aa6cfdf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1366,14 +1366,26 @@ __event_trigger_test_discard(struct trace_event_file *file, if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, buffer, entry, event); - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || - (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, entry))) { - __trace_event_discard_commit(buffer, event); - return true; - } + if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_FILTERED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + goto discard; + + if (file->flags & EVENT_FILE_FL_FILTERED && + !filter_match_preds(file->filter, entry)) + goto discard; + + if ((file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(file)) + goto discard; return false; + discard: + __trace_event_discard_commit(buffer, event); + return true; } /** -- cgit v1.2.3-71-gd317 From 27ff768fa21ca3286fcc87c3f38ac67d1a2cbe2d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 27 Nov 2021 16:45:26 -0500 Subject: tracing: Test the 'Do not trace this pid' case in create event When creating a new event (via a module, kprobe, eprobe, etc), the descriptors that are created must add flags for pid filtering if an instance has pid filtering enabled, as the flags are used at the time the event is executed to know if pid filtering should be done or not. The "Only trace this pid" case was added, but a cut and paste error made that case checked twice, instead of checking the "Trace all but this pid" case. Link: https://lore.kernel.org/all/202111280401.qC0z99JB-lkp@intel.com/ Fixes: 6cb206508b62 ("tracing: Check pid filtering when creating events") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f8965fd50d3b..92be9cb1d7d4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2693,7 +2693,7 @@ trace_create_new_event(struct trace_event_call *call, lockdep_is_held(&event_mutex)); if (!trace_pid_list_first(pid_list, &first) || - !trace_pid_list_first(pid_list, &first)) + !trace_pid_list_first(no_pid_list, &first)) file->flags |= EVENT_FILE_FL_PID_FILTER; file->event_call = call; -- cgit v1.2.3-71-gd317 From 450fec13d9170127678f991698ac1a5b05c02e2f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Nov 2021 12:31:23 -0500 Subject: tracing/histograms: String compares should not care about signed values When comparing two strings for the "onmatch" histogram trigger, fields that are strings use string comparisons, which do not care about being signed or not. Do not fail to match two string fields if one is unsigned char array and the other is a signed char array. Link: https://lore.kernel.org/all/20211129123043.5cfd687a@gandalf.local.home/ Cc: stable@vgerk.kernel.org Cc: Tom Zanussi Cc: Yafang Shao Fixes: b05e89ae7cf3b ("tracing: Accept different type for synthetic event fields") Reviewed-by: Masami Hiramatsu Reported-by: Sven Schnelle Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9555b8e1d1e3..319f9c8ca7e7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3757,7 +3757,7 @@ static int check_synth_field(struct synth_event *event, if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || - field->is_signed != hist_field->is_signed) + (!field->is_string && field->is_signed != hist_field->is_signed)) return -EINVAL; } -- cgit v1.2.3-71-gd317 From f25667e5980a4333729cac3101e5de1bb851f71a Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Wed, 24 Nov 2021 14:08:01 +0000 Subject: tracing: Fix a kmemleak false positive in tracing_map Doing the command: echo 'hist:key=common_pid.execname,common_timestamp' > /sys/kernel/debug/tracing/events/xxx/trigger Triggers many kmemleak reports: unreferenced object 0xffff0000c7ea4980 (size 128): comm "bash", pid 338, jiffies 4294912626 (age 9339.324s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f3469921>] kmem_cache_alloc_trace+0x4c0/0x6f0 [<0000000054ca40c3>] hist_trigger_elt_data_alloc+0x140/0x178 [<00000000633bd154>] tracing_map_init+0x1f8/0x268 [<000000007e814ab9>] event_hist_trigger_func+0xca0/0x1ad0 [<00000000bf8520ed>] trigger_process_regex+0xd4/0x128 [<00000000f549355a>] event_trigger_write+0x7c/0x120 [<00000000b80f898d>] vfs_write+0xc4/0x380 [<00000000823e1055>] ksys_write+0x74/0xf8 [<000000008a9374aa>] __arm64_sys_write+0x24/0x30 [<0000000087124017>] do_el0_svc+0x88/0x1c0 [<00000000efd0dcd1>] el0_svc+0x1c/0x28 [<00000000dbfba9b3>] el0_sync_handler+0x88/0xc0 [<00000000e7399680>] el0_sync+0x148/0x180 unreferenced object 0xffff0000c7ea4980 (size 128): comm "bash", pid 338, jiffies 4294912626 (age 9339.324s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f3469921>] kmem_cache_alloc_trace+0x4c0/0x6f0 [<0000000054ca40c3>] hist_trigger_elt_data_alloc+0x140/0x178 [<00000000633bd154>] tracing_map_init+0x1f8/0x268 [<000000007e814ab9>] event_hist_trigger_func+0xca0/0x1ad0 [<00000000bf8520ed>] trigger_process_regex+0xd4/0x128 [<00000000f549355a>] event_trigger_write+0x7c/0x120 [<00000000b80f898d>] vfs_write+0xc4/0x380 [<00000000823e1055>] ksys_write+0x74/0xf8 [<000000008a9374aa>] __arm64_sys_write+0x24/0x30 [<0000000087124017>] do_el0_svc+0x88/0x1c0 [<00000000efd0dcd1>] el0_svc+0x1c/0x28 [<00000000dbfba9b3>] el0_sync_handler+0x88/0xc0 [<00000000e7399680>] el0_sync+0x148/0x180 The reason is elts->pages[i] is alloced by get_zeroed_page. and kmemleak will not scan the area alloced by get_zeroed_page. The address stored in elts->pages will be regarded as leaked. That is, the elts->pages[i] will have pointers loaded onto it as well, and without telling kmemleak about it, those pointers will look like memory without a reference. To fix this, call kmemleak_alloc to tell kmemleak to scan elts->pages[i] Link: https://lkml.kernel.org/r/20211124140801.87121-1-chenjun102@huawei.com Signed-off-by: Chen Jun Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 39bb56d2dcbe..9628b5571846 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "tracing_map.h" #include "trace.h" @@ -307,6 +308,7 @@ static void tracing_map_array_free(struct tracing_map_array *a) for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; + kmemleak_free(a->pages[i]); free_page((unsigned long)a->pages[i]); } @@ -342,6 +344,7 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); if (!a->pages[i]) goto free; + kmemleak_alloc(a->pages[i], PAGE_SIZE, 1, GFP_KERNEL); } out: return a; -- cgit v1.2.3-71-gd317 From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 1 Dec 2021 23:45:50 +0900 Subject: kprobes: Limit max data_size of the kretprobe instances The 'kprobe::data_size' is unsigned, thus it can not be negative. But if user sets it enough big number (e.g. (size_t)-8), the result of 'data_size + sizeof(struct kretprobe_instance)' becomes smaller than sizeof(struct kretprobe_instance) or zero. In result, the kretprobe_instance are allocated without enough memory, and kretprobe accesses outside of allocated memory. To avoid this issue, introduce a max limitation of the kretprobe::data_size. 4KB per instance should be OK. Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler") Reported-by: zhangyue Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e974caf39d3e..8c8f7a4d93af 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -153,6 +153,8 @@ struct kretprobe { struct kretprobe_holder *rph; }; +#define KRETPROBE_MAX_DATA_SIZE 4096 + struct kretprobe_instance { union { struct freelist_node freelist; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e9db0c810554..21eccc961bba 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2086,6 +2086,9 @@ int register_kretprobe(struct kretprobe *rp) } } + if (rp->data_size > KRETPROBE_MAX_DATA_SIZE) + return -E2BIG; + rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; -- cgit v1.2.3-71-gd317 From 53e87e3cdc155f20c3417b689df8d2ac88d79576 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Oct 2021 16:10:54 +0200 Subject: timers/nohz: Last resort update jiffies on nohz_full IRQ entry When at least one CPU runs in nohz_full mode, a dedicated timekeeper CPU is guaranteed to stay online and to never stop its tick. Meanwhile on some rare case, the dedicated timekeeper may be running with interrupts disabled for a while, such as in stop_machine. If jiffies stop being updated, a nohz_full CPU may end up endlessly programming the next tick in the past, taking the last jiffies update monotonic timestamp as a stale base, resulting in an tick storm. Here is a scenario where it matters: 0) CPU 0 is the timekeeper and CPU 1 a nohz_full CPU. 1) A stop machine callback is queued to execute somewhere. 2) CPU 0 reaches MULTI_STOP_DISABLE_IRQ while CPU 1 is still in MULTI_STOP_PREPARE. Hence CPU 0 can't do its timekeeping duty. CPU 1 can still take IRQs. 3) CPU 1 receives an IRQ which queues a timer callback one jiffy forward. 4) On IRQ exit, CPU 1 schedules the tick one jiffy forward, taking last_jiffies_update as a base. But last_jiffies_update hasn't been updated for 2 jiffies since the timekeeper has interrupts disabled. 5) clockevents_program_event(), which relies on ktime_get(), observes that the expiration is in the past and therefore programs the min delta event on the clock. 6) The tick fires immediately, goto 3) 7) Tick storm, the nohz_full CPU is drown and takes ages to reach MULTI_STOP_DISABLE_IRQ, which is the only way out of this situation. Solve this with unconditionally updating jiffies if the value is stale on nohz_full IRQ entry. IRQs and other disturbances are expected to be rare enough on nohz_full for the unconditional call to ktime_get() to actually matter. Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Link: https://lore.kernel.org/r/20211026141055.57358-2-frederic@kernel.org --- kernel/softirq.c | 3 ++- kernel/time/tick-sched.c | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 322b65d45676..41f470929e99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -595,7 +595,8 @@ void irq_enter_rcu(void) { __irq_enter_raw(); - if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)) + if (tick_nohz_full_cpu(smp_processor_id()) || + (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET))) tick_irq_enter(); account_hardirq_enter(current); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6bffe5af8cb1..17a283ce2b20 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1375,6 +1375,13 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); + /* + * If all CPUs are idle. We may need to update a stale jiffies value. + * Note nohz_full is a special case: a timekeeper is guaranteed to stay + * alive but it might be busy looping with interrupts disabled in some + * rare case (typically stop machine). So we must make sure we have a + * last resort. + */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); } -- cgit v1.2.3-71-gd317 From e7f2be115f0746b969c0df14c0d182f65f005ca5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Oct 2021 16:10:55 +0200 Subject: sched/cputime: Fix getrusage(RUSAGE_THREAD) with nohz_full getrusage(RUSAGE_THREAD) with nohz_full may return shorter utime/stime than the actual time. task_cputime_adjusted() snapshots utime and stime and then adjust their sum to match the scheduler maintained cputime.sum_exec_runtime. Unfortunately in nohz_full, sum_exec_runtime is only updated once per second in the worst case, causing a discrepancy against utime and stime that can be updated anytime by the reader using vtime. To fix this situation, perform an update of cputime.sum_exec_runtime when the cputime snapshot reports the task as actually running while the tick is disabled. The related overhead is then contained within the relevant situations. Reported-by: Hasegawa Hitomi Signed-off-by: Frederic Weisbecker Signed-off-by: Hasegawa Hitomi Signed-off-by: Thomas Gleixner Tested-by: Masayoshi Mizuma Acked-by: Phil Auld Link: https://lore.kernel.org/r/20211026141055.57358-3-frederic@kernel.org --- include/linux/sched/cputime.h | 5 +++-- kernel/sched/cputime.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6c9f19a33865..ce3c58286062 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -18,15 +18,16 @@ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, +extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); extern u64 task_gtime(struct task_struct *t); #else -static inline void task_cputime(struct task_struct *t, +static inline bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { *utime = t->utime; *stime = t->stime; + return false; } static inline u64 task_gtime(struct task_struct *t) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 872e481d5098..9392aea1804e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -615,7 +615,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) .sum_exec_runtime = p->se.sum_exec_runtime, }; - task_cputime(p, &cputime.utime, &cputime.stime); + if (task_cputime(p, &cputime.utime, &cputime.stime)) + cputime.sum_exec_runtime = task_sched_runtime(p); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } EXPORT_SYMBOL_GPL(task_cputime_adjusted); @@ -828,19 +829,21 @@ u64 task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) +bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { struct vtime *vtime = &t->vtime; unsigned int seq; u64 delta; + int ret; if (!vtime_accounting_enabled()) { *utime = t->utime; *stime = t->stime; - return; + return false; } do { + ret = false; seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; @@ -850,6 +853,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (vtime->state < VTIME_SYS) continue; + ret = true; delta = vtime_delta(vtime); /* @@ -861,6 +865,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) else *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return ret; } static int vtime_state_fetch(struct vtime *vtime, int cpu) -- cgit v1.2.3-71-gd317 From d9847eb8be3d895b2b5f514fdf3885d47a0b92a2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 22 Nov 2021 20:17:40 +0530 Subject: bpf: Make CONFIG_DEBUG_INFO_BTF depend upon CONFIG_BPF_SYSCALL Vinicius Costa Gomes reported [0] that build fails when CONFIG_DEBUG_INFO_BTF is enabled and CONFIG_BPF_SYSCALL is disabled. This leads to btf.c not being compiled, and then no symbol being present in vmlinux for the declarations in btf.h. Since BTF is not useful without enabling BPF subsystem, disallow this combination. However, theoretically disabling both now could still fail, as the symbol for kfunc_btf_id_list variables is not available. This isn't a problem as the compiler usually optimizes the whole register/unregister call, but at lower optimization levels it can fail the build in linking stage. Fix that by adding dummy variables so that modules taking address of them still work, but the whole thing is a noop. [0]: https://lore.kernel.org/bpf/20211110205418.332403-1-vinicius.gomes@intel.com Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration") Reported-by: Vinicius Costa Gomes Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122144742.477787-2-memxor@gmail.com --- include/linux/btf.h | 14 ++++++++++---- kernel/bpf/btf.c | 9 ++------- lib/Kconfig.debug | 1 + 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..0e1b6281fd8f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -245,7 +245,10 @@ struct kfunc_btf_id_set { struct module *owner; }; -struct kfunc_btf_id_list; +struct kfunc_btf_id_list { + struct list_head list; + struct mutex mutex; +}; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -254,6 +257,9 @@ void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s); bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, struct module *owner); + +extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; +extern struct kfunc_btf_id_list prog_test_kfunc_list; #else static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s) @@ -268,13 +274,13 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, { return false; } + +static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused; +static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused; #endif #define DEFINE_KFUNC_BTF_ID_SET(set, name) \ struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ THIS_MODULE } -extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; -extern struct kfunc_btf_id_list prog_test_kfunc_list; - #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbc3ad07e21b..ea3df9867cec 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6346,11 +6346,6 @@ BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) /* BTF ID set registration API for modules */ -struct kfunc_btf_id_list { - struct list_head list; - struct mutex mutex; -}; - #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -6389,8 +6384,6 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, return false; } -#endif - #define DEFINE_KFUNC_BTF_ID_LIST(name) \ struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ __MUTEX_INITIALIZER(name.mutex) }; \ @@ -6398,3 +6391,5 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + +#endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9ef7ce18b4f5..596bb5e4790c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -316,6 +316,7 @@ config DEBUG_INFO_BTF bool "Generate BTF typeinfo" depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST + depends on BPF_SYSCALL help Generate deduplicated BTF type information from DWARF debug info. Turning this on expects presence of pahole tool, which will convert -- cgit v1.2.3-71-gd317 From b12f031043247b80999bf5e03b8cded3b0b40f8d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 22 Nov 2021 20:17:41 +0530 Subject: bpf: Fix bpf_check_mod_kfunc_call for built-in modules When module registering its set is built-in, THIS_MODULE will be NULL, hence we cannot return early in case owner is NULL. Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122144742.477787-3-memxor@gmail.com --- kernel/bpf/btf.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ea3df9867cec..9bdb03767db5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6371,8 +6371,6 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, { struct kfunc_btf_id_set *s; - if (!owner) - return false; mutex_lock(&klist->mutex); list_for_each_entry(s, &klist->list, list) { if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) { -- cgit v1.2.3-71-gd317 From 2fa7d94afc1afbb4d702760c058dc2d7ed30f226 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 30 Nov 2021 20:16:07 +0200 Subject: bpf: Fix the off-by-two error in range markings The first commit cited below attempts to fix the off-by-one error that appeared in some comparisons with an open range. Due to this error, arithmetically equivalent pieces of code could get different verdicts from the verifier, for example (pseudocode): // 1. Passes the verifier: if (data + 8 > data_end) return early read *(u64 *)data, i.e. [data; data+7] // 2. Rejected by the verifier (should still pass): if (data + 7 >= data_end) return early read *(u64 *)data, i.e. [data; data+7] The attempted fix, however, shifts the range by one in a wrong direction, so the bug not only remains, but also such piece of code starts failing in the verifier: // 3. Rejected by the verifier, but the check is stricter than in #1. if (data + 8 >= data_end) return early read *(u64 *)data, i.e. [data; data+7] The change performed by that fix converted an off-by-one bug into off-by-two. The second commit cited below added the BPF selftests written to ensure than code chunks like #3 are rejected, however, they should be accepted. This commit fixes the off-by-two error by adjusting new_range in the right direction and fixes the tests by changing the range into the one that should actually fail. Fixes: fb2a311a31d3 ("bpf: fix off by one for range markings with L{T, E} patterns") Fixes: b37242c773b2 ("bpf: add test cases to bpf selftests to cover all access tests") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211130181607.593149-1-maximmi@nvidia.com --- kernel/bpf/verifier.c | 2 +- .../bpf/verifier/xdp_direct_packet_access.c | 32 +++++++++++----------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 50efda51515b..f3001937bbb9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8422,7 +8422,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, new_range = dst_reg->off; if (range_right_open) - new_range--; + new_range++; /* Examples for register markings: * diff --git a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c index bfb97383e6b5..de172a5b8754 100644 --- a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c @@ -112,10 +112,10 @@ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data_end)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -167,10 +167,10 @@ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data_end)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -274,9 +274,9 @@ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data_end)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -437,9 +437,9 @@ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data_end)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -544,10 +544,10 @@ offsetof(struct xdp_md, data_meta)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -599,10 +599,10 @@ offsetof(struct xdp_md, data_meta)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -706,9 +706,9 @@ offsetof(struct xdp_md, data_meta)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -869,9 +869,9 @@ offsetof(struct xdp_md, data_meta)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, -- cgit v1.2.3-71-gd317 From 9ed20bafc85806ca6c97c9128cec46c3ef80ae86 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Fri, 3 Dec 2021 17:32:03 -0600 Subject: preempt/dynamic: Fix setup_preempt_mode() return value __setup() callbacks expect 1 for success and 0 for failure. Correct the usage here to reflect that. Fixes: 826bfeb37bb4 ("preempt/dynamic: Support dynamic preempt with preempt= boot option") Reported-by: Mark Rutland Signed-off-by: Andrew Halaney Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211203233203.133581-1-ahalaney@redhat.com --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 76f9deeaa942..814c52d90c0f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6617,11 +6617,11 @@ static int __init setup_preempt_mode(char *str) int mode = sched_dynamic_mode(str); if (mode < 0) { pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); - return 1; + return 0; } sched_dynamic_update(mode); - return 0; + return 1; } __setup("preempt=", setup_preempt_mode); -- cgit v1.2.3-71-gd317 From 315c4f884800c45cb6bd8c90422fad554a8b9588 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 2 Dec 2021 11:20:33 +0000 Subject: sched/uclamp: Fix rq->uclamp_max not set on first enqueue Commit d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") introduced a bug where uclamp_max of the rq is not reset to match the woken up task's uclamp_max when the rq is idle. The code was relying on rq->uclamp_max initialized to zero, so on first enqueue static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, enum uclamp_id clamp_id) { ... if (uc_se->value > READ_ONCE(uc_rq->value)) WRITE_ONCE(uc_rq->value, uc_se->value); } was actually resetting it. But since commit d81ae8aac85c changed the default to 1024, this no longer works. And since rq->uclamp_flags is also initialized to 0, neither above code path nor uclamp_idle_reset() update the rq->uclamp_max on first wake up from idle. This is only visible from first wake up(s) until the first dequeue to idle after enabling the static key. And it only matters if the uclamp_max of this task is < 1024 since only then its uclamp_max will be effectively ignored. Fix it by properly initializing rq->uclamp_flags = UCLAMP_FLAG_IDLE to ensure uclamp_idle_reset() is called which then will update the rq uclamp_max value as expected. Fixes: d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20211202112033.1705279-1-qais.yousef@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 814c52d90c0f..77563109c0ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1918,7 +1918,7 @@ static void __init init_uclamp_rq(struct rq *rq) }; } - rq->uclamp_flags = 0; + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } static void __init init_uclamp(void) -- cgit v1.2.3-71-gd317 From 7d5b7cad79da76f3dad4a9f6040e524217814e5a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 Dec 2021 19:20:30 +0100 Subject: ftrace: Use direct_ops hash in unregister_ftrace_direct Now when we have *direct_multi interface the direct_functions hash is no longer owned just by direct_ops. It's also used by any other ftrace_ops passed to *direct_multi interface. Thus to find out that we are unregistering the last function from direct_ops, we need to check directly direct_ops's hash. Link: https://lkml.kernel.org/r/20211206182032.87248-2-jolsa@kernel.org Cc: Ingo Molnar Cc: Heiko Carstens Fixes: f64dd4627ec6 ("ftrace: Add multi direct register/unregister interface") Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30bc880c3849..7f0594e28226 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5217,6 +5217,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; + struct ftrace_hash *hash; int ret = -ENODEV; mutex_lock(&direct_mutex); @@ -5225,7 +5226,8 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; - if (direct_functions->count == 1) + hash = direct_ops.func_hash->filter_hash; + if (hash->count == 1) unregister_ftrace_function(&direct_ops); ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); -- cgit v1.2.3-71-gd317 From fea3ffa48c6d42a11dca766c89284d22eaf5603f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 Dec 2021 19:20:31 +0100 Subject: ftrace: Add cleanup to unregister_ftrace_direct_multi Adding ops cleanup to unregister_ftrace_direct_multi, so it can be reused in another register call. Link: https://lkml.kernel.org/r/20211206182032.87248-3-jolsa@kernel.org Cc: Ingo Molnar Cc: Heiko Carstens Fixes: f64dd4627ec6 ("ftrace: Add multi direct register/unregister interface") Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7f0594e28226..be5f6b32a012 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5542,6 +5542,10 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) err = unregister_ftrace_function(ops); remove_direct_functions_hash(hash, addr); mutex_unlock(&direct_mutex); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; return err; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); -- cgit v1.2.3-71-gd317 From c24be24aed405d64ebcf04526614c13b2adfb1d2 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 9 Dec 2021 02:43:17 +0000 Subject: tracing: Fix possible memory leak in __create_synth_event() error path There's error paths in __create_synth_event() after the argv is allocated that fail to free it. Add a jump to free it when necessary. Link: https://lkml.kernel.org/r/20211209024317.11783-1-linmq006@gmail.com Suggested-by: Steven Rostedt (VMware) Signed-off-by: Miaoqian Lin [ Fixed up the patch and change log ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 22db3ce95e74..ca9c13b2ecf4 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1237,9 +1237,8 @@ static int __create_synth_event(const char *name, const char *raw_fields) argv + consumed, &consumed, &field_version); if (IS_ERR(field)) { - argv_free(argv); ret = PTR_ERR(field); - goto err; + goto err_free_arg; } /* @@ -1262,18 +1261,19 @@ static int __create_synth_event(const char *name, const char *raw_fields) if (cmd_version > 1 && n_fields_this_loop >= 1) { synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str)); ret = -EINVAL; - goto err; + goto err_free_arg; } fields[n_fields++] = field; if (n_fields == SYNTH_FIELDS_MAX) { synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; - goto err; + goto err_free_arg; } n_fields_this_loop++; } + argv_free(argv); if (consumed < argc) { synth_err(SYNTH_ERR_INVALID_CMD, 0); @@ -1281,7 +1281,6 @@ static int __create_synth_event(const char *name, const char *raw_fields) goto err; } - argv_free(argv); } if (n_fields == 0) { @@ -1307,6 +1306,8 @@ static int __create_synth_event(const char *name, const char *raw_fields) kfree(saved_fields); return ret; + err_free_arg: + argv_free(argv); err: for (i = 0; i < n_fields; i++) free_synth_field(fields[i]); -- cgit v1.2.3-71-gd317 From 42288cb44c4b5fff7653bc392b583a2b8bd6a8c0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 8 Dec 2021 17:04:51 -0800 Subject: wait: add wake_up_pollfree() Several ->poll() implementations are special in that they use a waitqueue whose lifetime is the current task, rather than the struct file as is normally the case. This is okay for blocking polls, since a blocking poll occurs within one task; however, non-blocking polls require another solution. This solution is for the queue to be cleared before it is freed, using 'wake_up_poll(wq, EPOLLHUP | POLLFREE);'. However, that has a bug: wake_up_poll() calls __wake_up() with nr_exclusive=1. Therefore, if there are multiple "exclusive" waiters, and the wakeup function for the first one returns a positive value, only that one will be called. That's *not* what's needed for POLLFREE; POLLFREE is special in that it really needs to wake up everyone. Considering the three non-blocking poll systems: - io_uring poll doesn't handle POLLFREE at all, so it is broken anyway. - aio poll is unaffected, since it doesn't support exclusive waits. However, that's fragile, as someone could add this feature later. - epoll doesn't appear to be broken by this, since its wakeup function returns 0 when it sees POLLFREE. But this is fragile. Although there is a workaround (see epoll), it's better to define a function which always sends POLLFREE to all waiters. Add such a function. Also make it verify that the queue really becomes empty after all waiters have been woken up. Reported-by: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211209010455.42744-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/wait.h | 26 ++++++++++++++++++++++++++ kernel/sched/wait.c | 7 +++++++ 2 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2d0df57c9902..851e07da2583 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); +void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) +/** + * wake_up_pollfree - signal that a polled waitqueue is going away + * @wq_head: the wait queue head + * + * In the very rare cases where a ->poll() implementation uses a waitqueue whose + * lifetime is tied to a task rather than to the 'struct file' being polled, + * this function must be called before the waitqueue is freed so that + * non-blocking polls (e.g. epoll) are notified that the queue is going away. + * + * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via + * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. + */ +static inline void wake_up_pollfree(struct wait_queue_head *wq_head) +{ + /* + * For performance reasons, we don't always take the queue lock here. + * Therefore, we might race with someone removing the last entry from + * the queue, and proceed while they still hold the queue lock. + * However, rcu_read_lock() is required to be held in such cases, so we + * can safely proceed with an RCU-delayed free. + */ + if (waitqueue_active(wq_head)) + __wake_up_pollfree(wq_head); +} + #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 76577d1642a5..eca38107b32f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -238,6 +238,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +void __wake_up_pollfree(struct wait_queue_head *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any -- cgit v1.2.3-71-gd317 From e4779015fd5d2fb8390c258268addff24d6077c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:22 -0800 Subject: timers: implement usleep_idle_range() Patch series "mm/damon: Fix fake /proc/loadavg reports", v3. This patchset fixes DAMON's fake load report issue. The first patch makes yet another variant of usleep_range() for this fix, and the second patch fixes the issue of DAMON by making it using the newly introduced function. This patch (of 2): Some kernel threads such as DAMON could need to repeatedly sleep in micro seconds level. Because usleep_range() sleeps in uninterruptible state, however, such threads would make /proc/loadavg reports fake load. To help such cases, this commit implements a variant of usleep_range() called usleep_idle_range(). It is same to usleep_range() but sets the state of the current task as TASK_IDLE while sleeping. Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Andrew Morton Reviewed-by: Thomas Gleixner Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delay.h | 14 +++++++++++++- kernel/time/timer.c | 16 +++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/delay.h b/include/linux/delay.h index 8eacf67eb212..039e7e0c7378 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -20,6 +20,7 @@ */ #include +#include extern unsigned long loops_per_jiffy; @@ -58,7 +59,18 @@ void calibrate_delay(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); +void usleep_range_state(unsigned long min, unsigned long max, + unsigned int state); + +static inline void usleep_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); +} + +static inline void usleep_idle_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_IDLE); +} static inline void ssleep(unsigned int seconds) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e3d2c23c413d..85f1021ad459 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2054,26 +2054,28 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); /** - * usleep_range - Sleep for an approximate time - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range() instead of udelay(). The sleep improves responsiveness + * usleep_range_state() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. */ -void __sched usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range_state(unsigned long min, unsigned long max, + unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } -EXPORT_SYMBOL(usleep_range); +EXPORT_SYMBOL(usleep_range_state); -- cgit v1.2.3-71-gd317