From f45b2974cc0ae959a4c503a071e38a56bd64372f Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 17 Nov 2021 13:57:08 +0100 Subject: bpf, x86: Fix "no previous prototype" warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arch_prepare_bpf_dispatcher function does not have a prototype, and yields the following warning when W=1 is enabled for the kernel build. >> arch/x86/net/bpf_jit_comp.c:2188:5: warning: no previous \ prototype for 'arch_prepare_bpf_dispatcher' [-Wmissing-prototypes] 2188 | int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, \ int num_funcs) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ Remove the warning by adding a function declaration to include/linux/bpf.h. Fixes: 75ccbef6369e ("bpf: Introduce BPF dispatcher") Reported-by: kernel test robot Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211117125708.769168-1-bjorn@kernel.org --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e7a163a3146b..84ff6ef49462 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -732,6 +732,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); +int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ -- cgit v1.2.3-71-gd317 From 6966df483d7b5b218aeb0e13e7e334a8fc3c1744 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 18 Nov 2021 13:49:51 +0200 Subject: regulator: Update protection IRQ helper docs The documentation of IRQ notification helper had still references to first RFC implementation which called BUG() while trying to protect the hardware. Behaviour was improved as calling the BUG() was not a proper solution. Current implementation attempts to call poweroff if handling of potentially damaging error notification fails. Update the documentation to reflect the actual behaviour. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/0c9cc4bcf20c3da66fd5a85c97ee4288e5727538.1637233864.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index bd7a73db2e66..54cf566616ae 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -499,7 +499,8 @@ struct regulator_irq_data { * best to shut-down regulator(s) or reboot the SOC if error * handling is repeatedly failing. If fatal_cnt is given the IRQ * handling is aborted if it fails for fatal_cnt times and die() - * callback (if populated) or BUG() is called to try to prevent + * callback (if populated) is called. If die() is not populated + * poweroff for the system is attempted in order to prevent any * further damage. * @reread_ms: The time which is waited before attempting to re-read status * at the worker if IC reading fails. Immediate re-read is done @@ -516,11 +517,12 @@ struct regulator_irq_data { * @data: Driver private data pointer which will be passed as such to * the renable, map_event and die callbacks in regulator_irq_data. * @die: Protection callback. If IC status reading or recovery actions - * fail fatal_cnt times this callback or BUG() is called. This - * callback should implement a final protection attempt like - * disabling the regulator. If protection succeeded this may - * return 0. If anything else is returned the core assumes final - * protection failed and calls BUG() as a last resort. + * fail fatal_cnt times this callback is called or system is + * powered off. This callback should implement a final protection + * attempt like disabling the regulator. If protection succeeded + * die() may return 0. If anything else is returned the core + * assumes final protection failed and attempts to perform a + * poweroff as a last resort. * @map_event: Driver callback to map IRQ status into regulator devices with * events / errors. NOTE: callback MUST initialize both the * errors and notifs for all rdevs which it signals having -- cgit v1.2.3-71-gd317 From 28c916ade1bd4205958f74bb817fd3a05dbb7afc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 18 Nov 2021 16:30:14 +0100 Subject: ASoC: soc-acpi: Set mach->id field on comp_ids matches Commit dac7cbd55dca ("ASoC: Intel: soc-acpi-byt: shrink tables using compatible IDs") and commit 959ae8215a9e ("ASoC: Intel: soc-acpi-cht: shrink tables using compatible IDs") simplified the match tables in soc-acpi-intel-byt-match.c and soc-acpi-intel-cht-match.c by merging identical entries using the new .comp_ids snd_soc_acpi_mach field to point a single entry to multiple ACPI HIDs and clearing the previously unique per entry .id field. But various machine drivers from sound/soc/intel/boards rely on mach->id in one or more ways, e.g. some drivers contain the following snippets: adev = acpi_dev_get_first_match_dev(mach->id, NULL, -1); pkg_found = snd_soc_acpi_find_package_from_hid(mach->id, ... if (!strncmp(snd_soc_cards[i].codec_id, mach->id, 8)) { ... All of which are broken by the match table shrinking. Make the snd_soc_acpi_mach.id field non const (the storage for the tables already is non const) and on a comps_ids match copy the matching HID to the id field to fix this. Fixes: dac7cbd55dca ("ASoC: Intel: soc-acpi-byt: shrink tables using compatible IDs") Fixes: 959ae8215a9e ("ASoC: Intel: soc-acpi-cht: shrink tables using compatible IDs") Suggested-by: Pierre-Louis Bossart Cc: Pierre-Louis Bossart Cc: Brent Lu Signed-off-by: Hans de Goede Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20211118153014.349222-1-hdegoede@redhat.com Signed-off-by: Mark Brown --- include/sound/soc-acpi.h | 2 +- sound/soc/soc-acpi.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h index 31f4c4f9aeea..ac0893df9c76 100644 --- a/include/sound/soc-acpi.h +++ b/include/sound/soc-acpi.h @@ -147,7 +147,7 @@ struct snd_soc_acpi_link_adr { */ /* Descriptor for SST ASoC machine driver */ struct snd_soc_acpi_mach { - const u8 id[ACPI_ID_LEN]; + u8 id[ACPI_ID_LEN]; const struct snd_soc_acpi_codecs *comp_ids; const u32 link_mask; const struct snd_soc_acpi_link_adr *links; diff --git a/sound/soc/soc-acpi.c b/sound/soc/soc-acpi.c index 2ae99b49d3f5..cbd7ea48837b 100644 --- a/sound/soc/soc-acpi.c +++ b/sound/soc/soc-acpi.c @@ -20,8 +20,10 @@ static bool snd_soc_acpi_id_present(struct snd_soc_acpi_mach *machine) if (comp_ids) { for (i = 0; i < comp_ids->num_codecs; i++) { - if (acpi_dev_present(comp_ids->codecs[i], NULL, -1)) + if (acpi_dev_present(comp_ids->codecs[i], NULL, -1)) { + strscpy(machine->id, comp_ids->codecs[i], ACPI_ID_LEN); return true; + } } } -- cgit v1.2.3-71-gd317 From 7e78781df491e4beb475bac22e6c44236a5002d7 Mon Sep 17 00:00:00 2001 From: Gurchetan Singh Date: Mon, 22 Nov 2021 15:22:09 -0800 Subject: drm/virtgpu api: define a dummy fence signaled event The current virtgpu implementation of poll(..) drops events when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is enabled (otherwise it's like a normal DRM driver). This is because paravirtualized userspaces receives responses in a buffer of type BLOB_MEM_GUEST, not by read(..). To be in line with other DRM drivers and avoid specialized behavior, it is possible to define a dummy event for virtgpu. Paravirtualized userspace will now have to call read(..) on the DRM fd to receive the dummy event. Fixes: b10790434cf2 ("drm/virtgpu api: create context init feature") Reported-by: Daniel Vetter Signed-off-by: Gurchetan Singh Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20211122232210.602-2-gurchetansingh@google.com Signed-off-by: Gerd Hoffmann --- drivers/gpu/drm/virtio/virtgpu_drv.h | 1 - drivers/gpu/drm/virtio/virtgpu_ioctl.c | 2 +- include/uapi/drm/virtgpu_drm.h | 7 +++++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h index e0265fe74aa5..0a194aaad419 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.h +++ b/drivers/gpu/drm/virtio/virtgpu_drv.h @@ -138,7 +138,6 @@ struct virtio_gpu_fence_driver { spinlock_t lock; }; -#define VIRTGPU_EVENT_FENCE_SIGNALED_INTERNAL 0x10000000 struct virtio_gpu_fence_event { struct drm_pending_event base; struct drm_event event; diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c index 5618a1d5879c..3607646d3229 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c +++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c @@ -54,7 +54,7 @@ static int virtio_gpu_fence_event_create(struct drm_device *dev, if (!e) return -ENOMEM; - e->event.type = VIRTGPU_EVENT_FENCE_SIGNALED_INTERNAL; + e->event.type = VIRTGPU_EVENT_FENCE_SIGNALED; e->event.length = sizeof(e->event); ret = drm_event_reserve_init(dev, file, &e->base, &e->event); diff --git a/include/uapi/drm/virtgpu_drm.h b/include/uapi/drm/virtgpu_drm.h index a13e20cc66b4..0512fde5e697 100644 --- a/include/uapi/drm/virtgpu_drm.h +++ b/include/uapi/drm/virtgpu_drm.h @@ -196,6 +196,13 @@ struct drm_virtgpu_context_init { __u64 ctx_set_params; }; +/* + * Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in + * effect. The event size is sizeof(drm_event), since there is no additional + * payload. + */ +#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000 + #define DRM_IOCTL_VIRTGPU_MAP \ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map) -- cgit v1.2.3-71-gd317 From dacb5d8875cc6cd3a553363b4d6f06760fcbe70c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Nov 2021 19:34:21 +0100 Subject: tcp: fix page frag corruption on page fault Steffen reported a TCP stream corruption for HTTP requests served by the apache web-server using a cifs mount-point and memory mapping the relevant file. The root cause is quite similar to the one addressed by commit 20eb4f29b602 ("net: fix sk_page_frag() recursion from memory reclaim"). Here the nested access to the task page frag is caused by a page fault on the (mmapped) user-space memory buffer coming from the cifs file. The page fault handler performs an smb transaction on a different socket, inside the same process context. Since sk->sk_allaction for such socket does not prevent the usage for the task_frag, the nested allocation modify "under the hood" the page frag in use by the outer sendmsg call, corrupting the stream. The overall relevant stack trace looks like the following: httpd 78268 [001] 3461630.850950: probe:tcp_sendmsg_locked: ffffffff91461d91 tcp_sendmsg_locked+0x1 ffffffff91462b57 tcp_sendmsg+0x27 ffffffff9139814e sock_sendmsg+0x3e ffffffffc06dfe1d smb_send_kvec+0x28 [...] ffffffffc06cfaf8 cifs_readpages+0x213 ffffffff90e83c4b read_pages+0x6b ffffffff90e83f31 __do_page_cache_readahead+0x1c1 ffffffff90e79e98 filemap_fault+0x788 ffffffff90eb0458 __do_fault+0x38 ffffffff90eb5280 do_fault+0x1a0 ffffffff90eb7c84 __handle_mm_fault+0x4d4 ffffffff90eb8093 handle_mm_fault+0xc3 ffffffff90c74f6d __do_page_fault+0x1ed ffffffff90c75277 do_page_fault+0x37 ffffffff9160111e page_fault+0x1e ffffffff9109e7b5 copyin+0x25 ffffffff9109eb40 _copy_from_iter_full+0xe0 ffffffff91462370 tcp_sendmsg_locked+0x5e0 ffffffff91462370 tcp_sendmsg_locked+0x5e0 ffffffff91462b57 tcp_sendmsg+0x27 ffffffff9139815c sock_sendmsg+0x4c ffffffff913981f7 sock_write_iter+0x97 ffffffff90f2cc56 do_iter_readv_writev+0x156 ffffffff90f2dff0 do_iter_write+0x80 ffffffff90f2e1c3 vfs_writev+0xa3 ffffffff90f2e27c do_writev+0x5c ffffffff90c042bb do_syscall_64+0x5b ffffffff916000ad entry_SYSCALL_64_after_hwframe+0x65 The cifs filesystem rightfully sets sk_allocations to GFP_NOFS, we can avoid the nesting using the sk page frag for allocation lacking the __GFP_FS flag. Do not define an additional mm-helper for that, as this is strictly tied to the sk page frag usage. v1 -> v2: - use a stricted sk_page_frag() check instead of reordering the code (Eric) Reported-by: Steffen Froemer Fixes: 5640f7685831 ("net: use a per task frag allocator") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index b32906e1ab55..715cdb4b2b79 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2430,19 +2430,22 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) * @sk: socket * * Use the per task page_frag instead of the per socket one for - * optimization when we know that we're in the normal context and owns + * optimization when we know that we're in process context and own * everything that's associated with %current. * - * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest - * inside other socket operations and end up recursing into sk_page_frag() - * while it's already in use. + * Both direct reclaim and page faults can nest inside other + * socket operations and end up recursing into sk_page_frag() + * while it's already in use: explicitly avoid task page_frag + * usage if the caller is potentially doing any of them. + * This assumes that page fault handlers use the GFP_NOFS flags. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (gfpflags_normal_context(sk->sk_allocation)) + if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == + (__GFP_DIRECT_RECLAIM | __GFP_FS)) return ¤t->task_frag; return &sk->sk_frag; -- cgit v1.2.3-71-gd317 From cdef485217d30382f3bf6448c54b4401648fe3f1 Mon Sep 17 00:00:00 2001 From: msizanoen1 Date: Tue, 23 Nov 2021 13:48:32 +0100 Subject: ipv6: fix memory leak in fib6_rule_suppress The kernel leaks memory when a `fib` rule is present in IPv6 nftables firewall rules and a suppress_prefix rule is present in the IPv6 routing rules (used by certain tools such as wg-quick). In such scenarios, every incoming packet will leak an allocation in `ip6_dst_cache` slab cache. After some hours of `bpftrace`-ing and source code reading, I tracked down the issue to ca7a03c41753 ("ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule"). The problem with that change is that the generic `args->flags` always have `FIB_LOOKUP_NOREF` set[1][2] but the IPv6-specific flag `RT6_LOOKUP_F_DST_NOREF` might not be, leading to `fib6_rule_suppress` not decreasing the refcount when needed. How to reproduce: - Add the following nftables rule to a prerouting chain: meta nfproto ipv6 fib saddr . mark . iif oif missing drop This can be done with: sudo nft create table inet test sudo nft create chain inet test test_chain '{ type filter hook prerouting priority filter + 10; policy accept; }' sudo nft add rule inet test test_chain meta nfproto ipv6 fib saddr . mark . iif oif missing drop - Run: sudo ip -6 rule add table main suppress_prefixlength 0 - Watch `sudo slabtop -o | grep ip6_dst_cache` to see memory usage increase with every incoming ipv6 packet. This patch exposes the protocol-specific flags to the protocol specific `suppress` function, and check the protocol-specific `flags` argument for RT6_LOOKUP_F_DST_NOREF instead of the generic FIB_LOOKUP_NOREF when decreasing the refcount, like this. [1]: https://github.com/torvalds/linux/blob/ca7a03c4175366a92cee0ccc4fec0038c3266e26/net/ipv6/fib6_rules.c#L71 [2]: https://github.com/torvalds/linux/blob/ca7a03c4175366a92cee0ccc4fec0038c3266e26/net/ipv6/fib6_rules.c#L99 Link: https://bugzilla.kernel.org/show_bug.cgi?id=215105 Fixes: ca7a03c41753 ("ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule") Cc: stable@vger.kernel.org Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- include/net/fib_rules.h | 4 +++- net/core/fib_rules.c | 2 +- net/ipv4/fib_rules.c | 1 + net/ipv6/fib6_rules.c | 4 ++-- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 4b10676c69d1..bd07484ab9dd 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -69,7 +69,7 @@ struct fib_rules_ops { int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); - bool (*suppress)(struct fib_rule *, + bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); @@ -218,7 +218,9 @@ INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg)); #endif diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 79df7cd9dbc1..1bb567a3b329 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -323,7 +323,7 @@ jumped: if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, - rule, arg)) + rule, flags, arg)) continue; if (err != -EAGAIN) { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index ce54a30c2ef1..364ad3446b2f 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -141,6 +141,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule, } INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg) { struct fib_result *result = (struct fib_result *) arg->result; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 40f3e4f9f33a..dcedfe29d9d9 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -267,6 +267,7 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule, } INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; @@ -294,8 +295,7 @@ INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, return false; suppress_route: - if (!(arg->flags & FIB_LOOKUP_NOREF)) - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); return true; } -- cgit v1.2.3-71-gd317 From 20ae1d6aa159eb91a9bf09ff92ccaa94dbea92c2 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Nov 2021 10:39:25 -0500 Subject: wireguard: device: reset peer src endpoint when netns exits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each peer's endpoint contains a dst_cache entry that takes a reference to another netdev. When the containing namespace exits, we take down the socket and prevent future sockets from being created (by setting creating_net to NULL), which removes that potential reference on the netns. However, it doesn't release references to the netns that a netdev cached in dst_cache might be taking, so the netns still might fail to exit. Since the socket is gimped anyway, we can simply clear all the dst_caches (by way of clearing the endpoint src), which will release all references. However, the current dst_cache_reset function only releases those references lazily. But it turns out that all of our usages of wg_socket_clear_peer_endpoint_src are called from contexts that are not exactly high-speed or bottle-necked. For example, when there's connection difficulty, or when userspace is reconfiguring the interface. And in particular for this patch, when the netns is exiting. So for those cases, it makes more sense to call dst_release immediately. For that, we add a small helper function to dst_cache. This patch also adds a test to netns.sh from Hangbin Liu to ensure this doesn't regress. Tested-by: Hangbin Liu Reported-by: Xiumei Mu Cc: Toke Høiland-Jørgensen Cc: Paolo Abeni Fixes: 900575aa33a3 ("wireguard: device: avoid circular netns references") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 3 +++ drivers/net/wireguard/socket.c | 2 +- include/net/dst_cache.h | 11 +++++++++++ net/core/dst_cache.c | 19 +++++++++++++++++++ tools/testing/selftests/wireguard/netns.sh | 24 +++++++++++++++++++++++- 5 files changed, 57 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 551ddaaaf540..77e64ea6be67 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -398,6 +398,7 @@ static struct rtnl_link_ops link_ops __read_mostly = { static void wg_netns_pre_exit(struct net *net) { struct wg_device *wg; + struct wg_peer *peer; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { @@ -407,6 +408,8 @@ static void wg_netns_pre_exit(struct net *net) mutex_lock(&wg->device_update_lock); rcu_assign_pointer(wg->creating_net, NULL); wg_socket_reinit(wg, NULL, NULL); + list_for_each_entry(peer, &wg->peer_list, peer_list) + wg_socket_clear_peer_endpoint_src(peer); mutex_unlock(&wg->device_update_lock); } } diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 8c496b747108..6f07b949cb81 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -308,7 +308,7 @@ void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer) { write_lock_bh(&peer->endpoint_lock); memset(&peer->endpoint.src6, 0, sizeof(peer->endpoint.src6)); - dst_cache_reset(&peer->endpoint_cache); + dst_cache_reset_now(&peer->endpoint_cache); write_unlock_bh(&peer->endpoint_lock); } diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h index 67634675e919..df6622a5fe98 100644 --- a/include/net/dst_cache.h +++ b/include/net/dst_cache.h @@ -79,6 +79,17 @@ static inline void dst_cache_reset(struct dst_cache *dst_cache) dst_cache->reset_ts = jiffies; } +/** + * dst_cache_reset_now - invalidate the cache contents immediately + * @dst_cache: the cache + * + * The caller must be sure there are no concurrent users, as this frees + * all dst_cache users immediately, rather than waiting for the next + * per-cpu usage like dst_cache_reset does. Most callers should use the + * higher speed lazily-freed dst_cache_reset function instead. + */ +void dst_cache_reset_now(struct dst_cache *dst_cache); + /** * dst_cache_init - initialize the cache, allocating the required storage * @dst_cache: the cache diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index be74ab4551c2..0ccfd5fa5cb9 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -162,3 +162,22 @@ void dst_cache_destroy(struct dst_cache *dst_cache) free_percpu(dst_cache->cache); } EXPORT_SYMBOL_GPL(dst_cache_destroy); + +void dst_cache_reset_now(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + dst_cache->reset_ts = jiffies; + for_each_possible_cpu(i) { + struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); + struct dst_entry *dst = idst->dst; + + idst->cookie = 0; + idst->dst = NULL; + dst_release(dst); + } +} +EXPORT_SYMBOL_GPL(dst_cache_reset_now); diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 2e5c1630885e..8a9461aa0878 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -613,6 +613,28 @@ ip0 link set wg0 up kill $ncat_pid ip0 link del wg0 +# Ensure that dst_cache references don't outlive netns lifetime +ip1 link add dev wg0 type wireguard +ip2 link add dev wg0 type wireguard +configure_peers +ip1 link add veth1 type veth peer name veth2 +ip1 link set veth2 netns $netns2 +ip1 addr add fd00:aa::1/64 dev veth1 +ip2 addr add fd00:aa::2/64 dev veth2 +ip1 link set veth1 up +ip2 link set veth2 up +waitiface $netns1 veth1 +waitiface $netns2 veth2 +ip1 -6 route add default dev veth1 via fd00:aa::2 +ip2 -6 route add default dev veth2 via fd00:aa::1 +n1 wg set wg0 peer "$pub2" endpoint [fd00:aa::2]:2 +n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::1]:1 +n1 ping6 -c 1 fd00::2 +pp ip netns delete $netns1 +pp ip netns delete $netns2 +pp ip netns add $netns1 +pp ip netns add $netns2 + # Ensure there aren't circular reference loops ip1 link add wg1 type wireguard ip2 link add wg2 type wireguard @@ -631,7 +653,7 @@ while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do done < /dev/kmsg alldeleted=1 for object in "${!objects[@]}"; do - if [[ ${objects["$object"]} != *createddestroyed ]]; then + if [[ ${objects["$object"]} != *createddestroyed && ${objects["$object"]} != *createdcreateddestroyeddestroyed ]]; then echo "Error: $object: merely ${objects["$object"]}" >&3 alldeleted=0 fi -- cgit v1.2.3-71-gd317 From f7e5b9bfa6c8820407b64eabc1f29c9a87e8993d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Nov 2021 10:39:29 -0500 Subject: siphash: use _unaligned version by default On ARM v6 and later, we define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS because the ordinary load/store instructions (ldr, ldrh, ldrb) can tolerate any misalignment of the memory address. However, load/store double and load/store multiple instructions (ldrd, ldm) may still only be used on memory addresses that are 32-bit aligned, and so we have to use the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS macro with care, or we may end up with a severe performance hit due to alignment traps that require fixups by the kernel. Testing shows that this currently happens with clang-13 but not gcc-11. In theory, any compiler version can produce this bug or other problems, as we are dealing with undefined behavior in C99 even on architectures that support this in hardware, see also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363. Fortunately, the get_unaligned() accessors do the right thing: when building for ARMv6 or later, the compiler will emit unaligned accesses using the ordinary load/store instructions (but avoid the ones that require 32-bit alignment). When building for older ARM, those accessors will emit the appropriate sequence of ldrb/mov/orr instructions. And on architectures that can truly tolerate any kind of misalignment, the get_unaligned() accessors resolve to the leXX_to_cpup accessors that operate on aligned addresses. Since the compiler will in fact emit ldrd or ldm instructions when building this code for ARM v6 or later, the solution is to use the unaligned accessors unconditionally on architectures where this is known to be fast. The _aligned version of the hash function is however still needed to get the best performance on architectures that cannot do any unaligned access in hardware. This new version avoids the undefined behavior and should produce the fastest hash on all architectures we support. Link: https://lore.kernel.org/linux-arm-kernel/20181008211554.5355-4-ard.biesheuvel@linaro.org/ Link: https://lore.kernel.org/linux-crypto/CAK8P3a2KfmmGDbVHULWevB0hv71P2oi2ZCHEAqT=8dQfa0=cqQ@mail.gmail.com/ Reported-by: Ard Biesheuvel Fixes: 2c956a60778c ("siphash: add cryptographically secure PRF") Signed-off-by: Arnd Bergmann Reviewed-by: Jason A. Donenfeld Acked-by: Ard Biesheuvel Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- include/linux/siphash.h | 14 ++++---------- lib/siphash.c | 12 ++++++------ 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/siphash.h b/include/linux/siphash.h index bf21591a9e5e..0cda61855d90 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -27,9 +27,7 @@ static inline bool siphash_key_is_zero(const siphash_key_t *key) } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); -#endif u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); @@ -82,10 +80,9 @@ static inline u64 ___siphash_aligned(const __le64 *data, size_t len, static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); -#endif return ___siphash_aligned(data, len, key); } @@ -96,10 +93,8 @@ typedef struct { u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); -#endif u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); @@ -135,10 +130,9 @@ static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); -#endif return ___hsiphash_aligned(data, len, key); } diff --git a/lib/siphash.c b/lib/siphash.c index a90112ee72a1..72b9068ab57b 100644 --- a/lib/siphash.c +++ b/lib/siphash.c @@ -49,6 +49,7 @@ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -80,8 +81,8 @@ u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -113,7 +114,6 @@ u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); -#endif /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 @@ -250,6 +250,7 @@ EXPORT_SYMBOL(siphash_3u32); HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -280,8 +281,8 @@ u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { @@ -313,7 +314,6 @@ u32 __hsiphash_unaligned(const void *data, size_t len, HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); -#endif /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 @@ -418,6 +418,7 @@ EXPORT_SYMBOL(hsiphash_4u32); HSIPROUND; \ return v1 ^ v3; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); @@ -438,8 +439,8 @@ u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { @@ -461,7 +462,6 @@ u32 __hsiphash_unaligned(const void *data, size_t len, HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); -#endif /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 -- cgit v1.2.3-71-gd317 From 79364031c5b4365ca28ac0fa00acfab5bf465be1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 27 Nov 2021 17:32:00 +0100 Subject: bpf: Make sure bpf_disable_instrumentation() is safe vs preemption. The initial implementation of migrate_disable() for mainline was a wrapper around preempt_disable(). RT kernels substituted this with a real migrate disable implementation. Later on mainline gained true migrate disable support, but neither documentation nor affected code were updated. Remove stale comments claiming that migrate_disable() is PREEMPT_RT only. Don't use __this_cpu_inc() in the !PREEMPT_RT path because preemption is not disabled and the RMW operation can be preempted. Fixes: 74d862b682f51 ("sched: Make migrate_disable/enable() independent of RT") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211127163200.10466-3-bigeasy@linutronix.de --- include/linux/bpf.h | 16 ++-------------- include/linux/filter.h | 3 --- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 84ff6ef49462..755f38e893be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1353,28 +1353,16 @@ extern struct mutex bpf_stats_enabled_mutex; * kprobes, tracepoints) to prevent deadlocks on map operations as any of * these events can happen inside a region which holds a map bucket lock * and can deadlock on it. - * - * Use the preemption safe inc/dec variants on RT because migrate disable - * is preemptible on RT and preemption in the middle of the RMW operation - * might lead to inconsistent state. Use the raw variants for non RT - * kernels as migrate_disable() maps to preempt_disable() so the slightly - * more expensive save operation can be avoided. */ static inline void bpf_disable_instrumentation(void) { migrate_disable(); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - this_cpu_inc(bpf_prog_active); - else - __this_cpu_inc(bpf_prog_active); + this_cpu_inc(bpf_prog_active); } static inline void bpf_enable_instrumentation(void) { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - this_cpu_dec(bpf_prog_active); - else - __this_cpu_dec(bpf_prog_active); + this_cpu_dec(bpf_prog_active); migrate_enable(); } diff --git a/include/linux/filter.h b/include/linux/filter.h index 24b7ed2677af..534f678ca50f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -640,9 +640,6 @@ static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. - * - * For non RT enabled kernels migrate_disable/enable() maps to - * preempt_disable/enable(), i.e. it disables also preemption. */ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) -- cgit v1.2.3-71-gd317 From 502e82b91361955c66c8453b5b7a905b0b5bd5a1 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 7 Nov 2021 17:21:45 +0200 Subject: net/mlx5: Fix access to a non-supported register Validate MRTC register is supported before triggering a delayed work which accesses it. Fixes: 5a1023deeed0 ("net/mlx5: Add periodic update of host time to firmware") Signed-off-by: Aya Levin Reviewed-by: Gal Pressman Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 8 +++----- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 380f50d5462d..3ca998874c50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -836,7 +836,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms); add_timer(&health->timer); - if (mlx5_core_is_pf(dev)) + if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc)) queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e127c0530b3a..7df9c7f8d9c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1071,18 +1071,16 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot) mlx5_set_driver_version(dev); - mlx5_start_health_poll(dev); - err = mlx5_query_hca_caps(dev); if (err) { mlx5_core_err(dev, "query hca failed\n"); - goto stop_health; + goto reclaim_boot_pages; } + mlx5_start_health_poll(dev); + return 0; -stop_health: - mlx5_stop_health_poll(dev, boot); reclaim_boot_pages: mlx5_reclaim_startup_pages(dev); err_disable_hca: diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3636df90899a..fbaab440a484 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9698,7 +9698,10 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_84_to_68[0x11]; u8 tracer_registers[0x4]; - u8 regs_63_to_32[0x20]; + u8 regs_63_to_46[0x12]; + u8 mrtc[0x1]; + u8 regs_44_to_32[0xd]; + u8 regs_31_to_0[0x20]; }; -- cgit v1.2.3-71-gd317 From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 1 Dec 2021 23:45:50 +0900 Subject: kprobes: Limit max data_size of the kretprobe instances The 'kprobe::data_size' is unsigned, thus it can not be negative. But if user sets it enough big number (e.g. (size_t)-8), the result of 'data_size + sizeof(struct kretprobe_instance)' becomes smaller than sizeof(struct kretprobe_instance) or zero. In result, the kretprobe_instance are allocated without enough memory, and kretprobe accesses outside of allocated memory. To avoid this issue, introduce a max limitation of the kretprobe::data_size. 4KB per instance should be OK. Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler") Reported-by: zhangyue Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e974caf39d3e..8c8f7a4d93af 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -153,6 +153,8 @@ struct kretprobe { struct kretprobe_holder *rph; }; +#define KRETPROBE_MAX_DATA_SIZE 4096 + struct kretprobe_instance { union { struct freelist_node freelist; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e9db0c810554..21eccc961bba 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2086,6 +2086,9 @@ int register_kretprobe(struct kretprobe *rp) } } + if (rp->data_size > KRETPROBE_MAX_DATA_SIZE) + return -E2BIG; + rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; -- cgit v1.2.3-71-gd317 From 7a10d8c810cfad3e79372d7d1c77899d86cd6662 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Nov 2021 09:01:55 -0800 Subject: net: annotate data-races on txq->xmit_lock_owner syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner without annotations. No serious issue there, let's document what is happening there. BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0: __netif_tx_unlock include/linux/netdevice.h:4437 [inline] __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_hh_output include/net/neighbour.h:511 [inline] neigh_output include/net/neighbour.h:525 [inline] ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1: __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523 neigh_output include/net/neighbour.h:527 [inline] ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443 folio_test_anon include/linux/page-flags.h:581 [inline] PageAnon include/linux/page-flags.h:586 [inline] zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347 zap_pmd_range mm/memory.c:1467 [inline] zap_pud_range mm/memory.c:1496 [inline] zap_p4d_range mm/memory.c:1517 [inline] unmap_page_range+0x2dc/0x3d0 mm/memory.c:1538 unmap_single_vma+0x157/0x210 mm/memory.c:1583 unmap_vmas+0xd0/0x180 mm/memory.c:1615 exit_mmap+0x23d/0x470 mm/mmap.c:3170 __mmput+0x27/0x1b0 kernel/fork.c:1113 mmput+0x3d/0x50 kernel/fork.c:1134 exit_mm+0xdb/0x170 kernel/exit.c:507 do_exit+0x608/0x17a0 kernel/exit.c:819 do_group_exit+0xce/0x180 kernel/exit.c:929 get_signal+0xfc3/0x1550 kernel/signal.c:2852 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0xffffffff Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G W 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++------ net/core/dev.c | 5 ++++- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..be5cb3360b94 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4404,7 +4404,8 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, cpu); } static inline bool __netif_tx_acquire(struct netdev_queue *txq) @@ -4421,26 +4422,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); - if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + + if (likely(ok)) { + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); + } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } diff --git a/net/core/dev.c b/net/core/dev.c index 15ac064b5562..2a352e668d10 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4210,7 +4210,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - if (txq->xmit_lock_owner != cpu) { + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; -- cgit v1.2.3-71-gd317 From a37a0ee4d25c152a087c5e913b76f207eaebdb54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Nov 2021 10:29:39 -0800 Subject: net: avoid uninit-value from tcp_conn_request A recent change triggers a KMSAN warning, because request sockets do not initialize @sk_rx_queue_mapping field. Add sk_rx_queue_update() helper to make our intent clear. BUG: KMSAN: uninit-value in sk_rx_queue_set include/net/sock.h:1922 [inline] BUG: KMSAN: uninit-value in tcp_conn_request+0x3bcc/0x4dc0 net/ipv4/tcp_input.c:6922 sk_rx_queue_set include/net/sock.h:1922 [inline] tcp_conn_request+0x3bcc/0x4dc0 net/ipv4/tcp_input.c:6922 tcp_v4_conn_request+0x218/0x2a0 net/ipv4/tcp_ipv4.c:1528 tcp_rcv_state_process+0x2c5/0x3290 net/ipv4/tcp_input.c:6406 tcp_v4_do_rcv+0xb4e/0x1330 net/ipv4/tcp_ipv4.c:1738 tcp_v4_rcv+0x468d/0x4ed0 net/ipv4/tcp_ipv4.c:2100 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:460 [inline] ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline] ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline] ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline] __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696 gro_normal_list net/core/dev.c:5850 [inline] napi_complete_done+0x579/0xdd0 net/core/dev.c:6587 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020 napi_poll net/core/dev.c:7087 [inline] net_rx_action+0x824/0x1880 net/core/dev.c:7174 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558 invoke_softirq+0xa4/0x130 kernel/softirq.c:432 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x76/0x130 kernel/softirq.c:648 common_interrupt+0xb6/0xd0 arch/x86/kernel/irq.c:240 asm_common_interrupt+0x1e/0x40 smap_restore arch/x86/include/asm/smap.h:67 [inline] get_shadow_origin_ptr mm/kmsan/instrumentation.c:31 [inline] __msan_metadata_ptr_for_load_1+0x28/0x30 mm/kmsan/instrumentation.c:63 tomoyo_check_acl+0x1b0/0x630 security/tomoyo/domain.c:173 tomoyo_path_permission security/tomoyo/file.c:586 [inline] tomoyo_check_open_permission+0x61f/0xe10 security/tomoyo/file.c:777 tomoyo_file_open+0x24f/0x2d0 security/tomoyo/tomoyo.c:311 security_file_open+0xb1/0x1f0 security/security.c:1635 do_dentry_open+0x4e4/0x1bf0 fs/open.c:809 vfs_open+0xaf/0xe0 fs/open.c:957 do_open fs/namei.c:3426 [inline] path_openat+0x52f1/0x5dd0 fs/namei.c:3559 do_filp_open+0x306/0x760 fs/namei.c:3586 do_sys_openat2+0x263/0x8f0 fs/open.c:1212 do_sys_open fs/open.c:1228 [inline] __do_sys_open fs/open.c:1236 [inline] __se_sys_open fs/open.c:1232 [inline] __x64_sys_open+0x314/0x380 fs/open.c:1232 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: __alloc_pages+0xbc7/0x10a0 mm/page_alloc.c:5409 alloc_pages+0x8a5/0xb80 alloc_slab_page mm/slub.c:1810 [inline] allocate_slab+0x287/0x1c20 mm/slub.c:1947 new_slab mm/slub.c:2010 [inline] ___slab_alloc+0xbdf/0x1e90 mm/slub.c:3039 __slab_alloc mm/slub.c:3126 [inline] slab_alloc_node mm/slub.c:3217 [inline] slab_alloc mm/slub.c:3259 [inline] kmem_cache_alloc+0xbb3/0x11c0 mm/slub.c:3264 reqsk_alloc include/net/request_sock.h:91 [inline] inet_reqsk_alloc+0xaf/0x8b0 net/ipv4/tcp_input.c:6712 tcp_conn_request+0x910/0x4dc0 net/ipv4/tcp_input.c:6852 tcp_v4_conn_request+0x218/0x2a0 net/ipv4/tcp_ipv4.c:1528 tcp_rcv_state_process+0x2c5/0x3290 net/ipv4/tcp_input.c:6406 tcp_v4_do_rcv+0xb4e/0x1330 net/ipv4/tcp_ipv4.c:1738 tcp_v4_rcv+0x468d/0x4ed0 net/ipv4/tcp_ipv4.c:2100 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:460 [inline] ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline] ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline] ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline] __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696 gro_normal_list net/core/dev.c:5850 [inline] napi_complete_done+0x579/0xdd0 net/core/dev.c:6587 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020 napi_poll net/core/dev.c:7087 [inline] net_rx_action+0x824/0x1880 net/core/dev.c:7174 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558 Fixes: 342159ee394d ("net: avoid dirtying sk->sk_rx_queue_mapping") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211130182939.2584764-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 2 +- include/net/sock.h | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 4202c609bb0b..7994455ec714 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -133,7 +133,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif - sk_rx_queue_set(sk, skb); + sk_rx_queue_update(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) diff --git a/include/net/sock.h b/include/net/sock.h index 715cdb4b2b79..bea21ff70e74 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1913,18 +1913,31 @@ static inline int sk_tx_queue_get(const struct sock *sk) return -1; } -static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) +static inline void __sk_rx_queue_set(struct sock *sk, + const struct sk_buff *skb, + bool force_set) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); - if (unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) + if (force_set || + unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue); } #endif } +static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) +{ + __sk_rx_queue_set(sk, skb, true); +} + +static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb) +{ + __sk_rx_queue_set(sk, skb, false); +} + static inline void sk_rx_queue_clear(struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING -- cgit v1.2.3-71-gd317 From 213f5f8f31f10aa1e83187ae20fb7fa4e626b724 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Dec 2021 18:26:35 -0800 Subject: ipv4: convert fib_num_tclassid_users to atomic_t Before commit faa041a40b9f ("ipv4: Create cleanup helper for fib_nh") changes to net->ipv4.fib_num_tclassid_users were protected by RTNL. After the change, this is no longer the case, as free_fib_info_rcu() runs after rcu grace period, without rtnl being held. Fixes: faa041a40b9f ("ipv4: Create cleanup helper for fib_nh") Signed-off-by: Eric Dumazet Cc: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- include/net/netns/ipv4.h | 2 +- net/ipv4/fib_frontend.c | 2 +- net/ipv4/fib_rules.c | 4 ++-- net/ipv4/fib_semantics.c | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ab5348e57db1..3417ba2d27ad 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -438,7 +438,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { - return net->ipv4.fib_num_tclassid_users; + return atomic_read(&net->ipv4.fib_num_tclassid_users); } #else static inline int fib_num_tclassid_users(struct net *net) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2f65701a43c9..6c5b2efc4f17 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -65,7 +65,7 @@ struct netns_ipv4 { bool fib_has_custom_local_routes; bool fib_offload_disabled; #ifdef CONFIG_IP_ROUTE_CLASSID - int fib_num_tclassid_users; + atomic_t fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; struct sock *fibnl; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 9fe13e4f5d08..4d61ddd8a0ec 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1582,7 +1582,7 @@ static int __net_init fib_net_init(struct net *net) int error; #ifdef CONFIG_IP_ROUTE_CLASSID - net->ipv4.fib_num_tclassid_users = 0; + atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 364ad3446b2f..d279cb8ac158 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -264,7 +264,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); if (rule4->tclassid) - net->ipv4.fib_num_tclassid_users++; + atomic_inc(&net->ipv4.fib_num_tclassid_users); } #endif @@ -296,7 +296,7 @@ static int fib4_rule_delete(struct fib_rule *rule) #ifdef CONFIG_IP_ROUTE_CLASSID if (((struct fib4_rule *)rule)->tclassid) - net->ipv4.fib_num_tclassid_users--; + atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif net->ipv4.fib_has_custom_rules = true; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3364cb9c67e0..fde7797b5806 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -220,7 +220,7 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh) { #ifdef CONFIG_IP_ROUTE_CLASSID if (fib_nh->nh_tclassid) - net->ipv4.fib_num_tclassid_users--; + atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif fib_nh_common_release(&fib_nh->nh_common); } @@ -632,7 +632,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; if (nh->nh_tclassid) - net->ipv4.fib_num_tclassid_users++; + atomic_inc(&net->ipv4.fib_num_tclassid_users); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight = nh_weight; -- cgit v1.2.3-71-gd317 From 72f6a45202f20f0e1a46b0acb7803369cc53d0b8 Mon Sep 17 00:00:00 2001 From: Xiayu Zhang Date: Wed, 1 Dec 2021 10:57:13 +0800 Subject: Fix Comment of ETH_P_802_3_MIN The description of ETH_P_802_3_MIN is misleading. The value of EthernetType in Ethernet II frame is more than 0x0600, the value of Length in 802.3 frame is less than 0x0600. Signed-off-by: Xiayu Zhang Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 5da4ee234e0b..c0c2f3ed5729 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -117,7 +117,7 @@ #define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ -#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is less than this value +#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is more than this value * then the frame is Ethernet II. Else it is 802.3 */ /* -- cgit v1.2.3-71-gd317 From e7f2be115f0746b969c0df14c0d182f65f005ca5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Oct 2021 16:10:55 +0200 Subject: sched/cputime: Fix getrusage(RUSAGE_THREAD) with nohz_full getrusage(RUSAGE_THREAD) with nohz_full may return shorter utime/stime than the actual time. task_cputime_adjusted() snapshots utime and stime and then adjust their sum to match the scheduler maintained cputime.sum_exec_runtime. Unfortunately in nohz_full, sum_exec_runtime is only updated once per second in the worst case, causing a discrepancy against utime and stime that can be updated anytime by the reader using vtime. To fix this situation, perform an update of cputime.sum_exec_runtime when the cputime snapshot reports the task as actually running while the tick is disabled. The related overhead is then contained within the relevant situations. Reported-by: Hasegawa Hitomi Signed-off-by: Frederic Weisbecker Signed-off-by: Hasegawa Hitomi Signed-off-by: Thomas Gleixner Tested-by: Masayoshi Mizuma Acked-by: Phil Auld Link: https://lore.kernel.org/r/20211026141055.57358-3-frederic@kernel.org --- include/linux/sched/cputime.h | 5 +++-- kernel/sched/cputime.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6c9f19a33865..ce3c58286062 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -18,15 +18,16 @@ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, +extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); extern u64 task_gtime(struct task_struct *t); #else -static inline void task_cputime(struct task_struct *t, +static inline bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { *utime = t->utime; *stime = t->stime; + return false; } static inline u64 task_gtime(struct task_struct *t) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 872e481d5098..9392aea1804e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -615,7 +615,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) .sum_exec_runtime = p->se.sum_exec_runtime, }; - task_cputime(p, &cputime.utime, &cputime.stime); + if (task_cputime(p, &cputime.utime, &cputime.stime)) + cputime.sum_exec_runtime = task_sched_runtime(p); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } EXPORT_SYMBOL_GPL(task_cputime_adjusted); @@ -828,19 +829,21 @@ u64 task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) +bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { struct vtime *vtime = &t->vtime; unsigned int seq; u64 delta; + int ret; if (!vtime_accounting_enabled()) { *utime = t->utime; *stime = t->stime; - return; + return false; } do { + ret = false; seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; @@ -850,6 +853,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (vtime->state < VTIME_SYS) continue; + ret = true; delta = vtime_delta(vtime); /* @@ -861,6 +865,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) else *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return ret; } static int vtime_state_fetch(struct vtime *vtime, int cpu) -- cgit v1.2.3-71-gd317 From f83baa0cb6cfc92ebaf7f9d3a99d7e34f2e77a8a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Dec 2021 19:35:01 +0100 Subject: HID: add hid_is_usb() function to make it simpler for USB detection A number of HID drivers already call hid_is_using_ll_driver() but only for the detection of if this is a USB device or not. Make this more obvious by creating hid_is_usb() and calling the function that way. Also converts the existing hid_is_using_ll_driver() functions to use the new call. Cc: Jiri Kosina Cc: Benjamin Tissoires Cc: linux-input@vger.kernel.org Cc: stable@vger.kernel.org Tested-by: Benjamin Tissoires Signed-off-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20211201183503.2373082-1-gregkh@linuxfoundation.org --- drivers/hid/hid-asus.c | 6 ++---- drivers/hid/hid-logitech-dj.c | 2 +- drivers/hid/hid-u2fzero.c | 2 +- drivers/hid/hid-uclogic-params.c | 3 +-- drivers/hid/wacom_sys.c | 2 +- include/linux/hid.h | 5 +++++ 6 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index f3ecddc519ee..08c9a9a60ae4 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -1028,8 +1028,7 @@ static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id) if (drvdata->quirks & QUIRK_IS_MULTITOUCH) drvdata->tp = &asus_i2c_tp; - if ((drvdata->quirks & QUIRK_T100_KEYBOARD) && - hid_is_using_ll_driver(hdev, &usb_hid_driver)) { + if ((drvdata->quirks & QUIRK_T100_KEYBOARD) && hid_is_usb(hdev)) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); if (intf->altsetting->desc.bInterfaceNumber == T100_TPAD_INTF) { @@ -1057,8 +1056,7 @@ static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id) drvdata->tp = &asus_t100chi_tp; } - if ((drvdata->quirks & QUIRK_MEDION_E1239T) && - hid_is_using_ll_driver(hdev, &usb_hid_driver)) { + if ((drvdata->quirks & QUIRK_MEDION_E1239T) && hid_is_usb(hdev)) { struct usb_host_interface *alt = to_usb_interface(hdev->dev.parent)->altsetting; diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index a0017b010c34..7106b921b53c 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -1777,7 +1777,7 @@ static int logi_dj_probe(struct hid_device *hdev, case recvr_type_bluetooth: no_dj_interfaces = 2; break; case recvr_type_dinovo: no_dj_interfaces = 2; break; } - if (hid_is_using_ll_driver(hdev, &usb_hid_driver)) { + if (hid_is_usb(hdev)) { intf = to_usb_interface(hdev->dev.parent); if (intf && intf->altsetting->desc.bInterfaceNumber >= no_dj_interfaces) { diff --git a/drivers/hid/hid-u2fzero.c b/drivers/hid/hid-u2fzero.c index 31ea7fc69916..ad489caf53ad 100644 --- a/drivers/hid/hid-u2fzero.c +++ b/drivers/hid/hid-u2fzero.c @@ -311,7 +311,7 @@ static int u2fzero_probe(struct hid_device *hdev, unsigned int minor; int ret; - if (!hid_is_using_ll_driver(hdev, &usb_hid_driver)) + if (!hid_is_usb(hdev)) return -EINVAL; dev = devm_kzalloc(&hdev->dev, sizeof(*dev), GFP_KERNEL); diff --git a/drivers/hid/hid-uclogic-params.c b/drivers/hid/hid-uclogic-params.c index 3d67b748a3b9..adff1bd68d9f 100644 --- a/drivers/hid/hid-uclogic-params.c +++ b/drivers/hid/hid-uclogic-params.c @@ -843,8 +843,7 @@ int uclogic_params_init(struct uclogic_params *params, struct uclogic_params p = {0, }; /* Check arguments */ - if (params == NULL || hdev == NULL || - !hid_is_using_ll_driver(hdev, &usb_hid_driver)) { + if (params == NULL || hdev == NULL || !hid_is_usb(hdev)) { rc = -EINVAL; goto cleanup; } diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 2717d39600b4..22d73772fbc5 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2214,7 +2214,7 @@ static void wacom_update_name(struct wacom *wacom, const char *suffix) if ((features->type == HID_GENERIC) && !strcmp("Wacom HID", features->name)) { char *product_name = wacom->hdev->name; - if (hid_is_using_ll_driver(wacom->hdev, &usb_hid_driver)) { + if (hid_is_usb(wacom->hdev)) { struct usb_interface *intf = to_usb_interface(wacom->hdev->dev.parent); struct usb_device *dev = interface_to_usbdev(intf); product_name = dev->product; diff --git a/include/linux/hid.h b/include/linux/hid.h index 9e067f937dbc..f453be385bd4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -840,6 +840,11 @@ static inline bool hid_is_using_ll_driver(struct hid_device *hdev, return hdev->ll_driver == driver; } +static inline bool hid_is_usb(struct hid_device *hdev) +{ + return hid_is_using_ll_driver(hdev, &usb_hid_driver); +} + #define PM_HINT_FULLON 1<<5 #define PM_HINT_NORMAL 1<<1 -- cgit v1.2.3-71-gd317 From d9847eb8be3d895b2b5f514fdf3885d47a0b92a2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 22 Nov 2021 20:17:40 +0530 Subject: bpf: Make CONFIG_DEBUG_INFO_BTF depend upon CONFIG_BPF_SYSCALL Vinicius Costa Gomes reported [0] that build fails when CONFIG_DEBUG_INFO_BTF is enabled and CONFIG_BPF_SYSCALL is disabled. This leads to btf.c not being compiled, and then no symbol being present in vmlinux for the declarations in btf.h. Since BTF is not useful without enabling BPF subsystem, disallow this combination. However, theoretically disabling both now could still fail, as the symbol for kfunc_btf_id_list variables is not available. This isn't a problem as the compiler usually optimizes the whole register/unregister call, but at lower optimization levels it can fail the build in linking stage. Fix that by adding dummy variables so that modules taking address of them still work, but the whole thing is a noop. [0]: https://lore.kernel.org/bpf/20211110205418.332403-1-vinicius.gomes@intel.com Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration") Reported-by: Vinicius Costa Gomes Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122144742.477787-2-memxor@gmail.com --- include/linux/btf.h | 14 ++++++++++---- kernel/bpf/btf.c | 9 ++------- lib/Kconfig.debug | 1 + 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..0e1b6281fd8f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -245,7 +245,10 @@ struct kfunc_btf_id_set { struct module *owner; }; -struct kfunc_btf_id_list; +struct kfunc_btf_id_list { + struct list_head list; + struct mutex mutex; +}; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -254,6 +257,9 @@ void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s); bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, struct module *owner); + +extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; +extern struct kfunc_btf_id_list prog_test_kfunc_list; #else static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s) @@ -268,13 +274,13 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, { return false; } + +static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused; +static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused; #endif #define DEFINE_KFUNC_BTF_ID_SET(set, name) \ struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ THIS_MODULE } -extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; -extern struct kfunc_btf_id_list prog_test_kfunc_list; - #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbc3ad07e21b..ea3df9867cec 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6346,11 +6346,6 @@ BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) /* BTF ID set registration API for modules */ -struct kfunc_btf_id_list { - struct list_head list; - struct mutex mutex; -}; - #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -6389,8 +6384,6 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, return false; } -#endif - #define DEFINE_KFUNC_BTF_ID_LIST(name) \ struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ __MUTEX_INITIALIZER(name.mutex) }; \ @@ -6398,3 +6391,5 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + +#endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9ef7ce18b4f5..596bb5e4790c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -316,6 +316,7 @@ config DEBUG_INFO_BTF bool "Generate BTF typeinfo" depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST + depends on BPF_SYSCALL help Generate deduplicated BTF type information from DWARF debug info. Turning this on expects presence of pahole tool, which will convert -- cgit v1.2.3-71-gd317 From 03cfda4fa6ea9bea2f30160579a78c2b8c1e616e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Dec 2021 15:37:24 -0800 Subject: tcp: fix another uninit-value (sk_rx_queue_mapping) KMSAN is still not happy [1]. I missed that passive connections do not inherit their sk_rx_queue_mapping values from the request socket, but instead tcp_child_process() is calling sk_mark_napi_id(child, skb) We have many sk_mark_napi_id() callers, so I am providing a new helper, forcing the setting sk_rx_queue_mapping and sk_napi_id. Note that we had no KMSAN report for sk_napi_id because passive connections got a copy of this field from the listener. sk_rx_queue_mapping in the other hand is inside the sk_dontcopy_begin/sk_dontcopy_end so sk_clone_lock() leaves this field uninitialized. We might remove dead code populating req->sk_rx_queue_mapping in the future. [1] BUG: KMSAN: uninit-value in __sk_rx_queue_set include/net/sock.h:1924 [inline] BUG: KMSAN: uninit-value in sk_rx_queue_update include/net/sock.h:1938 [inline] BUG: KMSAN: uninit-value in sk_mark_napi_id include/net/busy_poll.h:136 [inline] BUG: KMSAN: uninit-value in tcp_child_process+0xb42/0x1050 net/ipv4/tcp_minisocks.c:833 __sk_rx_queue_set include/net/sock.h:1924 [inline] sk_rx_queue_update include/net/sock.h:1938 [inline] sk_mark_napi_id include/net/busy_poll.h:136 [inline] tcp_child_process+0xb42/0x1050 net/ipv4/tcp_minisocks.c:833 tcp_v4_rcv+0x3d83/0x4ed0 net/ipv4/tcp_ipv4.c:2066 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:460 [inline] ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline] ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline] ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline] __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696 gro_normal_list net/core/dev.c:5850 [inline] napi_complete_done+0x579/0xdd0 net/core/dev.c:6587 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020 napi_poll net/core/dev.c:7087 [inline] net_rx_action+0x824/0x1880 net/core/dev.c:7174 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558 run_ksoftirqd+0x33/0x50 kernel/softirq.c:920 smpboot_thread_fn+0x616/0xbf0 kernel/smpboot.c:164 kthread+0x721/0x850 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 Uninit was created at: __alloc_pages+0xbc7/0x10a0 mm/page_alloc.c:5409 alloc_pages+0x8a5/0xb80 alloc_slab_page mm/slub.c:1810 [inline] allocate_slab+0x287/0x1c20 mm/slub.c:1947 new_slab mm/slub.c:2010 [inline] ___slab_alloc+0xbdf/0x1e90 mm/slub.c:3039 __slab_alloc mm/slub.c:3126 [inline] slab_alloc_node mm/slub.c:3217 [inline] slab_alloc mm/slub.c:3259 [inline] kmem_cache_alloc+0xbb3/0x11c0 mm/slub.c:3264 sk_prot_alloc+0xeb/0x570 net/core/sock.c:1914 sk_clone_lock+0xd6/0x1940 net/core/sock.c:2118 inet_csk_clone_lock+0x8d/0x6a0 net/ipv4/inet_connection_sock.c:956 tcp_create_openreq_child+0xb1/0x1ef0 net/ipv4/tcp_minisocks.c:453 tcp_v4_syn_recv_sock+0x268/0x2710 net/ipv4/tcp_ipv4.c:1563 tcp_check_req+0x207c/0x2a30 net/ipv4/tcp_minisocks.c:765 tcp_v4_rcv+0x36f5/0x4ed0 net/ipv4/tcp_ipv4.c:2047 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:460 [inline] ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline] ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline] ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline] __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696 gro_normal_list net/core/dev.c:5850 [inline] napi_complete_done+0x579/0xdd0 net/core/dev.c:6587 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020 napi_poll net/core/dev.c:7087 [inline] net_rx_action+0x824/0x1880 net/core/dev.c:7174 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558 Fixes: 342159ee394d ("net: avoid dirtying sk->sk_rx_queue_mapping") Fixes: a37a0ee4d25c ("net: avoid uninit-value from tcp_conn_request") Signed-off-by: Eric Dumazet Reported-by: syzbot Tested-by: Alexander Potapenko Signed-off-by: David S. Miller --- include/net/busy_poll.h | 13 +++++++++++++ net/ipv4/tcp_minisocks.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 7994455ec714..c4898fcbf923 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -136,6 +136,19 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) sk_rx_queue_update(sk, skb); } +/* Variant of sk_mark_napi_id() for passive flow setup, + * as sk->sk_napi_id and sk->sk_rx_queue_mapping content + * needs to be set. + */ +static inline void sk_mark_napi_id_set(struct sock *sk, + const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + WRITE_ONCE(sk->sk_napi_id, skb->napi_id); +#endif + sk_rx_queue_set(sk, skb); +} + static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cf913a66df17..7c2d3ac2363a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -829,8 +829,8 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; - /* record NAPI ID of child */ - sk_mark_napi_id(child, skb); + /* record sk_napi_id and sk_rx_queue_mapping of child. */ + sk_mark_napi_id_set(child, skb); tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { -- cgit v1.2.3-71-gd317 From dac8e00fb640e9569cdeefd3ce8a75639e5d0711 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Dec 2021 18:27:18 -0800 Subject: bonding: make tx_rebalance_counter an atomic KCSAN reported a data-race [1] around tx_rebalance_counter which can be accessed from different contexts, without the protection of a lock/mutex. [1] BUG: KCSAN: data-race in bond_alb_init_slave / bond_alb_monitor write to 0xffff888157e8ca24 of 4 bytes by task 7075 on cpu 0: bond_alb_init_slave+0x713/0x860 drivers/net/bonding/bond_alb.c:1613 bond_enslave+0xd94/0x3010 drivers/net/bonding/bond_main.c:1949 do_set_master net/core/rtnetlink.c:2521 [inline] __rtnl_newlink net/core/rtnetlink.c:3475 [inline] rtnl_newlink+0x1298/0x13b0 net/core/rtnetlink.c:3506 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5571 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2491 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5589 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x5fc/0x6c0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x6e1/0x7d0 net/netlink/af_netlink.c:1916 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg net/socket.c:724 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2409 ___sys_sendmsg net/socket.c:2463 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2492 __do_sys_sendmsg net/socket.c:2501 [inline] __se_sys_sendmsg net/socket.c:2499 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2499 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888157e8ca24 of 4 bytes by task 1082 on cpu 1: bond_alb_monitor+0x8f/0xc00 drivers/net/bonding/bond_alb.c:1511 process_one_work+0x3fc/0x980 kernel/workqueue.c:2298 worker_thread+0x616/0xa70 kernel/workqueue.c:2445 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x00000001 -> 0x00000064 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 1082 Comm: kworker/u4:3 Not tainted 5.16.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: bond1 bond_alb_monitor Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- drivers/net/bonding/bond_alb.c | 14 ++++++++------ include/net/bond_alb.h | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 2ec8e015c7b3..533e476988f2 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1501,14 +1501,14 @@ void bond_alb_monitor(struct work_struct *work) struct slave *slave; if (!bond_has_slaves(bond)) { - bond_info->tx_rebalance_counter = 0; + atomic_set(&bond_info->tx_rebalance_counter, 0); bond_info->lp_counter = 0; goto re_arm; } rcu_read_lock(); - bond_info->tx_rebalance_counter++; + atomic_inc(&bond_info->tx_rebalance_counter); bond_info->lp_counter++; /* send learning packets */ @@ -1530,7 +1530,7 @@ void bond_alb_monitor(struct work_struct *work) } /* rebalance tx traffic */ - if (bond_info->tx_rebalance_counter >= BOND_TLB_REBALANCE_TICKS) { + if (atomic_read(&bond_info->tx_rebalance_counter) >= BOND_TLB_REBALANCE_TICKS) { bond_for_each_slave_rcu(bond, slave, iter) { tlb_clear_slave(bond, slave, 1); if (slave == rcu_access_pointer(bond->curr_active_slave)) { @@ -1540,7 +1540,7 @@ void bond_alb_monitor(struct work_struct *work) bond_info->unbalanced_load = 0; } } - bond_info->tx_rebalance_counter = 0; + atomic_set(&bond_info->tx_rebalance_counter, 0); } if (bond_info->rlb_enabled) { @@ -1610,7 +1610,8 @@ int bond_alb_init_slave(struct bonding *bond, struct slave *slave) tlb_init_slave(slave); /* order a rebalance ASAP */ - bond->alb_info.tx_rebalance_counter = BOND_TLB_REBALANCE_TICKS; + atomic_set(&bond->alb_info.tx_rebalance_counter, + BOND_TLB_REBALANCE_TICKS); if (bond->alb_info.rlb_enabled) bond->alb_info.rlb_rebalance = 1; @@ -1647,7 +1648,8 @@ void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char rlb_clear_slave(bond, slave); } else if (link == BOND_LINK_UP) { /* order a rebalance ASAP */ - bond_info->tx_rebalance_counter = BOND_TLB_REBALANCE_TICKS; + atomic_set(&bond_info->tx_rebalance_counter, + BOND_TLB_REBALANCE_TICKS); if (bond->alb_info.rlb_enabled) { bond->alb_info.rlb_rebalance = 1; /* If the updelay module parameter is smaller than the diff --git a/include/net/bond_alb.h b/include/net/bond_alb.h index f6af76c87a6c..191c36afa1f4 100644 --- a/include/net/bond_alb.h +++ b/include/net/bond_alb.h @@ -126,7 +126,7 @@ struct tlb_slave_info { struct alb_bond_info { struct tlb_client_info *tx_hashtbl; /* Dynamically allocated */ u32 unbalanced_load; - int tx_rebalance_counter; + atomic_t tx_rebalance_counter; int lp_counter; /* -------- rlb parameters -------- */ int rlb_enabled; -- cgit v1.2.3-71-gd317 From 8581fd402a0cf80b5298e3b225e7a7bd8f110e69 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 Dec 2021 12:34:00 -0800 Subject: treewide: Add missing includes masked by cgroup -> bpf dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup.h (therefore swap.h, therefore half of the universe) includes bpf.h which in turn includes module.h and slab.h. Since we're about to get rid of that dependency we need to clean things up. v2: drop the cpu.h include from cacheinfo.h, it's not necessary and it makes riscv sensitive to ordering of include files. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Reviewed-by: Christoph Hellwig Acked-by: Krzysztof Wilczyński Acked-by: Peter Chen Acked-by: SeongJae Park Acked-by: Jani Nikula Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20211120035253.72074-1-kuba@kernel.org/ # v1 Link: https://lore.kernel.org/all/20211120165528.197359-1-kuba@kernel.org/ # cacheinfo discussion Link: https://lore.kernel.org/bpf/20211202203400.1208663-1-kuba@kernel.org --- block/fops.c | 1 + drivers/gpu/drm/drm_gem_shmem_helper.c | 1 + drivers/gpu/drm/i915/gt/intel_gtt.c | 1 + drivers/gpu/drm/i915/i915_request.c | 1 + drivers/gpu/drm/lima/lima_device.c | 1 + drivers/gpu/drm/msm/msm_gem_shrinker.c | 1 + drivers/gpu/drm/ttm/ttm_tt.c | 1 + drivers/net/ethernet/huawei/hinic/hinic_sriov.c | 1 + drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c | 2 ++ drivers/pci/controller/dwc/pci-exynos.c | 1 + drivers/pci/controller/dwc/pcie-qcom-ep.c | 1 + drivers/usb/cdns3/host.c | 1 + include/linux/cacheinfo.h | 1 - include/linux/device/driver.h | 1 + include/linux/filter.h | 2 +- mm/damon/vaddr.c | 1 + mm/memory_hotplug.c | 1 + mm/swap_slots.c | 1 + 18 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/block/fops.c b/block/fops.c index ad732a36f9b3..3cb1e81929bc 100644 --- a/block/fops.c +++ b/block/fops.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index 7b9f69f21f1e..bca0de92802e 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef CONFIG_X86 #include diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 67d14afa6623..b67f620c3d93 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -6,6 +6,7 @@ #include /* fault-inject.h is not standalone! */ #include +#include #include "gem/i915_gem_lmem.h" #include "i915_trace.h" diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 820a1f38b271..89cccefeea63 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "gem/i915_gem_context.h" #include "gt/intel_breadcrumbs.h" diff --git a/drivers/gpu/drm/lima/lima_device.c b/drivers/gpu/drm/lima/lima_device.c index 65fdca366e41..f74f8048af8f 100644 --- a/drivers/gpu/drm/lima/lima_device.c +++ b/drivers/gpu/drm/lima/lima_device.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index 4a1420b05e97..086dacf2f26a 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -5,6 +5,7 @@ */ #include +#include #include "msm_drv.h" #include "msm_gem.h" diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index 7e83c00a3f48..79c870a3bef8 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c index a78c398bf5b2..01e7d3c0b68e 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "hinic_hw_dev.h" #include "hinic_dev.h" diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c index 0ef68fdd1f26..61c20907315f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c @@ -5,6 +5,8 @@ * */ +#include + #include "otx2_common.h" #include "otx2_ptp.h" diff --git a/drivers/pci/controller/dwc/pci-exynos.c b/drivers/pci/controller/dwc/pci-exynos.c index c24dab383654..722dacdd5a17 100644 --- a/drivers/pci/controller/dwc/pci-exynos.c +++ b/drivers/pci/controller/dwc/pci-exynos.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "pcie-designware.h" diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 7b17da2f9b3f..cfe66bf04c1d 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "pcie-designware.h" diff --git a/drivers/usb/cdns3/host.c b/drivers/usb/cdns3/host.c index 84dadfa726aa..9643b905e2d8 100644 --- a/drivers/usb/cdns3/host.c +++ b/drivers/usb/cdns3/host.c @@ -10,6 +10,7 @@ */ #include +#include #include "core.h" #include "drd.h" #include "host-export.h" diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2f909ed084c6..4ff37cb763ae 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -3,7 +3,6 @@ #define _LINUX_CACHEINFO_H #include -#include #include #include diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index a498ebcf4993..15e7c5e15d62 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -18,6 +18,7 @@ #include #include #include +#include /** * enum probe_type - device driver probe type to try diff --git a/include/linux/filter.h b/include/linux/filter.h index 534f678ca50f..7f1e88e3e2b5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -6,6 +6,7 @@ #define __LINUX_FILTER_H__ #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include -#include struct sk_buff; struct sock; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 35fe49080ee9..47f47f60440e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "prmtv-common.h" diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 852041f6be41..2a9627dc784c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -35,6 +35,7 @@ #include #include #include +#include #include diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 16f706c55d92..2b5531840583 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-71-gd317 From 802a7dc5cf1bef06f7b290ce76d478138408d6b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Dec 2021 10:03:23 -0800 Subject: netfilter: conntrack: annotate data-races around ct->timeout (struct nf_conn)->timeout can be read/written locklessly, add READ_ONCE()/WRITE_ONCE() to prevent load/store tearing. BUG: KCSAN: data-race in __nf_conntrack_alloc / __nf_conntrack_find_get write to 0xffff888132e78c08 of 4 bytes by task 6029 on cpu 0: __nf_conntrack_alloc+0x158/0x280 net/netfilter/nf_conntrack_core.c:1563 init_conntrack+0x1da/0xb30 net/netfilter/nf_conntrack_core.c:1635 resolve_normal_ct+0x502/0x610 net/netfilter/nf_conntrack_core.c:1746 nf_conntrack_in+0x1c5/0x88f net/netfilter/nf_conntrack_core.c:1901 ipv6_conntrack_local+0x19/0x20 net/netfilter/nf_conntrack_proto.c:414 nf_hook_entry_hookfn include/linux/netfilter.h:142 [inline] nf_hook_slow+0x72/0x170 net/netfilter/core.c:619 nf_hook include/linux/netfilter.h:262 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ip6_xmit+0xa3a/0xa60 net/ipv6/ip6_output.c:324 inet6_csk_xmit+0x1a2/0x1e0 net/ipv6/inet6_connection_sock.c:135 __tcp_transmit_skb+0x132a/0x1840 net/ipv4/tcp_output.c:1402 tcp_transmit_skb net/ipv4/tcp_output.c:1420 [inline] tcp_write_xmit+0x1450/0x4460 net/ipv4/tcp_output.c:2680 __tcp_push_pending_frames+0x68/0x1c0 net/ipv4/tcp_output.c:2864 tcp_push_pending_frames include/net/tcp.h:1897 [inline] tcp_data_snd_check+0x62/0x2e0 net/ipv4/tcp_input.c:5452 tcp_rcv_established+0x880/0x10e0 net/ipv4/tcp_input.c:5947 tcp_v6_do_rcv+0x36e/0xa50 net/ipv6/tcp_ipv6.c:1521 sk_backlog_rcv include/net/sock.h:1030 [inline] __release_sock+0xf2/0x270 net/core/sock.c:2768 release_sock+0x40/0x110 net/core/sock.c:3300 sk_stream_wait_memory+0x435/0x700 net/core/stream.c:145 tcp_sendmsg_locked+0xb85/0x25a0 net/ipv4/tcp.c:1402 tcp_sendmsg+0x2c/0x40 net/ipv4/tcp.c:1440 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:644 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg net/socket.c:724 [inline] __sys_sendto+0x21e/0x2c0 net/socket.c:2036 __do_sys_sendto net/socket.c:2048 [inline] __se_sys_sendto net/socket.c:2044 [inline] __x64_sys_sendto+0x74/0x90 net/socket.c:2044 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888132e78c08 of 4 bytes by task 17446 on cpu 1: nf_ct_is_expired include/net/netfilter/nf_conntrack.h:286 [inline] ____nf_conntrack_find net/netfilter/nf_conntrack_core.c:776 [inline] __nf_conntrack_find_get+0x1c7/0xac0 net/netfilter/nf_conntrack_core.c:807 resolve_normal_ct+0x273/0x610 net/netfilter/nf_conntrack_core.c:1734 nf_conntrack_in+0x1c5/0x88f net/netfilter/nf_conntrack_core.c:1901 ipv6_conntrack_local+0x19/0x20 net/netfilter/nf_conntrack_proto.c:414 nf_hook_entry_hookfn include/linux/netfilter.h:142 [inline] nf_hook_slow+0x72/0x170 net/netfilter/core.c:619 nf_hook include/linux/netfilter.h:262 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ip6_xmit+0xa3a/0xa60 net/ipv6/ip6_output.c:324 inet6_csk_xmit+0x1a2/0x1e0 net/ipv6/inet6_connection_sock.c:135 __tcp_transmit_skb+0x132a/0x1840 net/ipv4/tcp_output.c:1402 __tcp_send_ack+0x1fd/0x300 net/ipv4/tcp_output.c:3956 tcp_send_ack+0x23/0x30 net/ipv4/tcp_output.c:3962 __tcp_ack_snd_check+0x2d8/0x510 net/ipv4/tcp_input.c:5478 tcp_ack_snd_check net/ipv4/tcp_input.c:5523 [inline] tcp_rcv_established+0x8c2/0x10e0 net/ipv4/tcp_input.c:5948 tcp_v6_do_rcv+0x36e/0xa50 net/ipv6/tcp_ipv6.c:1521 sk_backlog_rcv include/net/sock.h:1030 [inline] __release_sock+0xf2/0x270 net/core/sock.c:2768 release_sock+0x40/0x110 net/core/sock.c:3300 tcp_sendpage+0x94/0xb0 net/ipv4/tcp.c:1114 inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833 rds_tcp_xmit+0x376/0x5f0 net/rds/tcp_send.c:118 rds_send_xmit+0xbed/0x1500 net/rds/send.c:367 rds_send_worker+0x43/0x200 net/rds/threads.c:200 process_one_work+0x3fc/0x980 kernel/workqueue.c:2298 worker_thread+0x616/0xa70 kernel/workqueue.c:2445 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x00027cc2 -> 0x00000000 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 17446 Comm: kworker/u4:5 Tainted: G W 5.16.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: krdsd rds_send_worker Note: I chose an arbitrary commit for the Fixes: tag, because I do not think we need to backport this fix to very old kernels. Fixes: e37542ba111f ("netfilter: conntrack: avoid possible false sharing") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 6 +++--- net/netfilter/nf_conntrack_core.c | 6 +++--- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netfilter/nf_flow_table_core.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index cc663c68ddc4..d24b0a34c8f0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -276,14 +276,14 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) /* jiffies until ct expires, 0 if already expired */ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { - s32 timeout = ct->timeout - nfct_time_stamp; + s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; return timeout > 0 ? timeout : 0; } static inline bool nf_ct_is_expired(const struct nf_conn *ct) { - return (__s32)(ct->timeout - nfct_time_stamp) <= 0; + return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0; } /* use after obtaining a reference count */ @@ -302,7 +302,7 @@ static inline bool nf_ct_should_gc(const struct nf_conn *ct) static inline void nf_ct_offload_timeout(struct nf_conn *ct) { if (nf_ct_expires(ct) < NF_CT_DAY / 2) - ct->timeout = nfct_time_stamp + NF_CT_DAY; + WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY); } struct kernel_param; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 770a63103c7a..4712a90a1820 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -684,7 +684,7 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) tstamp = nf_conn_tstamp_find(ct); if (tstamp) { - s32 timeout = ct->timeout - nfct_time_stamp; + s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; tstamp->stop = ktime_get_real_ns(); if (timeout < 0) @@ -1036,7 +1036,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) } /* We want the clashing entry to go away real soon: 1 second timeout. */ - loser_ct->timeout = nfct_time_stamp + HZ; + WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ); /* IPS_NAT_CLASH removes the entry automatically on the first * reply. Also prevents UDP tracker from moving the entry to @@ -1560,7 +1560,7 @@ __nf_conntrack_alloc(struct net *net, /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; - ct->timeout = 0; + WRITE_ONCE(ct->timeout, 0); write_pnet(&ct->ct_net, net); memset(&ct->__nfct_init_offset, 0, offsetof(struct nf_conn, proto) - diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c7708bde057c..81d03acf68d4 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1998,7 +1998,7 @@ static int ctnetlink_change_timeout(struct nf_conn *ct, if (timeout > INT_MAX) timeout = INT_MAX; - ct->timeout = nfct_time_stamp + (u32)timeout; + WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); if (test_bit(IPS_DYING_BIT, &ct->status)) return -ETIME; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 87a7388b6c89..ed37bb9b4e58 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -201,8 +201,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) if (timeout < 0) timeout = 0; - if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) - ct->timeout = nfct_time_stamp + timeout; + if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) + WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); } static void flow_offload_fixup_ct_state(struct nf_conn *ct) -- cgit v1.2.3-71-gd317 From a97770cc4016c2733bcef9dbe3d5b1ad02d13356 Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Mon, 6 Dec 2021 16:12:27 +0800 Subject: net: phy: Remove unnecessary indentation in the comments of phy_device Fix warning as: linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:543: WARNING: Unexpected indentation. linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:544: WARNING: Block quote ends without a blank line; unexpected unindent. linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:546: WARNING: Unexpected indentation. Suggested-by: Akira Yokosawa Signed-off-by: Yanteng Si Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 96e43fbb2dd8..cbf03a5f9cf5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -538,11 +538,12 @@ struct macsec_ops; * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. - * Bits [15:0] are free to use by the PHY driver to communicate - * driver specific behavior. - * Bits [23:16] are currently reserved for future use. - * Bits [31:24] are reserved for defining generic - * PHY driver behavior. + * + * - Bits [15:0] are free to use by the PHY driver to communicate + * driver specific behavior. + * - Bits [23:16] are currently reserved for future use. + * - Bits [31:24] are reserved for defining generic + * PHY driver behavior. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phy_timer: The timer for handling the state machine * @phylink: Pointer to phylink instance for this PHY -- cgit v1.2.3-71-gd317 From 444dd878e85fb33fcfb2682cfdab4c236f33ea3e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Dec 2021 17:19:47 +0100 Subject: PM: runtime: Fix pm_runtime_active() kerneldoc comment The kerneldoc comment of pm_runtime_active() does not reflect the behavior of the function, so update it accordingly. Fixes: 403d2d116ec0 ("PM: runtime: Add kerneldoc comments to multiple helpers") Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson --- include/linux/pm_runtime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 222da43b7096..eddd66d426ca 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -129,7 +129,7 @@ static inline bool pm_runtime_suspended(struct device *dev) * pm_runtime_active - Check whether or not a device is runtime-active. * @dev: Target device. * - * Return %true if runtime PM is enabled for @dev and its runtime PM status is + * Return %true if runtime PM is disabled for @dev or its runtime PM status is * %RPM_ACTIVE, or %false otherwise. * * Note that the return value of this function can only be trusted if it is -- cgit v1.2.3-71-gd317 From cab2d3fd6866e089b5c50db09dece131f85bfebd Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 9 Dec 2021 18:46:33 +0530 Subject: bus: mhi: core: Add support for forced PM resume For whatever reason, some devices like QCA6390, WCN6855 using ath11k are not in M3 state during PM resume, but still functional. The mhi_pm_resume should then not fail in those cases, and let the higher level device specific stack continue resuming process. Add an API mhi_pm_resume_force(), to force resuming irrespective of the current MHI state. This fixes a regression with non functional ath11k WiFi after suspend/resume cycle on some machines. Bug report: https://bugzilla.kernel.org/show_bug.cgi?id=214179 Link: https://lore.kernel.org/regressions/871r5p0x2u.fsf@codeaurora.org/ Fixes: 020d3b26c07a ("bus: mhi: Early MHI resume failure in non M3 state") Cc: stable@vger.kernel.org #5.13 Reported-by: Kalle Valo Reported-by: Pengyu Ma Tested-by: Kalle Valo Acked-by: Kalle Valo Signed-off-by: Loic Poulain [mani: Switched to API, added bug report, reported-by tags and CCed stable] Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20211209131633.4168-1-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/pm.c | 21 ++++++++++++++++++--- drivers/net/wireless/ath/ath11k/mhi.c | 6 +++++- include/linux/mhi.h | 13 +++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c index fb99e3727155..547e6e769546 100644 --- a/drivers/bus/mhi/core/pm.c +++ b/drivers/bus/mhi/core/pm.c @@ -881,7 +881,7 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl) } EXPORT_SYMBOL_GPL(mhi_pm_suspend); -int mhi_pm_resume(struct mhi_controller *mhi_cntrl) +static int __mhi_pm_resume(struct mhi_controller *mhi_cntrl, bool force) { struct mhi_chan *itr, *tmp; struct device *dev = &mhi_cntrl->mhi_dev->dev; @@ -898,8 +898,12 @@ int mhi_pm_resume(struct mhi_controller *mhi_cntrl) if (MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state)) return -EIO; - if (mhi_get_mhi_state(mhi_cntrl) != MHI_STATE_M3) - return -EINVAL; + if (mhi_get_mhi_state(mhi_cntrl) != MHI_STATE_M3) { + dev_warn(dev, "Resuming from non M3 state (%s)\n", + TO_MHI_STATE_STR(mhi_get_mhi_state(mhi_cntrl))); + if (!force) + return -EINVAL; + } /* Notify clients about exiting LPM */ list_for_each_entry_safe(itr, tmp, &mhi_cntrl->lpm_chans, node) { @@ -940,8 +944,19 @@ int mhi_pm_resume(struct mhi_controller *mhi_cntrl) return 0; } + +int mhi_pm_resume(struct mhi_controller *mhi_cntrl) +{ + return __mhi_pm_resume(mhi_cntrl, false); +} EXPORT_SYMBOL_GPL(mhi_pm_resume); +int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl) +{ + return __mhi_pm_resume(mhi_cntrl, true); +} +EXPORT_SYMBOL_GPL(mhi_pm_resume_force); + int __mhi_device_get_sync(struct mhi_controller *mhi_cntrl) { int ret; diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c index 26c7ae242db6..49c0b1ad40a0 100644 --- a/drivers/net/wireless/ath/ath11k/mhi.c +++ b/drivers/net/wireless/ath/ath11k/mhi.c @@ -533,7 +533,11 @@ static int ath11k_mhi_set_state(struct ath11k_pci *ab_pci, ret = mhi_pm_suspend(ab_pci->mhi_ctrl); break; case ATH11K_MHI_RESUME: - ret = mhi_pm_resume(ab_pci->mhi_ctrl); + /* Do force MHI resume as some devices like QCA6390, WCN6855 + * are not in M3 state but they are functional. So just ignore + * the MHI state while resuming. + */ + ret = mhi_pm_resume_force(ab_pci->mhi_ctrl); break; case ATH11K_MHI_TRIGGER_RDDM: ret = mhi_force_rddm_mode(ab_pci->mhi_ctrl); diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 723985879035..a5cc4cdf9cc8 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -663,6 +663,19 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl); */ int mhi_pm_resume(struct mhi_controller *mhi_cntrl); +/** + * mhi_pm_resume_force - Force resume MHI from suspended state + * @mhi_cntrl: MHI controller + * + * Resume the device irrespective of its MHI state. As per the MHI spec, devices + * has to be in M3 state during resume. But some devices seem to be in a + * different MHI state other than M3 but they continue working fine if allowed. + * This API is intented to be used for such devices. + * + * Return: 0 if the resume succeeds, a negative error code otherwise + */ +int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl); + /** * mhi_download_rddm_image - Download ramdump image from device for * debugging purpose. -- cgit v1.2.3-71-gd317 From 42288cb44c4b5fff7653bc392b583a2b8bd6a8c0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 8 Dec 2021 17:04:51 -0800 Subject: wait: add wake_up_pollfree() Several ->poll() implementations are special in that they use a waitqueue whose lifetime is the current task, rather than the struct file as is normally the case. This is okay for blocking polls, since a blocking poll occurs within one task; however, non-blocking polls require another solution. This solution is for the queue to be cleared before it is freed, using 'wake_up_poll(wq, EPOLLHUP | POLLFREE);'. However, that has a bug: wake_up_poll() calls __wake_up() with nr_exclusive=1. Therefore, if there are multiple "exclusive" waiters, and the wakeup function for the first one returns a positive value, only that one will be called. That's *not* what's needed for POLLFREE; POLLFREE is special in that it really needs to wake up everyone. Considering the three non-blocking poll systems: - io_uring poll doesn't handle POLLFREE at all, so it is broken anyway. - aio poll is unaffected, since it doesn't support exclusive waits. However, that's fragile, as someone could add this feature later. - epoll doesn't appear to be broken by this, since its wakeup function returns 0 when it sees POLLFREE. But this is fragile. Although there is a workaround (see epoll), it's better to define a function which always sends POLLFREE to all waiters. Add such a function. Also make it verify that the queue really becomes empty after all waiters have been woken up. Reported-by: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211209010455.42744-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/wait.h | 26 ++++++++++++++++++++++++++ kernel/sched/wait.c | 7 +++++++ 2 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2d0df57c9902..851e07da2583 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); +void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) +/** + * wake_up_pollfree - signal that a polled waitqueue is going away + * @wq_head: the wait queue head + * + * In the very rare cases where a ->poll() implementation uses a waitqueue whose + * lifetime is tied to a task rather than to the 'struct file' being polled, + * this function must be called before the waitqueue is freed so that + * non-blocking polls (e.g. epoll) are notified that the queue is going away. + * + * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via + * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. + */ +static inline void wake_up_pollfree(struct wait_queue_head *wq_head) +{ + /* + * For performance reasons, we don't always take the queue lock here. + * Therefore, we might race with someone removing the last entry from + * the queue, and proceed while they still hold the queue lock. + * However, rcu_read_lock() is required to be held in such cases, so we + * can safely proceed with an RCU-delayed free. + */ + if (waitqueue_active(wq_head)) + __wake_up_pollfree(wq_head); +} + #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 76577d1642a5..eca38107b32f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -238,6 +238,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +void __wake_up_pollfree(struct wait_queue_head *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any -- cgit v1.2.3-71-gd317 From 50252e4b5e989ce64555c7aef7516bdefc2fea72 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 8 Dec 2021 17:04:55 -0800 Subject: aio: fix use-after-free due to missing POLLFREE handling signalfd_poll() and binder_poll() are special in that they use a waitqueue whose lifetime is the current task, rather than the struct file as is normally the case. This is okay for blocking polls, since a blocking poll occurs within one task; however, non-blocking polls require another solution. This solution is for the queue to be cleared before it is freed, by sending a POLLFREE notification to all waiters. Unfortunately, only eventpoll handles POLLFREE. A second type of non-blocking poll, aio poll, was added in kernel v4.18, and it doesn't handle POLLFREE. This allows a use-after-free to occur if a signalfd or binder fd is polled with aio poll, and the waitqueue gets freed. Fix this by making aio poll handle POLLFREE. A patch by Ramji Jiyani (https://lore.kernel.org/r/20211027011834.2497484-1-ramjiyani@google.com) tried to do this by making aio_poll_wake() always complete the request inline if POLLFREE is seen. However, that solution had two bugs. First, it introduced a deadlock, as it unconditionally locked the aio context while holding the waitqueue lock, which inverts the normal locking order. Second, it didn't consider that POLLFREE notifications are missed while the request has been temporarily de-queued. The second problem was solved by my previous patch. This patch then properly fixes the use-after-free by handling POLLFREE in a deadlock-free way. It does this by taking advantage of the fact that freeing of the waitqueue is RCU-delayed, similar to what eventpoll does. Fixes: 2c14fa838cbe ("aio: implement IOCB_CMD_POLL") Cc: # v4.18+ Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/aio.c | 137 +++++++++++++++++++++++++++++++--------- include/uapi/asm-generic/poll.h | 2 +- 2 files changed, 107 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/fs/aio.c b/fs/aio.c index 2bc1352a83d8..c9bb0d3d8593 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1620,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work) iocb_put(iocb); } +/* + * Safely lock the waitqueue which the request is on, synchronizing with the + * case where the ->poll() provider decides to free its waitqueue early. + * + * Returns true on success, meaning that req->head->lock was locked, req->wait + * is on req->head, and an RCU read lock was taken. Returns false if the + * request was already removed from its waitqueue (which might no longer exist). + */ +static bool poll_iocb_lock_wq(struct poll_iocb *req) +{ + wait_queue_head_t *head; + + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us, then check whether the request is still on the queue. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + head = smp_load_acquire(&req->head); + if (head) { + spin_lock(&head->lock); + if (!list_empty(&req->wait.entry)) + return true; + spin_unlock(&head->lock); + } + rcu_read_unlock(); + return false; +} + +static void poll_iocb_unlock_wq(struct poll_iocb *req) +{ + spin_unlock(&req->head->lock); + rcu_read_unlock(); +} + static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1639,24 +1684,25 @@ static void aio_poll_complete_work(struct work_struct *work) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (!mask && !READ_ONCE(req->cancelled)) { - /* - * The request isn't actually ready to be completed yet. - * Reschedule completion if another wakeup came in. - */ - if (req->work_need_resched) { - schedule_work(&req->work); - req->work_need_resched = false; - } else { - req->work_scheduled = false; + if (poll_iocb_lock_wq(req)) { + if (!mask && !READ_ONCE(req->cancelled)) { + /* + * The request isn't actually ready to be completed yet. + * Reschedule completion if another wakeup came in. + */ + if (req->work_need_resched) { + schedule_work(&req->work); + req->work_need_resched = false; + } else { + req->work_scheduled = false; + } + poll_iocb_unlock_wq(req); + spin_unlock_irq(&ctx->ctx_lock); + return; } - spin_unlock(&req->head->lock); - spin_unlock_irq(&ctx->ctx_lock); - return; - } - list_del_init(&req->wait.entry); - spin_unlock(&req->head->lock); + list_del_init(&req->wait.entry); + poll_iocb_unlock_wq(req); + } /* else, POLLFREE has freed the waitqueue, so we must complete */ list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); spin_unlock_irq(&ctx->ctx_lock); @@ -1670,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb) struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll; - spin_lock(&req->head->lock); - WRITE_ONCE(req->cancelled, true); - if (!req->work_scheduled) { - schedule_work(&aiocb->poll.work); - req->work_scheduled = true; - } - spin_unlock(&req->head->lock); + if (poll_iocb_lock_wq(req)) { + WRITE_ONCE(req->cancelled, true); + if (!req->work_scheduled) { + schedule_work(&aiocb->poll.work); + req->work_scheduled = true; + } + poll_iocb_unlock_wq(req); + } /* else, the request was force-cancelled by POLLFREE already */ return 0; } @@ -1728,7 +1775,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * * Don't remove the request from the waitqueue here, as it might * not actually be complete yet (we won't know until vfs_poll() - * is called), and we must not miss any wakeups. + * is called), and we must not miss any wakeups. POLLFREE is an + * exception to this; see below. */ if (req->work_scheduled) { req->work_need_resched = true; @@ -1736,6 +1784,28 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, schedule_work(&req->work); req->work_scheduled = true; } + + /* + * If the waitqueue is being freed early but we can't complete + * the request inline, we have to tear down the request as best + * we can. That means immediately removing the request from its + * waitqueue and preventing all further accesses to the + * waitqueue via the request. We also need to schedule the + * completion work (done above). Also mark the request as + * cancelled, to potentially skip an unneeded call to ->poll(). + */ + if (mask & POLLFREE) { + WRITE_ONCE(req->cancelled, true); + list_del_init(&req->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&req->head, NULL); + } } return 1; } @@ -1743,6 +1813,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct aio_poll_table { struct poll_table_struct pt; struct aio_kiocb *iocb; + bool queued; int error; }; @@ -1753,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); /* multiple wait queues per file are not supported */ - if (unlikely(pt->iocb->poll.head)) { + if (unlikely(pt->queued)) { pt->error = -EINVAL; return; } + pt->queued = true; pt->error = 0; pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); @@ -1789,6 +1861,7 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; + apt.queued = false; apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ @@ -1797,9 +1870,10 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) mask = vfs_poll(req->file, &apt.pt) & req->events; spin_lock_irq(&ctx->ctx_lock); - if (likely(req->head)) { - spin_lock(&req->head->lock); - if (list_empty(&req->wait.entry) || req->work_scheduled) { + if (likely(apt.queued)) { + bool on_queue = poll_iocb_lock_wq(req); + + if (!on_queue || req->work_scheduled) { /* * aio_poll_wake() already either scheduled the async * completion work, or completed the request inline. @@ -1815,7 +1889,7 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) } else if (cancel) { /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); - } else if (!list_empty(&req->wait.entry)) { + } else if (on_queue) { /* * Actually waiting for an event, so add the request to * active_reqs so that it can be cancelled if needed. @@ -1823,7 +1897,8 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); + if (on_queue) + poll_iocb_unlock_wq(req); } if (mask) { /* no async, we'd stolen it */ aiocb->ki_res.res = mangle_poll(mask); diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h index 41b509f410bf..f9c520ce4bf4 100644 --- a/include/uapi/asm-generic/poll.h +++ b/include/uapi/asm-generic/poll.h @@ -29,7 +29,7 @@ #define POLLRDHUP 0x2000 #endif -#define POLLFREE (__force __poll_t)0x4000 /* currently only for epoll */ +#define POLLFREE (__force __poll_t)0x4000 #define POLL_BUSY_LOOP (__force __poll_t)0x8000 -- cgit v1.2.3-71-gd317 From a4f1192cb53758a7210ed5a9ee695aeba22f75fb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 9 Dec 2021 14:30:33 +0200 Subject: percpu_ref: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. Signed-off-by: Andy Shevchenko Signed-off-by: Dennis Zhou --- include/linux/percpu-refcount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b31d3f3312ce..d73a1c08c3e3 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -51,9 +51,9 @@ #define _LINUX_PERCPU_REFCOUNT_H #include -#include #include #include +#include #include struct percpu_ref; -- cgit v1.2.3-71-gd317 From 9dcc38e2813e0cd3b195940c98b181ce6ede8f20 Mon Sep 17 00:00:00 2001 From: Drew DeVault Date: Fri, 10 Dec 2021 14:46:09 -0800 Subject: Increase default MLOCK_LIMIT to 8 MiB This limit has not been updated since 2008, when it was increased to 64 KiB at the request of GnuPG. Until recently, the main use-cases for this feature were (1) preventing sensitive memory from being swapped, as in GnuPG's use-case; and (2) real-time use-cases. In the first case, little memory is called for, and in the second case, the user is generally in a position to increase it if they need more. The introduction of IOURING_REGISTER_BUFFERS adds a third use-case: preparing fixed buffers for high-performance I/O. This use-case will take as much of this memory as it can get, but is still limited to 64 KiB by default, which is very little. This increases the limit to 8 MB, which was chosen fairly arbitrarily as a more generous, but still conservative, default value. It is also possible to raise this limit in userspace. This is easily done, for example, in the use-case of a network daemon: systemd, for instance, provides for this via LimitMEMLOCK in the service file; OpenRC via the rc_ulimit variables. However, there is no established userspace facility for configuring this outside of daemons: end-user applications do not presently have access to a convenient means of raising their limits. The buck, as it were, stops with the kernel. It's much easier to address it here than it is to bring it to hundreds of distributions, and it can only realistically be relied upon to be high-enough by end-user software if it is more-or-less ubiquitous. Most distros don't change this particular rlimit from the kernel-supplied default value, so a change here will easily provide that ubiquity. Link: https://lkml.kernel.org/r/20211028080813.15966-1-sir@cmpwn.com Signed-off-by: Drew DeVault Acked-by: Jens Axboe Acked-by: Cyril Hrubis Acked-by: Johannes Weiner Cc: Pavel Begunkov Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Andrew Dona-Couch Cc: Ammar Faizi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/resource.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/resource.h b/include/uapi/linux/resource.h index 74ef57b38f9f..ac5d6a3031db 100644 --- a/include/uapi/linux/resource.h +++ b/include/uapi/linux/resource.h @@ -66,10 +66,17 @@ struct rlimit64 { #define _STK_LIM (8*1024*1024) /* - * GPG2 wants 64kB of mlocked memory, to make sure pass phrases - * and other sensitive information are never written to disk. + * Limit the amount of locked memory by some sane default: + * root can always increase this limit if needed. + * + * The main use-cases are (1) preventing sensitive memory + * from being swapped; (2) real-time operations; (3) via + * IOURING_REGISTER_BUFFERS. + * + * The first two don't need much. The latter will take as + * much as it can get. 8MB is a reasonably sane default. */ -#define MLOCK_LIMIT ((PAGE_SIZE > 64*1024) ? PAGE_SIZE : 64*1024) +#define MLOCK_LIMIT (8*1024*1024) /* * Due to binary compatibility, the actual resource numbers -- cgit v1.2.3-71-gd317 From e4779015fd5d2fb8390c258268addff24d6077c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:22 -0800 Subject: timers: implement usleep_idle_range() Patch series "mm/damon: Fix fake /proc/loadavg reports", v3. This patchset fixes DAMON's fake load report issue. The first patch makes yet another variant of usleep_range() for this fix, and the second patch fixes the issue of DAMON by making it using the newly introduced function. This patch (of 2): Some kernel threads such as DAMON could need to repeatedly sleep in micro seconds level. Because usleep_range() sleeps in uninterruptible state, however, such threads would make /proc/loadavg reports fake load. To help such cases, this commit implements a variant of usleep_range() called usleep_idle_range(). It is same to usleep_range() but sets the state of the current task as TASK_IDLE while sleeping. Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Andrew Morton Reviewed-by: Thomas Gleixner Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delay.h | 14 +++++++++++++- kernel/time/timer.c | 16 +++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/delay.h b/include/linux/delay.h index 8eacf67eb212..039e7e0c7378 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -20,6 +20,7 @@ */ #include +#include extern unsigned long loops_per_jiffy; @@ -58,7 +59,18 @@ void calibrate_delay(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); +void usleep_range_state(unsigned long min, unsigned long max, + unsigned int state); + +static inline void usleep_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); +} + +static inline void usleep_idle_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_IDLE); +} static inline void ssleep(unsigned int seconds) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e3d2c23c413d..85f1021ad459 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2054,26 +2054,28 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); /** - * usleep_range - Sleep for an approximate time - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range() instead of udelay(). The sleep improves responsiveness + * usleep_range_state() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. */ -void __sched usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range_state(unsigned long min, unsigned long max, + unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } -EXPORT_SYMBOL(usleep_range); +EXPORT_SYMBOL(usleep_range_state); -- cgit v1.2.3-71-gd317