From f45b2974cc0ae959a4c503a071e38a56bd64372f Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn@kernel.org>
Date: Wed, 17 Nov 2021 13:57:08 +0100
Subject: bpf, x86: Fix "no previous prototype" warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The arch_prepare_bpf_dispatcher function does not have a prototype, and
yields the following warning when W=1 is enabled for the kernel build.

  >> arch/x86/net/bpf_jit_comp.c:2188:5: warning: no previous \
  prototype for 'arch_prepare_bpf_dispatcher' [-Wmissing-prototypes]
        2188 | int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, \
	int num_funcs)
             |     ^~~~~~~~~~~~~~~~~~~~~~~~~~~

Remove the warning by adding a function declaration to include/linux/bpf.h.

Fixes: 75ccbef6369e ("bpf: Introduce BPF dispatcher")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Björn Töpel <bjorn@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211117125708.769168-1-bjorn@kernel.org
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e7a163a3146b..84ff6ef49462 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -732,6 +732,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
 struct bpf_trampoline *bpf_trampoline_get(u64 key,
 					  struct bpf_attach_target_info *tgt_info);
 void bpf_trampoline_put(struct bpf_trampoline *tr);
+int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs);
 #define BPF_DISPATCHER_INIT(_name) {				\
 	.mutex = __MUTEX_INITIALIZER(_name.mutex),		\
 	.func = &_name##_func,					\
-- 
cgit v1.2.3-71-gd317


From 6966df483d7b5b218aeb0e13e7e334a8fc3c1744 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 18 Nov 2021 13:49:51 +0200
Subject: regulator: Update protection IRQ helper docs

The documentation of IRQ notification helper had still references to
first RFC implementation which called BUG() while trying to protect the
hardware. Behaviour was improved as calling the BUG() was not a proper
solution. Current implementation attempts to call poweroff if handling
of potentially damaging error notification fails. Update the
documentation to reflect the actual behaviour.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/0c9cc4bcf20c3da66fd5a85c97ee4288e5727538.1637233864.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index bd7a73db2e66..54cf566616ae 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -499,7 +499,8 @@ struct regulator_irq_data {
  *		best to shut-down regulator(s) or reboot the SOC if error
  *		handling is repeatedly failing. If fatal_cnt is given the IRQ
  *		handling is aborted if it fails for fatal_cnt times and die()
- *		callback (if populated) or BUG() is called to try to prevent
+ *		callback (if populated) is called. If die() is not populated
+ *		poweroff for the system is attempted in order to prevent any
  *		further damage.
  * @reread_ms:	The time which is waited before attempting to re-read status
  *		at the worker if IC reading fails. Immediate re-read is done
@@ -516,11 +517,12 @@ struct regulator_irq_data {
  * @data:	Driver private data pointer which will be passed as such to
  *		the renable, map_event and die callbacks in regulator_irq_data.
  * @die:	Protection callback. If IC status reading or recovery actions
- *		fail fatal_cnt times this callback or BUG() is called. This
- *		callback should implement a final protection attempt like
- *		disabling the regulator. If protection succeeded this may
- *		return 0. If anything else is returned the core assumes final
- *		protection failed and calls BUG() as a last resort.
+ *		fail fatal_cnt times this callback is called or system is
+ *		powered off. This callback should implement a final protection
+ *		attempt like disabling the regulator. If protection succeeded
+ *		die() may return 0. If anything else is returned the core
+ *		assumes final protection failed and attempts to perform a
+ *		poweroff as a last resort.
  * @map_event:	Driver callback to map IRQ status into regulator devices with
  *		events / errors. NOTE: callback MUST initialize both the
  *		errors and notifs for all rdevs which it signals having
-- 
cgit v1.2.3-71-gd317


From 28c916ade1bd4205958f74bb817fd3a05dbb7afc Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 18 Nov 2021 16:30:14 +0100
Subject: ASoC: soc-acpi: Set mach->id field on comp_ids matches

Commit dac7cbd55dca ("ASoC: Intel: soc-acpi-byt: shrink tables using
compatible IDs") and commit 959ae8215a9e ("ASoC: Intel: soc-acpi-cht:
shrink tables using compatible IDs") simplified the match tables in
soc-acpi-intel-byt-match.c and soc-acpi-intel-cht-match.c by merging
identical entries using the new .comp_ids snd_soc_acpi_mach field to
point a single entry to multiple ACPI HIDs and clearing the previously
unique per entry .id field.

But various machine drivers from sound/soc/intel/boards rely on mach->id
in one or more ways, e.g. some drivers contain the following snippets:

	adev = acpi_dev_get_first_match_dev(mach->id, NULL, -1);

	pkg_found = snd_soc_acpi_find_package_from_hid(mach->id, ...

	if (!strncmp(snd_soc_cards[i].codec_id, mach->id, 8)) { ...

All of which are broken by the match table shrinking.

Make the snd_soc_acpi_mach.id field non const (the storage for the tables
already is non const) and on a comps_ids match copy the matching HID to
the id field to fix this.

Fixes: dac7cbd55dca ("ASoC: Intel: soc-acpi-byt: shrink tables using compatible IDs")
Fixes: 959ae8215a9e ("ASoC: Intel: soc-acpi-cht: shrink tables using compatible IDs")
Suggested-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Cc: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Cc: Brent Lu <brent.lu@intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20211118153014.349222-1-hdegoede@redhat.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-acpi.h | 2 +-
 sound/soc/soc-acpi.c     | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h
index 31f4c4f9aeea..ac0893df9c76 100644
--- a/include/sound/soc-acpi.h
+++ b/include/sound/soc-acpi.h
@@ -147,7 +147,7 @@ struct snd_soc_acpi_link_adr {
  */
 /* Descriptor for SST ASoC machine driver */
 struct snd_soc_acpi_mach {
-	const u8 id[ACPI_ID_LEN];
+	u8 id[ACPI_ID_LEN];
 	const struct snd_soc_acpi_codecs *comp_ids;
 	const u32 link_mask;
 	const struct snd_soc_acpi_link_adr *links;
diff --git a/sound/soc/soc-acpi.c b/sound/soc/soc-acpi.c
index 2ae99b49d3f5..cbd7ea48837b 100644
--- a/sound/soc/soc-acpi.c
+++ b/sound/soc/soc-acpi.c
@@ -20,8 +20,10 @@ static bool snd_soc_acpi_id_present(struct snd_soc_acpi_mach *machine)
 
 	if (comp_ids) {
 		for (i = 0; i < comp_ids->num_codecs; i++) {
-			if (acpi_dev_present(comp_ids->codecs[i], NULL, -1))
+			if (acpi_dev_present(comp_ids->codecs[i], NULL, -1)) {
+				strscpy(machine->id, comp_ids->codecs[i], ACPI_ID_LEN);
 				return true;
+			}
 		}
 	}
 
-- 
cgit v1.2.3-71-gd317


From 7e78781df491e4beb475bac22e6c44236a5002d7 Mon Sep 17 00:00:00 2001
From: Gurchetan Singh <gurchetansingh@chromium.org>
Date: Mon, 22 Nov 2021 15:22:09 -0800
Subject: drm/virtgpu api: define a dummy fence signaled event

The current virtgpu implementation of poll(..) drops events
when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is enabled (otherwise
it's like a normal DRM driver).

This is because paravirtualized userspaces receives responses in a
buffer of type BLOB_MEM_GUEST, not by read(..).

To be in line with other DRM drivers and avoid specialized behavior,
it is possible to define a dummy event for virtgpu.  Paravirtualized
userspace will now have to call read(..) on the DRM fd to receive the
dummy event.

Fixes: b10790434cf2 ("drm/virtgpu api: create context init feature")
Reported-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/20211122232210.602-2-gurchetansingh@google.com
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 drivers/gpu/drm/virtio/virtgpu_drv.h   | 1 -
 drivers/gpu/drm/virtio/virtgpu_ioctl.c | 2 +-
 include/uapi/drm/virtgpu_drm.h         | 7 +++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h
index e0265fe74aa5..0a194aaad419 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.h
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.h
@@ -138,7 +138,6 @@ struct virtio_gpu_fence_driver {
 	spinlock_t       lock;
 };
 
-#define VIRTGPU_EVENT_FENCE_SIGNALED_INTERNAL 0x10000000
 struct virtio_gpu_fence_event {
 	struct drm_pending_event base;
 	struct drm_event event;
diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
index 5618a1d5879c..3607646d3229 100644
--- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c
+++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
@@ -54,7 +54,7 @@ static int virtio_gpu_fence_event_create(struct drm_device *dev,
 	if (!e)
 		return -ENOMEM;
 
-	e->event.type = VIRTGPU_EVENT_FENCE_SIGNALED_INTERNAL;
+	e->event.type = VIRTGPU_EVENT_FENCE_SIGNALED;
 	e->event.length = sizeof(e->event);
 
 	ret = drm_event_reserve_init(dev, file, &e->base, &e->event);
diff --git a/include/uapi/drm/virtgpu_drm.h b/include/uapi/drm/virtgpu_drm.h
index a13e20cc66b4..0512fde5e697 100644
--- a/include/uapi/drm/virtgpu_drm.h
+++ b/include/uapi/drm/virtgpu_drm.h
@@ -196,6 +196,13 @@ struct drm_virtgpu_context_init {
 	__u64 ctx_set_params;
 };
 
+/*
+ * Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in
+ * effect.  The event size is sizeof(drm_event), since there is no additional
+ * payload.
+ */
+#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000
+
 #define DRM_IOCTL_VIRTGPU_MAP \
 	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map)
 
-- 
cgit v1.2.3-71-gd317


From dacb5d8875cc6cd3a553363b4d6f06760fcbe70c Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 26 Nov 2021 19:34:21 +0100
Subject: tcp: fix page frag corruption on page fault

Steffen reported a TCP stream corruption for HTTP requests
served by the apache web-server using a cifs mount-point
and memory mapping the relevant file.

The root cause is quite similar to the one addressed by
commit 20eb4f29b602 ("net: fix sk_page_frag() recursion from
memory reclaim"). Here the nested access to the task page frag
is caused by a page fault on the (mmapped) user-space memory
buffer coming from the cifs file.

The page fault handler performs an smb transaction on a different
socket, inside the same process context. Since sk->sk_allaction
for such socket does not prevent the usage for the task_frag,
the nested allocation modify "under the hood" the page frag
in use by the outer sendmsg call, corrupting the stream.

The overall relevant stack trace looks like the following:

httpd 78268 [001] 3461630.850950:      probe:tcp_sendmsg_locked:
        ffffffff91461d91 tcp_sendmsg_locked+0x1
        ffffffff91462b57 tcp_sendmsg+0x27
        ffffffff9139814e sock_sendmsg+0x3e
        ffffffffc06dfe1d smb_send_kvec+0x28
        [...]
        ffffffffc06cfaf8 cifs_readpages+0x213
        ffffffff90e83c4b read_pages+0x6b
        ffffffff90e83f31 __do_page_cache_readahead+0x1c1
        ffffffff90e79e98 filemap_fault+0x788
        ffffffff90eb0458 __do_fault+0x38
        ffffffff90eb5280 do_fault+0x1a0
        ffffffff90eb7c84 __handle_mm_fault+0x4d4
        ffffffff90eb8093 handle_mm_fault+0xc3
        ffffffff90c74f6d __do_page_fault+0x1ed
        ffffffff90c75277 do_page_fault+0x37
        ffffffff9160111e page_fault+0x1e
        ffffffff9109e7b5 copyin+0x25
        ffffffff9109eb40 _copy_from_iter_full+0xe0
        ffffffff91462370 tcp_sendmsg_locked+0x5e0
        ffffffff91462370 tcp_sendmsg_locked+0x5e0
        ffffffff91462b57 tcp_sendmsg+0x27
        ffffffff9139815c sock_sendmsg+0x4c
        ffffffff913981f7 sock_write_iter+0x97
        ffffffff90f2cc56 do_iter_readv_writev+0x156
        ffffffff90f2dff0 do_iter_write+0x80
        ffffffff90f2e1c3 vfs_writev+0xa3
        ffffffff90f2e27c do_writev+0x5c
        ffffffff90c042bb do_syscall_64+0x5b
        ffffffff916000ad entry_SYSCALL_64_after_hwframe+0x65

The cifs filesystem rightfully sets sk_allocations to GFP_NOFS,
we can avoid the nesting using the sk page frag for allocation
lacking the __GFP_FS flag. Do not define an additional mm-helper
for that, as this is strictly tied to the sk page frag usage.

v1 -> v2:
 - use a stricted sk_page_frag() check instead of reordering the
   code (Eric)

Reported-by: Steffen Froemer <sfroemer@redhat.com>
Fixes: 5640f7685831 ("net: use a per task frag allocator")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index b32906e1ab55..715cdb4b2b79 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2430,19 +2430,22 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
  * @sk: socket
  *
  * Use the per task page_frag instead of the per socket one for
- * optimization when we know that we're in the normal context and owns
+ * optimization when we know that we're in process context and own
  * everything that's associated with %current.
  *
- * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
- * inside other socket operations and end up recursing into sk_page_frag()
- * while it's already in use.
+ * Both direct reclaim and page faults can nest inside other
+ * socket operations and end up recursing into sk_page_frag()
+ * while it's already in use: explicitly avoid task page_frag
+ * usage if the caller is potentially doing any of them.
+ * This assumes that page fault handlers use the GFP_NOFS flags.
  *
  * Return: a per task page_frag if context allows that,
  * otherwise a per socket one.
  */
 static inline struct page_frag *sk_page_frag(struct sock *sk)
 {
-	if (gfpflags_normal_context(sk->sk_allocation))
+	if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
+	    (__GFP_DIRECT_RECLAIM | __GFP_FS))
 		return &current->task_frag;
 
 	return &sk->sk_frag;
-- 
cgit v1.2.3-71-gd317


From cdef485217d30382f3bf6448c54b4401648fe3f1 Mon Sep 17 00:00:00 2001
From: msizanoen1 <msizanoen@qtmlabs.xyz>
Date: Tue, 23 Nov 2021 13:48:32 +0100
Subject: ipv6: fix memory leak in fib6_rule_suppress

The kernel leaks memory when a `fib` rule is present in IPv6 nftables
firewall rules and a suppress_prefix rule is present in the IPv6 routing
rules (used by certain tools such as wg-quick). In such scenarios, every
incoming packet will leak an allocation in `ip6_dst_cache` slab cache.

After some hours of `bpftrace`-ing and source code reading, I tracked
down the issue to ca7a03c41753 ("ipv6: do not free rt if
FIB_LOOKUP_NOREF is set on suppress rule").

The problem with that change is that the generic `args->flags` always have
`FIB_LOOKUP_NOREF` set[1][2] but the IPv6-specific flag
`RT6_LOOKUP_F_DST_NOREF` might not be, leading to `fib6_rule_suppress` not
decreasing the refcount when needed.

How to reproduce:
 - Add the following nftables rule to a prerouting chain:
     meta nfproto ipv6 fib saddr . mark . iif oif missing drop
   This can be done with:
     sudo nft create table inet test
     sudo nft create chain inet test test_chain '{ type filter hook prerouting priority filter + 10; policy accept; }'
     sudo nft add rule inet test test_chain meta nfproto ipv6 fib saddr . mark . iif oif missing drop
 - Run:
     sudo ip -6 rule add table main suppress_prefixlength 0
 - Watch `sudo slabtop -o | grep ip6_dst_cache` to see memory usage increase
   with every incoming ipv6 packet.

This patch exposes the protocol-specific flags to the protocol
specific `suppress` function, and check the protocol-specific `flags`
argument for RT6_LOOKUP_F_DST_NOREF instead of the generic
FIB_LOOKUP_NOREF when decreasing the refcount, like this.

[1]: https://github.com/torvalds/linux/blob/ca7a03c4175366a92cee0ccc4fec0038c3266e26/net/ipv6/fib6_rules.c#L71
[2]: https://github.com/torvalds/linux/blob/ca7a03c4175366a92cee0ccc4fec0038c3266e26/net/ipv6/fib6_rules.c#L99

Link: https://bugzilla.kernel.org/show_bug.cgi?id=215105
Fixes: ca7a03c41753 ("ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule")
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h | 4 +++-
 net/core/fib_rules.c    | 2 +-
 net/ipv4/fib_rules.c    | 1 +
 net/ipv6/fib6_rules.c   | 4 ++--
 4 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 4b10676c69d1..bd07484ab9dd 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -69,7 +69,7 @@ struct fib_rules_ops {
 	int			(*action)(struct fib_rule *,
 					  struct flowi *, int,
 					  struct fib_lookup_arg *);
-	bool			(*suppress)(struct fib_rule *,
+	bool			(*suppress)(struct fib_rule *, int,
 					    struct fib_lookup_arg *);
 	int			(*match)(struct fib_rule *,
 					 struct flowi *, int);
@@ -218,7 +218,9 @@ INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule,
 			    struct fib_lookup_arg *arg));
 
 INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule,
+						int flags,
 						struct fib_lookup_arg *arg));
 INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule,
+						int flags,
 						struct fib_lookup_arg *arg));
 #endif
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 79df7cd9dbc1..1bb567a3b329 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -323,7 +323,7 @@ jumped:
 		if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress,
 							      fib6_rule_suppress,
 							      fib4_rule_suppress,
-							      rule, arg))
+							      rule, flags, arg))
 			continue;
 
 		if (err != -EAGAIN) {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index ce54a30c2ef1..364ad3446b2f 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -141,6 +141,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule,
 }
 
 INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule,
+						int flags,
 						struct fib_lookup_arg *arg)
 {
 	struct fib_result *result = (struct fib_result *) arg->result;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 40f3e4f9f33a..dcedfe29d9d9 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -267,6 +267,7 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule,
 }
 
 INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule,
+						int flags,
 						struct fib_lookup_arg *arg)
 {
 	struct fib6_result *res = arg->result;
@@ -294,8 +295,7 @@ INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule,
 	return false;
 
 suppress_route:
-	if (!(arg->flags & FIB_LOOKUP_NOREF))
-		ip6_rt_put(rt);
+	ip6_rt_put_flags(rt, flags);
 	return true;
 }
 
-- 
cgit v1.2.3-71-gd317


From 20ae1d6aa159eb91a9bf09ff92ccaa94dbea92c2 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 29 Nov 2021 10:39:25 -0500
Subject: wireguard: device: reset peer src endpoint when netns exits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each peer's endpoint contains a dst_cache entry that takes a reference
to another netdev. When the containing namespace exits, we take down the
socket and prevent future sockets from being created (by setting
creating_net to NULL), which removes that potential reference on the
netns. However, it doesn't release references to the netns that a netdev
cached in dst_cache might be taking, so the netns still might fail to
exit. Since the socket is gimped anyway, we can simply clear all the
dst_caches (by way of clearing the endpoint src), which will release all
references.

However, the current dst_cache_reset function only releases those
references lazily. But it turns out that all of our usages of
wg_socket_clear_peer_endpoint_src are called from contexts that are not
exactly high-speed or bottle-necked. For example, when there's
connection difficulty, or when userspace is reconfiguring the interface.
And in particular for this patch, when the netns is exiting. So for
those cases, it makes more sense to call dst_release immediately. For
that, we add a small helper function to dst_cache.

This patch also adds a test to netns.sh from Hangbin Liu to ensure this
doesn't regress.

Tested-by: Hangbin Liu <liuhangbin@gmail.com>
Reported-by: Xiumei Mu <xmu@redhat.com>
Cc: Toke Høiland-Jørgensen <toke@redhat.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Fixes: 900575aa33a3 ("wireguard: device: avoid circular netns references")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wireguard/device.c             |  3 +++
 drivers/net/wireguard/socket.c             |  2 +-
 include/net/dst_cache.h                    | 11 +++++++++++
 net/core/dst_cache.c                       | 19 +++++++++++++++++++
 tools/testing/selftests/wireguard/netns.sh | 24 +++++++++++++++++++++++-
 5 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c
index 551ddaaaf540..77e64ea6be67 100644
--- a/drivers/net/wireguard/device.c
+++ b/drivers/net/wireguard/device.c
@@ -398,6 +398,7 @@ static struct rtnl_link_ops link_ops __read_mostly = {
 static void wg_netns_pre_exit(struct net *net)
 {
 	struct wg_device *wg;
+	struct wg_peer *peer;
 
 	rtnl_lock();
 	list_for_each_entry(wg, &device_list, device_list) {
@@ -407,6 +408,8 @@ static void wg_netns_pre_exit(struct net *net)
 			mutex_lock(&wg->device_update_lock);
 			rcu_assign_pointer(wg->creating_net, NULL);
 			wg_socket_reinit(wg, NULL, NULL);
+			list_for_each_entry(peer, &wg->peer_list, peer_list)
+				wg_socket_clear_peer_endpoint_src(peer);
 			mutex_unlock(&wg->device_update_lock);
 		}
 	}
diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c
index 8c496b747108..6f07b949cb81 100644
--- a/drivers/net/wireguard/socket.c
+++ b/drivers/net/wireguard/socket.c
@@ -308,7 +308,7 @@ void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer)
 {
 	write_lock_bh(&peer->endpoint_lock);
 	memset(&peer->endpoint.src6, 0, sizeof(peer->endpoint.src6));
-	dst_cache_reset(&peer->endpoint_cache);
+	dst_cache_reset_now(&peer->endpoint_cache);
 	write_unlock_bh(&peer->endpoint_lock);
 }
 
diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
index 67634675e919..df6622a5fe98 100644
--- a/include/net/dst_cache.h
+++ b/include/net/dst_cache.h
@@ -79,6 +79,17 @@ static inline void dst_cache_reset(struct dst_cache *dst_cache)
 	dst_cache->reset_ts = jiffies;
 }
 
+/**
+ *	dst_cache_reset_now - invalidate the cache contents immediately
+ *	@dst_cache: the cache
+ *
+ *	The caller must be sure there are no concurrent users, as this frees
+ *	all dst_cache users immediately, rather than waiting for the next
+ *	per-cpu usage like dst_cache_reset does. Most callers should use the
+ *	higher speed lazily-freed dst_cache_reset function instead.
+ */
+void dst_cache_reset_now(struct dst_cache *dst_cache);
+
 /**
  *	dst_cache_init - initialize the cache, allocating the required storage
  *	@dst_cache: the cache
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index be74ab4551c2..0ccfd5fa5cb9 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -162,3 +162,22 @@ void dst_cache_destroy(struct dst_cache *dst_cache)
 	free_percpu(dst_cache->cache);
 }
 EXPORT_SYMBOL_GPL(dst_cache_destroy);
+
+void dst_cache_reset_now(struct dst_cache *dst_cache)
+{
+	int i;
+
+	if (!dst_cache->cache)
+		return;
+
+	dst_cache->reset_ts = jiffies;
+	for_each_possible_cpu(i) {
+		struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
+		struct dst_entry *dst = idst->dst;
+
+		idst->cookie = 0;
+		idst->dst = NULL;
+		dst_release(dst);
+	}
+}
+EXPORT_SYMBOL_GPL(dst_cache_reset_now);
diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh
index 2e5c1630885e..8a9461aa0878 100755
--- a/tools/testing/selftests/wireguard/netns.sh
+++ b/tools/testing/selftests/wireguard/netns.sh
@@ -613,6 +613,28 @@ ip0 link set wg0 up
 kill $ncat_pid
 ip0 link del wg0
 
+# Ensure that dst_cache references don't outlive netns lifetime
+ip1 link add dev wg0 type wireguard
+ip2 link add dev wg0 type wireguard
+configure_peers
+ip1 link add veth1 type veth peer name veth2
+ip1 link set veth2 netns $netns2
+ip1 addr add fd00:aa::1/64 dev veth1
+ip2 addr add fd00:aa::2/64 dev veth2
+ip1 link set veth1 up
+ip2 link set veth2 up
+waitiface $netns1 veth1
+waitiface $netns2 veth2
+ip1 -6 route add default dev veth1 via fd00:aa::2
+ip2 -6 route add default dev veth2 via fd00:aa::1
+n1 wg set wg0 peer "$pub2" endpoint [fd00:aa::2]:2
+n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::1]:1
+n1 ping6 -c 1 fd00::2
+pp ip netns delete $netns1
+pp ip netns delete $netns2
+pp ip netns add $netns1
+pp ip netns add $netns2
+
 # Ensure there aren't circular reference loops
 ip1 link add wg1 type wireguard
 ip2 link add wg2 type wireguard
@@ -631,7 +653,7 @@ while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do
 done < /dev/kmsg
 alldeleted=1
 for object in "${!objects[@]}"; do
-	if [[ ${objects["$object"]} != *createddestroyed ]]; then
+	if [[ ${objects["$object"]} != *createddestroyed && ${objects["$object"]} != *createdcreateddestroyeddestroyed ]]; then
 		echo "Error: $object: merely ${objects["$object"]}" >&3
 		alldeleted=0
 	fi
-- 
cgit v1.2.3-71-gd317


From f7e5b9bfa6c8820407b64eabc1f29c9a87e8993d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 29 Nov 2021 10:39:29 -0500
Subject: siphash: use _unaligned version by default

On ARM v6 and later, we define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
because the ordinary load/store instructions (ldr, ldrh, ldrb) can
tolerate any misalignment of the memory address. However, load/store
double and load/store multiple instructions (ldrd, ldm) may still only
be used on memory addresses that are 32-bit aligned, and so we have to
use the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS macro with care, or we
may end up with a severe performance hit due to alignment traps that
require fixups by the kernel. Testing shows that this currently happens
with clang-13 but not gcc-11. In theory, any compiler version can
produce this bug or other problems, as we are dealing with undefined
behavior in C99 even on architectures that support this in hardware,
see also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363.

Fortunately, the get_unaligned() accessors do the right thing: when
building for ARMv6 or later, the compiler will emit unaligned accesses
using the ordinary load/store instructions (but avoid the ones that
require 32-bit alignment). When building for older ARM, those accessors
will emit the appropriate sequence of ldrb/mov/orr instructions. And on
architectures that can truly tolerate any kind of misalignment, the
get_unaligned() accessors resolve to the leXX_to_cpup accessors that
operate on aligned addresses.

Since the compiler will in fact emit ldrd or ldm instructions when
building this code for ARM v6 or later, the solution is to use the
unaligned accessors unconditionally on architectures where this is
known to be fast. The _aligned version of the hash function is
however still needed to get the best performance on architectures
that cannot do any unaligned access in hardware.

This new version avoids the undefined behavior and should produce
the fastest hash on all architectures we support.

Link: https://lore.kernel.org/linux-arm-kernel/20181008211554.5355-4-ard.biesheuvel@linaro.org/
Link: https://lore.kernel.org/linux-crypto/CAK8P3a2KfmmGDbVHULWevB0hv71P2oi2ZCHEAqT=8dQfa0=cqQ@mail.gmail.com/
Reported-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Fixes: 2c956a60778c ("siphash: add cryptographically secure PRF")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/siphash.h | 14 ++++----------
 lib/siphash.c           | 12 ++++++------
 2 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/siphash.h b/include/linux/siphash.h
index bf21591a9e5e..0cda61855d90 100644
--- a/include/linux/siphash.h
+++ b/include/linux/siphash.h
@@ -27,9 +27,7 @@ static inline bool siphash_key_is_zero(const siphash_key_t *key)
 }
 
 u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
-#endif
 
 u64 siphash_1u64(const u64 a, const siphash_key_t *key);
 u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
@@ -82,10 +80,9 @@ static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
 static inline u64 siphash(const void *data, size_t len,
 			  const siphash_key_t *key)
 {
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+	    !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
 		return __siphash_unaligned(data, len, key);
-#endif
 	return ___siphash_aligned(data, len, key);
 }
 
@@ -96,10 +93,8 @@ typedef struct {
 
 u32 __hsiphash_aligned(const void *data, size_t len,
 		       const hsiphash_key_t *key);
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u32 __hsiphash_unaligned(const void *data, size_t len,
 			 const hsiphash_key_t *key);
-#endif
 
 u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
 u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
@@ -135,10 +130,9 @@ static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
 static inline u32 hsiphash(const void *data, size_t len,
 			   const hsiphash_key_t *key)
 {
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+	    !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
 		return __hsiphash_unaligned(data, len, key);
-#endif
 	return ___hsiphash_aligned(data, len, key);
 }
 
diff --git a/lib/siphash.c b/lib/siphash.c
index a90112ee72a1..72b9068ab57b 100644
--- a/lib/siphash.c
+++ b/lib/siphash.c
@@ -49,6 +49,7 @@
 	SIPROUND; \
 	return (v0 ^ v1) ^ (v2 ^ v3);
 
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
 {
 	const u8 *end = data + len - (len % sizeof(u64));
@@ -80,8 +81,8 @@ u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
 	POSTAMBLE
 }
 EXPORT_SYMBOL(__siphash_aligned);
+#endif
 
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
 {
 	const u8 *end = data + len - (len % sizeof(u64));
@@ -113,7 +114,6 @@ u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
 	POSTAMBLE
 }
 EXPORT_SYMBOL(__siphash_unaligned);
-#endif
 
 /**
  * siphash_1u64 - compute 64-bit siphash PRF value of a u64
@@ -250,6 +250,7 @@ EXPORT_SYMBOL(siphash_3u32);
 	HSIPROUND; \
 	return (v0 ^ v1) ^ (v2 ^ v3);
 
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
 {
 	const u8 *end = data + len - (len % sizeof(u64));
@@ -280,8 +281,8 @@ u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
 	HPOSTAMBLE
 }
 EXPORT_SYMBOL(__hsiphash_aligned);
+#endif
 
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u32 __hsiphash_unaligned(const void *data, size_t len,
 			 const hsiphash_key_t *key)
 {
@@ -313,7 +314,6 @@ u32 __hsiphash_unaligned(const void *data, size_t len,
 	HPOSTAMBLE
 }
 EXPORT_SYMBOL(__hsiphash_unaligned);
-#endif
 
 /**
  * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32
@@ -418,6 +418,7 @@ EXPORT_SYMBOL(hsiphash_4u32);
 	HSIPROUND; \
 	return v1 ^ v3;
 
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
 {
 	const u8 *end = data + len - (len % sizeof(u32));
@@ -438,8 +439,8 @@ u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
 	HPOSTAMBLE
 }
 EXPORT_SYMBOL(__hsiphash_aligned);
+#endif
 
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u32 __hsiphash_unaligned(const void *data, size_t len,
 			 const hsiphash_key_t *key)
 {
@@ -461,7 +462,6 @@ u32 __hsiphash_unaligned(const void *data, size_t len,
 	HPOSTAMBLE
 }
 EXPORT_SYMBOL(__hsiphash_unaligned);
-#endif
 
 /**
  * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
-- 
cgit v1.2.3-71-gd317


From 79364031c5b4365ca28ac0fa00acfab5bf465be1 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 27 Nov 2021 17:32:00 +0100
Subject: bpf: Make sure bpf_disable_instrumentation() is safe vs preemption.

The initial implementation of migrate_disable() for mainline was a
wrapper around preempt_disable(). RT kernels substituted this with a
real migrate disable implementation.

Later on mainline gained true migrate disable support, but neither
documentation nor affected code were updated.

Remove stale comments claiming that migrate_disable() is PREEMPT_RT only.

Don't use __this_cpu_inc() in the !PREEMPT_RT path because preemption is
not disabled and the RMW operation can be preempted.

Fixes: 74d862b682f51 ("sched: Make migrate_disable/enable() independent of RT")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211127163200.10466-3-bigeasy@linutronix.de
---
 include/linux/bpf.h    | 16 ++--------------
 include/linux/filter.h |  3 ---
 2 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 84ff6ef49462..755f38e893be 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1353,28 +1353,16 @@ extern struct mutex bpf_stats_enabled_mutex;
  * kprobes, tracepoints) to prevent deadlocks on map operations as any of
  * these events can happen inside a region which holds a map bucket lock
  * and can deadlock on it.
- *
- * Use the preemption safe inc/dec variants on RT because migrate disable
- * is preemptible on RT and preemption in the middle of the RMW operation
- * might lead to inconsistent state. Use the raw variants for non RT
- * kernels as migrate_disable() maps to preempt_disable() so the slightly
- * more expensive save operation can be avoided.
  */
 static inline void bpf_disable_instrumentation(void)
 {
 	migrate_disable();
-	if (IS_ENABLED(CONFIG_PREEMPT_RT))
-		this_cpu_inc(bpf_prog_active);
-	else
-		__this_cpu_inc(bpf_prog_active);
+	this_cpu_inc(bpf_prog_active);
 }
 
 static inline void bpf_enable_instrumentation(void)
 {
-	if (IS_ENABLED(CONFIG_PREEMPT_RT))
-		this_cpu_dec(bpf_prog_active);
-	else
-		__this_cpu_dec(bpf_prog_active);
+	this_cpu_dec(bpf_prog_active);
 	migrate_enable();
 }
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 24b7ed2677af..534f678ca50f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -640,9 +640,6 @@ static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void
  * This uses migrate_disable/enable() explicitly to document that the
  * invocation of a BPF program does not require reentrancy protection
  * against a BPF program which is invoked from a preempting task.
- *
- * For non RT enabled kernels migrate_disable/enable() maps to
- * preempt_disable/enable(), i.e. it disables also preemption.
  */
 static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
 					  const void *ctx)
-- 
cgit v1.2.3-71-gd317


From 502e82b91361955c66c8453b5b7a905b0b5bd5a1 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Sun, 7 Nov 2021 17:21:45 +0200
Subject: net/mlx5: Fix access to a non-supported register

Validate MRTC register is supported before triggering a delayed work
which accesses it.

Fixes: 5a1023deeed0 ("net/mlx5: Add periodic update of host time to firmware")
Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c   | 8 +++-----
 include/linux/mlx5/mlx5_ifc.h                    | 5 ++++-
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 380f50d5462d..3ca998874c50 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -836,7 +836,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 	health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms);
 	add_timer(&health->timer);
 
-	if (mlx5_core_is_pf(dev))
+	if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc))
 		queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index e127c0530b3a..7df9c7f8d9c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1071,18 +1071,16 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
 
 	mlx5_set_driver_version(dev);
 
-	mlx5_start_health_poll(dev);
-
 	err = mlx5_query_hca_caps(dev);
 	if (err) {
 		mlx5_core_err(dev, "query hca failed\n");
-		goto stop_health;
+		goto reclaim_boot_pages;
 	}
 
+	mlx5_start_health_poll(dev);
+
 	return 0;
 
-stop_health:
-	mlx5_stop_health_poll(dev, boot);
 reclaim_boot_pages:
 	mlx5_reclaim_startup_pages(dev);
 err_disable_hca:
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3636df90899a..fbaab440a484 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9698,7 +9698,10 @@ struct mlx5_ifc_mcam_access_reg_bits {
 	u8         regs_84_to_68[0x11];
 	u8         tracer_registers[0x4];
 
-	u8         regs_63_to_32[0x20];
+	u8         regs_63_to_46[0x12];
+	u8         mrtc[0x1];
+	u8         regs_44_to_32[0xd];
+
 	u8         regs_31_to_0[0x20];
 };
 
-- 
cgit v1.2.3-71-gd317


From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 1 Dec 2021 23:45:50 +0900
Subject: kprobes: Limit max data_size of the kretprobe instances

The 'kprobe::data_size' is unsigned, thus it can not be negative.  But if
user sets it enough big number (e.g. (size_t)-8), the result of 'data_size
+ sizeof(struct kretprobe_instance)' becomes smaller than sizeof(struct
kretprobe_instance) or zero. In result, the kretprobe_instance are
allocated without enough memory, and kretprobe accesses outside of
allocated memory.

To avoid this issue, introduce a max limitation of the
kretprobe::data_size. 4KB per instance should be OK.

Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2

Cc: stable@vger.kernel.org
Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler")
Reported-by: zhangyue <zhangyue1@kylinos.cn>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h | 2 ++
 kernel/kprobes.c        | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e974caf39d3e..8c8f7a4d93af 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -153,6 +153,8 @@ struct kretprobe {
 	struct kretprobe_holder *rph;
 };
 
+#define KRETPROBE_MAX_DATA_SIZE	4096
+
 struct kretprobe_instance {
 	union {
 		struct freelist_node freelist;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e9db0c810554..21eccc961bba 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2086,6 +2086,9 @@ int register_kretprobe(struct kretprobe *rp)
 		}
 	}
 
+	if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
+		return -E2BIG;
+
 	rp->kp.pre_handler = pre_handler_kretprobe;
 	rp->kp.post_handler = NULL;
 
-- 
cgit v1.2.3-71-gd317


From 7a10d8c810cfad3e79372d7d1c77899d86cd6662 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Nov 2021 09:01:55 -0800
Subject: net: annotate data-races on txq->xmit_lock_owner

syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner
without annotations.

No serious issue there, let's document what is happening there.

BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit

write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0:
 __netif_tx_unlock include/linux/netdevice.h:4437 [inline]
 __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229
 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265
 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline]
 macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567
 __netdev_start_xmit include/linux/netdevice.h:4987 [inline]
 netdev_start_xmit include/linux/netdevice.h:5001 [inline]
 xmit_one+0x105/0x2f0 net/core/dev.c:3590
 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606
 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342
 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817
 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194
 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259
 neigh_hh_output include/net/neighbour.h:511 [inline]
 neigh_output include/net/neighbour.h:525 [inline]
 ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126
 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline]
 ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201
 NF_HOOK_COND include/linux/netfilter.h:296 [inline]
 ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224
 dst_output include/net/dst.h:450 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508
 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702
 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898
 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421
 expire_timers+0x116/0x240 kernel/time/timer.c:1466
 __run_timers+0x368/0x410 kernel/time/timer.c:1734
 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747
 __do_softirq+0x158/0x2de kernel/softirq.c:558
 __irq_exit_rcu kernel/softirq.c:636 [inline]
 irq_exit_rcu+0x37/0x70 kernel/softirq.c:648
 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097
 asm_sysvec_apic_timer_interrupt+0x12/0x20

read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1:
 __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213
 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265
 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline]
 macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567
 __netdev_start_xmit include/linux/netdevice.h:4987 [inline]
 netdev_start_xmit include/linux/netdevice.h:5001 [inline]
 xmit_one+0x105/0x2f0 net/core/dev.c:3590
 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606
 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342
 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817
 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194
 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259
 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523
 neigh_output include/net/neighbour.h:527 [inline]
 ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126
 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline]
 ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201
 NF_HOOK_COND include/linux/netfilter.h:296 [inline]
 ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224
 dst_output include/net/dst.h:450 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508
 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702
 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898
 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421
 expire_timers+0x116/0x240 kernel/time/timer.c:1466
 __run_timers+0x368/0x410 kernel/time/timer.c:1734
 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747
 __do_softirq+0x158/0x2de kernel/softirq.c:558
 __irq_exit_rcu kernel/softirq.c:636 [inline]
 irq_exit_rcu+0x37/0x70 kernel/softirq.c:648
 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097
 asm_sysvec_apic_timer_interrupt+0x12/0x20
 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443
 folio_test_anon include/linux/page-flags.h:581 [inline]
 PageAnon include/linux/page-flags.h:586 [inline]
 zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347
 zap_pmd_range mm/memory.c:1467 [inline]
 zap_pud_range mm/memory.c:1496 [inline]
 zap_p4d_range mm/memory.c:1517 [inline]
 unmap_page_range+0x2dc/0x3d0 mm/memory.c:1538
 unmap_single_vma+0x157/0x210 mm/memory.c:1583
 unmap_vmas+0xd0/0x180 mm/memory.c:1615
 exit_mmap+0x23d/0x470 mm/mmap.c:3170
 __mmput+0x27/0x1b0 kernel/fork.c:1113
 mmput+0x3d/0x50 kernel/fork.c:1134
 exit_mm+0xdb/0x170 kernel/exit.c:507
 do_exit+0x608/0x17a0 kernel/exit.c:819
 do_group_exit+0xce/0x180 kernel/exit.c:929
 get_signal+0xfc3/0x1550 kernel/signal.c:2852
 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868
 handle_signal_work kernel/entry/common.c:148 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
 exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207
 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline]
 syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300
 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x00000000 -> 0xffffffff

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G        W         5.16.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 19 +++++++++++++------
 net/core/dev.c            |  5 ++++-
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec42495a43a..be5cb3360b94 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4404,7 +4404,8 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
 {
 	spin_lock(&txq->_xmit_lock);
-	txq->xmit_lock_owner = cpu;
+	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	WRITE_ONCE(txq->xmit_lock_owner, cpu);
 }
 
 static inline bool __netif_tx_acquire(struct netdev_queue *txq)
@@ -4421,26 +4422,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq)
 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
 {
 	spin_lock_bh(&txq->_xmit_lock);
-	txq->xmit_lock_owner = smp_processor_id();
+	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
 }
 
 static inline bool __netif_tx_trylock(struct netdev_queue *txq)
 {
 	bool ok = spin_trylock(&txq->_xmit_lock);
-	if (likely(ok))
-		txq->xmit_lock_owner = smp_processor_id();
+
+	if (likely(ok)) {
+		/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+		WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
+	}
 	return ok;
 }
 
 static inline void __netif_tx_unlock(struct netdev_queue *txq)
 {
-	txq->xmit_lock_owner = -1;
+	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	WRITE_ONCE(txq->xmit_lock_owner, -1);
 	spin_unlock(&txq->_xmit_lock);
 }
 
 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
 {
-	txq->xmit_lock_owner = -1;
+	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
+	WRITE_ONCE(txq->xmit_lock_owner, -1);
 	spin_unlock_bh(&txq->_xmit_lock);
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 15ac064b5562..2a352e668d10 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4210,7 +4210,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	if (dev->flags & IFF_UP) {
 		int cpu = smp_processor_id(); /* ok because BHs are off */
 
-		if (txq->xmit_lock_owner != cpu) {
+		/* Other cpus might concurrently change txq->xmit_lock_owner
+		 * to -1 or to their cpu id, but not to our id.
+		 */
+		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
 			if (dev_xmit_recursion())
 				goto recursion_alert;
 
-- 
cgit v1.2.3-71-gd317


From a37a0ee4d25c152a087c5e913b76f207eaebdb54 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Nov 2021 10:29:39 -0800
Subject: net: avoid uninit-value from tcp_conn_request

A recent change triggers a KMSAN warning, because request
sockets do not initialize @sk_rx_queue_mapping field.

Add sk_rx_queue_update() helper to make our intent clear.

BUG: KMSAN: uninit-value in sk_rx_queue_set include/net/sock.h:1922 [inline]
BUG: KMSAN: uninit-value in tcp_conn_request+0x3bcc/0x4dc0 net/ipv4/tcp_input.c:6922
 sk_rx_queue_set include/net/sock.h:1922 [inline]
 tcp_conn_request+0x3bcc/0x4dc0 net/ipv4/tcp_input.c:6922
 tcp_v4_conn_request+0x218/0x2a0 net/ipv4/tcp_ipv4.c:1528
 tcp_rcv_state_process+0x2c5/0x3290 net/ipv4/tcp_input.c:6406
 tcp_v4_do_rcv+0xb4e/0x1330 net/ipv4/tcp_ipv4.c:1738
 tcp_v4_rcv+0x468d/0x4ed0 net/ipv4/tcp_ipv4.c:2100
 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204
 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:460 [inline]
 ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline]
 ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline]
 ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609
 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644
 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline]
 __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553
 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605
 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696
 gro_normal_list net/core/dev.c:5850 [inline]
 napi_complete_done+0x579/0xdd0 net/core/dev.c:6587
 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline]
 virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557
 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020
 napi_poll net/core/dev.c:7087 [inline]
 net_rx_action+0x824/0x1880 net/core/dev.c:7174
 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558
 invoke_softirq+0xa4/0x130 kernel/softirq.c:432
 __irq_exit_rcu kernel/softirq.c:636 [inline]
 irq_exit_rcu+0x76/0x130 kernel/softirq.c:648
 common_interrupt+0xb6/0xd0 arch/x86/kernel/irq.c:240
 asm_common_interrupt+0x1e/0x40
 smap_restore arch/x86/include/asm/smap.h:67 [inline]
 get_shadow_origin_ptr mm/kmsan/instrumentation.c:31 [inline]
 __msan_metadata_ptr_for_load_1+0x28/0x30 mm/kmsan/instrumentation.c:63
 tomoyo_check_acl+0x1b0/0x630 security/tomoyo/domain.c:173
 tomoyo_path_permission security/tomoyo/file.c:586 [inline]
 tomoyo_check_open_permission+0x61f/0xe10 security/tomoyo/file.c:777
 tomoyo_file_open+0x24f/0x2d0 security/tomoyo/tomoyo.c:311
 security_file_open+0xb1/0x1f0 security/security.c:1635
 do_dentry_open+0x4e4/0x1bf0 fs/open.c:809
 vfs_open+0xaf/0xe0 fs/open.c:957
 do_open fs/namei.c:3426 [inline]
 path_openat+0x52f1/0x5dd0 fs/namei.c:3559
 do_filp_open+0x306/0x760 fs/namei.c:3586
 do_sys_openat2+0x263/0x8f0 fs/open.c:1212
 do_sys_open fs/open.c:1228 [inline]
 __do_sys_open fs/open.c:1236 [inline]
 __se_sys_open fs/open.c:1232 [inline]
 __x64_sys_open+0x314/0x380 fs/open.c:1232
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Uninit was created at:
 __alloc_pages+0xbc7/0x10a0 mm/page_alloc.c:5409
 alloc_pages+0x8a5/0xb80
 alloc_slab_page mm/slub.c:1810 [inline]
 allocate_slab+0x287/0x1c20 mm/slub.c:1947
 new_slab mm/slub.c:2010 [inline]
 ___slab_alloc+0xbdf/0x1e90 mm/slub.c:3039
 __slab_alloc mm/slub.c:3126 [inline]
 slab_alloc_node mm/slub.c:3217 [inline]
 slab_alloc mm/slub.c:3259 [inline]
 kmem_cache_alloc+0xbb3/0x11c0 mm/slub.c:3264
 reqsk_alloc include/net/request_sock.h:91 [inline]
 inet_reqsk_alloc+0xaf/0x8b0 net/ipv4/tcp_input.c:6712
 tcp_conn_request+0x910/0x4dc0 net/ipv4/tcp_input.c:6852
 tcp_v4_conn_request+0x218/0x2a0 net/ipv4/tcp_ipv4.c:1528
 tcp_rcv_state_process+0x2c5/0x3290 net/ipv4/tcp_input.c:6406
 tcp_v4_do_rcv+0xb4e/0x1330 net/ipv4/tcp_ipv4.c:1738
 tcp_v4_rcv+0x468d/0x4ed0 net/ipv4/tcp_ipv4.c:2100
 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204
 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:460 [inline]
 ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline]
 ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline]
 ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609
 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644
 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline]
 __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553
 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605
 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696
 gro_normal_list net/core/dev.c:5850 [inline]
 napi_complete_done+0x579/0xdd0 net/core/dev.c:6587
 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline]
 virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557
 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020
 napi_poll net/core/dev.c:7087 [inline]
 net_rx_action+0x824/0x1880 net/core/dev.c:7174
 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558

Fixes: 342159ee394d ("net: avoid dirtying sk->sk_rx_queue_mapping")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20211130182939.2584764-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/busy_poll.h |  2 +-
 include/net/sock.h      | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 4202c609bb0b..7994455ec714 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -133,7 +133,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
 	if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id))
 		WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
 #endif
-	sk_rx_queue_set(sk, skb);
+	sk_rx_queue_update(sk, skb);
 }
 
 static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id)
diff --git a/include/net/sock.h b/include/net/sock.h
index 715cdb4b2b79..bea21ff70e74 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1913,18 +1913,31 @@ static inline int sk_tx_queue_get(const struct sock *sk)
 	return -1;
 }
 
-static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+static inline void __sk_rx_queue_set(struct sock *sk,
+				     const struct sk_buff *skb,
+				     bool force_set)
 {
 #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
 	if (skb_rx_queue_recorded(skb)) {
 		u16 rx_queue = skb_get_rx_queue(skb);
 
-		if (unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
+		if (force_set ||
+		    unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
 			WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
 	}
 #endif
 }
 
+static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+{
+	__sk_rx_queue_set(sk, skb, true);
+}
+
+static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb)
+{
+	__sk_rx_queue_set(sk, skb, false);
+}
+
 static inline void sk_rx_queue_clear(struct sock *sk)
 {
 #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
-- 
cgit v1.2.3-71-gd317


From 213f5f8f31f10aa1e83187ae20fb7fa4e626b724 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 1 Dec 2021 18:26:35 -0800
Subject: ipv4: convert fib_num_tclassid_users to atomic_t

Before commit faa041a40b9f ("ipv4: Create cleanup helper for fib_nh")
changes to net->ipv4.fib_num_tclassid_users were protected by RTNL.

After the change, this is no longer the case, as free_fib_info_rcu()
runs after rcu grace period, without rtnl being held.

Fixes: faa041a40b9f ("ipv4: Create cleanup helper for fib_nh")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Ahern <dsahern@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     | 2 +-
 include/net/netns/ipv4.h | 2 +-
 net/ipv4/fib_frontend.c  | 2 +-
 net/ipv4/fib_rules.c     | 4 ++--
 net/ipv4/fib_semantics.c | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index ab5348e57db1..3417ba2d27ad 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -438,7 +438,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 static inline int fib_num_tclassid_users(struct net *net)
 {
-	return net->ipv4.fib_num_tclassid_users;
+	return atomic_read(&net->ipv4.fib_num_tclassid_users);
 }
 #else
 static inline int fib_num_tclassid_users(struct net *net)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2f65701a43c9..6c5b2efc4f17 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -65,7 +65,7 @@ struct netns_ipv4 {
 	bool			fib_has_custom_local_routes;
 	bool			fib_offload_disabled;
 #ifdef CONFIG_IP_ROUTE_CLASSID
-	int			fib_num_tclassid_users;
+	atomic_t		fib_num_tclassid_users;
 #endif
 	struct hlist_head	*fib_table_hash;
 	struct sock		*fibnl;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 9fe13e4f5d08..4d61ddd8a0ec 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1582,7 +1582,7 @@ static int __net_init fib_net_init(struct net *net)
 	int error;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
-	net->ipv4.fib_num_tclassid_users = 0;
+	atomic_set(&net->ipv4.fib_num_tclassid_users, 0);
 #endif
 	error = ip_fib_net_init(net);
 	if (error < 0)
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 364ad3446b2f..d279cb8ac158 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -264,7 +264,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	if (tb[FRA_FLOW]) {
 		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
 		if (rule4->tclassid)
-			net->ipv4.fib_num_tclassid_users++;
+			atomic_inc(&net->ipv4.fib_num_tclassid_users);
 	}
 #endif
 
@@ -296,7 +296,7 @@ static int fib4_rule_delete(struct fib_rule *rule)
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	if (((struct fib4_rule *)rule)->tclassid)
-		net->ipv4.fib_num_tclassid_users--;
+		atomic_dec(&net->ipv4.fib_num_tclassid_users);
 #endif
 	net->ipv4.fib_has_custom_rules = true;
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3364cb9c67e0..fde7797b5806 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -220,7 +220,7 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
 {
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	if (fib_nh->nh_tclassid)
-		net->ipv4.fib_num_tclassid_users--;
+		atomic_dec(&net->ipv4.fib_num_tclassid_users);
 #endif
 	fib_nh_common_release(&fib_nh->nh_common);
 }
@@ -632,7 +632,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	nh->nh_tclassid = cfg->fc_flow;
 	if (nh->nh_tclassid)
-		net->ipv4.fib_num_tclassid_users++;
+		atomic_inc(&net->ipv4.fib_num_tclassid_users);
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	nh->fib_nh_weight = nh_weight;
-- 
cgit v1.2.3-71-gd317


From 72f6a45202f20f0e1a46b0acb7803369cc53d0b8 Mon Sep 17 00:00:00 2001
From: Xiayu Zhang <Xiayu.Zhang@mediatek.com>
Date: Wed, 1 Dec 2021 10:57:13 +0800
Subject: Fix Comment of ETH_P_802_3_MIN

The description of ETH_P_802_3_MIN is misleading.
The value of EthernetType in Ethernet II frame is more than 0x0600,
the value of Length in 802.3 frame is less than 0x0600.

Signed-off-by: Xiayu Zhang <Xiayu.Zhang@mediatek.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 5da4ee234e0b..c0c2f3ed5729 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -117,7 +117,7 @@
 #define ETH_P_IFE	0xED3E		/* ForCES inter-FE LFB type */
 #define ETH_P_AF_IUCV   0xFBFB		/* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */
 
-#define ETH_P_802_3_MIN	0x0600		/* If the value in the ethernet type is less than this value
+#define ETH_P_802_3_MIN	0x0600		/* If the value in the ethernet type is more than this value
 					 * then the frame is Ethernet II. Else it is 802.3 */
 
 /*
-- 
cgit v1.2.3-71-gd317


From e7f2be115f0746b969c0df14c0d182f65f005ca5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 26 Oct 2021 16:10:55 +0200
Subject: sched/cputime: Fix getrusage(RUSAGE_THREAD) with nohz_full

getrusage(RUSAGE_THREAD) with nohz_full may return shorter utime/stime
than the actual time.

task_cputime_adjusted() snapshots utime and stime and then adjust their
sum to match the scheduler maintained cputime.sum_exec_runtime.
Unfortunately in nohz_full, sum_exec_runtime is only updated once per
second in the worst case, causing a discrepancy against utime and stime
that can be updated anytime by the reader using vtime.

To fix this situation, perform an update of cputime.sum_exec_runtime
when the cputime snapshot reports the task as actually running while
the tick is disabled. The related overhead is then contained within the
relevant situations.

Reported-by: Hasegawa Hitomi <hasegawa-hitomi@fujitsu.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Hasegawa Hitomi <hasegawa-hitomi@fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Acked-by: Phil Auld <pauld@redhat.com>
Link: https://lore.kernel.org/r/20211026141055.57358-3-frederic@kernel.org
---
 include/linux/sched/cputime.h |  5 +++--
 kernel/sched/cputime.c        | 12 +++++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 6c9f19a33865..ce3c58286062 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -18,15 +18,16 @@
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void task_cputime(struct task_struct *t,
+extern bool task_cputime(struct task_struct *t,
 			 u64 *utime, u64 *stime);
 extern u64 task_gtime(struct task_struct *t);
 #else
-static inline void task_cputime(struct task_struct *t,
+static inline bool task_cputime(struct task_struct *t,
 				u64 *utime, u64 *stime)
 {
 	*utime = t->utime;
 	*stime = t->stime;
+	return false;
 }
 
 static inline u64 task_gtime(struct task_struct *t)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 872e481d5098..9392aea1804e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -615,7 +615,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 		.sum_exec_runtime = p->se.sum_exec_runtime,
 	};
 
-	task_cputime(p, &cputime.utime, &cputime.stime);
+	if (task_cputime(p, &cputime.utime, &cputime.stime))
+		cputime.sum_exec_runtime = task_sched_runtime(p);
 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
@@ -828,19 +829,21 @@ u64 task_gtime(struct task_struct *t)
  * add up the pending nohz execution time since the last
  * cputime snapshot.
  */
-void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
+bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 {
 	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
 	u64 delta;
+	int ret;
 
 	if (!vtime_accounting_enabled()) {
 		*utime = t->utime;
 		*stime = t->stime;
-		return;
+		return false;
 	}
 
 	do {
+		ret = false;
 		seq = read_seqcount_begin(&vtime->seqcount);
 
 		*utime = t->utime;
@@ -850,6 +853,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		if (vtime->state < VTIME_SYS)
 			continue;
 
+		ret = true;
 		delta = vtime_delta(vtime);
 
 		/*
@@ -861,6 +865,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		else
 			*utime += vtime->utime + delta;
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+	return ret;
 }
 
 static int vtime_state_fetch(struct vtime *vtime, int cpu)
-- 
cgit v1.2.3-71-gd317


From f83baa0cb6cfc92ebaf7f9d3a99d7e34f2e77a8a Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 1 Dec 2021 19:35:01 +0100
Subject: HID: add hid_is_usb() function to make it simpler for USB detection

A number of HID drivers already call hid_is_using_ll_driver() but only
for the detection of if this is a USB device or not.  Make this more
obvious by creating hid_is_usb() and calling the function that way.

Also converts the existing hid_is_using_ll_driver() functions to use the
new call.

Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-input@vger.kernel.org
Cc: stable@vger.kernel.org
Tested-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20211201183503.2373082-1-gregkh@linuxfoundation.org
---
 drivers/hid/hid-asus.c           | 6 ++----
 drivers/hid/hid-logitech-dj.c    | 2 +-
 drivers/hid/hid-u2fzero.c        | 2 +-
 drivers/hid/hid-uclogic-params.c | 3 +--
 drivers/hid/wacom_sys.c          | 2 +-
 include/linux/hid.h              | 5 +++++
 6 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c
index f3ecddc519ee..08c9a9a60ae4 100644
--- a/drivers/hid/hid-asus.c
+++ b/drivers/hid/hid-asus.c
@@ -1028,8 +1028,7 @@ static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id)
 	if (drvdata->quirks & QUIRK_IS_MULTITOUCH)
 		drvdata->tp = &asus_i2c_tp;
 
-	if ((drvdata->quirks & QUIRK_T100_KEYBOARD) &&
-	    hid_is_using_ll_driver(hdev, &usb_hid_driver)) {
+	if ((drvdata->quirks & QUIRK_T100_KEYBOARD) && hid_is_usb(hdev)) {
 		struct usb_interface *intf = to_usb_interface(hdev->dev.parent);
 
 		if (intf->altsetting->desc.bInterfaceNumber == T100_TPAD_INTF) {
@@ -1057,8 +1056,7 @@ static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id)
 		drvdata->tp = &asus_t100chi_tp;
 	}
 
-	if ((drvdata->quirks & QUIRK_MEDION_E1239T) &&
-	    hid_is_using_ll_driver(hdev, &usb_hid_driver)) {
+	if ((drvdata->quirks & QUIRK_MEDION_E1239T) && hid_is_usb(hdev)) {
 		struct usb_host_interface *alt =
 			to_usb_interface(hdev->dev.parent)->altsetting;
 
diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c
index a0017b010c34..7106b921b53c 100644
--- a/drivers/hid/hid-logitech-dj.c
+++ b/drivers/hid/hid-logitech-dj.c
@@ -1777,7 +1777,7 @@ static int logi_dj_probe(struct hid_device *hdev,
 	case recvr_type_bluetooth:	no_dj_interfaces = 2; break;
 	case recvr_type_dinovo:		no_dj_interfaces = 2; break;
 	}
-	if (hid_is_using_ll_driver(hdev, &usb_hid_driver)) {
+	if (hid_is_usb(hdev)) {
 		intf = to_usb_interface(hdev->dev.parent);
 		if (intf && intf->altsetting->desc.bInterfaceNumber >=
 							no_dj_interfaces) {
diff --git a/drivers/hid/hid-u2fzero.c b/drivers/hid/hid-u2fzero.c
index 31ea7fc69916..ad489caf53ad 100644
--- a/drivers/hid/hid-u2fzero.c
+++ b/drivers/hid/hid-u2fzero.c
@@ -311,7 +311,7 @@ static int u2fzero_probe(struct hid_device *hdev,
 	unsigned int minor;
 	int ret;
 
-	if (!hid_is_using_ll_driver(hdev, &usb_hid_driver))
+	if (!hid_is_usb(hdev))
 		return -EINVAL;
 
 	dev = devm_kzalloc(&hdev->dev, sizeof(*dev), GFP_KERNEL);
diff --git a/drivers/hid/hid-uclogic-params.c b/drivers/hid/hid-uclogic-params.c
index 3d67b748a3b9..adff1bd68d9f 100644
--- a/drivers/hid/hid-uclogic-params.c
+++ b/drivers/hid/hid-uclogic-params.c
@@ -843,8 +843,7 @@ int uclogic_params_init(struct uclogic_params *params,
 	struct uclogic_params p = {0, };
 
 	/* Check arguments */
-	if (params == NULL || hdev == NULL ||
-	    !hid_is_using_ll_driver(hdev, &usb_hid_driver)) {
+	if (params == NULL || hdev == NULL || !hid_is_usb(hdev)) {
 		rc = -EINVAL;
 		goto cleanup;
 	}
diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c
index 2717d39600b4..22d73772fbc5 100644
--- a/drivers/hid/wacom_sys.c
+++ b/drivers/hid/wacom_sys.c
@@ -2214,7 +2214,7 @@ static void wacom_update_name(struct wacom *wacom, const char *suffix)
 	if ((features->type == HID_GENERIC) && !strcmp("Wacom HID", features->name)) {
 		char *product_name = wacom->hdev->name;
 
-		if (hid_is_using_ll_driver(wacom->hdev, &usb_hid_driver)) {
+		if (hid_is_usb(wacom->hdev)) {
 			struct usb_interface *intf = to_usb_interface(wacom->hdev->dev.parent);
 			struct usb_device *dev = interface_to_usbdev(intf);
 			product_name = dev->product;
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 9e067f937dbc..f453be385bd4 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -840,6 +840,11 @@ static inline bool hid_is_using_ll_driver(struct hid_device *hdev,
 	return hdev->ll_driver == driver;
 }
 
+static inline bool hid_is_usb(struct hid_device *hdev)
+{
+	return hid_is_using_ll_driver(hdev, &usb_hid_driver);
+}
+
 #define	PM_HINT_FULLON	1<<5
 #define PM_HINT_NORMAL	1<<1
 
-- 
cgit v1.2.3-71-gd317


From d9847eb8be3d895b2b5f514fdf3885d47a0b92a2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 22 Nov 2021 20:17:40 +0530
Subject: bpf: Make CONFIG_DEBUG_INFO_BTF depend upon CONFIG_BPF_SYSCALL

Vinicius Costa Gomes reported [0] that build fails when
CONFIG_DEBUG_INFO_BTF is enabled and CONFIG_BPF_SYSCALL is disabled.
This leads to btf.c not being compiled, and then no symbol being present
in vmlinux for the declarations in btf.h. Since BTF is not useful
without enabling BPF subsystem, disallow this combination.

However, theoretically disabling both now could still fail, as the
symbol for kfunc_btf_id_list variables is not available. This isn't a
problem as the compiler usually optimizes the whole register/unregister
call, but at lower optimization levels it can fail the build in linking
stage.

Fix that by adding dummy variables so that modules taking address of
them still work, but the whole thing is a noop.

  [0]: https://lore.kernel.org/bpf/20211110205418.332403-1-vinicius.gomes@intel.com

Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration")
Reported-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211122144742.477787-2-memxor@gmail.com
---
 include/linux/btf.h | 14 ++++++++++----
 kernel/bpf/btf.c    |  9 ++-------
 lib/Kconfig.debug   |  1 +
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 203eef993d76..0e1b6281fd8f 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -245,7 +245,10 @@ struct kfunc_btf_id_set {
 	struct module *owner;
 };
 
-struct kfunc_btf_id_list;
+struct kfunc_btf_id_list {
+	struct list_head list;
+	struct mutex mutex;
+};
 
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
@@ -254,6 +257,9 @@ void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
 				 struct kfunc_btf_id_set *s);
 bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
 			      struct module *owner);
+
+extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
+extern struct kfunc_btf_id_list prog_test_kfunc_list;
 #else
 static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
 					     struct kfunc_btf_id_set *s)
@@ -268,13 +274,13 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist,
 {
 	return false;
 }
+
+static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused;
+static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused;
 #endif
 
 #define DEFINE_KFUNC_BTF_ID_SET(set, name)                                     \
 	struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set),     \
 					 THIS_MODULE }
 
-extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
-extern struct kfunc_btf_id_list prog_test_kfunc_list;
-
 #endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index dbc3ad07e21b..ea3df9867cec 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6346,11 +6346,6 @@ BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
 
 /* BTF ID set registration API for modules */
 
-struct kfunc_btf_id_list {
-	struct list_head list;
-	struct mutex mutex;
-};
-
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 
 void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
@@ -6389,8 +6384,6 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
 	return false;
 }
 
-#endif
-
 #define DEFINE_KFUNC_BTF_ID_LIST(name)                                         \
 	struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list),           \
 					  __MUTEX_INITIALIZER(name.mutex) };   \
@@ -6398,3 +6391,5 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
 
 DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
 DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+
+#endif
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9ef7ce18b4f5..596bb5e4790c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -316,6 +316,7 @@ config DEBUG_INFO_BTF
 	bool "Generate BTF typeinfo"
 	depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED
 	depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST
+	depends on BPF_SYSCALL
 	help
 	  Generate deduplicated BTF type information from DWARF debug info.
 	  Turning this on expects presence of pahole tool, which will convert
-- 
cgit v1.2.3-71-gd317


From 03cfda4fa6ea9bea2f30160579a78c2b8c1e616e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 2 Dec 2021 15:37:24 -0800
Subject: tcp: fix another uninit-value (sk_rx_queue_mapping)

KMSAN is still not happy [1].

I missed that passive connections do not inherit their
sk_rx_queue_mapping values from the request socket,
but instead tcp_child_process() is calling
sk_mark_napi_id(child, skb)

We have many sk_mark_napi_id() callers, so I am providing
a new helper, forcing the setting sk_rx_queue_mapping
and sk_napi_id.

Note that we had no KMSAN report for sk_napi_id because
passive connections got a copy of this field from the listener.
sk_rx_queue_mapping in the other hand is inside the
sk_dontcopy_begin/sk_dontcopy_end so sk_clone_lock()
leaves this field uninitialized.

We might remove dead code populating req->sk_rx_queue_mapping
in the future.

[1]

BUG: KMSAN: uninit-value in __sk_rx_queue_set include/net/sock.h:1924 [inline]
BUG: KMSAN: uninit-value in sk_rx_queue_update include/net/sock.h:1938 [inline]
BUG: KMSAN: uninit-value in sk_mark_napi_id include/net/busy_poll.h:136 [inline]
BUG: KMSAN: uninit-value in tcp_child_process+0xb42/0x1050 net/ipv4/tcp_minisocks.c:833
 __sk_rx_queue_set include/net/sock.h:1924 [inline]
 sk_rx_queue_update include/net/sock.h:1938 [inline]
 sk_mark_napi_id include/net/busy_poll.h:136 [inline]
 tcp_child_process+0xb42/0x1050 net/ipv4/tcp_minisocks.c:833
 tcp_v4_rcv+0x3d83/0x4ed0 net/ipv4/tcp_ipv4.c:2066
 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204
 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:460 [inline]
 ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline]
 ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline]
 ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609
 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644
 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline]
 __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553
 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605
 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696
 gro_normal_list net/core/dev.c:5850 [inline]
 napi_complete_done+0x579/0xdd0 net/core/dev.c:6587
 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline]
 virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557
 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020
 napi_poll net/core/dev.c:7087 [inline]
 net_rx_action+0x824/0x1880 net/core/dev.c:7174
 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558
 run_ksoftirqd+0x33/0x50 kernel/softirq.c:920
 smpboot_thread_fn+0x616/0xbf0 kernel/smpboot.c:164
 kthread+0x721/0x850 kernel/kthread.c:327
 ret_from_fork+0x1f/0x30

Uninit was created at:
 __alloc_pages+0xbc7/0x10a0 mm/page_alloc.c:5409
 alloc_pages+0x8a5/0xb80
 alloc_slab_page mm/slub.c:1810 [inline]
 allocate_slab+0x287/0x1c20 mm/slub.c:1947
 new_slab mm/slub.c:2010 [inline]
 ___slab_alloc+0xbdf/0x1e90 mm/slub.c:3039
 __slab_alloc mm/slub.c:3126 [inline]
 slab_alloc_node mm/slub.c:3217 [inline]
 slab_alloc mm/slub.c:3259 [inline]
 kmem_cache_alloc+0xbb3/0x11c0 mm/slub.c:3264
 sk_prot_alloc+0xeb/0x570 net/core/sock.c:1914
 sk_clone_lock+0xd6/0x1940 net/core/sock.c:2118
 inet_csk_clone_lock+0x8d/0x6a0 net/ipv4/inet_connection_sock.c:956
 tcp_create_openreq_child+0xb1/0x1ef0 net/ipv4/tcp_minisocks.c:453
 tcp_v4_syn_recv_sock+0x268/0x2710 net/ipv4/tcp_ipv4.c:1563
 tcp_check_req+0x207c/0x2a30 net/ipv4/tcp_minisocks.c:765
 tcp_v4_rcv+0x36f5/0x4ed0 net/ipv4/tcp_ipv4.c:2047
 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204
 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:460 [inline]
 ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline]
 ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline]
 ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609
 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644
 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline]
 __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553
 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605
 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696
 gro_normal_list net/core/dev.c:5850 [inline]
 napi_complete_done+0x579/0xdd0 net/core/dev.c:6587
 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline]
 virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557
 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020
 napi_poll net/core/dev.c:7087 [inline]
 net_rx_action+0x824/0x1880 net/core/dev.c:7174
 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558

Fixes: 342159ee394d ("net: avoid dirtying sk->sk_rx_queue_mapping")
Fixes: a37a0ee4d25c ("net: avoid uninit-value from tcp_conn_request")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Tested-by: Alexander Potapenko <glider@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h  | 13 +++++++++++++
 net/ipv4/tcp_minisocks.c |  4 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 7994455ec714..c4898fcbf923 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -136,6 +136,19 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
 	sk_rx_queue_update(sk, skb);
 }
 
+/* Variant of sk_mark_napi_id() for passive flow setup,
+ * as sk->sk_napi_id and sk->sk_rx_queue_mapping content
+ * needs to be set.
+ */
+static inline void sk_mark_napi_id_set(struct sock *sk,
+				       const struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
+#endif
+	sk_rx_queue_set(sk, skb);
+}
+
 static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cf913a66df17..7c2d3ac2363a 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -829,8 +829,8 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 	int ret = 0;
 	int state = child->sk_state;
 
-	/* record NAPI ID of child */
-	sk_mark_napi_id(child, skb);
+	/* record sk_napi_id and sk_rx_queue_mapping of child. */
+	sk_mark_napi_id_set(child, skb);
 
 	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
-- 
cgit v1.2.3-71-gd317


From dac8e00fb640e9569cdeefd3ce8a75639e5d0711 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 2 Dec 2021 18:27:18 -0800
Subject: bonding: make tx_rebalance_counter an atomic

KCSAN reported a data-race [1] around tx_rebalance_counter
which can be accessed from different contexts, without
the protection of a lock/mutex.

[1]
BUG: KCSAN: data-race in bond_alb_init_slave / bond_alb_monitor

write to 0xffff888157e8ca24 of 4 bytes by task 7075 on cpu 0:
 bond_alb_init_slave+0x713/0x860 drivers/net/bonding/bond_alb.c:1613
 bond_enslave+0xd94/0x3010 drivers/net/bonding/bond_main.c:1949
 do_set_master net/core/rtnetlink.c:2521 [inline]
 __rtnl_newlink net/core/rtnetlink.c:3475 [inline]
 rtnl_newlink+0x1298/0x13b0 net/core/rtnetlink.c:3506
 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5571
 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2491
 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5589
 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline]
 netlink_unicast+0x5fc/0x6c0 net/netlink/af_netlink.c:1345
 netlink_sendmsg+0x6e1/0x7d0 net/netlink/af_netlink.c:1916
 sock_sendmsg_nosec net/socket.c:704 [inline]
 sock_sendmsg net/socket.c:724 [inline]
 ____sys_sendmsg+0x39a/0x510 net/socket.c:2409
 ___sys_sendmsg net/socket.c:2463 [inline]
 __sys_sendmsg+0x195/0x230 net/socket.c:2492
 __do_sys_sendmsg net/socket.c:2501 [inline]
 __se_sys_sendmsg net/socket.c:2499 [inline]
 __x64_sys_sendmsg+0x42/0x50 net/socket.c:2499
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff888157e8ca24 of 4 bytes by task 1082 on cpu 1:
 bond_alb_monitor+0x8f/0xc00 drivers/net/bonding/bond_alb.c:1511
 process_one_work+0x3fc/0x980 kernel/workqueue.c:2298
 worker_thread+0x616/0xa70 kernel/workqueue.c:2445
 kthread+0x2c7/0x2e0 kernel/kthread.c:327
 ret_from_fork+0x1f/0x30

value changed: 0x00000001 -> 0x00000064

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 1082 Comm: kworker/u4:3 Not tainted 5.16.0-rc3-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: bond1 bond_alb_monitor

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_alb.c | 14 ++++++++------
 include/net/bond_alb.h         |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 2ec8e015c7b3..533e476988f2 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1501,14 +1501,14 @@ void bond_alb_monitor(struct work_struct *work)
 	struct slave *slave;
 
 	if (!bond_has_slaves(bond)) {
-		bond_info->tx_rebalance_counter = 0;
+		atomic_set(&bond_info->tx_rebalance_counter, 0);
 		bond_info->lp_counter = 0;
 		goto re_arm;
 	}
 
 	rcu_read_lock();
 
-	bond_info->tx_rebalance_counter++;
+	atomic_inc(&bond_info->tx_rebalance_counter);
 	bond_info->lp_counter++;
 
 	/* send learning packets */
@@ -1530,7 +1530,7 @@ void bond_alb_monitor(struct work_struct *work)
 	}
 
 	/* rebalance tx traffic */
-	if (bond_info->tx_rebalance_counter >= BOND_TLB_REBALANCE_TICKS) {
+	if (atomic_read(&bond_info->tx_rebalance_counter) >= BOND_TLB_REBALANCE_TICKS) {
 		bond_for_each_slave_rcu(bond, slave, iter) {
 			tlb_clear_slave(bond, slave, 1);
 			if (slave == rcu_access_pointer(bond->curr_active_slave)) {
@@ -1540,7 +1540,7 @@ void bond_alb_monitor(struct work_struct *work)
 				bond_info->unbalanced_load = 0;
 			}
 		}
-		bond_info->tx_rebalance_counter = 0;
+		atomic_set(&bond_info->tx_rebalance_counter, 0);
 	}
 
 	if (bond_info->rlb_enabled) {
@@ -1610,7 +1610,8 @@ int bond_alb_init_slave(struct bonding *bond, struct slave *slave)
 	tlb_init_slave(slave);
 
 	/* order a rebalance ASAP */
-	bond->alb_info.tx_rebalance_counter = BOND_TLB_REBALANCE_TICKS;
+	atomic_set(&bond->alb_info.tx_rebalance_counter,
+		   BOND_TLB_REBALANCE_TICKS);
 
 	if (bond->alb_info.rlb_enabled)
 		bond->alb_info.rlb_rebalance = 1;
@@ -1647,7 +1648,8 @@ void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char
 			rlb_clear_slave(bond, slave);
 	} else if (link == BOND_LINK_UP) {
 		/* order a rebalance ASAP */
-		bond_info->tx_rebalance_counter = BOND_TLB_REBALANCE_TICKS;
+		atomic_set(&bond_info->tx_rebalance_counter,
+			   BOND_TLB_REBALANCE_TICKS);
 		if (bond->alb_info.rlb_enabled) {
 			bond->alb_info.rlb_rebalance = 1;
 			/* If the updelay module parameter is smaller than the
diff --git a/include/net/bond_alb.h b/include/net/bond_alb.h
index f6af76c87a6c..191c36afa1f4 100644
--- a/include/net/bond_alb.h
+++ b/include/net/bond_alb.h
@@ -126,7 +126,7 @@ struct tlb_slave_info {
 struct alb_bond_info {
 	struct tlb_client_info	*tx_hashtbl; /* Dynamically allocated */
 	u32			unbalanced_load;
-	int			tx_rebalance_counter;
+	atomic_t		tx_rebalance_counter;
 	int			lp_counter;
 	/* -------- rlb parameters -------- */
 	int rlb_enabled;
-- 
cgit v1.2.3-71-gd317


From 8581fd402a0cf80b5298e3b225e7a7bd8f110e69 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 2 Dec 2021 12:34:00 -0800
Subject: treewide: Add missing includes masked by cgroup -> bpf dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cgroup.h (therefore swap.h, therefore half of the universe)
includes bpf.h which in turn includes module.h and slab.h.
Since we're about to get rid of that dependency we need
to clean things up.

v2: drop the cpu.h include from cacheinfo.h, it's not necessary
and it makes riscv sensitive to ordering of include files.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Krzysztof Wilczyński <kw@linux.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/all/20211120035253.72074-1-kuba@kernel.org/  # v1
Link: https://lore.kernel.org/all/20211120165528.197359-1-kuba@kernel.org/ # cacheinfo discussion
Link: https://lore.kernel.org/bpf/20211202203400.1208663-1-kuba@kernel.org
---
 block/fops.c                                          | 1 +
 drivers/gpu/drm/drm_gem_shmem_helper.c                | 1 +
 drivers/gpu/drm/i915/gt/intel_gtt.c                   | 1 +
 drivers/gpu/drm/i915/i915_request.c                   | 1 +
 drivers/gpu/drm/lima/lima_device.c                    | 1 +
 drivers/gpu/drm/msm/msm_gem_shrinker.c                | 1 +
 drivers/gpu/drm/ttm/ttm_tt.c                          | 1 +
 drivers/net/ethernet/huawei/hinic/hinic_sriov.c       | 1 +
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c | 2 ++
 drivers/pci/controller/dwc/pci-exynos.c               | 1 +
 drivers/pci/controller/dwc/pcie-qcom-ep.c             | 1 +
 drivers/usb/cdns3/host.c                              | 1 +
 include/linux/cacheinfo.h                             | 1 -
 include/linux/device/driver.h                         | 1 +
 include/linux/filter.h                                | 2 +-
 mm/damon/vaddr.c                                      | 1 +
 mm/memory_hotplug.c                                   | 1 +
 mm/swap_slots.c                                       | 1 +
 18 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/block/fops.c b/block/fops.c
index ad732a36f9b3..3cb1e81929bc 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -15,6 +15,7 @@
 #include <linux/falloc.h>
 #include <linux/suspend.h>
 #include <linux/fs.h>
+#include <linux/module.h>
 #include "blk.h"
 
 static inline struct inode *bdev_file_inode(struct file *file)
diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
index 7b9f69f21f1e..bca0de92802e 100644
--- a/drivers/gpu/drm/drm_gem_shmem_helper.c
+++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
@@ -9,6 +9,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/module.h>
 
 #ifdef CONFIG_X86
 #include <asm/set_memory.h>
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 67d14afa6623..b67f620c3d93 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -6,6 +6,7 @@
 #include <linux/slab.h> /* fault-inject.h is not standalone! */
 
 #include <linux/fault-inject.h>
+#include <linux/sched/mm.h>
 
 #include "gem/i915_gem_lmem.h"
 #include "i915_trace.h"
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 820a1f38b271..89cccefeea63 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
 
 #include "gem/i915_gem_context.h"
 #include "gt/intel_breadcrumbs.h"
diff --git a/drivers/gpu/drm/lima/lima_device.c b/drivers/gpu/drm/lima/lima_device.c
index 65fdca366e41..f74f8048af8f 100644
--- a/drivers/gpu/drm/lima/lima_device.c
+++ b/drivers/gpu/drm/lima/lima_device.c
@@ -4,6 +4,7 @@
 #include <linux/regulator/consumer.h>
 #include <linux/reset.h>
 #include <linux/clk.h>
+#include <linux/slab.h>
 #include <linux/dma-mapping.h>
 #include <linux/platform_device.h>
 
diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
index 4a1420b05e97..086dacf2f26a 100644
--- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
+++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
 
 #include "msm_drv.h"
 #include "msm_gem.h"
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 7e83c00a3f48..79c870a3bef8 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -34,6 +34,7 @@
 #include <linux/sched.h>
 #include <linux/shmem_fs.h>
 #include <linux/file.h>
+#include <linux/module.h>
 #include <drm/drm_cache.h>
 #include <drm/ttm/ttm_bo_driver.h>
 
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c
index a78c398bf5b2..01e7d3c0b68e 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_sriov.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_sriov.c
@@ -8,6 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/etherdevice.h>
 #include <linux/netdevice.h>
+#include <linux/module.h>
 
 #include "hinic_hw_dev.h"
 #include "hinic_dev.h"
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
index 0ef68fdd1f26..61c20907315f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
@@ -5,6 +5,8 @@
  *
  */
 
+#include <linux/module.h>
+
 #include "otx2_common.h"
 #include "otx2_ptp.h"
 
diff --git a/drivers/pci/controller/dwc/pci-exynos.c b/drivers/pci/controller/dwc/pci-exynos.c
index c24dab383654..722dacdd5a17 100644
--- a/drivers/pci/controller/dwc/pci-exynos.c
+++ b/drivers/pci/controller/dwc/pci-exynos.c
@@ -19,6 +19,7 @@
 #include <linux/platform_device.h>
 #include <linux/phy/phy.h>
 #include <linux/regulator/consumer.h>
+#include <linux/module.h>
 
 #include "pcie-designware.h"
 
diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
index 7b17da2f9b3f..cfe66bf04c1d 100644
--- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
+++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
@@ -18,6 +18,7 @@
 #include <linux/pm_domain.h>
 #include <linux/regmap.h>
 #include <linux/reset.h>
+#include <linux/module.h>
 
 #include "pcie-designware.h"
 
diff --git a/drivers/usb/cdns3/host.c b/drivers/usb/cdns3/host.c
index 84dadfa726aa..9643b905e2d8 100644
--- a/drivers/usb/cdns3/host.c
+++ b/drivers/usb/cdns3/host.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/slab.h>
 #include "core.h"
 #include "drd.h"
 #include "host-export.h"
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 2f909ed084c6..4ff37cb763ae 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -3,7 +3,6 @@
 #define _LINUX_CACHEINFO_H
 
 #include <linux/bitops.h>
-#include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>
 
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index a498ebcf4993..15e7c5e15d62 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -18,6 +18,7 @@
 #include <linux/klist.h>
 #include <linux/pm.h>
 #include <linux/device/bus.h>
+#include <linux/module.h>
 
 /**
  * enum probe_type - device driver probe type to try
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 534f678ca50f..7f1e88e3e2b5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -6,6 +6,7 @@
 #define __LINUX_FILTER_H__
 
 #include <linux/atomic.h>
+#include <linux/bpf.h>
 #include <linux/refcount.h>
 #include <linux/compat.h>
 #include <linux/skbuff.h>
@@ -26,7 +27,6 @@
 
 #include <asm/byteorder.h>
 #include <uapi/linux/filter.h>
-#include <uapi/linux/bpf.h>
 
 struct sk_buff;
 struct sock;
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 35fe49080ee9..47f47f60440e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -13,6 +13,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
 #include <linux/pagewalk.h>
+#include <linux/sched/mm.h>
 
 #include "prmtv-common.h"
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 852041f6be41..2a9627dc784c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,6 +35,7 @@
 #include <linux/memblock.h>
 #include <linux/compaction.h>
 #include <linux/rmap.h>
+#include <linux/module.h>
 
 #include <asm/tlbflush.h>
 
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 16f706c55d92..2b5531840583 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -30,6 +30,7 @@
 #include <linux/swap_slots.h>
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mutex.h>
 #include <linux/mm.h>
-- 
cgit v1.2.3-71-gd317


From 802a7dc5cf1bef06f7b290ce76d478138408d6b1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 7 Dec 2021 10:03:23 -0800
Subject: netfilter: conntrack: annotate data-races around ct->timeout

(struct nf_conn)->timeout can be read/written locklessly,
add READ_ONCE()/WRITE_ONCE() to prevent load/store tearing.

BUG: KCSAN: data-race in __nf_conntrack_alloc / __nf_conntrack_find_get

write to 0xffff888132e78c08 of 4 bytes by task 6029 on cpu 0:
 __nf_conntrack_alloc+0x158/0x280 net/netfilter/nf_conntrack_core.c:1563
 init_conntrack+0x1da/0xb30 net/netfilter/nf_conntrack_core.c:1635
 resolve_normal_ct+0x502/0x610 net/netfilter/nf_conntrack_core.c:1746
 nf_conntrack_in+0x1c5/0x88f net/netfilter/nf_conntrack_core.c:1901
 ipv6_conntrack_local+0x19/0x20 net/netfilter/nf_conntrack_proto.c:414
 nf_hook_entry_hookfn include/linux/netfilter.h:142 [inline]
 nf_hook_slow+0x72/0x170 net/netfilter/core.c:619
 nf_hook include/linux/netfilter.h:262 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ip6_xmit+0xa3a/0xa60 net/ipv6/ip6_output.c:324
 inet6_csk_xmit+0x1a2/0x1e0 net/ipv6/inet6_connection_sock.c:135
 __tcp_transmit_skb+0x132a/0x1840 net/ipv4/tcp_output.c:1402
 tcp_transmit_skb net/ipv4/tcp_output.c:1420 [inline]
 tcp_write_xmit+0x1450/0x4460 net/ipv4/tcp_output.c:2680
 __tcp_push_pending_frames+0x68/0x1c0 net/ipv4/tcp_output.c:2864
 tcp_push_pending_frames include/net/tcp.h:1897 [inline]
 tcp_data_snd_check+0x62/0x2e0 net/ipv4/tcp_input.c:5452
 tcp_rcv_established+0x880/0x10e0 net/ipv4/tcp_input.c:5947
 tcp_v6_do_rcv+0x36e/0xa50 net/ipv6/tcp_ipv6.c:1521
 sk_backlog_rcv include/net/sock.h:1030 [inline]
 __release_sock+0xf2/0x270 net/core/sock.c:2768
 release_sock+0x40/0x110 net/core/sock.c:3300
 sk_stream_wait_memory+0x435/0x700 net/core/stream.c:145
 tcp_sendmsg_locked+0xb85/0x25a0 net/ipv4/tcp.c:1402
 tcp_sendmsg+0x2c/0x40 net/ipv4/tcp.c:1440
 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:644
 sock_sendmsg_nosec net/socket.c:704 [inline]
 sock_sendmsg net/socket.c:724 [inline]
 __sys_sendto+0x21e/0x2c0 net/socket.c:2036
 __do_sys_sendto net/socket.c:2048 [inline]
 __se_sys_sendto net/socket.c:2044 [inline]
 __x64_sys_sendto+0x74/0x90 net/socket.c:2044
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff888132e78c08 of 4 bytes by task 17446 on cpu 1:
 nf_ct_is_expired include/net/netfilter/nf_conntrack.h:286 [inline]
 ____nf_conntrack_find net/netfilter/nf_conntrack_core.c:776 [inline]
 __nf_conntrack_find_get+0x1c7/0xac0 net/netfilter/nf_conntrack_core.c:807
 resolve_normal_ct+0x273/0x610 net/netfilter/nf_conntrack_core.c:1734
 nf_conntrack_in+0x1c5/0x88f net/netfilter/nf_conntrack_core.c:1901
 ipv6_conntrack_local+0x19/0x20 net/netfilter/nf_conntrack_proto.c:414
 nf_hook_entry_hookfn include/linux/netfilter.h:142 [inline]
 nf_hook_slow+0x72/0x170 net/netfilter/core.c:619
 nf_hook include/linux/netfilter.h:262 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ip6_xmit+0xa3a/0xa60 net/ipv6/ip6_output.c:324
 inet6_csk_xmit+0x1a2/0x1e0 net/ipv6/inet6_connection_sock.c:135
 __tcp_transmit_skb+0x132a/0x1840 net/ipv4/tcp_output.c:1402
 __tcp_send_ack+0x1fd/0x300 net/ipv4/tcp_output.c:3956
 tcp_send_ack+0x23/0x30 net/ipv4/tcp_output.c:3962
 __tcp_ack_snd_check+0x2d8/0x510 net/ipv4/tcp_input.c:5478
 tcp_ack_snd_check net/ipv4/tcp_input.c:5523 [inline]
 tcp_rcv_established+0x8c2/0x10e0 net/ipv4/tcp_input.c:5948
 tcp_v6_do_rcv+0x36e/0xa50 net/ipv6/tcp_ipv6.c:1521
 sk_backlog_rcv include/net/sock.h:1030 [inline]
 __release_sock+0xf2/0x270 net/core/sock.c:2768
 release_sock+0x40/0x110 net/core/sock.c:3300
 tcp_sendpage+0x94/0xb0 net/ipv4/tcp.c:1114
 inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833
 rds_tcp_xmit+0x376/0x5f0 net/rds/tcp_send.c:118
 rds_send_xmit+0xbed/0x1500 net/rds/send.c:367
 rds_send_worker+0x43/0x200 net/rds/threads.c:200
 process_one_work+0x3fc/0x980 kernel/workqueue.c:2298
 worker_thread+0x616/0xa70 kernel/workqueue.c:2445
 kthread+0x2c7/0x2e0 kernel/kthread.c:327
 ret_from_fork+0x1f/0x30

value changed: 0x00027cc2 -> 0x00000000

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 17446 Comm: kworker/u4:5 Tainted: G        W         5.16.0-rc4-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: krdsd rds_send_worker

Note: I chose an arbitrary commit for the Fixes: tag,
because I do not think we need to backport this fix to very old kernels.

Fixes: e37542ba111f ("netfilter: conntrack: avoid possible false sharing")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 6 +++---
 net/netfilter/nf_conntrack_core.c    | 6 +++---
 net/netfilter/nf_conntrack_netlink.c | 2 +-
 net/netfilter/nf_flow_table_core.c   | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index cc663c68ddc4..d24b0a34c8f0 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -276,14 +276,14 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb)
 /* jiffies until ct expires, 0 if already expired */
 static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
 {
-	s32 timeout = ct->timeout - nfct_time_stamp;
+	s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
 
 	return timeout > 0 ? timeout : 0;
 }
 
 static inline bool nf_ct_is_expired(const struct nf_conn *ct)
 {
-	return (__s32)(ct->timeout - nfct_time_stamp) <= 0;
+	return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0;
 }
 
 /* use after obtaining a reference count */
@@ -302,7 +302,7 @@ static inline bool nf_ct_should_gc(const struct nf_conn *ct)
 static inline void nf_ct_offload_timeout(struct nf_conn *ct)
 {
 	if (nf_ct_expires(ct) < NF_CT_DAY / 2)
-		ct->timeout = nfct_time_stamp + NF_CT_DAY;
+		WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY);
 }
 
 struct kernel_param;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 770a63103c7a..4712a90a1820 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -684,7 +684,7 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 
 	tstamp = nf_conn_tstamp_find(ct);
 	if (tstamp) {
-		s32 timeout = ct->timeout - nfct_time_stamp;
+		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
 
 		tstamp->stop = ktime_get_real_ns();
 		if (timeout < 0)
@@ -1036,7 +1036,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
 	}
 
 	/* We want the clashing entry to go away real soon: 1 second timeout. */
-	loser_ct->timeout = nfct_time_stamp + HZ;
+	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
 
 	/* IPS_NAT_CLASH removes the entry automatically on the first
 	 * reply.  Also prevents UDP tracker from moving the entry to
@@ -1560,7 +1560,7 @@ __nf_conntrack_alloc(struct net *net,
 	/* save hash for reusing when confirming */
 	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
 	ct->status = 0;
-	ct->timeout = 0;
+	WRITE_ONCE(ct->timeout, 0);
 	write_pnet(&ct->ct_net, net);
 	memset(&ct->__nfct_init_offset, 0,
 	       offsetof(struct nf_conn, proto) -
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c7708bde057c..81d03acf68d4 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1998,7 +1998,7 @@ static int ctnetlink_change_timeout(struct nf_conn *ct,
 
 	if (timeout > INT_MAX)
 		timeout = INT_MAX;
-	ct->timeout = nfct_time_stamp + (u32)timeout;
+	WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
 
 	if (test_bit(IPS_DYING_BIT, &ct->status))
 		return -ETIME;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 87a7388b6c89..ed37bb9b4e58 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -201,8 +201,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 	if (timeout < 0)
 		timeout = 0;
 
-	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
-		ct->timeout = nfct_time_stamp + timeout;
+	if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
 }
 
 static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-- 
cgit v1.2.3-71-gd317


From a97770cc4016c2733bcef9dbe3d5b1ad02d13356 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng01@gmail.com>
Date: Mon, 6 Dec 2021 16:12:27 +0800
Subject: net: phy: Remove unnecessary indentation in the comments of
 phy_device

Fix warning as:

linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:543: WARNING: Unexpected indentation.
linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:544: WARNING: Block quote ends without a blank line; unexpected unindent.
linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:546: WARNING: Unexpected indentation.

Suggested-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 96e43fbb2dd8..cbf03a5f9cf5 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -538,11 +538,12 @@ struct macsec_ops;
  * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY
  * @state: State of the PHY for management purposes
  * @dev_flags: Device-specific flags used by the PHY driver.
- *		Bits [15:0] are free to use by the PHY driver to communicate
- *			    driver specific behavior.
- *		Bits [23:16] are currently reserved for future use.
- *		Bits [31:24] are reserved for defining generic
- *			     PHY driver behavior.
+ *
+ *      - Bits [15:0] are free to use by the PHY driver to communicate
+ *        driver specific behavior.
+ *      - Bits [23:16] are currently reserved for future use.
+ *      - Bits [31:24] are reserved for defining generic
+ *        PHY driver behavior.
  * @irq: IRQ number of the PHY's interrupt (-1 if none)
  * @phy_timer: The timer for handling the state machine
  * @phylink: Pointer to phylink instance for this PHY
-- 
cgit v1.2.3-71-gd317


From 444dd878e85fb33fcfb2682cfdab4c236f33ea3e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 3 Dec 2021 17:19:47 +0100
Subject: PM: runtime: Fix pm_runtime_active() kerneldoc comment

The kerneldoc comment of pm_runtime_active() does not reflect the
behavior of the function, so update it accordingly.

Fixes: 403d2d116ec0 ("PM: runtime: Add kerneldoc comments to multiple helpers")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/pm_runtime.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 222da43b7096..eddd66d426ca 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -129,7 +129,7 @@ static inline bool pm_runtime_suspended(struct device *dev)
  * pm_runtime_active - Check whether or not a device is runtime-active.
  * @dev: Target device.
  *
- * Return %true if runtime PM is enabled for @dev and its runtime PM status is
+ * Return %true if runtime PM is disabled for @dev or its runtime PM status is
  * %RPM_ACTIVE, or %false otherwise.
  *
  * Note that the return value of this function can only be trusted if it is
-- 
cgit v1.2.3-71-gd317


From cab2d3fd6866e089b5c50db09dece131f85bfebd Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Thu, 9 Dec 2021 18:46:33 +0530
Subject: bus: mhi: core: Add support for forced PM resume

For whatever reason, some devices like QCA6390, WCN6855 using ath11k
are not in M3 state during PM resume, but still functional. The
mhi_pm_resume should then not fail in those cases, and let the higher
level device specific stack continue resuming process.

Add an API mhi_pm_resume_force(), to force resuming irrespective of the
current MHI state. This fixes a regression with non functional ath11k WiFi
after suspend/resume cycle on some machines.

Bug report: https://bugzilla.kernel.org/show_bug.cgi?id=214179

Link: https://lore.kernel.org/regressions/871r5p0x2u.fsf@codeaurora.org/
Fixes: 020d3b26c07a ("bus: mhi: Early MHI resume failure in non M3 state")
Cc: stable@vger.kernel.org #5.13
Reported-by: Kalle Valo <kvalo@codeaurora.org>
Reported-by: Pengyu Ma <mapengyu@gmail.com>
Tested-by: Kalle Valo <kvalo@kernel.org>
Acked-by: Kalle Valo <kvalo@kernel.org>
Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
[mani: Switched to API, added bug report, reported-by tags and CCed stable]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/20211209131633.4168-1-manivannan.sadhasivam@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/bus/mhi/core/pm.c             | 21 ++++++++++++++++++---
 drivers/net/wireless/ath/ath11k/mhi.c |  6 +++++-
 include/linux/mhi.h                   | 13 +++++++++++++
 3 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c
index fb99e3727155..547e6e769546 100644
--- a/drivers/bus/mhi/core/pm.c
+++ b/drivers/bus/mhi/core/pm.c
@@ -881,7 +881,7 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl)
 }
 EXPORT_SYMBOL_GPL(mhi_pm_suspend);
 
-int mhi_pm_resume(struct mhi_controller *mhi_cntrl)
+static int __mhi_pm_resume(struct mhi_controller *mhi_cntrl, bool force)
 {
 	struct mhi_chan *itr, *tmp;
 	struct device *dev = &mhi_cntrl->mhi_dev->dev;
@@ -898,8 +898,12 @@ int mhi_pm_resume(struct mhi_controller *mhi_cntrl)
 	if (MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state))
 		return -EIO;
 
-	if (mhi_get_mhi_state(mhi_cntrl) != MHI_STATE_M3)
-		return -EINVAL;
+	if (mhi_get_mhi_state(mhi_cntrl) != MHI_STATE_M3) {
+		dev_warn(dev, "Resuming from non M3 state (%s)\n",
+			 TO_MHI_STATE_STR(mhi_get_mhi_state(mhi_cntrl)));
+		if (!force)
+			return -EINVAL;
+	}
 
 	/* Notify clients about exiting LPM */
 	list_for_each_entry_safe(itr, tmp, &mhi_cntrl->lpm_chans, node) {
@@ -940,8 +944,19 @@ int mhi_pm_resume(struct mhi_controller *mhi_cntrl)
 
 	return 0;
 }
+
+int mhi_pm_resume(struct mhi_controller *mhi_cntrl)
+{
+	return __mhi_pm_resume(mhi_cntrl, false);
+}
 EXPORT_SYMBOL_GPL(mhi_pm_resume);
 
+int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl)
+{
+	return __mhi_pm_resume(mhi_cntrl, true);
+}
+EXPORT_SYMBOL_GPL(mhi_pm_resume_force);
+
 int __mhi_device_get_sync(struct mhi_controller *mhi_cntrl)
 {
 	int ret;
diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c
index 26c7ae242db6..49c0b1ad40a0 100644
--- a/drivers/net/wireless/ath/ath11k/mhi.c
+++ b/drivers/net/wireless/ath/ath11k/mhi.c
@@ -533,7 +533,11 @@ static int ath11k_mhi_set_state(struct ath11k_pci *ab_pci,
 		ret = mhi_pm_suspend(ab_pci->mhi_ctrl);
 		break;
 	case ATH11K_MHI_RESUME:
-		ret = mhi_pm_resume(ab_pci->mhi_ctrl);
+		/* Do force MHI resume as some devices like QCA6390, WCN6855
+		 * are not in M3 state but they are functional. So just ignore
+		 * the MHI state while resuming.
+		 */
+		ret = mhi_pm_resume_force(ab_pci->mhi_ctrl);
 		break;
 	case ATH11K_MHI_TRIGGER_RDDM:
 		ret = mhi_force_rddm_mode(ab_pci->mhi_ctrl);
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 723985879035..a5cc4cdf9cc8 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -663,6 +663,19 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl);
  */
 int mhi_pm_resume(struct mhi_controller *mhi_cntrl);
 
+/**
+ * mhi_pm_resume_force - Force resume MHI from suspended state
+ * @mhi_cntrl: MHI controller
+ *
+ * Resume the device irrespective of its MHI state. As per the MHI spec, devices
+ * has to be in M3 state during resume. But some devices seem to be in a
+ * different MHI state other than M3 but they continue working fine if allowed.
+ * This API is intented to be used for such devices.
+ *
+ * Return: 0 if the resume succeeds, a negative error code otherwise
+ */
+int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl);
+
 /**
  * mhi_download_rddm_image - Download ramdump image from device for
  *                           debugging purpose.
-- 
cgit v1.2.3-71-gd317


From 42288cb44c4b5fff7653bc392b583a2b8bd6a8c0 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 8 Dec 2021 17:04:51 -0800
Subject: wait: add wake_up_pollfree()

Several ->poll() implementations are special in that they use a
waitqueue whose lifetime is the current task, rather than the struct
file as is normally the case.  This is okay for blocking polls, since a
blocking poll occurs within one task; however, non-blocking polls
require another solution.  This solution is for the queue to be cleared
before it is freed, using 'wake_up_poll(wq, EPOLLHUP | POLLFREE);'.

However, that has a bug: wake_up_poll() calls __wake_up() with
nr_exclusive=1.  Therefore, if there are multiple "exclusive" waiters,
and the wakeup function for the first one returns a positive value, only
that one will be called.  That's *not* what's needed for POLLFREE;
POLLFREE is special in that it really needs to wake up everyone.

Considering the three non-blocking poll systems:

- io_uring poll doesn't handle POLLFREE at all, so it is broken anyway.

- aio poll is unaffected, since it doesn't support exclusive waits.
  However, that's fragile, as someone could add this feature later.

- epoll doesn't appear to be broken by this, since its wakeup function
  returns 0 when it sees POLLFREE.  But this is fragile.

Although there is a workaround (see epoll), it's better to define a
function which always sends POLLFREE to all waiters.  Add such a
function.  Also make it verify that the queue really becomes empty after
all waiters have been woken up.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211209010455.42744-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/wait.h | 26 ++++++++++++++++++++++++++
 kernel/sched/wait.c  |  7 +++++++
 2 files changed, 33 insertions(+)

(limited to 'include')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2d0df57c9902..851e07da2583 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
+void __wake_up_pollfree(struct wait_queue_head *wq_head);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
 #define wake_up_interruptible_sync_poll_locked(x, m)				\
 	__wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
 
+/**
+ * wake_up_pollfree - signal that a polled waitqueue is going away
+ * @wq_head: the wait queue head
+ *
+ * In the very rare cases where a ->poll() implementation uses a waitqueue whose
+ * lifetime is tied to a task rather than to the 'struct file' being polled,
+ * this function must be called before the waitqueue is freed so that
+ * non-blocking polls (e.g. epoll) are notified that the queue is going away.
+ *
+ * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
+ * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
+ */
+static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+	/*
+	 * For performance reasons, we don't always take the queue lock here.
+	 * Therefore, we might race with someone removing the last entry from
+	 * the queue, and proceed while they still hold the queue lock.
+	 * However, rcu_read_lock() is required to be held in such cases, so we
+	 * can safely proceed with an RCU-delayed free.
+	 */
+	if (waitqueue_active(wq_head))
+		__wake_up_pollfree(wq_head);
+}
+
 #define ___wait_cond_timeout(condition)						\
 ({										\
 	bool __cond = (condition);						\
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 76577d1642a5..eca38107b32f 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -238,6 +238,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+	__wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+	/* POLLFREE must have cleared the queue. */
+	WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
 /*
  * Note: we use "set_current_state()" _after_ the wait-queue add,
  * because we need a memory barrier there on SMP, so that any
-- 
cgit v1.2.3-71-gd317


From 50252e4b5e989ce64555c7aef7516bdefc2fea72 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 8 Dec 2021 17:04:55 -0800
Subject: aio: fix use-after-free due to missing POLLFREE handling

signalfd_poll() and binder_poll() are special in that they use a
waitqueue whose lifetime is the current task, rather than the struct
file as is normally the case.  This is okay for blocking polls, since a
blocking poll occurs within one task; however, non-blocking polls
require another solution.  This solution is for the queue to be cleared
before it is freed, by sending a POLLFREE notification to all waiters.

Unfortunately, only eventpoll handles POLLFREE.  A second type of
non-blocking poll, aio poll, was added in kernel v4.18, and it doesn't
handle POLLFREE.  This allows a use-after-free to occur if a signalfd or
binder fd is polled with aio poll, and the waitqueue gets freed.

Fix this by making aio poll handle POLLFREE.

A patch by Ramji Jiyani <ramjiyani@google.com>
(https://lore.kernel.org/r/20211027011834.2497484-1-ramjiyani@google.com)
tried to do this by making aio_poll_wake() always complete the request
inline if POLLFREE is seen.  However, that solution had two bugs.
First, it introduced a deadlock, as it unconditionally locked the aio
context while holding the waitqueue lock, which inverts the normal
locking order.  Second, it didn't consider that POLLFREE notifications
are missed while the request has been temporarily de-queued.

The second problem was solved by my previous patch.  This patch then
properly fixes the use-after-free by handling POLLFREE in a
deadlock-free way.  It does this by taking advantage of the fact that
freeing of the waitqueue is RCU-delayed, similar to what eventpoll does.

Fixes: 2c14fa838cbe ("aio: implement IOCB_CMD_POLL")
Cc: <stable@vger.kernel.org> # v4.18+
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/aio.c                        | 137 +++++++++++++++++++++++++++++++---------
 include/uapi/asm-generic/poll.h |   2 +-
 2 files changed, 107 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/fs/aio.c b/fs/aio.c
index 2bc1352a83d8..c9bb0d3d8593 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1620,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work)
 	iocb_put(iocb);
 }
 
+/*
+ * Safely lock the waitqueue which the request is on, synchronizing with the
+ * case where the ->poll() provider decides to free its waitqueue early.
+ *
+ * Returns true on success, meaning that req->head->lock was locked, req->wait
+ * is on req->head, and an RCU read lock was taken.  Returns false if the
+ * request was already removed from its waitqueue (which might no longer exist).
+ */
+static bool poll_iocb_lock_wq(struct poll_iocb *req)
+{
+	wait_queue_head_t *head;
+
+	/*
+	 * While we hold the waitqueue lock and the waitqueue is nonempty,
+	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
+	 * lock in the first place can race with the waitqueue being freed.
+	 *
+	 * We solve this as eventpoll does: by taking advantage of the fact that
+	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
+	 * we enter rcu_read_lock() and see that the pointer to the queue is
+	 * non-NULL, we can then lock it without the memory being freed out from
+	 * under us, then check whether the request is still on the queue.
+	 *
+	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+	 * case the caller deletes the entry from the queue, leaving it empty.
+	 * In that case, only RCU prevents the queue memory from being freed.
+	 */
+	rcu_read_lock();
+	head = smp_load_acquire(&req->head);
+	if (head) {
+		spin_lock(&head->lock);
+		if (!list_empty(&req->wait.entry))
+			return true;
+		spin_unlock(&head->lock);
+	}
+	rcu_read_unlock();
+	return false;
+}
+
+static void poll_iocb_unlock_wq(struct poll_iocb *req)
+{
+	spin_unlock(&req->head->lock);
+	rcu_read_unlock();
+}
+
 static void aio_poll_complete_work(struct work_struct *work)
 {
 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1639,24 +1684,25 @@ static void aio_poll_complete_work(struct work_struct *work)
 	 * avoid further branches in the fast path.
 	 */
 	spin_lock_irq(&ctx->ctx_lock);
-	spin_lock(&req->head->lock);
-	if (!mask && !READ_ONCE(req->cancelled)) {
-		/*
-		 * The request isn't actually ready to be completed yet.
-		 * Reschedule completion if another wakeup came in.
-		 */
-		if (req->work_need_resched) {
-			schedule_work(&req->work);
-			req->work_need_resched = false;
-		} else {
-			req->work_scheduled = false;
+	if (poll_iocb_lock_wq(req)) {
+		if (!mask && !READ_ONCE(req->cancelled)) {
+			/*
+			 * The request isn't actually ready to be completed yet.
+			 * Reschedule completion if another wakeup came in.
+			 */
+			if (req->work_need_resched) {
+				schedule_work(&req->work);
+				req->work_need_resched = false;
+			} else {
+				req->work_scheduled = false;
+			}
+			poll_iocb_unlock_wq(req);
+			spin_unlock_irq(&ctx->ctx_lock);
+			return;
 		}
-		spin_unlock(&req->head->lock);
-		spin_unlock_irq(&ctx->ctx_lock);
-		return;
-	}
-	list_del_init(&req->wait.entry);
-	spin_unlock(&req->head->lock);
+		list_del_init(&req->wait.entry);
+		poll_iocb_unlock_wq(req);
+	} /* else, POLLFREE has freed the waitqueue, so we must complete */
 	list_del_init(&iocb->ki_list);
 	iocb->ki_res.res = mangle_poll(mask);
 	spin_unlock_irq(&ctx->ctx_lock);
@@ -1670,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
 	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
 	struct poll_iocb *req = &aiocb->poll;
 
-	spin_lock(&req->head->lock);
-	WRITE_ONCE(req->cancelled, true);
-	if (!req->work_scheduled) {
-		schedule_work(&aiocb->poll.work);
-		req->work_scheduled = true;
-	}
-	spin_unlock(&req->head->lock);
+	if (poll_iocb_lock_wq(req)) {
+		WRITE_ONCE(req->cancelled, true);
+		if (!req->work_scheduled) {
+			schedule_work(&aiocb->poll.work);
+			req->work_scheduled = true;
+		}
+		poll_iocb_unlock_wq(req);
+	} /* else, the request was force-cancelled by POLLFREE already */
 
 	return 0;
 }
@@ -1728,7 +1775,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		 *
 		 * Don't remove the request from the waitqueue here, as it might
 		 * not actually be complete yet (we won't know until vfs_poll()
-		 * is called), and we must not miss any wakeups.
+		 * is called), and we must not miss any wakeups.  POLLFREE is an
+		 * exception to this; see below.
 		 */
 		if (req->work_scheduled) {
 			req->work_need_resched = true;
@@ -1736,6 +1784,28 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 			schedule_work(&req->work);
 			req->work_scheduled = true;
 		}
+
+		/*
+		 * If the waitqueue is being freed early but we can't complete
+		 * the request inline, we have to tear down the request as best
+		 * we can.  That means immediately removing the request from its
+		 * waitqueue and preventing all further accesses to the
+		 * waitqueue via the request.  We also need to schedule the
+		 * completion work (done above).  Also mark the request as
+		 * cancelled, to potentially skip an unneeded call to ->poll().
+		 */
+		if (mask & POLLFREE) {
+			WRITE_ONCE(req->cancelled, true);
+			list_del_init(&req->wait.entry);
+
+			/*
+			 * Careful: this *must* be the last step, since as soon
+			 * as req->head is NULL'ed out, the request can be
+			 * completed and freed, since aio_poll_complete_work()
+			 * will no longer need to take the waitqueue lock.
+			 */
+			smp_store_release(&req->head, NULL);
+		}
 	}
 	return 1;
 }
@@ -1743,6 +1813,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 struct aio_poll_table {
 	struct poll_table_struct	pt;
 	struct aio_kiocb		*iocb;
+	bool				queued;
 	int				error;
 };
 
@@ -1753,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
 
 	/* multiple wait queues per file are not supported */
-	if (unlikely(pt->iocb->poll.head)) {
+	if (unlikely(pt->queued)) {
 		pt->error = -EINVAL;
 		return;
 	}
 
+	pt->queued = true;
 	pt->error = 0;
 	pt->iocb->poll.head = head;
 	add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1789,6 +1861,7 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	apt.pt._qproc = aio_poll_queue_proc;
 	apt.pt._key = req->events;
 	apt.iocb = aiocb;
+	apt.queued = false;
 	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
 
 	/* initialized the list so that we can do list_empty checks */
@@ -1797,9 +1870,10 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 
 	mask = vfs_poll(req->file, &apt.pt) & req->events;
 	spin_lock_irq(&ctx->ctx_lock);
-	if (likely(req->head)) {
-		spin_lock(&req->head->lock);
-		if (list_empty(&req->wait.entry) || req->work_scheduled) {
+	if (likely(apt.queued)) {
+		bool on_queue = poll_iocb_lock_wq(req);
+
+		if (!on_queue || req->work_scheduled) {
 			/*
 			 * aio_poll_wake() already either scheduled the async
 			 * completion work, or completed the request inline.
@@ -1815,7 +1889,7 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 		} else if (cancel) {
 			/* Cancel if possible (may be too late though). */
 			WRITE_ONCE(req->cancelled, true);
-		} else if (!list_empty(&req->wait.entry)) {
+		} else if (on_queue) {
 			/*
 			 * Actually waiting for an event, so add the request to
 			 * active_reqs so that it can be cancelled if needed.
@@ -1823,7 +1897,8 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
 			aiocb->ki_cancel = aio_poll_cancel;
 		}
-		spin_unlock(&req->head->lock);
+		if (on_queue)
+			poll_iocb_unlock_wq(req);
 	}
 	if (mask) { /* no async, we'd stolen it */
 		aiocb->ki_res.res = mangle_poll(mask);
diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h
index 41b509f410bf..f9c520ce4bf4 100644
--- a/include/uapi/asm-generic/poll.h
+++ b/include/uapi/asm-generic/poll.h
@@ -29,7 +29,7 @@
 #define POLLRDHUP       0x2000
 #endif
 
-#define POLLFREE	(__force __poll_t)0x4000	/* currently only for epoll */
+#define POLLFREE	(__force __poll_t)0x4000
 
 #define POLL_BUSY_LOOP	(__force __poll_t)0x8000
 
-- 
cgit v1.2.3-71-gd317


From a4f1192cb53758a7210ed5a9ee695aeba22f75fb Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 9 Dec 2021 14:30:33 +0200
Subject: percpu_ref: Replace kernel.h with the necessary inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/percpu-refcount.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index b31d3f3312ce..d73a1c08c3e3 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -51,9 +51,9 @@
 #define _LINUX_PERCPU_REFCOUNT_H
 
 #include <linux/atomic.h>
-#include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/rcupdate.h>
+#include <linux/types.h>
 #include <linux/gfp.h>
 
 struct percpu_ref;
-- 
cgit v1.2.3-71-gd317


From 9dcc38e2813e0cd3b195940c98b181ce6ede8f20 Mon Sep 17 00:00:00 2001
From: Drew DeVault <sir@cmpwn.com>
Date: Fri, 10 Dec 2021 14:46:09 -0800
Subject: Increase default MLOCK_LIMIT to 8 MiB

This limit has not been updated since 2008, when it was increased to 64
KiB at the request of GnuPG.  Until recently, the main use-cases for this
feature were (1) preventing sensitive memory from being swapped, as in
GnuPG's use-case; and (2) real-time use-cases.  In the first case, little
memory is called for, and in the second case, the user is generally in a
position to increase it if they need more.

The introduction of IOURING_REGISTER_BUFFERS adds a third use-case:
preparing fixed buffers for high-performance I/O.  This use-case will take
as much of this memory as it can get, but is still limited to 64 KiB by
default, which is very little.  This increases the limit to 8 MB, which
was chosen fairly arbitrarily as a more generous, but still conservative,
default value.

It is also possible to raise this limit in userspace.  This is easily
done, for example, in the use-case of a network daemon: systemd, for
instance, provides for this via LimitMEMLOCK in the service file; OpenRC
via the rc_ulimit variables.  However, there is no established userspace
facility for configuring this outside of daemons: end-user applications do
not presently have access to a convenient means of raising their limits.

The buck, as it were, stops with the kernel.  It's much easier to address
it here than it is to bring it to hundreds of distributions, and it can
only realistically be relied upon to be high-enough by end-user software
if it is more-or-less ubiquitous.  Most distros don't change this
particular rlimit from the kernel-supplied default value, so a change here
will easily provide that ubiquity.

Link: https://lkml.kernel.org/r/20211028080813.15966-1-sir@cmpwn.com
Signed-off-by: Drew DeVault <sir@cmpwn.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Acked-by: Cyril Hrubis <chrubis@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Pavel Begunkov <asml.silence@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Andrew Dona-Couch <andrew@donacou.ch>
Cc: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/resource.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/resource.h b/include/uapi/linux/resource.h
index 74ef57b38f9f..ac5d6a3031db 100644
--- a/include/uapi/linux/resource.h
+++ b/include/uapi/linux/resource.h
@@ -66,10 +66,17 @@ struct rlimit64 {
 #define _STK_LIM	(8*1024*1024)
 
 /*
- * GPG2 wants 64kB of mlocked memory, to make sure pass phrases
- * and other sensitive information are never written to disk.
+ * Limit the amount of locked memory by some sane default:
+ * root can always increase this limit if needed.
+ *
+ * The main use-cases are (1) preventing sensitive memory
+ * from being swapped; (2) real-time operations; (3) via
+ * IOURING_REGISTER_BUFFERS.
+ *
+ * The first two don't need much. The latter will take as
+ * much as it can get. 8MB is a reasonably sane default.
  */
-#define MLOCK_LIMIT	((PAGE_SIZE > 64*1024) ? PAGE_SIZE : 64*1024)
+#define MLOCK_LIMIT	(8*1024*1024)
 
 /*
  * Due to binary compatibility, the actual resource numbers
-- 
cgit v1.2.3-71-gd317


From e4779015fd5d2fb8390c258268addff24d6077c7 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 10 Dec 2021 14:46:22 -0800
Subject: timers: implement usleep_idle_range()

Patch series "mm/damon: Fix fake /proc/loadavg reports", v3.

This patchset fixes DAMON's fake load report issue.  The first patch
makes yet another variant of usleep_range() for this fix, and the second
patch fixes the issue of DAMON by making it using the newly introduced
function.

This patch (of 2):

Some kernel threads such as DAMON could need to repeatedly sleep in
micro seconds level.  Because usleep_range() sleeps in uninterruptible
state, however, such threads would make /proc/loadavg reports fake load.

To help such cases, this commit implements a variant of usleep_range()
called usleep_idle_range().  It is same to usleep_range() but sets the
state of the current task as TASK_IDLE while sleeping.

Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: John Stultz <john.stultz@linaro.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delay.h | 14 +++++++++++++-
 kernel/time/timer.c   | 16 +++++++++-------
 2 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 8eacf67eb212..039e7e0c7378 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -20,6 +20,7 @@
  */
 
 #include <linux/math.h>
+#include <linux/sched.h>
 
 extern unsigned long loops_per_jiffy;
 
@@ -58,7 +59,18 @@ void calibrate_delay(void);
 void __attribute__((weak)) calibration_delay_done(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
-void usleep_range(unsigned long min, unsigned long max);
+void usleep_range_state(unsigned long min, unsigned long max,
+			unsigned int state);
+
+static inline void usleep_range(unsigned long min, unsigned long max)
+{
+	usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
+}
+
+static inline void usleep_idle_range(unsigned long min, unsigned long max)
+{
+	usleep_range_state(min, max, TASK_IDLE);
+}
 
 static inline void ssleep(unsigned int seconds)
 {
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e3d2c23c413d..85f1021ad459 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2054,26 +2054,28 @@ unsigned long msleep_interruptible(unsigned int msecs)
 EXPORT_SYMBOL(msleep_interruptible);
 
 /**
- * usleep_range - Sleep for an approximate time
- * @min: Minimum time in usecs to sleep
- * @max: Maximum time in usecs to sleep
+ * usleep_range_state - Sleep for an approximate time in a given state
+ * @min:	Minimum time in usecs to sleep
+ * @max:	Maximum time in usecs to sleep
+ * @state:	State of the current task that will be while sleeping
  *
  * In non-atomic context where the exact wakeup time is flexible, use
- * usleep_range() instead of udelay().  The sleep improves responsiveness
+ * usleep_range_state() instead of udelay().  The sleep improves responsiveness
  * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
  * power usage by allowing hrtimers to take advantage of an already-
  * scheduled interrupt instead of scheduling a new one just for this sleep.
  */
-void __sched usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range_state(unsigned long min, unsigned long max,
+				unsigned int state)
 {
 	ktime_t exp = ktime_add_us(ktime_get(), min);
 	u64 delta = (u64)(max - min) * NSEC_PER_USEC;
 
 	for (;;) {
-		__set_current_state(TASK_UNINTERRUPTIBLE);
+		__set_current_state(state);
 		/* Do not return before the requested sleep time has elapsed */
 		if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
 			break;
 	}
 }
-EXPORT_SYMBOL(usleep_range);
+EXPORT_SYMBOL(usleep_range_state);
-- 
cgit v1.2.3-71-gd317